{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999969938373666, "eval_steps": 500, "global_step": 16632, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05028445, "auxiliary_loss_mlp": 0.02215396, "balance_loss_clip": 2.43573999, "balance_loss_mlp": 1.76983953, "epoch": 6.012325266796934e-05, "flos": 24456507091200.0, "grad_norm": 55.00561300220404, "language_loss": 2.85272503, "learning_rate": 0.0, "loss": 1.94613922, "num_input_tokens_seen": 19155, "step": 1, "time_per_iteration": 18.059409618377686 }, { "auxiliary_loss_clip": 0.03380539, "auxiliary_loss_mlp": 0.01459449, "balance_loss_clip": 1.62786555, "balance_loss_mlp": 1.18936849, "epoch": 0.00012024650533593868, "flos": 20225931246720.0, "grad_norm": 34.93149751452764, "language_loss": 1.82606053, "learning_rate": 4.4628432569317594e-07, "loss": 1.87446034, "num_input_tokens_seen": 36175, "step": 2, "time_per_iteration": 2.6318798065185547 }, { "auxiliary_loss_clip": 0.03320229, "auxiliary_loss_mlp": 0.01440978, "balance_loss_clip": 1.62577581, "balance_loss_mlp": 1.18882656, "epoch": 0.000180369758003908, "flos": 22309935454080.0, "grad_norm": 32.71870482280511, "language_loss": 1.57573509, "learning_rate": 7.073439208833112e-07, "loss": 1.62334716, "num_input_tokens_seen": 54870, "step": 3, "time_per_iteration": 2.6362481117248535 }, { "auxiliary_loss_clip": 0.03361497, "auxiliary_loss_mlp": 0.01451404, "balance_loss_clip": 1.62418985, "balance_loss_mlp": 1.15500188, "epoch": 0.00024049301067187735, "flos": 22414650577920.0, "grad_norm": 51.2387172839747, "language_loss": 1.67362881, "learning_rate": 8.925686513863519e-07, "loss": 1.72175777, "num_input_tokens_seen": 74575, "step": 4, "time_per_iteration": 2.7070822715759277 }, { "auxiliary_loss_clip": 0.03402497, "auxiliary_loss_mlp": 0.01505358, "balance_loss_clip": 1.62493396, "balance_loss_mlp": 1.21715808, "epoch": 0.0003006162633398467, "flos": 21396978449280.0, "grad_norm": 56.088721215944275, "language_loss": 1.91627169, "learning_rate": 1.0362401141348472e-06, "loss": 1.96535027, "num_input_tokens_seen": 92580, "step": 5, "time_per_iteration": 2.91436767578125 }, { "auxiliary_loss_clip": 0.03370454, "auxiliary_loss_mlp": 0.01515599, "balance_loss_clip": 1.61556244, "balance_loss_mlp": 1.22110426, "epoch": 0.000360739516007816, "flos": 21652375127040.0, "grad_norm": 33.397169652317885, "language_loss": 1.60591149, "learning_rate": 1.153628246576487e-06, "loss": 1.65477204, "num_input_tokens_seen": 109705, "step": 6, "time_per_iteration": 2.994969367980957 }, { "auxiliary_loss_clip": 0.03354239, "auxiliary_loss_mlp": 0.01486417, "balance_loss_clip": 1.61577415, "balance_loss_mlp": 1.20336628, "epoch": 0.0004208627686757854, "flos": 27159742897920.0, "grad_norm": 24.6270766983672, "language_loss": 1.53276002, "learning_rate": 1.2528784983718962e-06, "loss": 1.58116663, "num_input_tokens_seen": 129425, "step": 7, "time_per_iteration": 3.0675876140594482 }, { "auxiliary_loss_clip": 0.03321216, "auxiliary_loss_mlp": 0.0144328, "balance_loss_clip": 1.61205018, "balance_loss_mlp": 1.16499734, "epoch": 0.0004809860213437547, "flos": 31319096135040.0, "grad_norm": 31.71613063643349, "language_loss": 1.43881059, "learning_rate": 1.338852977079528e-06, "loss": 1.48645568, "num_input_tokens_seen": 149210, "step": 8, "time_per_iteration": 3.172358751296997 }, { "auxiliary_loss_clip": 0.03368839, "auxiliary_loss_mlp": 0.01496105, "balance_loss_clip": 1.6120348, "balance_loss_mlp": 1.21229148, "epoch": 0.000541109274011724, "flos": 32160411463680.0, "grad_norm": 28.204490849684397, "language_loss": 1.4969244, "learning_rate": 1.4146878417666224e-06, "loss": 1.54557395, "num_input_tokens_seen": 169055, "step": 9, "time_per_iteration": 3.112215280532837 }, { "auxiliary_loss_clip": 0.03308365, "auxiliary_loss_mlp": 0.01475035, "balance_loss_clip": 1.61541438, "balance_loss_mlp": 1.20647991, "epoch": 0.0006012325266796934, "flos": 18916808163840.0, "grad_norm": 23.420774723604698, "language_loss": 1.44714785, "learning_rate": 1.4825244398280232e-06, "loss": 1.49498188, "num_input_tokens_seen": 188045, "step": 10, "time_per_iteration": 2.9495606422424316 }, { "auxiliary_loss_clip": 0.03364194, "auxiliary_loss_mlp": 0.01494262, "balance_loss_clip": 1.62042511, "balance_loss_mlp": 1.22036684, "epoch": 0.0006613557793476627, "flos": 20774861867520.0, "grad_norm": 18.353281468858004, "language_loss": 1.4520936, "learning_rate": 1.5438901072051983e-06, "loss": 1.50067806, "num_input_tokens_seen": 207035, "step": 11, "time_per_iteration": 3.0797431468963623 }, { "auxiliary_loss_clip": 0.03292683, "auxiliary_loss_mlp": 0.0145154, "balance_loss_clip": 1.60771322, "balance_loss_mlp": 1.17554641, "epoch": 0.000721479032015632, "flos": 16581680997120.0, "grad_norm": 16.61869254675767, "language_loss": 1.45121813, "learning_rate": 1.5999125722696629e-06, "loss": 1.49866033, "num_input_tokens_seen": 223225, "step": 12, "time_per_iteration": 2.9887659549713135 }, { "auxiliary_loss_clip": 0.03321669, "auxiliary_loss_mlp": 0.01405912, "balance_loss_clip": 1.61740756, "balance_loss_mlp": 1.14765704, "epoch": 0.0007816022846836014, "flos": 23805471144960.0, "grad_norm": 14.02187318243825, "language_loss": 1.23759985, "learning_rate": 1.6514482443788434e-06, "loss": 1.28487587, "num_input_tokens_seen": 242570, "step": 13, "time_per_iteration": 3.032742977142334 }, { "auxiliary_loss_clip": 0.03287474, "auxiliary_loss_mlp": 0.01470749, "balance_loss_clip": 1.61299658, "balance_loss_mlp": 1.20257616, "epoch": 0.0008417255373515708, "flos": 19172204841600.0, "grad_norm": 5.790568956401358, "language_loss": 1.20684385, "learning_rate": 1.6991628240650723e-06, "loss": 1.254426, "num_input_tokens_seen": 261215, "step": 14, "time_per_iteration": 3.002887487411499 }, { "auxiliary_loss_clip": 0.03272826, "auxiliary_loss_mlp": 0.01431255, "balance_loss_clip": 1.6181426, "balance_loss_mlp": 1.16804111, "epoch": 0.00090184879001954, "flos": 26395564026240.0, "grad_norm": 6.353887091300461, "language_loss": 1.12925518, "learning_rate": 1.7435840350181584e-06, "loss": 1.176296, "num_input_tokens_seen": 280035, "step": 15, "time_per_iteration": 3.0238780975341797 }, { "auxiliary_loss_clip": 0.03238489, "auxiliary_loss_mlp": 0.01411651, "balance_loss_clip": 1.60288334, "balance_loss_mlp": 1.16197944, "epoch": 0.0009619720426875094, "flos": 24679500785280.0, "grad_norm": 4.670310144758637, "language_loss": 1.11125767, "learning_rate": 1.7851373027727038e-06, "loss": 1.15775907, "num_input_tokens_seen": 300265, "step": 16, "time_per_iteration": 4.605847120285034 }, { "auxiliary_loss_clip": 0.03223993, "auxiliary_loss_mlp": 0.01417304, "balance_loss_clip": 1.60910368, "balance_loss_mlp": 1.17774093, "epoch": 0.0010220952953554788, "flos": 18624531196800.0, "grad_norm": 8.838429022323517, "language_loss": 1.12645221, "learning_rate": 1.8241705979033208e-06, "loss": 1.17286515, "num_input_tokens_seen": 317375, "step": 17, "time_per_iteration": 4.579033851623535 }, { "auxiliary_loss_clip": 0.03161492, "auxiliary_loss_mlp": 0.01379312, "balance_loss_clip": 1.60685277, "balance_loss_mlp": 1.1475693, "epoch": 0.001082218548023448, "flos": 26142537646080.0, "grad_norm": 3.823557061532633, "language_loss": 1.08069181, "learning_rate": 1.860972167459798e-06, "loss": 1.12609982, "num_input_tokens_seen": 337975, "step": 18, "time_per_iteration": 3.0132579803466797 }, { "auxiliary_loss_clip": 0.0318761, "auxiliary_loss_mlp": 0.01403306, "balance_loss_clip": 1.60585093, "balance_loss_mlp": 1.13799417, "epoch": 0.0011423418006914173, "flos": 19609776322560.0, "grad_norm": 4.403621106373983, "language_loss": 1.02445412, "learning_rate": 1.89578346593066e-06, "loss": 1.07036328, "num_input_tokens_seen": 356635, "step": 19, "time_per_iteration": 3.016176462173462 }, { "auxiliary_loss_clip": 0.0313029, "auxiliary_loss_mlp": 0.01342049, "balance_loss_clip": 1.60759044, "balance_loss_mlp": 1.12155962, "epoch": 0.0012024650533593868, "flos": 17895365107200.0, "grad_norm": 3.958333686933058, "language_loss": 1.16706228, "learning_rate": 1.928808765521199e-06, "loss": 1.21178555, "num_input_tokens_seen": 375625, "step": 20, "time_per_iteration": 3.0274486541748047 }, { "auxiliary_loss_clip": 0.03118109, "auxiliary_loss_mlp": 0.01378536, "balance_loss_clip": 1.58886433, "balance_loss_mlp": 1.1298182, "epoch": 0.001262588306027356, "flos": 21252043071360.0, "grad_norm": 4.333519066420982, "language_loss": 1.06129968, "learning_rate": 1.9602224192552076e-06, "loss": 1.10626626, "num_input_tokens_seen": 394350, "step": 21, "time_per_iteration": 2.9418578147888184 }, { "auxiliary_loss_clip": 0.03013912, "auxiliary_loss_mlp": 0.0137937, "balance_loss_clip": 1.57028937, "balance_loss_mlp": 1.14552903, "epoch": 0.0013227115586953253, "flos": 26104077158400.0, "grad_norm": 3.63841390311849, "language_loss": 1.05861485, "learning_rate": 1.9901744328983746e-06, "loss": 1.10254765, "num_input_tokens_seen": 413255, "step": 22, "time_per_iteration": 2.9651288986206055 }, { "auxiliary_loss_clip": 0.02966296, "auxiliary_loss_mlp": 0.01334065, "balance_loss_clip": 1.57175612, "balance_loss_mlp": 1.12377954, "epoch": 0.0013828348113632948, "flos": 23951376190080.0, "grad_norm": 2.8746130742538347, "language_loss": 0.9177655, "learning_rate": 2.018794797290208e-06, "loss": 0.96076906, "num_input_tokens_seen": 433065, "step": 23, "time_per_iteration": 3.049853563308716 }, { "auxiliary_loss_clip": 0.02932793, "auxiliary_loss_mlp": 0.01362183, "balance_loss_clip": 1.56404662, "balance_loss_mlp": 1.14236116, "epoch": 0.001442958064031264, "flos": 15959851724160.0, "grad_norm": 3.0897201135857735, "language_loss": 1.08192635, "learning_rate": 2.046196897962839e-06, "loss": 1.12487614, "num_input_tokens_seen": 451175, "step": 24, "time_per_iteration": 3.0543172359466553 }, { "auxiliary_loss_clip": 0.02823838, "auxiliary_loss_mlp": 0.01329007, "balance_loss_clip": 1.55692792, "balance_loss_mlp": 1.11853111, "epoch": 0.0015030813166992333, "flos": 18108350801280.0, "grad_norm": 4.111246686692462, "language_loss": 1.01367807, "learning_rate": 2.0724802282696944e-06, "loss": 1.05520654, "num_input_tokens_seen": 468775, "step": 25, "time_per_iteration": 3.0059614181518555 }, { "auxiliary_loss_clip": 0.02818207, "auxiliary_loss_mlp": 0.01309454, "balance_loss_clip": 1.55974329, "balance_loss_mlp": 1.10012197, "epoch": 0.0015632045693672028, "flos": 22234558763520.0, "grad_norm": 2.7163042439620018, "language_loss": 1.0669204, "learning_rate": 2.0977325700720194e-06, "loss": 1.10819697, "num_input_tokens_seen": 488530, "step": 26, "time_per_iteration": 3.1159534454345703 }, { "auxiliary_loss_clip": 0.0276047, "auxiliary_loss_mlp": 0.01325034, "balance_loss_clip": 1.54973662, "balance_loss_mlp": 1.12533486, "epoch": 0.001623327822035172, "flos": 23991955580160.0, "grad_norm": 2.562596284241794, "language_loss": 0.95537072, "learning_rate": 2.122031762649933e-06, "loss": 0.99622583, "num_input_tokens_seen": 510495, "step": 27, "time_per_iteration": 3.018643617630005 }, { "auxiliary_loss_clip": 0.02736222, "auxiliary_loss_mlp": 0.01311707, "balance_loss_clip": 1.55399776, "balance_loss_mlp": 1.13089037, "epoch": 0.0016834510747031415, "flos": 19677647070720.0, "grad_norm": 2.42975125432869, "language_loss": 1.06393945, "learning_rate": 2.1454471497582483e-06, "loss": 1.10441875, "num_input_tokens_seen": 528605, "step": 28, "time_per_iteration": 2.9263083934783936 }, { "auxiliary_loss_clip": 0.0270011, "auxiliary_loss_mlp": 0.0131913, "balance_loss_clip": 1.53841436, "balance_loss_mlp": 1.13297284, "epoch": 0.0017435743273711108, "flos": 20923819568640.0, "grad_norm": 4.42090805909513, "language_loss": 1.02493238, "learning_rate": 2.1680407726407727e-06, "loss": 1.06512475, "num_input_tokens_seen": 548515, "step": 29, "time_per_iteration": 3.0062997341156006 }, { "auxiliary_loss_clip": 0.0269246, "auxiliary_loss_mlp": 0.01312758, "balance_loss_clip": 1.53459728, "balance_loss_mlp": 1.12631428, "epoch": 0.00180369758003908, "flos": 19528976678400.0, "grad_norm": 3.1534114534186446, "language_loss": 1.19265521, "learning_rate": 2.189868360711334e-06, "loss": 1.23270726, "num_input_tokens_seen": 564025, "step": 30, "time_per_iteration": 2.931145429611206 }, { "auxiliary_loss_clip": 0.02610377, "auxiliary_loss_mlp": 0.01337183, "balance_loss_clip": 1.52116311, "balance_loss_mlp": 1.15665221, "epoch": 0.0018638208327070496, "flos": 27453169100160.0, "grad_norm": 2.735994596991484, "language_loss": 1.02616811, "learning_rate": 2.2109801597326265e-06, "loss": 1.06564379, "num_input_tokens_seen": 583345, "step": 31, "time_per_iteration": 2.993251085281372 }, { "auxiliary_loss_clip": 0.02582044, "auxiliary_loss_mlp": 0.01331305, "balance_loss_clip": 1.522609, "balance_loss_mlp": 1.15163302, "epoch": 0.0019239440853750188, "flos": 13589460380160.0, "grad_norm": 3.9056907796043654, "language_loss": 0.95266509, "learning_rate": 2.2314216284658796e-06, "loss": 0.99179864, "num_input_tokens_seen": 600010, "step": 32, "time_per_iteration": 2.9459571838378906 }, { "auxiliary_loss_clip": 0.02564836, "auxiliary_loss_mlp": 0.01302659, "balance_loss_clip": 1.51811624, "balance_loss_mlp": 1.13586164, "epoch": 0.001984067338042988, "flos": 11253866336640.0, "grad_norm": 3.226486022987097, "language_loss": 0.95143497, "learning_rate": 2.2512340280885094e-06, "loss": 0.99010992, "num_input_tokens_seen": 616295, "step": 33, "time_per_iteration": 2.9855570793151855 }, { "auxiliary_loss_clip": 0.02421202, "auxiliary_loss_mlp": 0.01304214, "balance_loss_clip": 1.48474145, "balance_loss_mlp": 1.14676213, "epoch": 0.0020441905907109576, "flos": 22386245898240.0, "grad_norm": 2.1714659525821247, "language_loss": 0.91547924, "learning_rate": 2.270454923596497e-06, "loss": 0.9527334, "num_input_tokens_seen": 637640, "step": 34, "time_per_iteration": 2.981541872024536 }, { "auxiliary_loss_clip": 0.02375249, "auxiliary_loss_mlp": 0.01271963, "balance_loss_clip": 1.45095515, "balance_loss_mlp": 1.11689591, "epoch": 0.0021043138433789266, "flos": 49778580337920.0, "grad_norm": 2.2635429103650386, "language_loss": 0.76603377, "learning_rate": 2.2891186125067434e-06, "loss": 0.80250585, "num_input_tokens_seen": 659710, "step": 35, "time_per_iteration": 3.2267208099365234 }, { "auxiliary_loss_clip": 0.02347187, "auxiliary_loss_mlp": 0.01276388, "balance_loss_clip": 1.46356034, "balance_loss_mlp": 1.13238275, "epoch": 0.002164437096046896, "flos": 20557961591040.0, "grad_norm": 2.3605884715298506, "language_loss": 0.88713098, "learning_rate": 2.307256493152974e-06, "loss": 0.92336679, "num_input_tokens_seen": 679670, "step": 36, "time_per_iteration": 2.948162078857422 }, { "auxiliary_loss_clip": 0.02289192, "auxiliary_loss_mlp": 0.01338204, "balance_loss_clip": 1.45043015, "balance_loss_mlp": 1.19105196, "epoch": 0.0022245603487148656, "flos": 26542295084160.0, "grad_norm": 2.4929063351918166, "language_loss": 0.93038809, "learning_rate": 2.3248973825097614e-06, "loss": 0.96666199, "num_input_tokens_seen": 700170, "step": 37, "time_per_iteration": 2.9556422233581543 }, { "auxiliary_loss_clip": 0.02249098, "auxiliary_loss_mlp": 0.01276785, "balance_loss_clip": 1.44485605, "balance_loss_mlp": 1.15500069, "epoch": 0.0022846836013828346, "flos": 20338188226560.0, "grad_norm": 2.177909778954084, "language_loss": 1.03952074, "learning_rate": 2.3420677916238357e-06, "loss": 1.07477951, "num_input_tokens_seen": 718545, "step": 38, "time_per_iteration": 2.9959065914154053 }, { "auxiliary_loss_clip": 0.02216028, "auxiliary_loss_mlp": 0.01260768, "balance_loss_clip": 1.43807542, "balance_loss_mlp": 1.13726676, "epoch": 0.002344806854050804, "flos": 26247575992320.0, "grad_norm": 2.22652515093943, "language_loss": 0.85297108, "learning_rate": 2.358792165262154e-06, "loss": 0.887739, "num_input_tokens_seen": 739865, "step": 39, "time_per_iteration": 3.035399913787842 }, { "auxiliary_loss_clip": 0.02192275, "auxiliary_loss_mlp": 0.01250434, "balance_loss_clip": 1.4289664, "balance_loss_mlp": 1.12216496, "epoch": 0.0024049301067187736, "flos": 11801539981440.0, "grad_norm": 3.258228308703562, "language_loss": 0.90279335, "learning_rate": 2.3750930912143747e-06, "loss": 0.93722045, "num_input_tokens_seen": 755770, "step": 40, "time_per_iteration": 3.060368299484253 }, { "auxiliary_loss_clip": 0.02142113, "auxiliary_loss_mlp": 0.01273783, "balance_loss_clip": 1.41895449, "balance_loss_mlp": 1.16086745, "epoch": 0.0024650533593867426, "flos": 20631506688000.0, "grad_norm": 3.245861029799582, "language_loss": 0.93271625, "learning_rate": 2.3909914837471044e-06, "loss": 0.9668752, "num_input_tokens_seen": 773440, "step": 41, "time_per_iteration": 2.9518353939056396 }, { "auxiliary_loss_clip": 0.02105753, "auxiliary_loss_mlp": 0.01254821, "balance_loss_clip": 1.41097844, "balance_loss_mlp": 1.15168142, "epoch": 0.002525176612054712, "flos": 18406122549120.0, "grad_norm": 3.3039479788253536, "language_loss": 0.97533798, "learning_rate": 2.4065067449483835e-06, "loss": 1.0089438, "num_input_tokens_seen": 790455, "step": 42, "time_per_iteration": 2.933177947998047 }, { "auxiliary_loss_clip": 0.020675, "auxiliary_loss_mlp": 0.01298422, "balance_loss_clip": 1.41198874, "balance_loss_mlp": 1.19189644, "epoch": 0.0025852998647226816, "flos": 28184023128960.0, "grad_norm": 3.15071165872949, "language_loss": 0.97562659, "learning_rate": 2.4216569070848724e-06, "loss": 1.00928593, "num_input_tokens_seen": 810645, "step": 43, "time_per_iteration": 2.9760589599609375 }, { "auxiliary_loss_clip": 0.02086351, "auxiliary_loss_mlp": 0.01314601, "balance_loss_clip": 1.41042757, "balance_loss_mlp": 1.20283043, "epoch": 0.0026454231173906506, "flos": 14283110897280.0, "grad_norm": 2.3612650137146574, "language_loss": 0.93435001, "learning_rate": 2.4364587585915504e-06, "loss": 0.96835947, "num_input_tokens_seen": 827470, "step": 44, "time_per_iteration": 2.9239895343780518 }, { "auxiliary_loss_clip": 0.02043996, "auxiliary_loss_mlp": 0.01272131, "balance_loss_clip": 1.40557313, "balance_loss_mlp": 1.17399764, "epoch": 0.00270554637005862, "flos": 22419211605120.0, "grad_norm": 2.1476860292916644, "language_loss": 0.98677421, "learning_rate": 2.450927955901469e-06, "loss": 1.01993537, "num_input_tokens_seen": 847285, "step": 45, "time_per_iteration": 2.9626305103302 }, { "auxiliary_loss_clip": 0.02018804, "auxiliary_loss_mlp": 0.01228873, "balance_loss_clip": 1.39126372, "balance_loss_mlp": 1.14208817, "epoch": 0.0027656696227265896, "flos": 23985778440960.0, "grad_norm": 1.8862192248435494, "language_loss": 1.02800822, "learning_rate": 2.465079122983384e-06, "loss": 1.06048501, "num_input_tokens_seen": 867545, "step": 46, "time_per_iteration": 2.9913573265075684 }, { "auxiliary_loss_clip": 0.0198766, "auxiliary_loss_mlp": 0.01272862, "balance_loss_clip": 1.38388658, "balance_loss_mlp": 1.182549, "epoch": 0.0028257928753945586, "flos": 37669503087360.0, "grad_norm": 2.1076645953887696, "language_loss": 0.87839413, "learning_rate": 2.4789259401737868e-06, "loss": 0.9109993, "num_input_tokens_seen": 889915, "step": 47, "time_per_iteration": 3.0189881324768066 }, { "auxiliary_loss_clip": 0.01949271, "auxiliary_loss_mlp": 0.01255947, "balance_loss_clip": 1.37360096, "balance_loss_mlp": 1.16963911, "epoch": 0.002885916128062528, "flos": 22454547609600.0, "grad_norm": 4.4561049138068, "language_loss": 0.87809587, "learning_rate": 2.492481223656015e-06, "loss": 0.91014802, "num_input_tokens_seen": 908975, "step": 48, "time_per_iteration": 2.863565444946289 }, { "auxiliary_loss_clip": 0.01949016, "auxiliary_loss_mlp": 0.0124182, "balance_loss_clip": 1.36337733, "balance_loss_mlp": 1.15069616, "epoch": 0.0029460393807304976, "flos": 27012796358400.0, "grad_norm": 2.9451035624229855, "language_loss": 0.89691317, "learning_rate": 2.5057569967437924e-06, "loss": 0.9288215, "num_input_tokens_seen": 929810, "step": 49, "time_per_iteration": 2.9967453479766846 }, { "auxiliary_loss_clip": 0.0194038, "auxiliary_loss_mlp": 0.01234077, "balance_loss_clip": 1.35742152, "balance_loss_mlp": 1.14996314, "epoch": 0.0030061626333984666, "flos": 15851832549120.0, "grad_norm": 3.162716210197168, "language_loss": 0.90914285, "learning_rate": 2.51876455396287e-06, "loss": 0.94088745, "num_input_tokens_seen": 948650, "step": 50, "time_per_iteration": 2.8832523822784424 }, { "auxiliary_loss_clip": 0.01938537, "auxiliary_loss_mlp": 0.01199505, "balance_loss_clip": 1.36240602, "balance_loss_mlp": 1.11844242, "epoch": 0.003066285886066436, "flos": 31827052316160.0, "grad_norm": 6.098010360158733, "language_loss": 0.86977792, "learning_rate": 2.5315145187866316e-06, "loss": 0.90115827, "num_input_tokens_seen": 966455, "step": 51, "time_per_iteration": 2.9061717987060547 }, { "auxiliary_loss_clip": 0.01895637, "auxiliary_loss_mlp": 0.01206588, "balance_loss_clip": 1.35252357, "balance_loss_mlp": 1.12829173, "epoch": 0.0031264091387344056, "flos": 41427482774400.0, "grad_norm": 2.043292881862276, "language_loss": 0.95171362, "learning_rate": 2.5440168957651953e-06, "loss": 0.98273587, "num_input_tokens_seen": 988110, "step": 52, "time_per_iteration": 3.0266616344451904 }, { "auxiliary_loss_clip": 0.01893195, "auxiliary_loss_mlp": 0.01241159, "balance_loss_clip": 1.34894896, "balance_loss_mlp": 1.16162264, "epoch": 0.0031865323914023747, "flos": 23440941970560.0, "grad_norm": 4.2358840345824635, "language_loss": 0.92323011, "learning_rate": 2.5562811176888872e-06, "loss": 0.95457363, "num_input_tokens_seen": 1008550, "step": 53, "time_per_iteration": 2.8850226402282715 }, { "auxiliary_loss_clip": 0.01882736, "auxiliary_loss_mlp": 0.01197045, "balance_loss_clip": 1.35264134, "balance_loss_mlp": 1.11669779, "epoch": 0.003246655644070344, "flos": 14429195510400.0, "grad_norm": 2.290226623360683, "language_loss": 0.8260113, "learning_rate": 2.5683160883431093e-06, "loss": 0.85680908, "num_input_tokens_seen": 1026840, "step": 54, "time_per_iteration": 2.9433553218841553 }, { "auxiliary_loss_clip": 0.01880073, "auxiliary_loss_mlp": 0.01210775, "balance_loss_clip": 1.34162152, "balance_loss_mlp": 1.13233542, "epoch": 0.0033067788967383136, "flos": 35918247496320.0, "grad_norm": 2.911577423572303, "language_loss": 0.81303245, "learning_rate": 2.580130221340046e-06, "loss": 0.84394085, "num_input_tokens_seen": 1048875, "step": 55, "time_per_iteration": 3.0040643215179443 }, { "auxiliary_loss_clip": 0.01870075, "auxiliary_loss_mlp": 0.0120375, "balance_loss_clip": 1.33644819, "balance_loss_mlp": 1.12521541, "epoch": 0.003366902149406283, "flos": 22958732862720.0, "grad_norm": 2.639118679342801, "language_loss": 0.87089968, "learning_rate": 2.5917314754514246e-06, "loss": 0.90163803, "num_input_tokens_seen": 1066435, "step": 56, "time_per_iteration": 2.830453395843506 }, { "auxiliary_loss_clip": 0.01869912, "auxiliary_loss_mlp": 0.01161425, "balance_loss_clip": 1.32921791, "balance_loss_mlp": 1.08851671, "epoch": 0.003427025402074252, "flos": 26582838560640.0, "grad_norm": 2.101574700040827, "language_loss": 0.92890096, "learning_rate": 2.6031273868139713e-06, "loss": 0.95921433, "num_input_tokens_seen": 1090330, "step": 57, "time_per_iteration": 7.0071024894714355 }, { "auxiliary_loss_clip": 0.01833802, "auxiliary_loss_mlp": 0.0121675, "balance_loss_clip": 1.33333457, "balance_loss_mlp": 1.14493799, "epoch": 0.0034871486547422216, "flos": 23951196622080.0, "grad_norm": 14.610065921505914, "language_loss": 0.9972856, "learning_rate": 2.614325098333948e-06, "loss": 1.02779114, "num_input_tokens_seen": 1109840, "step": 58, "time_per_iteration": 2.830960273742676 }, { "auxiliary_loss_clip": 0.0181804, "auxiliary_loss_mlp": 0.01199311, "balance_loss_clip": 1.32073379, "balance_loss_mlp": 1.12835753, "epoch": 0.003547271907410191, "flos": 21214983214080.0, "grad_norm": 2.120622270947527, "language_loss": 0.88172519, "learning_rate": 2.625331386578098e-06, "loss": 0.91189873, "num_input_tokens_seen": 1128415, "step": 59, "time_per_iteration": 2.8507089614868164 }, { "auxiliary_loss_clip": 0.01839573, "auxiliary_loss_mlp": 0.01163328, "balance_loss_clip": 1.32924581, "balance_loss_mlp": 1.09075332, "epoch": 0.00360739516007816, "flos": 16504903676160.0, "grad_norm": 2.021991994360373, "language_loss": 0.93542433, "learning_rate": 2.63615268640451e-06, "loss": 0.96545339, "num_input_tokens_seen": 1146515, "step": 60, "time_per_iteration": 2.8517534732818604 }, { "auxiliary_loss_clip": 0.0181893, "auxiliary_loss_mlp": 0.01176948, "balance_loss_clip": 1.31414318, "balance_loss_mlp": 1.10923755, "epoch": 0.0036675184127461296, "flos": 19464805031040.0, "grad_norm": 2.908283338489548, "language_loss": 0.90021706, "learning_rate": 2.6467951135575943e-06, "loss": 0.9301759, "num_input_tokens_seen": 1166330, "step": 61, "time_per_iteration": 2.8853390216827393 }, { "auxiliary_loss_clip": 0.01803943, "auxiliary_loss_mlp": 0.01142904, "balance_loss_clip": 1.31131864, "balance_loss_mlp": 1.07581341, "epoch": 0.003727641665414099, "flos": 20957323979520.0, "grad_norm": 1.8428161811646855, "language_loss": 0.88479733, "learning_rate": 2.657264485425803e-06, "loss": 0.91426575, "num_input_tokens_seen": 1186010, "step": 62, "time_per_iteration": 2.8860812187194824 }, { "auxiliary_loss_clip": 0.01785338, "auxiliary_loss_mlp": 0.0116457, "balance_loss_clip": 1.30233741, "balance_loss_mlp": 1.09504724, "epoch": 0.003787764918082068, "flos": 18406050721920.0, "grad_norm": 2.4385306002926512, "language_loss": 0.96280968, "learning_rate": 2.6675663401385186e-06, "loss": 0.99230874, "num_input_tokens_seen": 1204985, "step": 63, "time_per_iteration": 2.9081404209136963 }, { "auxiliary_loss_clip": 0.01795068, "auxiliary_loss_mlp": 0.01171321, "balance_loss_clip": 1.31071985, "balance_loss_mlp": 1.10499322, "epoch": 0.0038478881707500376, "flos": 12459243962880.0, "grad_norm": 3.0781639748926697, "language_loss": 0.98840165, "learning_rate": 2.677705954159056e-06, "loss": 1.01806557, "num_input_tokens_seen": 1223545, "step": 64, "time_per_iteration": 2.893603801727295 }, { "auxiliary_loss_clip": 0.01801311, "auxiliary_loss_mlp": 0.01151112, "balance_loss_clip": 1.30960393, "balance_loss_mlp": 1.08368695, "epoch": 0.003908011423418007, "flos": 13553334276480.0, "grad_norm": 2.4813676281781554, "language_loss": 0.85397774, "learning_rate": 2.6876883585136904e-06, "loss": 0.88350195, "num_input_tokens_seen": 1241175, "step": 65, "time_per_iteration": 2.8768796920776367 }, { "auxiliary_loss_clip": 0.01777474, "auxiliary_loss_mlp": 0.01155217, "balance_loss_clip": 1.29563761, "balance_loss_mlp": 1.087888, "epoch": 0.003968134676085976, "flos": 18333475292160.0, "grad_norm": 1.8550079005121831, "language_loss": 0.85281348, "learning_rate": 2.697518353781685e-06, "loss": 0.88214046, "num_input_tokens_seen": 1259315, "step": 66, "time_per_iteration": 2.769274950027466 }, { "auxiliary_loss_clip": 0.01779987, "auxiliary_loss_mlp": 0.01151372, "balance_loss_clip": 1.29312515, "balance_loss_mlp": 1.07650828, "epoch": 0.004028257928753946, "flos": 20485242506880.0, "grad_norm": 2.74895944689593, "language_loss": 0.96567476, "learning_rate": 2.7072005239581103e-06, "loss": 0.99498826, "num_input_tokens_seen": 1277055, "step": 67, "time_per_iteration": 2.889369249343872 }, { "auxiliary_loss_clip": 0.01752442, "auxiliary_loss_mlp": 0.01152779, "balance_loss_clip": 1.28765118, "balance_loss_mlp": 1.08120584, "epoch": 0.004088381181421915, "flos": 18843837684480.0, "grad_norm": 2.109359538419204, "language_loss": 0.94516367, "learning_rate": 2.7167392492896727e-06, "loss": 0.97421581, "num_input_tokens_seen": 1294355, "step": 68, "time_per_iteration": 2.8107409477233887 }, { "auxiliary_loss_clip": 0.01747204, "auxiliary_loss_mlp": 0.0115424, "balance_loss_clip": 1.28511512, "balance_loss_mlp": 1.08476448, "epoch": 0.004148504434089885, "flos": 19427817000960.0, "grad_norm": 2.2931216646069092, "language_loss": 0.96014255, "learning_rate": 2.7261387181735195e-06, "loss": 0.98915702, "num_input_tokens_seen": 1313525, "step": 69, "time_per_iteration": 2.8138387203216553 }, { "auxiliary_loss_clip": 0.01741342, "auxiliary_loss_mlp": 0.01160375, "balance_loss_clip": 1.28807163, "balance_loss_mlp": 1.09581161, "epoch": 0.004208627686757853, "flos": 20811023884800.0, "grad_norm": 2.1764096137707494, "language_loss": 0.98070192, "learning_rate": 2.7354029381999196e-06, "loss": 1.00971913, "num_input_tokens_seen": 1330505, "step": 70, "time_per_iteration": 2.8319084644317627 }, { "auxiliary_loss_clip": 0.0174721, "auxiliary_loss_mlp": 0.01145619, "balance_loss_clip": 1.27791202, "balance_loss_mlp": 1.07685876, "epoch": 0.004268750939425823, "flos": 19098623831040.0, "grad_norm": 2.9300158782571324, "language_loss": 0.94016141, "learning_rate": 2.7445357464116983e-06, "loss": 0.96908975, "num_input_tokens_seen": 1349615, "step": 71, "time_per_iteration": 2.8469433784484863 }, { "auxiliary_loss_clip": 0.01815227, "auxiliary_loss_mlp": 0.01294388, "balance_loss_clip": 1.43495834, "balance_loss_mlp": 1.25490558, "epoch": 0.004328874192093792, "flos": 52439635514880.0, "grad_norm": 2.409331683106634, "language_loss": 0.65682542, "learning_rate": 2.75354081884615e-06, "loss": 0.68792164, "num_input_tokens_seen": 1410275, "step": 72, "time_per_iteration": 3.2019593715667725 }, { "auxiliary_loss_clip": 0.01799527, "auxiliary_loss_mlp": 0.01271558, "balance_loss_clip": 1.43197393, "balance_loss_mlp": 1.2316941, "epoch": 0.004388997444761762, "flos": 66473239564800.0, "grad_norm": 2.25068040880696, "language_loss": 0.63694263, "learning_rate": 2.7624216794188286e-06, "loss": 0.66765356, "num_input_tokens_seen": 1473020, "step": 73, "time_per_iteration": 3.3545596599578857 }, { "auxiliary_loss_clip": 0.01720805, "auxiliary_loss_mlp": 0.01140553, "balance_loss_clip": 1.26912856, "balance_loss_mlp": 1.07279444, "epoch": 0.004449120697429731, "flos": 18952970181120.0, "grad_norm": 2.554977860093902, "language_loss": 0.86212188, "learning_rate": 2.771181708202938e-06, "loss": 0.89073551, "num_input_tokens_seen": 1490385, "step": 74, "time_per_iteration": 2.823498487472534 }, { "auxiliary_loss_clip": 0.0172287, "auxiliary_loss_mlp": 0.01162493, "balance_loss_clip": 1.26811171, "balance_loss_mlp": 1.09344697, "epoch": 0.004509243950097701, "flos": 21105491581440.0, "grad_norm": 3.0087618017840105, "language_loss": 0.97196102, "learning_rate": 2.779824149153005e-06, "loss": 1.00081468, "num_input_tokens_seen": 1509725, "step": 75, "time_per_iteration": 2.888415575027466 }, { "auxiliary_loss_clip": 0.0170198, "auxiliary_loss_mlp": 0.01142315, "balance_loss_clip": 1.26420689, "balance_loss_mlp": 1.07608271, "epoch": 0.004569367202765669, "flos": 20698730991360.0, "grad_norm": 2.6610382542709043, "language_loss": 0.87740695, "learning_rate": 2.788352117317012e-06, "loss": 0.90584993, "num_input_tokens_seen": 1527245, "step": 76, "time_per_iteration": 2.9226863384246826 }, { "auxiliary_loss_clip": 0.01702512, "auxiliary_loss_mlp": 0.01145374, "balance_loss_clip": 1.26239479, "balance_loss_mlp": 1.07656646, "epoch": 0.004629490455433639, "flos": 28658474899200.0, "grad_norm": 2.4272090643104574, "language_loss": 0.91791159, "learning_rate": 2.796768605577095e-06, "loss": 0.94639051, "num_input_tokens_seen": 1548930, "step": 77, "time_per_iteration": 2.8720929622650146 }, { "auxiliary_loss_clip": 0.01693018, "auxiliary_loss_mlp": 0.01165978, "balance_loss_clip": 1.26398146, "balance_loss_mlp": 1.09569168, "epoch": 0.004689613708101608, "flos": 11072409805440.0, "grad_norm": 2.2822185142383034, "language_loss": 0.9211635, "learning_rate": 2.80507649095533e-06, "loss": 0.94975346, "num_input_tokens_seen": 1565695, "step": 78, "time_per_iteration": 2.7832391262054443 }, { "auxiliary_loss_clip": 0.01689271, "auxiliary_loss_mlp": 0.01153255, "balance_loss_clip": 1.25836253, "balance_loss_mlp": 1.08482933, "epoch": 0.004749736960769578, "flos": 21799106184960.0, "grad_norm": 2.263191265943929, "language_loss": 0.82771945, "learning_rate": 2.813278540517843e-06, "loss": 0.85614467, "num_input_tokens_seen": 1582625, "step": 79, "time_per_iteration": 2.7723355293273926 }, { "auxiliary_loss_clip": 0.01702468, "auxiliary_loss_mlp": 0.01130708, "balance_loss_clip": 1.26147008, "balance_loss_mlp": 1.0609467, "epoch": 0.004809860213437547, "flos": 19792597570560.0, "grad_norm": 1.9992491725405546, "language_loss": 0.91272199, "learning_rate": 2.8213774169075505e-06, "loss": 0.94105375, "num_input_tokens_seen": 1601725, "step": 80, "time_per_iteration": 2.742046356201172 }, { "auxiliary_loss_clip": 0.01671156, "auxiliary_loss_mlp": 0.01144048, "balance_loss_clip": 1.25365841, "balance_loss_mlp": 1.07371473, "epoch": 0.004869983466105517, "flos": 26574327037440.0, "grad_norm": 2.0371265012476742, "language_loss": 0.95241439, "learning_rate": 2.829375683533245e-06, "loss": 0.9805665, "num_input_tokens_seen": 1622420, "step": 81, "time_per_iteration": 2.8996386528015137 }, { "auxiliary_loss_clip": 0.01686092, "auxiliary_loss_mlp": 0.01147828, "balance_loss_clip": 1.25779653, "balance_loss_mlp": 1.08149946, "epoch": 0.004930106718773485, "flos": 12823378087680.0, "grad_norm": 2.9441337112970296, "language_loss": 0.96288472, "learning_rate": 2.8372758094402803e-06, "loss": 0.99122393, "num_input_tokens_seen": 1640715, "step": 82, "time_per_iteration": 2.819120407104492 }, { "auxiliary_loss_clip": 0.01668255, "auxiliary_loss_mlp": 0.01156428, "balance_loss_clip": 1.2461338, "balance_loss_mlp": 1.08709574, "epoch": 0.004990229971441455, "flos": 25774919902080.0, "grad_norm": 2.6601797838877856, "language_loss": 0.86762071, "learning_rate": 2.84508017388607e-06, "loss": 0.89586747, "num_input_tokens_seen": 1662210, "step": 83, "time_per_iteration": 2.7959344387054443 }, { "auxiliary_loss_clip": 0.01662665, "auxiliary_loss_mlp": 0.01154043, "balance_loss_clip": 1.24844718, "balance_loss_mlp": 1.084234, "epoch": 0.005050353224109424, "flos": 17457254922240.0, "grad_norm": 2.5416281292503986, "language_loss": 0.92081314, "learning_rate": 2.852791070641559e-06, "loss": 0.94898021, "num_input_tokens_seen": 1681070, "step": 84, "time_per_iteration": 2.7176246643066406 }, { "auxiliary_loss_clip": 0.01647627, "auxiliary_loss_mlp": 0.01154949, "balance_loss_clip": 1.36429358, "balance_loss_mlp": 1.11527622, "epoch": 0.005110476476777394, "flos": 69805460367360.0, "grad_norm": 1.4023430227621099, "language_loss": 0.6252538, "learning_rate": 2.8604107120381682e-06, "loss": 0.65327954, "num_input_tokens_seen": 1747140, "step": 85, "time_per_iteration": 3.296835422515869 }, { "auxiliary_loss_clip": 0.01649469, "auxiliary_loss_mlp": 0.0112642, "balance_loss_clip": 1.23797417, "balance_loss_mlp": 1.05642033, "epoch": 0.005170599729445363, "flos": 24790105739520.0, "grad_norm": 1.805253124779358, "language_loss": 0.90709531, "learning_rate": 2.8679412327780482e-06, "loss": 0.93485421, "num_input_tokens_seen": 1767475, "step": 86, "time_per_iteration": 2.761484146118164 }, { "auxiliary_loss_clip": 0.01653351, "auxiliary_loss_mlp": 0.01158608, "balance_loss_clip": 1.24437881, "balance_loss_mlp": 1.08741617, "epoch": 0.005230722982113333, "flos": 23258048895360.0, "grad_norm": 2.3398213465495776, "language_loss": 0.81961077, "learning_rate": 2.8753846935240833e-06, "loss": 0.8477304, "num_input_tokens_seen": 1784980, "step": 87, "time_per_iteration": 2.763185739517212 }, { "auxiliary_loss_clip": 0.01641581, "auxiliary_loss_mlp": 0.01152623, "balance_loss_clip": 1.24129367, "balance_loss_mlp": 1.08457828, "epoch": 0.005290846234781301, "flos": 16727909264640.0, "grad_norm": 3.1951080427559857, "language_loss": 0.95790672, "learning_rate": 2.8827430842847267e-06, "loss": 0.98584872, "num_input_tokens_seen": 1803030, "step": 88, "time_per_iteration": 2.7855517864227295 }, { "auxiliary_loss_clip": 0.01658657, "auxiliary_loss_mlp": 0.01147064, "balance_loss_clip": 1.24130976, "balance_loss_mlp": 1.07978201, "epoch": 0.005350969487449271, "flos": 20886077352960.0, "grad_norm": 3.405407923072192, "language_loss": 0.86023164, "learning_rate": 2.8900183276075957e-06, "loss": 0.88828892, "num_input_tokens_seen": 1822865, "step": 89, "time_per_iteration": 2.7517924308776855 }, { "auxiliary_loss_clip": 0.01647446, "auxiliary_loss_mlp": 0.01133456, "balance_loss_clip": 1.23541856, "balance_loss_mlp": 1.06727123, "epoch": 0.00541109274011724, "flos": 26209977431040.0, "grad_norm": 2.130771496386599, "language_loss": 0.9150058, "learning_rate": 2.8972122815946455e-06, "loss": 0.94281483, "num_input_tokens_seen": 1842435, "step": 90, "time_per_iteration": 2.7526872158050537 }, { "auxiliary_loss_clip": 0.01629409, "auxiliary_loss_mlp": 0.01133822, "balance_loss_clip": 1.23219132, "balance_loss_mlp": 1.06582534, "epoch": 0.00547121599278521, "flos": 21178569801600.0, "grad_norm": 2.6928798867856796, "language_loss": 0.86073506, "learning_rate": 2.90432674275074e-06, "loss": 0.88836741, "num_input_tokens_seen": 1860065, "step": 91, "time_per_iteration": 2.7995588779449463 }, { "auxiliary_loss_clip": 0.01628638, "auxiliary_loss_mlp": 0.01138916, "balance_loss_clip": 1.22774827, "balance_loss_mlp": 1.07335091, "epoch": 0.005531339245453179, "flos": 19718801078400.0, "grad_norm": 5.062847798051961, "language_loss": 0.87041199, "learning_rate": 2.91136344867656e-06, "loss": 0.8980875, "num_input_tokens_seen": 1878135, "step": 92, "time_per_iteration": 2.7813079357147217 }, { "auxiliary_loss_clip": 0.01620799, "auxiliary_loss_mlp": 0.01174163, "balance_loss_clip": 1.21933174, "balance_loss_mlp": 1.10650027, "epoch": 0.005591462498121149, "flos": 17636089760640.0, "grad_norm": 4.340668874696889, "language_loss": 0.9210887, "learning_rate": 2.918324080615938e-06, "loss": 0.94903833, "num_input_tokens_seen": 1894895, "step": 93, "time_per_iteration": 2.7582218647003174 }, { "auxiliary_loss_clip": 0.0163427, "auxiliary_loss_mlp": 0.01153574, "balance_loss_clip": 1.22659743, "balance_loss_mlp": 1.08238208, "epoch": 0.005651585750789117, "flos": 20011221699840.0, "grad_norm": 4.327341326162078, "language_loss": 0.87578797, "learning_rate": 2.925210265866963e-06, "loss": 0.90366644, "num_input_tokens_seen": 1913220, "step": 94, "time_per_iteration": 2.783581256866455 }, { "auxiliary_loss_clip": 0.01570285, "auxiliary_loss_mlp": 0.01051726, "balance_loss_clip": 1.31970167, "balance_loss_mlp": 1.01376939, "epoch": 0.005711709003457087, "flos": 59812957981440.0, "grad_norm": 1.3608185384271176, "language_loss": 0.68098927, "learning_rate": 2.932023580065507e-06, "loss": 0.70720935, "num_input_tokens_seen": 1970970, "step": 95, "time_per_iteration": 3.1328847408294678 }, { "auxiliary_loss_clip": 0.01612519, "auxiliary_loss_mlp": 0.01150182, "balance_loss_clip": 1.21488237, "balance_loss_mlp": 1.08318627, "epoch": 0.005771832256125056, "flos": 15559591495680.0, "grad_norm": 6.736145376327001, "language_loss": 0.90221369, "learning_rate": 2.9387655493491906e-06, "loss": 0.92984068, "num_input_tokens_seen": 1988930, "step": 96, "time_per_iteration": 2.8015241622924805 }, { "auxiliary_loss_clip": 0.01605814, "auxiliary_loss_mlp": 0.01142022, "balance_loss_clip": 1.21851277, "balance_loss_mlp": 1.08003318, "epoch": 0.005831955508793026, "flos": 22528380015360.0, "grad_norm": 3.8307865500968044, "language_loss": 0.89869905, "learning_rate": 2.9454376524092147e-06, "loss": 0.92617744, "num_input_tokens_seen": 2006285, "step": 97, "time_per_iteration": 4.387299060821533 }, { "auxiliary_loss_clip": 0.01593214, "auxiliary_loss_mlp": 0.01140673, "balance_loss_clip": 1.2102325, "balance_loss_mlp": 1.07200789, "epoch": 0.005892078761460995, "flos": 22049834094720.0, "grad_norm": 2.291581893082518, "language_loss": 0.76274347, "learning_rate": 2.952041322436969e-06, "loss": 0.79008234, "num_input_tokens_seen": 2024905, "step": 98, "time_per_iteration": 2.751507043838501 }, { "auxiliary_loss_clip": 0.01533926, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.29271698, "balance_loss_mlp": 1.00129879, "epoch": 0.005952202014128965, "flos": 68539143317760.0, "grad_norm": 1.0388395506080574, "language_loss": 0.65518898, "learning_rate": 2.9585779489718204e-06, "loss": 0.68089598, "num_input_tokens_seen": 2086220, "step": 99, "time_per_iteration": 3.3125040531158447 }, { "auxiliary_loss_clip": 0.01595694, "auxiliary_loss_mlp": 0.01142556, "balance_loss_clip": 1.21028757, "balance_loss_mlp": 1.07217503, "epoch": 0.006012325266796933, "flos": 22960887678720.0, "grad_norm": 2.051483688350497, "language_loss": 0.90885437, "learning_rate": 2.9650488796560464e-06, "loss": 0.93623686, "num_input_tokens_seen": 2103365, "step": 100, "time_per_iteration": 2.7632548809051514 }, { "auxiliary_loss_clip": 0.01607235, "auxiliary_loss_mlp": 0.01150276, "balance_loss_clip": 1.21294045, "balance_loss_mlp": 1.08394814, "epoch": 0.006072448519464903, "flos": 17347942857600.0, "grad_norm": 2.0181737234491566, "language_loss": 0.91081136, "learning_rate": 2.971455421902446e-06, "loss": 0.9383865, "num_input_tokens_seen": 2121995, "step": 101, "time_per_iteration": 2.7214279174804688 }, { "auxiliary_loss_clip": 0.015938, "auxiliary_loss_mlp": 0.01152009, "balance_loss_clip": 1.21248627, "balance_loss_mlp": 1.08124638, "epoch": 0.006132571772132872, "flos": 24681116897280.0, "grad_norm": 2.076276442041171, "language_loss": 0.90774924, "learning_rate": 2.9777988444798075e-06, "loss": 0.93520737, "num_input_tokens_seen": 2141815, "step": 102, "time_per_iteration": 2.8389108180999756 }, { "auxiliary_loss_clip": 0.01588155, "auxiliary_loss_mlp": 0.01133785, "balance_loss_clip": 1.20914173, "balance_loss_mlp": 1.06912589, "epoch": 0.006192695024800842, "flos": 21465675210240.0, "grad_norm": 2.3272829989328456, "language_loss": 0.88006896, "learning_rate": 2.9840803790210285e-06, "loss": 0.90728837, "num_input_tokens_seen": 2161125, "step": 103, "time_per_iteration": 2.768784761428833 }, { "auxiliary_loss_clip": 0.01588751, "auxiliary_loss_mlp": 0.01136216, "balance_loss_clip": 1.21138883, "balance_loss_mlp": 1.06998372, "epoch": 0.006252818277468811, "flos": 17420410546560.0, "grad_norm": 1.9182889224259552, "language_loss": 0.93644351, "learning_rate": 2.990301221458371e-06, "loss": 0.96369314, "num_input_tokens_seen": 2179510, "step": 104, "time_per_iteration": 2.7109038829803467 }, { "auxiliary_loss_clip": 0.01579421, "auxiliary_loss_mlp": 0.01146524, "balance_loss_clip": 1.20086741, "balance_loss_mlp": 1.08258009, "epoch": 0.006312941530136781, "flos": 19099557584640.0, "grad_norm": 3.0437899698059367, "language_loss": 0.96655375, "learning_rate": 2.9964625333900544e-06, "loss": 0.99381316, "num_input_tokens_seen": 2197870, "step": 105, "time_per_iteration": 2.7254133224487305 }, { "auxiliary_loss_clip": 0.01578331, "auxiliary_loss_mlp": 0.01158544, "balance_loss_clip": 1.20144236, "balance_loss_mlp": 1.08768642, "epoch": 0.006373064782804749, "flos": 24060831909120.0, "grad_norm": 3.1837681777002302, "language_loss": 0.87119448, "learning_rate": 3.002565443382063e-06, "loss": 0.89856327, "num_input_tokens_seen": 2217495, "step": 106, "time_per_iteration": 2.7705447673797607 }, { "auxiliary_loss_clip": 0.01561845, "auxiliary_loss_mlp": 0.01143018, "balance_loss_clip": 1.18746924, "balance_loss_mlp": 1.0751636, "epoch": 0.006433188035472719, "flos": 18332433797760.0, "grad_norm": 2.228856706842439, "language_loss": 0.83398581, "learning_rate": 3.008611048208843e-06, "loss": 0.86103439, "num_input_tokens_seen": 2236520, "step": 107, "time_per_iteration": 2.6885263919830322 }, { "auxiliary_loss_clip": 0.01469631, "auxiliary_loss_mlp": 0.0103327, "balance_loss_clip": 1.25210869, "balance_loss_mlp": 1.00179863, "epoch": 0.006493311288140688, "flos": 62562387594240.0, "grad_norm": 0.9900995959758047, "language_loss": 0.64796811, "learning_rate": 3.014600414036285e-06, "loss": 0.67299712, "num_input_tokens_seen": 2300140, "step": 108, "time_per_iteration": 3.278621196746826 }, { "auxiliary_loss_clip": 0.01552898, "auxiliary_loss_mlp": 0.01132858, "balance_loss_clip": 1.18960094, "balance_loss_mlp": 1.06424141, "epoch": 0.006553434540808658, "flos": 19500141035520.0, "grad_norm": 2.019247660217844, "language_loss": 0.97709465, "learning_rate": 3.0205345775501937e-06, "loss": 1.00395215, "num_input_tokens_seen": 2317320, "step": 109, "time_per_iteration": 2.750502347946167 }, { "auxiliary_loss_clip": 0.01550996, "auxiliary_loss_mlp": 0.01140204, "balance_loss_clip": 1.19136214, "balance_loss_mlp": 1.07430482, "epoch": 0.006613557793476627, "flos": 21105132445440.0, "grad_norm": 1.9540987754213832, "language_loss": 0.84243041, "learning_rate": 3.0264145470332218e-06, "loss": 0.86934245, "num_input_tokens_seen": 2337820, "step": 110, "time_per_iteration": 2.82443904876709 }, { "auxiliary_loss_clip": 0.01544634, "auxiliary_loss_mlp": 0.01151549, "balance_loss_clip": 1.18396342, "balance_loss_mlp": 1.08493507, "epoch": 0.006673681046144597, "flos": 26030747543040.0, "grad_norm": 2.4580319150483563, "language_loss": 0.82940048, "learning_rate": 3.032241303393073e-06, "loss": 0.85636234, "num_input_tokens_seen": 2358560, "step": 111, "time_per_iteration": 2.8308968544006348 }, { "auxiliary_loss_clip": 0.0154596, "auxiliary_loss_mlp": 0.01133366, "balance_loss_clip": 1.18776846, "balance_loss_mlp": 1.06970847, "epoch": 0.006733804298812566, "flos": 23147767163520.0, "grad_norm": 2.356589096997363, "language_loss": 0.93989801, "learning_rate": 3.0380158011446e-06, "loss": 0.9666912, "num_input_tokens_seen": 2379005, "step": 112, "time_per_iteration": 2.8007922172546387 }, { "auxiliary_loss_clip": 0.01549647, "auxiliary_loss_mlp": 0.01136979, "balance_loss_clip": 1.18394601, "balance_loss_mlp": 1.07322621, "epoch": 0.006793927551480535, "flos": 11764444210560.0, "grad_norm": 2.521639841990545, "language_loss": 0.79509294, "learning_rate": 3.0437389693482466e-06, "loss": 0.82195914, "num_input_tokens_seen": 2395610, "step": 113, "time_per_iteration": 2.7599966526031494 }, { "auxiliary_loss_clip": 0.0153736, "auxiliary_loss_mlp": 0.01131524, "balance_loss_clip": 1.18028498, "balance_loss_mlp": 1.06562555, "epoch": 0.006854050804148504, "flos": 19171953446400.0, "grad_norm": 2.343117351168218, "language_loss": 0.93439317, "learning_rate": 3.0494117125071475e-06, "loss": 0.96108204, "num_input_tokens_seen": 2415005, "step": 114, "time_per_iteration": 2.723540782928467 }, { "auxiliary_loss_clip": 0.01544971, "auxiliary_loss_mlp": 0.01138932, "balance_loss_clip": 1.17997146, "balance_loss_mlp": 1.07918465, "epoch": 0.006914174056816474, "flos": 21981891519360.0, "grad_norm": 1.9509019191057126, "language_loss": 0.9463321, "learning_rate": 3.055034911425055e-06, "loss": 0.97317111, "num_input_tokens_seen": 2433965, "step": 115, "time_per_iteration": 2.7077698707580566 }, { "auxiliary_loss_clip": 0.01537699, "auxiliary_loss_mlp": 0.01118178, "balance_loss_clip": 1.17675614, "balance_loss_mlp": 1.05151677, "epoch": 0.006974297309484443, "flos": 16289152634880.0, "grad_norm": 10.363795807176915, "language_loss": 0.82148951, "learning_rate": 3.0606094240271244e-06, "loss": 0.84804827, "num_input_tokens_seen": 2451605, "step": 116, "time_per_iteration": 2.681190013885498 }, { "auxiliary_loss_clip": 0.01528803, "auxiliary_loss_mlp": 0.01126189, "balance_loss_clip": 1.17677391, "balance_loss_mlp": 1.06219721, "epoch": 0.007034420562152413, "flos": 26104005331200.0, "grad_norm": 2.4150591879391627, "language_loss": 0.88368428, "learning_rate": 3.0661360861454656e-06, "loss": 0.91023421, "num_input_tokens_seen": 2472035, "step": 117, "time_per_iteration": 2.776143789291382 }, { "auxiliary_loss_clip": 0.01527909, "auxiliary_loss_mlp": 0.01146127, "balance_loss_clip": 1.17495561, "balance_loss_mlp": 1.08041906, "epoch": 0.007094543814820382, "flos": 14204609723520.0, "grad_norm": 2.3639764059040265, "language_loss": 0.8454417, "learning_rate": 3.071615712271274e-06, "loss": 0.87218207, "num_input_tokens_seen": 2489285, "step": 118, "time_per_iteration": 2.7110469341278076 }, { "auxiliary_loss_clip": 0.01538161, "auxiliary_loss_mlp": 0.01163868, "balance_loss_clip": 1.1759789, "balance_loss_mlp": 1.0984937, "epoch": 0.007154667067488351, "flos": 14976007228800.0, "grad_norm": 2.231843342078736, "language_loss": 0.99319011, "learning_rate": 3.0770490962752172e-06, "loss": 1.02021039, "num_input_tokens_seen": 2506460, "step": 119, "time_per_iteration": 2.674121856689453 }, { "auxiliary_loss_clip": 0.01540018, "auxiliary_loss_mlp": 0.01120611, "balance_loss_clip": 1.17242217, "balance_loss_mlp": 1.05738258, "epoch": 0.00721479032015632, "flos": 20193288762240.0, "grad_norm": 2.7981733983226764, "language_loss": 0.8963809, "learning_rate": 3.082437012097686e-06, "loss": 0.92298722, "num_input_tokens_seen": 2525565, "step": 120, "time_per_iteration": 2.745962381362915 }, { "auxiliary_loss_clip": 0.01524916, "auxiliary_loss_mlp": 0.01129465, "balance_loss_clip": 1.1734432, "balance_loss_mlp": 1.06513989, "epoch": 0.00727491357282429, "flos": 23147228459520.0, "grad_norm": 1.797716104424251, "language_loss": 0.93491542, "learning_rate": 3.0877802144103967e-06, "loss": 0.96145928, "num_input_tokens_seen": 2546605, "step": 121, "time_per_iteration": 2.7924466133117676 }, { "auxiliary_loss_clip": 0.01526294, "auxiliary_loss_mlp": 0.0114832, "balance_loss_clip": 1.17395604, "balance_loss_mlp": 1.08490098, "epoch": 0.007335036825492259, "flos": 15521669712000.0, "grad_norm": 2.3704869501778285, "language_loss": 0.90462255, "learning_rate": 3.09307943925077e-06, "loss": 0.93136871, "num_input_tokens_seen": 2560730, "step": 122, "time_per_iteration": 2.930413246154785 }, { "auxiliary_loss_clip": 0.01521826, "auxiliary_loss_mlp": 0.01146566, "balance_loss_clip": 1.1681807, "balance_loss_mlp": 1.07861674, "epoch": 0.007395160078160229, "flos": 24243365848320.0, "grad_norm": 2.4867163179710037, "language_loss": 0.92660481, "learning_rate": 3.0983354046304154e-06, "loss": 0.95328873, "num_input_tokens_seen": 2579550, "step": 123, "time_per_iteration": 2.7484309673309326 }, { "auxiliary_loss_clip": 0.01519363, "auxiliary_loss_mlp": 0.01127611, "balance_loss_clip": 1.16324139, "balance_loss_mlp": 1.0651449, "epoch": 0.007455283330828198, "flos": 31759792099200.0, "grad_norm": 2.366639004459226, "language_loss": 0.71187961, "learning_rate": 3.103548811118979e-06, "loss": 0.73834932, "num_input_tokens_seen": 2600390, "step": 124, "time_per_iteration": 2.8419976234436035 }, { "auxiliary_loss_clip": 0.01506936, "auxiliary_loss_mlp": 0.01125571, "balance_loss_clip": 1.16464007, "balance_loss_mlp": 1.06167519, "epoch": 0.007515406583496167, "flos": 26615157822720.0, "grad_norm": 2.1632751269766106, "language_loss": 0.88450015, "learning_rate": 3.108720342404542e-06, "loss": 0.91082525, "num_input_tokens_seen": 2620770, "step": 125, "time_per_iteration": 2.823296308517456 }, { "auxiliary_loss_clip": 0.01522239, "auxiliary_loss_mlp": 0.01142214, "balance_loss_clip": 1.16456664, "balance_loss_mlp": 1.07912827, "epoch": 0.007575529836164136, "flos": 18223696350720.0, "grad_norm": 2.6632616920164067, "language_loss": 0.82381976, "learning_rate": 3.1138506658316945e-06, "loss": 0.85046428, "num_input_tokens_seen": 2639900, "step": 126, "time_per_iteration": 2.7325809001922607 }, { "auxiliary_loss_clip": 0.015153, "auxiliary_loss_mlp": 0.01142869, "balance_loss_clip": 1.16330886, "balance_loss_mlp": 1.08088017, "epoch": 0.007635653088832106, "flos": 21580410228480.0, "grad_norm": 3.925284628341409, "language_loss": 0.6743899, "learning_rate": 3.1189404329183404e-06, "loss": 0.7009716, "num_input_tokens_seen": 2657450, "step": 127, "time_per_iteration": 2.709821939468384 }, { "auxiliary_loss_clip": 0.01503057, "auxiliary_loss_mlp": 0.01132416, "balance_loss_clip": 1.165169, "balance_loss_mlp": 1.06861567, "epoch": 0.007695776341500075, "flos": 25375054723200.0, "grad_norm": 2.0535131533503734, "language_loss": 0.8819322, "learning_rate": 3.1239902798522317e-06, "loss": 0.90828693, "num_input_tokens_seen": 2678150, "step": 128, "time_per_iteration": 2.764707565307617 }, { "auxiliary_loss_clip": 0.01505955, "auxiliary_loss_mlp": 0.01144223, "balance_loss_clip": 1.16043079, "balance_loss_mlp": 1.08042252, "epoch": 0.007755899594168045, "flos": 22343906741760.0, "grad_norm": 2.6427711693827005, "language_loss": 0.84719259, "learning_rate": 3.129000827968184e-06, "loss": 0.87369436, "num_input_tokens_seen": 2698290, "step": 129, "time_per_iteration": 2.7472774982452393 }, { "auxiliary_loss_clip": 0.01497871, "auxiliary_loss_mlp": 0.01130211, "balance_loss_clip": 1.15871263, "balance_loss_mlp": 1.06655347, "epoch": 0.007816022846836013, "flos": 22638230784000.0, "grad_norm": 2.366492959329914, "language_loss": 0.97564614, "learning_rate": 3.133972684206866e-06, "loss": 1.00192702, "num_input_tokens_seen": 2717630, "step": 130, "time_per_iteration": 2.6955018043518066 }, { "auxiliary_loss_clip": 0.01492272, "auxiliary_loss_mlp": 0.01134965, "balance_loss_clip": 1.15630865, "balance_loss_mlp": 1.06987715, "epoch": 0.007876146099503984, "flos": 18182901479040.0, "grad_norm": 2.2164470079204572, "language_loss": 0.82658112, "learning_rate": 3.138906441556014e-06, "loss": 0.85285342, "num_input_tokens_seen": 2735835, "step": 131, "time_per_iteration": 2.722247362136841 }, { "auxiliary_loss_clip": 0.01500937, "auxiliary_loss_mlp": 0.01128359, "balance_loss_clip": 1.15885806, "balance_loss_mlp": 1.06694245, "epoch": 0.007936269352171952, "flos": 27119486730240.0, "grad_norm": 2.7663180664822193, "language_loss": 0.82781422, "learning_rate": 3.143802679474861e-06, "loss": 0.85410714, "num_input_tokens_seen": 2756335, "step": 132, "time_per_iteration": 2.7937612533569336 }, { "auxiliary_loss_clip": 0.01491919, "auxiliary_loss_mlp": 0.01128624, "balance_loss_clip": 1.15346444, "balance_loss_mlp": 1.0664922, "epoch": 0.007996392604839923, "flos": 19026335710080.0, "grad_norm": 2.182366740159355, "language_loss": 0.95499313, "learning_rate": 3.1486619643025565e-06, "loss": 0.98119843, "num_input_tokens_seen": 2775090, "step": 133, "time_per_iteration": 2.7380354404449463 }, { "auxiliary_loss_clip": 0.01487746, "auxiliary_loss_mlp": 0.0112871, "balance_loss_clip": 1.16170454, "balance_loss_mlp": 1.06843781, "epoch": 0.008056515857507891, "flos": 25484151306240.0, "grad_norm": 1.8164116645967854, "language_loss": 0.73478442, "learning_rate": 3.153484849651286e-06, "loss": 0.76094896, "num_input_tokens_seen": 2795320, "step": 134, "time_per_iteration": 2.7483408451080322 }, { "auxiliary_loss_clip": 0.01484621, "auxiliary_loss_mlp": 0.01132134, "balance_loss_clip": 1.15115011, "balance_loss_mlp": 1.06695068, "epoch": 0.00811663911017586, "flos": 20557566541440.0, "grad_norm": 5.027018494085059, "language_loss": 0.88792509, "learning_rate": 3.1582718767847806e-06, "loss": 0.91409266, "num_input_tokens_seen": 2812815, "step": 135, "time_per_iteration": 2.6838128566741943 }, { "auxiliary_loss_clip": 0.01487119, "auxiliary_loss_mlp": 0.0113257, "balance_loss_clip": 1.15490174, "balance_loss_mlp": 1.06714821, "epoch": 0.00817676236284383, "flos": 18799738761600.0, "grad_norm": 1.9282722528396903, "language_loss": 0.89138198, "learning_rate": 3.1630235749828485e-06, "loss": 0.91757882, "num_input_tokens_seen": 2830445, "step": 136, "time_per_iteration": 2.726475238800049 }, { "auxiliary_loss_clip": 0.01483417, "auxiliary_loss_mlp": 0.01110724, "balance_loss_clip": 1.1494019, "balance_loss_mlp": 1.05078554, "epoch": 0.008236885615511799, "flos": 23873593288320.0, "grad_norm": 2.2984339413846078, "language_loss": 0.84091324, "learning_rate": 3.1677404618925676e-06, "loss": 0.86685467, "num_input_tokens_seen": 2846965, "step": 137, "time_per_iteration": 7.4708640575408936 }, { "auxiliary_loss_clip": 0.01481848, "auxiliary_loss_mlp": 0.01118837, "balance_loss_clip": 1.1500535, "balance_loss_mlp": 1.05894589, "epoch": 0.00829700886817977, "flos": 24643626076800.0, "grad_norm": 1.69378413504035, "language_loss": 0.9018681, "learning_rate": 3.1724230438666953e-06, "loss": 0.92787492, "num_input_tokens_seen": 2867520, "step": 138, "time_per_iteration": 4.311830520629883 }, { "auxiliary_loss_clip": 0.01469655, "auxiliary_loss_mlp": 0.01123604, "balance_loss_clip": 1.14824438, "balance_loss_mlp": 1.05904007, "epoch": 0.008357132120847738, "flos": 25262007644160.0, "grad_norm": 2.1515203004813785, "language_loss": 0.91478992, "learning_rate": 3.177071816289865e-06, "loss": 0.94072247, "num_input_tokens_seen": 2885675, "step": 139, "time_per_iteration": 2.7678122520446777 }, { "auxiliary_loss_clip": 0.01486799, "auxiliary_loss_mlp": 0.01124947, "balance_loss_clip": 1.15521085, "balance_loss_mlp": 1.06195688, "epoch": 0.008417255373515706, "flos": 27344898529920.0, "grad_norm": 2.305315677890536, "language_loss": 0.85667789, "learning_rate": 3.181687263893095e-06, "loss": 0.88279533, "num_input_tokens_seen": 2905960, "step": 140, "time_per_iteration": 2.8557639122009277 }, { "auxiliary_loss_clip": 0.01473538, "auxiliary_loss_mlp": 0.01122701, "balance_loss_clip": 1.14923954, "balance_loss_mlp": 1.06166625, "epoch": 0.008477378626183677, "flos": 17639070589440.0, "grad_norm": 2.3443620963590455, "language_loss": 0.84346074, "learning_rate": 3.186269861057098e-06, "loss": 0.86942315, "num_input_tokens_seen": 2922780, "step": 141, "time_per_iteration": 2.7656807899475098 }, { "auxiliary_loss_clip": 0.01477141, "auxiliary_loss_mlp": 0.01135217, "balance_loss_clip": 1.14718878, "balance_loss_mlp": 1.07360983, "epoch": 0.008537501878851645, "flos": 13881342297600.0, "grad_norm": 2.29020652115343, "language_loss": 0.8105557, "learning_rate": 3.1908200721048745e-06, "loss": 0.83667928, "num_input_tokens_seen": 2938765, "step": 142, "time_per_iteration": 2.747598171234131 }, { "auxiliary_loss_clip": 0.01378886, "auxiliary_loss_mlp": 0.01060004, "balance_loss_clip": 1.19240355, "balance_loss_mlp": 1.03406358, "epoch": 0.008597625131519616, "flos": 71248101281280.0, "grad_norm": 1.056887207538052, "language_loss": 0.66899812, "learning_rate": 3.195338351584042e-06, "loss": 0.69338703, "num_input_tokens_seen": 3006665, "step": 143, "time_per_iteration": 3.346982002258301 }, { "auxiliary_loss_clip": 0.01467707, "auxiliary_loss_mlp": 0.01123721, "balance_loss_clip": 1.14666772, "balance_loss_mlp": 1.06273365, "epoch": 0.008657748384187584, "flos": 17602836744960.0, "grad_norm": 2.6467048454978523, "language_loss": 0.84356761, "learning_rate": 3.1998251445393258e-06, "loss": 0.86948192, "num_input_tokens_seen": 3024335, "step": 144, "time_per_iteration": 2.762087345123291 }, { "auxiliary_loss_clip": 0.01455701, "auxiliary_loss_mlp": 0.01114511, "balance_loss_clip": 1.14058816, "balance_loss_mlp": 1.05085373, "epoch": 0.008717871636855555, "flos": 19715317459200.0, "grad_norm": 1.8692883316747366, "language_loss": 0.88353741, "learning_rate": 3.204280886775619e-06, "loss": 0.90923953, "num_input_tokens_seen": 3043300, "step": 145, "time_per_iteration": 2.7050039768218994 }, { "auxiliary_loss_clip": 0.01470385, "auxiliary_loss_mlp": 0.01121817, "balance_loss_clip": 1.14247775, "balance_loss_mlp": 1.05873132, "epoch": 0.008777994889523523, "flos": 24717422568960.0, "grad_norm": 1.860830881508538, "language_loss": 0.86182559, "learning_rate": 3.208706005112005e-06, "loss": 0.88774765, "num_input_tokens_seen": 3064610, "step": 146, "time_per_iteration": 2.741013288497925 }, { "auxiliary_loss_clip": 0.01356998, "auxiliary_loss_mlp": 0.01029681, "balance_loss_clip": 1.18072379, "balance_loss_mlp": 1.00431335, "epoch": 0.008838118142191492, "flos": 70132067758080.0, "grad_norm": 0.8598047517885464, "language_loss": 0.60122073, "learning_rate": 3.213100917627104e-06, "loss": 0.6250875, "num_input_tokens_seen": 3130385, "step": 147, "time_per_iteration": 3.27382230758667 }, { "auxiliary_loss_clip": 0.01463009, "auxiliary_loss_mlp": 0.01123472, "balance_loss_clip": 1.14658976, "balance_loss_mlp": 1.06548882, "epoch": 0.008898241394859462, "flos": 20044797937920.0, "grad_norm": 1.8116070485228748, "language_loss": 0.84620225, "learning_rate": 3.2174660338961135e-06, "loss": 0.87206709, "num_input_tokens_seen": 3149760, "step": 148, "time_per_iteration": 2.72910475730896 }, { "auxiliary_loss_clip": 0.01466623, "auxiliary_loss_mlp": 0.01144944, "balance_loss_clip": 1.14777792, "balance_loss_mlp": 1.07985532, "epoch": 0.008958364647527431, "flos": 10743611685120.0, "grad_norm": 2.5530775415688205, "language_loss": 0.88680327, "learning_rate": 3.2218017552198588e-06, "loss": 0.91291893, "num_input_tokens_seen": 3164500, "step": 149, "time_per_iteration": 2.688528537750244 }, { "auxiliary_loss_clip": 0.01463954, "auxiliary_loss_mlp": 0.01114885, "balance_loss_clip": 1.14290714, "balance_loss_mlp": 1.05728304, "epoch": 0.009018487900195401, "flos": 29127467802240.0, "grad_norm": 2.1996557200804823, "language_loss": 0.93269086, "learning_rate": 3.226108474846181e-06, "loss": 0.95847929, "num_input_tokens_seen": 3182455, "step": 150, "time_per_iteration": 2.7901580333709717 }, { "auxiliary_loss_clip": 0.01450819, "auxiliary_loss_mlp": 0.01114571, "balance_loss_clip": 1.13812149, "balance_loss_mlp": 1.05839944, "epoch": 0.00907861115286337, "flos": 32963661354240.0, "grad_norm": 4.690239135210318, "language_loss": 0.7421813, "learning_rate": 3.2303865781839817e-06, "loss": 0.7678352, "num_input_tokens_seen": 3203995, "step": 151, "time_per_iteration": 2.79590106010437 }, { "auxiliary_loss_clip": 0.01463077, "auxiliary_loss_mlp": 0.01128244, "balance_loss_clip": 1.14311624, "balance_loss_mlp": 1.06954527, "epoch": 0.009138734405531338, "flos": 21762441377280.0, "grad_norm": 4.291097242497492, "language_loss": 0.88460332, "learning_rate": 3.234636443010188e-06, "loss": 0.9105165, "num_input_tokens_seen": 3222575, "step": 152, "time_per_iteration": 2.701775550842285 }, { "auxiliary_loss_clip": 0.01462099, "auxiliary_loss_mlp": 0.01122264, "balance_loss_clip": 1.14743185, "balance_loss_mlp": 1.06275451, "epoch": 0.009198857658199309, "flos": 20842517134080.0, "grad_norm": 3.861411936226758, "language_loss": 0.83918798, "learning_rate": 3.238858439669943e-06, "loss": 0.8650316, "num_input_tokens_seen": 3240180, "step": 153, "time_per_iteration": 2.730654716491699 }, { "auxiliary_loss_clip": 0.01453756, "auxiliary_loss_mlp": 0.01136244, "balance_loss_clip": 1.14024806, "balance_loss_mlp": 1.07554269, "epoch": 0.009258980910867277, "flos": 24827381078400.0, "grad_norm": 1.8788427995178905, "language_loss": 0.89924759, "learning_rate": 3.2430529312702712e-06, "loss": 0.92514759, "num_input_tokens_seen": 3259800, "step": 154, "time_per_iteration": 2.8150386810302734 }, { "auxiliary_loss_clip": 0.01457041, "auxiliary_loss_mlp": 0.01148182, "balance_loss_clip": 1.1422174, "balance_loss_mlp": 1.08934021, "epoch": 0.009319104163535248, "flos": 28767786963840.0, "grad_norm": 2.155148564981828, "language_loss": 0.89730597, "learning_rate": 3.2472202738674737e-06, "loss": 0.9233582, "num_input_tokens_seen": 3280400, "step": 155, "time_per_iteration": 2.7780215740203857 }, { "auxiliary_loss_clip": 0.01462257, "auxiliary_loss_mlp": 0.01115972, "balance_loss_clip": 1.14140153, "balance_loss_mlp": 1.0580368, "epoch": 0.009379227416203216, "flos": 16582004219520.0, "grad_norm": 2.6722626388977986, "language_loss": 0.86758631, "learning_rate": 3.2513608166485063e-06, "loss": 0.8933686, "num_input_tokens_seen": 3297600, "step": 156, "time_per_iteration": 2.7195818424224854 }, { "auxiliary_loss_clip": 0.01460326, "auxiliary_loss_mlp": 0.01116019, "balance_loss_clip": 1.14530039, "balance_loss_mlp": 1.05770147, "epoch": 0.009439350668871187, "flos": 18329919845760.0, "grad_norm": 2.3212743339319926, "language_loss": 0.99652225, "learning_rate": 3.2554749021065498e-06, "loss": 1.0222857, "num_input_tokens_seen": 3313635, "step": 157, "time_per_iteration": 2.7530624866485596 }, { "auxiliary_loss_clip": 0.01445494, "auxiliary_loss_mlp": 0.01139991, "balance_loss_clip": 1.14011836, "balance_loss_mlp": 1.08162606, "epoch": 0.009499473921539155, "flos": 24349912565760.0, "grad_norm": 2.2650385025378834, "language_loss": 0.88388717, "learning_rate": 3.2595628662110186e-06, "loss": 0.90974212, "num_input_tokens_seen": 3333735, "step": 158, "time_per_iteration": 2.744640588760376 }, { "auxiliary_loss_clip": 0.01451838, "auxiliary_loss_mlp": 0.01122147, "balance_loss_clip": 1.13977575, "balance_loss_mlp": 1.0630666, "epoch": 0.009559597174207124, "flos": 16399326625920.0, "grad_norm": 2.1807440045696165, "language_loss": 0.86407602, "learning_rate": 3.2636250385721982e-06, "loss": 0.88981581, "num_input_tokens_seen": 3348800, "step": 159, "time_per_iteration": 2.7330005168914795 }, { "auxiliary_loss_clip": 0.01441743, "auxiliary_loss_mlp": 0.01137796, "balance_loss_clip": 1.13474953, "balance_loss_mlp": 1.07752383, "epoch": 0.009619720426875094, "flos": 22856890826880.0, "grad_norm": 1.7296815250329798, "language_loss": 0.86756837, "learning_rate": 3.2676617426007263e-06, "loss": 0.89336377, "num_input_tokens_seen": 3368595, "step": 160, "time_per_iteration": 2.844817876815796 }, { "auxiliary_loss_clip": 0.01447614, "auxiliary_loss_mlp": 0.0112266, "balance_loss_clip": 1.13978457, "balance_loss_mlp": 1.06725168, "epoch": 0.009679843679543063, "flos": 19135001329920.0, "grad_norm": 2.462408333273543, "language_loss": 0.91543746, "learning_rate": 3.2716732956621042e-06, "loss": 0.94114017, "num_input_tokens_seen": 3384975, "step": 161, "time_per_iteration": 2.667666435241699 }, { "auxiliary_loss_clip": 0.01453392, "auxiliary_loss_mlp": 0.01111804, "balance_loss_clip": 1.14104879, "balance_loss_mlp": 1.05610919, "epoch": 0.009739966932211033, "flos": 20302995876480.0, "grad_norm": 1.7914334411859298, "language_loss": 0.91582954, "learning_rate": 3.2756600092264203e-06, "loss": 0.94148147, "num_input_tokens_seen": 3404755, "step": 162, "time_per_iteration": 2.6779961585998535 }, { "auxiliary_loss_clip": 0.0131522, "auxiliary_loss_mlp": 0.01056953, "balance_loss_clip": 1.15019548, "balance_loss_mlp": 1.03358769, "epoch": 0.009800090184879002, "flos": 67034234177280.0, "grad_norm": 1.183297200633083, "language_loss": 0.72292268, "learning_rate": 3.279622189013474e-06, "loss": 0.74664438, "num_input_tokens_seen": 3467210, "step": 163, "time_per_iteration": 3.226755142211914 }, { "auxiliary_loss_clip": 0.01439788, "auxiliary_loss_mlp": 0.01116102, "balance_loss_clip": 1.13873029, "balance_loss_mlp": 1.05921507, "epoch": 0.00986021343754697, "flos": 17164690646400.0, "grad_norm": 3.3372881081540937, "language_loss": 0.84684807, "learning_rate": 3.283560135133457e-06, "loss": 0.87240696, "num_input_tokens_seen": 3483220, "step": 164, "time_per_iteration": 2.768935203552246 }, { "auxiliary_loss_clip": 0.01430933, "auxiliary_loss_mlp": 0.0110117, "balance_loss_clip": 1.13048434, "balance_loss_mlp": 1.04533219, "epoch": 0.00992033669021494, "flos": 17749424148480.0, "grad_norm": 4.079659732294038, "language_loss": 0.89080763, "learning_rate": 3.2874741422233565e-06, "loss": 0.91612864, "num_input_tokens_seen": 3501465, "step": 165, "time_per_iteration": 2.673292875289917 }, { "auxiliary_loss_clip": 0.01433192, "auxiliary_loss_mlp": 0.01128138, "balance_loss_clip": 1.13111067, "balance_loss_mlp": 1.06819916, "epoch": 0.00998045994288291, "flos": 25297164080640.0, "grad_norm": 1.7359539169577796, "language_loss": 0.79931343, "learning_rate": 3.2913644995792465e-06, "loss": 0.82492673, "num_input_tokens_seen": 3520480, "step": 166, "time_per_iteration": 2.762742757797241 }, { "auxiliary_loss_clip": 0.01438026, "auxiliary_loss_mlp": 0.01129718, "balance_loss_clip": 1.13488948, "balance_loss_mlp": 1.07066131, "epoch": 0.01004058319555088, "flos": 32298954220800.0, "grad_norm": 2.3252666324684585, "language_loss": 0.92125285, "learning_rate": 3.2952314912845914e-06, "loss": 0.94693023, "num_input_tokens_seen": 3539570, "step": 167, "time_per_iteration": 2.970964193344116 }, { "auxiliary_loss_clip": 0.01429698, "auxiliary_loss_mlp": 0.01133324, "balance_loss_clip": 1.13294363, "balance_loss_mlp": 1.07734346, "epoch": 0.010100706448218848, "flos": 11319941404800.0, "grad_norm": 13.512238716069085, "language_loss": 0.90781063, "learning_rate": 3.299075396334735e-06, "loss": 0.93344086, "num_input_tokens_seen": 3555465, "step": 168, "time_per_iteration": 2.8039841651916504 }, { "auxiliary_loss_clip": 0.01424367, "auxiliary_loss_mlp": 0.01104795, "balance_loss_clip": 1.12848639, "balance_loss_mlp": 1.04700291, "epoch": 0.010160829700886819, "flos": 29719491765120.0, "grad_norm": 1.6705351130563955, "language_loss": 0.87173021, "learning_rate": 3.3028964887576868e-06, "loss": 0.89702177, "num_input_tokens_seen": 3578970, "step": 169, "time_per_iteration": 2.8215444087982178 }, { "auxiliary_loss_clip": 0.01425902, "auxiliary_loss_mlp": 0.01110538, "balance_loss_clip": 1.13139379, "balance_loss_mlp": 1.05317438, "epoch": 0.010220952953554787, "flos": 20412343854720.0, "grad_norm": 1.7404257397879006, "language_loss": 0.84622329, "learning_rate": 3.306695037731344e-06, "loss": 0.87158769, "num_input_tokens_seen": 3597275, "step": 170, "time_per_iteration": 2.6759181022644043 }, { "auxiliary_loss_clip": 0.0143612, "auxiliary_loss_mlp": 0.01137162, "balance_loss_clip": 1.13149834, "balance_loss_mlp": 1.07874942, "epoch": 0.010281076206222756, "flos": 31285124847360.0, "grad_norm": 2.174517661608974, "language_loss": 0.89936447, "learning_rate": 3.3104713076972827e-06, "loss": 0.92509729, "num_input_tokens_seen": 3618905, "step": 171, "time_per_iteration": 2.800394058227539 }, { "auxiliary_loss_clip": 0.01430673, "auxiliary_loss_mlp": 0.01108779, "balance_loss_clip": 1.1347487, "balance_loss_mlp": 1.05382347, "epoch": 0.010341199458890726, "flos": 21982286568960.0, "grad_norm": 1.938241860949196, "language_loss": 0.88895655, "learning_rate": 3.314225558471224e-06, "loss": 0.91435111, "num_input_tokens_seen": 3639610, "step": 172, "time_per_iteration": 2.755190849304199 }, { "auxiliary_loss_clip": 0.01418638, "auxiliary_loss_mlp": 0.01118471, "balance_loss_clip": 1.12744904, "balance_loss_mlp": 1.06270456, "epoch": 0.010401322711558695, "flos": 30810529422720.0, "grad_norm": 1.7925778946034159, "language_loss": 0.80943549, "learning_rate": 3.317958045350308e-06, "loss": 0.83480656, "num_input_tokens_seen": 3664030, "step": 173, "time_per_iteration": 2.751945734024048 }, { "auxiliary_loss_clip": 0.01429615, "auxiliary_loss_mlp": 0.01107965, "balance_loss_clip": 1.13108575, "balance_loss_mlp": 1.05534625, "epoch": 0.010461445964226665, "flos": 24715124098560.0, "grad_norm": 2.1644843911099216, "language_loss": 0.82763064, "learning_rate": 3.3216690192172596e-06, "loss": 0.85300648, "num_input_tokens_seen": 3683615, "step": 174, "time_per_iteration": 2.676630735397339 }, { "auxiliary_loss_clip": 0.01423443, "auxiliary_loss_mlp": 0.01120976, "balance_loss_clip": 1.12816644, "balance_loss_mlp": 1.06523335, "epoch": 0.010521569216894634, "flos": 27710361457920.0, "grad_norm": 2.331494685324117, "language_loss": 0.72837007, "learning_rate": 3.325358726641591e-06, "loss": 0.75381434, "num_input_tokens_seen": 3704540, "step": 175, "time_per_iteration": 2.6876866817474365 }, { "auxiliary_loss_clip": 0.01425333, "auxiliary_loss_mlp": 0.01127215, "balance_loss_clip": 1.12866652, "balance_loss_mlp": 1.06980324, "epoch": 0.010581692469562603, "flos": 12458346122880.0, "grad_norm": 4.811985773634618, "language_loss": 0.97983754, "learning_rate": 3.329027409977902e-06, "loss": 1.00536299, "num_input_tokens_seen": 3721320, "step": 176, "time_per_iteration": 2.8159937858581543 }, { "auxiliary_loss_clip": 0.0141033, "auxiliary_loss_mlp": 0.01130651, "balance_loss_clip": 1.12546706, "balance_loss_mlp": 1.07738805, "epoch": 0.010641815722230573, "flos": 19427601519360.0, "grad_norm": 2.8326118759658585, "language_loss": 0.76926064, "learning_rate": 3.3326753074614087e-06, "loss": 0.7946704, "num_input_tokens_seen": 3739385, "step": 177, "time_per_iteration": 5.7707555294036865 }, { "auxiliary_loss_clip": 0.01421858, "auxiliary_loss_mlp": 0.01104718, "balance_loss_clip": 1.12455702, "balance_loss_mlp": 1.05002475, "epoch": 0.010701938974898541, "flos": 18332577452160.0, "grad_norm": 2.6517911185675014, "language_loss": 0.76942402, "learning_rate": 3.3363026533007716e-06, "loss": 0.79468977, "num_input_tokens_seen": 3756360, "step": 178, "time_per_iteration": 4.337082386016846 }, { "auxiliary_loss_clip": 0.01430293, "auxiliary_loss_mlp": 0.01109414, "balance_loss_clip": 1.1303575, "balance_loss_mlp": 1.05252683, "epoch": 0.010762062227566512, "flos": 19203985399680.0, "grad_norm": 2.6843360372821925, "language_loss": 0.84022826, "learning_rate": 3.3399096777683303e-06, "loss": 0.86562538, "num_input_tokens_seen": 3773930, "step": 179, "time_per_iteration": 2.6826629638671875 }, { "auxiliary_loss_clip": 0.01418094, "auxiliary_loss_mlp": 0.01108667, "balance_loss_clip": 1.12202275, "balance_loss_mlp": 1.05158973, "epoch": 0.01082218548023448, "flos": 31425427370880.0, "grad_norm": 2.0256655839140083, "language_loss": 0.83674574, "learning_rate": 3.3434966072878213e-06, "loss": 0.86201334, "num_input_tokens_seen": 3793630, "step": 180, "time_per_iteration": 2.7483785152435303 }, { "auxiliary_loss_clip": 0.01421326, "auxiliary_loss_mlp": 0.01120347, "balance_loss_clip": 1.12740374, "balance_loss_mlp": 1.0646286, "epoch": 0.01088230873290245, "flos": 25046436170880.0, "grad_norm": 3.253139118534122, "language_loss": 0.77958715, "learning_rate": 3.3470636645196674e-06, "loss": 0.80500388, "num_input_tokens_seen": 3813610, "step": 181, "time_per_iteration": 2.698941469192505 }, { "auxiliary_loss_clip": 0.01414948, "auxiliary_loss_mlp": 0.01130231, "balance_loss_clip": 1.12188053, "balance_loss_mlp": 1.07577634, "epoch": 0.01094243198557042, "flos": 22893411980160.0, "grad_norm": 2.56637338396407, "language_loss": 0.76438594, "learning_rate": 3.3506110684439156e-06, "loss": 0.78983772, "num_input_tokens_seen": 3831390, "step": 182, "time_per_iteration": 2.6951375007629395 }, { "auxiliary_loss_clip": 0.01412526, "auxiliary_loss_mlp": 0.01126665, "balance_loss_clip": 1.12167537, "balance_loss_mlp": 1.0702554, "epoch": 0.011002555238238388, "flos": 17165049782400.0, "grad_norm": 2.083158831639218, "language_loss": 0.87484097, "learning_rate": 3.3541390344409054e-06, "loss": 0.90023291, "num_input_tokens_seen": 3849705, "step": 183, "time_per_iteration": 2.733753204345703 }, { "auxiliary_loss_clip": 0.01415922, "auxiliary_loss_mlp": 0.01110585, "balance_loss_clip": 1.12529624, "balance_loss_mlp": 1.05922985, "epoch": 0.011062678490906358, "flos": 22310150935680.0, "grad_norm": 3.105080129831269, "language_loss": 0.86911464, "learning_rate": 3.357647774369736e-06, "loss": 0.89437973, "num_input_tokens_seen": 3869230, "step": 184, "time_per_iteration": 2.6783828735351562 }, { "auxiliary_loss_clip": 0.01410648, "auxiliary_loss_mlp": 0.01108321, "balance_loss_clip": 1.12499499, "balance_loss_mlp": 1.05203021, "epoch": 0.011122801743574327, "flos": 24388373053440.0, "grad_norm": 1.8650514063709744, "language_loss": 0.83885491, "learning_rate": 3.3611374966446085e-06, "loss": 0.86404455, "num_input_tokens_seen": 3889735, "step": 185, "time_per_iteration": 2.6863327026367188 }, { "auxiliary_loss_clip": 0.01419384, "auxiliary_loss_mlp": 0.01107812, "balance_loss_clip": 1.12355363, "balance_loss_mlp": 1.04999495, "epoch": 0.011182924996242297, "flos": 18150258994560.0, "grad_norm": 2.8933407749520743, "language_loss": 0.71027243, "learning_rate": 3.3646084063091142e-06, "loss": 0.73554444, "num_input_tokens_seen": 3908855, "step": 186, "time_per_iteration": 2.819805383682251 }, { "auxiliary_loss_clip": 0.01415699, "auxiliary_loss_mlp": 0.01108312, "balance_loss_clip": 1.12262082, "balance_loss_mlp": 1.05574071, "epoch": 0.011243048248910266, "flos": 15486800584320.0, "grad_norm": 2.4244794785226733, "language_loss": 1.01999915, "learning_rate": 3.3680607051085194e-06, "loss": 1.04523933, "num_input_tokens_seen": 3923865, "step": 187, "time_per_iteration": 2.65875506401062 }, { "auxiliary_loss_clip": 0.01404987, "auxiliary_loss_mlp": 0.01107995, "balance_loss_clip": 1.12269068, "balance_loss_mlp": 1.05253887, "epoch": 0.011303171501578235, "flos": 40916868986880.0, "grad_norm": 2.0089158406542524, "language_loss": 0.74998611, "learning_rate": 3.371494591560139e-06, "loss": 0.77511597, "num_input_tokens_seen": 3946870, "step": 188, "time_per_iteration": 2.8631174564361572 }, { "auxiliary_loss_clip": 0.01298557, "auxiliary_loss_mlp": 0.01067058, "balance_loss_clip": 1.14124644, "balance_loss_mlp": 1.04474187, "epoch": 0.011363294754246205, "flos": 66302697790080.0, "grad_norm": 0.7620731385906954, "language_loss": 0.56192517, "learning_rate": 3.3749102610218297e-06, "loss": 0.5855813, "num_input_tokens_seen": 4010005, "step": 189, "time_per_iteration": 3.2704074382781982 }, { "auxiliary_loss_clip": 0.01402206, "auxiliary_loss_mlp": 0.011217, "balance_loss_clip": 1.11730003, "balance_loss_mlp": 1.06662548, "epoch": 0.011423418006914174, "flos": 24900279730560.0, "grad_norm": 2.640219984380571, "language_loss": 0.95085573, "learning_rate": 3.3783079057586833e-06, "loss": 0.97609472, "num_input_tokens_seen": 4029035, "step": 190, "time_per_iteration": 2.6898255348205566 }, { "auxiliary_loss_clip": 0.01405088, "auxiliary_loss_mlp": 0.01103893, "balance_loss_clip": 1.11979234, "balance_loss_mlp": 1.05167961, "epoch": 0.011483541259582144, "flos": 19791879298560.0, "grad_norm": 4.133813113517846, "language_loss": 0.8463847, "learning_rate": 3.3816877150079665e-06, "loss": 0.8714745, "num_input_tokens_seen": 4046995, "step": 191, "time_per_iteration": 2.71589994430542 }, { "auxiliary_loss_clip": 0.01403196, "auxiliary_loss_mlp": 0.01118385, "balance_loss_clip": 1.11570346, "balance_loss_mlp": 1.06624269, "epoch": 0.011543664512250112, "flos": 26176939896960.0, "grad_norm": 2.0065119945705887, "language_loss": 0.91894913, "learning_rate": 3.385049875042367e-06, "loss": 0.94416493, "num_input_tokens_seen": 4065865, "step": 192, "time_per_iteration": 2.775974988937378 }, { "auxiliary_loss_clip": 0.01398496, "auxiliary_loss_mlp": 0.01118924, "balance_loss_clip": 1.11665678, "balance_loss_mlp": 1.06117916, "epoch": 0.011603787764918083, "flos": 23768985905280.0, "grad_norm": 2.10033302347605, "language_loss": 0.86923265, "learning_rate": 3.3883945692315938e-06, "loss": 0.89440691, "num_input_tokens_seen": 4085305, "step": 193, "time_per_iteration": 2.792947292327881 }, { "auxiliary_loss_clip": 0.01402535, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.11514282, "balance_loss_mlp": 1.05061066, "epoch": 0.011663911017586051, "flos": 25954688494080.0, "grad_norm": 2.2253165290939076, "language_loss": 0.92296255, "learning_rate": 3.3917219781023906e-06, "loss": 0.94801068, "num_input_tokens_seen": 4105185, "step": 194, "time_per_iteration": 2.6886558532714844 }, { "auxiliary_loss_clip": 0.01407209, "auxiliary_loss_mlp": 0.01108641, "balance_loss_clip": 1.11930478, "balance_loss_mlp": 1.05630851, "epoch": 0.01172403427025402, "flos": 17895149625600.0, "grad_norm": 2.4241235245311503, "language_loss": 0.89768875, "learning_rate": 3.3950322793970014e-06, "loss": 0.92284721, "num_input_tokens_seen": 4123160, "step": 195, "time_per_iteration": 2.654517889022827 }, { "auxiliary_loss_clip": 0.01400339, "auxiliary_loss_mlp": 0.01114485, "balance_loss_clip": 1.11779022, "balance_loss_mlp": 1.05981565, "epoch": 0.01178415752292199, "flos": 17894539094400.0, "grad_norm": 3.1130999341447385, "language_loss": 0.86019921, "learning_rate": 3.3983256481301445e-06, "loss": 0.88534749, "num_input_tokens_seen": 4140425, "step": 196, "time_per_iteration": 2.643598794937134 }, { "auxiliary_loss_clip": 0.01398067, "auxiliary_loss_mlp": 0.01107082, "balance_loss_clip": 1.11464977, "balance_loss_mlp": 1.05308056, "epoch": 0.011844280775589959, "flos": 22893555634560.0, "grad_norm": 3.666533247373141, "language_loss": 0.93052697, "learning_rate": 3.4016022566445335e-06, "loss": 0.95557845, "num_input_tokens_seen": 4159555, "step": 197, "time_per_iteration": 2.7120354175567627 }, { "auxiliary_loss_clip": 0.01396424, "auxiliary_loss_mlp": 0.01112388, "balance_loss_clip": 1.11625624, "balance_loss_mlp": 1.05943501, "epoch": 0.01190440402825793, "flos": 26980333441920.0, "grad_norm": 1.9614954763997827, "language_loss": 0.79043806, "learning_rate": 3.4048622746649966e-06, "loss": 0.81552619, "num_input_tokens_seen": 4180480, "step": 198, "time_per_iteration": 2.774059772491455 }, { "auxiliary_loss_clip": 0.0139305, "auxiliary_loss_mlp": 0.01120527, "balance_loss_clip": 1.11708748, "balance_loss_mlp": 1.06821764, "epoch": 0.011964527280925898, "flos": 20521584092160.0, "grad_norm": 1.8823459083646328, "language_loss": 0.88239717, "learning_rate": 3.4081058693512278e-06, "loss": 0.90753293, "num_input_tokens_seen": 4198835, "step": 199, "time_per_iteration": 2.6808881759643555 }, { "auxiliary_loss_clip": 0.01403709, "auxiliary_loss_mlp": 0.0112899, "balance_loss_clip": 1.11951399, "balance_loss_mlp": 1.07200766, "epoch": 0.012024650533593867, "flos": 27745984771200.0, "grad_norm": 2.0663906916258497, "language_loss": 0.81151628, "learning_rate": 3.411333205349222e-06, "loss": 0.83684325, "num_input_tokens_seen": 4219335, "step": 200, "time_per_iteration": 2.625380516052246 }, { "auxiliary_loss_clip": 0.0140201, "auxiliary_loss_mlp": 0.01104413, "balance_loss_clip": 1.11633158, "balance_loss_mlp": 1.05048287, "epoch": 0.012084773786261837, "flos": 10452017076480.0, "grad_norm": 2.253120238884594, "language_loss": 0.87696433, "learning_rate": 3.4145444448414217e-06, "loss": 0.90202856, "num_input_tokens_seen": 4236940, "step": 201, "time_per_iteration": 2.6062326431274414 }, { "auxiliary_loss_clip": 0.01399494, "auxiliary_loss_mlp": 0.01115643, "balance_loss_clip": 1.11764228, "balance_loss_mlp": 1.0614028, "epoch": 0.012144897038929806, "flos": 23105751229440.0, "grad_norm": 2.088192664231089, "language_loss": 0.84052485, "learning_rate": 3.4177397475956223e-06, "loss": 0.86567622, "num_input_tokens_seen": 4256755, "step": 202, "time_per_iteration": 2.6981592178344727 }, { "auxiliary_loss_clip": 0.01388741, "auxiliary_loss_mlp": 0.0111019, "balance_loss_clip": 1.11006808, "balance_loss_mlp": 1.05771446, "epoch": 0.012205020291597776, "flos": 21033203460480.0, "grad_norm": 1.7861279575653157, "language_loss": 0.89964712, "learning_rate": 3.4209192710126685e-06, "loss": 0.92463642, "num_input_tokens_seen": 4276505, "step": 203, "time_per_iteration": 2.668757438659668 }, { "auxiliary_loss_clip": 0.01276289, "auxiliary_loss_mlp": 0.01095021, "balance_loss_clip": 1.12578154, "balance_loss_mlp": 1.07470798, "epoch": 0.012265143544265745, "flos": 68447785075200.0, "grad_norm": 1.0265297625980543, "language_loss": 0.61255801, "learning_rate": 3.4240831701729837e-06, "loss": 0.63627112, "num_input_tokens_seen": 4330965, "step": 204, "time_per_iteration": 3.161599636077881 }, { "auxiliary_loss_clip": 0.01396271, "auxiliary_loss_mlp": 0.01111806, "balance_loss_clip": 1.11291122, "balance_loss_mlp": 1.05930579, "epoch": 0.012325266796933715, "flos": 17019252478080.0, "grad_norm": 2.3248674300118184, "language_loss": 0.91324663, "learning_rate": 3.4272315978819516e-06, "loss": 0.93832743, "num_input_tokens_seen": 4348200, "step": 205, "time_per_iteration": 2.6764047145843506 }, { "auxiliary_loss_clip": 0.01404558, "auxiliary_loss_mlp": 0.0112167, "balance_loss_clip": 1.11773109, "balance_loss_mlp": 1.06773925, "epoch": 0.012385390049601683, "flos": 20190056538240.0, "grad_norm": 2.1088315130515207, "language_loss": 0.89305568, "learning_rate": 3.4303647047142043e-06, "loss": 0.91831797, "num_input_tokens_seen": 4365460, "step": 206, "time_per_iteration": 2.7157227993011475 }, { "auxiliary_loss_clip": 0.0139534, "auxiliary_loss_mlp": 0.01100957, "balance_loss_clip": 1.11176991, "balance_loss_mlp": 1.04888678, "epoch": 0.012445513302269652, "flos": 16253134272000.0, "grad_norm": 2.399816031687551, "language_loss": 0.95542914, "learning_rate": 3.43348263905683e-06, "loss": 0.9803921, "num_input_tokens_seen": 4383650, "step": 207, "time_per_iteration": 2.611348867416382 }, { "auxiliary_loss_clip": 0.01393005, "auxiliary_loss_mlp": 0.01117764, "balance_loss_clip": 1.11658561, "balance_loss_mlp": 1.06497812, "epoch": 0.012505636554937622, "flos": 23769380954880.0, "grad_norm": 1.8144323603981871, "language_loss": 0.75985783, "learning_rate": 3.436585547151547e-06, "loss": 0.78496552, "num_input_tokens_seen": 4403765, "step": 208, "time_per_iteration": 2.7184154987335205 }, { "auxiliary_loss_clip": 0.0138146, "auxiliary_loss_mlp": 0.01108623, "balance_loss_clip": 1.11071992, "balance_loss_mlp": 1.05576587, "epoch": 0.012565759807605591, "flos": 30591546157440.0, "grad_norm": 2.2326965650696855, "language_loss": 0.98386943, "learning_rate": 3.4396735731358586e-06, "loss": 1.00877023, "num_input_tokens_seen": 4421935, "step": 209, "time_per_iteration": 2.7354249954223633 }, { "auxiliary_loss_clip": 0.01387012, "auxiliary_loss_mlp": 0.0111836, "balance_loss_clip": 1.11136842, "balance_loss_mlp": 1.06490695, "epoch": 0.012625883060273561, "flos": 40113511355520.0, "grad_norm": 9.084733304650118, "language_loss": 0.85514843, "learning_rate": 3.4427468590832302e-06, "loss": 0.88020217, "num_input_tokens_seen": 4441470, "step": 210, "time_per_iteration": 2.888749122619629 }, { "auxiliary_loss_clip": 0.01384384, "auxiliary_loss_mlp": 0.01121559, "balance_loss_clip": 1.11018038, "balance_loss_mlp": 1.07115781, "epoch": 0.01268600631294153, "flos": 27089178629760.0, "grad_norm": 3.431917100192063, "language_loss": 0.97194636, "learning_rate": 3.445805545042314e-06, "loss": 0.99700582, "num_input_tokens_seen": 4459950, "step": 211, "time_per_iteration": 2.7465193271636963 }, { "auxiliary_loss_clip": 0.01393556, "auxiliary_loss_mlp": 0.01123542, "balance_loss_clip": 1.11511767, "balance_loss_mlp": 1.06999326, "epoch": 0.012746129565609499, "flos": 16982767238400.0, "grad_norm": 2.3992368053115163, "language_loss": 0.9508543, "learning_rate": 3.448849769075239e-06, "loss": 0.97602528, "num_input_tokens_seen": 4478390, "step": 212, "time_per_iteration": 2.6340651512145996 }, { "auxiliary_loss_clip": 0.01381697, "auxiliary_loss_mlp": 0.01116386, "balance_loss_clip": 1.112149, "balance_loss_mlp": 1.06381512, "epoch": 0.012806252818277469, "flos": 46533476995200.0, "grad_norm": 1.701444843398511, "language_loss": 0.76078421, "learning_rate": 3.4518796672950093e-06, "loss": 0.78576505, "num_input_tokens_seen": 4501665, "step": 213, "time_per_iteration": 2.9250640869140625 }, { "auxiliary_loss_clip": 0.01385821, "auxiliary_loss_mlp": 0.01111776, "balance_loss_clip": 1.11002433, "balance_loss_mlp": 1.06056333, "epoch": 0.012866376070945438, "flos": 14388616120320.0, "grad_norm": 3.5300370267625922, "language_loss": 0.86698866, "learning_rate": 3.4548953739020187e-06, "loss": 0.89196461, "num_input_tokens_seen": 4519055, "step": 214, "time_per_iteration": 2.645289659500122 }, { "auxiliary_loss_clip": 0.01383455, "auxiliary_loss_mlp": 0.01128262, "balance_loss_clip": 1.1159339, "balance_loss_mlp": 1.07359219, "epoch": 0.012926499323613408, "flos": 26140813793280.0, "grad_norm": 2.14433888305053, "language_loss": 0.77582061, "learning_rate": 3.4578970212197196e-06, "loss": 0.80093777, "num_input_tokens_seen": 4540870, "step": 215, "time_per_iteration": 2.7315175533294678 }, { "auxiliary_loss_clip": 0.01391951, "auxiliary_loss_mlp": 0.01115104, "balance_loss_clip": 1.11440635, "balance_loss_mlp": 1.0638206, "epoch": 0.012986622576281377, "flos": 30117202128000.0, "grad_norm": 2.2964706747038233, "language_loss": 0.90423942, "learning_rate": 3.460884739729461e-06, "loss": 0.92930996, "num_input_tokens_seen": 4560395, "step": 216, "time_per_iteration": 2.724698781967163 }, { "auxiliary_loss_clip": 0.01384729, "auxiliary_loss_mlp": 0.01113374, "balance_loss_clip": 1.10847259, "balance_loss_mlp": 1.06096959, "epoch": 0.013046745828949347, "flos": 13954025468160.0, "grad_norm": 3.60062834696173, "language_loss": 0.93473232, "learning_rate": 3.463858658104523e-06, "loss": 0.95971346, "num_input_tokens_seen": 4575785, "step": 217, "time_per_iteration": 5.762276649475098 }, { "auxiliary_loss_clip": 0.01377712, "auxiliary_loss_mlp": 0.0110874, "balance_loss_clip": 1.10726643, "balance_loss_mlp": 1.05433273, "epoch": 0.013106869081617315, "flos": 17347835116800.0, "grad_norm": 1.943339896357513, "language_loss": 0.93811166, "learning_rate": 3.4668189032433696e-06, "loss": 0.96297616, "num_input_tokens_seen": 4594985, "step": 218, "time_per_iteration": 5.832701206207275 }, { "auxiliary_loss_clip": 0.01372884, "auxiliary_loss_mlp": 0.01106717, "balance_loss_clip": 1.10647273, "balance_loss_mlp": 1.05552888, "epoch": 0.013166992334285284, "flos": 25884914325120.0, "grad_norm": 2.252873600345955, "language_loss": 0.86196327, "learning_rate": 3.46976560030214e-06, "loss": 0.88675928, "num_input_tokens_seen": 4616125, "step": 219, "time_per_iteration": 2.794581651687622 }, { "auxiliary_loss_clip": 0.0137885, "auxiliary_loss_mlp": 0.01102953, "balance_loss_clip": 1.10957599, "balance_loss_mlp": 1.05188394, "epoch": 0.013227115586953254, "flos": 31175956437120.0, "grad_norm": 1.897987121161891, "language_loss": 0.8748548, "learning_rate": 3.4726988727263976e-06, "loss": 0.89967287, "num_input_tokens_seen": 4637795, "step": 220, "time_per_iteration": 2.799927234649658 }, { "auxiliary_loss_clip": 0.01370688, "auxiliary_loss_mlp": 0.01115596, "balance_loss_clip": 1.10440111, "balance_loss_mlp": 1.0679127, "epoch": 0.013287238839621223, "flos": 20409470766720.0, "grad_norm": 3.2557072980071795, "language_loss": 0.86437249, "learning_rate": 3.475618842282164e-06, "loss": 0.88923532, "num_input_tokens_seen": 4656835, "step": 221, "time_per_iteration": 2.7040672302246094 }, { "auxiliary_loss_clip": 0.01376134, "auxiliary_loss_mlp": 0.01116397, "balance_loss_clip": 1.10384834, "balance_loss_mlp": 1.0637064, "epoch": 0.013347362092289193, "flos": 14137134024960.0, "grad_norm": 2.585706849100757, "language_loss": 0.92369294, "learning_rate": 3.4785256290862486e-06, "loss": 0.94861829, "num_input_tokens_seen": 4673015, "step": 222, "time_per_iteration": 2.6648194789886475 }, { "auxiliary_loss_clip": 0.01373283, "auxiliary_loss_mlp": 0.01106423, "balance_loss_clip": 1.10636806, "balance_loss_mlp": 1.05156267, "epoch": 0.013407485344957162, "flos": 21797705554560.0, "grad_norm": 7.739608779999776, "language_loss": 0.95708215, "learning_rate": 3.481419351635897e-06, "loss": 0.98187923, "num_input_tokens_seen": 4692355, "step": 223, "time_per_iteration": 2.7261807918548584 }, { "auxiliary_loss_clip": 0.01374555, "auxiliary_loss_mlp": 0.0110963, "balance_loss_clip": 1.10768425, "balance_loss_mlp": 1.05870414, "epoch": 0.013467608597625132, "flos": 18621622195200.0, "grad_norm": 2.673591615227502, "language_loss": 0.88031876, "learning_rate": 3.484300126837776e-06, "loss": 0.90516055, "num_input_tokens_seen": 4710080, "step": 224, "time_per_iteration": 2.601686477661133 }, { "auxiliary_loss_clip": 0.01374533, "auxiliary_loss_mlp": 0.01103, "balance_loss_clip": 1.10679817, "balance_loss_mlp": 1.04804444, "epoch": 0.013527731850293101, "flos": 18552314903040.0, "grad_norm": 3.0722216996453535, "language_loss": 0.89625597, "learning_rate": 3.487168070036317e-06, "loss": 0.9210313, "num_input_tokens_seen": 4728980, "step": 225, "time_per_iteration": 2.6677513122558594 }, { "auxiliary_loss_clip": 0.01369955, "auxiliary_loss_mlp": 0.0112021, "balance_loss_clip": 1.10561275, "balance_loss_mlp": 1.06675696, "epoch": 0.01358785510296107, "flos": 19165381257600.0, "grad_norm": 1.9576206039109396, "language_loss": 0.98980033, "learning_rate": 3.4900232950414224e-06, "loss": 1.01470196, "num_input_tokens_seen": 4747020, "step": 226, "time_per_iteration": 2.8320930004119873 }, { "auxiliary_loss_clip": 0.01375268, "auxiliary_loss_mlp": 0.01110039, "balance_loss_clip": 1.10837173, "balance_loss_mlp": 1.05572701, "epoch": 0.01364797835562904, "flos": 23329941966720.0, "grad_norm": 2.3303410550109245, "language_loss": 0.90965348, "learning_rate": 3.4928659141555727e-06, "loss": 0.93450654, "num_input_tokens_seen": 4765000, "step": 227, "time_per_iteration": 2.648606061935425 }, { "auxiliary_loss_clip": 0.01255161, "auxiliary_loss_mlp": 0.01079249, "balance_loss_clip": 1.11229861, "balance_loss_mlp": 1.06017554, "epoch": 0.013708101608297009, "flos": 70993746097920.0, "grad_norm": 0.9472069433514878, "language_loss": 0.57650995, "learning_rate": 3.4956960382003234e-06, "loss": 0.59985405, "num_input_tokens_seen": 4833210, "step": 228, "time_per_iteration": 3.246328592300415 }, { "auxiliary_loss_clip": 0.01366835, "auxiliary_loss_mlp": 0.01117377, "balance_loss_clip": 1.10507822, "balance_loss_mlp": 1.06711841, "epoch": 0.013768224860964979, "flos": 16325170997760.0, "grad_norm": 2.957038430634678, "language_loss": 0.87773621, "learning_rate": 3.4985137765422354e-06, "loss": 0.90257835, "num_input_tokens_seen": 4850120, "step": 229, "time_per_iteration": 2.6319024562835693 }, { "auxiliary_loss_clip": 0.01375278, "auxiliary_loss_mlp": 0.01098609, "balance_loss_clip": 1.10567176, "balance_loss_mlp": 1.04873204, "epoch": 0.013828348113632948, "flos": 20193037367040.0, "grad_norm": 4.72663824849547, "language_loss": 0.83937395, "learning_rate": 3.501319237118231e-06, "loss": 0.86411285, "num_input_tokens_seen": 4866215, "step": 230, "time_per_iteration": 2.7026398181915283 }, { "auxiliary_loss_clip": 0.01373544, "auxiliary_loss_mlp": 0.01113683, "balance_loss_clip": 1.10701275, "balance_loss_mlp": 1.06361556, "epoch": 0.013888471366300916, "flos": 20741070147840.0, "grad_norm": 2.2562202151287867, "language_loss": 0.904212, "learning_rate": 3.5041125264604056e-06, "loss": 0.9290843, "num_input_tokens_seen": 4885630, "step": 231, "time_per_iteration": 2.6424474716186523 }, { "auxiliary_loss_clip": 0.01377759, "auxiliary_loss_mlp": 0.01110232, "balance_loss_clip": 1.11118639, "balance_loss_mlp": 1.06030726, "epoch": 0.013948594618968886, "flos": 22090628966400.0, "grad_norm": 2.0229562700819215, "language_loss": 0.83624899, "learning_rate": 3.5068937497203002e-06, "loss": 0.86112887, "num_input_tokens_seen": 4905570, "step": 232, "time_per_iteration": 2.621704339981079 }, { "auxiliary_loss_clip": 0.01377798, "auxiliary_loss_mlp": 0.01094369, "balance_loss_clip": 1.10229027, "balance_loss_mlp": 1.04253721, "epoch": 0.014008717871636855, "flos": 19063108258560.0, "grad_norm": 5.516695444379509, "language_loss": 0.74727643, "learning_rate": 3.509663010692652e-06, "loss": 0.77199805, "num_input_tokens_seen": 4923535, "step": 233, "time_per_iteration": 2.659188747406006 }, { "auxiliary_loss_clip": 0.01382744, "auxiliary_loss_mlp": 0.01125121, "balance_loss_clip": 1.1099937, "balance_loss_mlp": 1.0723356, "epoch": 0.014068841124304825, "flos": 14530822064640.0, "grad_norm": 2.5763093382937483, "language_loss": 0.85633421, "learning_rate": 3.512420411838642e-06, "loss": 0.88141286, "num_input_tokens_seen": 4939200, "step": 234, "time_per_iteration": 2.610635757446289 }, { "auxiliary_loss_clip": 0.01374562, "auxiliary_loss_mlp": 0.01114672, "balance_loss_clip": 1.10890436, "balance_loss_mlp": 1.06467605, "epoch": 0.014128964376972794, "flos": 18077396256000.0, "grad_norm": 2.467487286445388, "language_loss": 0.89192498, "learning_rate": 3.515166054308634e-06, "loss": 0.91681731, "num_input_tokens_seen": 4956620, "step": 235, "time_per_iteration": 2.668769359588623 }, { "auxiliary_loss_clip": 0.01373018, "auxiliary_loss_mlp": 0.01131641, "balance_loss_clip": 1.11011076, "balance_loss_mlp": 1.08073914, "epoch": 0.014189087629640764, "flos": 25334331678720.0, "grad_norm": 2.143165146200321, "language_loss": 0.85535377, "learning_rate": 3.5179000379644498e-06, "loss": 0.88040036, "num_input_tokens_seen": 4975650, "step": 236, "time_per_iteration": 2.7570323944091797 }, { "auxiliary_loss_clip": 0.01369632, "auxiliary_loss_mlp": 0.01100269, "balance_loss_clip": 1.10296702, "balance_loss_mlp": 1.04905629, "epoch": 0.014249210882308733, "flos": 36139744713600.0, "grad_norm": 2.1351980688483136, "language_loss": 0.82550979, "learning_rate": 3.520622461401154e-06, "loss": 0.85020876, "num_input_tokens_seen": 4997415, "step": 237, "time_per_iteration": 2.811617374420166 }, { "auxiliary_loss_clip": 0.01369728, "auxiliary_loss_mlp": 0.01124352, "balance_loss_clip": 1.10659075, "balance_loss_mlp": 1.07085085, "epoch": 0.014309334134976702, "flos": 12932977461120.0, "grad_norm": 2.0241581748099313, "language_loss": 0.77096599, "learning_rate": 3.5233334219683935e-06, "loss": 0.79590684, "num_input_tokens_seen": 5013905, "step": 238, "time_per_iteration": 2.8044662475585938 }, { "auxiliary_loss_clip": 0.01367496, "auxiliary_loss_mlp": 0.01111406, "balance_loss_clip": 1.10897434, "balance_loss_mlp": 1.06343579, "epoch": 0.014369457387644672, "flos": 20777519473920.0, "grad_norm": 1.8300428555870456, "language_loss": 0.8707583, "learning_rate": 3.526033015791284e-06, "loss": 0.89554727, "num_input_tokens_seen": 5033645, "step": 239, "time_per_iteration": 2.681452751159668 }, { "auxiliary_loss_clip": 0.01353036, "auxiliary_loss_mlp": 0.01103184, "balance_loss_clip": 1.10036874, "balance_loss_mlp": 1.05516672, "epoch": 0.01442958064031264, "flos": 25848536826240.0, "grad_norm": 2.109315431148974, "language_loss": 0.93055749, "learning_rate": 3.528721337790862e-06, "loss": 0.95511973, "num_input_tokens_seen": 5052875, "step": 240, "time_per_iteration": 2.679826021194458 }, { "auxiliary_loss_clip": 0.01360794, "auxiliary_loss_mlp": 0.01103084, "balance_loss_clip": 1.10475957, "balance_loss_mlp": 1.05611515, "epoch": 0.014489703892980611, "flos": 28219718269440.0, "grad_norm": 3.7136133710916575, "language_loss": 0.8482846, "learning_rate": 3.531398481704111e-06, "loss": 0.87292337, "num_input_tokens_seen": 5075005, "step": 241, "time_per_iteration": 2.679126262664795 }, { "auxiliary_loss_clip": 0.01359518, "auxiliary_loss_mlp": 0.01119602, "balance_loss_clip": 1.11010456, "balance_loss_mlp": 1.06931913, "epoch": 0.01454982714564858, "flos": 22490925108480.0, "grad_norm": 1.8502491938168453, "language_loss": 0.88590866, "learning_rate": 3.534064540103573e-06, "loss": 0.9106999, "num_input_tokens_seen": 5091875, "step": 242, "time_per_iteration": 2.7366583347320557 }, { "auxiliary_loss_clip": 0.01359534, "auxiliary_loss_mlp": 0.01104713, "balance_loss_clip": 1.10356677, "balance_loss_mlp": 1.05342889, "epoch": 0.014609950398316548, "flos": 21653201139840.0, "grad_norm": 2.261458758817042, "language_loss": 0.86688942, "learning_rate": 3.536719604416555e-06, "loss": 0.89153194, "num_input_tokens_seen": 5111290, "step": 243, "time_per_iteration": 2.764378070831299 }, { "auxiliary_loss_clip": 0.01364897, "auxiliary_loss_mlp": 0.01106776, "balance_loss_clip": 1.10636568, "balance_loss_mlp": 1.05656552, "epoch": 0.014670073650984519, "flos": 21869993675520.0, "grad_norm": 1.6964959858678799, "language_loss": 0.84256208, "learning_rate": 3.5393637649439464e-06, "loss": 0.86727887, "num_input_tokens_seen": 5132265, "step": 244, "time_per_iteration": 2.630441188812256 }, { "auxiliary_loss_clip": 0.01372266, "auxiliary_loss_mlp": 0.01115072, "balance_loss_clip": 1.10771632, "balance_loss_mlp": 1.06328762, "epoch": 0.014730196903652487, "flos": 23183713699200.0, "grad_norm": 8.49550264430495, "language_loss": 0.78613877, "learning_rate": 3.54199711087864e-06, "loss": 0.81101215, "num_input_tokens_seen": 5148575, "step": 245, "time_per_iteration": 2.6991443634033203 }, { "auxiliary_loss_clip": 0.01371598, "auxiliary_loss_mlp": 0.0110404, "balance_loss_clip": 1.10405719, "balance_loss_mlp": 1.05008554, "epoch": 0.014790320156320457, "flos": 23222605150080.0, "grad_norm": 2.2582939339926305, "language_loss": 0.84165329, "learning_rate": 3.5446197303235913e-06, "loss": 0.86640966, "num_input_tokens_seen": 5170415, "step": 246, "time_per_iteration": 2.726743221282959 }, { "auxiliary_loss_clip": 0.01365538, "auxiliary_loss_mlp": 0.01101456, "balance_loss_clip": 1.10242295, "balance_loss_mlp": 1.05062532, "epoch": 0.014850443408988426, "flos": 15815490963840.0, "grad_norm": 1.9870849133800452, "language_loss": 0.89958012, "learning_rate": 3.5472317103095034e-06, "loss": 0.92425001, "num_input_tokens_seen": 5188565, "step": 247, "time_per_iteration": 2.5998406410217285 }, { "auxiliary_loss_clip": 0.01364581, "auxiliary_loss_mlp": 0.01098108, "balance_loss_clip": 1.09896278, "balance_loss_mlp": 1.0489223, "epoch": 0.014910566661656396, "flos": 22781657790720.0, "grad_norm": 2.0527635487774343, "language_loss": 0.783005, "learning_rate": 3.549833136812155e-06, "loss": 0.80763197, "num_input_tokens_seen": 5207810, "step": 248, "time_per_iteration": 2.689784049987793 }, { "auxiliary_loss_clip": 0.01365896, "auxiliary_loss_mlp": 0.01110511, "balance_loss_clip": 1.10732806, "balance_loss_mlp": 1.06044269, "epoch": 0.014970689914324365, "flos": 26865023806080.0, "grad_norm": 1.9405946352322343, "language_loss": 0.83855766, "learning_rate": 3.552424094769381e-06, "loss": 0.86332172, "num_input_tokens_seen": 5226210, "step": 249, "time_per_iteration": 2.8210339546203613 }, { "auxiliary_loss_clip": 0.01358179, "auxiliary_loss_mlp": 0.01106801, "balance_loss_clip": 1.10089588, "balance_loss_mlp": 1.05802023, "epoch": 0.015030813166992334, "flos": 13985662371840.0, "grad_norm": 2.0689026358419786, "language_loss": 0.93631709, "learning_rate": 3.5550046680977174e-06, "loss": 0.96096689, "num_input_tokens_seen": 5241660, "step": 250, "time_per_iteration": 2.7074570655822754 }, { "auxiliary_loss_clip": 0.01368183, "auxiliary_loss_mlp": 0.01115393, "balance_loss_clip": 1.1065619, "balance_loss_mlp": 1.06415713, "epoch": 0.015090936419660304, "flos": 24717817618560.0, "grad_norm": 2.6509740932573127, "language_loss": 0.9678722, "learning_rate": 3.5575749397087034e-06, "loss": 0.99270797, "num_input_tokens_seen": 5261090, "step": 251, "time_per_iteration": 2.6740176677703857 }, { "auxiliary_loss_clip": 0.01361249, "auxiliary_loss_mlp": 0.01108489, "balance_loss_clip": 1.10063529, "balance_loss_mlp": 1.0597558, "epoch": 0.015151059672328273, "flos": 25738793798400.0, "grad_norm": 1.996044018630987, "language_loss": 0.84516245, "learning_rate": 3.5601349915248707e-06, "loss": 0.86985981, "num_input_tokens_seen": 5279175, "step": 252, "time_per_iteration": 2.7198123931884766 }, { "auxiliary_loss_clip": 0.01356789, "auxiliary_loss_mlp": 0.0111346, "balance_loss_clip": 1.1023767, "balance_loss_mlp": 1.06346345, "epoch": 0.015211182924996243, "flos": 21871214737920.0, "grad_norm": 2.3132428526475275, "language_loss": 0.98516917, "learning_rate": 3.5626849044954064e-06, "loss": 1.0098716, "num_input_tokens_seen": 5296975, "step": 253, "time_per_iteration": 2.6751561164855957 }, { "auxiliary_loss_clip": 0.01244193, "auxiliary_loss_mlp": 0.01100072, "balance_loss_clip": 1.1058414, "balance_loss_mlp": 1.08338308, "epoch": 0.015271306177664212, "flos": 66895080888960.0, "grad_norm": 0.8719135194962525, "language_loss": 0.55628473, "learning_rate": 3.5652247586115167e-06, "loss": 0.57972741, "num_input_tokens_seen": 5358375, "step": 254, "time_per_iteration": 3.2305996417999268 }, { "auxiliary_loss_clip": 0.0136146, "auxiliary_loss_mlp": 0.01119692, "balance_loss_clip": 1.0985806, "balance_loss_mlp": 1.06952846, "epoch": 0.01533142943033218, "flos": 26834069260800.0, "grad_norm": 2.113472843461701, "language_loss": 0.90234184, "learning_rate": 3.567754632921479e-06, "loss": 0.92715329, "num_input_tokens_seen": 5377255, "step": 255, "time_per_iteration": 2.7138473987579346 }, { "auxiliary_loss_clip": 0.01357311, "auxiliary_loss_mlp": 0.01137867, "balance_loss_clip": 1.1001389, "balance_loss_mlp": 1.08803785, "epoch": 0.01539155268300015, "flos": 20813753318400.0, "grad_norm": 2.320838285045027, "language_loss": 0.85392761, "learning_rate": 3.5702746055454075e-06, "loss": 0.87887937, "num_input_tokens_seen": 5395320, "step": 256, "time_per_iteration": 2.7135775089263916 }, { "auxiliary_loss_clip": 0.01363873, "auxiliary_loss_mlp": 0.0112257, "balance_loss_clip": 1.10053098, "balance_loss_mlp": 1.07281172, "epoch": 0.01545167593566812, "flos": 15961862885760.0, "grad_norm": 4.480294478847577, "language_loss": 0.71472508, "learning_rate": 3.5727847536897254e-06, "loss": 0.73958945, "num_input_tokens_seen": 5411970, "step": 257, "time_per_iteration": 6.340675592422485 }, { "auxiliary_loss_clip": 0.01355912, "auxiliary_loss_mlp": 0.01112611, "balance_loss_clip": 1.10014856, "balance_loss_mlp": 1.06280565, "epoch": 0.01551179918833609, "flos": 22601745544320.0, "grad_norm": 2.0292888191897673, "language_loss": 0.94713151, "learning_rate": 3.5752851536613596e-06, "loss": 0.97181678, "num_input_tokens_seen": 5430245, "step": 258, "time_per_iteration": 5.674164772033691 }, { "auxiliary_loss_clip": 0.01356656, "auxiliary_loss_mlp": 0.01113313, "balance_loss_clip": 1.09867072, "balance_loss_mlp": 1.0645566, "epoch": 0.015571922441004058, "flos": 22816706486400.0, "grad_norm": 2.3215886633849236, "language_loss": 0.93037683, "learning_rate": 3.577775880881658e-06, "loss": 0.95507646, "num_input_tokens_seen": 5448905, "step": 259, "time_per_iteration": 2.6286497116088867 }, { "auxiliary_loss_clip": 0.01348977, "auxiliary_loss_mlp": 0.01102171, "balance_loss_clip": 1.10076857, "balance_loss_mlp": 1.05625176, "epoch": 0.015632045693672027, "flos": 18947439486720.0, "grad_norm": 1.9575053933526474, "language_loss": 0.97368109, "learning_rate": 3.5802570099000424e-06, "loss": 0.99819261, "num_input_tokens_seen": 5466405, "step": 260, "time_per_iteration": 2.625072717666626 }, { "auxiliary_loss_clip": 0.01362999, "auxiliary_loss_mlp": 0.01127943, "balance_loss_clip": 1.1010474, "balance_loss_mlp": 1.07940137, "epoch": 0.015692168946339995, "flos": 29971728046080.0, "grad_norm": 2.2828802632863305, "language_loss": 0.87807435, "learning_rate": 3.5827286144073947e-06, "loss": 0.90298378, "num_input_tokens_seen": 5487055, "step": 261, "time_per_iteration": 2.6737279891967773 }, { "auxiliary_loss_clip": 0.01357008, "auxiliary_loss_mlp": 0.01125312, "balance_loss_clip": 1.09822345, "balance_loss_mlp": 1.07665133, "epoch": 0.015752292199007967, "flos": 19392085946880.0, "grad_norm": 5.057676675675106, "language_loss": 0.67100549, "learning_rate": 3.5851907672491904e-06, "loss": 0.69582868, "num_input_tokens_seen": 5506600, "step": 262, "time_per_iteration": 2.651690721511841 }, { "auxiliary_loss_clip": 0.01353953, "auxiliary_loss_mlp": 0.01135541, "balance_loss_clip": 1.09924924, "balance_loss_mlp": 1.08499634, "epoch": 0.015812415451675936, "flos": 20339804338560.0, "grad_norm": 3.0820356667611337, "language_loss": 0.68077701, "learning_rate": 3.587643540438383e-06, "loss": 0.70567191, "num_input_tokens_seen": 5524350, "step": 263, "time_per_iteration": 2.6885130405426025 }, { "auxiliary_loss_clip": 0.01355592, "auxiliary_loss_mlp": 0.01116799, "balance_loss_clip": 1.09620881, "balance_loss_mlp": 1.06766081, "epoch": 0.015872538704343905, "flos": 17525412979200.0, "grad_norm": 3.9089218881424674, "language_loss": 0.85002583, "learning_rate": 3.590087005168037e-06, "loss": 0.87474978, "num_input_tokens_seen": 5542145, "step": 264, "time_per_iteration": 2.6557912826538086 }, { "auxiliary_loss_clip": 0.01360388, "auxiliary_loss_mlp": 0.01102763, "balance_loss_clip": 1.10088885, "balance_loss_mlp": 1.056319, "epoch": 0.015932661957011873, "flos": 15260490944640.0, "grad_norm": 2.7020928553211476, "language_loss": 1.04234743, "learning_rate": 3.5925212318237344e-06, "loss": 1.06697881, "num_input_tokens_seen": 5557920, "step": 265, "time_per_iteration": 2.6262216567993164 }, { "auxiliary_loss_clip": 0.01364512, "auxiliary_loss_mlp": 0.01120309, "balance_loss_clip": 1.1033864, "balance_loss_mlp": 1.06835794, "epoch": 0.015992785209679845, "flos": 20302528999680.0, "grad_norm": 3.1220748516520134, "language_loss": 0.74914098, "learning_rate": 3.5949462899957323e-06, "loss": 0.7739892, "num_input_tokens_seen": 5576290, "step": 266, "time_per_iteration": 2.6244583129882812 }, { "auxiliary_loss_clip": 0.01349738, "auxiliary_loss_mlp": 0.0111189, "balance_loss_clip": 1.1000762, "balance_loss_mlp": 1.06206095, "epoch": 0.016052908462347814, "flos": 23362368969600.0, "grad_norm": 1.8166776194063956, "language_loss": 0.90909529, "learning_rate": 3.5973622484909068e-06, "loss": 0.93371153, "num_input_tokens_seen": 5595205, "step": 267, "time_per_iteration": 2.6753580570220947 }, { "auxiliary_loss_clip": 0.01359091, "auxiliary_loss_mlp": 0.01115968, "balance_loss_clip": 1.10122573, "balance_loss_mlp": 1.06797481, "epoch": 0.016113031715015783, "flos": 21286588976640.0, "grad_norm": 2.450608875877181, "language_loss": 0.85636413, "learning_rate": 3.599769175344462e-06, "loss": 0.88111478, "num_input_tokens_seen": 5612645, "step": 268, "time_per_iteration": 2.7161567211151123 }, { "auxiliary_loss_clip": 0.01351132, "auxiliary_loss_mlp": 0.01102276, "balance_loss_clip": 1.10226274, "balance_loss_mlp": 1.05475891, "epoch": 0.01617315496768375, "flos": 18914689261440.0, "grad_norm": 2.1714201716772457, "language_loss": 0.88080788, "learning_rate": 3.602167137831432e-06, "loss": 0.90534198, "num_input_tokens_seen": 5628345, "step": 269, "time_per_iteration": 2.6403756141662598 }, { "auxiliary_loss_clip": 0.01357907, "auxiliary_loss_mlp": 0.01111574, "balance_loss_clip": 1.10001528, "balance_loss_mlp": 1.06021833, "epoch": 0.01623327822035172, "flos": 16546488647040.0, "grad_norm": 2.5848702107942803, "language_loss": 0.97077739, "learning_rate": 3.6045562024779565e-06, "loss": 0.99547219, "num_input_tokens_seen": 5645940, "step": 270, "time_per_iteration": 2.635546922683716 }, { "auxiliary_loss_clip": 0.01356007, "auxiliary_loss_mlp": 0.01118132, "balance_loss_clip": 1.10402, "balance_loss_mlp": 1.06918478, "epoch": 0.016293401473019692, "flos": 23513481486720.0, "grad_norm": 2.1115750591463223, "language_loss": 0.86112005, "learning_rate": 3.606936435072361e-06, "loss": 0.8858614, "num_input_tokens_seen": 5665690, "step": 271, "time_per_iteration": 2.6877286434173584 }, { "auxiliary_loss_clip": 0.013537, "auxiliary_loss_mlp": 0.01105687, "balance_loss_clip": 1.0962286, "balance_loss_mlp": 1.057693, "epoch": 0.01635352472568766, "flos": 29016072748800.0, "grad_norm": 2.5391912683658413, "language_loss": 0.81550127, "learning_rate": 3.609307900676025e-06, "loss": 0.84009504, "num_input_tokens_seen": 5683190, "step": 272, "time_per_iteration": 2.6728365421295166 }, { "auxiliary_loss_clip": 0.01348527, "auxiliary_loss_mlp": 0.01120864, "balance_loss_clip": 1.09806561, "balance_loss_mlp": 1.07368064, "epoch": 0.01641364797835563, "flos": 13370513028480.0, "grad_norm": 2.3613573538590487, "language_loss": 0.81075382, "learning_rate": 3.611670663634051e-06, "loss": 0.83544779, "num_input_tokens_seen": 5699780, "step": 273, "time_per_iteration": 2.595008134841919 }, { "auxiliary_loss_clip": 0.01346135, "auxiliary_loss_mlp": 0.01105539, "balance_loss_clip": 1.09398317, "balance_loss_mlp": 1.05749762, "epoch": 0.016473771231023598, "flos": 18878239935360.0, "grad_norm": 2.1979313648400547, "language_loss": 0.9131726, "learning_rate": 3.614024787585744e-06, "loss": 0.9376893, "num_input_tokens_seen": 5716980, "step": 274, "time_per_iteration": 2.684718132019043 }, { "auxiliary_loss_clip": 0.013432, "auxiliary_loss_mlp": 0.01108715, "balance_loss_clip": 1.09515727, "balance_loss_mlp": 1.06062579, "epoch": 0.016533894483691566, "flos": 22601637803520.0, "grad_norm": 1.9719932168994616, "language_loss": 0.88054645, "learning_rate": 3.6163703354748927e-06, "loss": 0.90506566, "num_input_tokens_seen": 5737780, "step": 275, "time_per_iteration": 2.7204532623291016 }, { "auxiliary_loss_clip": 0.01346726, "auxiliary_loss_mlp": 0.01102856, "balance_loss_clip": 1.09623361, "balance_loss_mlp": 1.05312169, "epoch": 0.01659401773635954, "flos": 21507188353920.0, "grad_norm": 1.7930545784536995, "language_loss": 0.80726624, "learning_rate": 3.6187073695598707e-06, "loss": 0.83176208, "num_input_tokens_seen": 5758330, "step": 276, "time_per_iteration": 3.04716157913208 }, { "auxiliary_loss_clip": 0.0133817, "auxiliary_loss_mlp": 0.01096103, "balance_loss_clip": 1.09588337, "balance_loss_mlp": 1.05220985, "epoch": 0.016654140989027507, "flos": 32850973411200.0, "grad_norm": 1.9196343116615175, "language_loss": 0.80707026, "learning_rate": 3.621035951423551e-06, "loss": 0.83141291, "num_input_tokens_seen": 5778340, "step": 277, "time_per_iteration": 2.809645652770996 }, { "auxiliary_loss_clip": 0.01337061, "auxiliary_loss_mlp": 0.0109637, "balance_loss_clip": 1.08979487, "balance_loss_mlp": 1.04923487, "epoch": 0.016714264241695476, "flos": 12306228024960.0, "grad_norm": 2.3224792061881185, "language_loss": 0.80508065, "learning_rate": 3.623356141983041e-06, "loss": 0.82941496, "num_input_tokens_seen": 5794295, "step": 278, "time_per_iteration": 2.604830741882324 }, { "auxiliary_loss_clip": 0.01341116, "auxiliary_loss_mlp": 0.01101968, "balance_loss_clip": 1.09395671, "balance_loss_mlp": 1.05585837, "epoch": 0.016774387494363444, "flos": 27123796362240.0, "grad_norm": 2.0021377353660057, "language_loss": 0.90582991, "learning_rate": 3.6256680014992486e-06, "loss": 0.93026078, "num_input_tokens_seen": 5814405, "step": 279, "time_per_iteration": 2.7193243503570557 }, { "auxiliary_loss_clip": 0.01346095, "auxiliary_loss_mlp": 0.01112065, "balance_loss_clip": 1.09383631, "balance_loss_mlp": 1.06450009, "epoch": 0.016834510747031413, "flos": 20191493082240.0, "grad_norm": 2.9314445951013988, "language_loss": 0.94049025, "learning_rate": 3.6279715895862713e-06, "loss": 0.96507192, "num_input_tokens_seen": 5832795, "step": 280, "time_per_iteration": 2.680924654006958 }, { "auxiliary_loss_clip": 0.01346658, "auxiliary_loss_mlp": 0.01109166, "balance_loss_clip": 1.09285879, "balance_loss_mlp": 1.06060064, "epoch": 0.016894633999699385, "flos": 27274262434560.0, "grad_norm": 2.6758913403282483, "language_loss": 0.74425459, "learning_rate": 3.6302669652206183e-06, "loss": 0.76881289, "num_input_tokens_seen": 5855750, "step": 281, "time_per_iteration": 2.691152811050415 }, { "auxiliary_loss_clip": 0.01343371, "auxiliary_loss_mlp": 0.01117708, "balance_loss_clip": 1.09609079, "balance_loss_mlp": 1.0724318, "epoch": 0.016954757252367354, "flos": 14902964922240.0, "grad_norm": 3.4878028680462005, "language_loss": 0.80255079, "learning_rate": 3.632554186750274e-06, "loss": 0.82716167, "num_input_tokens_seen": 5872610, "step": 282, "time_per_iteration": 2.592664957046509 }, { "auxiliary_loss_clip": 0.01348082, "auxiliary_loss_mlp": 0.01118449, "balance_loss_clip": 1.09700727, "balance_loss_mlp": 1.07114697, "epoch": 0.017014880505035322, "flos": 21358805270400.0, "grad_norm": 2.296781711700251, "language_loss": 0.77719986, "learning_rate": 3.6348333119035937e-06, "loss": 0.80186516, "num_input_tokens_seen": 5892985, "step": 283, "time_per_iteration": 2.6502227783203125 }, { "auxiliary_loss_clip": 0.01347311, "auxiliary_loss_mlp": 0.01092934, "balance_loss_clip": 1.0977478, "balance_loss_mlp": 1.04804015, "epoch": 0.01707500375770329, "flos": 35333154858240.0, "grad_norm": 2.3467060832193414, "language_loss": 0.84246969, "learning_rate": 3.6371043977980503e-06, "loss": 0.86687213, "num_input_tokens_seen": 5914060, "step": 284, "time_per_iteration": 2.8534958362579346 }, { "auxiliary_loss_clip": 0.01337962, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.09212708, "balance_loss_mlp": 1.05297756, "epoch": 0.01713512701037126, "flos": 23582070506880.0, "grad_norm": 2.7335752956200388, "language_loss": 0.96998906, "learning_rate": 3.639367500948819e-06, "loss": 0.99437273, "num_input_tokens_seen": 5932860, "step": 285, "time_per_iteration": 2.6338655948638916 }, { "auxiliary_loss_clip": 0.01341319, "auxiliary_loss_mlp": 0.01095606, "balance_loss_clip": 1.09538078, "balance_loss_mlp": 1.05123687, "epoch": 0.01719525026303923, "flos": 27634661544960.0, "grad_norm": 2.294843469150046, "language_loss": 0.94079655, "learning_rate": 3.6416226772772178e-06, "loss": 0.96516573, "num_input_tokens_seen": 5952725, "step": 286, "time_per_iteration": 2.711087942123413 }, { "auxiliary_loss_clip": 0.01332862, "auxiliary_loss_mlp": 0.0109035, "balance_loss_clip": 1.08986938, "balance_loss_mlp": 1.04409683, "epoch": 0.0172553735157072, "flos": 26979722910720.0, "grad_norm": 1.9277896882465477, "language_loss": 0.92464817, "learning_rate": 3.643869982119001e-06, "loss": 0.94888031, "num_input_tokens_seen": 5970560, "step": 287, "time_per_iteration": 2.640267848968506 }, { "auxiliary_loss_clip": 0.01338192, "auxiliary_loss_mlp": 0.01092315, "balance_loss_clip": 1.09039164, "balance_loss_mlp": 1.04651475, "epoch": 0.01731549676837517, "flos": 14056621689600.0, "grad_norm": 2.7883535936791035, "language_loss": 1.01873291, "learning_rate": 3.646109470232502e-06, "loss": 1.04303789, "num_input_tokens_seen": 5982980, "step": 288, "time_per_iteration": 2.558312177658081 }, { "auxiliary_loss_clip": 0.01225082, "auxiliary_loss_mlp": 0.01188305, "balance_loss_clip": 1.09194219, "balance_loss_mlp": 1.17228377, "epoch": 0.017375620021043137, "flos": 66510694471680.0, "grad_norm": 0.9289960013542303, "language_loss": 0.63867617, "learning_rate": 3.6483411958066417e-06, "loss": 0.66281009, "num_input_tokens_seen": 6049445, "step": 289, "time_per_iteration": 3.386254072189331 }, { "auxiliary_loss_clip": 0.01341215, "auxiliary_loss_mlp": 0.01107788, "balance_loss_clip": 1.09622383, "balance_loss_mlp": 1.06482446, "epoch": 0.01743574327371111, "flos": 15225154940160.0, "grad_norm": 2.368974734045724, "language_loss": 0.88156199, "learning_rate": 3.6505652124687957e-06, "loss": 0.90605205, "num_input_tokens_seen": 6064150, "step": 290, "time_per_iteration": 2.5670948028564453 }, { "auxiliary_loss_clip": 0.0133848, "auxiliary_loss_mlp": 0.010946, "balance_loss_clip": 1.09388971, "balance_loss_mlp": 1.04965782, "epoch": 0.017495866526379078, "flos": 25373869574400.0, "grad_norm": 2.2011772664145504, "language_loss": 0.84472585, "learning_rate": 3.6527815732925258e-06, "loss": 0.8690567, "num_input_tokens_seen": 6083920, "step": 291, "time_per_iteration": 2.648452043533325 }, { "auxiliary_loss_clip": 0.01343563, "auxiliary_loss_mlp": 0.01115116, "balance_loss_clip": 1.10129941, "balance_loss_mlp": 1.06607366, "epoch": 0.017555989779047047, "flos": 26359473836160.0, "grad_norm": 1.7675259544479762, "language_loss": 0.72679955, "learning_rate": 3.6549903308051806e-06, "loss": 0.75138628, "num_input_tokens_seen": 6105460, "step": 292, "time_per_iteration": 2.7239537239074707 }, { "auxiliary_loss_clip": 0.01334066, "auxiliary_loss_mlp": 0.01107289, "balance_loss_clip": 1.09397244, "balance_loss_mlp": 1.06170392, "epoch": 0.017616113031715015, "flos": 22338807010560.0, "grad_norm": 2.419616990787406, "language_loss": 0.86866581, "learning_rate": 3.6571915369953646e-06, "loss": 0.89307928, "num_input_tokens_seen": 6122890, "step": 293, "time_per_iteration": 2.642854690551758 }, { "auxiliary_loss_clip": 0.01333726, "auxiliary_loss_mlp": 0.0110557, "balance_loss_clip": 1.09271646, "balance_loss_mlp": 1.06086659, "epoch": 0.017676236284382984, "flos": 20156911263360.0, "grad_norm": 2.112624444766753, "language_loss": 0.80896151, "learning_rate": 3.6593852433202797e-06, "loss": 0.83335447, "num_input_tokens_seen": 6142890, "step": 294, "time_per_iteration": 2.598176956176758 }, { "auxiliary_loss_clip": 0.01334179, "auxiliary_loss_mlp": 0.01113433, "balance_loss_clip": 1.09030747, "balance_loss_mlp": 1.06892014, "epoch": 0.017736359537050956, "flos": 25223331674880.0, "grad_norm": 2.8289841764142416, "language_loss": 0.83806521, "learning_rate": 3.6615715007129453e-06, "loss": 0.86254132, "num_input_tokens_seen": 6162030, "step": 295, "time_per_iteration": 2.750103712081909 }, { "auxiliary_loss_clip": 0.01339845, "auxiliary_loss_mlp": 0.01121984, "balance_loss_clip": 1.09978509, "balance_loss_mlp": 1.0772326, "epoch": 0.017796482789718925, "flos": 20338798757760.0, "grad_norm": 1.8804378237246864, "language_loss": 0.84576106, "learning_rate": 3.6637503595892897e-06, "loss": 0.87037927, "num_input_tokens_seen": 6180540, "step": 296, "time_per_iteration": 4.154251337051392 }, { "auxiliary_loss_clip": 0.01337678, "auxiliary_loss_mlp": 0.01105295, "balance_loss_clip": 1.09463406, "balance_loss_mlp": 1.06154561, "epoch": 0.017856606042386893, "flos": 22379206832640.0, "grad_norm": 2.055710812588959, "language_loss": 0.87810111, "learning_rate": 3.665921869855132e-06, "loss": 0.90253091, "num_input_tokens_seen": 6199425, "step": 297, "time_per_iteration": 4.379676103591919 }, { "auxiliary_loss_clip": 0.0133717, "auxiliary_loss_mlp": 0.01103766, "balance_loss_clip": 1.09343684, "balance_loss_mlp": 1.06004047, "epoch": 0.017916729295054862, "flos": 20230061310720.0, "grad_norm": 2.689351030321763, "language_loss": 0.88947791, "learning_rate": 3.6680860809130346e-06, "loss": 0.91388726, "num_input_tokens_seen": 6219170, "step": 298, "time_per_iteration": 4.1055779457092285 }, { "auxiliary_loss_clip": 0.01333843, "auxiliary_loss_mlp": 0.01121179, "balance_loss_clip": 1.09470236, "balance_loss_mlp": 1.07499719, "epoch": 0.01797685254772283, "flos": 19390972625280.0, "grad_norm": 1.8935027270905305, "language_loss": 0.88550889, "learning_rate": 3.6702430416690516e-06, "loss": 0.91005915, "num_input_tokens_seen": 6237930, "step": 299, "time_per_iteration": 2.611168622970581 }, { "auxiliary_loss_clip": 0.0133938, "auxiliary_loss_mlp": 0.0110718, "balance_loss_clip": 1.09468794, "balance_loss_mlp": 1.06130886, "epoch": 0.018036975800390802, "flos": 24426007528320.0, "grad_norm": 4.075580609786654, "language_loss": 0.64664406, "learning_rate": 3.672392800539357e-06, "loss": 0.67110968, "num_input_tokens_seen": 6257170, "step": 300, "time_per_iteration": 2.645603656768799 }, { "auxiliary_loss_clip": 0.01338559, "auxiliary_loss_mlp": 0.01111665, "balance_loss_clip": 1.09775913, "balance_loss_mlp": 1.06636548, "epoch": 0.01809709905305877, "flos": 15778933896960.0, "grad_norm": 2.5071418214687515, "language_loss": 0.87940675, "learning_rate": 3.6745354054567686e-06, "loss": 0.90390897, "num_input_tokens_seen": 6274780, "step": 301, "time_per_iteration": 2.6035923957824707 }, { "auxiliary_loss_clip": 0.01238361, "auxiliary_loss_mlp": 0.01073699, "balance_loss_clip": 1.1100142, "balance_loss_mlp": 1.05901265, "epoch": 0.01815722230572674, "flos": 67348382526720.0, "grad_norm": 0.8350739260664176, "language_loss": 0.62219667, "learning_rate": 3.676670903877158e-06, "loss": 0.64531732, "num_input_tokens_seen": 6340435, "step": 302, "time_per_iteration": 3.3307297229766846 }, { "auxiliary_loss_clip": 0.0132981, "auxiliary_loss_mlp": 0.01110918, "balance_loss_clip": 1.0910126, "balance_loss_mlp": 1.06507051, "epoch": 0.01821734555839471, "flos": 15485615435520.0, "grad_norm": 2.115144575016314, "language_loss": 0.89737153, "learning_rate": 3.6787993427857567e-06, "loss": 0.9217788, "num_input_tokens_seen": 6358160, "step": 303, "time_per_iteration": 2.6773293018341064 }, { "auxiliary_loss_clip": 0.01335628, "auxiliary_loss_mlp": 0.01118481, "balance_loss_clip": 1.09579217, "balance_loss_mlp": 1.07237101, "epoch": 0.018277468811062677, "flos": 24097424889600.0, "grad_norm": 1.8670669350935472, "language_loss": 0.80417514, "learning_rate": 3.680920768703364e-06, "loss": 0.82871628, "num_input_tokens_seen": 6378485, "step": 304, "time_per_iteration": 2.691347360610962 }, { "auxiliary_loss_clip": 0.01330802, "auxiliary_loss_mlp": 0.01091671, "balance_loss_clip": 1.09832263, "balance_loss_mlp": 1.04858923, "epoch": 0.01833759206373065, "flos": 20959335141120.0, "grad_norm": 1.6863564291935742, "language_loss": 0.82761526, "learning_rate": 3.6830352276924415e-06, "loss": 0.85184002, "num_input_tokens_seen": 6397845, "step": 305, "time_per_iteration": 2.6883981227874756 }, { "auxiliary_loss_clip": 0.01330759, "auxiliary_loss_mlp": 0.01093908, "balance_loss_clip": 1.09012437, "balance_loss_mlp": 1.05115986, "epoch": 0.018397715316398618, "flos": 19390757143680.0, "grad_norm": 2.1780708917523297, "language_loss": 0.91148543, "learning_rate": 3.685142765363119e-06, "loss": 0.93573213, "num_input_tokens_seen": 6416475, "step": 306, "time_per_iteration": 2.6465187072753906 }, { "auxiliary_loss_clip": 0.01324743, "auxiliary_loss_mlp": 0.01091696, "balance_loss_clip": 1.08900762, "balance_loss_mlp": 1.04882836, "epoch": 0.018457838569066586, "flos": 29132531619840.0, "grad_norm": 3.4680205003751072, "language_loss": 0.86581063, "learning_rate": 3.687243426879095e-06, "loss": 0.88997507, "num_input_tokens_seen": 6437520, "step": 307, "time_per_iteration": 2.7787318229675293 }, { "auxiliary_loss_clip": 0.01326572, "auxiliary_loss_mlp": 0.01110018, "balance_loss_clip": 1.09346747, "balance_loss_mlp": 1.06247783, "epoch": 0.018517961821734555, "flos": 19208654167680.0, "grad_norm": 2.413130156754219, "language_loss": 0.71650648, "learning_rate": 3.6893372569634466e-06, "loss": 0.74087244, "num_input_tokens_seen": 6455680, "step": 308, "time_per_iteration": 2.652973175048828 }, { "auxiliary_loss_clip": 0.01331912, "auxiliary_loss_mlp": 0.01102766, "balance_loss_clip": 1.09061241, "balance_loss_mlp": 1.05911207, "epoch": 0.018578085074402523, "flos": 19863018184320.0, "grad_norm": 2.1869498369051077, "language_loss": 0.91841364, "learning_rate": 3.6914242999043395e-06, "loss": 0.94276047, "num_input_tokens_seen": 6474880, "step": 309, "time_per_iteration": 2.6613030433654785 }, { "auxiliary_loss_clip": 0.01339178, "auxiliary_loss_mlp": 0.01096668, "balance_loss_clip": 1.09145641, "balance_loss_mlp": 1.05084395, "epoch": 0.018638208327070496, "flos": 29606947476480.0, "grad_norm": 2.0400456475786353, "language_loss": 0.72784412, "learning_rate": 3.69350459956065e-06, "loss": 0.75220263, "num_input_tokens_seen": 6495945, "step": 310, "time_per_iteration": 2.705345392227173 }, { "auxiliary_loss_clip": 0.01331019, "auxiliary_loss_mlp": 0.01113021, "balance_loss_clip": 1.09560525, "balance_loss_mlp": 1.06922317, "epoch": 0.018698331579738464, "flos": 45731555907840.0, "grad_norm": 2.1345597100799645, "language_loss": 0.74162471, "learning_rate": 3.695578199367497e-06, "loss": 0.76606506, "num_input_tokens_seen": 6519930, "step": 311, "time_per_iteration": 2.846503496170044 }, { "auxiliary_loss_clip": 0.01338389, "auxiliary_loss_mlp": 0.01104203, "balance_loss_clip": 1.09206033, "balance_loss_mlp": 1.0609777, "epoch": 0.018758454832406433, "flos": 20483662308480.0, "grad_norm": 3.713635021153945, "language_loss": 0.91668129, "learning_rate": 3.6976451423416825e-06, "loss": 0.94110715, "num_input_tokens_seen": 6535070, "step": 312, "time_per_iteration": 2.598400592803955 }, { "auxiliary_loss_clip": 0.01339145, "auxiliary_loss_mlp": 0.01116197, "balance_loss_clip": 1.09512305, "balance_loss_mlp": 1.07034922, "epoch": 0.0188185780850744, "flos": 15777784661760.0, "grad_norm": 4.5530066286460045, "language_loss": 0.89634913, "learning_rate": 3.699705471087043e-06, "loss": 0.92090249, "num_input_tokens_seen": 6554135, "step": 313, "time_per_iteration": 2.6944596767425537 }, { "auxiliary_loss_clip": 0.01340962, "auxiliary_loss_mlp": 0.0109941, "balance_loss_clip": 1.09381938, "balance_loss_mlp": 1.05430174, "epoch": 0.018878701337742373, "flos": 22455732758400.0, "grad_norm": 2.3990870717118455, "language_loss": 0.7335974, "learning_rate": 3.7017592277997256e-06, "loss": 0.75800109, "num_input_tokens_seen": 6572275, "step": 314, "time_per_iteration": 2.6550133228302 }, { "auxiliary_loss_clip": 0.01329658, "auxiliary_loss_mlp": 0.01105546, "balance_loss_clip": 1.09075165, "balance_loss_mlp": 1.06246412, "epoch": 0.018938824590410342, "flos": 30993530238720.0, "grad_norm": 5.81191681220521, "language_loss": 0.89890182, "learning_rate": 3.7038064542733654e-06, "loss": 0.92325383, "num_input_tokens_seen": 6594520, "step": 315, "time_per_iteration": 2.7121222019195557 }, { "auxiliary_loss_clip": 0.0133262, "auxiliary_loss_mlp": 0.01096177, "balance_loss_clip": 1.09287357, "balance_loss_mlp": 1.05209303, "epoch": 0.01899894784307831, "flos": 23258910821760.0, "grad_norm": 2.446494284682687, "language_loss": 0.80517328, "learning_rate": 3.7058471919041945e-06, "loss": 0.82946122, "num_input_tokens_seen": 6614245, "step": 316, "time_per_iteration": 2.640573501586914 }, { "auxiliary_loss_clip": 0.01326654, "auxiliary_loss_mlp": 0.01094904, "balance_loss_clip": 1.09036672, "balance_loss_mlp": 1.05046248, "epoch": 0.01905907109574628, "flos": 17457901367040.0, "grad_norm": 2.3705495670370524, "language_loss": 0.90161496, "learning_rate": 3.7078814816960605e-06, "loss": 0.92583054, "num_input_tokens_seen": 6632015, "step": 317, "time_per_iteration": 2.594388246536255 }, { "auxiliary_loss_clip": 0.01324014, "auxiliary_loss_mlp": 0.01097498, "balance_loss_clip": 1.08944559, "balance_loss_mlp": 1.05281842, "epoch": 0.019119194348414248, "flos": 14970225139200.0, "grad_norm": 7.443622240044352, "language_loss": 0.90836811, "learning_rate": 3.709909364265374e-06, "loss": 0.93258321, "num_input_tokens_seen": 6649015, "step": 318, "time_per_iteration": 2.6647114753723145 }, { "auxiliary_loss_clip": 0.01326579, "auxiliary_loss_mlp": 0.01092817, "balance_loss_clip": 1.0886786, "balance_loss_mlp": 1.05102181, "epoch": 0.01917931760108222, "flos": 25482822503040.0, "grad_norm": 2.232217614618188, "language_loss": 0.93955356, "learning_rate": 3.7119308798459706e-06, "loss": 0.9637475, "num_input_tokens_seen": 6669225, "step": 319, "time_per_iteration": 2.6901800632476807 }, { "auxiliary_loss_clip": 0.01209258, "auxiliary_loss_mlp": 0.01057567, "balance_loss_clip": 1.08611965, "balance_loss_mlp": 1.04288089, "epoch": 0.01923944085375019, "flos": 71556967353600.0, "grad_norm": 1.0009907084180605, "language_loss": 0.59817195, "learning_rate": 3.7139460682939026e-06, "loss": 0.62084019, "num_input_tokens_seen": 6725775, "step": 320, "time_per_iteration": 3.1044812202453613 }, { "auxiliary_loss_clip": 0.01323701, "auxiliary_loss_mlp": 0.01105882, "balance_loss_clip": 1.08827436, "balance_loss_mlp": 1.06291938, "epoch": 0.019299564106418157, "flos": 19682495406720.0, "grad_norm": 3.6735645336458163, "language_loss": 0.89620435, "learning_rate": 3.715954969092154e-06, "loss": 0.92050016, "num_input_tokens_seen": 6744170, "step": 321, "time_per_iteration": 2.650325298309326 }, { "auxiliary_loss_clip": 0.01333523, "auxiliary_loss_mlp": 0.01118534, "balance_loss_clip": 1.09200621, "balance_loss_mlp": 1.07440257, "epoch": 0.019359687359086126, "flos": 24387151991040.0, "grad_norm": 2.289334718991835, "language_loss": 0.82897186, "learning_rate": 3.7179576213552805e-06, "loss": 0.85349244, "num_input_tokens_seen": 6764565, "step": 322, "time_per_iteration": 2.65793514251709 }, { "auxiliary_loss_clip": 0.01332983, "auxiliary_loss_mlp": 0.01092262, "balance_loss_clip": 1.09035325, "balance_loss_mlp": 1.05061018, "epoch": 0.019419810611754094, "flos": 23951376190080.0, "grad_norm": 2.3678949255052912, "language_loss": 0.72983897, "learning_rate": 3.719954063833981e-06, "loss": 0.75409144, "num_input_tokens_seen": 6785310, "step": 323, "time_per_iteration": 2.6827828884124756 }, { "auxiliary_loss_clip": 0.01321298, "auxiliary_loss_mlp": 0.01092254, "balance_loss_clip": 1.08474624, "balance_loss_mlp": 1.04974401, "epoch": 0.019479933864422067, "flos": 22160223567360.0, "grad_norm": 1.9971507164977458, "language_loss": 0.92358303, "learning_rate": 3.721944334919596e-06, "loss": 0.9477185, "num_input_tokens_seen": 6803290, "step": 324, "time_per_iteration": 2.667363405227661 }, { "auxiliary_loss_clip": 0.0133014, "auxiliary_loss_mlp": 0.01089098, "balance_loss_clip": 1.09217644, "balance_loss_mlp": 1.04878139, "epoch": 0.019540057117090035, "flos": 22236821320320.0, "grad_norm": 6.407507213214319, "language_loss": 0.65127969, "learning_rate": 3.7239284726485375e-06, "loss": 0.67547202, "num_input_tokens_seen": 6822570, "step": 325, "time_per_iteration": 2.658700466156006 }, { "auxiliary_loss_clip": 0.01328385, "auxiliary_loss_mlp": 0.01109788, "balance_loss_clip": 1.09598839, "balance_loss_mlp": 1.06675363, "epoch": 0.019600180369758004, "flos": 23076771932160.0, "grad_norm": 1.7177375017641943, "language_loss": 0.76394802, "learning_rate": 3.72590651470665e-06, "loss": 0.78832972, "num_input_tokens_seen": 6841910, "step": 326, "time_per_iteration": 2.6326630115509033 }, { "auxiliary_loss_clip": 0.01322824, "auxiliary_loss_mlp": 0.01103487, "balance_loss_clip": 1.09083152, "balance_loss_mlp": 1.06040514, "epoch": 0.019660303622425972, "flos": 25410857604480.0, "grad_norm": 2.041100065316132, "language_loss": 0.79262185, "learning_rate": 3.727878498433505e-06, "loss": 0.81688493, "num_input_tokens_seen": 6862480, "step": 327, "time_per_iteration": 2.7195518016815186 }, { "auxiliary_loss_clip": 0.0132945, "auxiliary_loss_mlp": 0.01099712, "balance_loss_clip": 1.09292865, "balance_loss_mlp": 1.05832207, "epoch": 0.01972042687509394, "flos": 23657519024640.0, "grad_norm": 2.852301933148325, "language_loss": 0.80569315, "learning_rate": 3.7298444608266328e-06, "loss": 0.82998472, "num_input_tokens_seen": 6882015, "step": 328, "time_per_iteration": 2.6789369583129883 }, { "auxiliary_loss_clip": 0.01327544, "auxiliary_loss_mlp": 0.01094059, "balance_loss_clip": 1.08719349, "balance_loss_mlp": 1.05045235, "epoch": 0.019780550127761913, "flos": 18223480869120.0, "grad_norm": 2.280823996815513, "language_loss": 0.93599927, "learning_rate": 3.731804438545683e-06, "loss": 0.96021533, "num_input_tokens_seen": 6899785, "step": 329, "time_per_iteration": 2.6043548583984375 }, { "auxiliary_loss_clip": 0.0133329, "auxiliary_loss_mlp": 0.0110952, "balance_loss_clip": 1.09211767, "balance_loss_mlp": 1.06629419, "epoch": 0.01984067338042988, "flos": 22418780641920.0, "grad_norm": 2.788704520584699, "language_loss": 0.7476396, "learning_rate": 3.7337584679165324e-06, "loss": 0.77206767, "num_input_tokens_seen": 6918575, "step": 330, "time_per_iteration": 2.706001043319702 }, { "auxiliary_loss_clip": 0.0133006, "auxiliary_loss_mlp": 0.01115344, "balance_loss_clip": 1.09077096, "balance_loss_mlp": 1.07280993, "epoch": 0.01990079663309785, "flos": 17055199013760.0, "grad_norm": 4.201650057157668, "language_loss": 0.93435889, "learning_rate": 3.7357065849353186e-06, "loss": 0.95881295, "num_input_tokens_seen": 6936965, "step": 331, "time_per_iteration": 2.6499180793762207 }, { "auxiliary_loss_clip": 0.01316843, "auxiliary_loss_mlp": 0.01085812, "balance_loss_clip": 1.08825564, "balance_loss_mlp": 1.04563856, "epoch": 0.01996091988576582, "flos": 15961791058560.0, "grad_norm": 2.5475056489813968, "language_loss": 0.9293468, "learning_rate": 3.737648825272422e-06, "loss": 0.95337331, "num_input_tokens_seen": 6953475, "step": 332, "time_per_iteration": 2.5990231037139893 }, { "auxiliary_loss_clip": 0.01325701, "auxiliary_loss_mlp": 0.01091941, "balance_loss_clip": 1.09376514, "balance_loss_mlp": 1.04902601, "epoch": 0.02002104313843379, "flos": 23586451966080.0, "grad_norm": 2.7319388202061106, "language_loss": 0.75380504, "learning_rate": 3.739585224276384e-06, "loss": 0.77798152, "num_input_tokens_seen": 6971630, "step": 333, "time_per_iteration": 2.6225569248199463 }, { "auxiliary_loss_clip": 0.01323488, "auxiliary_loss_mlp": 0.01083816, "balance_loss_clip": 1.08822608, "balance_loss_mlp": 1.04249835, "epoch": 0.02008116639110176, "flos": 34094883352320.0, "grad_norm": 3.3732742696494924, "language_loss": 0.78797042, "learning_rate": 3.7415158169777673e-06, "loss": 0.81204355, "num_input_tokens_seen": 6992775, "step": 334, "time_per_iteration": 2.725562572479248 }, { "auxiliary_loss_clip": 0.01325152, "auxiliary_loss_mlp": 0.01093257, "balance_loss_clip": 1.08535278, "balance_loss_mlp": 1.04867256, "epoch": 0.020141289643769728, "flos": 19683716469120.0, "grad_norm": 1.945115565921162, "language_loss": 0.83465719, "learning_rate": 3.7434406380929575e-06, "loss": 0.8588413, "num_input_tokens_seen": 7011425, "step": 335, "time_per_iteration": 2.638871192932129 }, { "auxiliary_loss_clip": 0.01322365, "auxiliary_loss_mlp": 0.01085854, "balance_loss_clip": 1.08842373, "balance_loss_mlp": 1.04405963, "epoch": 0.020201412896437697, "flos": 20740567357440.0, "grad_norm": 2.3527147371949058, "language_loss": 0.92432821, "learning_rate": 3.745359722027911e-06, "loss": 0.94841033, "num_input_tokens_seen": 7029450, "step": 336, "time_per_iteration": 2.6654980182647705 }, { "auxiliary_loss_clip": 0.01321531, "auxiliary_loss_mlp": 0.01079695, "balance_loss_clip": 1.08577883, "balance_loss_mlp": 1.03818631, "epoch": 0.020261536149105665, "flos": 20266510636800.0, "grad_norm": 1.7223490941555537, "language_loss": 0.88663971, "learning_rate": 3.7472731028818428e-06, "loss": 0.91065204, "num_input_tokens_seen": 7047555, "step": 337, "time_per_iteration": 4.246743440628052 }, { "auxiliary_loss_clip": 0.01312441, "auxiliary_loss_mlp": 0.01102336, "balance_loss_clip": 1.08320296, "balance_loss_mlp": 1.05841899, "epoch": 0.020321659401773638, "flos": 25848752307840.0, "grad_norm": 1.6493597356962735, "language_loss": 0.89869279, "learning_rate": 3.7491808144508626e-06, "loss": 0.92284054, "num_input_tokens_seen": 7068185, "step": 338, "time_per_iteration": 5.869866609573364 }, { "auxiliary_loss_clip": 0.01321566, "auxiliary_loss_mlp": 0.0109858, "balance_loss_clip": 1.08546185, "balance_loss_mlp": 1.05554605, "epoch": 0.020381782654441606, "flos": 17495033051520.0, "grad_norm": 2.1603069065052694, "language_loss": 0.85168982, "learning_rate": 3.7510828902315576e-06, "loss": 0.87589133, "num_input_tokens_seen": 7085955, "step": 339, "time_per_iteration": 2.603130340576172 }, { "auxiliary_loss_clip": 0.01328225, "auxiliary_loss_mlp": 0.01099064, "balance_loss_clip": 1.0902226, "balance_loss_mlp": 1.05524242, "epoch": 0.020441905907109575, "flos": 24243940465920.0, "grad_norm": 2.1746002196087817, "language_loss": 0.88821882, "learning_rate": 3.75297936342452e-06, "loss": 0.91249174, "num_input_tokens_seen": 7106345, "step": 340, "time_per_iteration": 2.7247626781463623 }, { "auxiliary_loss_clip": 0.01322505, "auxiliary_loss_mlp": 0.01085559, "balance_loss_clip": 1.08594203, "balance_loss_mlp": 1.04004502, "epoch": 0.020502029159777543, "flos": 22233301787520.0, "grad_norm": 2.004763613818719, "language_loss": 0.88489276, "learning_rate": 3.7548702669378253e-06, "loss": 0.9089734, "num_input_tokens_seen": 7125070, "step": 341, "time_per_iteration": 2.731411933898926 }, { "auxiliary_loss_clip": 0.01324734, "auxiliary_loss_mlp": 0.01098572, "balance_loss_clip": 1.08451748, "balance_loss_mlp": 1.05479813, "epoch": 0.020562152412445512, "flos": 23987861429760.0, "grad_norm": 2.3638593093640736, "language_loss": 0.80611861, "learning_rate": 3.756755633390458e-06, "loss": 0.83035159, "num_input_tokens_seen": 7144675, "step": 342, "time_per_iteration": 2.6085095405578613 }, { "auxiliary_loss_clip": 0.01313805, "auxiliary_loss_mlp": 0.01098164, "balance_loss_clip": 1.08411694, "balance_loss_mlp": 1.05138612, "epoch": 0.020622275665113484, "flos": 26975305537920.0, "grad_norm": 1.727276092160433, "language_loss": 0.89612651, "learning_rate": 3.7586354951156886e-06, "loss": 0.92024612, "num_input_tokens_seen": 7165505, "step": 343, "time_per_iteration": 2.739912509918213 }, { "auxiliary_loss_clip": 0.01324722, "auxiliary_loss_mlp": 0.01096954, "balance_loss_clip": 1.09109879, "balance_loss_mlp": 1.05518293, "epoch": 0.020682398917781453, "flos": 22600704049920.0, "grad_norm": 2.6902665590614663, "language_loss": 0.78381217, "learning_rate": 3.7605098841644e-06, "loss": 0.80802888, "num_input_tokens_seen": 7184605, "step": 344, "time_per_iteration": 2.638439655303955 }, { "auxiliary_loss_clip": 0.01310552, "auxiliary_loss_mlp": 0.01103983, "balance_loss_clip": 1.08375537, "balance_loss_mlp": 1.05982804, "epoch": 0.02074252217044942, "flos": 15013605790080.0, "grad_norm": 2.2675296623639114, "language_loss": 0.75051636, "learning_rate": 3.7623788323083666e-06, "loss": 0.77466166, "num_input_tokens_seen": 7203065, "step": 345, "time_per_iteration": 2.581258773803711 }, { "auxiliary_loss_clip": 0.01316305, "auxiliary_loss_mlp": 0.01107937, "balance_loss_clip": 1.08855689, "balance_loss_mlp": 1.06447339, "epoch": 0.02080264542311739, "flos": 25337958952320.0, "grad_norm": 2.2144688897761395, "language_loss": 0.90414572, "learning_rate": 3.7642423710434837e-06, "loss": 0.92838824, "num_input_tokens_seen": 7222995, "step": 346, "time_per_iteration": 2.6281676292419434 }, { "auxiliary_loss_clip": 0.01312286, "auxiliary_loss_mlp": 0.01096576, "balance_loss_clip": 1.08357453, "balance_loss_mlp": 1.05621195, "epoch": 0.02086276867578536, "flos": 24388804016640.0, "grad_norm": 3.1106741063140366, "language_loss": 0.79133296, "learning_rate": 3.7661005315929563e-06, "loss": 0.81542158, "num_input_tokens_seen": 7244625, "step": 347, "time_per_iteration": 2.6477038860321045 }, { "auxiliary_loss_clip": 0.01317665, "auxiliary_loss_mlp": 0.01097416, "balance_loss_clip": 1.08921003, "balance_loss_mlp": 1.05328524, "epoch": 0.02092289192845333, "flos": 24462205459200.0, "grad_norm": 3.7065871267995893, "language_loss": 0.71211165, "learning_rate": 3.7679533449104354e-06, "loss": 0.73626244, "num_input_tokens_seen": 7263255, "step": 348, "time_per_iteration": 2.6215686798095703 }, { "auxiliary_loss_clip": 0.01319168, "auxiliary_loss_mlp": 0.01104109, "balance_loss_clip": 1.0859139, "balance_loss_mlp": 1.06066906, "epoch": 0.0209830151811213, "flos": 17451185523840.0, "grad_norm": 2.3976328225512495, "language_loss": 0.77118891, "learning_rate": 3.7698008416831116e-06, "loss": 0.79542166, "num_input_tokens_seen": 7279275, "step": 349, "time_per_iteration": 2.60102915763855 }, { "auxiliary_loss_clip": 0.01304146, "auxiliary_loss_mlp": 0.01101496, "balance_loss_clip": 1.08412242, "balance_loss_mlp": 1.06017756, "epoch": 0.021043138433789268, "flos": 24573995562240.0, "grad_norm": 1.7599420553547571, "language_loss": 0.85191035, "learning_rate": 3.7716430523347664e-06, "loss": 0.87596673, "num_input_tokens_seen": 7300180, "step": 350, "time_per_iteration": 2.7636313438415527 }, { "auxiliary_loss_clip": 0.01310639, "auxiliary_loss_mlp": 0.01090182, "balance_loss_clip": 1.08742464, "balance_loss_mlp": 1.05015147, "epoch": 0.021103261686457236, "flos": 24454053072000.0, "grad_norm": 2.2188224040826956, "language_loss": 0.7998929, "learning_rate": 3.773480007028776e-06, "loss": 0.82390112, "num_input_tokens_seen": 7317430, "step": 351, "time_per_iteration": 2.651803493499756 }, { "auxiliary_loss_clip": 0.01318922, "auxiliary_loss_mlp": 0.01104903, "balance_loss_clip": 1.08851838, "balance_loss_mlp": 1.06093884, "epoch": 0.021163384939125205, "flos": 14683083816960.0, "grad_norm": 2.30399977815629, "language_loss": 0.8746841, "learning_rate": 3.775311735671078e-06, "loss": 0.89892232, "num_input_tokens_seen": 7334875, "step": 352, "time_per_iteration": 2.687080144882202 }, { "auxiliary_loss_clip": 0.01311303, "auxiliary_loss_mlp": 0.01101912, "balance_loss_clip": 1.0859803, "balance_loss_mlp": 1.05861485, "epoch": 0.021223508191793177, "flos": 24493195918080.0, "grad_norm": 2.574621592267882, "language_loss": 0.8247534, "learning_rate": 3.7771382679130878e-06, "loss": 0.84888554, "num_input_tokens_seen": 7355185, "step": 353, "time_per_iteration": 2.7096078395843506 }, { "auxiliary_loss_clip": 0.01308698, "auxiliary_loss_mlp": 0.01092448, "balance_loss_clip": 1.08573294, "balance_loss_mlp": 1.05160654, "epoch": 0.021283631444461146, "flos": 24126978804480.0, "grad_norm": 1.9591973719581535, "language_loss": 0.8089481, "learning_rate": 3.7789596331545845e-06, "loss": 0.83295953, "num_input_tokens_seen": 7374425, "step": 354, "time_per_iteration": 2.658649444580078 }, { "auxiliary_loss_clip": 0.01314249, "auxiliary_loss_mlp": 0.01095812, "balance_loss_clip": 1.08369493, "balance_loss_mlp": 1.05218124, "epoch": 0.021343754697129114, "flos": 25192233475200.0, "grad_norm": 2.22170783568627, "language_loss": 0.81311834, "learning_rate": 3.780775860546545e-06, "loss": 0.837219, "num_input_tokens_seen": 7394175, "step": 355, "time_per_iteration": 2.619551420211792 }, { "auxiliary_loss_clip": 0.01310207, "auxiliary_loss_mlp": 0.01090401, "balance_loss_clip": 1.08222032, "balance_loss_mlp": 1.04851055, "epoch": 0.021403877949797083, "flos": 17274182279040.0, "grad_norm": 2.212340256471132, "language_loss": 0.89746779, "learning_rate": 3.7825869789939474e-06, "loss": 0.92147392, "num_input_tokens_seen": 7412645, "step": 356, "time_per_iteration": 2.5877137184143066 }, { "auxiliary_loss_clip": 0.01308298, "auxiliary_loss_mlp": 0.0108474, "balance_loss_clip": 1.08573771, "balance_loss_mlp": 1.04191971, "epoch": 0.021464001202465055, "flos": 30917435276160.0, "grad_norm": 1.9878508054592678, "language_loss": 0.79956681, "learning_rate": 3.784393017158528e-06, "loss": 0.82349718, "num_input_tokens_seen": 7432275, "step": 357, "time_per_iteration": 2.781755208969116 }, { "auxiliary_loss_clip": 0.0130988, "auxiliary_loss_mlp": 0.01083565, "balance_loss_clip": 1.08250284, "balance_loss_mlp": 1.04417801, "epoch": 0.021524124455133024, "flos": 18186385098240.0, "grad_norm": 2.6679617624252137, "language_loss": 0.76516652, "learning_rate": 3.786194003461506e-06, "loss": 0.78910094, "num_input_tokens_seen": 7450245, "step": 358, "time_per_iteration": 2.63144850730896 }, { "auxiliary_loss_clip": 0.01307251, "auxiliary_loss_mlp": 0.01092013, "balance_loss_clip": 1.08083165, "balance_loss_mlp": 1.04842997, "epoch": 0.021584247707800992, "flos": 13805786039040.0, "grad_norm": 2.344744226979962, "language_loss": 0.88770491, "learning_rate": 3.787989966086264e-06, "loss": 0.91169769, "num_input_tokens_seen": 7466845, "step": 359, "time_per_iteration": 2.641932964324951 }, { "auxiliary_loss_clip": 0.01315087, "auxiliary_loss_mlp": 0.01090441, "balance_loss_clip": 1.08486438, "balance_loss_mlp": 1.05088758, "epoch": 0.02164437096046896, "flos": 23294713703040.0, "grad_norm": 3.6505103877164804, "language_loss": 0.75853801, "learning_rate": 3.789780932980997e-06, "loss": 0.78259325, "num_input_tokens_seen": 7485450, "step": 360, "time_per_iteration": 2.5901477336883545 }, { "auxiliary_loss_clip": 0.01203506, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.07682121, "balance_loss_mlp": 1.01781011, "epoch": 0.02170449421313693, "flos": 68899578341760.0, "grad_norm": 0.8439708743577624, "language_loss": 0.64861441, "learning_rate": 3.79156693186132e-06, "loss": 0.67095727, "num_input_tokens_seen": 7553780, "step": 361, "time_per_iteration": 3.278409957885742 }, { "auxiliary_loss_clip": 0.01306068, "auxiliary_loss_mlp": 0.01086116, "balance_loss_clip": 1.0792098, "balance_loss_mlp": 1.04501224, "epoch": 0.0217646174658049, "flos": 25228539146880.0, "grad_norm": 3.144635825096315, "language_loss": 0.78844237, "learning_rate": 3.7933479902128433e-06, "loss": 0.81236422, "num_input_tokens_seen": 7574155, "step": 362, "time_per_iteration": 2.6302051544189453 }, { "auxiliary_loss_clip": 0.01309585, "auxiliary_loss_mlp": 0.01093258, "balance_loss_clip": 1.08188891, "balance_loss_mlp": 1.05244076, "epoch": 0.02182474071847287, "flos": 22893124671360.0, "grad_norm": 2.019833715135914, "language_loss": 0.92474592, "learning_rate": 3.7951241352937077e-06, "loss": 0.94877434, "num_input_tokens_seen": 7592320, "step": 363, "time_per_iteration": 2.6566081047058105 }, { "auxiliary_loss_clip": 0.01305173, "auxiliary_loss_mlp": 0.01096467, "balance_loss_clip": 1.0816617, "balance_loss_mlp": 1.05693769, "epoch": 0.02188486397114084, "flos": 23658991482240.0, "grad_norm": 2.282586403147275, "language_loss": 0.89844346, "learning_rate": 3.7968953941370915e-06, "loss": 0.92245984, "num_input_tokens_seen": 7611185, "step": 364, "time_per_iteration": 2.711911201477051 }, { "auxiliary_loss_clip": 0.01311963, "auxiliary_loss_mlp": 0.0109247, "balance_loss_clip": 1.08607888, "balance_loss_mlp": 1.04955506, "epoch": 0.021944987223808807, "flos": 21543637680000.0, "grad_norm": 1.948927065488749, "language_loss": 0.79460645, "learning_rate": 3.798661793553676e-06, "loss": 0.81865084, "num_input_tokens_seen": 7631970, "step": 365, "time_per_iteration": 2.6396052837371826 }, { "auxiliary_loss_clip": 0.01306043, "auxiliary_loss_mlp": 0.01100405, "balance_loss_clip": 1.08267248, "balance_loss_mlp": 1.05658317, "epoch": 0.022005110476476776, "flos": 16070887641600.0, "grad_norm": 1.85181498507666, "language_loss": 0.84341359, "learning_rate": 3.8004233601340808e-06, "loss": 0.86747801, "num_input_tokens_seen": 7649745, "step": 366, "time_per_iteration": 2.6278867721557617 }, { "auxiliary_loss_clip": 0.01312113, "auxiliary_loss_mlp": 0.01087574, "balance_loss_clip": 1.08304918, "balance_loss_mlp": 1.04859269, "epoch": 0.022065233729144748, "flos": 21433715084160.0, "grad_norm": 1.9326288300300676, "language_loss": 0.87040466, "learning_rate": 3.8021801202512694e-06, "loss": 0.89440155, "num_input_tokens_seen": 7668830, "step": 367, "time_per_iteration": 2.6410560607910156 }, { "auxiliary_loss_clip": 0.01312217, "auxiliary_loss_mlp": 0.01096053, "balance_loss_clip": 1.08074582, "balance_loss_mlp": 1.05335259, "epoch": 0.022125356981812717, "flos": 21543709507200.0, "grad_norm": 2.7247329926128976, "language_loss": 0.8487373, "learning_rate": 3.803932100062912e-06, "loss": 0.87282002, "num_input_tokens_seen": 7687240, "step": 368, "time_per_iteration": 2.652012825012207 }, { "auxiliary_loss_clip": 0.01312089, "auxiliary_loss_mlp": 0.01079926, "balance_loss_clip": 1.0801568, "balance_loss_mlp": 1.04027653, "epoch": 0.022185480234480685, "flos": 20704153944960.0, "grad_norm": 2.4839328990540794, "language_loss": 0.75997221, "learning_rate": 3.8056793255137264e-06, "loss": 0.78389233, "num_input_tokens_seen": 7704440, "step": 369, "time_per_iteration": 2.601384401321411 }, { "auxiliary_loss_clip": 0.01306737, "auxiliary_loss_mlp": 0.01099274, "balance_loss_clip": 1.08232927, "balance_loss_mlp": 1.05836105, "epoch": 0.022245603487148654, "flos": 25193203142400.0, "grad_norm": 2.189428421230448, "language_loss": 0.82977992, "learning_rate": 3.8074218223377844e-06, "loss": 0.85383999, "num_input_tokens_seen": 7727160, "step": 370, "time_per_iteration": 2.6538548469543457 }, { "auxiliary_loss_clip": 0.01306327, "auxiliary_loss_mlp": 0.01099594, "balance_loss_clip": 1.08127654, "balance_loss_mlp": 1.05713177, "epoch": 0.022305726739816623, "flos": 21395936954880.0, "grad_norm": 1.8569755368340455, "language_loss": 0.81588483, "learning_rate": 3.8091596160607834e-06, "loss": 0.83994406, "num_input_tokens_seen": 7747730, "step": 371, "time_per_iteration": 2.6779489517211914 }, { "auxiliary_loss_clip": 0.01311283, "auxiliary_loss_mlp": 0.01093653, "balance_loss_clip": 1.08593988, "balance_loss_mlp": 1.05169153, "epoch": 0.022365849992484595, "flos": 22492146170880.0, "grad_norm": 2.0622769904034817, "language_loss": 0.83493644, "learning_rate": 3.8108927320022896e-06, "loss": 0.85898578, "num_input_tokens_seen": 7766765, "step": 372, "time_per_iteration": 2.676797866821289 }, { "auxiliary_loss_clip": 0.01303906, "auxiliary_loss_mlp": 0.01091688, "balance_loss_clip": 1.08125615, "balance_loss_mlp": 1.05022752, "epoch": 0.022425973245152563, "flos": 17856581397120.0, "grad_norm": 2.8569846697004424, "language_loss": 0.79004842, "learning_rate": 3.8126211952779548e-06, "loss": 0.81400436, "num_input_tokens_seen": 7784010, "step": 373, "time_per_iteration": 2.593186616897583 }, { "auxiliary_loss_clip": 0.01309731, "auxiliary_loss_mlp": 0.01087409, "balance_loss_clip": 1.08431911, "balance_loss_mlp": 1.0448271, "epoch": 0.022486096497820532, "flos": 15483029656320.0, "grad_norm": 2.5442660874947385, "language_loss": 0.77622557, "learning_rate": 3.8143450308016952e-06, "loss": 0.80019701, "num_input_tokens_seen": 7801305, "step": 374, "time_per_iteration": 2.628392457962036 }, { "auxiliary_loss_clip": 0.0129871, "auxiliary_loss_mlp": 0.01076131, "balance_loss_clip": 1.07404125, "balance_loss_mlp": 1.03395462, "epoch": 0.0225462197504885, "flos": 27784157950080.0, "grad_norm": 1.574507922341891, "language_loss": 0.86032569, "learning_rate": 3.8160642632878525e-06, "loss": 0.88407415, "num_input_tokens_seen": 7823965, "step": 375, "time_per_iteration": 2.6783435344696045 }, { "auxiliary_loss_clip": 0.01307026, "auxiliary_loss_mlp": 0.01102393, "balance_loss_clip": 1.08340597, "balance_loss_mlp": 1.0590483, "epoch": 0.02260634300315647, "flos": 19975490645760.0, "grad_norm": 2.1279260859120286, "language_loss": 0.8901403, "learning_rate": 3.817778917253314e-06, "loss": 0.91423446, "num_input_tokens_seen": 7842115, "step": 376, "time_per_iteration": 2.621629476547241 }, { "auxiliary_loss_clip": 0.01306872, "auxiliary_loss_mlp": 0.01087647, "balance_loss_clip": 1.07870364, "balance_loss_mlp": 1.04868913, "epoch": 0.02266646625582444, "flos": 16028189349120.0, "grad_norm": 3.0367767906095917, "language_loss": 0.75437558, "learning_rate": 3.8194890170196155e-06, "loss": 0.77832079, "num_input_tokens_seen": 7857830, "step": 377, "time_per_iteration": 2.5465245246887207 }, { "auxiliary_loss_clip": 0.01298987, "auxiliary_loss_mlp": 0.01093623, "balance_loss_clip": 1.08128345, "balance_loss_mlp": 1.0517087, "epoch": 0.02272658950849241, "flos": 20404622430720.0, "grad_norm": 2.1955644054597374, "language_loss": 0.99231368, "learning_rate": 3.8211945867150055e-06, "loss": 1.01623976, "num_input_tokens_seen": 7875840, "step": 378, "time_per_iteration": 7.184643983840942 }, { "auxiliary_loss_clip": 0.01202133, "auxiliary_loss_mlp": 0.01040839, "balance_loss_clip": 1.0828104, "balance_loss_mlp": 1.0283463, "epoch": 0.02278671276116038, "flos": 69847332647040.0, "grad_norm": 0.9608118941287621, "language_loss": 0.75395739, "learning_rate": 3.822895650276492e-06, "loss": 0.7763871, "num_input_tokens_seen": 7940190, "step": 379, "time_per_iteration": 4.961140394210815 }, { "auxiliary_loss_clip": 0.01308523, "auxiliary_loss_mlp": 0.01087195, "balance_loss_clip": 1.07820678, "balance_loss_mlp": 1.04792738, "epoch": 0.022846836013828347, "flos": 38508771340800.0, "grad_norm": 3.7276648293904375, "language_loss": 0.78197825, "learning_rate": 3.824592231451859e-06, "loss": 0.8059355, "num_input_tokens_seen": 7960840, "step": 380, "time_per_iteration": 2.7892863750457764 }, { "auxiliary_loss_clip": 0.01301718, "auxiliary_loss_mlp": 0.01088822, "balance_loss_clip": 1.07955217, "balance_loss_mlp": 1.04945946, "epoch": 0.02290695926649632, "flos": 20959478795520.0, "grad_norm": 2.0941800643649855, "language_loss": 0.96743369, "learning_rate": 3.826284353801652e-06, "loss": 0.99133915, "num_input_tokens_seen": 7975500, "step": 381, "time_per_iteration": 2.619854688644409 }, { "auxiliary_loss_clip": 0.01311313, "auxiliary_loss_mlp": 0.01093973, "balance_loss_clip": 1.08192921, "balance_loss_mlp": 1.0539186, "epoch": 0.022967082519164288, "flos": 24022407335040.0, "grad_norm": 2.122042453210184, "language_loss": 0.87664795, "learning_rate": 3.827972040701142e-06, "loss": 0.90070075, "num_input_tokens_seen": 7993880, "step": 382, "time_per_iteration": 2.617398500442505 }, { "auxiliary_loss_clip": 0.01304042, "auxiliary_loss_mlp": 0.01096828, "balance_loss_clip": 1.0821979, "balance_loss_mlp": 1.05760849, "epoch": 0.023027205771832256, "flos": 20997149184000.0, "grad_norm": 1.978420170714987, "language_loss": 0.84990942, "learning_rate": 3.829655315342268e-06, "loss": 0.87391812, "num_input_tokens_seen": 8012730, "step": 383, "time_per_iteration": 2.6345314979553223 }, { "auxiliary_loss_clip": 0.01300873, "auxiliary_loss_mlp": 0.0111136, "balance_loss_clip": 1.08199024, "balance_loss_mlp": 1.0716393, "epoch": 0.023087329024500225, "flos": 21360816432000.0, "grad_norm": 2.0575071112917778, "language_loss": 0.83349717, "learning_rate": 3.831334200735543e-06, "loss": 0.8576194, "num_input_tokens_seen": 8031275, "step": 384, "time_per_iteration": 2.6339902877807617 }, { "auxiliary_loss_clip": 0.0129979, "auxiliary_loss_mlp": 0.010893, "balance_loss_clip": 1.08362782, "balance_loss_mlp": 1.05255938, "epoch": 0.023147452277168194, "flos": 21872435800320.0, "grad_norm": 1.7828777740185773, "language_loss": 0.89289594, "learning_rate": 3.8330087197119426e-06, "loss": 0.91678685, "num_input_tokens_seen": 8051600, "step": 385, "time_per_iteration": 2.690460205078125 }, { "auxiliary_loss_clip": 0.01305297, "auxiliary_loss_mlp": 0.01118129, "balance_loss_clip": 1.08288455, "balance_loss_mlp": 1.07926655, "epoch": 0.023207575529836166, "flos": 18916700423040.0, "grad_norm": 1.9487706588237765, "language_loss": 0.70157433, "learning_rate": 3.83467889492477e-06, "loss": 0.72580856, "num_input_tokens_seen": 8070600, "step": 386, "time_per_iteration": 2.681957721710205 }, { "auxiliary_loss_clip": 0.01305989, "auxiliary_loss_mlp": 0.0109088, "balance_loss_clip": 1.08441973, "balance_loss_mlp": 1.05309081, "epoch": 0.023267698782504134, "flos": 25046005207680.0, "grad_norm": 2.354342660334866, "language_loss": 0.87840039, "learning_rate": 3.836344748851495e-06, "loss": 0.90236908, "num_input_tokens_seen": 8090680, "step": 387, "time_per_iteration": 2.6511123180389404 }, { "auxiliary_loss_clip": 0.01304298, "auxiliary_loss_mlp": 0.01075541, "balance_loss_clip": 1.08178413, "balance_loss_mlp": 1.03658366, "epoch": 0.023327822035172103, "flos": 28879217930880.0, "grad_norm": 2.2068948332198643, "language_loss": 0.8341614, "learning_rate": 3.838006303795566e-06, "loss": 0.85795981, "num_input_tokens_seen": 8114610, "step": 388, "time_per_iteration": 2.7062034606933594 }, { "auxiliary_loss_clip": 0.01301997, "auxiliary_loss_mlp": 0.01089724, "balance_loss_clip": 1.08110905, "balance_loss_mlp": 1.05284107, "epoch": 0.02338794528784007, "flos": 27121533805440.0, "grad_norm": 2.1887236217853863, "language_loss": 0.93710232, "learning_rate": 3.839663581888206e-06, "loss": 0.96101958, "num_input_tokens_seen": 8133975, "step": 389, "time_per_iteration": 2.680280923843384 }, { "auxiliary_loss_clip": 0.01296082, "auxiliary_loss_mlp": 0.01083127, "balance_loss_clip": 1.0818491, "balance_loss_mlp": 1.04397893, "epoch": 0.02344806854050804, "flos": 21322355944320.0, "grad_norm": 1.981860280002506, "language_loss": 0.87747037, "learning_rate": 3.841316605090178e-06, "loss": 0.9012624, "num_input_tokens_seen": 8153570, "step": 390, "time_per_iteration": 2.65970516204834 }, { "auxiliary_loss_clip": 0.01301203, "auxiliary_loss_mlp": 0.01092853, "balance_loss_clip": 1.08357048, "balance_loss_mlp": 1.0568521, "epoch": 0.023508191793176012, "flos": 24789997998720.0, "grad_norm": 2.134782100250632, "language_loss": 0.89370871, "learning_rate": 3.842965395193529e-06, "loss": 0.91764927, "num_input_tokens_seen": 8170075, "step": 391, "time_per_iteration": 2.620009660720825 }, { "auxiliary_loss_clip": 0.01296395, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.07956719, "balance_loss_mlp": 1.03521371, "epoch": 0.02356831504584398, "flos": 25995375624960.0, "grad_norm": 2.366558958564603, "language_loss": 0.86076117, "learning_rate": 3.84460997382332e-06, "loss": 0.88444775, "num_input_tokens_seen": 8190420, "step": 392, "time_per_iteration": 2.7171695232391357 }, { "auxiliary_loss_clip": 0.01293283, "auxiliary_loss_mlp": 0.01084283, "balance_loss_clip": 1.07891107, "balance_loss_mlp": 1.04763794, "epoch": 0.02362843829851195, "flos": 19062461813760.0, "grad_norm": 2.038818686720474, "language_loss": 0.89096916, "learning_rate": 3.8462503624393256e-06, "loss": 0.91474473, "num_input_tokens_seen": 8208790, "step": 393, "time_per_iteration": 2.632129669189453 }, { "auxiliary_loss_clip": 0.01304158, "auxiliary_loss_mlp": 0.01102255, "balance_loss_clip": 1.08471596, "balance_loss_mlp": 1.06279635, "epoch": 0.023688561551179918, "flos": 16071031296000.0, "grad_norm": 1.7920692319020195, "language_loss": 0.8156364, "learning_rate": 3.84788658233771e-06, "loss": 0.83970058, "num_input_tokens_seen": 8226885, "step": 394, "time_per_iteration": 2.5932936668395996 }, { "auxiliary_loss_clip": 0.01296851, "auxiliary_loss_mlp": 0.01088191, "balance_loss_clip": 1.07939875, "balance_loss_mlp": 1.04920936, "epoch": 0.023748684803847887, "flos": 21724375939200.0, "grad_norm": 4.539737106404062, "language_loss": 0.85808635, "learning_rate": 3.84951865465269e-06, "loss": 0.88193679, "num_input_tokens_seen": 8246825, "step": 395, "time_per_iteration": 2.6112868785858154 }, { "auxiliary_loss_clip": 0.01194704, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.07210529, "balance_loss_mlp": 1.02319229, "epoch": 0.02380880805651586, "flos": 61926192881280.0, "grad_norm": 0.9258089920958834, "language_loss": 0.6380353, "learning_rate": 3.851146600358172e-06, "loss": 0.66032922, "num_input_tokens_seen": 8302835, "step": 396, "time_per_iteration": 3.031489133834839 }, { "auxiliary_loss_clip": 0.0129188, "auxiliary_loss_mlp": 0.01071022, "balance_loss_clip": 1.07806754, "balance_loss_mlp": 1.03447223, "epoch": 0.023868931309183827, "flos": 20266331068800.0, "grad_norm": 2.3741099598177624, "language_loss": 0.83878696, "learning_rate": 3.852770440269372e-06, "loss": 0.86241591, "num_input_tokens_seen": 8320745, "step": 397, "time_per_iteration": 2.6049532890319824 }, { "auxiliary_loss_clip": 0.01297108, "auxiliary_loss_mlp": 0.01087341, "balance_loss_clip": 1.08104038, "balance_loss_mlp": 1.04890823, "epoch": 0.023929054561851796, "flos": 21139103733120.0, "grad_norm": 4.6847154905409205, "language_loss": 0.84066498, "learning_rate": 3.854390195044404e-06, "loss": 0.86450952, "num_input_tokens_seen": 8339540, "step": 398, "time_per_iteration": 2.6516692638397217 }, { "auxiliary_loss_clip": 0.01295876, "auxiliary_loss_mlp": 0.01078722, "balance_loss_clip": 1.07671928, "balance_loss_mlp": 1.04007471, "epoch": 0.023989177814519765, "flos": 13698521049600.0, "grad_norm": 2.80358563189936, "language_loss": 0.86029691, "learning_rate": 3.856005885185868e-06, "loss": 0.88404286, "num_input_tokens_seen": 8354890, "step": 399, "time_per_iteration": 2.5452589988708496 }, { "auxiliary_loss_clip": 0.01292698, "auxiliary_loss_mlp": 0.01090822, "balance_loss_clip": 1.08074594, "balance_loss_mlp": 1.05308056, "epoch": 0.024049301067187733, "flos": 26322018929280.0, "grad_norm": 2.021318687641168, "language_loss": 0.86254489, "learning_rate": 3.857617531042398e-06, "loss": 0.88638014, "num_input_tokens_seen": 8375845, "step": 400, "time_per_iteration": 2.6626927852630615 }, { "auxiliary_loss_clip": 0.01299822, "auxiliary_loss_mlp": 0.01083301, "balance_loss_clip": 1.08346462, "balance_loss_mlp": 1.04687035, "epoch": 0.024109424319855705, "flos": 24425432910720.0, "grad_norm": 1.735822397657743, "language_loss": 0.79276752, "learning_rate": 3.8592251528102065e-06, "loss": 0.81659877, "num_input_tokens_seen": 8395240, "step": 401, "time_per_iteration": 2.68418025970459 }, { "auxiliary_loss_clip": 0.0129275, "auxiliary_loss_mlp": 0.01091389, "balance_loss_clip": 1.07852793, "balance_loss_mlp": 1.05493474, "epoch": 0.024169547572523674, "flos": 29604397610880.0, "grad_norm": 3.889755427752258, "language_loss": 0.78890866, "learning_rate": 3.8608287705345976e-06, "loss": 0.81274998, "num_input_tokens_seen": 8416950, "step": 402, "time_per_iteration": 2.7509379386901855 }, { "auxiliary_loss_clip": 0.01296434, "auxiliary_loss_mlp": 0.01082712, "balance_loss_clip": 1.07797897, "balance_loss_mlp": 1.04399323, "epoch": 0.024229670825191642, "flos": 22601458235520.0, "grad_norm": 2.49356632429363, "language_loss": 0.94936156, "learning_rate": 3.86242840411147e-06, "loss": 0.97315305, "num_input_tokens_seen": 8433660, "step": 403, "time_per_iteration": 2.5760560035705566 }, { "auxiliary_loss_clip": 0.0129994, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07754242, "balance_loss_mlp": 1.05315053, "epoch": 0.02428979407785961, "flos": 18150258994560.0, "grad_norm": 2.361656575803209, "language_loss": 0.99877387, "learning_rate": 3.864024073288798e-06, "loss": 1.0226922, "num_input_tokens_seen": 8450180, "step": 404, "time_per_iteration": 2.5966458320617676 }, { "auxiliary_loss_clip": 0.01298911, "auxiliary_loss_mlp": 0.01100127, "balance_loss_clip": 1.08096266, "balance_loss_mlp": 1.06312442, "epoch": 0.024349917330527583, "flos": 15304984917120.0, "grad_norm": 2.3162348618509276, "language_loss": 0.8802169, "learning_rate": 3.865615797668091e-06, "loss": 0.90420723, "num_input_tokens_seen": 8467775, "step": 405, "time_per_iteration": 2.5728275775909424 }, { "auxiliary_loss_clip": 0.01306827, "auxiliary_loss_mlp": 0.01097881, "balance_loss_clip": 1.084512, "balance_loss_mlp": 1.06004393, "epoch": 0.024410040583195552, "flos": 20773892200320.0, "grad_norm": 2.7399607903318275, "language_loss": 0.93386561, "learning_rate": 3.867203596705844e-06, "loss": 0.95791268, "num_input_tokens_seen": 8486765, "step": 406, "time_per_iteration": 2.612668991088867 }, { "auxiliary_loss_clip": 0.01299426, "auxiliary_loss_mlp": 0.01088378, "balance_loss_clip": 1.08213782, "balance_loss_mlp": 1.0500164, "epoch": 0.02447016383586352, "flos": 21798854789760.0, "grad_norm": 2.1742012769968526, "language_loss": 0.87128031, "learning_rate": 3.86878748971496e-06, "loss": 0.89515841, "num_input_tokens_seen": 8506515, "step": 407, "time_per_iteration": 2.5982017517089844 }, { "auxiliary_loss_clip": 0.01298266, "auxiliary_loss_mlp": 0.01083858, "balance_loss_clip": 1.08472157, "balance_loss_mlp": 1.04630709, "epoch": 0.02453028708853149, "flos": 33948116380800.0, "grad_norm": 2.1458430439144234, "language_loss": 0.74102569, "learning_rate": 3.8703674958661596e-06, "loss": 0.76484692, "num_input_tokens_seen": 8528035, "step": 408, "time_per_iteration": 2.708670139312744 }, { "auxiliary_loss_clip": 0.01300128, "auxiliary_loss_mlp": 0.01089985, "balance_loss_clip": 1.08222318, "balance_loss_mlp": 1.05233896, "epoch": 0.024590410341199458, "flos": 21793000872960.0, "grad_norm": 2.4878473813549675, "language_loss": 0.92509401, "learning_rate": 3.871943634189376e-06, "loss": 0.94899511, "num_input_tokens_seen": 8546455, "step": 409, "time_per_iteration": 2.665321111679077 }, { "auxiliary_loss_clip": 0.01296394, "auxiliary_loss_mlp": 0.01077538, "balance_loss_clip": 1.08126342, "balance_loss_mlp": 1.04291987, "epoch": 0.02465053359386743, "flos": 35114782124160.0, "grad_norm": 2.2521095969191722, "language_loss": 0.82792604, "learning_rate": 3.873515923575128e-06, "loss": 0.85166532, "num_input_tokens_seen": 8568450, "step": 410, "time_per_iteration": 2.848928213119507 }, { "auxiliary_loss_clip": 0.01299459, "auxiliary_loss_mlp": 0.01089133, "balance_loss_clip": 1.08187068, "balance_loss_mlp": 1.05284572, "epoch": 0.0247106568465354, "flos": 27451409333760.0, "grad_norm": 2.1393760271628595, "language_loss": 0.77577484, "learning_rate": 3.875084382775879e-06, "loss": 0.79966074, "num_input_tokens_seen": 8589340, "step": 411, "time_per_iteration": 2.6645278930664062 }, { "auxiliary_loss_clip": 0.01298341, "auxiliary_loss_mlp": 0.0110154, "balance_loss_clip": 1.07977521, "balance_loss_mlp": 1.06289268, "epoch": 0.024770780099203367, "flos": 20703794808960.0, "grad_norm": 2.2974658872162665, "language_loss": 0.86379063, "learning_rate": 3.87664903040738e-06, "loss": 0.88778943, "num_input_tokens_seen": 8607150, "step": 412, "time_per_iteration": 2.6091151237487793 }, { "auxiliary_loss_clip": 0.01187014, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.07387948, "balance_loss_mlp": 1.02089787, "epoch": 0.024830903351871336, "flos": 69551859369600.0, "grad_norm": 0.8687159185244209, "language_loss": 0.5852263, "learning_rate": 3.878209884949994e-06, "loss": 0.60741079, "num_input_tokens_seen": 8669865, "step": 413, "time_per_iteration": 3.2269625663757324 }, { "auxiliary_loss_clip": 0.0129043, "auxiliary_loss_mlp": 0.01091958, "balance_loss_clip": 1.07709181, "balance_loss_mlp": 1.05249953, "epoch": 0.024891026604539304, "flos": 32270477713920.0, "grad_norm": 1.8280666153990437, "language_loss": 0.80517173, "learning_rate": 3.879766964750006e-06, "loss": 0.82899559, "num_input_tokens_seen": 8690235, "step": 414, "time_per_iteration": 2.720341444015503 }, { "auxiliary_loss_clip": 0.01287097, "auxiliary_loss_mlp": 0.0109242, "balance_loss_clip": 1.0756042, "balance_loss_mlp": 1.0556556, "epoch": 0.024951149857207276, "flos": 18840282238080.0, "grad_norm": 2.1921003994701302, "language_loss": 0.80227423, "learning_rate": 3.881320288020917e-06, "loss": 0.82606936, "num_input_tokens_seen": 8706295, "step": 415, "time_per_iteration": 2.6473400592803955 }, { "auxiliary_loss_clip": 0.01302694, "auxiliary_loss_mlp": 0.01082455, "balance_loss_clip": 1.08156919, "balance_loss_mlp": 1.04497528, "epoch": 0.025011273109875245, "flos": 15377201210880.0, "grad_norm": 2.9318871737289776, "language_loss": 0.96236515, "learning_rate": 3.882869872844723e-06, "loss": 0.9862166, "num_input_tokens_seen": 8724200, "step": 416, "time_per_iteration": 2.596189260482788 }, { "auxiliary_loss_clip": 0.01291636, "auxiliary_loss_mlp": 0.01074465, "balance_loss_clip": 1.07628798, "balance_loss_mlp": 1.0355792, "epoch": 0.025071396362543213, "flos": 18915515274240.0, "grad_norm": 1.741746736079687, "language_loss": 0.77381694, "learning_rate": 3.884415737173176e-06, "loss": 0.79747796, "num_input_tokens_seen": 8744170, "step": 417, "time_per_iteration": 5.610344171524048 }, { "auxiliary_loss_clip": 0.01290746, "auxiliary_loss_mlp": 0.0109022, "balance_loss_clip": 1.08072221, "balance_loss_mlp": 1.05264485, "epoch": 0.025131519615211182, "flos": 25337958952320.0, "grad_norm": 1.554385639735456, "language_loss": 0.77076226, "learning_rate": 3.8859578988290344e-06, "loss": 0.79457194, "num_input_tokens_seen": 8765120, "step": 418, "time_per_iteration": 5.837290525436401 }, { "auxiliary_loss_clip": 0.01297026, "auxiliary_loss_mlp": 0.01071197, "balance_loss_clip": 1.08019948, "balance_loss_mlp": 1.03550553, "epoch": 0.02519164286787915, "flos": 18953149749120.0, "grad_norm": 2.4603268634516207, "language_loss": 0.81445098, "learning_rate": 3.887496375507294e-06, "loss": 0.83813322, "num_input_tokens_seen": 8783500, "step": 419, "time_per_iteration": 2.582590341567993 }, { "auxiliary_loss_clip": 0.01291114, "auxiliary_loss_mlp": 0.01086736, "balance_loss_clip": 1.07929599, "balance_loss_mlp": 1.04708743, "epoch": 0.025251766120547123, "flos": 17421092904960.0, "grad_norm": 1.8078532084212713, "language_loss": 0.73618573, "learning_rate": 3.8890311847764065e-06, "loss": 0.75996423, "num_input_tokens_seen": 8801175, "step": 420, "time_per_iteration": 2.6739418506622314 }, { "auxiliary_loss_clip": 0.01290485, "auxiliary_loss_mlp": 0.01096292, "balance_loss_clip": 1.07605243, "balance_loss_mlp": 1.05924153, "epoch": 0.02531188937321509, "flos": 25045430590080.0, "grad_norm": 1.77336014903074, "language_loss": 0.79040134, "learning_rate": 3.890562344079484e-06, "loss": 0.81426907, "num_input_tokens_seen": 8820215, "step": 421, "time_per_iteration": 2.6928632259368896 }, { "auxiliary_loss_clip": 0.01290689, "auxiliary_loss_mlp": 0.01088863, "balance_loss_clip": 1.07922924, "balance_loss_mlp": 1.04983425, "epoch": 0.02537201262588306, "flos": 30592228515840.0, "grad_norm": 2.2139016136437104, "language_loss": 0.8203755, "learning_rate": 3.89208987073549e-06, "loss": 0.84417105, "num_input_tokens_seen": 8839660, "step": 422, "time_per_iteration": 2.714707851409912 }, { "auxiliary_loss_clip": 0.01293659, "auxiliary_loss_mlp": 0.01078975, "balance_loss_clip": 1.07677865, "balance_loss_mlp": 1.04430926, "epoch": 0.02543213587855103, "flos": 26065365275520.0, "grad_norm": 2.1259138778576356, "language_loss": 0.83458018, "learning_rate": 3.893613781940409e-06, "loss": 0.85830647, "num_input_tokens_seen": 8859280, "step": 423, "time_per_iteration": 2.652757167816162 }, { "auxiliary_loss_clip": 0.01287497, "auxiliary_loss_mlp": 0.01078335, "balance_loss_clip": 1.0742569, "balance_loss_mlp": 1.04221487, "epoch": 0.025492259131218997, "flos": 36022818965760.0, "grad_norm": 2.012741083661608, "language_loss": 0.74129444, "learning_rate": 3.895134094768415e-06, "loss": 0.76495278, "num_input_tokens_seen": 8880560, "step": 424, "time_per_iteration": 2.7724521160125732 }, { "auxiliary_loss_clip": 0.01296446, "auxiliary_loss_mlp": 0.01093799, "balance_loss_clip": 1.07987142, "balance_loss_mlp": 1.05782199, "epoch": 0.02555238238388697, "flos": 18588045957120.0, "grad_norm": 4.623670538116741, "language_loss": 0.83193713, "learning_rate": 3.896650826173015e-06, "loss": 0.85583955, "num_input_tokens_seen": 8899155, "step": 425, "time_per_iteration": 2.608029842376709 }, { "auxiliary_loss_clip": 0.01292462, "auxiliary_loss_mlp": 0.01092376, "balance_loss_clip": 1.07259536, "balance_loss_mlp": 1.0544672, "epoch": 0.025612505636554938, "flos": 24243186280320.0, "grad_norm": 2.5075767706443566, "language_loss": 0.853073, "learning_rate": 3.898163992988186e-06, "loss": 0.87692136, "num_input_tokens_seen": 8917890, "step": 426, "time_per_iteration": 2.6445271968841553 }, { "auxiliary_loss_clip": 0.01175923, "auxiliary_loss_mlp": 0.01017688, "balance_loss_clip": 1.06532824, "balance_loss_mlp": 1.00781715, "epoch": 0.025672628889222907, "flos": 60586941265920.0, "grad_norm": 0.8949637292547264, "language_loss": 0.57219732, "learning_rate": 3.899673611929491e-06, "loss": 0.5941335, "num_input_tokens_seen": 8978260, "step": 427, "time_per_iteration": 3.2690517902374268 }, { "auxiliary_loss_clip": 0.01291989, "auxiliary_loss_mlp": 0.01092649, "balance_loss_clip": 1.08155811, "balance_loss_mlp": 1.05674267, "epoch": 0.025732752141890875, "flos": 19573255169280.0, "grad_norm": 2.4869215225306673, "language_loss": 0.88130605, "learning_rate": 3.901179699595194e-06, "loss": 0.90515244, "num_input_tokens_seen": 8994460, "step": 428, "time_per_iteration": 2.6143813133239746 }, { "auxiliary_loss_clip": 0.01283603, "auxiliary_loss_mlp": 0.0107531, "balance_loss_clip": 1.07418942, "balance_loss_mlp": 1.03735399, "epoch": 0.025792875394558847, "flos": 31284262920960.0, "grad_norm": 2.067247304638145, "language_loss": 0.85790849, "learning_rate": 3.902682272467353e-06, "loss": 0.88149762, "num_input_tokens_seen": 9016670, "step": 429, "time_per_iteration": 2.749328374862671 }, { "auxiliary_loss_clip": 0.01288943, "auxiliary_loss_mlp": 0.01083888, "balance_loss_clip": 1.07337689, "balance_loss_mlp": 1.04590786, "epoch": 0.025852998647226816, "flos": 32379610210560.0, "grad_norm": 2.4411876712444034, "language_loss": 0.8815223, "learning_rate": 3.904181346912895e-06, "loss": 0.90525061, "num_input_tokens_seen": 9039720, "step": 430, "time_per_iteration": 2.7483572959899902 }, { "auxiliary_loss_clip": 0.01290726, "auxiliary_loss_mlp": 0.01080495, "balance_loss_clip": 1.0803287, "balance_loss_mlp": 1.04573333, "epoch": 0.025913121899894784, "flos": 20193288762240.0, "grad_norm": 2.086180078538185, "language_loss": 0.84249514, "learning_rate": 3.905676939184698e-06, "loss": 0.8662073, "num_input_tokens_seen": 9059850, "step": 431, "time_per_iteration": 2.6531126499176025 }, { "auxiliary_loss_clip": 0.01286945, "auxiliary_loss_mlp": 0.01073345, "balance_loss_clip": 1.07570636, "balance_loss_mlp": 1.03951311, "epoch": 0.025973245152562753, "flos": 14720430983040.0, "grad_norm": 2.681931959502968, "language_loss": 0.86511916, "learning_rate": 3.907169065422638e-06, "loss": 0.88872206, "num_input_tokens_seen": 9077590, "step": 432, "time_per_iteration": 2.7582762241363525 }, { "auxiliary_loss_clip": 0.01287429, "auxiliary_loss_mlp": 0.01072961, "balance_loss_clip": 1.07632601, "balance_loss_mlp": 1.03891492, "epoch": 0.02603336840523072, "flos": 30992991534720.0, "grad_norm": 1.95596969308187, "language_loss": 0.76036298, "learning_rate": 3.908657741654636e-06, "loss": 0.7839669, "num_input_tokens_seen": 9099880, "step": 433, "time_per_iteration": 2.707771062850952 }, { "auxiliary_loss_clip": 0.01289436, "auxiliary_loss_mlp": 0.01088504, "balance_loss_clip": 1.07470191, "balance_loss_mlp": 1.04973757, "epoch": 0.026093491657898694, "flos": 17674262939520.0, "grad_norm": 2.157056093147959, "language_loss": 0.8979522, "learning_rate": 3.910142983797699e-06, "loss": 0.92173159, "num_input_tokens_seen": 9118620, "step": 434, "time_per_iteration": 2.5665409564971924 }, { "auxiliary_loss_clip": 0.01289617, "auxiliary_loss_mlp": 0.01096405, "balance_loss_clip": 1.07960439, "balance_loss_mlp": 1.05904448, "epoch": 0.026153614910566662, "flos": 17857874286720.0, "grad_norm": 2.306071945033866, "language_loss": 0.80187833, "learning_rate": 3.9116248076589305e-06, "loss": 0.82573849, "num_input_tokens_seen": 9135655, "step": 435, "time_per_iteration": 2.614440679550171 }, { "auxiliary_loss_clip": 0.01285396, "auxiliary_loss_mlp": 0.01092207, "balance_loss_clip": 1.07367229, "balance_loss_mlp": 1.05503798, "epoch": 0.02621373816323463, "flos": 20011113959040.0, "grad_norm": 3.0257040949539356, "language_loss": 0.86361396, "learning_rate": 3.913103228936546e-06, "loss": 0.88739002, "num_input_tokens_seen": 9153520, "step": 436, "time_per_iteration": 2.635033130645752 }, { "auxiliary_loss_clip": 0.01289558, "auxiliary_loss_mlp": 0.01096903, "balance_loss_clip": 1.07716811, "balance_loss_mlp": 1.06080687, "epoch": 0.0262738614159026, "flos": 19281193683840.0, "grad_norm": 2.4233286399217993, "language_loss": 0.74725163, "learning_rate": 3.914578263220868e-06, "loss": 0.77111626, "num_input_tokens_seen": 9170750, "step": 437, "time_per_iteration": 2.6614880561828613 }, { "auxiliary_loss_clip": 0.01286403, "auxiliary_loss_mlp": 0.01100399, "balance_loss_clip": 1.07628679, "balance_loss_mlp": 1.06220388, "epoch": 0.026333984668570568, "flos": 18807208790400.0, "grad_norm": 2.79370908187484, "language_loss": 0.9131338, "learning_rate": 3.916049925995316e-06, "loss": 0.93700182, "num_input_tokens_seen": 9188430, "step": 438, "time_per_iteration": 2.674877166748047 }, { "auxiliary_loss_clip": 0.01169678, "auxiliary_loss_mlp": 0.01072518, "balance_loss_clip": 1.0602653, "balance_loss_mlp": 1.06250465, "epoch": 0.02639410792123854, "flos": 64572020691840.0, "grad_norm": 0.8871275810137318, "language_loss": 0.62631273, "learning_rate": 3.917518232637377e-06, "loss": 0.64873469, "num_input_tokens_seen": 9255835, "step": 439, "time_per_iteration": 3.2527849674224854 }, { "auxiliary_loss_clip": 0.01296492, "auxiliary_loss_mlp": 0.01095184, "balance_loss_clip": 1.08175814, "balance_loss_mlp": 1.05758572, "epoch": 0.02645423117390651, "flos": 28473462921600.0, "grad_norm": 3.31985956061953, "language_loss": 0.75982475, "learning_rate": 3.918983198419573e-06, "loss": 0.78374153, "num_input_tokens_seen": 9276835, "step": 440, "time_per_iteration": 2.6770262718200684 }, { "auxiliary_loss_clip": 0.01286342, "auxiliary_loss_mlp": 0.01076505, "balance_loss_clip": 1.07652593, "balance_loss_mlp": 1.04048026, "epoch": 0.026514354426574478, "flos": 18551237495040.0, "grad_norm": 3.0236705091068283, "language_loss": 0.83197021, "learning_rate": 3.920444838510415e-06, "loss": 0.85559869, "num_input_tokens_seen": 9295075, "step": 441, "time_per_iteration": 2.591306209564209 }, { "auxiliary_loss_clip": 0.01291817, "auxiliary_loss_mlp": 0.01086154, "balance_loss_clip": 1.07703269, "balance_loss_mlp": 1.04829359, "epoch": 0.026574477679242446, "flos": 20667812359680.0, "grad_norm": 2.202684635319811, "language_loss": 0.78490162, "learning_rate": 3.92190316797534e-06, "loss": 0.80868137, "num_input_tokens_seen": 9314205, "step": 442, "time_per_iteration": 2.633054733276367 }, { "auxiliary_loss_clip": 0.0116251, "auxiliary_loss_mlp": 0.01015158, "balance_loss_clip": 1.05336332, "balance_loss_mlp": 1.0054301, "epoch": 0.026634600931910415, "flos": 57956125340160.0, "grad_norm": 0.9609264438471399, "language_loss": 0.64459753, "learning_rate": 3.92335820177765e-06, "loss": 0.66637421, "num_input_tokens_seen": 9367395, "step": 443, "time_per_iteration": 3.1241400241851807 }, { "auxiliary_loss_clip": 0.01291897, "auxiliary_loss_mlp": 0.01085882, "balance_loss_clip": 1.08147204, "balance_loss_mlp": 1.04906964, "epoch": 0.026694724184578387, "flos": 15815131827840.0, "grad_norm": 2.121488874389134, "language_loss": 0.82093638, "learning_rate": 3.924809954779425e-06, "loss": 0.84471416, "num_input_tokens_seen": 9385185, "step": 444, "time_per_iteration": 2.6202428340911865 }, { "auxiliary_loss_clip": 0.0129406, "auxiliary_loss_mlp": 0.01082041, "balance_loss_clip": 1.07940578, "balance_loss_mlp": 1.04263067, "epoch": 0.026754847437246355, "flos": 23440259612160.0, "grad_norm": 2.2213674770888607, "language_loss": 0.95689106, "learning_rate": 3.9262584417424425e-06, "loss": 0.98065209, "num_input_tokens_seen": 9403225, "step": 445, "time_per_iteration": 2.6071228981018066 }, { "auxiliary_loss_clip": 0.01289866, "auxiliary_loss_mlp": 0.01094053, "balance_loss_clip": 1.07953668, "balance_loss_mlp": 1.05492878, "epoch": 0.026814970689914324, "flos": 17341801632000.0, "grad_norm": 2.775359545549618, "language_loss": 0.91932094, "learning_rate": 3.9277036773290725e-06, "loss": 0.94316012, "num_input_tokens_seen": 9420540, "step": 446, "time_per_iteration": 2.5791916847229004 }, { "auxiliary_loss_clip": 0.01289847, "auxiliary_loss_mlp": 0.01088114, "balance_loss_clip": 1.08072042, "balance_loss_mlp": 1.05092025, "epoch": 0.026875093942582293, "flos": 17894718662400.0, "grad_norm": 2.0562763127679204, "language_loss": 0.79831308, "learning_rate": 3.92914567610317e-06, "loss": 0.82209271, "num_input_tokens_seen": 9438840, "step": 447, "time_per_iteration": 2.6420843601226807 }, { "auxiliary_loss_clip": 0.01289397, "auxiliary_loss_mlp": 0.01079607, "balance_loss_clip": 1.07901013, "balance_loss_mlp": 1.04446411, "epoch": 0.026935217195250265, "flos": 21723980889600.0, "grad_norm": 2.231264914467203, "language_loss": 0.86402845, "learning_rate": 3.930584452530952e-06, "loss": 0.8877185, "num_input_tokens_seen": 9457215, "step": 448, "time_per_iteration": 2.590277910232544 }, { "auxiliary_loss_clip": 0.01282455, "auxiliary_loss_mlp": 0.01091099, "balance_loss_clip": 1.07706833, "balance_loss_mlp": 1.05662322, "epoch": 0.026995340447918233, "flos": 23622685810560.0, "grad_norm": 1.941778256808524, "language_loss": 0.88581634, "learning_rate": 3.9320200209818755e-06, "loss": 0.90955186, "num_input_tokens_seen": 9475615, "step": 449, "time_per_iteration": 2.610065460205078 }, { "auxiliary_loss_clip": 0.01293472, "auxiliary_loss_mlp": 0.01085576, "balance_loss_clip": 1.07856452, "balance_loss_mlp": 1.04814398, "epoch": 0.027055463700586202, "flos": 17931275729280.0, "grad_norm": 2.199007921978797, "language_loss": 0.80395782, "learning_rate": 3.933452395729493e-06, "loss": 0.8277483, "num_input_tokens_seen": 9493975, "step": 450, "time_per_iteration": 2.637465238571167 }, { "auxiliary_loss_clip": 0.01284612, "auxiliary_loss_mlp": 0.0108001, "balance_loss_clip": 1.08025336, "balance_loss_mlp": 1.04384232, "epoch": 0.02711558695325417, "flos": 25118903859840.0, "grad_norm": 1.599374223212879, "language_loss": 0.81562543, "learning_rate": 3.934881590952304e-06, "loss": 0.83927161, "num_input_tokens_seen": 9514810, "step": 451, "time_per_iteration": 2.6506927013397217 }, { "auxiliary_loss_clip": 0.0128567, "auxiliary_loss_mlp": 0.01090719, "balance_loss_clip": 1.08126068, "balance_loss_mlp": 1.0533824, "epoch": 0.02717571020592214, "flos": 24239559006720.0, "grad_norm": 1.9677929562692107, "language_loss": 0.77019048, "learning_rate": 3.936307620734599e-06, "loss": 0.79395437, "num_input_tokens_seen": 9533635, "step": 452, "time_per_iteration": 2.5751442909240723 }, { "auxiliary_loss_clip": 0.01286865, "auxiliary_loss_mlp": 0.01088287, "balance_loss_clip": 1.08011293, "balance_loss_mlp": 1.05135596, "epoch": 0.02723583345859011, "flos": 25118939773440.0, "grad_norm": 1.7205362750177517, "language_loss": 0.72874546, "learning_rate": 3.937730499067294e-06, "loss": 0.75249696, "num_input_tokens_seen": 9555420, "step": 453, "time_per_iteration": 2.668083667755127 }, { "auxiliary_loss_clip": 0.01281405, "auxiliary_loss_mlp": 0.01083223, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.04748416, "epoch": 0.02729595671125808, "flos": 42741597847680.0, "grad_norm": 1.8353680194819204, "language_loss": 0.82419729, "learning_rate": 3.939150239848748e-06, "loss": 0.84784359, "num_input_tokens_seen": 9578950, "step": 454, "time_per_iteration": 2.8580126762390137 }, { "auxiliary_loss_clip": 0.01285525, "auxiliary_loss_mlp": 0.01077241, "balance_loss_clip": 1.07935429, "balance_loss_mlp": 1.043648, "epoch": 0.02735607996392605, "flos": 21430985650560.0, "grad_norm": 1.985829769195046, "language_loss": 0.75404847, "learning_rate": 3.9405668568855866e-06, "loss": 0.77767611, "num_input_tokens_seen": 9598160, "step": 455, "time_per_iteration": 2.6593477725982666 }, { "auxiliary_loss_clip": 0.01282853, "auxiliary_loss_mlp": 0.01094959, "balance_loss_clip": 1.07477236, "balance_loss_mlp": 1.0597918, "epoch": 0.027416203216594017, "flos": 20851280052480.0, "grad_norm": 1.92483069519606, "language_loss": 0.80670613, "learning_rate": 3.941980363893499e-06, "loss": 0.83048427, "num_input_tokens_seen": 9616010, "step": 456, "time_per_iteration": 2.6798384189605713 }, { "auxiliary_loss_clip": 0.01280135, "auxiliary_loss_mlp": 0.01080319, "balance_loss_clip": 1.07714963, "balance_loss_mlp": 1.0435549, "epoch": 0.027476326469261986, "flos": 13224500242560.0, "grad_norm": 2.171481572134165, "language_loss": 0.81587321, "learning_rate": 3.9433907744980384e-06, "loss": 0.83947778, "num_input_tokens_seen": 9634000, "step": 457, "time_per_iteration": 5.62308406829834 }, { "auxiliary_loss_clip": 0.01283922, "auxiliary_loss_mlp": 0.01084055, "balance_loss_clip": 1.07603848, "balance_loss_mlp": 1.04891229, "epoch": 0.027536449721929958, "flos": 24024526237440.0, "grad_norm": 2.024184269172234, "language_loss": 0.94030929, "learning_rate": 3.944798102235412e-06, "loss": 0.96398914, "num_input_tokens_seen": 9653455, "step": 458, "time_per_iteration": 5.694372653961182 }, { "auxiliary_loss_clip": 0.01280807, "auxiliary_loss_mlp": 0.01091426, "balance_loss_clip": 1.07479525, "balance_loss_mlp": 1.05666471, "epoch": 0.027596572974597926, "flos": 13006055681280.0, "grad_norm": 2.356061876390436, "language_loss": 0.79279089, "learning_rate": 3.9462023605532545e-06, "loss": 0.81651318, "num_input_tokens_seen": 9669650, "step": 459, "time_per_iteration": 2.626948595046997 }, { "auxiliary_loss_clip": 0.01286253, "auxiliary_loss_mlp": 0.01081623, "balance_loss_clip": 1.08119941, "balance_loss_mlp": 1.04278445, "epoch": 0.027656696227265895, "flos": 26143076350080.0, "grad_norm": 2.0583603779546404, "language_loss": 0.83362132, "learning_rate": 3.947603562811407e-06, "loss": 0.85730016, "num_input_tokens_seen": 9691415, "step": 460, "time_per_iteration": 2.7191598415374756 }, { "auxiliary_loss_clip": 0.01158037, "auxiliary_loss_mlp": 0.01054463, "balance_loss_clip": 1.05032754, "balance_loss_mlp": 1.044402, "epoch": 0.027716819479933864, "flos": 60697222997760.0, "grad_norm": 1.612511499168885, "language_loss": 0.7351321, "learning_rate": 3.949001722282675e-06, "loss": 0.7572571, "num_input_tokens_seen": 9755605, "step": 461, "time_per_iteration": 3.210820436477661 }, { "auxiliary_loss_clip": 0.01284234, "auxiliary_loss_mlp": 0.01079832, "balance_loss_clip": 1.08432341, "balance_loss_mlp": 1.04700136, "epoch": 0.027776942732601832, "flos": 31211938886400.0, "grad_norm": 2.4500038571081073, "language_loss": 0.81596625, "learning_rate": 3.950396852153582e-06, "loss": 0.839607, "num_input_tokens_seen": 9776270, "step": 462, "time_per_iteration": 2.683197021484375 }, { "auxiliary_loss_clip": 0.01280414, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.07752454, "balance_loss_mlp": 1.0454762, "epoch": 0.027837065985269804, "flos": 22674644196480.0, "grad_norm": 2.258526594266715, "language_loss": 0.90062451, "learning_rate": 3.951788965525118e-06, "loss": 0.92421508, "num_input_tokens_seen": 9794465, "step": 463, "time_per_iteration": 2.641674757003784 }, { "auxiliary_loss_clip": 0.01151842, "auxiliary_loss_mlp": 0.01010002, "balance_loss_clip": 1.04755902, "balance_loss_mlp": 1.00027454, "epoch": 0.027897189237937773, "flos": 62182487399040.0, "grad_norm": 0.8962796480673014, "language_loss": 0.59058654, "learning_rate": 3.953178075413476e-06, "loss": 0.61220491, "num_input_tokens_seen": 9849685, "step": 464, "time_per_iteration": 3.1129612922668457 }, { "auxiliary_loss_clip": 0.01292933, "auxiliary_loss_mlp": 0.01100533, "balance_loss_clip": 1.08296049, "balance_loss_mlp": 1.06412649, "epoch": 0.02795731249060574, "flos": 24493160004480.0, "grad_norm": 2.3712654859298055, "language_loss": 0.81454253, "learning_rate": 3.954564194750784e-06, "loss": 0.83847719, "num_input_tokens_seen": 9869505, "step": 465, "time_per_iteration": 2.723144769668579 }, { "auxiliary_loss_clip": 0.01279938, "auxiliary_loss_mlp": 0.01092668, "balance_loss_clip": 1.07546401, "balance_loss_mlp": 1.05630863, "epoch": 0.02801743574327371, "flos": 23733003456000.0, "grad_norm": 1.9968224423519798, "language_loss": 0.78396618, "learning_rate": 3.955947336385828e-06, "loss": 0.80769229, "num_input_tokens_seen": 9890950, "step": 466, "time_per_iteration": 2.6278555393218994 }, { "auxiliary_loss_clip": 0.0127853, "auxiliary_loss_mlp": 0.01091802, "balance_loss_clip": 1.07703936, "balance_loss_mlp": 1.05661178, "epoch": 0.02807755899594168, "flos": 20629100476800.0, "grad_norm": 2.010021605622182, "language_loss": 0.87699366, "learning_rate": 3.957327513084761e-06, "loss": 0.90069699, "num_input_tokens_seen": 9911265, "step": 467, "time_per_iteration": 2.6687490940093994 }, { "auxiliary_loss_clip": 0.01285129, "auxiliary_loss_mlp": 0.01112935, "balance_loss_clip": 1.07874036, "balance_loss_mlp": 1.07576585, "epoch": 0.02813768224860965, "flos": 19244564789760.0, "grad_norm": 2.2302958424490416, "language_loss": 0.86091757, "learning_rate": 3.958704737531818e-06, "loss": 0.88489819, "num_input_tokens_seen": 9929025, "step": 468, "time_per_iteration": 2.5745644569396973 }, { "auxiliary_loss_clip": 0.01281128, "auxiliary_loss_mlp": 0.01085455, "balance_loss_clip": 1.07529211, "balance_loss_mlp": 1.04857147, "epoch": 0.02819780550127762, "flos": 20813968800000.0, "grad_norm": 2.1866562002509875, "language_loss": 0.91690558, "learning_rate": 3.9600790223300065e-06, "loss": 0.94057143, "num_input_tokens_seen": 9945190, "step": 469, "time_per_iteration": 2.610821008682251 }, { "auxiliary_loss_clip": 0.0127909, "auxiliary_loss_mlp": 0.0110095, "balance_loss_clip": 1.07675052, "balance_loss_mlp": 1.06482995, "epoch": 0.028257928753945588, "flos": 19974125928960.0, "grad_norm": 2.674428223667968, "language_loss": 0.81758964, "learning_rate": 3.96145038000181e-06, "loss": 0.84139001, "num_input_tokens_seen": 9962820, "step": 470, "time_per_iteration": 2.6004326343536377 }, { "auxiliary_loss_clip": 0.0128074, "auxiliary_loss_mlp": 0.01086643, "balance_loss_clip": 1.07482624, "balance_loss_mlp": 1.04947352, "epoch": 0.028318052006613557, "flos": 20484488321280.0, "grad_norm": 1.788793606991614, "language_loss": 0.93071401, "learning_rate": 3.962818822989861e-06, "loss": 0.95438784, "num_input_tokens_seen": 9982595, "step": 471, "time_per_iteration": 2.556288719177246 }, { "auxiliary_loss_clip": 0.01273697, "auxiliary_loss_mlp": 0.0110454, "balance_loss_clip": 1.07223165, "balance_loss_mlp": 1.06884849, "epoch": 0.02837817525928153, "flos": 28514832410880.0, "grad_norm": 1.8550872135639116, "language_loss": 0.7613501, "learning_rate": 3.964184363657625e-06, "loss": 0.78513247, "num_input_tokens_seen": 10004645, "step": 472, "time_per_iteration": 2.667804002761841 }, { "auxiliary_loss_clip": 0.01280341, "auxiliary_loss_mlp": 0.01090649, "balance_loss_clip": 1.07279634, "balance_loss_mlp": 1.05624473, "epoch": 0.028438298511949497, "flos": 18551668458240.0, "grad_norm": 1.9914661475951314, "language_loss": 0.93097353, "learning_rate": 3.965547014290071e-06, "loss": 0.95468336, "num_input_tokens_seen": 10022555, "step": 473, "time_per_iteration": 2.6402342319488525 }, { "auxiliary_loss_clip": 0.01287339, "auxiliary_loss_mlp": 0.01124194, "balance_loss_clip": 1.07773685, "balance_loss_mlp": 1.08979011, "epoch": 0.028498421764617466, "flos": 16910227722240.0, "grad_norm": 3.2560638787193237, "language_loss": 0.88488632, "learning_rate": 3.96690678709433e-06, "loss": 0.90900171, "num_input_tokens_seen": 10041025, "step": 474, "time_per_iteration": 2.5853888988494873 }, { "auxiliary_loss_clip": 0.0127783, "auxiliary_loss_mlp": 0.01093132, "balance_loss_clip": 1.07535374, "balance_loss_mlp": 1.05620146, "epoch": 0.028558545017285435, "flos": 27778699082880.0, "grad_norm": 3.1427023167402006, "language_loss": 0.78901398, "learning_rate": 3.968263694200355e-06, "loss": 0.81272364, "num_input_tokens_seen": 10060775, "step": 475, "time_per_iteration": 2.654519557952881 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01095224, "balance_loss_clip": 1.04505777, "balance_loss_mlp": 1.08583021, "epoch": 0.028618668269953403, "flos": 65654367258240.0, "grad_norm": 0.9280065830162254, "language_loss": 0.66926932, "learning_rate": 3.969617747661569e-06, "loss": 0.6917026, "num_input_tokens_seen": 10120225, "step": 476, "time_per_iteration": 3.1292569637298584 }, { "auxiliary_loss_clip": 0.01279748, "auxiliary_loss_mlp": 0.01088794, "balance_loss_clip": 1.07638311, "balance_loss_mlp": 1.05188656, "epoch": 0.028678791522621375, "flos": 21937074324480.0, "grad_norm": 2.985672001195028, "language_loss": 0.83807188, "learning_rate": 3.970968959455509e-06, "loss": 0.86175728, "num_input_tokens_seen": 10137880, "step": 477, "time_per_iteration": 2.651493549346924 }, { "auxiliary_loss_clip": 0.01284956, "auxiliary_loss_mlp": 0.0108711, "balance_loss_clip": 1.07924342, "balance_loss_mlp": 1.05089426, "epoch": 0.028738914775289344, "flos": 24572128055040.0, "grad_norm": 2.1929055744411943, "language_loss": 0.8233152, "learning_rate": 3.97231734148446e-06, "loss": 0.84703588, "num_input_tokens_seen": 10156930, "step": 478, "time_per_iteration": 2.6986753940582275 }, { "auxiliary_loss_clip": 0.01277687, "auxiliary_loss_mlp": 0.01080644, "balance_loss_clip": 1.07448888, "balance_loss_mlp": 1.04500043, "epoch": 0.028799038027957313, "flos": 23257977068160.0, "grad_norm": 4.057107988717453, "language_loss": 0.81195259, "learning_rate": 3.973662905576082e-06, "loss": 0.83553594, "num_input_tokens_seen": 10176295, "step": 479, "time_per_iteration": 2.6321041584014893 }, { "auxiliary_loss_clip": 0.01273765, "auxiliary_loss_mlp": 0.01083313, "balance_loss_clip": 1.07335579, "balance_loss_mlp": 1.04552341, "epoch": 0.02885916128062528, "flos": 22164102236160.0, "grad_norm": 2.352225573775279, "language_loss": 0.7335608, "learning_rate": 3.975005663484038e-06, "loss": 0.75713164, "num_input_tokens_seen": 10195790, "step": 480, "time_per_iteration": 2.650696277618408 }, { "auxiliary_loss_clip": 0.01273107, "auxiliary_loss_mlp": 0.01075586, "balance_loss_clip": 1.07424879, "balance_loss_mlp": 1.04277968, "epoch": 0.02891928453329325, "flos": 22932842135040.0, "grad_norm": 1.867890428108999, "language_loss": 0.87560165, "learning_rate": 3.976345626888605e-06, "loss": 0.89908862, "num_input_tokens_seen": 10218405, "step": 481, "time_per_iteration": 2.6585533618927 }, { "auxiliary_loss_clip": 0.01142103, "auxiliary_loss_mlp": 0.01017301, "balance_loss_clip": 1.04286921, "balance_loss_mlp": 1.00895679, "epoch": 0.028979407785961222, "flos": 57432941792640.0, "grad_norm": 0.8486437303263991, "language_loss": 0.66030192, "learning_rate": 3.9776828073972864e-06, "loss": 0.68189597, "num_input_tokens_seen": 10271005, "step": 482, "time_per_iteration": 2.9788918495178223 }, { "auxiliary_loss_clip": 0.01287904, "auxiliary_loss_mlp": 0.01082416, "balance_loss_clip": 1.07739437, "balance_loss_mlp": 1.04868007, "epoch": 0.02903953103862919, "flos": 16722737706240.0, "grad_norm": 2.6473263724689873, "language_loss": 0.7899214, "learning_rate": 3.979017216545415e-06, "loss": 0.81362462, "num_input_tokens_seen": 10288405, "step": 483, "time_per_iteration": 2.5642752647399902 }, { "auxiliary_loss_clip": 0.01283775, "auxiliary_loss_mlp": 0.01097438, "balance_loss_clip": 1.07794189, "balance_loss_mlp": 1.06155562, "epoch": 0.02909965429129716, "flos": 16763640318720.0, "grad_norm": 2.6777328906555766, "language_loss": 0.75510043, "learning_rate": 3.980348865796749e-06, "loss": 0.77891254, "num_input_tokens_seen": 10306875, "step": 484, "time_per_iteration": 2.608337640762329 }, { "auxiliary_loss_clip": 0.0127962, "auxiliary_loss_mlp": 0.01081582, "balance_loss_clip": 1.07543373, "balance_loss_mlp": 1.04760778, "epoch": 0.029159777543965128, "flos": 19785343023360.0, "grad_norm": 2.3457282915841113, "language_loss": 0.8378315, "learning_rate": 3.9816777665440615e-06, "loss": 0.86144352, "num_input_tokens_seen": 10323965, "step": 485, "time_per_iteration": 2.591409921646118 }, { "auxiliary_loss_clip": 0.01282377, "auxiliary_loss_mlp": 0.01084922, "balance_loss_clip": 1.08029485, "balance_loss_mlp": 1.04956484, "epoch": 0.029219900796633096, "flos": 19642670202240.0, "grad_norm": 2.044831141886674, "language_loss": 0.84432101, "learning_rate": 3.983003930109732e-06, "loss": 0.86799401, "num_input_tokens_seen": 10342620, "step": 486, "time_per_iteration": 2.7101452350616455 }, { "auxiliary_loss_clip": 0.01276806, "auxiliary_loss_mlp": 0.01090739, "balance_loss_clip": 1.07363296, "balance_loss_mlp": 1.05476189, "epoch": 0.02928002404930107, "flos": 25885704424320.0, "grad_norm": 12.432525192672303, "language_loss": 0.88968349, "learning_rate": 3.984327367746315e-06, "loss": 0.91335887, "num_input_tokens_seen": 10364610, "step": 487, "time_per_iteration": 2.637910842895508 }, { "auxiliary_loss_clip": 0.01283084, "auxiliary_loss_mlp": 0.01069223, "balance_loss_clip": 1.07921362, "balance_loss_mlp": 1.03677416, "epoch": 0.029340147301969037, "flos": 20660234590080.0, "grad_norm": 2.566388301054309, "language_loss": 0.88581878, "learning_rate": 3.985648090637122e-06, "loss": 0.90934181, "num_input_tokens_seen": 10380910, "step": 488, "time_per_iteration": 2.6569244861602783 }, { "auxiliary_loss_clip": 0.01275613, "auxiliary_loss_mlp": 0.01081415, "balance_loss_clip": 1.07419777, "balance_loss_mlp": 1.04667735, "epoch": 0.029400270554637006, "flos": 24428018689920.0, "grad_norm": 2.0135021623582503, "language_loss": 0.88869834, "learning_rate": 3.986966109896785e-06, "loss": 0.91226858, "num_input_tokens_seen": 10400665, "step": 489, "time_per_iteration": 2.805555582046509 }, { "auxiliary_loss_clip": 0.01271096, "auxiliary_loss_mlp": 0.01077182, "balance_loss_clip": 1.0704807, "balance_loss_mlp": 1.04168141, "epoch": 0.029460393807304974, "flos": 20120892900480.0, "grad_norm": 2.807428314395572, "language_loss": 0.88554472, "learning_rate": 3.988281436571815e-06, "loss": 0.90902752, "num_input_tokens_seen": 10420150, "step": 490, "time_per_iteration": 2.612993001937866 }, { "auxiliary_loss_clip": 0.01276687, "auxiliary_loss_mlp": 0.01088031, "balance_loss_clip": 1.0729506, "balance_loss_mlp": 1.0536747, "epoch": 0.029520517059972943, "flos": 17675914965120.0, "grad_norm": 2.430337539839543, "language_loss": 0.91496718, "learning_rate": 3.989594081641164e-06, "loss": 0.93861437, "num_input_tokens_seen": 10438210, "step": 491, "time_per_iteration": 2.6203627586364746 }, { "auxiliary_loss_clip": 0.01266864, "auxiliary_loss_mlp": 0.01072939, "balance_loss_clip": 1.07131863, "balance_loss_mlp": 1.03984618, "epoch": 0.029580640312640915, "flos": 18953185662720.0, "grad_norm": 1.9753258841331502, "language_loss": 0.85654163, "learning_rate": 3.9909040560167675e-06, "loss": 0.87993968, "num_input_tokens_seen": 10455125, "step": 492, "time_per_iteration": 2.636378288269043 }, { "auxiliary_loss_clip": 0.01279009, "auxiliary_loss_mlp": 0.01100381, "balance_loss_clip": 1.07765996, "balance_loss_mlp": 1.06471384, "epoch": 0.029640763565308884, "flos": 18726121837440.0, "grad_norm": 4.076790847855052, "language_loss": 0.84615922, "learning_rate": 3.992211370544093e-06, "loss": 0.86995316, "num_input_tokens_seen": 10470990, "step": 493, "time_per_iteration": 2.6144914627075195 }, { "auxiliary_loss_clip": 0.01272514, "auxiliary_loss_mlp": 0.01074657, "balance_loss_clip": 1.07140934, "balance_loss_mlp": 1.04042029, "epoch": 0.029700886817976852, "flos": 20595308757120.0, "grad_norm": 1.8084917907335818, "language_loss": 0.8658669, "learning_rate": 3.99351603600268e-06, "loss": 0.88933873, "num_input_tokens_seen": 10490685, "step": 494, "time_per_iteration": 2.7063095569610596 }, { "auxiliary_loss_clip": 0.01281688, "auxiliary_loss_mlp": 0.01084428, "balance_loss_clip": 1.07739305, "balance_loss_mlp": 1.05279028, "epoch": 0.02976101007064482, "flos": 22236857233920.0, "grad_norm": 7.125038043922513, "language_loss": 0.86841047, "learning_rate": 3.994818063106668e-06, "loss": 0.8920716, "num_input_tokens_seen": 10509435, "step": 495, "time_per_iteration": 2.641700267791748 }, { "auxiliary_loss_clip": 0.01268945, "auxiliary_loss_mlp": 0.01078198, "balance_loss_clip": 1.07384837, "balance_loss_mlp": 1.04508162, "epoch": 0.029821133323312793, "flos": 23732644320000.0, "grad_norm": 2.201071528053665, "language_loss": 0.61988759, "learning_rate": 3.99611746250533e-06, "loss": 0.64335901, "num_input_tokens_seen": 10530050, "step": 496, "time_per_iteration": 2.6524407863616943 }, { "auxiliary_loss_clip": 0.01270994, "auxiliary_loss_mlp": 0.01089922, "balance_loss_clip": 1.07575428, "balance_loss_mlp": 1.05680561, "epoch": 0.02988125657598076, "flos": 22419498913920.0, "grad_norm": 1.7538974268426115, "language_loss": 0.88820887, "learning_rate": 3.997414244783595e-06, "loss": 0.91181797, "num_input_tokens_seen": 10551370, "step": 497, "time_per_iteration": 5.648245811462402 }, { "auxiliary_loss_clip": 0.01277289, "auxiliary_loss_mlp": 0.01079642, "balance_loss_clip": 1.07670021, "balance_loss_mlp": 1.04604888, "epoch": 0.02994137982864873, "flos": 13845108453120.0, "grad_norm": 2.8395997319333204, "language_loss": 0.85091698, "learning_rate": 3.998708420462557e-06, "loss": 0.87448633, "num_input_tokens_seen": 10569225, "step": 498, "time_per_iteration": 4.362173080444336 }, { "auxiliary_loss_clip": 0.0127249, "auxiliary_loss_mlp": 0.01078673, "balance_loss_clip": 1.07436109, "balance_loss_mlp": 1.04691589, "epoch": 0.0300015030813167, "flos": 23908354675200.0, "grad_norm": 3.2275044857926605, "language_loss": 0.77883017, "learning_rate": 4e-06, "loss": 0.80234182, "num_input_tokens_seen": 10586170, "step": 499, "time_per_iteration": 2.6029655933380127 }, { "auxiliary_loss_clip": 0.01272525, "auxiliary_loss_mlp": 0.01082339, "balance_loss_clip": 1.07433248, "balance_loss_mlp": 1.04905546, "epoch": 0.030061626333984667, "flos": 22016796560640.0, "grad_norm": 2.244229511477372, "language_loss": 0.82687509, "learning_rate": 3.9999999620799e-06, "loss": 0.85042375, "num_input_tokens_seen": 10606205, "step": 500, "time_per_iteration": 2.6293113231658936 }, { "auxiliary_loss_clip": 0.01266453, "auxiliary_loss_mlp": 0.0108458, "balance_loss_clip": 1.07100737, "balance_loss_mlp": 1.04922247, "epoch": 0.03012174958665264, "flos": 23039747988480.0, "grad_norm": 3.2569274145363356, "language_loss": 0.88086087, "learning_rate": 3.9999998483196e-06, "loss": 0.90437114, "num_input_tokens_seen": 10625995, "step": 501, "time_per_iteration": 2.601081132888794 }, { "auxiliary_loss_clip": 0.01273997, "auxiliary_loss_mlp": 0.01071746, "balance_loss_clip": 1.07361674, "balance_loss_mlp": 1.04025102, "epoch": 0.030181872839320608, "flos": 18953257489920.0, "grad_norm": 3.3627001763511855, "language_loss": 0.86654103, "learning_rate": 3.9999996587191065e-06, "loss": 0.88999844, "num_input_tokens_seen": 10644105, "step": 502, "time_per_iteration": 2.5507659912109375 }, { "auxiliary_loss_clip": 0.01270542, "auxiliary_loss_mlp": 0.01081534, "balance_loss_clip": 1.07475543, "balance_loss_mlp": 1.04827452, "epoch": 0.030241996091988577, "flos": 16728017005440.0, "grad_norm": 2.4572357458963876, "language_loss": 0.84281206, "learning_rate": 3.999999393278425e-06, "loss": 0.86633277, "num_input_tokens_seen": 10661090, "step": 503, "time_per_iteration": 2.618587017059326 }, { "auxiliary_loss_clip": 0.01262547, "auxiliary_loss_mlp": 0.01091143, "balance_loss_clip": 1.0710721, "balance_loss_mlp": 1.05781209, "epoch": 0.030302119344656545, "flos": 28621271387520.0, "grad_norm": 1.6994359255159197, "language_loss": 0.88137805, "learning_rate": 3.999999051997567e-06, "loss": 0.90491492, "num_input_tokens_seen": 10682380, "step": 504, "time_per_iteration": 2.6794183254241943 }, { "auxiliary_loss_clip": 0.01264601, "auxiliary_loss_mlp": 0.01086749, "balance_loss_clip": 1.07040262, "balance_loss_mlp": 1.0541091, "epoch": 0.030362242597324514, "flos": 15669334523520.0, "grad_norm": 2.074855698516145, "language_loss": 0.786093, "learning_rate": 3.9999986348765425e-06, "loss": 0.80960649, "num_input_tokens_seen": 10699925, "step": 505, "time_per_iteration": 2.564960479736328 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.010147, "balance_loss_clip": 1.03763247, "balance_loss_mlp": 1.00692737, "epoch": 0.030422365849992486, "flos": 72125973676800.0, "grad_norm": 0.9565689962416369, "language_loss": 0.54981297, "learning_rate": 3.999998141915371e-06, "loss": 0.57130682, "num_input_tokens_seen": 10766525, "step": 506, "time_per_iteration": 3.3345654010772705 }, { "auxiliary_loss_clip": 0.01266577, "auxiliary_loss_mlp": 0.01090299, "balance_loss_clip": 1.07119894, "balance_loss_mlp": 1.05687308, "epoch": 0.030482489102660455, "flos": 19427817000960.0, "grad_norm": 2.2738865373146684, "language_loss": 0.83377159, "learning_rate": 3.999997573114069e-06, "loss": 0.8573404, "num_input_tokens_seen": 10786725, "step": 507, "time_per_iteration": 2.645613670349121 }, { "auxiliary_loss_clip": 0.01269938, "auxiliary_loss_mlp": 0.01076205, "balance_loss_clip": 1.07151937, "balance_loss_mlp": 1.04344678, "epoch": 0.030542612355328423, "flos": 20375822701440.0, "grad_norm": 2.375369924968869, "language_loss": 0.88842839, "learning_rate": 3.999996928472659e-06, "loss": 0.91188985, "num_input_tokens_seen": 10805390, "step": 508, "time_per_iteration": 2.617283344268799 }, { "auxiliary_loss_clip": 0.01272148, "auxiliary_loss_mlp": 0.01067206, "balance_loss_clip": 1.07232118, "balance_loss_mlp": 1.03394616, "epoch": 0.030602735607996392, "flos": 34677354297600.0, "grad_norm": 6.964954749829821, "language_loss": 0.71807706, "learning_rate": 3.999996207991165e-06, "loss": 0.74147063, "num_input_tokens_seen": 10828030, "step": 509, "time_per_iteration": 2.7723498344421387 }, { "auxiliary_loss_clip": 0.01264594, "auxiliary_loss_mlp": 0.01074377, "balance_loss_clip": 1.07241154, "balance_loss_mlp": 1.04333544, "epoch": 0.03066285886066436, "flos": 23658668259840.0, "grad_norm": 1.9285974370038053, "language_loss": 0.82031929, "learning_rate": 3.999995411669614e-06, "loss": 0.84370899, "num_input_tokens_seen": 10845240, "step": 510, "time_per_iteration": 2.6254217624664307 }, { "auxiliary_loss_clip": 0.01268793, "auxiliary_loss_mlp": 0.01075379, "balance_loss_clip": 1.07532823, "balance_loss_mlp": 1.04252458, "epoch": 0.030722982113332332, "flos": 23002975440000.0, "grad_norm": 5.706057095430757, "language_loss": 0.83572316, "learning_rate": 3.999994539508036e-06, "loss": 0.85916495, "num_input_tokens_seen": 10864325, "step": 511, "time_per_iteration": 2.613457441329956 }, { "auxiliary_loss_clip": 0.01269742, "auxiliary_loss_mlp": 0.01081314, "balance_loss_clip": 1.07207167, "balance_loss_mlp": 1.0496521, "epoch": 0.0307831053660003, "flos": 24750855152640.0, "grad_norm": 2.025270681093948, "language_loss": 0.82109964, "learning_rate": 3.9999935915064655e-06, "loss": 0.84461015, "num_input_tokens_seen": 10883860, "step": 512, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01266054, "auxiliary_loss_mlp": 0.01084436, "balance_loss_clip": 1.07086158, "balance_loss_mlp": 1.05070007, "epoch": 0.03084322861866827, "flos": 26140885620480.0, "grad_norm": 2.500363981205655, "language_loss": 0.86933553, "learning_rate": 3.9999925676649374e-06, "loss": 0.89284045, "num_input_tokens_seen": 10904555, "step": 513, "time_per_iteration": 2.671926259994507 }, { "auxiliary_loss_clip": 0.01272542, "auxiliary_loss_mlp": 0.01080065, "balance_loss_clip": 1.07461214, "balance_loss_mlp": 1.04744935, "epoch": 0.03090335187133624, "flos": 18771298168320.0, "grad_norm": 1.704575426690477, "language_loss": 0.79124331, "learning_rate": 3.999991467983491e-06, "loss": 0.81476939, "num_input_tokens_seen": 10923700, "step": 514, "time_per_iteration": 2.6158573627471924 }, { "auxiliary_loss_clip": 0.01265821, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.07397485, "balance_loss_mlp": 1.03711247, "epoch": 0.030963475124004207, "flos": 23221886878080.0, "grad_norm": 2.729063628201222, "language_loss": 0.77758944, "learning_rate": 3.999990292462167e-06, "loss": 0.80092615, "num_input_tokens_seen": 10942730, "step": 515, "time_per_iteration": 2.636294364929199 }, { "auxiliary_loss_clip": 0.0126398, "auxiliary_loss_mlp": 0.01072575, "balance_loss_clip": 1.06835747, "balance_loss_mlp": 1.03874326, "epoch": 0.03102359837667218, "flos": 42525595411200.0, "grad_norm": 2.1228851207681503, "language_loss": 0.82452714, "learning_rate": 3.999989041101011e-06, "loss": 0.84789264, "num_input_tokens_seen": 10967120, "step": 516, "time_per_iteration": 2.8078057765960693 }, { "auxiliary_loss_clip": 0.01263726, "auxiliary_loss_mlp": 0.01073859, "balance_loss_clip": 1.0712111, "balance_loss_mlp": 1.04090929, "epoch": 0.031083721629340148, "flos": 21176953689600.0, "grad_norm": 1.9016724574566626, "language_loss": 0.79088318, "learning_rate": 3.999987713900071e-06, "loss": 0.81425899, "num_input_tokens_seen": 10986775, "step": 517, "time_per_iteration": 2.5935981273651123 }, { "auxiliary_loss_clip": 0.0125895, "auxiliary_loss_mlp": 0.0107836, "balance_loss_clip": 1.07049131, "balance_loss_mlp": 1.04629326, "epoch": 0.031143844882008116, "flos": 29716187713920.0, "grad_norm": 1.6829619528007147, "language_loss": 0.90798068, "learning_rate": 3.999986310859396e-06, "loss": 0.93135381, "num_input_tokens_seen": 11011360, "step": 518, "time_per_iteration": 2.6855509281158447 }, { "auxiliary_loss_clip": 0.01272237, "auxiliary_loss_mlp": 0.01097567, "balance_loss_clip": 1.07848859, "balance_loss_mlp": 1.06230497, "epoch": 0.031203968134676085, "flos": 23112467072640.0, "grad_norm": 1.8835331125391583, "language_loss": 0.86759162, "learning_rate": 3.999984831979039e-06, "loss": 0.89128959, "num_input_tokens_seen": 11030150, "step": 519, "time_per_iteration": 2.628380060195923 }, { "auxiliary_loss_clip": 0.01265864, "auxiliary_loss_mlp": 0.01086943, "balance_loss_clip": 1.06901193, "balance_loss_mlp": 1.05578136, "epoch": 0.03126409138734405, "flos": 20954379064320.0, "grad_norm": 3.8823628482318164, "language_loss": 0.87246573, "learning_rate": 3.999983277259057e-06, "loss": 0.89599377, "num_input_tokens_seen": 11049145, "step": 520, "time_per_iteration": 2.5850255489349365 }, { "auxiliary_loss_clip": 0.01269157, "auxiliary_loss_mlp": 0.01086266, "balance_loss_clip": 1.07231963, "balance_loss_mlp": 1.0528394, "epoch": 0.031324214640012026, "flos": 21650112570240.0, "grad_norm": 1.7050130216714323, "language_loss": 0.89274424, "learning_rate": 3.999981646699509e-06, "loss": 0.91629851, "num_input_tokens_seen": 11068835, "step": 521, "time_per_iteration": 2.6412506103515625 }, { "auxiliary_loss_clip": 0.01263772, "auxiliary_loss_mlp": 0.01082584, "balance_loss_clip": 1.0717473, "balance_loss_mlp": 1.04827595, "epoch": 0.03138433789267999, "flos": 23441337020160.0, "grad_norm": 2.085624200373119, "language_loss": 0.71452564, "learning_rate": 3.999979940300456e-06, "loss": 0.73798925, "num_input_tokens_seen": 11088980, "step": 522, "time_per_iteration": 2.6561174392700195 }, { "auxiliary_loss_clip": 0.01265725, "auxiliary_loss_mlp": 0.01082552, "balance_loss_clip": 1.06871116, "balance_loss_mlp": 1.05079484, "epoch": 0.03144446114534796, "flos": 18982164960000.0, "grad_norm": 4.223323698032832, "language_loss": 0.84758592, "learning_rate": 3.999978158061963e-06, "loss": 0.87106872, "num_input_tokens_seen": 11104300, "step": 523, "time_per_iteration": 2.608565330505371 }, { "auxiliary_loss_clip": 0.01271589, "auxiliary_loss_mlp": 0.01076253, "balance_loss_clip": 1.07193565, "balance_loss_mlp": 1.04296994, "epoch": 0.031504584398015935, "flos": 22637692080000.0, "grad_norm": 2.324094801199308, "language_loss": 0.89989722, "learning_rate": 3.999976299984099e-06, "loss": 0.92337573, "num_input_tokens_seen": 11123335, "step": 524, "time_per_iteration": 2.68269944190979 }, { "auxiliary_loss_clip": 0.01273471, "auxiliary_loss_mlp": 0.0108318, "balance_loss_clip": 1.07427168, "balance_loss_mlp": 1.04944324, "epoch": 0.0315647076506839, "flos": 25297056339840.0, "grad_norm": 2.4635323942475766, "language_loss": 0.80114233, "learning_rate": 3.999974366066933e-06, "loss": 0.82470882, "num_input_tokens_seen": 11140880, "step": 525, "time_per_iteration": 2.6396324634552 }, { "auxiliary_loss_clip": 0.01264716, "auxiliary_loss_mlp": 0.01080959, "balance_loss_clip": 1.0681529, "balance_loss_mlp": 1.04798603, "epoch": 0.03162483090335187, "flos": 16982839065600.0, "grad_norm": 2.3553733144031948, "language_loss": 0.81162, "learning_rate": 3.999972356310538e-06, "loss": 0.83507675, "num_input_tokens_seen": 11158710, "step": 526, "time_per_iteration": 2.6167168617248535 }, { "auxiliary_loss_clip": 0.01273987, "auxiliary_loss_mlp": 0.01072725, "balance_loss_clip": 1.07507181, "balance_loss_mlp": 1.03736734, "epoch": 0.03168495415601984, "flos": 18734489706240.0, "grad_norm": 1.9666844995001491, "language_loss": 0.81491739, "learning_rate": 3.999970270714991e-06, "loss": 0.83838451, "num_input_tokens_seen": 11177550, "step": 527, "time_per_iteration": 2.580310821533203 }, { "auxiliary_loss_clip": 0.01261155, "auxiliary_loss_mlp": 0.01080842, "balance_loss_clip": 1.06786597, "balance_loss_mlp": 1.04717755, "epoch": 0.03174507740868781, "flos": 21214875473280.0, "grad_norm": 1.9105688869262756, "language_loss": 0.93801636, "learning_rate": 3.999968109280371e-06, "loss": 0.96143627, "num_input_tokens_seen": 11196230, "step": 528, "time_per_iteration": 2.5901002883911133 }, { "auxiliary_loss_clip": 0.01263275, "auxiliary_loss_mlp": 0.01071724, "balance_loss_clip": 1.06776333, "balance_loss_mlp": 1.0387274, "epoch": 0.03180520066135578, "flos": 24787663614720.0, "grad_norm": 1.8924176613796981, "language_loss": 0.84130204, "learning_rate": 3.99996587200676e-06, "loss": 0.86465204, "num_input_tokens_seen": 11214935, "step": 529, "time_per_iteration": 2.593867063522339 }, { "auxiliary_loss_clip": 0.01266309, "auxiliary_loss_mlp": 0.01088988, "balance_loss_clip": 1.07501197, "balance_loss_mlp": 1.0563724, "epoch": 0.03186532391402375, "flos": 24864261367680.0, "grad_norm": 2.316883777672742, "language_loss": 0.90458709, "learning_rate": 3.999963558894243e-06, "loss": 0.92814004, "num_input_tokens_seen": 11235310, "step": 530, "time_per_iteration": 2.5994982719421387 }, { "auxiliary_loss_clip": 0.01261024, "auxiliary_loss_mlp": 0.0107627, "balance_loss_clip": 1.06481552, "balance_loss_mlp": 1.04188991, "epoch": 0.03192544716669172, "flos": 21215055041280.0, "grad_norm": 2.2744046769674324, "language_loss": 0.76334512, "learning_rate": 3.999961169942907e-06, "loss": 0.78671807, "num_input_tokens_seen": 11254425, "step": 531, "time_per_iteration": 2.618149757385254 }, { "auxiliary_loss_clip": 0.01260981, "auxiliary_loss_mlp": 0.01064937, "balance_loss_clip": 1.0669558, "balance_loss_mlp": 1.03143883, "epoch": 0.03198557041935969, "flos": 24353216616960.0, "grad_norm": 2.467757262816931, "language_loss": 0.90483695, "learning_rate": 3.999958705152843e-06, "loss": 0.92809618, "num_input_tokens_seen": 11274595, "step": 532, "time_per_iteration": 2.647947072982788 }, { "auxiliary_loss_clip": 0.01146464, "auxiliary_loss_mlp": 0.01012028, "balance_loss_clip": 1.04988623, "balance_loss_mlp": 1.00325394, "epoch": 0.032045693672027656, "flos": 61827367587840.0, "grad_norm": 1.9655071928838626, "language_loss": 0.57953775, "learning_rate": 3.9999561645241445e-06, "loss": 0.60112268, "num_input_tokens_seen": 11336705, "step": 533, "time_per_iteration": 3.2502808570861816 }, { "auxiliary_loss_clip": 0.01260941, "auxiliary_loss_mlp": 0.01084263, "balance_loss_clip": 1.06724441, "balance_loss_mlp": 1.0516715, "epoch": 0.03210581692469563, "flos": 28401174800640.0, "grad_norm": 1.7138682169725878, "language_loss": 0.86666048, "learning_rate": 3.999953548056907e-06, "loss": 0.89011252, "num_input_tokens_seen": 11356820, "step": 534, "time_per_iteration": 2.678739070892334 }, { "auxiliary_loss_clip": 0.01259554, "auxiliary_loss_mlp": 0.01066669, "balance_loss_clip": 1.06782031, "balance_loss_mlp": 1.03407741, "epoch": 0.03216594017736359, "flos": 24717709877760.0, "grad_norm": 2.12774196295415, "language_loss": 0.77627808, "learning_rate": 3.999950855751232e-06, "loss": 0.79954034, "num_input_tokens_seen": 11376645, "step": 535, "time_per_iteration": 2.7128217220306396 }, { "auxiliary_loss_clip": 0.01261708, "auxiliary_loss_mlp": 0.01081378, "balance_loss_clip": 1.06843078, "balance_loss_mlp": 1.0485003, "epoch": 0.032226063430031565, "flos": 31175453646720.0, "grad_norm": 3.9913279940153585, "language_loss": 0.80939913, "learning_rate": 3.999948087607219e-06, "loss": 0.83283001, "num_input_tokens_seen": 11397310, "step": 536, "time_per_iteration": 2.7490127086639404 }, { "auxiliary_loss_clip": 0.01262237, "auxiliary_loss_mlp": 0.01075987, "balance_loss_clip": 1.06839073, "balance_loss_mlp": 1.04167831, "epoch": 0.03228618668269954, "flos": 32198225506560.0, "grad_norm": 1.6888601787189168, "language_loss": 0.7009111, "learning_rate": 3.999945243624975e-06, "loss": 0.72429335, "num_input_tokens_seen": 11418475, "step": 537, "time_per_iteration": 5.5609166622161865 }, { "auxiliary_loss_clip": 0.0126357, "auxiliary_loss_mlp": 0.01084205, "balance_loss_clip": 1.07331729, "balance_loss_mlp": 1.05161297, "epoch": 0.0323463099353675, "flos": 22670154996480.0, "grad_norm": 2.146306428033486, "language_loss": 0.82684958, "learning_rate": 3.999942323804607e-06, "loss": 0.85032725, "num_input_tokens_seen": 11436630, "step": 538, "time_per_iteration": 2.5465030670166016 }, { "auxiliary_loss_clip": 0.01269537, "auxiliary_loss_mlp": 0.01078099, "balance_loss_clip": 1.06987572, "balance_loss_mlp": 1.04536414, "epoch": 0.032406433188035474, "flos": 26905172232960.0, "grad_norm": 1.8709064214989917, "language_loss": 0.79146457, "learning_rate": 3.999939328146225e-06, "loss": 0.81494099, "num_input_tokens_seen": 11457275, "step": 539, "time_per_iteration": 4.172123432159424 }, { "auxiliary_loss_clip": 0.0126143, "auxiliary_loss_mlp": 0.01069528, "balance_loss_clip": 1.06830835, "balance_loss_mlp": 1.03567231, "epoch": 0.03246655644070344, "flos": 31503928544640.0, "grad_norm": 35.59051030008172, "language_loss": 0.77379727, "learning_rate": 3.999936256649943e-06, "loss": 0.79710686, "num_input_tokens_seen": 11476925, "step": 540, "time_per_iteration": 2.5633046627044678 }, { "auxiliary_loss_clip": 0.01269863, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.07271969, "balance_loss_mlp": 1.04124355, "epoch": 0.03252667969337141, "flos": 23218331431680.0, "grad_norm": 2.0489065110302636, "language_loss": 0.85458571, "learning_rate": 3.999933109315878e-06, "loss": 0.878021, "num_input_tokens_seen": 11496830, "step": 541, "time_per_iteration": 2.6079938411712646 }, { "auxiliary_loss_clip": 0.01258504, "auxiliary_loss_mlp": 0.01082451, "balance_loss_clip": 1.06961954, "balance_loss_mlp": 1.04835749, "epoch": 0.032586802946039384, "flos": 14757454926720.0, "grad_norm": 2.674731240129174, "language_loss": 0.89234567, "learning_rate": 3.9999298861441496e-06, "loss": 0.91575521, "num_input_tokens_seen": 11515605, "step": 542, "time_per_iteration": 2.597036600112915 }, { "auxiliary_loss_clip": 0.0126351, "auxiliary_loss_mlp": 0.01081041, "balance_loss_clip": 1.06974792, "balance_loss_mlp": 1.04792452, "epoch": 0.03264692619870735, "flos": 24280677100800.0, "grad_norm": 2.2714121360014334, "language_loss": 0.71123677, "learning_rate": 3.999926587134879e-06, "loss": 0.73468232, "num_input_tokens_seen": 11536230, "step": 543, "time_per_iteration": 2.634601354598999 }, { "auxiliary_loss_clip": 0.01259994, "auxiliary_loss_mlp": 0.01088763, "balance_loss_clip": 1.06379187, "balance_loss_mlp": 1.05545604, "epoch": 0.03270704945137532, "flos": 22893160584960.0, "grad_norm": 4.777521083182084, "language_loss": 0.91540575, "learning_rate": 3.999923212288192e-06, "loss": 0.93889332, "num_input_tokens_seen": 11554715, "step": 544, "time_per_iteration": 2.6173009872436523 }, { "auxiliary_loss_clip": 0.01264485, "auxiliary_loss_mlp": 0.01085684, "balance_loss_clip": 1.06989884, "balance_loss_mlp": 1.05571437, "epoch": 0.032767172704043286, "flos": 18041018757120.0, "grad_norm": 2.6951315012120025, "language_loss": 0.65799558, "learning_rate": 3.999919761604216e-06, "loss": 0.68149722, "num_input_tokens_seen": 11571370, "step": 545, "time_per_iteration": 2.6500988006591797 }, { "auxiliary_loss_clip": 0.012623, "auxiliary_loss_mlp": 0.0107161, "balance_loss_clip": 1.06693912, "balance_loss_mlp": 1.0393517, "epoch": 0.03282729595671126, "flos": 22528739151360.0, "grad_norm": 2.2564766449723908, "language_loss": 0.92221987, "learning_rate": 3.999916235083083e-06, "loss": 0.94555902, "num_input_tokens_seen": 11588560, "step": 546, "time_per_iteration": 2.673250913619995 }, { "auxiliary_loss_clip": 0.01260258, "auxiliary_loss_mlp": 0.01077296, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.04313052, "epoch": 0.03288741920937923, "flos": 20410620001920.0, "grad_norm": 2.1923718908590653, "language_loss": 0.81706661, "learning_rate": 3.999912632724925e-06, "loss": 0.84044212, "num_input_tokens_seen": 11605685, "step": 547, "time_per_iteration": 2.725198745727539 }, { "auxiliary_loss_clip": 0.0126227, "auxiliary_loss_mlp": 0.0107871, "balance_loss_clip": 1.06794477, "balance_loss_mlp": 1.04480648, "epoch": 0.032947542462047195, "flos": 20777986350720.0, "grad_norm": 1.730652582963277, "language_loss": 0.81227565, "learning_rate": 3.999908954529881e-06, "loss": 0.83568549, "num_input_tokens_seen": 11626290, "step": 548, "time_per_iteration": 2.714073419570923 }, { "auxiliary_loss_clip": 0.01264818, "auxiliary_loss_mlp": 0.01084154, "balance_loss_clip": 1.06963027, "balance_loss_mlp": 1.04870164, "epoch": 0.03300766571471517, "flos": 19901263190400.0, "grad_norm": 3.8540092911047603, "language_loss": 0.67460287, "learning_rate": 3.999905200498087e-06, "loss": 0.69809258, "num_input_tokens_seen": 11643950, "step": 549, "time_per_iteration": 2.6747171878814697 }, { "auxiliary_loss_clip": 0.0125805, "auxiliary_loss_mlp": 0.01076001, "balance_loss_clip": 1.06968856, "balance_loss_mlp": 1.04236054, "epoch": 0.03306778896738313, "flos": 17967760968960.0, "grad_norm": 1.933615596136007, "language_loss": 0.86379111, "learning_rate": 3.999901370629689e-06, "loss": 0.88713157, "num_input_tokens_seen": 11662560, "step": 550, "time_per_iteration": 2.553386926651001 }, { "auxiliary_loss_clip": 0.01264951, "auxiliary_loss_mlp": 0.01095377, "balance_loss_clip": 1.07279766, "balance_loss_mlp": 1.06142652, "epoch": 0.033127912220051105, "flos": 21653380707840.0, "grad_norm": 3.1958143211070977, "language_loss": 0.8127178, "learning_rate": 3.99989746492483e-06, "loss": 0.83632112, "num_input_tokens_seen": 11682265, "step": 551, "time_per_iteration": 2.6231682300567627 }, { "auxiliary_loss_clip": 0.01271579, "auxiliary_loss_mlp": 0.0108998, "balance_loss_clip": 1.07285261, "balance_loss_mlp": 1.05626702, "epoch": 0.03318803547271908, "flos": 30188376927360.0, "grad_norm": 2.9473143774727606, "language_loss": 0.86134821, "learning_rate": 3.999893483383658e-06, "loss": 0.88496381, "num_input_tokens_seen": 11699300, "step": 552, "time_per_iteration": 2.7002694606781006 }, { "auxiliary_loss_clip": 0.01267081, "auxiliary_loss_mlp": 0.01081671, "balance_loss_clip": 1.07191086, "balance_loss_mlp": 1.04650474, "epoch": 0.03324815872538704, "flos": 20376038183040.0, "grad_norm": 2.990469903058063, "language_loss": 0.9301765, "learning_rate": 3.999889426006326e-06, "loss": 0.95366406, "num_input_tokens_seen": 11716955, "step": 553, "time_per_iteration": 2.6629648208618164 }, { "auxiliary_loss_clip": 0.01262345, "auxiliary_loss_mlp": 0.01077186, "balance_loss_clip": 1.06925786, "balance_loss_mlp": 1.04149485, "epoch": 0.033308281978055014, "flos": 24494560634880.0, "grad_norm": 2.1924330874053166, "language_loss": 0.78881586, "learning_rate": 3.999885292792986e-06, "loss": 0.8122111, "num_input_tokens_seen": 11736130, "step": 554, "time_per_iteration": 2.668970823287964 }, { "auxiliary_loss_clip": 0.01258048, "auxiliary_loss_mlp": 0.0108557, "balance_loss_clip": 1.06745815, "balance_loss_mlp": 1.05045104, "epoch": 0.03336840523072298, "flos": 23400326666880.0, "grad_norm": 2.2144550089326938, "language_loss": 0.81971425, "learning_rate": 3.999881083743795e-06, "loss": 0.84315038, "num_input_tokens_seen": 11754425, "step": 555, "time_per_iteration": 2.610807418823242 }, { "auxiliary_loss_clip": 0.01264442, "auxiliary_loss_mlp": 0.0108339, "balance_loss_clip": 1.06914032, "balance_loss_mlp": 1.04805672, "epoch": 0.03342852848339095, "flos": 30550571717760.0, "grad_norm": 3.7821745066525487, "language_loss": 0.88661897, "learning_rate": 3.999876798858914e-06, "loss": 0.9100973, "num_input_tokens_seen": 11772845, "step": 556, "time_per_iteration": 2.6288907527923584 }, { "auxiliary_loss_clip": 0.01262553, "auxiliary_loss_mlp": 0.01084158, "balance_loss_clip": 1.06896496, "balance_loss_mlp": 1.04863358, "epoch": 0.03348865173605892, "flos": 22893304239360.0, "grad_norm": 1.974910128087634, "language_loss": 0.83708388, "learning_rate": 3.999872438138503e-06, "loss": 0.860551, "num_input_tokens_seen": 11792850, "step": 557, "time_per_iteration": 2.649401903152466 }, { "auxiliary_loss_clip": 0.01268198, "auxiliary_loss_mlp": 0.01069057, "balance_loss_clip": 1.07400489, "balance_loss_mlp": 1.03684711, "epoch": 0.03354877498872689, "flos": 17676022705920.0, "grad_norm": 3.176542206824637, "language_loss": 0.94202292, "learning_rate": 3.999868001582729e-06, "loss": 0.96539545, "num_input_tokens_seen": 11809670, "step": 558, "time_per_iteration": 2.550515651702881 }, { "auxiliary_loss_clip": 0.01258948, "auxiliary_loss_mlp": 0.01074291, "balance_loss_clip": 1.06591845, "balance_loss_mlp": 1.04036427, "epoch": 0.03360889824139486, "flos": 21652985658240.0, "grad_norm": 2.6619487077732384, "language_loss": 0.77115649, "learning_rate": 3.99986348919176e-06, "loss": 0.79448891, "num_input_tokens_seen": 11829665, "step": 559, "time_per_iteration": 2.729597330093384 }, { "auxiliary_loss_clip": 0.01261947, "auxiliary_loss_mlp": 0.01080822, "balance_loss_clip": 1.06835234, "balance_loss_mlp": 1.04882574, "epoch": 0.033669021494062826, "flos": 21795730306560.0, "grad_norm": 1.945022837871561, "language_loss": 0.87472397, "learning_rate": 3.9998589009657675e-06, "loss": 0.89815164, "num_input_tokens_seen": 11848190, "step": 560, "time_per_iteration": 2.6082279682159424 }, { "auxiliary_loss_clip": 0.01257198, "auxiliary_loss_mlp": 0.0107356, "balance_loss_clip": 1.06704283, "balance_loss_mlp": 1.04199314, "epoch": 0.0337291447467308, "flos": 21866222747520.0, "grad_norm": 2.4061219554407502, "language_loss": 0.81578708, "learning_rate": 3.999854236904925e-06, "loss": 0.83909464, "num_input_tokens_seen": 11864795, "step": 561, "time_per_iteration": 2.602193832397461 }, { "auxiliary_loss_clip": 0.01254722, "auxiliary_loss_mlp": 0.01076361, "balance_loss_clip": 1.06685936, "balance_loss_mlp": 1.04422247, "epoch": 0.03378926799939877, "flos": 24245951627520.0, "grad_norm": 1.683217504050761, "language_loss": 0.82320511, "learning_rate": 3.999849497009409e-06, "loss": 0.84651601, "num_input_tokens_seen": 11885275, "step": 562, "time_per_iteration": 2.675872564315796 }, { "auxiliary_loss_clip": 0.01262146, "auxiliary_loss_mlp": 0.01084212, "balance_loss_clip": 1.06894755, "balance_loss_mlp": 1.0508337, "epoch": 0.033849391252066735, "flos": 16507812677760.0, "grad_norm": 2.262509698135982, "language_loss": 0.84285647, "learning_rate": 3.999844681279401e-06, "loss": 0.86632001, "num_input_tokens_seen": 11903595, "step": 563, "time_per_iteration": 2.586944103240967 }, { "auxiliary_loss_clip": 0.01258135, "auxiliary_loss_mlp": 0.01083866, "balance_loss_clip": 1.0675565, "balance_loss_mlp": 1.05094075, "epoch": 0.03390951450473471, "flos": 15669298609920.0, "grad_norm": 2.115200912185494, "language_loss": 0.94438875, "learning_rate": 3.99983978971508e-06, "loss": 0.96780878, "num_input_tokens_seen": 11917815, "step": 564, "time_per_iteration": 2.5444440841674805 }, { "auxiliary_loss_clip": 0.01259509, "auxiliary_loss_mlp": 0.01073406, "balance_loss_clip": 1.06518865, "balance_loss_mlp": 1.03907406, "epoch": 0.03396963775740267, "flos": 22674787850880.0, "grad_norm": 2.6560391741906924, "language_loss": 0.94669235, "learning_rate": 3.999834822316635e-06, "loss": 0.97002149, "num_input_tokens_seen": 11936305, "step": 565, "time_per_iteration": 2.5614171028137207 }, { "auxiliary_loss_clip": 0.01150452, "auxiliary_loss_mlp": 0.01081579, "balance_loss_clip": 1.04835606, "balance_loss_mlp": 1.07499874, "epoch": 0.034029761010070644, "flos": 64392683063040.0, "grad_norm": 1.0610477485673708, "language_loss": 0.54800498, "learning_rate": 3.9998297790842535e-06, "loss": 0.57032537, "num_input_tokens_seen": 11998940, "step": 566, "time_per_iteration": 3.229137659072876 }, { "auxiliary_loss_clip": 0.0126129, "auxiliary_loss_mlp": 0.01073482, "balance_loss_clip": 1.06798041, "balance_loss_mlp": 1.03793335, "epoch": 0.034089884262738616, "flos": 25004204755200.0, "grad_norm": 3.1955261820278564, "language_loss": 0.76836932, "learning_rate": 3.999824660018126e-06, "loss": 0.79171705, "num_input_tokens_seen": 12018860, "step": 567, "time_per_iteration": 2.632741928100586 }, { "auxiliary_loss_clip": 0.01253596, "auxiliary_loss_mlp": 0.01083559, "balance_loss_clip": 1.06611466, "balance_loss_mlp": 1.05153918, "epoch": 0.03415000751540658, "flos": 28439096584320.0, "grad_norm": 2.115683621050472, "language_loss": 0.80834144, "learning_rate": 3.999819465118447e-06, "loss": 0.83171296, "num_input_tokens_seen": 12039675, "step": 568, "time_per_iteration": 2.7206337451934814 }, { "auxiliary_loss_clip": 0.01254921, "auxiliary_loss_mlp": 0.01082401, "balance_loss_clip": 1.06888509, "balance_loss_mlp": 1.04940367, "epoch": 0.034210130768074554, "flos": 21468727866240.0, "grad_norm": 1.891360159585894, "language_loss": 0.86560667, "learning_rate": 3.999814194385413e-06, "loss": 0.88897985, "num_input_tokens_seen": 12057680, "step": 569, "time_per_iteration": 2.7271673679351807 }, { "auxiliary_loss_clip": 0.01255135, "auxiliary_loss_mlp": 0.01082251, "balance_loss_clip": 1.06644094, "balance_loss_mlp": 1.04922962, "epoch": 0.03427025402074252, "flos": 18697501676160.0, "grad_norm": 1.6888504559193653, "language_loss": 0.95945716, "learning_rate": 3.9998088478192255e-06, "loss": 0.982831, "num_input_tokens_seen": 12076135, "step": 570, "time_per_iteration": 2.5918867588043213 }, { "auxiliary_loss_clip": 0.01255487, "auxiliary_loss_mlp": 0.0108066, "balance_loss_clip": 1.06228065, "balance_loss_mlp": 1.0435617, "epoch": 0.03433037727341049, "flos": 20849987162880.0, "grad_norm": 2.39132447086081, "language_loss": 0.7964232, "learning_rate": 3.9998034254200846e-06, "loss": 0.8197847, "num_input_tokens_seen": 12094785, "step": 571, "time_per_iteration": 2.590184450149536 }, { "auxiliary_loss_clip": 0.01256218, "auxiliary_loss_mlp": 0.01091484, "balance_loss_clip": 1.06740785, "balance_loss_mlp": 1.0565083, "epoch": 0.03439050052607846, "flos": 25410282986880.0, "grad_norm": 2.0738695690993, "language_loss": 0.80214274, "learning_rate": 3.999797927188199e-06, "loss": 0.82561976, "num_input_tokens_seen": 12114590, "step": 572, "time_per_iteration": 2.6862123012542725 }, { "auxiliary_loss_clip": 0.01263024, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.06995344, "balance_loss_mlp": 1.04098535, "epoch": 0.03445062377874643, "flos": 17640147997440.0, "grad_norm": 2.2324763929909284, "language_loss": 0.84548658, "learning_rate": 3.999792353123774e-06, "loss": 0.86885858, "num_input_tokens_seen": 12132390, "step": 573, "time_per_iteration": 2.78487229347229 }, { "auxiliary_loss_clip": 0.01256326, "auxiliary_loss_mlp": 0.01068789, "balance_loss_clip": 1.0644815, "balance_loss_mlp": 1.03781831, "epoch": 0.0345107470314144, "flos": 16764502245120.0, "grad_norm": 2.576428901855709, "language_loss": 0.76602584, "learning_rate": 3.999786703227023e-06, "loss": 0.78927696, "num_input_tokens_seen": 12149035, "step": 574, "time_per_iteration": 2.5697100162506104 }, { "auxiliary_loss_clip": 0.01255191, "auxiliary_loss_mlp": 0.0107671, "balance_loss_clip": 1.06581593, "balance_loss_mlp": 1.04502439, "epoch": 0.03457087028408237, "flos": 14684448533760.0, "grad_norm": 2.156110110571344, "language_loss": 0.83854586, "learning_rate": 3.9997809774981606e-06, "loss": 0.86186486, "num_input_tokens_seen": 12167530, "step": 575, "time_per_iteration": 2.596418619155884 }, { "auxiliary_loss_clip": 0.01249695, "auxiliary_loss_mlp": 0.01076053, "balance_loss_clip": 1.06684637, "balance_loss_mlp": 1.04334211, "epoch": 0.03463099353675034, "flos": 20011293527040.0, "grad_norm": 2.350120742735315, "language_loss": 0.83990753, "learning_rate": 3.9997751759374025e-06, "loss": 0.86316502, "num_input_tokens_seen": 12186340, "step": 576, "time_per_iteration": 5.821930646896362 }, { "auxiliary_loss_clip": 0.01257114, "auxiliary_loss_mlp": 0.01079503, "balance_loss_clip": 1.07237518, "balance_loss_mlp": 1.04817426, "epoch": 0.03469111678941831, "flos": 25301150490240.0, "grad_norm": 2.138457686407641, "language_loss": 0.85803086, "learning_rate": 3.99976929854497e-06, "loss": 0.88139701, "num_input_tokens_seen": 12204090, "step": 577, "time_per_iteration": 4.225277423858643 }, { "auxiliary_loss_clip": 0.01253845, "auxiliary_loss_mlp": 0.01080214, "balance_loss_clip": 1.06869018, "balance_loss_mlp": 1.04712176, "epoch": 0.034751240042086275, "flos": 23259413612160.0, "grad_norm": 4.535240156776142, "language_loss": 0.72226608, "learning_rate": 3.9997633453210845e-06, "loss": 0.74560666, "num_input_tokens_seen": 12224850, "step": 578, "time_per_iteration": 4.486239433288574 }, { "auxiliary_loss_clip": 0.01251871, "auxiliary_loss_mlp": 0.01080519, "balance_loss_clip": 1.06461096, "balance_loss_mlp": 1.04663968, "epoch": 0.03481136329475425, "flos": 23769237300480.0, "grad_norm": 1.9496379050984929, "language_loss": 0.77785492, "learning_rate": 3.999757316265973e-06, "loss": 0.80117887, "num_input_tokens_seen": 12244935, "step": 579, "time_per_iteration": 2.6706583499908447 }, { "auxiliary_loss_clip": 0.01251647, "auxiliary_loss_mlp": 0.01087497, "balance_loss_clip": 1.06656826, "balance_loss_mlp": 1.05435717, "epoch": 0.03487148654742222, "flos": 20157521794560.0, "grad_norm": 2.054973215074824, "language_loss": 0.86841297, "learning_rate": 3.999751211379863e-06, "loss": 0.8918044, "num_input_tokens_seen": 12262140, "step": 580, "time_per_iteration": 2.639146566390991 }, { "auxiliary_loss_clip": 0.01256528, "auxiliary_loss_mlp": 0.01069029, "balance_loss_clip": 1.06636667, "balance_loss_mlp": 1.0398469, "epoch": 0.034931609800090184, "flos": 15669585918720.0, "grad_norm": 2.205850105033732, "language_loss": 0.82570344, "learning_rate": 3.999745030662987e-06, "loss": 0.84895897, "num_input_tokens_seen": 12280930, "step": 581, "time_per_iteration": 2.6505649089813232 }, { "auxiliary_loss_clip": 0.01252942, "auxiliary_loss_mlp": 0.01072317, "balance_loss_clip": 1.06823969, "balance_loss_mlp": 1.04168022, "epoch": 0.034991733052758156, "flos": 16362374509440.0, "grad_norm": 2.1922492117358146, "language_loss": 0.7733047, "learning_rate": 3.99973877411558e-06, "loss": 0.79655731, "num_input_tokens_seen": 12299125, "step": 582, "time_per_iteration": 2.7323596477508545 }, { "auxiliary_loss_clip": 0.01250253, "auxiliary_loss_mlp": 0.01082356, "balance_loss_clip": 1.06794167, "balance_loss_mlp": 1.04861939, "epoch": 0.03505185630542612, "flos": 19387309438080.0, "grad_norm": 2.1536178016194327, "language_loss": 0.87679923, "learning_rate": 3.999732441737877e-06, "loss": 0.90012532, "num_input_tokens_seen": 12316905, "step": 583, "time_per_iteration": 2.6049294471740723 }, { "auxiliary_loss_clip": 0.01255473, "auxiliary_loss_mlp": 0.01092826, "balance_loss_clip": 1.06699181, "balance_loss_mlp": 1.06104505, "epoch": 0.03511197955809409, "flos": 21323828401920.0, "grad_norm": 3.7027110169592015, "language_loss": 0.81196821, "learning_rate": 3.99972603353012e-06, "loss": 0.83545119, "num_input_tokens_seen": 12335070, "step": 584, "time_per_iteration": 2.6011815071105957 }, { "auxiliary_loss_clip": 0.01251161, "auxiliary_loss_mlp": 0.01069463, "balance_loss_clip": 1.06472683, "balance_loss_mlp": 1.03832567, "epoch": 0.035172102810762065, "flos": 14136595320960.0, "grad_norm": 3.067717812226321, "language_loss": 0.92399198, "learning_rate": 3.999719549492551e-06, "loss": 0.94719815, "num_input_tokens_seen": 12350315, "step": 585, "time_per_iteration": 2.5592780113220215 }, { "auxiliary_loss_clip": 0.01251271, "auxiliary_loss_mlp": 0.01077423, "balance_loss_clip": 1.06562734, "balance_loss_mlp": 1.04552317, "epoch": 0.03523222606343003, "flos": 20296890564480.0, "grad_norm": 2.196660024103635, "language_loss": 0.87644351, "learning_rate": 3.9997129896254165e-06, "loss": 0.89973044, "num_input_tokens_seen": 12366030, "step": 586, "time_per_iteration": 2.5486221313476562 }, { "auxiliary_loss_clip": 0.01256485, "auxiliary_loss_mlp": 0.0108018, "balance_loss_clip": 1.06803596, "balance_loss_mlp": 1.04918551, "epoch": 0.035292349316098, "flos": 20375822701440.0, "grad_norm": 2.1222089199850878, "language_loss": 0.76079381, "learning_rate": 3.999706353928965e-06, "loss": 0.78416049, "num_input_tokens_seen": 12384895, "step": 587, "time_per_iteration": 2.5923714637756348 }, { "auxiliary_loss_clip": 0.01257125, "auxiliary_loss_mlp": 0.01068649, "balance_loss_clip": 1.06683922, "balance_loss_mlp": 1.03586686, "epoch": 0.03535247256876597, "flos": 21468871520640.0, "grad_norm": 2.212352192395094, "language_loss": 0.78601038, "learning_rate": 3.999699642403449e-06, "loss": 0.80926806, "num_input_tokens_seen": 12404980, "step": 588, "time_per_iteration": 2.579280138015747 }, { "auxiliary_loss_clip": 0.0125398, "auxiliary_loss_mlp": 0.0107827, "balance_loss_clip": 1.06582928, "balance_loss_mlp": 1.04367518, "epoch": 0.03541259582143394, "flos": 23623044946560.0, "grad_norm": 2.153589114745919, "language_loss": 0.94312829, "learning_rate": 3.99969285504912e-06, "loss": 0.96645081, "num_input_tokens_seen": 12423835, "step": 589, "time_per_iteration": 2.5964701175689697 }, { "auxiliary_loss_clip": 0.01256884, "auxiliary_loss_mlp": 0.01078108, "balance_loss_clip": 1.06697679, "balance_loss_mlp": 1.04666042, "epoch": 0.03547271907410191, "flos": 33726367768320.0, "grad_norm": 2.1162556876212695, "language_loss": 0.84116042, "learning_rate": 3.99968599186624e-06, "loss": 0.8645103, "num_input_tokens_seen": 12443135, "step": 590, "time_per_iteration": 2.746436357498169 }, { "auxiliary_loss_clip": 0.01249398, "auxiliary_loss_mlp": 0.01068452, "balance_loss_clip": 1.06658125, "balance_loss_mlp": 1.03893578, "epoch": 0.03553284232676988, "flos": 21142695093120.0, "grad_norm": 1.984522351394552, "language_loss": 0.8684091, "learning_rate": 3.999679052855065e-06, "loss": 0.89158762, "num_input_tokens_seen": 12462895, "step": 591, "time_per_iteration": 2.692303419113159 }, { "auxiliary_loss_clip": 0.01250641, "auxiliary_loss_mlp": 0.01082122, "balance_loss_clip": 1.06297326, "balance_loss_mlp": 1.04883862, "epoch": 0.03559296557943785, "flos": 20046593617920.0, "grad_norm": 2.0873185001780783, "language_loss": 0.83075488, "learning_rate": 3.999672038015861e-06, "loss": 0.85408247, "num_input_tokens_seen": 12481515, "step": 592, "time_per_iteration": 2.7822203636169434 }, { "auxiliary_loss_clip": 0.01146211, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.05013406, "balance_loss_mlp": 1.02676773, "epoch": 0.035653088832105814, "flos": 60334597244160.0, "grad_norm": 0.8804992705477848, "language_loss": 0.59754086, "learning_rate": 3.999664947348893e-06, "loss": 0.61934447, "num_input_tokens_seen": 12548220, "step": 593, "time_per_iteration": 3.274080276489258 }, { "auxiliary_loss_clip": 0.01249386, "auxiliary_loss_mlp": 0.0107742, "balance_loss_clip": 1.06737614, "balance_loss_mlp": 1.04473329, "epoch": 0.035713212084773786, "flos": 20113135562880.0, "grad_norm": 1.8086551314359374, "language_loss": 0.87077361, "learning_rate": 3.999657780854429e-06, "loss": 0.89404166, "num_input_tokens_seen": 12566105, "step": 594, "time_per_iteration": 2.682236671447754 }, { "auxiliary_loss_clip": 0.012487, "auxiliary_loss_mlp": 0.01082358, "balance_loss_clip": 1.06235993, "balance_loss_mlp": 1.05057716, "epoch": 0.03577333533744176, "flos": 26285785084800.0, "grad_norm": 5.516524335860627, "language_loss": 0.83920246, "learning_rate": 3.999650538532742e-06, "loss": 0.86251307, "num_input_tokens_seen": 12586680, "step": 595, "time_per_iteration": 2.773669481277466 }, { "auxiliary_loss_clip": 0.01248678, "auxiliary_loss_mlp": 0.01090544, "balance_loss_clip": 1.06579614, "balance_loss_mlp": 1.05850017, "epoch": 0.035833458590109724, "flos": 10889732211840.0, "grad_norm": 2.3448814752825204, "language_loss": 0.96041518, "learning_rate": 3.999643220384106e-06, "loss": 0.98380733, "num_input_tokens_seen": 12601605, "step": 596, "time_per_iteration": 2.6541590690612793 }, { "auxiliary_loss_clip": 0.01252662, "auxiliary_loss_mlp": 0.01081887, "balance_loss_clip": 1.0675534, "balance_loss_mlp": 1.05165553, "epoch": 0.035893581842777696, "flos": 22090198003200.0, "grad_norm": 2.4353221882859004, "language_loss": 0.82993281, "learning_rate": 3.999635826408799e-06, "loss": 0.85327828, "num_input_tokens_seen": 12620365, "step": 597, "time_per_iteration": 2.7023818492889404 }, { "auxiliary_loss_clip": 0.01247839, "auxiliary_loss_mlp": 0.01079829, "balance_loss_clip": 1.0668776, "balance_loss_mlp": 1.04766583, "epoch": 0.03595370509544566, "flos": 23038347358080.0, "grad_norm": 2.374757318483944, "language_loss": 0.81364304, "learning_rate": 3.999628356607101e-06, "loss": 0.83691972, "num_input_tokens_seen": 12641140, "step": 598, "time_per_iteration": 2.731229782104492 }, { "auxiliary_loss_clip": 0.01243692, "auxiliary_loss_mlp": 0.01077827, "balance_loss_clip": 1.0663228, "balance_loss_mlp": 1.04587913, "epoch": 0.03601382834811363, "flos": 20777734955520.0, "grad_norm": 1.817680341814684, "language_loss": 0.81172699, "learning_rate": 3.999620810979295e-06, "loss": 0.83494222, "num_input_tokens_seen": 12661080, "step": 599, "time_per_iteration": 2.710191011428833 }, { "auxiliary_loss_clip": 0.01250419, "auxiliary_loss_mlp": 0.01074577, "balance_loss_clip": 1.06356514, "balance_loss_mlp": 1.045228, "epoch": 0.036073951600781605, "flos": 23951627585280.0, "grad_norm": 2.3963649020429627, "language_loss": 0.8651731, "learning_rate": 3.999613189525668e-06, "loss": 0.88842309, "num_input_tokens_seen": 12678270, "step": 600, "time_per_iteration": 2.682262420654297 }, { "auxiliary_loss_clip": 0.01241882, "auxiliary_loss_mlp": 0.01084809, "balance_loss_clip": 1.05918193, "balance_loss_mlp": 1.05297971, "epoch": 0.03613407485344957, "flos": 18912283050240.0, "grad_norm": 2.0308947613075423, "language_loss": 0.82355881, "learning_rate": 3.999605492246508e-06, "loss": 0.84682572, "num_input_tokens_seen": 12697295, "step": 601, "time_per_iteration": 2.6570894718170166 }, { "auxiliary_loss_clip": 0.01240868, "auxiliary_loss_mlp": 0.010708, "balance_loss_clip": 1.06129336, "balance_loss_mlp": 1.03920949, "epoch": 0.03619419810611754, "flos": 23038526926080.0, "grad_norm": 2.3080142694085555, "language_loss": 0.7502507, "learning_rate": 3.999597719142107e-06, "loss": 0.77336735, "num_input_tokens_seen": 12716165, "step": 602, "time_per_iteration": 2.6434237957000732 }, { "auxiliary_loss_clip": 0.01239543, "auxiliary_loss_mlp": 0.01066859, "balance_loss_clip": 1.0604254, "balance_loss_mlp": 1.03562629, "epoch": 0.03625432135878551, "flos": 29457774293760.0, "grad_norm": 1.9681237382646195, "language_loss": 0.79599822, "learning_rate": 3.999589870212761e-06, "loss": 0.81906223, "num_input_tokens_seen": 12735475, "step": 603, "time_per_iteration": 2.7201666831970215 }, { "auxiliary_loss_clip": 0.01244834, "auxiliary_loss_mlp": 0.01071177, "balance_loss_clip": 1.06545615, "balance_loss_mlp": 1.04130292, "epoch": 0.03631444461145348, "flos": 23508525409920.0, "grad_norm": 1.8363641170913294, "language_loss": 0.86668456, "learning_rate": 3.9995819454587664e-06, "loss": 0.88984472, "num_input_tokens_seen": 12754540, "step": 604, "time_per_iteration": 2.60249924659729 }, { "auxiliary_loss_clip": 0.01248906, "auxiliary_loss_mlp": 0.01072985, "balance_loss_clip": 1.0674324, "balance_loss_mlp": 1.04010737, "epoch": 0.03637456786412145, "flos": 16618130323200.0, "grad_norm": 2.510130211393037, "language_loss": 0.80746496, "learning_rate": 3.999573944880424e-06, "loss": 0.83068383, "num_input_tokens_seen": 12773050, "step": 605, "time_per_iteration": 2.766684055328369 }, { "auxiliary_loss_clip": 0.01244274, "auxiliary_loss_mlp": 0.0107873, "balance_loss_clip": 1.0630821, "balance_loss_mlp": 1.04846251, "epoch": 0.03643469111678942, "flos": 15851832549120.0, "grad_norm": 2.2216143800596835, "language_loss": 0.85942292, "learning_rate": 3.9995658684780375e-06, "loss": 0.882653, "num_input_tokens_seen": 12791240, "step": 606, "time_per_iteration": 2.6133925914764404 }, { "auxiliary_loss_clip": 0.01247732, "auxiliary_loss_mlp": 0.01077404, "balance_loss_clip": 1.06413972, "balance_loss_mlp": 1.04588532, "epoch": 0.03649481436945739, "flos": 23620387340160.0, "grad_norm": 2.0684825764003394, "language_loss": 0.82179952, "learning_rate": 3.999557716251912e-06, "loss": 0.84505081, "num_input_tokens_seen": 12812245, "step": 607, "time_per_iteration": 2.6805856227874756 }, { "auxiliary_loss_clip": 0.01245394, "auxiliary_loss_mlp": 0.01073743, "balance_loss_clip": 1.06585169, "balance_loss_mlp": 1.04317796, "epoch": 0.036554937622125354, "flos": 21755581879680.0, "grad_norm": 2.3717179235904533, "language_loss": 0.83567071, "learning_rate": 3.999549488202358e-06, "loss": 0.8588621, "num_input_tokens_seen": 12831085, "step": 608, "time_per_iteration": 2.6593453884124756 }, { "auxiliary_loss_clip": 0.01251062, "auxiliary_loss_mlp": 0.01073705, "balance_loss_clip": 1.06682992, "balance_loss_mlp": 1.04006422, "epoch": 0.036615060874793326, "flos": 17819772935040.0, "grad_norm": 2.4795108668903305, "language_loss": 0.8201133, "learning_rate": 3.999541184329688e-06, "loss": 0.84336102, "num_input_tokens_seen": 12849115, "step": 609, "time_per_iteration": 2.6299383640289307 }, { "auxiliary_loss_clip": 0.01255655, "auxiliary_loss_mlp": 0.01091893, "balance_loss_clip": 1.07322037, "balance_loss_mlp": 1.06158984, "epoch": 0.0366751841274613, "flos": 26753808320640.0, "grad_norm": 1.992640540297191, "language_loss": 0.79448462, "learning_rate": 3.999532804634215e-06, "loss": 0.81796008, "num_input_tokens_seen": 12868005, "step": 610, "time_per_iteration": 2.65120530128479 }, { "auxiliary_loss_clip": 0.01254423, "auxiliary_loss_mlp": 0.01088228, "balance_loss_clip": 1.06914616, "balance_loss_mlp": 1.05656588, "epoch": 0.03673530738012926, "flos": 22196960202240.0, "grad_norm": 1.9328503999291824, "language_loss": 0.87282723, "learning_rate": 3.9995243491162575e-06, "loss": 0.89625371, "num_input_tokens_seen": 12886890, "step": 611, "time_per_iteration": 2.7398059368133545 }, { "auxiliary_loss_clip": 0.01248885, "auxiliary_loss_mlp": 0.01097673, "balance_loss_clip": 1.06917143, "balance_loss_mlp": 1.06651139, "epoch": 0.036795430632797235, "flos": 24681655601280.0, "grad_norm": 3.7435200854847266, "language_loss": 0.72589231, "learning_rate": 3.999515817776136e-06, "loss": 0.74935788, "num_input_tokens_seen": 12906130, "step": 612, "time_per_iteration": 2.700406551361084 }, { "auxiliary_loss_clip": 0.01249112, "auxiliary_loss_mlp": 0.01076924, "balance_loss_clip": 1.06581926, "balance_loss_mlp": 1.04480934, "epoch": 0.0368555538854652, "flos": 17748921358080.0, "grad_norm": 3.0863603820013434, "language_loss": 0.79110008, "learning_rate": 3.999507210614175e-06, "loss": 0.81436038, "num_input_tokens_seen": 12925260, "step": 613, "time_per_iteration": 2.630472183227539 }, { "auxiliary_loss_clip": 0.01242581, "auxiliary_loss_mlp": 0.01090278, "balance_loss_clip": 1.06378841, "balance_loss_mlp": 1.05961776, "epoch": 0.03691567713813317, "flos": 20594554571520.0, "grad_norm": 2.2015687298668336, "language_loss": 0.93885028, "learning_rate": 3.9994985276307e-06, "loss": 0.96217889, "num_input_tokens_seen": 12944590, "step": 614, "time_per_iteration": 2.6977972984313965 }, { "auxiliary_loss_clip": 0.01254503, "auxiliary_loss_mlp": 0.01081137, "balance_loss_clip": 1.07009673, "balance_loss_mlp": 1.04732919, "epoch": 0.036975800390801145, "flos": 33650380546560.0, "grad_norm": 3.0661216019279576, "language_loss": 0.72932875, "learning_rate": 3.999489768826041e-06, "loss": 0.75268513, "num_input_tokens_seen": 12964785, "step": 615, "time_per_iteration": 2.697291612625122 }, { "auxiliary_loss_clip": 0.01250213, "auxiliary_loss_mlp": 0.010716, "balance_loss_clip": 1.06649876, "balance_loss_mlp": 1.04015231, "epoch": 0.03703592364346911, "flos": 28293694329600.0, "grad_norm": 2.9941392641088695, "language_loss": 0.81630868, "learning_rate": 3.999480934200528e-06, "loss": 0.83952683, "num_input_tokens_seen": 12986705, "step": 616, "time_per_iteration": 4.1762495040893555 }, { "auxiliary_loss_clip": 0.0124999, "auxiliary_loss_mlp": 0.01076541, "balance_loss_clip": 1.06807041, "balance_loss_mlp": 1.0467627, "epoch": 0.03709604689613708, "flos": 31504215853440.0, "grad_norm": 2.320593216419041, "language_loss": 0.68178958, "learning_rate": 3.999472023754499e-06, "loss": 0.70505488, "num_input_tokens_seen": 13010560, "step": 617, "time_per_iteration": 4.224538564682007 }, { "auxiliary_loss_clip": 0.01254259, "auxiliary_loss_mlp": 0.010771, "balance_loss_clip": 1.07098567, "balance_loss_mlp": 1.04415071, "epoch": 0.03715617014880505, "flos": 19609381272960.0, "grad_norm": 2.245411088847763, "language_loss": 0.80595517, "learning_rate": 3.99946303748829e-06, "loss": 0.82926875, "num_input_tokens_seen": 13028935, "step": 618, "time_per_iteration": 4.200341463088989 }, { "auxiliary_loss_clip": 0.01257669, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06808555, "balance_loss_mlp": 1.04605901, "epoch": 0.03721629340147302, "flos": 15924192497280.0, "grad_norm": 10.155035046705617, "language_loss": 0.91591841, "learning_rate": 3.999453975402242e-06, "loss": 0.93928802, "num_input_tokens_seen": 13046000, "step": 619, "time_per_iteration": 2.5787301063537598 }, { "auxiliary_loss_clip": 0.01251145, "auxiliary_loss_mlp": 0.01083548, "balance_loss_clip": 1.06999123, "balance_loss_mlp": 1.05181432, "epoch": 0.03727641665414099, "flos": 21104090951040.0, "grad_norm": 2.0803022158745406, "language_loss": 0.94071603, "learning_rate": 3.9994448374967e-06, "loss": 0.96406299, "num_input_tokens_seen": 13062995, "step": 620, "time_per_iteration": 2.5987205505371094 }, { "auxiliary_loss_clip": 0.01249568, "auxiliary_loss_mlp": 0.0108317, "balance_loss_clip": 1.06624317, "balance_loss_mlp": 1.0502919, "epoch": 0.037336539906808956, "flos": 24131683486080.0, "grad_norm": 1.7431896174296577, "language_loss": 0.77319217, "learning_rate": 3.999435623772008e-06, "loss": 0.79651952, "num_input_tokens_seen": 13084120, "step": 621, "time_per_iteration": 2.68758225440979 }, { "auxiliary_loss_clip": 0.01247252, "auxiliary_loss_mlp": 0.01071013, "balance_loss_clip": 1.06894088, "balance_loss_mlp": 1.03792048, "epoch": 0.03739666315947693, "flos": 22346384780160.0, "grad_norm": 2.3852872810563364, "language_loss": 0.86546707, "learning_rate": 3.999426334228518e-06, "loss": 0.88864976, "num_input_tokens_seen": 13100035, "step": 622, "time_per_iteration": 2.607121467590332 }, { "auxiliary_loss_clip": 0.012499, "auxiliary_loss_mlp": 0.01072461, "balance_loss_clip": 1.06715882, "balance_loss_mlp": 1.04048872, "epoch": 0.0374567864121449, "flos": 20449511452800.0, "grad_norm": 2.2621736327299766, "language_loss": 0.90008956, "learning_rate": 3.999416968866581e-06, "loss": 0.92331314, "num_input_tokens_seen": 13118070, "step": 623, "time_per_iteration": 2.6513512134552 }, { "auxiliary_loss_clip": 0.01251762, "auxiliary_loss_mlp": 0.01090534, "balance_loss_clip": 1.07006013, "balance_loss_mlp": 1.05844235, "epoch": 0.037516909664812866, "flos": 19208043636480.0, "grad_norm": 2.760597076727266, "language_loss": 0.84095174, "learning_rate": 3.999407527686551e-06, "loss": 0.8643747, "num_input_tokens_seen": 13136355, "step": 624, "time_per_iteration": 2.66623592376709 }, { "auxiliary_loss_clip": 0.01252431, "auxiliary_loss_mlp": 0.01076353, "balance_loss_clip": 1.06697702, "balance_loss_mlp": 1.04423809, "epoch": 0.03757703291748084, "flos": 35005218664320.0, "grad_norm": 4.259276014089895, "language_loss": 0.66778994, "learning_rate": 3.999398010688788e-06, "loss": 0.69107783, "num_input_tokens_seen": 13155435, "step": 625, "time_per_iteration": 2.7288877964019775 }, { "auxiliary_loss_clip": 0.01244959, "auxiliary_loss_mlp": 0.01076274, "balance_loss_clip": 1.06605244, "balance_loss_mlp": 1.042943, "epoch": 0.0376371561701488, "flos": 25483899911040.0, "grad_norm": 3.375450269409945, "language_loss": 0.77496696, "learning_rate": 3.999388417873652e-06, "loss": 0.79817927, "num_input_tokens_seen": 13174295, "step": 626, "time_per_iteration": 2.648942470550537 }, { "auxiliary_loss_clip": 0.01249107, "auxiliary_loss_mlp": 0.0108376, "balance_loss_clip": 1.06770003, "balance_loss_mlp": 1.05200303, "epoch": 0.037697279422816775, "flos": 18185630912640.0, "grad_norm": 2.0480468386724766, "language_loss": 0.81463408, "learning_rate": 3.999378749241506e-06, "loss": 0.83796275, "num_input_tokens_seen": 13192500, "step": 627, "time_per_iteration": 2.6209845542907715 }, { "auxiliary_loss_clip": 0.01254363, "auxiliary_loss_mlp": 0.01084942, "balance_loss_clip": 1.07041132, "balance_loss_mlp": 1.05215955, "epoch": 0.03775740267548475, "flos": 24644272521600.0, "grad_norm": 1.6934072791943036, "language_loss": 0.88809037, "learning_rate": 3.999369004792719e-06, "loss": 0.91148341, "num_input_tokens_seen": 13213470, "step": 628, "time_per_iteration": 2.7221415042877197 }, { "auxiliary_loss_clip": 0.01247303, "auxiliary_loss_mlp": 0.01080197, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.04765344, "epoch": 0.03781752592815271, "flos": 21288205088640.0, "grad_norm": 2.536151380104699, "language_loss": 0.79840028, "learning_rate": 3.999359184527658e-06, "loss": 0.82167524, "num_input_tokens_seen": 13232365, "step": 629, "time_per_iteration": 2.6535024642944336 }, { "auxiliary_loss_clip": 0.01249218, "auxiliary_loss_mlp": 0.0106958, "balance_loss_clip": 1.06675959, "balance_loss_mlp": 1.03885961, "epoch": 0.037877649180820684, "flos": 22089623385600.0, "grad_norm": 1.6861994278356789, "language_loss": 0.76824844, "learning_rate": 3.999349288446696e-06, "loss": 0.79143643, "num_input_tokens_seen": 13251920, "step": 630, "time_per_iteration": 2.6175966262817383 }, { "auxiliary_loss_clip": 0.01254291, "auxiliary_loss_mlp": 0.01075963, "balance_loss_clip": 1.06833327, "balance_loss_mlp": 1.04504025, "epoch": 0.03793777243348865, "flos": 14501339976960.0, "grad_norm": 3.12435515576561, "language_loss": 0.91593724, "learning_rate": 3.99933931655021e-06, "loss": 0.93923974, "num_input_tokens_seen": 13267440, "step": 631, "time_per_iteration": 2.565293788909912 }, { "auxiliary_loss_clip": 0.01243525, "auxiliary_loss_mlp": 0.01087901, "balance_loss_clip": 1.06386209, "balance_loss_mlp": 1.05356884, "epoch": 0.03799789568615662, "flos": 21908418249600.0, "grad_norm": 1.6822536287963328, "language_loss": 0.92157543, "learning_rate": 3.999329268838575e-06, "loss": 0.94488978, "num_input_tokens_seen": 13287850, "step": 632, "time_per_iteration": 2.6235203742980957 }, { "auxiliary_loss_clip": 0.01248362, "auxiliary_loss_mlp": 0.01067296, "balance_loss_clip": 1.06696796, "balance_loss_mlp": 1.03613472, "epoch": 0.03805801893882459, "flos": 24827021942400.0, "grad_norm": 2.1097171792430456, "language_loss": 0.83139223, "learning_rate": 3.999319145312175e-06, "loss": 0.85454881, "num_input_tokens_seen": 13307760, "step": 633, "time_per_iteration": 2.6461985111236572 }, { "auxiliary_loss_clip": 0.01247735, "auxiliary_loss_mlp": 0.01079895, "balance_loss_clip": 1.06473529, "balance_loss_mlp": 1.04811358, "epoch": 0.03811814219149256, "flos": 30482952364800.0, "grad_norm": 1.599115294194595, "language_loss": 0.69883299, "learning_rate": 3.999308945971392e-06, "loss": 0.72210932, "num_input_tokens_seen": 13331230, "step": 634, "time_per_iteration": 2.709033727645874 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01009504, "balance_loss_clip": 1.04124916, "balance_loss_mlp": 1.00249422, "epoch": 0.03817826544416053, "flos": 66992577379200.0, "grad_norm": 0.893126545279708, "language_loss": 0.61645919, "learning_rate": 3.999298670816614e-06, "loss": 0.63788629, "num_input_tokens_seen": 13394760, "step": 635, "time_per_iteration": 3.2099475860595703 }, { "auxiliary_loss_clip": 0.01244276, "auxiliary_loss_mlp": 0.01072984, "balance_loss_clip": 1.06475401, "balance_loss_mlp": 1.04129851, "epoch": 0.038238388696828496, "flos": 20485350247680.0, "grad_norm": 2.0563589539657205, "language_loss": 0.83629507, "learning_rate": 3.9992883198482294e-06, "loss": 0.85946769, "num_input_tokens_seen": 13412775, "step": 636, "time_per_iteration": 2.6278960704803467 }, { "auxiliary_loss_clip": 0.01248078, "auxiliary_loss_mlp": 0.01096471, "balance_loss_clip": 1.06714165, "balance_loss_mlp": 1.06530952, "epoch": 0.03829851194949647, "flos": 17965893461760.0, "grad_norm": 2.346379148367956, "language_loss": 0.79578567, "learning_rate": 3.999277893066632e-06, "loss": 0.81923115, "num_input_tokens_seen": 13427835, "step": 637, "time_per_iteration": 2.646414279937744 }, { "auxiliary_loss_clip": 0.01247939, "auxiliary_loss_mlp": 0.01088528, "balance_loss_clip": 1.06356907, "balance_loss_mlp": 1.0562222, "epoch": 0.03835863520216444, "flos": 22456522857600.0, "grad_norm": 1.9563283234999833, "language_loss": 0.83989692, "learning_rate": 3.999267390472215e-06, "loss": 0.86326158, "num_input_tokens_seen": 13447295, "step": 638, "time_per_iteration": 2.6416285037994385 }, { "auxiliary_loss_clip": 0.01253172, "auxiliary_loss_mlp": 0.01074704, "balance_loss_clip": 1.06563985, "balance_loss_mlp": 1.04163575, "epoch": 0.038418758454832405, "flos": 22164425458560.0, "grad_norm": 2.5596504471077224, "language_loss": 0.70109725, "learning_rate": 3.999256812065381e-06, "loss": 0.72437602, "num_input_tokens_seen": 13468455, "step": 639, "time_per_iteration": 2.610682487487793 }, { "auxiliary_loss_clip": 0.01248829, "auxiliary_loss_mlp": 0.01081808, "balance_loss_clip": 1.06618333, "balance_loss_mlp": 1.04790449, "epoch": 0.03847888170750038, "flos": 22747435107840.0, "grad_norm": 2.5791624605537082, "language_loss": 0.85322344, "learning_rate": 3.999246157846526e-06, "loss": 0.87652987, "num_input_tokens_seen": 13489085, "step": 640, "time_per_iteration": 2.700456380844116 }, { "auxiliary_loss_clip": 0.01252579, "auxiliary_loss_mlp": 0.01083722, "balance_loss_clip": 1.06751871, "balance_loss_mlp": 1.04934239, "epoch": 0.03853900496016834, "flos": 22711201263360.0, "grad_norm": 2.331268680461456, "language_loss": 0.82141805, "learning_rate": 3.9992354278160574e-06, "loss": 0.84478104, "num_input_tokens_seen": 13509120, "step": 641, "time_per_iteration": 2.6572046279907227 }, { "auxiliary_loss_clip": 0.0112759, "auxiliary_loss_mlp": 0.01008008, "balance_loss_clip": 1.03825259, "balance_loss_mlp": 1.00095105, "epoch": 0.038599128212836314, "flos": 70399136355840.0, "grad_norm": 0.9037629700551453, "language_loss": 0.65444964, "learning_rate": 3.999224621974381e-06, "loss": 0.67580563, "num_input_tokens_seen": 13562005, "step": 642, "time_per_iteration": 3.199925422668457 }, { "auxiliary_loss_clip": 0.01246698, "auxiliary_loss_mlp": 0.01064563, "balance_loss_clip": 1.0651319, "balance_loss_mlp": 1.03453398, "epoch": 0.03865925146550429, "flos": 23295144666240.0, "grad_norm": 1.9113268312481755, "language_loss": 0.79272145, "learning_rate": 3.999213740321906e-06, "loss": 0.81583405, "num_input_tokens_seen": 13582185, "step": 643, "time_per_iteration": 2.641437292098999 }, { "auxiliary_loss_clip": 0.01244786, "auxiliary_loss_mlp": 0.01076057, "balance_loss_clip": 1.06219232, "balance_loss_mlp": 1.04599261, "epoch": 0.03871937471817225, "flos": 21430446946560.0, "grad_norm": 2.2104774200729262, "language_loss": 0.8294487, "learning_rate": 3.999202782859046e-06, "loss": 0.85265714, "num_input_tokens_seen": 13599555, "step": 644, "time_per_iteration": 2.600558280944824 }, { "auxiliary_loss_clip": 0.01247273, "auxiliary_loss_mlp": 0.01074554, "balance_loss_clip": 1.06383467, "balance_loss_mlp": 1.04193854, "epoch": 0.038779497970840224, "flos": 34277309550720.0, "grad_norm": 1.994902925690418, "language_loss": 0.82286513, "learning_rate": 3.9991917495862165e-06, "loss": 0.8460834, "num_input_tokens_seen": 13621160, "step": 645, "time_per_iteration": 2.6751983165740967 }, { "auxiliary_loss_clip": 0.01248631, "auxiliary_loss_mlp": 0.01070807, "balance_loss_clip": 1.06525111, "balance_loss_mlp": 1.03890657, "epoch": 0.03883962122350819, "flos": 22748189293440.0, "grad_norm": 2.290384247239265, "language_loss": 0.81889713, "learning_rate": 3.9991806405038345e-06, "loss": 0.84209144, "num_input_tokens_seen": 13641915, "step": 646, "time_per_iteration": 2.6987667083740234 }, { "auxiliary_loss_clip": 0.01250204, "auxiliary_loss_mlp": 0.01078836, "balance_loss_clip": 1.06982899, "balance_loss_mlp": 1.04791331, "epoch": 0.03889974447617616, "flos": 21945837242880.0, "grad_norm": 1.9171219640425325, "language_loss": 0.82015383, "learning_rate": 3.999169455612323e-06, "loss": 0.84344423, "num_input_tokens_seen": 13661410, "step": 647, "time_per_iteration": 2.590102195739746 }, { "auxiliary_loss_clip": 0.0124696, "auxiliary_loss_mlp": 0.01072111, "balance_loss_clip": 1.06628954, "balance_loss_mlp": 1.04216528, "epoch": 0.03895986772884413, "flos": 31504826384640.0, "grad_norm": 1.9398424653049293, "language_loss": 0.84477997, "learning_rate": 3.999158194912106e-06, "loss": 0.86797059, "num_input_tokens_seen": 13681705, "step": 648, "time_per_iteration": 2.7516121864318848 }, { "auxiliary_loss_clip": 0.01244808, "auxiliary_loss_mlp": 0.0107293, "balance_loss_clip": 1.06524062, "balance_loss_mlp": 1.04210222, "epoch": 0.0390199909815121, "flos": 19901011795200.0, "grad_norm": 2.3870859420748136, "language_loss": 0.84254295, "learning_rate": 3.9991468584036086e-06, "loss": 0.86572027, "num_input_tokens_seen": 13700400, "step": 649, "time_per_iteration": 2.6116180419921875 }, { "auxiliary_loss_clip": 0.01246653, "auxiliary_loss_mlp": 0.01073574, "balance_loss_clip": 1.06560743, "balance_loss_mlp": 1.0416739, "epoch": 0.03908011423418007, "flos": 21612478095360.0, "grad_norm": 2.00775905451926, "language_loss": 0.79783499, "learning_rate": 3.999135446087263e-06, "loss": 0.82103723, "num_input_tokens_seen": 13720145, "step": 650, "time_per_iteration": 2.574939727783203 }, { "auxiliary_loss_clip": 0.01242721, "auxiliary_loss_mlp": 0.01077536, "balance_loss_clip": 1.06209707, "balance_loss_mlp": 1.04534984, "epoch": 0.039140237486848035, "flos": 18661411486080.0, "grad_norm": 2.334811800093409, "language_loss": 0.78698987, "learning_rate": 3.9991239579635e-06, "loss": 0.81019247, "num_input_tokens_seen": 13737500, "step": 651, "time_per_iteration": 2.5930917263031006 }, { "auxiliary_loss_clip": 0.0124425, "auxiliary_loss_mlp": 0.010838, "balance_loss_clip": 1.06317663, "balance_loss_mlp": 1.05087411, "epoch": 0.03920036073951601, "flos": 18661124177280.0, "grad_norm": 3.361008988618244, "language_loss": 0.87392938, "learning_rate": 3.999112394032757e-06, "loss": 0.89720988, "num_input_tokens_seen": 13754750, "step": 652, "time_per_iteration": 2.6072869300842285 }, { "auxiliary_loss_clip": 0.01239638, "auxiliary_loss_mlp": 0.01073938, "balance_loss_clip": 1.06362963, "balance_loss_mlp": 1.0434916, "epoch": 0.03926048399218398, "flos": 31354468053120.0, "grad_norm": 2.6218665998754904, "language_loss": 0.79297256, "learning_rate": 3.999100754295471e-06, "loss": 0.81610829, "num_input_tokens_seen": 13771990, "step": 653, "time_per_iteration": 2.626145362854004 }, { "auxiliary_loss_clip": 0.01250652, "auxiliary_loss_mlp": 0.01075546, "balance_loss_clip": 1.06496143, "balance_loss_mlp": 1.04374111, "epoch": 0.039320607244851945, "flos": 29603499770880.0, "grad_norm": 2.0720296605490094, "language_loss": 0.85909009, "learning_rate": 3.999089038752085e-06, "loss": 0.88235211, "num_input_tokens_seen": 13792750, "step": 654, "time_per_iteration": 2.6775124073028564 }, { "auxiliary_loss_clip": 0.01126661, "auxiliary_loss_mlp": 0.01016641, "balance_loss_clip": 1.03977203, "balance_loss_mlp": 1.01001298, "epoch": 0.03938073049751992, "flos": 66534609951360.0, "grad_norm": 0.7366259780501333, "language_loss": 0.4997997, "learning_rate": 3.999077247403041e-06, "loss": 0.52123272, "num_input_tokens_seen": 13858570, "step": 655, "time_per_iteration": 3.3006510734558105 }, { "auxiliary_loss_clip": 0.01241143, "auxiliary_loss_mlp": 0.01076374, "balance_loss_clip": 1.0658412, "balance_loss_mlp": 1.04680991, "epoch": 0.03944085375018788, "flos": 23367827836800.0, "grad_norm": 4.17474796245144, "language_loss": 0.80903178, "learning_rate": 3.9990653802487886e-06, "loss": 0.83220696, "num_input_tokens_seen": 13876335, "step": 656, "time_per_iteration": 4.228931427001953 }, { "auxiliary_loss_clip": 0.01251519, "auxiliary_loss_mlp": 0.01093573, "balance_loss_clip": 1.06740427, "balance_loss_mlp": 1.05802524, "epoch": 0.039500977002855854, "flos": 18548292579840.0, "grad_norm": 2.068956760077258, "language_loss": 0.76289558, "learning_rate": 3.999053437289776e-06, "loss": 0.7863465, "num_input_tokens_seen": 13892640, "step": 657, "time_per_iteration": 4.218473434448242 }, { "auxiliary_loss_clip": 0.0124824, "auxiliary_loss_mlp": 0.01076812, "balance_loss_clip": 1.06641233, "balance_loss_mlp": 1.04522133, "epoch": 0.039561100255523826, "flos": 25338174433920.0, "grad_norm": 2.07475431213476, "language_loss": 0.8179062, "learning_rate": 3.999041418526457e-06, "loss": 0.84115672, "num_input_tokens_seen": 13910085, "step": 658, "time_per_iteration": 2.671675682067871 }, { "auxiliary_loss_clip": 0.01242678, "auxiliary_loss_mlp": 0.01077963, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.0454669, "epoch": 0.03962122350819179, "flos": 18219889509120.0, "grad_norm": 2.2444983110753625, "language_loss": 0.90790772, "learning_rate": 3.999029323959287e-06, "loss": 0.93111408, "num_input_tokens_seen": 13928800, "step": 659, "time_per_iteration": 4.2601988315582275 }, { "auxiliary_loss_clip": 0.01247633, "auxiliary_loss_mlp": 0.01073069, "balance_loss_clip": 1.06654835, "balance_loss_mlp": 1.04215825, "epoch": 0.03968134676085976, "flos": 20522230536960.0, "grad_norm": 2.2083626038373656, "language_loss": 0.79760063, "learning_rate": 3.999017153588724e-06, "loss": 0.82080764, "num_input_tokens_seen": 13948325, "step": 660, "time_per_iteration": 2.62716007232666 }, { "auxiliary_loss_clip": 0.01246027, "auxiliary_loss_mlp": 0.01077579, "balance_loss_clip": 1.0675652, "balance_loss_mlp": 1.0456785, "epoch": 0.03974147001352773, "flos": 22422587483520.0, "grad_norm": 1.6747851381362888, "language_loss": 0.81757367, "learning_rate": 3.999004907415231e-06, "loss": 0.8408097, "num_input_tokens_seen": 13969090, "step": 661, "time_per_iteration": 2.645423412322998 }, { "auxiliary_loss_clip": 0.01119895, "auxiliary_loss_mlp": 0.01007167, "balance_loss_clip": 1.03320217, "balance_loss_mlp": 1.00077713, "epoch": 0.0398015932661957, "flos": 71128769322240.0, "grad_norm": 0.9117564509831767, "language_loss": 0.69349593, "learning_rate": 3.998992585439272e-06, "loss": 0.71476656, "num_input_tokens_seen": 14037555, "step": 662, "time_per_iteration": 3.3032331466674805 }, { "auxiliary_loss_clip": 0.01249217, "auxiliary_loss_mlp": 0.01074722, "balance_loss_clip": 1.06995225, "balance_loss_mlp": 1.04322648, "epoch": 0.03986171651886367, "flos": 16800951571200.0, "grad_norm": 2.160679749799672, "language_loss": 0.82765651, "learning_rate": 3.998980187661314e-06, "loss": 0.85089582, "num_input_tokens_seen": 14055765, "step": 663, "time_per_iteration": 2.6217782497406006 }, { "auxiliary_loss_clip": 0.01252759, "auxiliary_loss_mlp": 0.01063705, "balance_loss_clip": 1.06966817, "balance_loss_mlp": 1.03254378, "epoch": 0.03992183977153164, "flos": 24535068197760.0, "grad_norm": 2.19374813563436, "language_loss": 0.87302262, "learning_rate": 3.998967714081826e-06, "loss": 0.89618725, "num_input_tokens_seen": 14074195, "step": 664, "time_per_iteration": 2.6729183197021484 }, { "auxiliary_loss_clip": 0.01241647, "auxiliary_loss_mlp": 0.0106515, "balance_loss_clip": 1.06656313, "balance_loss_mlp": 1.03346384, "epoch": 0.03998196302419961, "flos": 15595897167360.0, "grad_norm": 2.036983550581997, "language_loss": 0.84821391, "learning_rate": 3.998955164701281e-06, "loss": 0.87128186, "num_input_tokens_seen": 14090215, "step": 665, "time_per_iteration": 2.593832015991211 }, { "auxiliary_loss_clip": 0.012521, "auxiliary_loss_mlp": 0.01085682, "balance_loss_clip": 1.06867695, "balance_loss_mlp": 1.05223155, "epoch": 0.04004208627686758, "flos": 25305065072640.0, "grad_norm": 2.172699570421913, "language_loss": 0.81745672, "learning_rate": 3.998942539520158e-06, "loss": 0.8408345, "num_input_tokens_seen": 14112150, "step": 666, "time_per_iteration": 2.6743290424346924 }, { "auxiliary_loss_clip": 0.01241565, "auxiliary_loss_mlp": 0.01073617, "balance_loss_clip": 1.06443083, "balance_loss_mlp": 1.04007161, "epoch": 0.04010220952953555, "flos": 23475847011840.0, "grad_norm": 2.1003520396389828, "language_loss": 0.87117827, "learning_rate": 3.998929838538932e-06, "loss": 0.89433014, "num_input_tokens_seen": 14131475, "step": 667, "time_per_iteration": 2.6147067546844482 }, { "auxiliary_loss_clip": 0.0124275, "auxiliary_loss_mlp": 0.01071583, "balance_loss_clip": 1.07009172, "balance_loss_mlp": 1.04161382, "epoch": 0.04016233278220352, "flos": 18617025254400.0, "grad_norm": 2.331266403294307, "language_loss": 0.80641299, "learning_rate": 3.998917061758087e-06, "loss": 0.82955635, "num_input_tokens_seen": 14146165, "step": 668, "time_per_iteration": 2.6015820503234863 }, { "auxiliary_loss_clip": 0.01115034, "auxiliary_loss_mlp": 0.01008949, "balance_loss_clip": 1.02975297, "balance_loss_mlp": 1.00317907, "epoch": 0.040222456034871484, "flos": 70906194696960.0, "grad_norm": 0.7870483750596657, "language_loss": 0.60066259, "learning_rate": 3.998904209178107e-06, "loss": 0.62190247, "num_input_tokens_seen": 14215005, "step": 669, "time_per_iteration": 3.2993202209472656 }, { "auxiliary_loss_clip": 0.01242272, "auxiliary_loss_mlp": 0.01071485, "balance_loss_clip": 1.06408751, "balance_loss_mlp": 1.04120564, "epoch": 0.040282579287539456, "flos": 23764712186880.0, "grad_norm": 1.7022357666604506, "language_loss": 0.86290276, "learning_rate": 3.9988912807994785e-06, "loss": 0.88604033, "num_input_tokens_seen": 14235510, "step": 670, "time_per_iteration": 2.700657844543457 }, { "auxiliary_loss_clip": 0.01242087, "auxiliary_loss_mlp": 0.01080448, "balance_loss_clip": 1.06647801, "balance_loss_mlp": 1.05014467, "epoch": 0.04034270254020743, "flos": 18478518410880.0, "grad_norm": 1.8224152334464152, "language_loss": 0.75569212, "learning_rate": 3.998878276622692e-06, "loss": 0.77891749, "num_input_tokens_seen": 14254565, "step": 671, "time_per_iteration": 2.6698572635650635 }, { "auxiliary_loss_clip": 0.01248936, "auxiliary_loss_mlp": 0.01076667, "balance_loss_clip": 1.06943047, "balance_loss_mlp": 1.04605412, "epoch": 0.040402825792875394, "flos": 17201858244480.0, "grad_norm": 1.9730812981627939, "language_loss": 0.92416775, "learning_rate": 3.998865196648242e-06, "loss": 0.94742376, "num_input_tokens_seen": 14271885, "step": 672, "time_per_iteration": 2.567563533782959 }, { "auxiliary_loss_clip": 0.01245231, "auxiliary_loss_mlp": 0.010776, "balance_loss_clip": 1.0677104, "balance_loss_mlp": 1.04422188, "epoch": 0.040462949045543366, "flos": 19172168928000.0, "grad_norm": 1.800141829654062, "language_loss": 0.90174723, "learning_rate": 3.998852040876622e-06, "loss": 0.92497551, "num_input_tokens_seen": 14289670, "step": 673, "time_per_iteration": 2.547154426574707 }, { "auxiliary_loss_clip": 0.01239752, "auxiliary_loss_mlp": 0.01084248, "balance_loss_clip": 1.06466973, "balance_loss_mlp": 1.05184698, "epoch": 0.04052307229821133, "flos": 24019821555840.0, "grad_norm": 2.3989934860433486, "language_loss": 0.75016737, "learning_rate": 3.998838809308334e-06, "loss": 0.7734074, "num_input_tokens_seen": 14309285, "step": 674, "time_per_iteration": 2.681896924972534 }, { "auxiliary_loss_clip": 0.01249861, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06744063, "balance_loss_mlp": 1.03334963, "epoch": 0.0405831955508793, "flos": 16436601964800.0, "grad_norm": 2.55613513039197, "language_loss": 0.78289407, "learning_rate": 3.9988255019438766e-06, "loss": 0.80603576, "num_input_tokens_seen": 14328300, "step": 675, "time_per_iteration": 2.6965043544769287 }, { "auxiliary_loss_clip": 0.01241749, "auxiliary_loss_mlp": 0.01079652, "balance_loss_clip": 1.06532836, "balance_loss_mlp": 1.04648817, "epoch": 0.040643318803547275, "flos": 24279922915200.0, "grad_norm": 2.047384767684118, "language_loss": 0.76844448, "learning_rate": 3.998812118783757e-06, "loss": 0.79165846, "num_input_tokens_seen": 14346395, "step": 676, "time_per_iteration": 2.6216623783111572 }, { "auxiliary_loss_clip": 0.01248147, "auxiliary_loss_mlp": 0.01079294, "balance_loss_clip": 1.06811619, "balance_loss_mlp": 1.04813254, "epoch": 0.04070344205621524, "flos": 17712076982400.0, "grad_norm": 2.318905665785744, "language_loss": 0.85139382, "learning_rate": 3.9987986598284804e-06, "loss": 0.8746683, "num_input_tokens_seen": 14364605, "step": 677, "time_per_iteration": 2.5663015842437744 }, { "auxiliary_loss_clip": 0.01240385, "auxiliary_loss_mlp": 0.01070741, "balance_loss_clip": 1.06558609, "balance_loss_mlp": 1.03901923, "epoch": 0.04076356530888321, "flos": 26177658168960.0, "grad_norm": 2.5041724349122645, "language_loss": 0.76572061, "learning_rate": 3.998785125078559e-06, "loss": 0.78883183, "num_input_tokens_seen": 14385265, "step": 678, "time_per_iteration": 2.624689817428589 }, { "auxiliary_loss_clip": 0.01240972, "auxiliary_loss_mlp": 0.01072606, "balance_loss_clip": 1.06374967, "balance_loss_mlp": 1.04242194, "epoch": 0.04082368856155118, "flos": 35773455772800.0, "grad_norm": 1.7096242150987748, "language_loss": 0.82139099, "learning_rate": 3.998771514534505e-06, "loss": 0.84452677, "num_input_tokens_seen": 14406090, "step": 679, "time_per_iteration": 2.7073023319244385 }, { "auxiliary_loss_clip": 0.01248879, "auxiliary_loss_mlp": 0.01064116, "balance_loss_clip": 1.07185793, "balance_loss_mlp": 1.0340035, "epoch": 0.04088381181421915, "flos": 28146640049280.0, "grad_norm": 1.963288262989073, "language_loss": 0.76260424, "learning_rate": 3.998757828196835e-06, "loss": 0.78573418, "num_input_tokens_seen": 14425130, "step": 680, "time_per_iteration": 2.6767218112945557 }, { "auxiliary_loss_clip": 0.01244441, "auxiliary_loss_mlp": 0.01071738, "balance_loss_clip": 1.06458521, "balance_loss_mlp": 1.03864551, "epoch": 0.04094393506688712, "flos": 27597673514880.0, "grad_norm": 1.713943858995997, "language_loss": 0.83089912, "learning_rate": 3.9987440660660685e-06, "loss": 0.85406095, "num_input_tokens_seen": 14447355, "step": 681, "time_per_iteration": 2.6386382579803467 }, { "auxiliary_loss_clip": 0.01244279, "auxiliary_loss_mlp": 0.01073303, "balance_loss_clip": 1.06438065, "balance_loss_mlp": 1.04127121, "epoch": 0.04100405831955509, "flos": 23112036109440.0, "grad_norm": 1.706698119772261, "language_loss": 0.71538687, "learning_rate": 3.998730228142726e-06, "loss": 0.7385627, "num_input_tokens_seen": 14466790, "step": 682, "time_per_iteration": 2.618792772293091 }, { "auxiliary_loss_clip": 0.01243156, "auxiliary_loss_mlp": 0.01078429, "balance_loss_clip": 1.06440282, "balance_loss_mlp": 1.04781592, "epoch": 0.04106418157222306, "flos": 20156731695360.0, "grad_norm": 1.6947476714586034, "language_loss": 0.72599399, "learning_rate": 3.998716314427333e-06, "loss": 0.74920982, "num_input_tokens_seen": 14485195, "step": 683, "time_per_iteration": 2.676133394241333 }, { "auxiliary_loss_clip": 0.01241071, "auxiliary_loss_mlp": 0.01079531, "balance_loss_clip": 1.07077932, "balance_loss_mlp": 1.04851258, "epoch": 0.041124304824891024, "flos": 17420697855360.0, "grad_norm": 2.098652785935233, "language_loss": 0.81419414, "learning_rate": 3.998702324920417e-06, "loss": 0.8374002, "num_input_tokens_seen": 14503370, "step": 684, "time_per_iteration": 2.6538476943969727 }, { "auxiliary_loss_clip": 0.01242791, "auxiliary_loss_mlp": 0.0107365, "balance_loss_clip": 1.06783867, "balance_loss_mlp": 1.04139185, "epoch": 0.041184428077558996, "flos": 25780163287680.0, "grad_norm": 1.5053911947555274, "language_loss": 0.90680599, "learning_rate": 3.9986882596225085e-06, "loss": 0.92997038, "num_input_tokens_seen": 14526415, "step": 685, "time_per_iteration": 2.6541450023651123 }, { "auxiliary_loss_clip": 0.01244219, "auxiliary_loss_mlp": 0.01072481, "balance_loss_clip": 1.06659365, "balance_loss_mlp": 1.04093838, "epoch": 0.04124455133022697, "flos": 22964766347520.0, "grad_norm": 2.2251875217653185, "language_loss": 0.87851977, "learning_rate": 3.998674118534141e-06, "loss": 0.90168673, "num_input_tokens_seen": 14546595, "step": 686, "time_per_iteration": 2.7298531532287598 }, { "auxiliary_loss_clip": 0.01247476, "auxiliary_loss_mlp": 0.01073385, "balance_loss_clip": 1.06586432, "balance_loss_mlp": 1.04224789, "epoch": 0.04130467458289493, "flos": 21289067015040.0, "grad_norm": 1.8582614005091855, "language_loss": 0.7152915, "learning_rate": 3.998659901655851e-06, "loss": 0.73850012, "num_input_tokens_seen": 14566590, "step": 687, "time_per_iteration": 2.6284232139587402 }, { "auxiliary_loss_clip": 0.01243582, "auxiliary_loss_mlp": 0.01076448, "balance_loss_clip": 1.06979251, "balance_loss_mlp": 1.04756403, "epoch": 0.041364797835562905, "flos": 19974233669760.0, "grad_norm": 2.596672934278983, "language_loss": 0.86028284, "learning_rate": 3.998645608988177e-06, "loss": 0.88348317, "num_input_tokens_seen": 14585965, "step": 688, "time_per_iteration": 2.522634506225586 }, { "auxiliary_loss_clip": 0.01241593, "auxiliary_loss_mlp": 0.01079647, "balance_loss_clip": 1.06802177, "balance_loss_mlp": 1.04908216, "epoch": 0.04142492108823087, "flos": 21906227520000.0, "grad_norm": 2.852238187591699, "language_loss": 0.83393514, "learning_rate": 3.998631240531661e-06, "loss": 0.85714757, "num_input_tokens_seen": 14606015, "step": 689, "time_per_iteration": 2.6140944957733154 }, { "auxiliary_loss_clip": 0.01238254, "auxiliary_loss_mlp": 0.01085009, "balance_loss_clip": 1.06293654, "balance_loss_mlp": 1.05463421, "epoch": 0.04148504434089884, "flos": 27639617621760.0, "grad_norm": 2.870474577544969, "language_loss": 0.68398476, "learning_rate": 3.998616796286848e-06, "loss": 0.70721734, "num_input_tokens_seen": 14629955, "step": 690, "time_per_iteration": 2.658987522125244 }, { "auxiliary_loss_clip": 0.01235903, "auxiliary_loss_mlp": 0.01075275, "balance_loss_clip": 1.0625304, "balance_loss_mlp": 1.04565191, "epoch": 0.041545167593566815, "flos": 20518387781760.0, "grad_norm": 1.634289561889102, "language_loss": 0.74927461, "learning_rate": 3.998602276254286e-06, "loss": 0.77238643, "num_input_tokens_seen": 14648000, "step": 691, "time_per_iteration": 2.599957227706909 }, { "auxiliary_loss_clip": 0.01239089, "auxiliary_loss_mlp": 0.01081705, "balance_loss_clip": 1.06458938, "balance_loss_mlp": 1.04978108, "epoch": 0.04160529084623478, "flos": 11868907939200.0, "grad_norm": 2.123432521314224, "language_loss": 0.84469771, "learning_rate": 3.998587680434526e-06, "loss": 0.86790562, "num_input_tokens_seen": 14662235, "step": 692, "time_per_iteration": 2.5748491287231445 }, { "auxiliary_loss_clip": 0.01242126, "auxiliary_loss_mlp": 0.01076613, "balance_loss_clip": 1.06274796, "balance_loss_mlp": 1.04409313, "epoch": 0.04166541409890275, "flos": 14828306503680.0, "grad_norm": 2.3463094595874665, "language_loss": 0.88948715, "learning_rate": 3.99857300882812e-06, "loss": 0.91267455, "num_input_tokens_seen": 14676065, "step": 693, "time_per_iteration": 2.569277286529541 }, { "auxiliary_loss_clip": 0.01245438, "auxiliary_loss_mlp": 0.01071471, "balance_loss_clip": 1.06845784, "balance_loss_mlp": 1.04123962, "epoch": 0.04172553735157072, "flos": 25808137004160.0, "grad_norm": 5.499777597079252, "language_loss": 0.81987685, "learning_rate": 3.998558261435626e-06, "loss": 0.84304595, "num_input_tokens_seen": 14694955, "step": 694, "time_per_iteration": 2.6798722743988037 }, { "auxiliary_loss_clip": 0.01242101, "auxiliary_loss_mlp": 0.01073692, "balance_loss_clip": 1.06179321, "balance_loss_mlp": 1.04303181, "epoch": 0.04178566060423869, "flos": 24279815174400.0, "grad_norm": 2.051302362473346, "language_loss": 0.83672506, "learning_rate": 3.9985434382576015e-06, "loss": 0.85988301, "num_input_tokens_seen": 14715510, "step": 695, "time_per_iteration": 2.684537649154663 }, { "auxiliary_loss_clip": 0.01242205, "auxiliary_loss_mlp": 0.01080004, "balance_loss_clip": 1.06535804, "balance_loss_mlp": 1.04822254, "epoch": 0.04184578385690666, "flos": 18222008411520.0, "grad_norm": 2.113561459264794, "language_loss": 0.84351176, "learning_rate": 3.99852853929461e-06, "loss": 0.86673379, "num_input_tokens_seen": 14731755, "step": 696, "time_per_iteration": 4.1141321659088135 }, { "auxiliary_loss_clip": 0.01238462, "auxiliary_loss_mlp": 0.01083207, "balance_loss_clip": 1.06265593, "balance_loss_mlp": 1.05099702, "epoch": 0.041905907109574626, "flos": 22776342577920.0, "grad_norm": 6.921460264787684, "language_loss": 0.93193012, "learning_rate": 3.998513564547216e-06, "loss": 0.95514685, "num_input_tokens_seen": 14750810, "step": 697, "time_per_iteration": 5.71666693687439 }, { "auxiliary_loss_clip": 0.01235964, "auxiliary_loss_mlp": 0.01074448, "balance_loss_clip": 1.06324339, "balance_loss_mlp": 1.04495573, "epoch": 0.0419660303622426, "flos": 20156947176960.0, "grad_norm": 2.1002029886241904, "language_loss": 0.83775562, "learning_rate": 3.998498514015987e-06, "loss": 0.86085975, "num_input_tokens_seen": 14768435, "step": 698, "time_per_iteration": 4.194530010223389 }, { "auxiliary_loss_clip": 0.01239177, "auxiliary_loss_mlp": 0.01093516, "balance_loss_clip": 1.06274605, "balance_loss_mlp": 1.06175828, "epoch": 0.042026153614910564, "flos": 23076376882560.0, "grad_norm": 2.1234669437327955, "language_loss": 0.91715962, "learning_rate": 3.998483387701495e-06, "loss": 0.94048655, "num_input_tokens_seen": 14786690, "step": 699, "time_per_iteration": 2.6399078369140625 }, { "auxiliary_loss_clip": 0.01113327, "auxiliary_loss_mlp": 0.0102038, "balance_loss_clip": 1.03020263, "balance_loss_mlp": 1.01403797, "epoch": 0.042086276867578536, "flos": 64495243370880.0, "grad_norm": 0.9035134571641164, "language_loss": 0.67873394, "learning_rate": 3.998468185604312e-06, "loss": 0.70007098, "num_input_tokens_seen": 14853840, "step": 700, "time_per_iteration": 3.192026376724243 }, { "auxiliary_loss_clip": 0.01246765, "auxiliary_loss_mlp": 0.01082955, "balance_loss_clip": 1.06717515, "balance_loss_mlp": 1.05017269, "epoch": 0.04214640012024651, "flos": 15487016065920.0, "grad_norm": 2.2754848646841888, "language_loss": 0.884673, "learning_rate": 3.998452907725016e-06, "loss": 0.90797025, "num_input_tokens_seen": 14869580, "step": 701, "time_per_iteration": 2.5790441036224365 }, { "auxiliary_loss_clip": 0.01242428, "auxiliary_loss_mlp": 0.01080259, "balance_loss_clip": 1.06793952, "balance_loss_mlp": 1.04833448, "epoch": 0.04220652337291447, "flos": 23877040993920.0, "grad_norm": 2.000128536077818, "language_loss": 0.67100394, "learning_rate": 3.998437554064184e-06, "loss": 0.69423079, "num_input_tokens_seen": 14891065, "step": 702, "time_per_iteration": 2.6247870922088623 }, { "auxiliary_loss_clip": 0.01107168, "auxiliary_loss_mlp": 0.01005563, "balance_loss_clip": 1.02512407, "balance_loss_mlp": 0.99922067, "epoch": 0.042266646625582445, "flos": 63795451628160.0, "grad_norm": 0.8439205282656718, "language_loss": 0.60756463, "learning_rate": 3.9984221246224006e-06, "loss": 0.62869191, "num_input_tokens_seen": 14954815, "step": 703, "time_per_iteration": 3.1991655826568604 }, { "auxiliary_loss_clip": 0.01107933, "auxiliary_loss_mlp": 0.01006502, "balance_loss_clip": 1.02562141, "balance_loss_mlp": 0.99973089, "epoch": 0.04232676987825041, "flos": 50018863345920.0, "grad_norm": 1.0471369072250156, "language_loss": 0.57677412, "learning_rate": 3.9984066194002494e-06, "loss": 0.59791845, "num_input_tokens_seen": 15003050, "step": 704, "time_per_iteration": 3.037705659866333 }, { "auxiliary_loss_clip": 0.01241513, "auxiliary_loss_mlp": 0.01072126, "balance_loss_clip": 1.06549489, "balance_loss_mlp": 1.0406549, "epoch": 0.04238689313091838, "flos": 21616105368960.0, "grad_norm": 2.9488804643242488, "language_loss": 0.87553984, "learning_rate": 3.998391038398319e-06, "loss": 0.89867628, "num_input_tokens_seen": 15021990, "step": 705, "time_per_iteration": 2.6233222484588623 }, { "auxiliary_loss_clip": 0.01230342, "auxiliary_loss_mlp": 0.0107194, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.04204249, "epoch": 0.042447016383586354, "flos": 19135109070720.0, "grad_norm": 2.556815837902013, "language_loss": 0.71071029, "learning_rate": 3.998375381617201e-06, "loss": 0.73373306, "num_input_tokens_seen": 15040700, "step": 706, "time_per_iteration": 2.560434579849243 }, { "auxiliary_loss_clip": 0.0123412, "auxiliary_loss_mlp": 0.01070349, "balance_loss_clip": 1.06249404, "balance_loss_mlp": 1.03799582, "epoch": 0.04250713963625432, "flos": 24426007528320.0, "grad_norm": 2.0814078632624167, "language_loss": 0.93418455, "learning_rate": 3.9983596490574875e-06, "loss": 0.95722926, "num_input_tokens_seen": 15056725, "step": 707, "time_per_iteration": 2.6130473613739014 }, { "auxiliary_loss_clip": 0.01237541, "auxiliary_loss_mlp": 0.01067908, "balance_loss_clip": 1.05994225, "balance_loss_mlp": 1.03617477, "epoch": 0.04256726288892229, "flos": 30367391333760.0, "grad_norm": 2.424205580643553, "language_loss": 0.81514043, "learning_rate": 3.998343840719776e-06, "loss": 0.83819497, "num_input_tokens_seen": 15077550, "step": 708, "time_per_iteration": 2.656277894973755 }, { "auxiliary_loss_clip": 0.01243932, "auxiliary_loss_mlp": 0.0108167, "balance_loss_clip": 1.06461239, "balance_loss_mlp": 1.04934049, "epoch": 0.04262738614159026, "flos": 16362661818240.0, "grad_norm": 2.0883592727868145, "language_loss": 0.82027614, "learning_rate": 3.998327956604666e-06, "loss": 0.8435322, "num_input_tokens_seen": 15094955, "step": 709, "time_per_iteration": 2.5758891105651855 }, { "auxiliary_loss_clip": 0.01243538, "auxiliary_loss_mlp": 0.01071217, "balance_loss_clip": 1.06374872, "balance_loss_mlp": 1.03960264, "epoch": 0.04268750939425823, "flos": 20412379768320.0, "grad_norm": 2.7686525844665133, "language_loss": 0.8502059, "learning_rate": 3.99831199671276e-06, "loss": 0.87335348, "num_input_tokens_seen": 15113395, "step": 710, "time_per_iteration": 2.571559429168701 }, { "auxiliary_loss_clip": 0.0124498, "auxiliary_loss_mlp": 0.01072229, "balance_loss_clip": 1.06788397, "balance_loss_mlp": 1.04166365, "epoch": 0.0427476326469262, "flos": 20302959962880.0, "grad_norm": 7.911177124524585, "language_loss": 0.84914303, "learning_rate": 3.998295961044662e-06, "loss": 0.87231517, "num_input_tokens_seen": 15132920, "step": 711, "time_per_iteration": 2.569959878921509 }, { "auxiliary_loss_clip": 0.01237769, "auxiliary_loss_mlp": 0.01074338, "balance_loss_clip": 1.06188083, "balance_loss_mlp": 1.04229426, "epoch": 0.042807755899594166, "flos": 21650794928640.0, "grad_norm": 1.7189790042473796, "language_loss": 0.85439789, "learning_rate": 3.9982798496009804e-06, "loss": 0.87751901, "num_input_tokens_seen": 15153115, "step": 712, "time_per_iteration": 2.6200509071350098 }, { "auxiliary_loss_clip": 0.01242397, "auxiliary_loss_mlp": 0.01069523, "balance_loss_clip": 1.06085837, "balance_loss_mlp": 1.03989983, "epoch": 0.04286787915226214, "flos": 21435007973760.0, "grad_norm": 5.490507523621204, "language_loss": 0.91178697, "learning_rate": 3.998263662382328e-06, "loss": 0.93490618, "num_input_tokens_seen": 15172770, "step": 713, "time_per_iteration": 2.6353416442871094 }, { "auxiliary_loss_clip": 0.01104693, "auxiliary_loss_mlp": 0.01006514, "balance_loss_clip": 1.02325606, "balance_loss_mlp": 0.99955195, "epoch": 0.04292800240493011, "flos": 66397970615040.0, "grad_norm": 0.9310328114407391, "language_loss": 0.63725489, "learning_rate": 3.9982473993893165e-06, "loss": 0.65836698, "num_input_tokens_seen": 15240055, "step": 714, "time_per_iteration": 3.2544445991516113 }, { "auxiliary_loss_clip": 0.01239175, "auxiliary_loss_mlp": 0.01085992, "balance_loss_clip": 1.06602359, "balance_loss_mlp": 1.05552244, "epoch": 0.042988125657598075, "flos": 31650264552960.0, "grad_norm": 1.8449858143817996, "language_loss": 0.75010103, "learning_rate": 3.998231060622563e-06, "loss": 0.77335274, "num_input_tokens_seen": 15261585, "step": 715, "time_per_iteration": 2.7048466205596924 }, { "auxiliary_loss_clip": 0.01242734, "auxiliary_loss_mlp": 0.01074126, "balance_loss_clip": 1.0666225, "balance_loss_mlp": 1.04227352, "epoch": 0.04304824891026605, "flos": 33248468292480.0, "grad_norm": 1.9505519101092619, "language_loss": 0.72289199, "learning_rate": 3.998214646082688e-06, "loss": 0.74606061, "num_input_tokens_seen": 15281160, "step": 716, "time_per_iteration": 2.7807397842407227 }, { "auxiliary_loss_clip": 0.01104303, "auxiliary_loss_mlp": 0.01006894, "balance_loss_clip": 1.02277207, "balance_loss_mlp": 0.99997944, "epoch": 0.04310837216293401, "flos": 64064782782720.0, "grad_norm": 0.9245106661639481, "language_loss": 0.65587437, "learning_rate": 3.998198155770314e-06, "loss": 0.67698634, "num_input_tokens_seen": 15344505, "step": 717, "time_per_iteration": 3.250870943069458 }, { "auxiliary_loss_clip": 0.01103971, "auxiliary_loss_mlp": 0.01009587, "balance_loss_clip": 1.02238059, "balance_loss_mlp": 1.00267255, "epoch": 0.043168495415601985, "flos": 61343757849600.0, "grad_norm": 0.9849394627593366, "language_loss": 0.58785796, "learning_rate": 3.998181589686065e-06, "loss": 0.60899353, "num_input_tokens_seen": 15404050, "step": 718, "time_per_iteration": 3.0402464866638184 }, { "auxiliary_loss_clip": 0.0124025, "auxiliary_loss_mlp": 0.0107507, "balance_loss_clip": 1.06784248, "balance_loss_mlp": 1.0424546, "epoch": 0.04322861866826996, "flos": 20704261685760.0, "grad_norm": 1.9557310597444375, "language_loss": 0.91440111, "learning_rate": 3.99816494783057e-06, "loss": 0.9375543, "num_input_tokens_seen": 15424190, "step": 719, "time_per_iteration": 2.6500089168548584 }, { "auxiliary_loss_clip": 0.01235843, "auxiliary_loss_mlp": 0.01072906, "balance_loss_clip": 1.06020999, "balance_loss_mlp": 1.04296041, "epoch": 0.04328874192093792, "flos": 30373352991360.0, "grad_norm": 1.7057721639328365, "language_loss": 0.66461253, "learning_rate": 3.99814823020446e-06, "loss": 0.68770003, "num_input_tokens_seen": 15446500, "step": 720, "time_per_iteration": 2.673184871673584 }, { "auxiliary_loss_clip": 0.01234245, "auxiliary_loss_mlp": 0.01072069, "balance_loss_clip": 1.06111717, "balance_loss_mlp": 1.04131258, "epoch": 0.043348865173605894, "flos": 21944795748480.0, "grad_norm": 1.9491363249287763, "language_loss": 0.77460182, "learning_rate": 3.9981314368083684e-06, "loss": 0.79766488, "num_input_tokens_seen": 15465830, "step": 721, "time_per_iteration": 2.6695611476898193 }, { "auxiliary_loss_clip": 0.01241854, "auxiliary_loss_mlp": 0.01087169, "balance_loss_clip": 1.06622314, "balance_loss_mlp": 1.05719972, "epoch": 0.04340898842627386, "flos": 15264225959040.0, "grad_norm": 2.8383174670702718, "language_loss": 0.88298881, "learning_rate": 3.998114567642933e-06, "loss": 0.90627909, "num_input_tokens_seen": 15479985, "step": 722, "time_per_iteration": 2.661313533782959 }, { "auxiliary_loss_clip": 0.01244836, "auxiliary_loss_mlp": 0.01076885, "balance_loss_clip": 1.06665182, "balance_loss_mlp": 1.0480125, "epoch": 0.04346911167894183, "flos": 27965434913280.0, "grad_norm": 5.515838365549148, "language_loss": 0.84387141, "learning_rate": 3.998097622708792e-06, "loss": 0.86708868, "num_input_tokens_seen": 15501545, "step": 723, "time_per_iteration": 2.6447954177856445 }, { "auxiliary_loss_clip": 0.01245825, "auxiliary_loss_mlp": 0.01081354, "balance_loss_clip": 1.06723523, "balance_loss_mlp": 1.05019248, "epoch": 0.0435292349316098, "flos": 29242202820480.0, "grad_norm": 1.7852936089408447, "language_loss": 0.82789439, "learning_rate": 3.99808060200659e-06, "loss": 0.85116619, "num_input_tokens_seen": 15521725, "step": 724, "time_per_iteration": 2.676985263824463 }, { "auxiliary_loss_clip": 0.0124127, "auxiliary_loss_mlp": 0.01087491, "balance_loss_clip": 1.06535757, "balance_loss_mlp": 1.05609179, "epoch": 0.04358935818427777, "flos": 20558356640640.0, "grad_norm": 2.011685360503238, "language_loss": 0.79444051, "learning_rate": 3.998063505536971e-06, "loss": 0.81772816, "num_input_tokens_seen": 15540910, "step": 725, "time_per_iteration": 2.6241447925567627 }, { "auxiliary_loss_clip": 0.01251777, "auxiliary_loss_mlp": 0.01074923, "balance_loss_clip": 1.06783843, "balance_loss_mlp": 1.04309392, "epoch": 0.04364948143694574, "flos": 14464926564480.0, "grad_norm": 2.2160842755462817, "language_loss": 0.87175703, "learning_rate": 3.998046333300584e-06, "loss": 0.89502406, "num_input_tokens_seen": 15558640, "step": 726, "time_per_iteration": 2.555551052093506 }, { "auxiliary_loss_clip": 0.01100917, "auxiliary_loss_mlp": 0.01015411, "balance_loss_clip": 1.02171838, "balance_loss_mlp": 1.00947404, "epoch": 0.043709604689613706, "flos": 50067268922880.0, "grad_norm": 0.908981905466007, "language_loss": 0.55868411, "learning_rate": 3.998029085298079e-06, "loss": 0.5798474, "num_input_tokens_seen": 15612975, "step": 727, "time_per_iteration": 3.375901699066162 }, { "auxiliary_loss_clip": 0.01245647, "auxiliary_loss_mlp": 0.0108809, "balance_loss_clip": 1.06717396, "balance_loss_mlp": 1.05614173, "epoch": 0.04376972794228168, "flos": 13991588115840.0, "grad_norm": 2.282663852625415, "language_loss": 0.82326066, "learning_rate": 3.998011761530112e-06, "loss": 0.84659809, "num_input_tokens_seen": 15631070, "step": 728, "time_per_iteration": 2.605970621109009 }, { "auxiliary_loss_clip": 0.01237902, "auxiliary_loss_mlp": 0.01073495, "balance_loss_clip": 1.06600416, "balance_loss_mlp": 1.04321551, "epoch": 0.04382985119494965, "flos": 22009901149440.0, "grad_norm": 2.1303486954703152, "language_loss": 0.76890069, "learning_rate": 3.997994361997338e-06, "loss": 0.7920146, "num_input_tokens_seen": 15647825, "step": 729, "time_per_iteration": 2.652466297149658 }, { "auxiliary_loss_clip": 0.01243746, "auxiliary_loss_mlp": 0.01079207, "balance_loss_clip": 1.06438255, "balance_loss_mlp": 1.04859376, "epoch": 0.043889974447617615, "flos": 24206521472640.0, "grad_norm": 2.1385115795714107, "language_loss": 0.95153189, "learning_rate": 3.997976886700417e-06, "loss": 0.97476137, "num_input_tokens_seen": 15668260, "step": 730, "time_per_iteration": 2.734614133834839 }, { "auxiliary_loss_clip": 0.01238581, "auxiliary_loss_mlp": 0.01074727, "balance_loss_clip": 1.06093788, "balance_loss_mlp": 1.04315984, "epoch": 0.04395009770028559, "flos": 17274541415040.0, "grad_norm": 2.333073864238008, "language_loss": 0.88456279, "learning_rate": 3.997959335640013e-06, "loss": 0.90769589, "num_input_tokens_seen": 15685630, "step": 731, "time_per_iteration": 2.5912294387817383 }, { "auxiliary_loss_clip": 0.01242247, "auxiliary_loss_mlp": 0.01076563, "balance_loss_clip": 1.06636512, "balance_loss_mlp": 1.04757094, "epoch": 0.04401022095295355, "flos": 12310286261760.0, "grad_norm": 3.0398759554531254, "language_loss": 0.88683128, "learning_rate": 3.997941708816791e-06, "loss": 0.9100194, "num_input_tokens_seen": 15698645, "step": 732, "time_per_iteration": 2.5897367000579834 }, { "auxiliary_loss_clip": 0.01242736, "auxiliary_loss_mlp": 0.01087795, "balance_loss_clip": 1.06544232, "balance_loss_mlp": 1.05646718, "epoch": 0.044070344205621524, "flos": 20959658363520.0, "grad_norm": 2.304959545118842, "language_loss": 0.85829747, "learning_rate": 3.997924006231419e-06, "loss": 0.88160276, "num_input_tokens_seen": 15716775, "step": 733, "time_per_iteration": 2.650681972503662 }, { "auxiliary_loss_clip": 0.01246603, "auxiliary_loss_mlp": 0.01088724, "balance_loss_clip": 1.06722379, "balance_loss_mlp": 1.05544066, "epoch": 0.044130467458289496, "flos": 13845288021120.0, "grad_norm": 2.207780377909299, "language_loss": 0.91189414, "learning_rate": 3.9979062278845685e-06, "loss": 0.93524742, "num_input_tokens_seen": 15733320, "step": 734, "time_per_iteration": 2.5956180095672607 }, { "auxiliary_loss_clip": 0.01238395, "auxiliary_loss_mlp": 0.01067579, "balance_loss_clip": 1.06596422, "balance_loss_mlp": 1.03781235, "epoch": 0.04419059071095746, "flos": 28655063107200.0, "grad_norm": 1.9297536072777384, "language_loss": 0.77884138, "learning_rate": 3.9978883737769125e-06, "loss": 0.8019011, "num_input_tokens_seen": 15752705, "step": 735, "time_per_iteration": 2.603809118270874 }, { "auxiliary_loss_clip": 0.01234188, "auxiliary_loss_mlp": 0.01070499, "balance_loss_clip": 1.06063068, "balance_loss_mlp": 1.04091144, "epoch": 0.04425071396362543, "flos": 28183304856960.0, "grad_norm": 2.266122200005257, "language_loss": 0.8832593, "learning_rate": 3.9978704439091305e-06, "loss": 0.90630615, "num_input_tokens_seen": 15772800, "step": 736, "time_per_iteration": 5.841086149215698 }, { "auxiliary_loss_clip": 0.01235947, "auxiliary_loss_mlp": 0.01081098, "balance_loss_clip": 1.06597185, "balance_loss_mlp": 1.05165362, "epoch": 0.0443108372162934, "flos": 23658452778240.0, "grad_norm": 1.8984177574034653, "language_loss": 0.84481263, "learning_rate": 3.997852438281901e-06, "loss": 0.8679831, "num_input_tokens_seen": 15793665, "step": 737, "time_per_iteration": 4.1386003494262695 }, { "auxiliary_loss_clip": 0.01240863, "auxiliary_loss_mlp": 0.01072388, "balance_loss_clip": 1.0653491, "balance_loss_mlp": 1.03961766, "epoch": 0.04437096046896137, "flos": 33979861025280.0, "grad_norm": 2.2366199062134706, "language_loss": 0.84712577, "learning_rate": 3.997834356895906e-06, "loss": 0.87025833, "num_input_tokens_seen": 15813175, "step": 738, "time_per_iteration": 4.447159290313721 }, { "auxiliary_loss_clip": 0.01098733, "auxiliary_loss_mlp": 0.0102196, "balance_loss_clip": 1.02144337, "balance_loss_mlp": 1.01685739, "epoch": 0.04443108372162934, "flos": 67397506375680.0, "grad_norm": 0.8779518557387592, "language_loss": 0.59179878, "learning_rate": 3.9978161997518324e-06, "loss": 0.61300576, "num_input_tokens_seen": 15872050, "step": 739, "time_per_iteration": 3.0780396461486816 }, { "auxiliary_loss_clip": 0.012386, "auxiliary_loss_mlp": 0.01067387, "balance_loss_clip": 1.06604302, "balance_loss_mlp": 1.03717899, "epoch": 0.04449120697429731, "flos": 29752672953600.0, "grad_norm": 2.295102845773205, "language_loss": 0.91329807, "learning_rate": 3.997797966850369e-06, "loss": 0.93635798, "num_input_tokens_seen": 15891085, "step": 740, "time_per_iteration": 2.6687562465667725 }, { "auxiliary_loss_clip": 0.01243424, "auxiliary_loss_mlp": 0.01067832, "balance_loss_clip": 1.06807768, "balance_loss_mlp": 1.03929377, "epoch": 0.04455133022696528, "flos": 36502119072000.0, "grad_norm": 2.0543845689042484, "language_loss": 0.71875739, "learning_rate": 3.997779658192205e-06, "loss": 0.74186987, "num_input_tokens_seen": 15914225, "step": 741, "time_per_iteration": 2.707231283187866 }, { "auxiliary_loss_clip": 0.01233192, "auxiliary_loss_mlp": 0.01084138, "balance_loss_clip": 1.062482, "balance_loss_mlp": 1.05476475, "epoch": 0.044611453479633245, "flos": 28803661672320.0, "grad_norm": 1.7086571433899975, "language_loss": 0.88933527, "learning_rate": 3.997761273778037e-06, "loss": 0.91250861, "num_input_tokens_seen": 15934540, "step": 742, "time_per_iteration": 2.6647751331329346 }, { "auxiliary_loss_clip": 0.01237248, "auxiliary_loss_mlp": 0.0106534, "balance_loss_clip": 1.06481838, "balance_loss_mlp": 1.03367805, "epoch": 0.04467157673230122, "flos": 20010970304640.0, "grad_norm": 1.9055071619943689, "language_loss": 0.83840811, "learning_rate": 3.997742813608561e-06, "loss": 0.86143398, "num_input_tokens_seen": 15952560, "step": 743, "time_per_iteration": 2.697864055633545 }, { "auxiliary_loss_clip": 0.01239398, "auxiliary_loss_mlp": 0.01073846, "balance_loss_clip": 1.06395566, "balance_loss_mlp": 1.04373407, "epoch": 0.04473169998496919, "flos": 18004964480640.0, "grad_norm": 2.2041873634107696, "language_loss": 0.80026019, "learning_rate": 3.997724277684479e-06, "loss": 0.82339263, "num_input_tokens_seen": 15970620, "step": 744, "time_per_iteration": 2.6551101207733154 }, { "auxiliary_loss_clip": 0.01236158, "auxiliary_loss_mlp": 0.01076186, "balance_loss_clip": 1.06385589, "balance_loss_mlp": 1.04665816, "epoch": 0.044791823237637154, "flos": 20631722169600.0, "grad_norm": 2.139129927663487, "language_loss": 0.85502481, "learning_rate": 3.99770566600649e-06, "loss": 0.87814826, "num_input_tokens_seen": 15987325, "step": 745, "time_per_iteration": 2.6686010360717773 }, { "auxiliary_loss_clip": 0.01235001, "auxiliary_loss_mlp": 0.01066107, "balance_loss_clip": 1.06320596, "balance_loss_mlp": 1.03594685, "epoch": 0.04485194649030513, "flos": 31176171918720.0, "grad_norm": 1.8251828520192552, "language_loss": 0.69291008, "learning_rate": 3.997686978575302e-06, "loss": 0.71592116, "num_input_tokens_seen": 16008310, "step": 746, "time_per_iteration": 2.6782095432281494 }, { "auxiliary_loss_clip": 0.01244022, "auxiliary_loss_mlp": 0.01081644, "balance_loss_clip": 1.07012939, "balance_loss_mlp": 1.05000615, "epoch": 0.04491206974297309, "flos": 26143291831680.0, "grad_norm": 3.6053643469900982, "language_loss": 0.68531066, "learning_rate": 3.997668215391625e-06, "loss": 0.70856726, "num_input_tokens_seen": 16029620, "step": 747, "time_per_iteration": 2.6589114665985107 }, { "auxiliary_loss_clip": 0.0124018, "auxiliary_loss_mlp": 0.01083594, "balance_loss_clip": 1.0652504, "balance_loss_mlp": 1.05183625, "epoch": 0.044972192995641064, "flos": 20667668705280.0, "grad_norm": 1.8376208182131786, "language_loss": 0.66778374, "learning_rate": 3.997649376456168e-06, "loss": 0.69102144, "num_input_tokens_seen": 16049065, "step": 748, "time_per_iteration": 2.674691677093506 }, { "auxiliary_loss_clip": 0.01243343, "auxiliary_loss_mlp": 0.01085665, "balance_loss_clip": 1.07101417, "balance_loss_mlp": 1.05596995, "epoch": 0.045032316248309036, "flos": 16106834177280.0, "grad_norm": 2.4197486882062322, "language_loss": 0.76684916, "learning_rate": 3.997630461769647e-06, "loss": 0.7901392, "num_input_tokens_seen": 16066765, "step": 749, "time_per_iteration": 2.5940611362457275 }, { "auxiliary_loss_clip": 0.01243381, "auxiliary_loss_mlp": 0.01083303, "balance_loss_clip": 1.06892776, "balance_loss_mlp": 1.05338168, "epoch": 0.045092439500977, "flos": 17858843953920.0, "grad_norm": 1.926675828378473, "language_loss": 0.88739896, "learning_rate": 3.997611471332778e-06, "loss": 0.91066581, "num_input_tokens_seen": 16085980, "step": 750, "time_per_iteration": 2.551717758178711 }, { "auxiliary_loss_clip": 0.01238484, "auxiliary_loss_mlp": 0.01077419, "balance_loss_clip": 1.062783, "balance_loss_mlp": 1.04404092, "epoch": 0.04515256275364497, "flos": 24462815990400.0, "grad_norm": 3.4910287963746116, "language_loss": 0.74371743, "learning_rate": 3.9975924051462825e-06, "loss": 0.76687646, "num_input_tokens_seen": 16106260, "step": 751, "time_per_iteration": 2.6299028396606445 }, { "auxiliary_loss_clip": 0.0123577, "auxiliary_loss_mlp": 0.01078322, "balance_loss_clip": 1.06347609, "balance_loss_mlp": 1.04884171, "epoch": 0.04521268600631294, "flos": 20916385453440.0, "grad_norm": 3.3938056459605583, "language_loss": 0.69115144, "learning_rate": 3.997573263210883e-06, "loss": 0.71429229, "num_input_tokens_seen": 16123475, "step": 752, "time_per_iteration": 2.571223020553589 }, { "auxiliary_loss_clip": 0.01235899, "auxiliary_loss_mlp": 0.01060876, "balance_loss_clip": 1.0627141, "balance_loss_mlp": 1.03212225, "epoch": 0.04527280925898091, "flos": 13371374954880.0, "grad_norm": 2.69328062598792, "language_loss": 0.92126763, "learning_rate": 3.997554045527305e-06, "loss": 0.94423538, "num_input_tokens_seen": 16138335, "step": 753, "time_per_iteration": 2.6100237369537354 }, { "auxiliary_loss_clip": 0.01239023, "auxiliary_loss_mlp": 0.01080271, "balance_loss_clip": 1.06628633, "balance_loss_mlp": 1.05116034, "epoch": 0.04533293251164888, "flos": 23254565276160.0, "grad_norm": 4.138305317267875, "language_loss": 0.91373456, "learning_rate": 3.997534752096277e-06, "loss": 0.93692756, "num_input_tokens_seen": 16157110, "step": 754, "time_per_iteration": 2.642747402191162 }, { "auxiliary_loss_clip": 0.01229195, "auxiliary_loss_mlp": 0.01078016, "balance_loss_clip": 1.06402516, "balance_loss_mlp": 1.04725957, "epoch": 0.04539305576431685, "flos": 12422004537600.0, "grad_norm": 4.559941934311277, "language_loss": 0.78558046, "learning_rate": 3.997515382918531e-06, "loss": 0.80865264, "num_input_tokens_seen": 16174155, "step": 755, "time_per_iteration": 2.6316659450531006 }, { "auxiliary_loss_clip": 0.01240044, "auxiliary_loss_mlp": 0.01081048, "balance_loss_clip": 1.06624937, "balance_loss_mlp": 1.05099559, "epoch": 0.04545317901698482, "flos": 16070995382400.0, "grad_norm": 2.193539224658874, "language_loss": 0.78473848, "learning_rate": 3.9974959379948015e-06, "loss": 0.80794942, "num_input_tokens_seen": 16192240, "step": 756, "time_per_iteration": 2.6390748023986816 }, { "auxiliary_loss_clip": 0.01101224, "auxiliary_loss_mlp": 0.01013849, "balance_loss_clip": 1.02455997, "balance_loss_mlp": 1.0089612, "epoch": 0.045513302269652785, "flos": 66396139021440.0, "grad_norm": 0.8202876780471967, "language_loss": 0.62756521, "learning_rate": 3.997476417325827e-06, "loss": 0.64871597, "num_input_tokens_seen": 16255775, "step": 757, "time_per_iteration": 3.2393198013305664 }, { "auxiliary_loss_clip": 0.01235136, "auxiliary_loss_mlp": 0.01071767, "balance_loss_clip": 1.06455243, "balance_loss_mlp": 1.04346693, "epoch": 0.04557342552232076, "flos": 21471169991040.0, "grad_norm": 1.6528285304744148, "language_loss": 0.84211069, "learning_rate": 3.997456820912346e-06, "loss": 0.86517978, "num_input_tokens_seen": 16277015, "step": 758, "time_per_iteration": 2.6508655548095703 }, { "auxiliary_loss_clip": 0.01228461, "auxiliary_loss_mlp": 0.01067033, "balance_loss_clip": 1.05912399, "balance_loss_mlp": 1.0391618, "epoch": 0.04563354877498873, "flos": 23732680233600.0, "grad_norm": 2.695805662282291, "language_loss": 0.88150775, "learning_rate": 3.997437148755101e-06, "loss": 0.9044627, "num_input_tokens_seen": 16296005, "step": 759, "time_per_iteration": 2.7782890796661377 }, { "auxiliary_loss_clip": 0.01240589, "auxiliary_loss_mlp": 0.01078815, "balance_loss_clip": 1.06747675, "balance_loss_mlp": 1.04846466, "epoch": 0.045693672027656694, "flos": 25735741142400.0, "grad_norm": 2.392455009776849, "language_loss": 0.73440695, "learning_rate": 3.9974174008548405e-06, "loss": 0.75760102, "num_input_tokens_seen": 16315300, "step": 760, "time_per_iteration": 2.7138822078704834 }, { "auxiliary_loss_clip": 0.01240372, "auxiliary_loss_mlp": 0.01079791, "balance_loss_clip": 1.07095265, "balance_loss_mlp": 1.05162191, "epoch": 0.045753795280324666, "flos": 19719016560000.0, "grad_norm": 3.497321311688565, "language_loss": 0.81781888, "learning_rate": 3.9973975772123105e-06, "loss": 0.84102058, "num_input_tokens_seen": 16333820, "step": 761, "time_per_iteration": 2.631303310394287 }, { "auxiliary_loss_clip": 0.01231969, "auxiliary_loss_mlp": 0.01078623, "balance_loss_clip": 1.06324267, "balance_loss_mlp": 1.04922605, "epoch": 0.04581391853299264, "flos": 23255786338560.0, "grad_norm": 2.0632320043111965, "language_loss": 0.79811668, "learning_rate": 3.997377677828266e-06, "loss": 0.82122266, "num_input_tokens_seen": 16355290, "step": 762, "time_per_iteration": 2.646928071975708 }, { "auxiliary_loss_clip": 0.01093869, "auxiliary_loss_mlp": 0.01027943, "balance_loss_clip": 1.01857328, "balance_loss_mlp": 1.02288842, "epoch": 0.0458740417856606, "flos": 64231155601920.0, "grad_norm": 1.0128965743658471, "language_loss": 0.58723813, "learning_rate": 3.9973577027034585e-06, "loss": 0.60845619, "num_input_tokens_seen": 16415995, "step": 763, "time_per_iteration": 3.1712563037872314 }, { "auxiliary_loss_clip": 0.012343, "auxiliary_loss_mlp": 0.01082461, "balance_loss_clip": 1.06205368, "balance_loss_mlp": 1.0531354, "epoch": 0.045934165038328575, "flos": 20770121272320.0, "grad_norm": 4.978761831483118, "language_loss": 0.87544954, "learning_rate": 3.9973376518386475e-06, "loss": 0.89861715, "num_input_tokens_seen": 16433120, "step": 764, "time_per_iteration": 2.5985426902770996 }, { "auxiliary_loss_clip": 0.01236145, "auxiliary_loss_mlp": 0.01087868, "balance_loss_clip": 1.06553543, "balance_loss_mlp": 1.05854285, "epoch": 0.04599428829099654, "flos": 30262891691520.0, "grad_norm": 2.0894169515773067, "language_loss": 0.85966802, "learning_rate": 3.997317525234592e-06, "loss": 0.88290817, "num_input_tokens_seen": 16453360, "step": 765, "time_per_iteration": 2.6572606563568115 }, { "auxiliary_loss_clip": 0.01239644, "auxiliary_loss_mlp": 0.01077398, "balance_loss_clip": 1.06530261, "balance_loss_mlp": 1.04573584, "epoch": 0.04605441154366451, "flos": 23038921975680.0, "grad_norm": 2.628046285830335, "language_loss": 0.88265938, "learning_rate": 3.997297322892056e-06, "loss": 0.90582979, "num_input_tokens_seen": 16471160, "step": 766, "time_per_iteration": 2.673226833343506 }, { "auxiliary_loss_clip": 0.01235506, "auxiliary_loss_mlp": 0.0107998, "balance_loss_clip": 1.06371713, "balance_loss_mlp": 1.05115545, "epoch": 0.046114534796332485, "flos": 22017407091840.0, "grad_norm": 2.343908591401411, "language_loss": 0.84302223, "learning_rate": 3.997277044811806e-06, "loss": 0.86617708, "num_input_tokens_seen": 16488940, "step": 767, "time_per_iteration": 2.683429002761841 }, { "auxiliary_loss_clip": 0.01236229, "auxiliary_loss_mlp": 0.01067844, "balance_loss_clip": 1.06769753, "balance_loss_mlp": 1.03791094, "epoch": 0.04617465804900045, "flos": 29862380067840.0, "grad_norm": 1.9268984031305718, "language_loss": 0.8669976, "learning_rate": 3.99725669099461e-06, "loss": 0.89003831, "num_input_tokens_seen": 16509505, "step": 768, "time_per_iteration": 2.8125200271606445 }, { "auxiliary_loss_clip": 0.01234175, "auxiliary_loss_mlp": 0.01076069, "balance_loss_clip": 1.06150854, "balance_loss_mlp": 1.04738712, "epoch": 0.04623478130166842, "flos": 25630056351360.0, "grad_norm": 2.115272554881108, "language_loss": 0.75152099, "learning_rate": 3.9972362614412395e-06, "loss": 0.77462339, "num_input_tokens_seen": 16528840, "step": 769, "time_per_iteration": 2.7286128997802734 }, { "auxiliary_loss_clip": 0.01229956, "auxiliary_loss_mlp": 0.01072391, "balance_loss_clip": 1.06326365, "balance_loss_mlp": 1.04462695, "epoch": 0.04629490455433639, "flos": 20449080489600.0, "grad_norm": 1.8368669953292174, "language_loss": 0.86292851, "learning_rate": 3.997215756152471e-06, "loss": 0.885952, "num_input_tokens_seen": 16548335, "step": 770, "time_per_iteration": 2.68608021736145 }, { "auxiliary_loss_clip": 0.01239009, "auxiliary_loss_mlp": 0.01072125, "balance_loss_clip": 1.06274092, "balance_loss_mlp": 1.04284704, "epoch": 0.04635502780700436, "flos": 23148736830720.0, "grad_norm": 2.058802627607224, "language_loss": 0.86842889, "learning_rate": 3.99719517512908e-06, "loss": 0.89154023, "num_input_tokens_seen": 16567725, "step": 771, "time_per_iteration": 2.637509822845459 }, { "auxiliary_loss_clip": 0.01239449, "auxiliary_loss_mlp": 0.01079651, "balance_loss_clip": 1.06184912, "balance_loss_mlp": 1.04884768, "epoch": 0.04641515105967233, "flos": 23292020183040.0, "grad_norm": 1.87920888608735, "language_loss": 0.83691382, "learning_rate": 3.997174518371848e-06, "loss": 0.8601048, "num_input_tokens_seen": 16588175, "step": 772, "time_per_iteration": 2.745006561279297 }, { "auxiliary_loss_clip": 0.01236322, "auxiliary_loss_mlp": 0.0107061, "balance_loss_clip": 1.06672883, "balance_loss_mlp": 1.04220271, "epoch": 0.046475274312340296, "flos": 25115204759040.0, "grad_norm": 1.9655107083336736, "language_loss": 0.73639083, "learning_rate": 3.997153785881557e-06, "loss": 0.75946015, "num_input_tokens_seen": 16607735, "step": 773, "time_per_iteration": 2.869290828704834 }, { "auxiliary_loss_clip": 0.01231219, "auxiliary_loss_mlp": 0.01071681, "balance_loss_clip": 1.06529772, "balance_loss_mlp": 1.04054356, "epoch": 0.04653539756500827, "flos": 25264916645760.0, "grad_norm": 2.096431798380756, "language_loss": 0.78228974, "learning_rate": 3.997132977658996e-06, "loss": 0.80531871, "num_input_tokens_seen": 16627225, "step": 774, "time_per_iteration": 2.6967568397521973 }, { "auxiliary_loss_clip": 0.01230587, "auxiliary_loss_mlp": 0.01069519, "balance_loss_clip": 1.06347871, "balance_loss_mlp": 1.04131365, "epoch": 0.046595520817676234, "flos": 35404150089600.0, "grad_norm": 2.018140205527256, "language_loss": 0.73187691, "learning_rate": 3.997112093704952e-06, "loss": 0.75487792, "num_input_tokens_seen": 16647785, "step": 775, "time_per_iteration": 2.737140417098999 }, { "auxiliary_loss_clip": 0.01231996, "auxiliary_loss_mlp": 0.01066454, "balance_loss_clip": 1.06187618, "balance_loss_mlp": 1.03650832, "epoch": 0.046655644070344206, "flos": 18112516778880.0, "grad_norm": 1.668093168561758, "language_loss": 0.77180624, "learning_rate": 3.997091134020217e-06, "loss": 0.7947908, "num_input_tokens_seen": 16667555, "step": 776, "time_per_iteration": 4.154085159301758 }, { "auxiliary_loss_clip": 0.0122577, "auxiliary_loss_mlp": 0.01071334, "balance_loss_clip": 1.06031108, "balance_loss_mlp": 1.04352236, "epoch": 0.04671576732301218, "flos": 29205286617600.0, "grad_norm": 1.9054628166827923, "language_loss": 0.7087816, "learning_rate": 3.997070098605585e-06, "loss": 0.73175263, "num_input_tokens_seen": 16686875, "step": 777, "time_per_iteration": 4.176887512207031 }, { "auxiliary_loss_clip": 0.0122979, "auxiliary_loss_mlp": 0.01076806, "balance_loss_clip": 1.06275606, "balance_loss_mlp": 1.04705119, "epoch": 0.04677589057568014, "flos": 30478319510400.0, "grad_norm": 1.8083238359854679, "language_loss": 0.77069759, "learning_rate": 3.997048987461856e-06, "loss": 0.79376352, "num_input_tokens_seen": 16706420, "step": 778, "time_per_iteration": 5.943394422531128 }, { "auxiliary_loss_clip": 0.01227067, "auxiliary_loss_mlp": 0.01064982, "balance_loss_clip": 1.06043744, "balance_loss_mlp": 1.03563297, "epoch": 0.046836013828348115, "flos": 20557674282240.0, "grad_norm": 2.1737778598926463, "language_loss": 0.79181123, "learning_rate": 3.997027800589829e-06, "loss": 0.81473172, "num_input_tokens_seen": 16726390, "step": 779, "time_per_iteration": 2.611804485321045 }, { "auxiliary_loss_clip": 0.01219629, "auxiliary_loss_mlp": 0.01070238, "balance_loss_clip": 1.05842376, "balance_loss_mlp": 1.04271269, "epoch": 0.04689613708101608, "flos": 25447378757760.0, "grad_norm": 1.888854926622149, "language_loss": 0.77364886, "learning_rate": 3.997006537990308e-06, "loss": 0.79654753, "num_input_tokens_seen": 16748965, "step": 780, "time_per_iteration": 2.668239116668701 }, { "auxiliary_loss_clip": 0.012253, "auxiliary_loss_mlp": 0.01073321, "balance_loss_clip": 1.06098521, "balance_loss_mlp": 1.04605746, "epoch": 0.04695626033368405, "flos": 23001395241600.0, "grad_norm": 1.7616538282563206, "language_loss": 0.76700419, "learning_rate": 3.996985199664099e-06, "loss": 0.78999043, "num_input_tokens_seen": 16768620, "step": 781, "time_per_iteration": 2.5979926586151123 }, { "auxiliary_loss_clip": 0.01236637, "auxiliary_loss_mlp": 0.01077479, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.04836786, "epoch": 0.047016383586352024, "flos": 29133357632640.0, "grad_norm": 3.0946494667490856, "language_loss": 0.73786414, "learning_rate": 3.99696378561201e-06, "loss": 0.76100528, "num_input_tokens_seen": 16789755, "step": 782, "time_per_iteration": 2.708855390548706 }, { "auxiliary_loss_clip": 0.0122968, "auxiliary_loss_mlp": 0.01069368, "balance_loss_clip": 1.06431556, "balance_loss_mlp": 1.04253423, "epoch": 0.04707650683901999, "flos": 14976330451200.0, "grad_norm": 2.1459158015790183, "language_loss": 0.80524659, "learning_rate": 3.996942295834855e-06, "loss": 0.82823706, "num_input_tokens_seen": 16807585, "step": 783, "time_per_iteration": 2.6355738639831543 }, { "auxiliary_loss_clip": 0.01222415, "auxiliary_loss_mlp": 0.01063155, "balance_loss_clip": 1.06221437, "balance_loss_mlp": 1.03663135, "epoch": 0.04713663009168796, "flos": 21651118151040.0, "grad_norm": 1.9084512066318515, "language_loss": 0.81687874, "learning_rate": 3.996920730333448e-06, "loss": 0.83973444, "num_input_tokens_seen": 16827220, "step": 784, "time_per_iteration": 2.64365291595459 }, { "auxiliary_loss_clip": 0.01226632, "auxiliary_loss_mlp": 0.01074549, "balance_loss_clip": 1.0582943, "balance_loss_mlp": 1.04719007, "epoch": 0.04719675334435593, "flos": 21325408600320.0, "grad_norm": 3.970707764370453, "language_loss": 0.80619848, "learning_rate": 3.996899089108607e-06, "loss": 0.82921028, "num_input_tokens_seen": 16846230, "step": 785, "time_per_iteration": 2.682971715927124 }, { "auxiliary_loss_clip": 0.01231621, "auxiliary_loss_mlp": 0.01063774, "balance_loss_clip": 1.06683421, "balance_loss_mlp": 1.03784585, "epoch": 0.0472568765970239, "flos": 17931383470080.0, "grad_norm": 2.074448818096939, "language_loss": 0.89784658, "learning_rate": 3.996877372161152e-06, "loss": 0.92080051, "num_input_tokens_seen": 16865325, "step": 786, "time_per_iteration": 2.6072235107421875 }, { "auxiliary_loss_clip": 0.01227201, "auxiliary_loss_mlp": 0.01069453, "balance_loss_clip": 1.05475712, "balance_loss_mlp": 1.03912568, "epoch": 0.04731699984969187, "flos": 18077324428800.0, "grad_norm": 6.783818284100465, "language_loss": 0.76794451, "learning_rate": 3.9968555794919065e-06, "loss": 0.79091108, "num_input_tokens_seen": 16882930, "step": 787, "time_per_iteration": 2.595069646835327 }, { "auxiliary_loss_clip": 0.01233526, "auxiliary_loss_mlp": 0.01070856, "balance_loss_clip": 1.06563127, "balance_loss_mlp": 1.04248405, "epoch": 0.047377123102359836, "flos": 23185078416000.0, "grad_norm": 2.309745026689568, "language_loss": 0.81301165, "learning_rate": 3.996833711101698e-06, "loss": 0.83605546, "num_input_tokens_seen": 16900710, "step": 788, "time_per_iteration": 2.633812427520752 }, { "auxiliary_loss_clip": 0.01225447, "auxiliary_loss_mlp": 0.01078934, "balance_loss_clip": 1.06370282, "balance_loss_mlp": 1.04934621, "epoch": 0.04743724635502781, "flos": 22747794243840.0, "grad_norm": 2.941245147417381, "language_loss": 0.84428835, "learning_rate": 3.996811766991355e-06, "loss": 0.86733222, "num_input_tokens_seen": 16919210, "step": 789, "time_per_iteration": 2.6711082458496094 }, { "auxiliary_loss_clip": 0.01230866, "auxiliary_loss_mlp": 0.01071483, "balance_loss_clip": 1.06367648, "balance_loss_mlp": 1.0441606, "epoch": 0.04749736960769577, "flos": 17238702620160.0, "grad_norm": 2.0289407228390615, "language_loss": 0.81787878, "learning_rate": 3.996789747161709e-06, "loss": 0.84090227, "num_input_tokens_seen": 16937125, "step": 790, "time_per_iteration": 2.6136717796325684 }, { "auxiliary_loss_clip": 0.01224033, "auxiliary_loss_mlp": 0.01064065, "balance_loss_clip": 1.05880189, "balance_loss_mlp": 1.03546715, "epoch": 0.047557492860363745, "flos": 40479261592320.0, "grad_norm": 2.9735437778568965, "language_loss": 0.88116109, "learning_rate": 3.996767651613597e-06, "loss": 0.90404207, "num_input_tokens_seen": 16958610, "step": 791, "time_per_iteration": 2.747586727142334 }, { "auxiliary_loss_clip": 0.01226267, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06144643, "balance_loss_mlp": 1.03743124, "epoch": 0.04761761611303172, "flos": 18698004466560.0, "grad_norm": 2.1239226540804537, "language_loss": 0.90671498, "learning_rate": 3.996745480347854e-06, "loss": 0.92964232, "num_input_tokens_seen": 16977300, "step": 792, "time_per_iteration": 2.591477870941162 }, { "auxiliary_loss_clip": 0.01226882, "auxiliary_loss_mlp": 0.0107926, "balance_loss_clip": 1.05968022, "balance_loss_mlp": 1.05225897, "epoch": 0.04767773936569968, "flos": 20921987975040.0, "grad_norm": 1.9120988315570397, "language_loss": 0.73246223, "learning_rate": 3.996723233365324e-06, "loss": 0.75552362, "num_input_tokens_seen": 16994950, "step": 793, "time_per_iteration": 2.6319899559020996 }, { "auxiliary_loss_clip": 0.01231301, "auxiliary_loss_mlp": 0.01070716, "balance_loss_clip": 1.06213653, "balance_loss_mlp": 1.04146254, "epoch": 0.047737862618367655, "flos": 23732680233600.0, "grad_norm": 1.86347948201136, "language_loss": 0.86139679, "learning_rate": 3.996700910666847e-06, "loss": 0.88441694, "num_input_tokens_seen": 17014760, "step": 794, "time_per_iteration": 2.6835687160491943 }, { "auxiliary_loss_clip": 0.01228204, "auxiliary_loss_mlp": 0.01077895, "balance_loss_clip": 1.05969596, "balance_loss_mlp": 1.04935622, "epoch": 0.04779798587103562, "flos": 23695764030720.0, "grad_norm": 2.370166301863074, "language_loss": 0.69069195, "learning_rate": 3.996678512253272e-06, "loss": 0.71375293, "num_input_tokens_seen": 17032715, "step": 795, "time_per_iteration": 2.669261932373047 }, { "auxiliary_loss_clip": 0.01225748, "auxiliary_loss_mlp": 0.01076275, "balance_loss_clip": 1.06129098, "balance_loss_mlp": 1.04756904, "epoch": 0.04785810912370359, "flos": 23183641872000.0, "grad_norm": 1.744925212230271, "language_loss": 0.810256, "learning_rate": 3.996656038125449e-06, "loss": 0.83327615, "num_input_tokens_seen": 17052215, "step": 796, "time_per_iteration": 2.5800065994262695 }, { "auxiliary_loss_clip": 0.01228235, "auxiliary_loss_mlp": 0.01065433, "balance_loss_clip": 1.06224668, "balance_loss_mlp": 1.03638172, "epoch": 0.047918232376371564, "flos": 18040623707520.0, "grad_norm": 1.979164246440182, "language_loss": 0.8128069, "learning_rate": 3.996633488284228e-06, "loss": 0.83574355, "num_input_tokens_seen": 17069225, "step": 797, "time_per_iteration": 2.58878493309021 }, { "auxiliary_loss_clip": 0.01100259, "auxiliary_loss_mlp": 0.01007215, "balance_loss_clip": 1.02779806, "balance_loss_mlp": 1.00266171, "epoch": 0.04797835562903953, "flos": 62442588758400.0, "grad_norm": 0.912416075283383, "language_loss": 0.64532876, "learning_rate": 3.996610862730465e-06, "loss": 0.66640353, "num_input_tokens_seen": 17126680, "step": 798, "time_per_iteration": 3.0779380798339844 }, { "auxiliary_loss_clip": 0.01229665, "auxiliary_loss_mlp": 0.01068747, "balance_loss_clip": 1.05799031, "balance_loss_mlp": 1.04121017, "epoch": 0.0480384788817075, "flos": 21507296094720.0, "grad_norm": 2.0206600610723333, "language_loss": 0.91274291, "learning_rate": 3.996588161465018e-06, "loss": 0.935727, "num_input_tokens_seen": 17144835, "step": 799, "time_per_iteration": 2.660438299179077 }, { "auxiliary_loss_clip": 0.01230751, "auxiliary_loss_mlp": 0.010715, "balance_loss_clip": 1.06640434, "balance_loss_mlp": 1.04274678, "epoch": 0.048098602134375466, "flos": 21726710323200.0, "grad_norm": 2.0752654205923866, "language_loss": 0.86825287, "learning_rate": 3.996565384488748e-06, "loss": 0.89127541, "num_input_tokens_seen": 17165030, "step": 800, "time_per_iteration": 2.6700456142425537 }, { "auxiliary_loss_clip": 0.01229893, "auxiliary_loss_mlp": 0.01072058, "balance_loss_clip": 1.06186771, "balance_loss_mlp": 1.04618931, "epoch": 0.04815872538704344, "flos": 22931082368640.0, "grad_norm": 2.5310108886746976, "language_loss": 0.83949852, "learning_rate": 3.996542531802518e-06, "loss": 0.86251807, "num_input_tokens_seen": 17184895, "step": 801, "time_per_iteration": 2.7724695205688477 }, { "auxiliary_loss_clip": 0.01227846, "auxiliary_loss_mlp": 0.010756, "balance_loss_clip": 1.06226814, "balance_loss_mlp": 1.04847932, "epoch": 0.04821884863971141, "flos": 43174716042240.0, "grad_norm": 1.9607091513106172, "language_loss": 0.79818648, "learning_rate": 3.996519603407196e-06, "loss": 0.82122099, "num_input_tokens_seen": 17208225, "step": 802, "time_per_iteration": 2.861309766769409 }, { "auxiliary_loss_clip": 0.0122832, "auxiliary_loss_mlp": 0.01069086, "balance_loss_clip": 1.06392837, "balance_loss_mlp": 1.04278886, "epoch": 0.048278971892379376, "flos": 18620006083200.0, "grad_norm": 1.798745906633195, "language_loss": 0.86600745, "learning_rate": 3.996496599303649e-06, "loss": 0.88898146, "num_input_tokens_seen": 17226305, "step": 803, "time_per_iteration": 2.612684965133667 }, { "auxiliary_loss_clip": 0.01222438, "auxiliary_loss_mlp": 0.01063116, "balance_loss_clip": 1.06214345, "balance_loss_mlp": 1.03643703, "epoch": 0.04833909514504735, "flos": 20230061310720.0, "grad_norm": 5.958214069975319, "language_loss": 0.85139012, "learning_rate": 3.996473519492753e-06, "loss": 0.8742457, "num_input_tokens_seen": 17244545, "step": 804, "time_per_iteration": 2.596965789794922 }, { "auxiliary_loss_clip": 0.01225485, "auxiliary_loss_mlp": 0.0106948, "balance_loss_clip": 1.06206632, "balance_loss_mlp": 1.04222918, "epoch": 0.04839921839771532, "flos": 24645170361600.0, "grad_norm": 1.9492340448514227, "language_loss": 0.85939878, "learning_rate": 3.99645036397538e-06, "loss": 0.88234842, "num_input_tokens_seen": 17265730, "step": 805, "time_per_iteration": 2.6773781776428223 }, { "auxiliary_loss_clip": 0.01221339, "auxiliary_loss_mlp": 0.01071867, "balance_loss_clip": 1.05968738, "balance_loss_mlp": 1.04591477, "epoch": 0.048459341650383285, "flos": 24827452905600.0, "grad_norm": 1.8764849579047527, "language_loss": 0.68025368, "learning_rate": 3.9964271327524085e-06, "loss": 0.70318574, "num_input_tokens_seen": 17284820, "step": 806, "time_per_iteration": 2.6270596981048584 }, { "auxiliary_loss_clip": 0.01221043, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.06064904, "balance_loss_mlp": 1.03384972, "epoch": 0.04851946490305126, "flos": 22163204396160.0, "grad_norm": 8.586680684018, "language_loss": 0.76488906, "learning_rate": 3.9964038258247214e-06, "loss": 0.78770459, "num_input_tokens_seen": 17305085, "step": 807, "time_per_iteration": 2.6783089637756348 }, { "auxiliary_loss_clip": 0.01218859, "auxiliary_loss_mlp": 0.01068871, "balance_loss_clip": 1.05734789, "balance_loss_mlp": 1.04290676, "epoch": 0.04857958815571922, "flos": 19792022952960.0, "grad_norm": 2.4056749627509157, "language_loss": 0.86882269, "learning_rate": 3.9963804431932005e-06, "loss": 0.89170003, "num_input_tokens_seen": 17322715, "step": 808, "time_per_iteration": 2.6447641849517822 }, { "auxiliary_loss_clip": 0.01227529, "auxiliary_loss_mlp": 0.01069446, "balance_loss_clip": 1.06140316, "balance_loss_mlp": 1.0424329, "epoch": 0.048639711408387194, "flos": 18697968552960.0, "grad_norm": 2.6040733531164424, "language_loss": 0.89710444, "learning_rate": 3.996356984858732e-06, "loss": 0.92007422, "num_input_tokens_seen": 17341455, "step": 809, "time_per_iteration": 2.6679790019989014 }, { "auxiliary_loss_clip": 0.01226608, "auxiliary_loss_mlp": 0.01067211, "balance_loss_clip": 1.0643065, "balance_loss_mlp": 1.04060316, "epoch": 0.048699834661055166, "flos": 24863507182080.0, "grad_norm": 3.0721319202916324, "language_loss": 0.84918916, "learning_rate": 3.996333450822208e-06, "loss": 0.87212729, "num_input_tokens_seen": 17360765, "step": 810, "time_per_iteration": 2.696772575378418 }, { "auxiliary_loss_clip": 0.01227202, "auxiliary_loss_mlp": 0.01067343, "balance_loss_clip": 1.0622344, "balance_loss_mlp": 1.04049683, "epoch": 0.04875995791372313, "flos": 20704010290560.0, "grad_norm": 1.8136675943398954, "language_loss": 0.80799425, "learning_rate": 3.99630984108452e-06, "loss": 0.83093977, "num_input_tokens_seen": 17380625, "step": 811, "time_per_iteration": 2.653808355331421 }, { "auxiliary_loss_clip": 0.01217843, "auxiliary_loss_mlp": 0.01070621, "balance_loss_clip": 1.05928314, "balance_loss_mlp": 1.04466903, "epoch": 0.048820081166391104, "flos": 18588297352320.0, "grad_norm": 1.7193599003225197, "language_loss": 0.74634516, "learning_rate": 3.9962861556465615e-06, "loss": 0.76922977, "num_input_tokens_seen": 17399355, "step": 812, "time_per_iteration": 2.7274649143218994 }, { "auxiliary_loss_clip": 0.01222659, "auxiliary_loss_mlp": 0.01073562, "balance_loss_clip": 1.06445217, "balance_loss_mlp": 1.04862356, "epoch": 0.04888020441905907, "flos": 22707322594560.0, "grad_norm": 1.9311665765462733, "language_loss": 0.90124279, "learning_rate": 3.996262394509233e-06, "loss": 0.92420495, "num_input_tokens_seen": 17418240, "step": 813, "time_per_iteration": 2.654874801635742 }, { "auxiliary_loss_clip": 0.0122, "auxiliary_loss_mlp": 0.01057827, "balance_loss_clip": 1.06157589, "balance_loss_mlp": 1.03248262, "epoch": 0.04894032767172704, "flos": 22784351310720.0, "grad_norm": 1.9238840150723209, "language_loss": 0.74904704, "learning_rate": 3.9962385576734335e-06, "loss": 0.77182531, "num_input_tokens_seen": 17436250, "step": 814, "time_per_iteration": 2.7381603717803955 }, { "auxiliary_loss_clip": 0.01223782, "auxiliary_loss_mlp": 0.01069686, "balance_loss_clip": 1.06125045, "balance_loss_mlp": 1.04289961, "epoch": 0.04900045092439501, "flos": 25516147345920.0, "grad_norm": 2.1966001004582596, "language_loss": 0.83816808, "learning_rate": 3.9962146451400675e-06, "loss": 0.86110282, "num_input_tokens_seen": 17455750, "step": 815, "time_per_iteration": 2.7289621829986572 }, { "auxiliary_loss_clip": 0.01227011, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.06326818, "balance_loss_mlp": 1.0344646, "epoch": 0.04906057417706298, "flos": 25958136199680.0, "grad_norm": 2.3329994981275943, "language_loss": 0.90796101, "learning_rate": 3.996190656910043e-06, "loss": 0.93083686, "num_input_tokens_seen": 17474995, "step": 816, "time_per_iteration": 4.174290180206299 }, { "auxiliary_loss_clip": 0.01226278, "auxiliary_loss_mlp": 0.0105651, "balance_loss_clip": 1.06172895, "balance_loss_mlp": 1.03054583, "epoch": 0.04912069742973095, "flos": 18624638937600.0, "grad_norm": 2.2253098946667853, "language_loss": 0.79834002, "learning_rate": 3.996166592984268e-06, "loss": 0.82116789, "num_input_tokens_seen": 17493395, "step": 817, "time_per_iteration": 4.2819907665252686 }, { "auxiliary_loss_clip": 0.01222491, "auxiliary_loss_mlp": 0.01072358, "balance_loss_clip": 1.06228495, "balance_loss_mlp": 1.04563141, "epoch": 0.049180820682398915, "flos": 23699786353920.0, "grad_norm": 1.9292138186207266, "language_loss": 0.8532303, "learning_rate": 3.996142453363656e-06, "loss": 0.8761788, "num_input_tokens_seen": 17514565, "step": 818, "time_per_iteration": 7.687308073043823 }, { "auxiliary_loss_clip": 0.01228571, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06170368, "balance_loss_mlp": 1.0369786, "epoch": 0.04924094393506689, "flos": 22420396753920.0, "grad_norm": 2.1064810754058407, "language_loss": 0.75623614, "learning_rate": 3.996118238049124e-06, "loss": 0.77916616, "num_input_tokens_seen": 17534590, "step": 819, "time_per_iteration": 2.5708072185516357 }, { "auxiliary_loss_clip": 0.01227988, "auxiliary_loss_mlp": 0.010616, "balance_loss_clip": 1.06580663, "balance_loss_mlp": 1.03785336, "epoch": 0.04930106718773486, "flos": 15738246766080.0, "grad_norm": 2.8685299631500487, "language_loss": 0.85082126, "learning_rate": 3.996093947041586e-06, "loss": 0.87371719, "num_input_tokens_seen": 17551900, "step": 820, "time_per_iteration": 2.695204973220825 }, { "auxiliary_loss_clip": 0.01224953, "auxiliary_loss_mlp": 0.01065985, "balance_loss_clip": 1.06082845, "balance_loss_mlp": 1.04037917, "epoch": 0.049361190440402825, "flos": 26250628648320.0, "grad_norm": 1.734636988660555, "language_loss": 0.90459162, "learning_rate": 3.996069580341966e-06, "loss": 0.92750102, "num_input_tokens_seen": 17571485, "step": 821, "time_per_iteration": 2.6284992694854736 }, { "auxiliary_loss_clip": 0.01222526, "auxiliary_loss_mlp": 0.01080357, "balance_loss_clip": 1.06015635, "balance_loss_mlp": 1.05485809, "epoch": 0.0494213136930708, "flos": 21252366293760.0, "grad_norm": 1.7915267676548876, "language_loss": 0.89795959, "learning_rate": 3.996045137951188e-06, "loss": 0.92098844, "num_input_tokens_seen": 17591410, "step": 822, "time_per_iteration": 2.6085855960845947 }, { "auxiliary_loss_clip": 0.0122571, "auxiliary_loss_mlp": 0.01062887, "balance_loss_clip": 1.0639379, "balance_loss_mlp": 1.03472972, "epoch": 0.04948143694573876, "flos": 27965506740480.0, "grad_norm": 2.28747155105076, "language_loss": 0.67558801, "learning_rate": 3.996020619870178e-06, "loss": 0.69847399, "num_input_tokens_seen": 17612010, "step": 823, "time_per_iteration": 2.644277572631836 }, { "auxiliary_loss_clip": 0.01099376, "auxiliary_loss_mlp": 0.0100741, "balance_loss_clip": 1.0267303, "balance_loss_mlp": 1.00266516, "epoch": 0.049541560198406734, "flos": 66180995533440.0, "grad_norm": 1.3456360586087317, "language_loss": 0.62254131, "learning_rate": 3.995996026099866e-06, "loss": 0.64360917, "num_input_tokens_seen": 17673430, "step": 824, "time_per_iteration": 3.230381488800049 }, { "auxiliary_loss_clip": 0.01228758, "auxiliary_loss_mlp": 0.01066541, "balance_loss_clip": 1.06346989, "balance_loss_mlp": 1.03909945, "epoch": 0.049601683451074706, "flos": 22892693708160.0, "grad_norm": 1.8854339538524305, "language_loss": 0.90479428, "learning_rate": 3.995971356641185e-06, "loss": 0.92774737, "num_input_tokens_seen": 17689545, "step": 825, "time_per_iteration": 2.58868670463562 }, { "auxiliary_loss_clip": 0.01227734, "auxiliary_loss_mlp": 0.01066527, "balance_loss_clip": 1.06315517, "balance_loss_mlp": 1.03844118, "epoch": 0.04966180670374267, "flos": 21433643256960.0, "grad_norm": 2.307419213246734, "language_loss": 0.66851091, "learning_rate": 3.9959466114950695e-06, "loss": 0.69145352, "num_input_tokens_seen": 17705965, "step": 826, "time_per_iteration": 2.59468412399292 }, { "auxiliary_loss_clip": 0.01230149, "auxiliary_loss_mlp": 0.01069061, "balance_loss_clip": 1.06421614, "balance_loss_mlp": 1.04216766, "epoch": 0.04972192995641064, "flos": 23107367341440.0, "grad_norm": 1.8316571551414482, "language_loss": 0.78298402, "learning_rate": 3.995921790662459e-06, "loss": 0.80597603, "num_input_tokens_seen": 17724580, "step": 827, "time_per_iteration": 2.7148005962371826 }, { "auxiliary_loss_clip": 0.01230507, "auxiliary_loss_mlp": 0.01079145, "balance_loss_clip": 1.06385946, "balance_loss_mlp": 1.05119085, "epoch": 0.04978205320907861, "flos": 40406147458560.0, "grad_norm": 1.6017511297862308, "language_loss": 0.78696525, "learning_rate": 3.995896894144294e-06, "loss": 0.81006181, "num_input_tokens_seen": 17747755, "step": 828, "time_per_iteration": 2.86991548538208 }, { "auxiliary_loss_clip": 0.0121958, "auxiliary_loss_mlp": 0.01059689, "balance_loss_clip": 1.05939984, "balance_loss_mlp": 1.03390431, "epoch": 0.04984217646174658, "flos": 25228539146880.0, "grad_norm": 2.48577103336206, "language_loss": 0.83530867, "learning_rate": 3.995871921941519e-06, "loss": 0.85810131, "num_input_tokens_seen": 17768550, "step": 829, "time_per_iteration": 2.655895948410034 }, { "auxiliary_loss_clip": 0.01226863, "auxiliary_loss_mlp": 0.01080723, "balance_loss_clip": 1.06109536, "balance_loss_mlp": 1.05068195, "epoch": 0.04990229971441455, "flos": 15959636242560.0, "grad_norm": 2.078538436430036, "language_loss": 0.74857247, "learning_rate": 3.99584687405508e-06, "loss": 0.77164829, "num_input_tokens_seen": 17786080, "step": 830, "time_per_iteration": 2.5820400714874268 }, { "auxiliary_loss_clip": 0.0122584, "auxiliary_loss_mlp": 0.01074077, "balance_loss_clip": 1.06154907, "balance_loss_mlp": 1.04667115, "epoch": 0.04996242296708252, "flos": 18405116968320.0, "grad_norm": 1.8327841960194244, "language_loss": 0.79279459, "learning_rate": 3.995821750485929e-06, "loss": 0.81579381, "num_input_tokens_seen": 17803635, "step": 831, "time_per_iteration": 2.5980231761932373 }, { "auxiliary_loss_clip": 0.01173206, "auxiliary_loss_mlp": 0.01072743, "balance_loss_clip": 1.0542444, "balance_loss_mlp": 1.04725623, "epoch": 0.05002254621975049, "flos": 17858053854720.0, "grad_norm": 3.034319898285603, "language_loss": 0.91497368, "learning_rate": 3.995796551235016e-06, "loss": 0.93743312, "num_input_tokens_seen": 17822190, "step": 832, "time_per_iteration": 2.7498815059661865 }, { "auxiliary_loss_clip": 0.01194428, "auxiliary_loss_mlp": 0.01081719, "balance_loss_clip": 1.05826366, "balance_loss_mlp": 1.05667353, "epoch": 0.050082669472418455, "flos": 45660273367680.0, "grad_norm": 1.887029338258115, "language_loss": 0.83167893, "learning_rate": 3.9957712763032974e-06, "loss": 0.85444039, "num_input_tokens_seen": 17846915, "step": 833, "time_per_iteration": 2.863208770751953 }, { "auxiliary_loss_clip": 0.01199525, "auxiliary_loss_mlp": 0.01061962, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.03468657, "epoch": 0.05014279272508643, "flos": 37962067363200.0, "grad_norm": 2.8753922020214033, "language_loss": 0.82409853, "learning_rate": 3.995745925691733e-06, "loss": 0.84671336, "num_input_tokens_seen": 17867270, "step": 834, "time_per_iteration": 2.7868030071258545 }, { "auxiliary_loss_clip": 0.01216246, "auxiliary_loss_mlp": 0.01064427, "balance_loss_clip": 1.06272483, "balance_loss_mlp": 1.03672278, "epoch": 0.0502029159777544, "flos": 20996179516800.0, "grad_norm": 2.2306487397141646, "language_loss": 0.92186153, "learning_rate": 3.995720499401282e-06, "loss": 0.94466823, "num_input_tokens_seen": 17884880, "step": 835, "time_per_iteration": 2.6224496364593506 }, { "auxiliary_loss_clip": 0.01229494, "auxiliary_loss_mlp": 0.01074922, "balance_loss_clip": 1.06143415, "balance_loss_mlp": 1.0464313, "epoch": 0.050263039230422364, "flos": 15888066393600.0, "grad_norm": 2.196832783808158, "language_loss": 0.76143622, "learning_rate": 3.995694997432911e-06, "loss": 0.78448039, "num_input_tokens_seen": 17903695, "step": 836, "time_per_iteration": 2.5648462772369385 }, { "auxiliary_loss_clip": 0.01211162, "auxiliary_loss_mlp": 0.01075977, "balance_loss_clip": 1.06259084, "balance_loss_mlp": 1.04992962, "epoch": 0.050323162483090336, "flos": 23732752060800.0, "grad_norm": 2.100773352560791, "language_loss": 0.83627856, "learning_rate": 3.9956694197875855e-06, "loss": 0.85914999, "num_input_tokens_seen": 17920745, "step": 837, "time_per_iteration": 2.7420156002044678 }, { "auxiliary_loss_clip": 0.01198815, "auxiliary_loss_mlp": 0.0078439, "balance_loss_clip": 1.06345344, "balance_loss_mlp": 1.00053763, "epoch": 0.0503832857357583, "flos": 20266223328000.0, "grad_norm": 2.1353335821274477, "language_loss": 0.72857559, "learning_rate": 3.995643766466275e-06, "loss": 0.7484076, "num_input_tokens_seen": 17938220, "step": 838, "time_per_iteration": 2.679177761077881 }, { "auxiliary_loss_clip": 0.01189223, "auxiliary_loss_mlp": 0.01071526, "balance_loss_clip": 1.05415273, "balance_loss_mlp": 1.04510927, "epoch": 0.05044340898842627, "flos": 17785011548160.0, "grad_norm": 1.8138261016039334, "language_loss": 0.83462799, "learning_rate": 3.995618037469953e-06, "loss": 0.85723549, "num_input_tokens_seen": 17957325, "step": 839, "time_per_iteration": 2.69063663482666 }, { "auxiliary_loss_clip": 0.01220356, "auxiliary_loss_mlp": 0.01069331, "balance_loss_clip": 1.05991399, "balance_loss_mlp": 1.04411805, "epoch": 0.050503532241094246, "flos": 22966526113920.0, "grad_norm": 1.7513762525269907, "language_loss": 0.85775483, "learning_rate": 3.995592232799595e-06, "loss": 0.88065171, "num_input_tokens_seen": 17975875, "step": 840, "time_per_iteration": 2.6477303504943848 }, { "auxiliary_loss_clip": 0.01192112, "auxiliary_loss_mlp": 0.01064377, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.036291, "epoch": 0.05056365549376221, "flos": 22776989022720.0, "grad_norm": 1.7956760046069329, "language_loss": 0.9457823, "learning_rate": 3.99556635245618e-06, "loss": 0.96834719, "num_input_tokens_seen": 17994340, "step": 841, "time_per_iteration": 2.8354220390319824 }, { "auxiliary_loss_clip": 0.0122473, "auxiliary_loss_mlp": 0.01070125, "balance_loss_clip": 1.06219172, "balance_loss_mlp": 1.04329097, "epoch": 0.05062377874643018, "flos": 30916968399360.0, "grad_norm": 2.3106044659054104, "language_loss": 0.77566791, "learning_rate": 3.995540396440688e-06, "loss": 0.79861641, "num_input_tokens_seen": 18015260, "step": 842, "time_per_iteration": 2.6909749507904053 }, { "auxiliary_loss_clip": 0.01214637, "auxiliary_loss_mlp": 0.01071033, "balance_loss_clip": 1.06270838, "balance_loss_mlp": 1.04391265, "epoch": 0.05068390199909815, "flos": 19647159402240.0, "grad_norm": 2.8849837971101864, "language_loss": 0.78126526, "learning_rate": 3.995514364754105e-06, "loss": 0.80412203, "num_input_tokens_seen": 18033960, "step": 843, "time_per_iteration": 2.6534156799316406 }, { "auxiliary_loss_clip": 0.01212948, "auxiliary_loss_mlp": 0.01063612, "balance_loss_clip": 1.06317043, "balance_loss_mlp": 1.03894806, "epoch": 0.05074402525176612, "flos": 37962103276800.0, "grad_norm": 1.9320015451631862, "language_loss": 0.83256191, "learning_rate": 3.995488257397417e-06, "loss": 0.85532749, "num_input_tokens_seen": 18056700, "step": 844, "time_per_iteration": 2.7682149410247803 }, { "auxiliary_loss_clip": 0.01216308, "auxiliary_loss_mlp": 0.01067162, "balance_loss_clip": 1.06307864, "balance_loss_mlp": 1.04138875, "epoch": 0.05080414850443409, "flos": 22054610603520.0, "grad_norm": 2.113957107027846, "language_loss": 0.77108061, "learning_rate": 3.995462074371614e-06, "loss": 0.79391527, "num_input_tokens_seen": 18075815, "step": 845, "time_per_iteration": 2.6720399856567383 }, { "auxiliary_loss_clip": 0.01206643, "auxiliary_loss_mlp": 0.01065522, "balance_loss_clip": 1.05881417, "balance_loss_mlp": 1.03885484, "epoch": 0.05086427175710206, "flos": 20225787592320.0, "grad_norm": 1.8497392628450484, "language_loss": 0.87773871, "learning_rate": 3.99543581567769e-06, "loss": 0.90046036, "num_input_tokens_seen": 18095095, "step": 846, "time_per_iteration": 2.696049690246582 }, { "auxiliary_loss_clip": 0.01206291, "auxiliary_loss_mlp": 0.01069231, "balance_loss_clip": 1.06204462, "balance_loss_mlp": 1.04330277, "epoch": 0.05092439500977003, "flos": 15159223526400.0, "grad_norm": 1.695550491545423, "language_loss": 0.87364423, "learning_rate": 3.9954094813166394e-06, "loss": 0.89639944, "num_input_tokens_seen": 18112675, "step": 847, "time_per_iteration": 2.666907548904419 }, { "auxiliary_loss_clip": 0.01175052, "auxiliary_loss_mlp": 0.01071976, "balance_loss_clip": 1.06267309, "balance_loss_mlp": 1.0447005, "epoch": 0.050984518262437994, "flos": 22055149307520.0, "grad_norm": 2.5687168450386637, "language_loss": 0.81878662, "learning_rate": 3.995383071289462e-06, "loss": 0.84125686, "num_input_tokens_seen": 18130745, "step": 848, "time_per_iteration": 2.782135486602783 }, { "auxiliary_loss_clip": 0.0122638, "auxiliary_loss_mlp": 0.01071388, "balance_loss_clip": 1.06619906, "balance_loss_mlp": 1.04544854, "epoch": 0.05104464151510597, "flos": 30225329043840.0, "grad_norm": 1.678404869397893, "language_loss": 0.87187904, "learning_rate": 3.995356585597158e-06, "loss": 0.89485669, "num_input_tokens_seen": 18152410, "step": 849, "time_per_iteration": 2.787992000579834 }, { "auxiliary_loss_clip": 0.01220251, "auxiliary_loss_mlp": 0.0106131, "balance_loss_clip": 1.06049275, "balance_loss_mlp": 1.03545308, "epoch": 0.05110476476777394, "flos": 18332900674560.0, "grad_norm": 2.125711462362114, "language_loss": 0.8315587, "learning_rate": 3.995330024240732e-06, "loss": 0.85437429, "num_input_tokens_seen": 18170870, "step": 850, "time_per_iteration": 2.6548752784729004 }, { "auxiliary_loss_clip": 0.01210598, "auxiliary_loss_mlp": 0.01063491, "balance_loss_clip": 1.06061506, "balance_loss_mlp": 1.0379566, "epoch": 0.051164888020441904, "flos": 37998732170880.0, "grad_norm": 2.2115645013354253, "language_loss": 0.65423882, "learning_rate": 3.995303387221192e-06, "loss": 0.67697972, "num_input_tokens_seen": 18191555, "step": 851, "time_per_iteration": 2.817197322845459 }, { "auxiliary_loss_clip": 0.0120566, "auxiliary_loss_mlp": 0.01075745, "balance_loss_clip": 1.05822444, "balance_loss_mlp": 1.04761147, "epoch": 0.051225011273109876, "flos": 23038634666880.0, "grad_norm": 2.3720786299251073, "language_loss": 0.83587611, "learning_rate": 3.995276674539547e-06, "loss": 0.8586902, "num_input_tokens_seen": 18208620, "step": 852, "time_per_iteration": 2.685727119445801 }, { "auxiliary_loss_clip": 0.01193575, "auxiliary_loss_mlp": 0.01074152, "balance_loss_clip": 1.05924761, "balance_loss_mlp": 1.04737723, "epoch": 0.05128513452577785, "flos": 18259822454400.0, "grad_norm": 2.1832763559951234, "language_loss": 0.80761266, "learning_rate": 3.995249886196811e-06, "loss": 0.8302899, "num_input_tokens_seen": 18226370, "step": 853, "time_per_iteration": 2.6078240871429443 }, { "auxiliary_loss_clip": 0.01222394, "auxiliary_loss_mlp": 0.01065268, "balance_loss_clip": 1.06223083, "balance_loss_mlp": 1.03780222, "epoch": 0.05134525777844581, "flos": 27198957571200.0, "grad_norm": 1.8511550328562763, "language_loss": 0.75617325, "learning_rate": 3.995223022193999e-06, "loss": 0.77904987, "num_input_tokens_seen": 18247075, "step": 854, "time_per_iteration": 2.633543014526367 }, { "auxiliary_loss_clip": 0.01202415, "auxiliary_loss_mlp": 0.01065973, "balance_loss_clip": 1.06141627, "balance_loss_mlp": 1.03828049, "epoch": 0.051405381031113785, "flos": 28362247436160.0, "grad_norm": 2.04057054323539, "language_loss": 0.81722355, "learning_rate": 3.99519608253213e-06, "loss": 0.83990741, "num_input_tokens_seen": 18265680, "step": 855, "time_per_iteration": 2.760880708694458 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.00762392, "balance_loss_clip": 1.0358243, "balance_loss_mlp": 1.00074518, "epoch": 0.05146550428378175, "flos": 65618169327360.0, "grad_norm": 0.9894594919315515, "language_loss": 0.65634769, "learning_rate": 3.995169067212227e-06, "loss": 0.67473871, "num_input_tokens_seen": 18327015, "step": 856, "time_per_iteration": 6.271182298660278 }, { "auxiliary_loss_clip": 0.01194232, "auxiliary_loss_mlp": 0.01056626, "balance_loss_clip": 1.05972147, "balance_loss_mlp": 1.02994716, "epoch": 0.05152562753644972, "flos": 22054861998720.0, "grad_norm": 1.8001295724347575, "language_loss": 0.77139348, "learning_rate": 3.9951419762353116e-06, "loss": 0.79390204, "num_input_tokens_seen": 18345235, "step": 857, "time_per_iteration": 4.905239582061768 }, { "auxiliary_loss_clip": 0.01183581, "auxiliary_loss_mlp": 0.01059685, "balance_loss_clip": 1.05640614, "balance_loss_mlp": 1.03291047, "epoch": 0.051585750789117694, "flos": 18509544783360.0, "grad_norm": 2.111656321737554, "language_loss": 0.89194518, "learning_rate": 3.995114809602412e-06, "loss": 0.91437781, "num_input_tokens_seen": 18362350, "step": 858, "time_per_iteration": 2.7349045276641846 }, { "auxiliary_loss_clip": 0.01196113, "auxiliary_loss_mlp": 0.01060739, "balance_loss_clip": 1.06114125, "balance_loss_mlp": 1.03398848, "epoch": 0.05164587404178566, "flos": 23730238108800.0, "grad_norm": 2.030377637624243, "language_loss": 0.75684321, "learning_rate": 3.9950875673145605e-06, "loss": 0.77941179, "num_input_tokens_seen": 18383390, "step": 859, "time_per_iteration": 2.7611751556396484 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.0107269, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04354358, "epoch": 0.05170599729445363, "flos": 16252882876800.0, "grad_norm": 2.134655488493178, "language_loss": 0.91122925, "learning_rate": 3.995060249372788e-06, "loss": 0.93381929, "num_input_tokens_seen": 18399220, "step": 860, "time_per_iteration": 2.666740894317627 }, { "auxiliary_loss_clip": 0.0122488, "auxiliary_loss_mlp": 0.01060586, "balance_loss_clip": 1.06531346, "balance_loss_mlp": 1.03536153, "epoch": 0.0517661205471216, "flos": 23985922095360.0, "grad_norm": 1.7954568874114027, "language_loss": 0.82378531, "learning_rate": 3.99503285577813e-06, "loss": 0.84663993, "num_input_tokens_seen": 18419005, "step": 861, "time_per_iteration": 2.6337814331054688 }, { "auxiliary_loss_clip": 0.01198486, "auxiliary_loss_mlp": 0.01060236, "balance_loss_clip": 1.06147969, "balance_loss_mlp": 1.03437924, "epoch": 0.05182624379978957, "flos": 29277718392960.0, "grad_norm": 2.5785699637959776, "language_loss": 0.78664875, "learning_rate": 3.995005386531627e-06, "loss": 0.80923599, "num_input_tokens_seen": 18440550, "step": 862, "time_per_iteration": 2.7570109367370605 }, { "auxiliary_loss_clip": 0.01189664, "auxiliary_loss_mlp": 0.01070327, "balance_loss_clip": 1.058797, "balance_loss_mlp": 1.04547238, "epoch": 0.05188636705245754, "flos": 24170826332160.0, "grad_norm": 1.7880881456146414, "language_loss": 0.89090264, "learning_rate": 3.9949778416343195e-06, "loss": 0.91350257, "num_input_tokens_seen": 18461950, "step": 863, "time_per_iteration": 2.7118866443634033 }, { "auxiliary_loss_clip": 0.01201772, "auxiliary_loss_mlp": 0.01064316, "balance_loss_clip": 1.06488204, "balance_loss_mlp": 1.0369451, "epoch": 0.051946490305125506, "flos": 26760703731840.0, "grad_norm": 2.081656150811602, "language_loss": 0.76119763, "learning_rate": 3.9949502210872525e-06, "loss": 0.78385854, "num_input_tokens_seen": 18480555, "step": 864, "time_per_iteration": 2.6946637630462646 }, { "auxiliary_loss_clip": 0.01186585, "auxiliary_loss_mlp": 0.01067959, "balance_loss_clip": 1.05559874, "balance_loss_mlp": 1.04046965, "epoch": 0.05200661355779348, "flos": 21502519585920.0, "grad_norm": 1.9374308734697678, "language_loss": 0.7908361, "learning_rate": 3.994922524891474e-06, "loss": 0.81338149, "num_input_tokens_seen": 18499645, "step": 865, "time_per_iteration": 2.7700579166412354 }, { "auxiliary_loss_clip": 0.01210067, "auxiliary_loss_mlp": 0.01067568, "balance_loss_clip": 1.06164694, "balance_loss_mlp": 1.04152083, "epoch": 0.05206673681046144, "flos": 18114492026880.0, "grad_norm": 2.269489500676155, "language_loss": 0.85860598, "learning_rate": 3.994894753048032e-06, "loss": 0.88138229, "num_input_tokens_seen": 18516810, "step": 866, "time_per_iteration": 2.659614086151123 }, { "auxiliary_loss_clip": 0.01186536, "auxiliary_loss_mlp": 0.01070465, "balance_loss_clip": 1.06327558, "balance_loss_mlp": 1.04371393, "epoch": 0.052126860063129415, "flos": 17524191916800.0, "grad_norm": 2.1733876112564565, "language_loss": 0.87495244, "learning_rate": 3.9948669055579815e-06, "loss": 0.89752245, "num_input_tokens_seen": 18532510, "step": 867, "time_per_iteration": 2.740238904953003 }, { "auxiliary_loss_clip": 0.01167585, "auxiliary_loss_mlp": 0.01078445, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.05437636, "epoch": 0.05218698331579739, "flos": 32598054771840.0, "grad_norm": 1.8498678854952728, "language_loss": 0.63917863, "learning_rate": 3.9948389824223785e-06, "loss": 0.66163892, "num_input_tokens_seen": 18557380, "step": 868, "time_per_iteration": 2.9310383796691895 }, { "auxiliary_loss_clip": 0.01225135, "auxiliary_loss_mlp": 0.01069894, "balance_loss_clip": 1.06287289, "balance_loss_mlp": 1.04173636, "epoch": 0.05224710656846535, "flos": 22127293774080.0, "grad_norm": 2.742912036955754, "language_loss": 0.83379138, "learning_rate": 3.994810983642281e-06, "loss": 0.85674161, "num_input_tokens_seen": 18575720, "step": 869, "time_per_iteration": 2.6453137397766113 }, { "auxiliary_loss_clip": 0.01216406, "auxiliary_loss_mlp": 0.01056401, "balance_loss_clip": 1.0645746, "balance_loss_mlp": 1.03053236, "epoch": 0.052307229821133325, "flos": 11145092976000.0, "grad_norm": 2.188953802542244, "language_loss": 0.87822217, "learning_rate": 3.994782909218751e-06, "loss": 0.90095031, "num_input_tokens_seen": 18592185, "step": 870, "time_per_iteration": 2.7044875621795654 }, { "auxiliary_loss_clip": 0.01226316, "auxiliary_loss_mlp": 0.01064746, "balance_loss_clip": 1.06603277, "balance_loss_mlp": 1.03965199, "epoch": 0.05236735307380129, "flos": 19128070005120.0, "grad_norm": 1.975067156516721, "language_loss": 0.80651748, "learning_rate": 3.994754759152854e-06, "loss": 0.82942802, "num_input_tokens_seen": 18609560, "step": 871, "time_per_iteration": 2.6892175674438477 }, { "auxiliary_loss_clip": 0.0119502, "auxiliary_loss_mlp": 0.01064309, "balance_loss_clip": 1.0650804, "balance_loss_mlp": 1.0396452, "epoch": 0.05242747632646926, "flos": 20960663944320.0, "grad_norm": 1.7402390708810018, "language_loss": 0.81330585, "learning_rate": 3.994726533445656e-06, "loss": 0.83589917, "num_input_tokens_seen": 18629405, "step": 872, "time_per_iteration": 2.8044185638427734 }, { "auxiliary_loss_clip": 0.0107835, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.03168392, "balance_loss_mlp": 1.01515913, "epoch": 0.052487599579137234, "flos": 65020542842880.0, "grad_norm": 0.883483589670371, "language_loss": 0.61589074, "learning_rate": 3.9946982320982274e-06, "loss": 0.63688087, "num_input_tokens_seen": 18681480, "step": 873, "time_per_iteration": 3.1711297035217285 }, { "auxiliary_loss_clip": 0.01197438, "auxiliary_loss_mlp": 0.01056818, "balance_loss_clip": 1.06202292, "balance_loss_mlp": 1.03120041, "epoch": 0.0525477228318052, "flos": 23288859786240.0, "grad_norm": 2.1995328011281488, "language_loss": 0.88965189, "learning_rate": 3.994669855111643e-06, "loss": 0.91219449, "num_input_tokens_seen": 18700390, "step": 874, "time_per_iteration": 2.8240153789520264 }, { "auxiliary_loss_clip": 0.01197247, "auxiliary_loss_mlp": 0.01063458, "balance_loss_clip": 1.0614326, "balance_loss_mlp": 1.03682709, "epoch": 0.05260784608447317, "flos": 32230221546240.0, "grad_norm": 1.858649685360537, "language_loss": 0.74537963, "learning_rate": 3.994641402486977e-06, "loss": 0.76798666, "num_input_tokens_seen": 18721280, "step": 875, "time_per_iteration": 2.9111931324005127 }, { "auxiliary_loss_clip": 0.01206205, "auxiliary_loss_mlp": 0.01058912, "balance_loss_clip": 1.06306934, "balance_loss_mlp": 1.03210175, "epoch": 0.052667969337141136, "flos": 24463211040000.0, "grad_norm": 1.7697857141051123, "language_loss": 0.92843151, "learning_rate": 3.99461287422531e-06, "loss": 0.95108265, "num_input_tokens_seen": 18741545, "step": 876, "time_per_iteration": 2.800252676010132 }, { "auxiliary_loss_clip": 0.01100151, "auxiliary_loss_mlp": 0.01006341, "balance_loss_clip": 1.02669787, "balance_loss_mlp": 1.0020256, "epoch": 0.05272809258980911, "flos": 57784329567360.0, "grad_norm": 0.8383495859932864, "language_loss": 0.62929404, "learning_rate": 3.994584270327722e-06, "loss": 0.65035897, "num_input_tokens_seen": 18801400, "step": 877, "time_per_iteration": 3.2090368270874023 }, { "auxiliary_loss_clip": 0.01200578, "auxiliary_loss_mlp": 0.0106702, "balance_loss_clip": 1.06150424, "balance_loss_mlp": 1.03931606, "epoch": 0.05278821584247708, "flos": 17420805596160.0, "grad_norm": 2.042786693643985, "language_loss": 0.85383844, "learning_rate": 3.994555590795299e-06, "loss": 0.87651443, "num_input_tokens_seen": 18819670, "step": 878, "time_per_iteration": 2.823835849761963 }, { "auxiliary_loss_clip": 0.0122514, "auxiliary_loss_mlp": 0.01061117, "balance_loss_clip": 1.0635035, "balance_loss_mlp": 1.03551078, "epoch": 0.052848339095145046, "flos": 26137258346880.0, "grad_norm": 1.7462717669338121, "language_loss": 0.83076209, "learning_rate": 3.9945268356291275e-06, "loss": 0.8536247, "num_input_tokens_seen": 18840580, "step": 879, "time_per_iteration": 2.743673086166382 }, { "auxiliary_loss_clip": 0.0119139, "auxiliary_loss_mlp": 0.01066471, "balance_loss_clip": 1.06152987, "balance_loss_mlp": 1.04013824, "epoch": 0.05290846234781302, "flos": 16472081623680.0, "grad_norm": 1.9601789563010765, "language_loss": 0.84284604, "learning_rate": 3.9944980048302985e-06, "loss": 0.86542469, "num_input_tokens_seen": 18859295, "step": 880, "time_per_iteration": 2.7560529708862305 }, { "auxiliary_loss_clip": 0.01184956, "auxiliary_loss_mlp": 0.01065063, "balance_loss_clip": 1.05969453, "balance_loss_mlp": 1.03887296, "epoch": 0.05296858560048098, "flos": 19865173000320.0, "grad_norm": 2.4477328752698564, "language_loss": 0.86870736, "learning_rate": 3.994469098399906e-06, "loss": 0.89120758, "num_input_tokens_seen": 18877485, "step": 881, "time_per_iteration": 2.855395555496216 }, { "auxiliary_loss_clip": 0.01207858, "auxiliary_loss_mlp": 0.01070235, "balance_loss_clip": 1.05984437, "balance_loss_mlp": 1.04238808, "epoch": 0.053028708853148955, "flos": 24388588535040.0, "grad_norm": 1.7611192020675561, "language_loss": 0.87967896, "learning_rate": 3.994440116339046e-06, "loss": 0.90245986, "num_input_tokens_seen": 18898275, "step": 882, "time_per_iteration": 2.8480119705200195 }, { "auxiliary_loss_clip": 0.01224906, "auxiliary_loss_mlp": 0.01057944, "balance_loss_clip": 1.06268644, "balance_loss_mlp": 1.03059733, "epoch": 0.05308883210581693, "flos": 36393166143360.0, "grad_norm": 2.3555018967788635, "language_loss": 0.69469339, "learning_rate": 3.994411058648816e-06, "loss": 0.71752191, "num_input_tokens_seen": 18920665, "step": 883, "time_per_iteration": 2.8808236122131348 }, { "auxiliary_loss_clip": 0.01166777, "auxiliary_loss_mlp": 0.01063991, "balance_loss_clip": 1.05333591, "balance_loss_mlp": 1.03855157, "epoch": 0.05314895535848489, "flos": 22855095146880.0, "grad_norm": 2.039016812023355, "language_loss": 0.76100993, "learning_rate": 3.994381925330319e-06, "loss": 0.78331757, "num_input_tokens_seen": 18939835, "step": 884, "time_per_iteration": 2.8462212085723877 }, { "auxiliary_loss_clip": 0.01172569, "auxiliary_loss_mlp": 0.01066856, "balance_loss_clip": 1.06269383, "balance_loss_mlp": 1.04147613, "epoch": 0.053209078611152864, "flos": 12860330204160.0, "grad_norm": 1.9865896222141148, "language_loss": 0.86195529, "learning_rate": 3.994352716384659e-06, "loss": 0.88434947, "num_input_tokens_seen": 18958405, "step": 885, "time_per_iteration": 2.7825753688812256 }, { "auxiliary_loss_clip": 0.0118405, "auxiliary_loss_mlp": 0.01068976, "balance_loss_clip": 1.05229151, "balance_loss_mlp": 1.04203486, "epoch": 0.05326920186382083, "flos": 12164596698240.0, "grad_norm": 2.608647457747672, "language_loss": 0.85971159, "learning_rate": 3.994323431812945e-06, "loss": 0.88224185, "num_input_tokens_seen": 18975445, "step": 886, "time_per_iteration": 2.7393639087677 }, { "auxiliary_loss_clip": 0.0117343, "auxiliary_loss_mlp": 0.01065966, "balance_loss_clip": 1.05620933, "balance_loss_mlp": 1.03879774, "epoch": 0.0533293251164888, "flos": 22704485420160.0, "grad_norm": 2.040002880698432, "language_loss": 0.8961553, "learning_rate": 3.994294071616286e-06, "loss": 0.91854936, "num_input_tokens_seen": 18991930, "step": 887, "time_per_iteration": 2.8606581687927246 }, { "auxiliary_loss_clip": 0.01144444, "auxiliary_loss_mlp": 0.01072438, "balance_loss_clip": 1.04453194, "balance_loss_mlp": 1.04411352, "epoch": 0.053389448369156774, "flos": 26940939200640.0, "grad_norm": 2.062562868466936, "language_loss": 0.74852538, "learning_rate": 3.994264635795796e-06, "loss": 0.77069414, "num_input_tokens_seen": 19009790, "step": 888, "time_per_iteration": 2.8675312995910645 }, { "auxiliary_loss_clip": 0.01164085, "auxiliary_loss_mlp": 0.01072324, "balance_loss_clip": 1.05659473, "balance_loss_mlp": 1.04525173, "epoch": 0.05344957162182474, "flos": 25556331686400.0, "grad_norm": 1.7884280759117637, "language_loss": 0.88440782, "learning_rate": 3.994235124352592e-06, "loss": 0.9067719, "num_input_tokens_seen": 19030170, "step": 889, "time_per_iteration": 2.9419636726379395 }, { "auxiliary_loss_clip": 0.0121577, "auxiliary_loss_mlp": 0.0105125, "balance_loss_clip": 1.06085157, "balance_loss_mlp": 1.02607334, "epoch": 0.05350969487449271, "flos": 19719591177600.0, "grad_norm": 1.9333059575084248, "language_loss": 0.88386381, "learning_rate": 3.994205537287791e-06, "loss": 0.90653402, "num_input_tokens_seen": 19048075, "step": 890, "time_per_iteration": 2.7030327320098877 }, { "auxiliary_loss_clip": 0.01195034, "auxiliary_loss_mlp": 0.01069003, "balance_loss_clip": 1.05835462, "balance_loss_mlp": 1.04450595, "epoch": 0.053569818127160676, "flos": 27016351804800.0, "grad_norm": 2.435204176890571, "language_loss": 0.93450797, "learning_rate": 3.994175874602517e-06, "loss": 0.95714831, "num_input_tokens_seen": 19067465, "step": 891, "time_per_iteration": 2.81527042388916 }, { "auxiliary_loss_clip": 0.01190797, "auxiliary_loss_mlp": 0.01066955, "balance_loss_clip": 1.05605483, "balance_loss_mlp": 1.03909576, "epoch": 0.05362994137982865, "flos": 13188338225280.0, "grad_norm": 2.3400199158693087, "language_loss": 0.71625131, "learning_rate": 3.994146136297893e-06, "loss": 0.73882878, "num_input_tokens_seen": 19085505, "step": 892, "time_per_iteration": 2.825984239578247 }, { "auxiliary_loss_clip": 0.01191313, "auxiliary_loss_mlp": 0.0078394, "balance_loss_clip": 1.05727172, "balance_loss_mlp": 1.00024366, "epoch": 0.05369006463249662, "flos": 28658008022400.0, "grad_norm": 1.6058100223173828, "language_loss": 0.82331586, "learning_rate": 3.994116322375049e-06, "loss": 0.84306836, "num_input_tokens_seen": 19104360, "step": 893, "time_per_iteration": 2.8618266582489014 }, { "auxiliary_loss_clip": 0.01192677, "auxiliary_loss_mlp": 0.01063531, "balance_loss_clip": 1.0572021, "balance_loss_mlp": 1.03850877, "epoch": 0.053750187885164585, "flos": 28913153304960.0, "grad_norm": 2.0228714136718122, "language_loss": 0.82052565, "learning_rate": 3.994086432835114e-06, "loss": 0.84308773, "num_input_tokens_seen": 19124680, "step": 894, "time_per_iteration": 2.8347885608673096 }, { "auxiliary_loss_clip": 0.0120111, "auxiliary_loss_mlp": 0.01065233, "balance_loss_clip": 1.0570271, "balance_loss_mlp": 1.03997254, "epoch": 0.05381031113783256, "flos": 15158828476800.0, "grad_norm": 2.260594705980758, "language_loss": 0.76133072, "learning_rate": 3.994056467679221e-06, "loss": 0.78399414, "num_input_tokens_seen": 19142895, "step": 895, "time_per_iteration": 2.7288858890533447 }, { "auxiliary_loss_clip": 0.01200143, "auxiliary_loss_mlp": 0.01060588, "balance_loss_clip": 1.06422663, "balance_loss_mlp": 1.03547084, "epoch": 0.05387043439050053, "flos": 21835232288640.0, "grad_norm": 2.0450623179174974, "language_loss": 0.86767507, "learning_rate": 3.9940264269085065e-06, "loss": 0.89028239, "num_input_tokens_seen": 19163125, "step": 896, "time_per_iteration": 4.404265642166138 }, { "auxiliary_loss_clip": 0.0122203, "auxiliary_loss_mlp": 0.00782931, "balance_loss_clip": 1.06062579, "balance_loss_mlp": 1.0002867, "epoch": 0.053930557643168495, "flos": 17310308382720.0, "grad_norm": 3.0866230440609805, "language_loss": 0.8797363, "learning_rate": 3.9939963105241115e-06, "loss": 0.89978594, "num_input_tokens_seen": 19179385, "step": 897, "time_per_iteration": 4.843130588531494 }, { "auxiliary_loss_clip": 0.01201639, "auxiliary_loss_mlp": 0.01063724, "balance_loss_clip": 1.05896854, "balance_loss_mlp": 1.03658032, "epoch": 0.05399068089583647, "flos": 17348481561600.0, "grad_norm": 1.8270040910241792, "language_loss": 0.90170419, "learning_rate": 3.993966118527175e-06, "loss": 0.92435783, "num_input_tokens_seen": 19198725, "step": 898, "time_per_iteration": 2.695235252380371 }, { "auxiliary_loss_clip": 0.01200189, "auxiliary_loss_mlp": 0.01076438, "balance_loss_clip": 1.05787873, "balance_loss_mlp": 1.05105805, "epoch": 0.05405080414850443, "flos": 17486952491520.0, "grad_norm": 2.793625116693953, "language_loss": 0.91544139, "learning_rate": 3.993935850918845e-06, "loss": 0.93820769, "num_input_tokens_seen": 19212380, "step": 899, "time_per_iteration": 2.7509548664093018 }, { "auxiliary_loss_clip": 0.01186479, "auxiliary_loss_mlp": 0.01068594, "balance_loss_clip": 1.05614042, "balance_loss_mlp": 1.04154527, "epoch": 0.054110927401172404, "flos": 24496787278080.0, "grad_norm": 1.983572968760697, "language_loss": 0.75742769, "learning_rate": 3.9939055077002665e-06, "loss": 0.77997845, "num_input_tokens_seen": 19232235, "step": 900, "time_per_iteration": 2.771371364593506 }, { "auxiliary_loss_clip": 0.01211506, "auxiliary_loss_mlp": 0.01058176, "balance_loss_clip": 1.05839145, "balance_loss_mlp": 1.03401244, "epoch": 0.054171050653840376, "flos": 22930040874240.0, "grad_norm": 2.192527627735503, "language_loss": 0.74331856, "learning_rate": 3.993875088872592e-06, "loss": 0.76601535, "num_input_tokens_seen": 19251460, "step": 901, "time_per_iteration": 2.859912157058716 }, { "auxiliary_loss_clip": 0.01177502, "auxiliary_loss_mlp": 0.01065445, "balance_loss_clip": 1.0569309, "balance_loss_mlp": 1.04166329, "epoch": 0.05423117390650834, "flos": 12933192942720.0, "grad_norm": 2.352700712836257, "language_loss": 0.85287452, "learning_rate": 3.9938445944369745e-06, "loss": 0.87530404, "num_input_tokens_seen": 19269060, "step": 902, "time_per_iteration": 2.7940642833709717 }, { "auxiliary_loss_clip": 0.01161069, "auxiliary_loss_mlp": 0.01066664, "balance_loss_clip": 1.04903233, "balance_loss_mlp": 1.04112983, "epoch": 0.05429129715917631, "flos": 19901335017600.0, "grad_norm": 1.9620711230312637, "language_loss": 0.86385572, "learning_rate": 3.993814024394569e-06, "loss": 0.88613302, "num_input_tokens_seen": 19288620, "step": 903, "time_per_iteration": 2.9258980751037598 }, { "auxiliary_loss_clip": 0.0121005, "auxiliary_loss_mlp": 0.01059616, "balance_loss_clip": 1.06094384, "balance_loss_mlp": 1.03534508, "epoch": 0.05435142041184428, "flos": 16908611610240.0, "grad_norm": 2.175127974944855, "language_loss": 0.74927866, "learning_rate": 3.993783378746537e-06, "loss": 0.7719754, "num_input_tokens_seen": 19306615, "step": 904, "time_per_iteration": 2.7239954471588135 }, { "auxiliary_loss_clip": 0.01208402, "auxiliary_loss_mlp": 0.01067543, "balance_loss_clip": 1.06052148, "balance_loss_mlp": 1.04325962, "epoch": 0.05441154366451225, "flos": 23948323534080.0, "grad_norm": 2.5191963984804535, "language_loss": 0.85946918, "learning_rate": 3.993752657494039e-06, "loss": 0.88222867, "num_input_tokens_seen": 19321680, "step": 905, "time_per_iteration": 2.693896532058716 }, { "auxiliary_loss_clip": 0.01198232, "auxiliary_loss_mlp": 0.01078072, "balance_loss_clip": 1.06483209, "balance_loss_mlp": 1.05400348, "epoch": 0.05447166691718022, "flos": 19975382904960.0, "grad_norm": 1.7753581401878566, "language_loss": 0.74413162, "learning_rate": 3.993721860638241e-06, "loss": 0.7668947, "num_input_tokens_seen": 19339760, "step": 906, "time_per_iteration": 2.6679019927978516 }, { "auxiliary_loss_clip": 0.01192373, "auxiliary_loss_mlp": 0.01064381, "balance_loss_clip": 1.05954027, "balance_loss_mlp": 1.0397284, "epoch": 0.05453179016984819, "flos": 24936513575040.0, "grad_norm": 2.3037248114268896, "language_loss": 0.87340188, "learning_rate": 3.993690988180309e-06, "loss": 0.89596951, "num_input_tokens_seen": 19359585, "step": 907, "time_per_iteration": 2.7363240718841553 }, { "auxiliary_loss_clip": 0.01205519, "auxiliary_loss_mlp": 0.01068463, "balance_loss_clip": 1.0616293, "balance_loss_mlp": 1.04332149, "epoch": 0.05459191342251616, "flos": 18115102558080.0, "grad_norm": 1.6666873589767146, "language_loss": 0.86928803, "learning_rate": 3.9936600401214165e-06, "loss": 0.89202785, "num_input_tokens_seen": 19378590, "step": 908, "time_per_iteration": 2.6266026496887207 }, { "auxiliary_loss_clip": 0.01198848, "auxiliary_loss_mlp": 0.01067336, "balance_loss_clip": 1.05974221, "balance_loss_mlp": 1.04107404, "epoch": 0.054652036675184125, "flos": 19208295031680.0, "grad_norm": 2.1282794409977215, "language_loss": 0.89792144, "learning_rate": 3.9936290164627345e-06, "loss": 0.92058325, "num_input_tokens_seen": 19397910, "step": 909, "time_per_iteration": 2.7163166999816895 }, { "auxiliary_loss_clip": 0.01200393, "auxiliary_loss_mlp": 0.01073374, "balance_loss_clip": 1.06157839, "balance_loss_mlp": 1.04742169, "epoch": 0.0547121599278521, "flos": 16325745615360.0, "grad_norm": 2.095924869989121, "language_loss": 0.70949811, "learning_rate": 3.99359791720544e-06, "loss": 0.73223579, "num_input_tokens_seen": 19415950, "step": 910, "time_per_iteration": 2.6697354316711426 }, { "auxiliary_loss_clip": 0.01187784, "auxiliary_loss_mlp": 0.01054671, "balance_loss_clip": 1.05651259, "balance_loss_mlp": 1.02975583, "epoch": 0.05477228318052007, "flos": 20339014239360.0, "grad_norm": 1.6633724338567386, "language_loss": 0.83651805, "learning_rate": 3.993566742350714e-06, "loss": 0.85894263, "num_input_tokens_seen": 19435275, "step": 911, "time_per_iteration": 2.692798137664795 }, { "auxiliary_loss_clip": 0.01187113, "auxiliary_loss_mlp": 0.01073028, "balance_loss_clip": 1.05334687, "balance_loss_mlp": 1.04719508, "epoch": 0.054832406433188034, "flos": 21973092687360.0, "grad_norm": 2.283907419545301, "language_loss": 0.76320881, "learning_rate": 3.993535491899736e-06, "loss": 0.78581023, "num_input_tokens_seen": 19452090, "step": 912, "time_per_iteration": 2.6653189659118652 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.01051652, "balance_loss_clip": 1.05707574, "balance_loss_mlp": 1.0271548, "epoch": 0.054892529685856006, "flos": 16398931576320.0, "grad_norm": 2.366460016615147, "language_loss": 0.82826668, "learning_rate": 3.993504165853694e-06, "loss": 0.85064626, "num_input_tokens_seen": 19470865, "step": 913, "time_per_iteration": 2.6826348304748535 }, { "auxiliary_loss_clip": 0.01194515, "auxiliary_loss_mlp": 0.01060483, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.03651023, "epoch": 0.05495265293852397, "flos": 23912341084800.0, "grad_norm": 3.3338391252510586, "language_loss": 0.8373239, "learning_rate": 3.993472764213772e-06, "loss": 0.85987389, "num_input_tokens_seen": 19492145, "step": 914, "time_per_iteration": 2.7358829975128174 }, { "auxiliary_loss_clip": 0.0120705, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.06039774, "balance_loss_mlp": 1.00027478, "epoch": 0.055012776191191944, "flos": 23586954756480.0, "grad_norm": 2.520244909384168, "language_loss": 0.90146536, "learning_rate": 3.9934412869811655e-06, "loss": 0.92135859, "num_input_tokens_seen": 19511015, "step": 915, "time_per_iteration": 2.9398341178894043 }, { "auxiliary_loss_clip": 0.01201461, "auxiliary_loss_mlp": 0.01059252, "balance_loss_clip": 1.06274199, "balance_loss_mlp": 1.03558862, "epoch": 0.055072899443859916, "flos": 17528501548800.0, "grad_norm": 2.182721785653499, "language_loss": 0.89710975, "learning_rate": 3.993409734157064e-06, "loss": 0.91971689, "num_input_tokens_seen": 19529040, "step": 916, "time_per_iteration": 2.7210159301757812 }, { "auxiliary_loss_clip": 0.01175226, "auxiliary_loss_mlp": 0.01066073, "balance_loss_clip": 1.05741024, "balance_loss_mlp": 1.04103947, "epoch": 0.05513302269652788, "flos": 21687172427520.0, "grad_norm": 1.7899379897310368, "language_loss": 0.8016991, "learning_rate": 3.993378105742666e-06, "loss": 0.82411212, "num_input_tokens_seen": 19549540, "step": 917, "time_per_iteration": 2.7923104763031006 }, { "auxiliary_loss_clip": 0.01139072, "auxiliary_loss_mlp": 0.0105947, "balance_loss_clip": 1.05135942, "balance_loss_mlp": 1.03414989, "epoch": 0.05519314594919585, "flos": 21613340021760.0, "grad_norm": 2.106744179667805, "language_loss": 0.79437333, "learning_rate": 3.9933464017391705e-06, "loss": 0.81635869, "num_input_tokens_seen": 19567570, "step": 918, "time_per_iteration": 2.8051092624664307 }, { "auxiliary_loss_clip": 0.01196947, "auxiliary_loss_mlp": 0.01055679, "balance_loss_clip": 1.05616307, "balance_loss_mlp": 1.03166997, "epoch": 0.05525326920186382, "flos": 21798567480960.0, "grad_norm": 2.454030193031321, "language_loss": 0.89019686, "learning_rate": 3.99331462214778e-06, "loss": 0.91272312, "num_input_tokens_seen": 19585330, "step": 919, "time_per_iteration": 2.6846773624420166 }, { "auxiliary_loss_clip": 0.01213326, "auxiliary_loss_mlp": 0.01069349, "balance_loss_clip": 1.05950904, "balance_loss_mlp": 1.04417229, "epoch": 0.05531339245453179, "flos": 28439635288320.0, "grad_norm": 2.246354931091656, "language_loss": 0.8746047, "learning_rate": 3.993282766969699e-06, "loss": 0.89743137, "num_input_tokens_seen": 19604970, "step": 920, "time_per_iteration": 2.6699845790863037 }, { "auxiliary_loss_clip": 0.01190424, "auxiliary_loss_mlp": 0.0106036, "balance_loss_clip": 1.06023288, "balance_loss_mlp": 1.03657782, "epoch": 0.05537351570719976, "flos": 37375143131520.0, "grad_norm": 1.975714125194334, "language_loss": 0.6568011, "learning_rate": 3.993250836206136e-06, "loss": 0.67930895, "num_input_tokens_seen": 19626235, "step": 921, "time_per_iteration": 2.833644390106201 }, { "auxiliary_loss_clip": 0.01209678, "auxiliary_loss_mlp": 0.01065483, "balance_loss_clip": 1.06060767, "balance_loss_mlp": 1.03874445, "epoch": 0.05543363895986773, "flos": 20084479488000.0, "grad_norm": 1.7242493696651606, "language_loss": 0.71861136, "learning_rate": 3.993218829858301e-06, "loss": 0.74136293, "num_input_tokens_seen": 19644305, "step": 922, "time_per_iteration": 2.6168808937072754 }, { "auxiliary_loss_clip": 0.01187138, "auxiliary_loss_mlp": 0.01067213, "balance_loss_clip": 1.05423355, "balance_loss_mlp": 1.04223895, "epoch": 0.0554937622125357, "flos": 24533200690560.0, "grad_norm": 2.6848185900705412, "language_loss": 0.82304025, "learning_rate": 3.993186747927408e-06, "loss": 0.8455838, "num_input_tokens_seen": 19662130, "step": 923, "time_per_iteration": 2.7298316955566406 }, { "auxiliary_loss_clip": 0.01202941, "auxiliary_loss_mlp": 0.01064106, "balance_loss_clip": 1.05725455, "balance_loss_mlp": 1.03933442, "epoch": 0.055553885465203665, "flos": 14320063013760.0, "grad_norm": 1.9334372940525173, "language_loss": 0.78759122, "learning_rate": 3.993154590414675e-06, "loss": 0.81026167, "num_input_tokens_seen": 19680715, "step": 924, "time_per_iteration": 2.6869630813598633 }, { "auxiliary_loss_clip": 0.0116422, "auxiliary_loss_mlp": 0.01053758, "balance_loss_clip": 1.05395627, "balance_loss_mlp": 1.02844954, "epoch": 0.05561400871787164, "flos": 27381132374400.0, "grad_norm": 2.005203138116014, "language_loss": 1.02005315, "learning_rate": 3.993122357321319e-06, "loss": 1.04223299, "num_input_tokens_seen": 19700535, "step": 925, "time_per_iteration": 2.716089963912964 }, { "auxiliary_loss_clip": 0.01163201, "auxiliary_loss_mlp": 0.01052104, "balance_loss_clip": 1.05070591, "balance_loss_mlp": 1.02739179, "epoch": 0.05567413197053961, "flos": 23221096778880.0, "grad_norm": 2.0106641835017482, "language_loss": 0.80939209, "learning_rate": 3.993090048648564e-06, "loss": 0.83154511, "num_input_tokens_seen": 19718825, "step": 926, "time_per_iteration": 2.895803451538086 }, { "auxiliary_loss_clip": 0.01207515, "auxiliary_loss_mlp": 0.01068168, "balance_loss_clip": 1.05892682, "balance_loss_mlp": 1.0419066, "epoch": 0.055734255223207574, "flos": 25264952559360.0, "grad_norm": 2.9732625845644045, "language_loss": 0.73220479, "learning_rate": 3.993057664397634e-06, "loss": 0.75496161, "num_input_tokens_seen": 19739080, "step": 927, "time_per_iteration": 2.677725076675415 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01015011, "balance_loss_clip": 1.02922702, "balance_loss_mlp": 1.01014709, "epoch": 0.055794378475875546, "flos": 66503116702080.0, "grad_norm": 0.8406874373244947, "language_loss": 0.59841412, "learning_rate": 3.9930252045697585e-06, "loss": 0.61957431, "num_input_tokens_seen": 19802960, "step": 928, "time_per_iteration": 3.187382221221924 }, { "auxiliary_loss_clip": 0.01202438, "auxiliary_loss_mlp": 0.01065066, "balance_loss_clip": 1.05921853, "balance_loss_mlp": 1.04070008, "epoch": 0.05585450172854351, "flos": 25337635729920.0, "grad_norm": 2.0668361967965994, "language_loss": 0.95411372, "learning_rate": 3.992992669166168e-06, "loss": 0.97678876, "num_input_tokens_seen": 19822765, "step": 929, "time_per_iteration": 2.6930506229400635 }, { "auxiliary_loss_clip": 0.01171806, "auxiliary_loss_mlp": 0.01068051, "balance_loss_clip": 1.05343258, "balance_loss_mlp": 1.04101443, "epoch": 0.05591462498121148, "flos": 33911738881920.0, "grad_norm": 2.1442452677256627, "language_loss": 0.71756601, "learning_rate": 3.992960058188094e-06, "loss": 0.7399646, "num_input_tokens_seen": 19843590, "step": 930, "time_per_iteration": 2.803219795227051 }, { "auxiliary_loss_clip": 0.01188277, "auxiliary_loss_mlp": 0.01058888, "balance_loss_clip": 1.05783677, "balance_loss_mlp": 1.03377056, "epoch": 0.055974748233879455, "flos": 17930880679680.0, "grad_norm": 2.381261552273062, "language_loss": 0.85279298, "learning_rate": 3.992927371636776e-06, "loss": 0.87526459, "num_input_tokens_seen": 19860230, "step": 931, "time_per_iteration": 2.6215872764587402 }, { "auxiliary_loss_clip": 0.01203533, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05677414, "balance_loss_mlp": 1.00025761, "epoch": 0.05603487148654742, "flos": 24021976371840.0, "grad_norm": 2.2861197477099973, "language_loss": 0.83645165, "learning_rate": 3.9928946095134525e-06, "loss": 0.85631776, "num_input_tokens_seen": 19880795, "step": 932, "time_per_iteration": 2.664062261581421 }, { "auxiliary_loss_clip": 0.01200637, "auxiliary_loss_mlp": 0.0107041, "balance_loss_clip": 1.05897784, "balance_loss_mlp": 1.04407716, "epoch": 0.05609499473921539, "flos": 17307758517120.0, "grad_norm": 1.8036739452122519, "language_loss": 0.73694205, "learning_rate": 3.992861771819365e-06, "loss": 0.7596525, "num_input_tokens_seen": 19897960, "step": 933, "time_per_iteration": 2.631620168685913 }, { "auxiliary_loss_clip": 0.01153445, "auxiliary_loss_mlp": 0.01076903, "balance_loss_clip": 1.04885209, "balance_loss_mlp": 1.05060577, "epoch": 0.05615511799188336, "flos": 20994742972800.0, "grad_norm": 2.385249039382274, "language_loss": 0.86660421, "learning_rate": 3.99282885855576e-06, "loss": 0.88890779, "num_input_tokens_seen": 19913315, "step": 934, "time_per_iteration": 2.7739439010620117 }, { "auxiliary_loss_clip": 0.01164295, "auxiliary_loss_mlp": 0.0108083, "balance_loss_clip": 1.05509257, "balance_loss_mlp": 1.0557723, "epoch": 0.05621524124455133, "flos": 17273535834240.0, "grad_norm": 2.2740258482680433, "language_loss": 0.80388415, "learning_rate": 3.992795869723885e-06, "loss": 0.82633543, "num_input_tokens_seen": 19928790, "step": 935, "time_per_iteration": 5.93512487411499 }, { "auxiliary_loss_clip": 0.01093927, "auxiliary_loss_mlp": 0.01019701, "balance_loss_clip": 1.02288604, "balance_loss_mlp": 1.01540911, "epoch": 0.0562753644972193, "flos": 58719370458240.0, "grad_norm": 0.820561718243334, "language_loss": 0.69191676, "learning_rate": 3.99276280532499e-06, "loss": 0.71305299, "num_input_tokens_seen": 19988785, "step": 936, "time_per_iteration": 4.862478733062744 }, { "auxiliary_loss_clip": 0.01213648, "auxiliary_loss_mlp": 0.01068507, "balance_loss_clip": 1.05806684, "balance_loss_mlp": 1.04429567, "epoch": 0.05633548774988727, "flos": 17457039440640.0, "grad_norm": 1.9573264311231433, "language_loss": 0.7572521, "learning_rate": 3.992729665360331e-06, "loss": 0.78007358, "num_input_tokens_seen": 20007685, "step": 937, "time_per_iteration": 4.219425916671753 }, { "auxiliary_loss_clip": 0.01085529, "auxiliary_loss_mlp": 0.01013805, "balance_loss_clip": 1.02476001, "balance_loss_mlp": 1.00944233, "epoch": 0.05639561100255524, "flos": 70654928083200.0, "grad_norm": 0.9053055994078011, "language_loss": 0.64309287, "learning_rate": 3.992696449831162e-06, "loss": 0.66408622, "num_input_tokens_seen": 20072750, "step": 938, "time_per_iteration": 3.1298794746398926 }, { "auxiliary_loss_clip": 0.01171203, "auxiliary_loss_mlp": 0.01068815, "balance_loss_clip": 1.05175185, "balance_loss_mlp": 1.0426966, "epoch": 0.056455734255223204, "flos": 20485996692480.0, "grad_norm": 2.7427540631348832, "language_loss": 0.79751205, "learning_rate": 3.992663158738745e-06, "loss": 0.8199122, "num_input_tokens_seen": 20089070, "step": 939, "time_per_iteration": 2.6863484382629395 }, { "auxiliary_loss_clip": 0.01175528, "auxiliary_loss_mlp": 0.01068297, "balance_loss_clip": 1.0509069, "balance_loss_mlp": 1.04338217, "epoch": 0.056515857507891176, "flos": 22053569109120.0, "grad_norm": 1.8374791395473227, "language_loss": 0.73919088, "learning_rate": 3.992629792084341e-06, "loss": 0.76162916, "num_input_tokens_seen": 20108790, "step": 940, "time_per_iteration": 2.7111120223999023 }, { "auxiliary_loss_clip": 0.01198483, "auxiliary_loss_mlp": 0.01058511, "balance_loss_clip": 1.05900669, "balance_loss_mlp": 1.03252339, "epoch": 0.05657598076055915, "flos": 24025316336640.0, "grad_norm": 2.2993716569389813, "language_loss": 0.70622003, "learning_rate": 3.992596349869216e-06, "loss": 0.72878999, "num_input_tokens_seen": 20128455, "step": 941, "time_per_iteration": 2.657594680786133 }, { "auxiliary_loss_clip": 0.01135396, "auxiliary_loss_mlp": 0.01059543, "balance_loss_clip": 1.04961574, "balance_loss_mlp": 1.03382993, "epoch": 0.05663610401322711, "flos": 20480609652480.0, "grad_norm": 2.0678542992190847, "language_loss": 0.80921417, "learning_rate": 3.992562832094637e-06, "loss": 0.83116359, "num_input_tokens_seen": 20145775, "step": 942, "time_per_iteration": 2.7379891872406006 }, { "auxiliary_loss_clip": 0.01186767, "auxiliary_loss_mlp": 0.01062055, "balance_loss_clip": 1.05228579, "balance_loss_mlp": 1.03554332, "epoch": 0.056696227265895086, "flos": 21069042255360.0, "grad_norm": 2.245249922529115, "language_loss": 0.88858449, "learning_rate": 3.9925292387618755e-06, "loss": 0.91107273, "num_input_tokens_seen": 20164315, "step": 943, "time_per_iteration": 2.6502583026885986 }, { "auxiliary_loss_clip": 0.01199122, "auxiliary_loss_mlp": 0.0105963, "balance_loss_clip": 1.05991781, "balance_loss_mlp": 1.03534663, "epoch": 0.05675635051856306, "flos": 17821317219840.0, "grad_norm": 2.5514256959015995, "language_loss": 0.74771839, "learning_rate": 3.992495569872206e-06, "loss": 0.77030593, "num_input_tokens_seen": 20182760, "step": 944, "time_per_iteration": 2.676079034805298 }, { "auxiliary_loss_clip": 0.01204502, "auxiliary_loss_mlp": 0.01064591, "balance_loss_clip": 1.05980551, "balance_loss_mlp": 1.04085672, "epoch": 0.05681647377123102, "flos": 23114945111040.0, "grad_norm": 1.5959266123312272, "language_loss": 0.79406166, "learning_rate": 3.992461825426906e-06, "loss": 0.81675267, "num_input_tokens_seen": 20203830, "step": 945, "time_per_iteration": 2.734299421310425 }, { "auxiliary_loss_clip": 0.01195984, "auxiliary_loss_mlp": 0.0105672, "balance_loss_clip": 1.05686593, "balance_loss_mlp": 1.03156662, "epoch": 0.056876597023898995, "flos": 16070528505600.0, "grad_norm": 2.5637081249861824, "language_loss": 0.82651746, "learning_rate": 3.992428005427252e-06, "loss": 0.84904456, "num_input_tokens_seen": 20220365, "step": 946, "time_per_iteration": 2.6636929512023926 }, { "auxiliary_loss_clip": 0.0122014, "auxiliary_loss_mlp": 0.01061449, "balance_loss_clip": 1.06224144, "balance_loss_mlp": 1.03524721, "epoch": 0.05693672027656696, "flos": 16835641130880.0, "grad_norm": 1.8433174156507384, "language_loss": 0.79031301, "learning_rate": 3.992394109874529e-06, "loss": 0.81312895, "num_input_tokens_seen": 20238640, "step": 947, "time_per_iteration": 2.623671293258667 }, { "auxiliary_loss_clip": 0.0117587, "auxiliary_loss_mlp": 0.01061489, "balance_loss_clip": 1.05605412, "balance_loss_mlp": 1.03569245, "epoch": 0.05699684352923493, "flos": 21389113370880.0, "grad_norm": 6.8661947111986725, "language_loss": 0.85425055, "learning_rate": 3.9923601387700225e-06, "loss": 0.87662411, "num_input_tokens_seen": 20251025, "step": 948, "time_per_iteration": 2.7410409450531006 }, { "auxiliary_loss_clip": 0.01214005, "auxiliary_loss_mlp": 0.01063231, "balance_loss_clip": 1.05969238, "balance_loss_mlp": 1.03598022, "epoch": 0.057056966781902904, "flos": 15560309767680.0, "grad_norm": 3.649211317819821, "language_loss": 0.87346625, "learning_rate": 3.992326092115019e-06, "loss": 0.89623863, "num_input_tokens_seen": 20269775, "step": 949, "time_per_iteration": 2.6893157958984375 }, { "auxiliary_loss_clip": 0.01194543, "auxiliary_loss_mlp": 0.0106695, "balance_loss_clip": 1.05799937, "balance_loss_mlp": 1.04266715, "epoch": 0.05711709003457087, "flos": 19937856170880.0, "grad_norm": 1.8324883776363103, "language_loss": 0.7874645, "learning_rate": 3.992291969910811e-06, "loss": 0.8100794, "num_input_tokens_seen": 20287715, "step": 950, "time_per_iteration": 2.623924732208252 }, { "auxiliary_loss_clip": 0.01180518, "auxiliary_loss_mlp": 0.01068771, "balance_loss_clip": 1.05322623, "balance_loss_mlp": 1.04384422, "epoch": 0.05717721328723884, "flos": 30332701774080.0, "grad_norm": 3.8045132244795816, "language_loss": 0.82477522, "learning_rate": 3.992257772158691e-06, "loss": 0.8472681, "num_input_tokens_seen": 20307070, "step": 951, "time_per_iteration": 2.697479724884033 }, { "auxiliary_loss_clip": 0.01167302, "auxiliary_loss_mlp": 0.01061039, "balance_loss_clip": 1.04906607, "balance_loss_mlp": 1.03375173, "epoch": 0.05723733653990681, "flos": 23654358627840.0, "grad_norm": 2.4180383362968634, "language_loss": 0.86899263, "learning_rate": 3.992223498859958e-06, "loss": 0.89127606, "num_input_tokens_seen": 20324945, "step": 952, "time_per_iteration": 2.707716226577759 }, { "auxiliary_loss_clip": 0.01191405, "auxiliary_loss_mlp": 0.01064705, "balance_loss_clip": 1.05511189, "balance_loss_mlp": 1.03630924, "epoch": 0.05729745979257478, "flos": 22055759838720.0, "grad_norm": 2.195434645270168, "language_loss": 0.79087842, "learning_rate": 3.9921891500159084e-06, "loss": 0.81343949, "num_input_tokens_seen": 20346135, "step": 953, "time_per_iteration": 2.671255588531494 }, { "auxiliary_loss_clip": 0.01190026, "auxiliary_loss_mlp": 0.01066447, "balance_loss_clip": 1.05984342, "balance_loss_mlp": 1.04056656, "epoch": 0.05735758304524275, "flos": 19604353368960.0, "grad_norm": 2.2066085695914466, "language_loss": 0.86644447, "learning_rate": 3.992154725627848e-06, "loss": 0.88900924, "num_input_tokens_seen": 20364450, "step": 954, "time_per_iteration": 2.671657085418701 }, { "auxiliary_loss_clip": 0.01210569, "auxiliary_loss_mlp": 0.01062619, "balance_loss_clip": 1.06119955, "balance_loss_mlp": 1.03723955, "epoch": 0.057417706297910716, "flos": 19099018880640.0, "grad_norm": 2.2872795023766113, "language_loss": 0.88071024, "learning_rate": 3.9921202256970804e-06, "loss": 0.90344214, "num_input_tokens_seen": 20383500, "step": 955, "time_per_iteration": 2.69960880279541 }, { "auxiliary_loss_clip": 0.01179864, "auxiliary_loss_mlp": 0.01068889, "balance_loss_clip": 1.0523231, "balance_loss_mlp": 1.04209054, "epoch": 0.05747782955057869, "flos": 16654507822080.0, "grad_norm": 1.9113555723128555, "language_loss": 0.89160776, "learning_rate": 3.992085650224914e-06, "loss": 0.91409534, "num_input_tokens_seen": 20400295, "step": 956, "time_per_iteration": 2.667868137359619 }, { "auxiliary_loss_clip": 0.01167867, "auxiliary_loss_mlp": 0.01060669, "balance_loss_clip": 1.05720079, "balance_loss_mlp": 1.03450251, "epoch": 0.05753795280324665, "flos": 14502058248960.0, "grad_norm": 3.2877973901728095, "language_loss": 0.75473189, "learning_rate": 3.99205099921266e-06, "loss": 0.77701724, "num_input_tokens_seen": 20419085, "step": 957, "time_per_iteration": 2.6938796043395996 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.01072849, "balance_loss_clip": 1.05432248, "balance_loss_mlp": 1.0448705, "epoch": 0.057598076055914625, "flos": 18076318848000.0, "grad_norm": 2.0004055711005257, "language_loss": 0.79582155, "learning_rate": 3.992016272661633e-06, "loss": 0.81831264, "num_input_tokens_seen": 20437465, "step": 958, "time_per_iteration": 2.6933834552764893 }, { "auxiliary_loss_clip": 0.01186244, "auxiliary_loss_mlp": 0.01059908, "balance_loss_clip": 1.05851364, "balance_loss_mlp": 1.03572011, "epoch": 0.0576581993085826, "flos": 22124600254080.0, "grad_norm": 2.669863855173802, "language_loss": 0.8840394, "learning_rate": 3.99198147057315e-06, "loss": 0.906501, "num_input_tokens_seen": 20456235, "step": 959, "time_per_iteration": 2.7094578742980957 }, { "auxiliary_loss_clip": 0.01169479, "auxiliary_loss_mlp": 0.01063656, "balance_loss_clip": 1.05511999, "balance_loss_mlp": 1.03881276, "epoch": 0.05771832256125056, "flos": 33181746779520.0, "grad_norm": 2.0960373333994764, "language_loss": 0.78850955, "learning_rate": 3.991946592948529e-06, "loss": 0.8108409, "num_input_tokens_seen": 20476825, "step": 960, "time_per_iteration": 2.822922945022583 }, { "auxiliary_loss_clip": 0.0113413, "auxiliary_loss_mlp": 0.01067189, "balance_loss_clip": 1.05177355, "balance_loss_mlp": 1.04020023, "epoch": 0.057778445813918534, "flos": 24170143973760.0, "grad_norm": 2.063464892179025, "language_loss": 0.92986894, "learning_rate": 3.991911639789094e-06, "loss": 0.95188212, "num_input_tokens_seen": 20496965, "step": 961, "time_per_iteration": 2.793952226638794 }, { "auxiliary_loss_clip": 0.01182535, "auxiliary_loss_mlp": 0.0106764, "balance_loss_clip": 1.0554297, "balance_loss_mlp": 1.04091299, "epoch": 0.0578385690665865, "flos": 29643037666560.0, "grad_norm": 2.0649993155313067, "language_loss": 0.68164188, "learning_rate": 3.991876611096169e-06, "loss": 0.70414358, "num_input_tokens_seen": 20518035, "step": 962, "time_per_iteration": 2.8396694660186768 }, { "auxiliary_loss_clip": 0.01159524, "auxiliary_loss_mlp": 0.01073851, "balance_loss_clip": 1.05128908, "balance_loss_mlp": 1.04909074, "epoch": 0.05789869231925447, "flos": 20885430908160.0, "grad_norm": 2.2685465488517074, "language_loss": 0.8848027, "learning_rate": 3.991841506871084e-06, "loss": 0.90713644, "num_input_tokens_seen": 20534740, "step": 963, "time_per_iteration": 2.7077019214630127 }, { "auxiliary_loss_clip": 0.01183778, "auxiliary_loss_mlp": 0.01061251, "balance_loss_clip": 1.06018209, "balance_loss_mlp": 1.03516829, "epoch": 0.057958815571922444, "flos": 26031106679040.0, "grad_norm": 2.392959969035536, "language_loss": 0.85288298, "learning_rate": 3.99180632711517e-06, "loss": 0.87533331, "num_input_tokens_seen": 20553485, "step": 964, "time_per_iteration": 2.7218217849731445 }, { "auxiliary_loss_clip": 0.01188683, "auxiliary_loss_mlp": 0.01069422, "balance_loss_clip": 1.05959499, "balance_loss_mlp": 1.04325557, "epoch": 0.05801893882459041, "flos": 18077683564800.0, "grad_norm": 3.087349735715565, "language_loss": 0.78159416, "learning_rate": 3.99177107182976e-06, "loss": 0.80417526, "num_input_tokens_seen": 20572155, "step": 965, "time_per_iteration": 2.6902661323547363 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.0107109, "balance_loss_clip": 1.0523715, "balance_loss_mlp": 1.04462528, "epoch": 0.05807906207725838, "flos": 17748885444480.0, "grad_norm": 1.9742288518319486, "language_loss": 0.81403655, "learning_rate": 3.99173574101619e-06, "loss": 0.83631277, "num_input_tokens_seen": 20590395, "step": 966, "time_per_iteration": 2.7423267364501953 }, { "auxiliary_loss_clip": 0.01198908, "auxiliary_loss_mlp": 0.01065021, "balance_loss_clip": 1.058887, "balance_loss_mlp": 1.04113197, "epoch": 0.058139185329926346, "flos": 18040372312320.0, "grad_norm": 1.8776530142118544, "language_loss": 0.76480806, "learning_rate": 3.9917003346758035e-06, "loss": 0.78744727, "num_input_tokens_seen": 20608435, "step": 967, "time_per_iteration": 2.642885446548462 }, { "auxiliary_loss_clip": 0.01084339, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.02675521, "balance_loss_mlp": 1.0269078, "epoch": 0.05819930858259432, "flos": 62363297485440.0, "grad_norm": 0.985564929959949, "language_loss": 0.57357776, "learning_rate": 3.991664852809939e-06, "loss": 0.59473509, "num_input_tokens_seen": 20668575, "step": 968, "time_per_iteration": 3.1017024517059326 }, { "auxiliary_loss_clip": 0.01188824, "auxiliary_loss_mlp": 0.01057715, "balance_loss_clip": 1.05784404, "balance_loss_mlp": 1.03147697, "epoch": 0.05825943183526229, "flos": 19135360465920.0, "grad_norm": 2.1276337565108485, "language_loss": 0.82286429, "learning_rate": 3.991629295419945e-06, "loss": 0.84532964, "num_input_tokens_seen": 20687355, "step": 969, "time_per_iteration": 2.669055461883545 }, { "auxiliary_loss_clip": 0.01206272, "auxiliary_loss_mlp": 0.00782724, "balance_loss_clip": 1.06255269, "balance_loss_mlp": 1.00024962, "epoch": 0.058319555087930255, "flos": 29022465369600.0, "grad_norm": 7.916507288074279, "language_loss": 0.7803669, "learning_rate": 3.991593662507167e-06, "loss": 0.80025685, "num_input_tokens_seen": 20705710, "step": 970, "time_per_iteration": 2.733030080795288 }, { "auxiliary_loss_clip": 0.01181452, "auxiliary_loss_mlp": 0.01064945, "balance_loss_clip": 1.05691695, "balance_loss_mlp": 1.03887415, "epoch": 0.05837967834059823, "flos": 18879999701760.0, "grad_norm": 3.163102883752813, "language_loss": 0.92229038, "learning_rate": 3.991557954072958e-06, "loss": 0.94475436, "num_input_tokens_seen": 20722405, "step": 971, "time_per_iteration": 2.730377435684204 }, { "auxiliary_loss_clip": 0.01180948, "auxiliary_loss_mlp": 0.01062613, "balance_loss_clip": 1.05320477, "balance_loss_mlp": 1.03722143, "epoch": 0.05843980159326619, "flos": 25703062744320.0, "grad_norm": 1.700187330091603, "language_loss": 0.85959208, "learning_rate": 3.991522170118673e-06, "loss": 0.88202775, "num_input_tokens_seen": 20741480, "step": 972, "time_per_iteration": 2.687185049057007 }, { "auxiliary_loss_clip": 0.0116993, "auxiliary_loss_mlp": 0.01079713, "balance_loss_clip": 1.05714142, "balance_loss_mlp": 1.05601454, "epoch": 0.058499924845934165, "flos": 25552129795200.0, "grad_norm": 2.00599255988541, "language_loss": 0.87503272, "learning_rate": 3.991486310645667e-06, "loss": 0.89752913, "num_input_tokens_seen": 20759685, "step": 973, "time_per_iteration": 2.7166664600372314 }, { "auxiliary_loss_clip": 0.01206524, "auxiliary_loss_mlp": 0.00784111, "balance_loss_clip": 1.06111121, "balance_loss_mlp": 1.00026989, "epoch": 0.05856004809860214, "flos": 16436171001600.0, "grad_norm": 1.879365930358842, "language_loss": 0.74800295, "learning_rate": 3.991450375655301e-06, "loss": 0.76790935, "num_input_tokens_seen": 20778180, "step": 974, "time_per_iteration": 2.713594675064087 }, { "auxiliary_loss_clip": 0.01197101, "auxiliary_loss_mlp": 0.00782207, "balance_loss_clip": 1.059551, "balance_loss_mlp": 1.00025892, "epoch": 0.0586201713512701, "flos": 39458824116480.0, "grad_norm": 1.5923993506380014, "language_loss": 0.76874506, "learning_rate": 3.991414365148936e-06, "loss": 0.78853816, "num_input_tokens_seen": 20802705, "step": 975, "time_per_iteration": 7.600914716720581 }, { "auxiliary_loss_clip": 0.01215491, "auxiliary_loss_mlp": 0.01069506, "balance_loss_clip": 1.06030774, "balance_loss_mlp": 1.0444721, "epoch": 0.058680294603938074, "flos": 23365170230400.0, "grad_norm": 3.6132976830219734, "language_loss": 0.76748288, "learning_rate": 3.99137827912794e-06, "loss": 0.79033279, "num_input_tokens_seen": 20822540, "step": 976, "time_per_iteration": 4.324799537658691 }, { "auxiliary_loss_clip": 0.01176132, "auxiliary_loss_mlp": 0.01077003, "balance_loss_clip": 1.05271626, "balance_loss_mlp": 1.04963279, "epoch": 0.05874041785660604, "flos": 32232017226240.0, "grad_norm": 1.943198757110789, "language_loss": 0.87343585, "learning_rate": 3.991342117593679e-06, "loss": 0.89596725, "num_input_tokens_seen": 20844175, "step": 977, "time_per_iteration": 2.7742488384246826 }, { "auxiliary_loss_clip": 0.01187161, "auxiliary_loss_mlp": 0.01067914, "balance_loss_clip": 1.06209528, "balance_loss_mlp": 1.04231977, "epoch": 0.05880054110927401, "flos": 22310043194880.0, "grad_norm": 1.718987046197629, "language_loss": 0.7969116, "learning_rate": 3.991305880547527e-06, "loss": 0.81946236, "num_input_tokens_seen": 20864730, "step": 978, "time_per_iteration": 2.733372926712036 }, { "auxiliary_loss_clip": 0.01136264, "auxiliary_loss_mlp": 0.01076585, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04927468, "epoch": 0.05886066436194198, "flos": 27380450016000.0, "grad_norm": 1.8692877257975375, "language_loss": 0.80665666, "learning_rate": 3.991269567990855e-06, "loss": 0.82878518, "num_input_tokens_seen": 20885200, "step": 979, "time_per_iteration": 3.2624220848083496 }, { "auxiliary_loss_clip": 0.01074686, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.02640033, "balance_loss_mlp": 1.02495658, "epoch": 0.05892078761460995, "flos": 59584493525760.0, "grad_norm": 0.9436493040005753, "language_loss": 0.59004962, "learning_rate": 3.9912331799250415e-06, "loss": 0.6110934, "num_input_tokens_seen": 20940325, "step": 980, "time_per_iteration": 3.4688587188720703 }, { "auxiliary_loss_clip": 0.01211665, "auxiliary_loss_mlp": 0.01078603, "balance_loss_clip": 1.06178868, "balance_loss_mlp": 1.05242431, "epoch": 0.05898091086727792, "flos": 15414081500160.0, "grad_norm": 2.2770545408130514, "language_loss": 0.86436182, "learning_rate": 3.9911967163514665e-06, "loss": 0.88726455, "num_input_tokens_seen": 20958220, "step": 981, "time_per_iteration": 2.5824644565582275 }, { "auxiliary_loss_clip": 0.01190085, "auxiliary_loss_mlp": 0.0106921, "balance_loss_clip": 1.05943286, "balance_loss_mlp": 1.04629803, "epoch": 0.059041034119945886, "flos": 23655328295040.0, "grad_norm": 2.1333982175691855, "language_loss": 0.79293346, "learning_rate": 3.991160177271513e-06, "loss": 0.81552643, "num_input_tokens_seen": 20978920, "step": 982, "time_per_iteration": 2.68428897857666 }, { "auxiliary_loss_clip": 0.01192274, "auxiliary_loss_mlp": 0.01068234, "balance_loss_clip": 1.05926657, "balance_loss_mlp": 1.04356933, "epoch": 0.05910115737261386, "flos": 24754087376640.0, "grad_norm": 2.319627739094249, "language_loss": 0.84413779, "learning_rate": 3.9911235626865654e-06, "loss": 0.86674285, "num_input_tokens_seen": 20999490, "step": 983, "time_per_iteration": 2.7006261348724365 }, { "auxiliary_loss_clip": 0.0120015, "auxiliary_loss_mlp": 0.01072669, "balance_loss_clip": 1.05969584, "balance_loss_mlp": 1.04799283, "epoch": 0.05916128062528183, "flos": 11728749070080.0, "grad_norm": 1.8014395118859294, "language_loss": 0.84510243, "learning_rate": 3.9910868725980125e-06, "loss": 0.86783063, "num_input_tokens_seen": 21017865, "step": 984, "time_per_iteration": 2.640246868133545 }, { "auxiliary_loss_clip": 0.01188594, "auxiliary_loss_mlp": 0.01055296, "balance_loss_clip": 1.05650342, "balance_loss_mlp": 1.03171611, "epoch": 0.059221403877949795, "flos": 21902995296000.0, "grad_norm": 2.473231587287368, "language_loss": 0.77611595, "learning_rate": 3.9910501070072465e-06, "loss": 0.7985549, "num_input_tokens_seen": 21035900, "step": 985, "time_per_iteration": 2.626371383666992 }, { "auxiliary_loss_clip": 0.01150113, "auxiliary_loss_mlp": 0.01060814, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03542209, "epoch": 0.05928152713061777, "flos": 20514580940160.0, "grad_norm": 1.9082382068459252, "language_loss": 0.90593231, "learning_rate": 3.991013265915661e-06, "loss": 0.92804158, "num_input_tokens_seen": 21053235, "step": 986, "time_per_iteration": 2.7834935188293457 }, { "auxiliary_loss_clip": 0.01200704, "auxiliary_loss_mlp": 0.01061312, "balance_loss_clip": 1.05555892, "balance_loss_mlp": 1.03425193, "epoch": 0.05934165038328574, "flos": 24495135252480.0, "grad_norm": 2.216017383423336, "language_loss": 0.75688565, "learning_rate": 3.9909763493246525e-06, "loss": 0.77950585, "num_input_tokens_seen": 21073090, "step": 987, "time_per_iteration": 2.6669981479644775 }, { "auxiliary_loss_clip": 0.01203558, "auxiliary_loss_mlp": 0.01057756, "balance_loss_clip": 1.06134868, "balance_loss_mlp": 1.03331852, "epoch": 0.059401773635953704, "flos": 38728041914880.0, "grad_norm": 2.2869993581633827, "language_loss": 0.71867943, "learning_rate": 3.990939357235621e-06, "loss": 0.7412926, "num_input_tokens_seen": 21094895, "step": 988, "time_per_iteration": 2.805851697921753 }, { "auxiliary_loss_clip": 0.0105006, "auxiliary_loss_mlp": 0.0101134, "balance_loss_clip": 1.02230322, "balance_loss_mlp": 1.00688171, "epoch": 0.059461896888621676, "flos": 58023565125120.0, "grad_norm": 0.9416454944601763, "language_loss": 0.7124939, "learning_rate": 3.99090228964997e-06, "loss": 0.73310792, "num_input_tokens_seen": 21147555, "step": 989, "time_per_iteration": 3.100306749343872 }, { "auxiliary_loss_clip": 0.0117797, "auxiliary_loss_mlp": 0.01072264, "balance_loss_clip": 1.05793095, "balance_loss_mlp": 1.04389191, "epoch": 0.05952202014128964, "flos": 22127760650880.0, "grad_norm": 2.0167260155417113, "language_loss": 0.78245646, "learning_rate": 3.990865146569105e-06, "loss": 0.80495882, "num_input_tokens_seen": 21167845, "step": 990, "time_per_iteration": 2.8133904933929443 }, { "auxiliary_loss_clip": 0.01198295, "auxiliary_loss_mlp": 0.01053485, "balance_loss_clip": 1.06166339, "balance_loss_mlp": 1.02761686, "epoch": 0.059582143393957614, "flos": 20445776438400.0, "grad_norm": 2.2411623387553727, "language_loss": 0.86522102, "learning_rate": 3.990827927994434e-06, "loss": 0.88773882, "num_input_tokens_seen": 21185085, "step": 991, "time_per_iteration": 2.6964831352233887 }, { "auxiliary_loss_clip": 0.0121783, "auxiliary_loss_mlp": 0.01064707, "balance_loss_clip": 1.0613625, "balance_loss_mlp": 1.03943431, "epoch": 0.059642266646625586, "flos": 20594877793920.0, "grad_norm": 1.8566945591898132, "language_loss": 0.76738375, "learning_rate": 3.9907906339273674e-06, "loss": 0.79020917, "num_input_tokens_seen": 21204230, "step": 992, "time_per_iteration": 2.646942377090454 }, { "auxiliary_loss_clip": 0.01146457, "auxiliary_loss_mlp": 0.01062309, "balance_loss_clip": 1.05571234, "balance_loss_mlp": 1.03834832, "epoch": 0.05970238989929355, "flos": 19352655792000.0, "grad_norm": 2.3469050968731233, "language_loss": 0.75117075, "learning_rate": 3.9907532643693215e-06, "loss": 0.77325845, "num_input_tokens_seen": 21222655, "step": 993, "time_per_iteration": 2.7642974853515625 }, { "auxiliary_loss_clip": 0.01157785, "auxiliary_loss_mlp": 0.01075532, "balance_loss_clip": 1.05397618, "balance_loss_mlp": 1.04774487, "epoch": 0.05976251315196152, "flos": 30264040926720.0, "grad_norm": 2.725207959052886, "language_loss": 0.79177904, "learning_rate": 3.990715819321712e-06, "loss": 0.81411219, "num_input_tokens_seen": 21242310, "step": 994, "time_per_iteration": 2.8414714336395264 }, { "auxiliary_loss_clip": 0.01214724, "auxiliary_loss_mlp": 0.01079016, "balance_loss_clip": 1.06264019, "balance_loss_mlp": 1.05361295, "epoch": 0.05982263640462949, "flos": 23185150243200.0, "grad_norm": 2.8097993094234983, "language_loss": 0.79917169, "learning_rate": 3.99067829878596e-06, "loss": 0.82210916, "num_input_tokens_seen": 21261410, "step": 995, "time_per_iteration": 2.6524364948272705 }, { "auxiliary_loss_clip": 0.0116696, "auxiliary_loss_mlp": 0.01068218, "balance_loss_clip": 1.05704355, "balance_loss_mlp": 1.04208767, "epoch": 0.05988275965729746, "flos": 27850879463040.0, "grad_norm": 1.902030256537741, "language_loss": 0.87013257, "learning_rate": 3.990640702763487e-06, "loss": 0.89248431, "num_input_tokens_seen": 21280080, "step": 996, "time_per_iteration": 2.7431676387786865 }, { "auxiliary_loss_clip": 0.01177854, "auxiliary_loss_mlp": 0.01081123, "balance_loss_clip": 1.05684328, "balance_loss_mlp": 1.05055761, "epoch": 0.05994288290996543, "flos": 24680003575680.0, "grad_norm": 2.971039758986745, "language_loss": 0.87273014, "learning_rate": 3.990603031255718e-06, "loss": 0.89531994, "num_input_tokens_seen": 21296765, "step": 997, "time_per_iteration": 2.748448371887207 }, { "auxiliary_loss_clip": 0.01069915, "auxiliary_loss_mlp": 0.01014417, "balance_loss_clip": 1.02303648, "balance_loss_mlp": 1.00972033, "epoch": 0.0600030061626334, "flos": 69929568835200.0, "grad_norm": 1.0091092068179202, "language_loss": 0.75381488, "learning_rate": 3.990565284264083e-06, "loss": 0.7746582, "num_input_tokens_seen": 21363345, "step": 998, "time_per_iteration": 3.2950518131256104 }, { "auxiliary_loss_clip": 0.01170062, "auxiliary_loss_mlp": 0.01065521, "balance_loss_clip": 1.05893683, "balance_loss_mlp": 1.03893745, "epoch": 0.06006312941530137, "flos": 26540140268160.0, "grad_norm": 1.8197691299520968, "language_loss": 0.76053095, "learning_rate": 3.990527461790013e-06, "loss": 0.7828868, "num_input_tokens_seen": 21385290, "step": 999, "time_per_iteration": 2.733802556991577 }, { "auxiliary_loss_clip": 0.01197834, "auxiliary_loss_mlp": 0.01059542, "balance_loss_clip": 1.05646563, "balance_loss_mlp": 1.03339899, "epoch": 0.060123252667969335, "flos": 27344000689920.0, "grad_norm": 2.5948629341774874, "language_loss": 0.82992184, "learning_rate": 3.990489563834943e-06, "loss": 0.85249555, "num_input_tokens_seen": 21407625, "step": 1000, "time_per_iteration": 2.710981845855713 }, { "auxiliary_loss_clip": 0.0118571, "auxiliary_loss_mlp": 0.01062188, "balance_loss_clip": 1.05856955, "balance_loss_mlp": 1.03480577, "epoch": 0.06018337592063731, "flos": 27016710940800.0, "grad_norm": 2.111409807940472, "language_loss": 0.85820085, "learning_rate": 3.990451590400309e-06, "loss": 0.88067985, "num_input_tokens_seen": 21426835, "step": 1001, "time_per_iteration": 2.73445463180542 }, { "auxiliary_loss_clip": 0.01191917, "auxiliary_loss_mlp": 0.01062059, "balance_loss_clip": 1.06167853, "balance_loss_mlp": 1.03719211, "epoch": 0.06024349917330528, "flos": 25592960580480.0, "grad_norm": 1.8359711451165206, "language_loss": 0.74128318, "learning_rate": 3.990413541487551e-06, "loss": 0.76382297, "num_input_tokens_seen": 21444920, "step": 1002, "time_per_iteration": 2.8861100673675537 }, { "auxiliary_loss_clip": 0.01214316, "auxiliary_loss_mlp": 0.01062589, "balance_loss_clip": 1.06316125, "balance_loss_mlp": 1.03737664, "epoch": 0.060303622425973244, "flos": 26133271937280.0, "grad_norm": 2.1835040648243997, "language_loss": 0.75520515, "learning_rate": 3.990375417098112e-06, "loss": 0.77797419, "num_input_tokens_seen": 21463555, "step": 1003, "time_per_iteration": 2.632889747619629 }, { "auxiliary_loss_clip": 0.01187709, "auxiliary_loss_mlp": 0.01064806, "balance_loss_clip": 1.05934548, "balance_loss_mlp": 1.03928304, "epoch": 0.060363745678641216, "flos": 20377187418240.0, "grad_norm": 2.3150099602993155, "language_loss": 0.70349169, "learning_rate": 3.990337217233437e-06, "loss": 0.72601682, "num_input_tokens_seen": 21481990, "step": 1004, "time_per_iteration": 2.6947617530822754 }, { "auxiliary_loss_clip": 0.01212815, "auxiliary_loss_mlp": 0.01077454, "balance_loss_clip": 1.06629324, "balance_loss_mlp": 1.05168116, "epoch": 0.06042386893130918, "flos": 17749172753280.0, "grad_norm": 2.276868338025253, "language_loss": 0.83444524, "learning_rate": 3.990298941894976e-06, "loss": 0.85734791, "num_input_tokens_seen": 21500385, "step": 1005, "time_per_iteration": 2.581683397293091 }, { "auxiliary_loss_clip": 0.01077621, "auxiliary_loss_mlp": 0.01004707, "balance_loss_clip": 1.02541244, "balance_loss_mlp": 1.00029612, "epoch": 0.06048399218397715, "flos": 68538496872960.0, "grad_norm": 0.903813421793838, "language_loss": 0.59018111, "learning_rate": 3.9902605910841794e-06, "loss": 0.61100447, "num_input_tokens_seen": 21561040, "step": 1006, "time_per_iteration": 3.222104787826538 }, { "auxiliary_loss_clip": 0.01183553, "auxiliary_loss_mlp": 0.01059038, "balance_loss_clip": 1.05334234, "balance_loss_mlp": 1.03284812, "epoch": 0.060544115436645125, "flos": 23258515772160.0, "grad_norm": 2.1584333764290853, "language_loss": 0.74229443, "learning_rate": 3.990222164802503e-06, "loss": 0.76472032, "num_input_tokens_seen": 21580655, "step": 1007, "time_per_iteration": 2.7130653858184814 }, { "auxiliary_loss_clip": 0.0119408, "auxiliary_loss_mlp": 0.01060431, "balance_loss_clip": 1.06008601, "balance_loss_mlp": 1.03493261, "epoch": 0.06060423868931309, "flos": 23878441624320.0, "grad_norm": 1.7956876298455304, "language_loss": 0.8081426, "learning_rate": 3.9901836630514006e-06, "loss": 0.8306877, "num_input_tokens_seen": 21599650, "step": 1008, "time_per_iteration": 2.7151994705200195 }, { "auxiliary_loss_clip": 0.01175291, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.05982351, "balance_loss_mlp": 1.0305717, "epoch": 0.06066436194198106, "flos": 18728061171840.0, "grad_norm": 2.3069524306559837, "language_loss": 0.78198558, "learning_rate": 3.990145085832335e-06, "loss": 0.8043021, "num_input_tokens_seen": 21617550, "step": 1009, "time_per_iteration": 2.7313599586486816 }, { "auxiliary_loss_clip": 0.01194621, "auxiliary_loss_mlp": 0.01061233, "balance_loss_clip": 1.06150866, "balance_loss_mlp": 1.03726041, "epoch": 0.06072448519464903, "flos": 24640465680000.0, "grad_norm": 1.7452257697216769, "language_loss": 0.93148172, "learning_rate": 3.990106433146769e-06, "loss": 0.95404023, "num_input_tokens_seen": 21635865, "step": 1010, "time_per_iteration": 2.7233662605285645 }, { "auxiliary_loss_clip": 0.01148246, "auxiliary_loss_mlp": 0.00784144, "balance_loss_clip": 1.05304599, "balance_loss_mlp": 1.00029802, "epoch": 0.060784608447317, "flos": 17378825575680.0, "grad_norm": 2.9999367504779517, "language_loss": 0.72022474, "learning_rate": 3.9900677049961665e-06, "loss": 0.73954868, "num_input_tokens_seen": 21653945, "step": 1011, "time_per_iteration": 2.804858446121216 }, { "auxiliary_loss_clip": 0.01194231, "auxiliary_loss_mlp": 0.01077344, "balance_loss_clip": 1.05968046, "balance_loss_mlp": 1.04868615, "epoch": 0.06084473169998497, "flos": 23692208584320.0, "grad_norm": 1.9573218215833301, "language_loss": 0.87526691, "learning_rate": 3.990028901381999e-06, "loss": 0.89798272, "num_input_tokens_seen": 21671230, "step": 1012, "time_per_iteration": 2.6466245651245117 }, { "auxiliary_loss_clip": 0.01184459, "auxiliary_loss_mlp": 0.01064264, "balance_loss_clip": 1.05652905, "balance_loss_mlp": 1.03838325, "epoch": 0.06090485495265294, "flos": 23546339452800.0, "grad_norm": 1.9062230938156723, "language_loss": 0.76947677, "learning_rate": 3.989990022305734e-06, "loss": 0.79196405, "num_input_tokens_seen": 21691155, "step": 1013, "time_per_iteration": 4.297588586807251 }, { "auxiliary_loss_clip": 0.01207383, "auxiliary_loss_mlp": 0.00783488, "balance_loss_clip": 1.06573224, "balance_loss_mlp": 1.00034499, "epoch": 0.06096497820532091, "flos": 20339301548160.0, "grad_norm": 2.441711811862119, "language_loss": 0.86151874, "learning_rate": 3.98995106776885e-06, "loss": 0.88142747, "num_input_tokens_seen": 21707405, "step": 1014, "time_per_iteration": 4.301488637924194 }, { "auxiliary_loss_clip": 0.0121503, "auxiliary_loss_mlp": 0.01072817, "balance_loss_clip": 1.06605387, "balance_loss_mlp": 1.04508948, "epoch": 0.061025101457988874, "flos": 26939035779840.0, "grad_norm": 2.4309754772209184, "language_loss": 0.73197287, "learning_rate": 3.98991203777282e-06, "loss": 0.75485134, "num_input_tokens_seen": 21728090, "step": 1015, "time_per_iteration": 4.384514808654785 }, { "auxiliary_loss_clip": 0.01187374, "auxiliary_loss_mlp": 0.01068593, "balance_loss_clip": 1.06084347, "balance_loss_mlp": 1.04228365, "epoch": 0.061085224710656846, "flos": 25375054723200.0, "grad_norm": 1.5896529502124837, "language_loss": 0.79109907, "learning_rate": 3.9898729323191275e-06, "loss": 0.81365877, "num_input_tokens_seen": 21747950, "step": 1016, "time_per_iteration": 4.3249351978302 }, { "auxiliary_loss_clip": 0.01173015, "auxiliary_loss_mlp": 0.0105746, "balance_loss_clip": 1.06036103, "balance_loss_mlp": 1.03249741, "epoch": 0.06114534796332482, "flos": 24824759385600.0, "grad_norm": 1.6772682648410928, "language_loss": 0.76014191, "learning_rate": 3.989833751409254e-06, "loss": 0.78244662, "num_input_tokens_seen": 21767900, "step": 1017, "time_per_iteration": 2.7983243465423584 }, { "auxiliary_loss_clip": 0.01188817, "auxiliary_loss_mlp": 0.01074603, "balance_loss_clip": 1.06584609, "balance_loss_mlp": 1.0483532, "epoch": 0.061205471215992784, "flos": 20631434860800.0, "grad_norm": 2.001716657382839, "language_loss": 0.85798436, "learning_rate": 3.989794495044685e-06, "loss": 0.88061857, "num_input_tokens_seen": 21787375, "step": 1018, "time_per_iteration": 2.702399253845215 }, { "auxiliary_loss_clip": 0.01174344, "auxiliary_loss_mlp": 0.01069438, "balance_loss_clip": 1.06325769, "balance_loss_mlp": 1.04231787, "epoch": 0.061265594468660756, "flos": 16508351381760.0, "grad_norm": 2.9546929267460813, "language_loss": 0.76985347, "learning_rate": 3.989755163226909e-06, "loss": 0.79229128, "num_input_tokens_seen": 21806275, "step": 1019, "time_per_iteration": 2.780104875564575 }, { "auxiliary_loss_clip": 0.01160861, "auxiliary_loss_mlp": 0.0106141, "balance_loss_clip": 1.05355084, "balance_loss_mlp": 1.03511262, "epoch": 0.06132571772132872, "flos": 26246211275520.0, "grad_norm": 2.1848809329980288, "language_loss": 0.84122044, "learning_rate": 3.989715755957418e-06, "loss": 0.86344314, "num_input_tokens_seen": 21826430, "step": 1020, "time_per_iteration": 2.785963535308838 }, { "auxiliary_loss_clip": 0.01198473, "auxiliary_loss_mlp": 0.01063342, "balance_loss_clip": 1.06365371, "balance_loss_mlp": 1.03604269, "epoch": 0.06138584097399669, "flos": 37414788768000.0, "grad_norm": 1.933053672026977, "language_loss": 0.79114467, "learning_rate": 3.989676273237705e-06, "loss": 0.81376278, "num_input_tokens_seen": 21847800, "step": 1021, "time_per_iteration": 2.7968955039978027 }, { "auxiliary_loss_clip": 0.01189659, "auxiliary_loss_mlp": 0.01064044, "balance_loss_clip": 1.06159925, "balance_loss_mlp": 1.04114437, "epoch": 0.061445964226664665, "flos": 17420661941760.0, "grad_norm": 2.089525934673828, "language_loss": 0.87768298, "learning_rate": 3.9896367150692705e-06, "loss": 0.90022004, "num_input_tokens_seen": 21863385, "step": 1022, "time_per_iteration": 2.70906138420105 }, { "auxiliary_loss_clip": 0.01198737, "auxiliary_loss_mlp": 0.0106635, "balance_loss_clip": 1.06627858, "balance_loss_mlp": 1.04079151, "epoch": 0.06150608747933263, "flos": 22600021691520.0, "grad_norm": 1.7284486983379121, "language_loss": 0.82892007, "learning_rate": 3.989597081453611e-06, "loss": 0.85157096, "num_input_tokens_seen": 21881880, "step": 1023, "time_per_iteration": 2.71539568901062 }, { "auxiliary_loss_clip": 0.01100664, "auxiliary_loss_mlp": 0.01010751, "balance_loss_clip": 1.03727341, "balance_loss_mlp": 1.00614953, "epoch": 0.0615662107320006, "flos": 56741482005120.0, "grad_norm": 0.8894752517384502, "language_loss": 0.6505782, "learning_rate": 3.989557372392231e-06, "loss": 0.67169237, "num_input_tokens_seen": 21940550, "step": 1024, "time_per_iteration": 3.175217628479004 }, { "auxiliary_loss_clip": 0.01167458, "auxiliary_loss_mlp": 0.01073669, "balance_loss_clip": 1.05906856, "balance_loss_mlp": 1.04553604, "epoch": 0.06162633398466857, "flos": 22564793427840.0, "grad_norm": 2.320347485789288, "language_loss": 0.88069236, "learning_rate": 3.989517587886636e-06, "loss": 0.90310359, "num_input_tokens_seen": 21958390, "step": 1025, "time_per_iteration": 2.690725564956665 }, { "auxiliary_loss_clip": 0.01197064, "auxiliary_loss_mlp": 0.01066504, "balance_loss_clip": 1.06452, "balance_loss_mlp": 1.04173219, "epoch": 0.06168645723733654, "flos": 25593104234880.0, "grad_norm": 2.5217294712155414, "language_loss": 0.84536898, "learning_rate": 3.989477727938335e-06, "loss": 0.86800468, "num_input_tokens_seen": 21978625, "step": 1026, "time_per_iteration": 2.7420806884765625 }, { "auxiliary_loss_clip": 0.01160797, "auxiliary_loss_mlp": 0.0107525, "balance_loss_clip": 1.05669701, "balance_loss_mlp": 1.04934609, "epoch": 0.06174658049000451, "flos": 15997917162240.0, "grad_norm": 2.354014397396182, "language_loss": 0.8228389, "learning_rate": 3.989437792548839e-06, "loss": 0.84519935, "num_input_tokens_seen": 21996035, "step": 1027, "time_per_iteration": 2.6683874130249023 }, { "auxiliary_loss_clip": 0.01160199, "auxiliary_loss_mlp": 0.01067253, "balance_loss_clip": 1.06181073, "balance_loss_mlp": 1.04232645, "epoch": 0.06180670374267248, "flos": 11285970117120.0, "grad_norm": 4.43492107605727, "language_loss": 0.83898664, "learning_rate": 3.989397781719663e-06, "loss": 0.86126107, "num_input_tokens_seen": 22011625, "step": 1028, "time_per_iteration": 2.705387592315674 }, { "auxiliary_loss_clip": 0.0106503, "auxiliary_loss_mlp": 0.01008074, "balance_loss_clip": 1.02410197, "balance_loss_mlp": 1.00347257, "epoch": 0.06186682699534045, "flos": 65130142216320.0, "grad_norm": 0.9383255649985517, "language_loss": 0.604738, "learning_rate": 3.989357695452323e-06, "loss": 0.62546903, "num_input_tokens_seen": 22066035, "step": 1029, "time_per_iteration": 3.0268616676330566 }, { "auxiliary_loss_clip": 0.01176182, "auxiliary_loss_mlp": 0.01074173, "balance_loss_clip": 1.05641246, "balance_loss_mlp": 1.04737473, "epoch": 0.061926950248008414, "flos": 21105742976640.0, "grad_norm": 4.246634693563946, "language_loss": 0.82589179, "learning_rate": 3.98931753374834e-06, "loss": 0.84839535, "num_input_tokens_seen": 22085015, "step": 1030, "time_per_iteration": 2.7035892009735107 }, { "auxiliary_loss_clip": 0.0122298, "auxiliary_loss_mlp": 0.01077745, "balance_loss_clip": 1.06850278, "balance_loss_mlp": 1.05185235, "epoch": 0.061987073500676386, "flos": 17748454481280.0, "grad_norm": 2.585240230669548, "language_loss": 0.79576576, "learning_rate": 3.989277296609237e-06, "loss": 0.81877303, "num_input_tokens_seen": 22102775, "step": 1031, "time_per_iteration": 2.60622501373291 }, { "auxiliary_loss_clip": 0.01188957, "auxiliary_loss_mlp": 0.01076754, "balance_loss_clip": 1.06396544, "balance_loss_mlp": 1.04982424, "epoch": 0.06204719675334436, "flos": 21836237869440.0, "grad_norm": 1.8815476991595563, "language_loss": 0.77384412, "learning_rate": 3.98923698403654e-06, "loss": 0.79650116, "num_input_tokens_seen": 22121680, "step": 1032, "time_per_iteration": 2.6753971576690674 }, { "auxiliary_loss_clip": 0.01198757, "auxiliary_loss_mlp": 0.01074736, "balance_loss_clip": 1.05916619, "balance_loss_mlp": 1.04848623, "epoch": 0.06210732000601232, "flos": 19353697286400.0, "grad_norm": 3.147941025479245, "language_loss": 0.89323574, "learning_rate": 3.989196596031776e-06, "loss": 0.91597068, "num_input_tokens_seen": 22138155, "step": 1033, "time_per_iteration": 2.7313079833984375 }, { "auxiliary_loss_clip": 0.01209161, "auxiliary_loss_mlp": 0.01066082, "balance_loss_clip": 1.06214237, "balance_loss_mlp": 1.04119134, "epoch": 0.062167443258680295, "flos": 24749382695040.0, "grad_norm": 2.1035343880884145, "language_loss": 0.8455385, "learning_rate": 3.989156132596479e-06, "loss": 0.8682909, "num_input_tokens_seen": 22157420, "step": 1034, "time_per_iteration": 2.7541439533233643 }, { "auxiliary_loss_clip": 0.01180042, "auxiliary_loss_mlp": 0.01057312, "balance_loss_clip": 1.05896068, "balance_loss_mlp": 1.03155136, "epoch": 0.06222756651134827, "flos": 34458478773120.0, "grad_norm": 1.8983498110529735, "language_loss": 0.8082794, "learning_rate": 3.989115593732182e-06, "loss": 0.83065289, "num_input_tokens_seen": 22178620, "step": 1035, "time_per_iteration": 2.7965424060821533 }, { "auxiliary_loss_clip": 0.01158806, "auxiliary_loss_mlp": 0.01072478, "balance_loss_clip": 1.05936599, "balance_loss_mlp": 1.04432034, "epoch": 0.06228768976401623, "flos": 25666469763840.0, "grad_norm": 2.145216314952277, "language_loss": 0.78365827, "learning_rate": 3.989074979440421e-06, "loss": 0.80597103, "num_input_tokens_seen": 22197125, "step": 1036, "time_per_iteration": 2.7858450412750244 }, { "auxiliary_loss_clip": 0.01192097, "auxiliary_loss_mlp": 0.01071382, "balance_loss_clip": 1.05977845, "balance_loss_mlp": 1.04663444, "epoch": 0.062347813016684205, "flos": 25295619795840.0, "grad_norm": 1.9535870339716077, "language_loss": 0.86544567, "learning_rate": 3.989034289722739e-06, "loss": 0.88808048, "num_input_tokens_seen": 22217575, "step": 1037, "time_per_iteration": 2.685373306274414 }, { "auxiliary_loss_clip": 0.01197778, "auxiliary_loss_mlp": 0.01057095, "balance_loss_clip": 1.06127763, "balance_loss_mlp": 1.02966499, "epoch": 0.06240793626935217, "flos": 26907039740160.0, "grad_norm": 2.697396725345887, "language_loss": 0.8067717, "learning_rate": 3.988993524580676e-06, "loss": 0.82932043, "num_input_tokens_seen": 22236840, "step": 1038, "time_per_iteration": 2.7305831909179688 }, { "auxiliary_loss_clip": 0.01145896, "auxiliary_loss_mlp": 0.01072721, "balance_loss_clip": 1.05226004, "balance_loss_mlp": 1.04330015, "epoch": 0.06246805952202014, "flos": 21615782146560.0, "grad_norm": 1.8888526922505675, "language_loss": 0.85465872, "learning_rate": 3.98895268401578e-06, "loss": 0.87684488, "num_input_tokens_seen": 22256465, "step": 1039, "time_per_iteration": 2.7351109981536865 }, { "auxiliary_loss_clip": 0.01188545, "auxiliary_loss_mlp": 0.01070323, "balance_loss_clip": 1.05834138, "balance_loss_mlp": 1.04472923, "epoch": 0.0625281827746881, "flos": 19311896833920.0, "grad_norm": 2.217895985816133, "language_loss": 0.81172895, "learning_rate": 3.9889117680296e-06, "loss": 0.83431756, "num_input_tokens_seen": 22274025, "step": 1040, "time_per_iteration": 2.6532907485961914 }, { "auxiliary_loss_clip": 0.0121654, "auxiliary_loss_mlp": 0.0106312, "balance_loss_clip": 1.06718016, "balance_loss_mlp": 1.03808582, "epoch": 0.06258830602735609, "flos": 27745769289600.0, "grad_norm": 2.1960038080149817, "language_loss": 0.69304991, "learning_rate": 3.988870776623685e-06, "loss": 0.71584648, "num_input_tokens_seen": 22292245, "step": 1041, "time_per_iteration": 2.6445486545562744 }, { "auxiliary_loss_clip": 0.01214659, "auxiliary_loss_mlp": 0.01057975, "balance_loss_clip": 1.06247008, "balance_loss_mlp": 1.03182077, "epoch": 0.06264842928002405, "flos": 23222605150080.0, "grad_norm": 2.7326158002445, "language_loss": 0.81187552, "learning_rate": 3.9888297097995905e-06, "loss": 0.83460188, "num_input_tokens_seen": 22311455, "step": 1042, "time_per_iteration": 2.6111559867858887 }, { "auxiliary_loss_clip": 0.01211653, "auxiliary_loss_mlp": 0.01052676, "balance_loss_clip": 1.06253886, "balance_loss_mlp": 1.02871442, "epoch": 0.06270855253269202, "flos": 38399495189760.0, "grad_norm": 1.7165873820424848, "language_loss": 0.76349056, "learning_rate": 3.988788567558874e-06, "loss": 0.78613389, "num_input_tokens_seen": 22333750, "step": 1043, "time_per_iteration": 2.761768341064453 }, { "auxiliary_loss_clip": 0.0118944, "auxiliary_loss_mlp": 0.01063189, "balance_loss_clip": 1.06111181, "balance_loss_mlp": 1.03912091, "epoch": 0.06276867578535998, "flos": 22453542028800.0, "grad_norm": 8.34017761542712, "language_loss": 0.92031956, "learning_rate": 3.988747349903097e-06, "loss": 0.94284582, "num_input_tokens_seen": 22351940, "step": 1044, "time_per_iteration": 2.636179208755493 }, { "auxiliary_loss_clip": 0.01192566, "auxiliary_loss_mlp": 0.01070128, "balance_loss_clip": 1.05862689, "balance_loss_mlp": 1.0456785, "epoch": 0.06282879903802796, "flos": 22930435923840.0, "grad_norm": 2.3486674311430944, "language_loss": 0.85913992, "learning_rate": 3.988706056833821e-06, "loss": 0.88176692, "num_input_tokens_seen": 22372085, "step": 1045, "time_per_iteration": 2.7749502658843994 }, { "auxiliary_loss_clip": 0.01179197, "auxiliary_loss_mlp": 0.01065176, "balance_loss_clip": 1.05804443, "balance_loss_mlp": 1.04053521, "epoch": 0.06288892229069593, "flos": 34819237019520.0, "grad_norm": 1.9846122850853416, "language_loss": 0.7796576, "learning_rate": 3.9886646883526125e-06, "loss": 0.80210131, "num_input_tokens_seen": 22392020, "step": 1046, "time_per_iteration": 2.803135871887207 }, { "auxiliary_loss_clip": 0.01197344, "auxiliary_loss_mlp": 0.01069269, "balance_loss_clip": 1.06361508, "balance_loss_mlp": 1.04558206, "epoch": 0.06294904554336389, "flos": 19427134642560.0, "grad_norm": 2.174325060947129, "language_loss": 0.77326387, "learning_rate": 3.988623244461039e-06, "loss": 0.79592997, "num_input_tokens_seen": 22411180, "step": 1047, "time_per_iteration": 2.647446632385254 }, { "auxiliary_loss_clip": 0.01200907, "auxiliary_loss_mlp": 0.0105799, "balance_loss_clip": 1.06238222, "balance_loss_mlp": 1.03314662, "epoch": 0.06300916879603187, "flos": 40661867358720.0, "grad_norm": 2.4899372640825046, "language_loss": 0.77190751, "learning_rate": 3.988581725160672e-06, "loss": 0.79449654, "num_input_tokens_seen": 22435105, "step": 1048, "time_per_iteration": 2.8167293071746826 }, { "auxiliary_loss_clip": 0.0118184, "auxiliary_loss_mlp": 0.01064361, "balance_loss_clip": 1.0613215, "balance_loss_mlp": 1.03914821, "epoch": 0.06306929204869983, "flos": 23804142341760.0, "grad_norm": 4.606540291271834, "language_loss": 0.77258086, "learning_rate": 3.988540130453087e-06, "loss": 0.79504287, "num_input_tokens_seen": 22452710, "step": 1049, "time_per_iteration": 2.6908538341522217 }, { "auxiliary_loss_clip": 0.01194538, "auxiliary_loss_mlp": 0.0105682, "balance_loss_clip": 1.06043661, "balance_loss_mlp": 1.03290701, "epoch": 0.0631294153013678, "flos": 18915802583040.0, "grad_norm": 2.515998307474139, "language_loss": 0.83302009, "learning_rate": 3.988498460339862e-06, "loss": 0.85553372, "num_input_tokens_seen": 22470175, "step": 1050, "time_per_iteration": 2.62186861038208 }, { "auxiliary_loss_clip": 0.01210654, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.06468701, "balance_loss_mlp": 1.04008913, "epoch": 0.06318953855403578, "flos": 24280174310400.0, "grad_norm": 5.5478202090132065, "language_loss": 0.76564771, "learning_rate": 3.988456714822575e-06, "loss": 0.78838319, "num_input_tokens_seen": 22490020, "step": 1051, "time_per_iteration": 2.732269525527954 }, { "auxiliary_loss_clip": 0.01188416, "auxiliary_loss_mlp": 0.01069443, "balance_loss_clip": 1.06340146, "balance_loss_mlp": 1.04492211, "epoch": 0.06324966180670374, "flos": 22528918719360.0, "grad_norm": 1.9993900469270787, "language_loss": 0.80410004, "learning_rate": 3.98841489390281e-06, "loss": 0.82667863, "num_input_tokens_seen": 22509685, "step": 1052, "time_per_iteration": 2.7683873176574707 }, { "auxiliary_loss_clip": 0.01211333, "auxiliary_loss_mlp": 0.01058255, "balance_loss_clip": 1.06324601, "balance_loss_mlp": 1.03468728, "epoch": 0.06330978505937171, "flos": 15778107884160.0, "grad_norm": 2.370007457349547, "language_loss": 0.77433288, "learning_rate": 3.988372997582155e-06, "loss": 0.79702866, "num_input_tokens_seen": 22527905, "step": 1053, "time_per_iteration": 5.757168531417847 }, { "auxiliary_loss_clip": 0.01190721, "auxiliary_loss_mlp": 0.00780448, "balance_loss_clip": 1.06378174, "balance_loss_mlp": 1.00028598, "epoch": 0.06336990831203967, "flos": 21471098163840.0, "grad_norm": 3.085258828985267, "language_loss": 0.84931248, "learning_rate": 3.988331025862195e-06, "loss": 0.86902416, "num_input_tokens_seen": 22546335, "step": 1054, "time_per_iteration": 2.7733829021453857 }, { "auxiliary_loss_clip": 0.01172281, "auxiliary_loss_mlp": 0.01061232, "balance_loss_clip": 1.05722666, "balance_loss_mlp": 1.03753328, "epoch": 0.06343003156470765, "flos": 18478877546880.0, "grad_norm": 2.0168531459993435, "language_loss": 0.85884213, "learning_rate": 3.9882889787445225e-06, "loss": 0.88117731, "num_input_tokens_seen": 22563885, "step": 1055, "time_per_iteration": 4.490305185317993 }, { "auxiliary_loss_clip": 0.01164237, "auxiliary_loss_mlp": 0.01069785, "balance_loss_clip": 1.05727792, "balance_loss_mlp": 1.04534709, "epoch": 0.06349015481737562, "flos": 25154886309120.0, "grad_norm": 2.4509218988768, "language_loss": 0.8113938, "learning_rate": 3.988246856230734e-06, "loss": 0.83373404, "num_input_tokens_seen": 22583035, "step": 1056, "time_per_iteration": 5.345282793045044 }, { "auxiliary_loss_clip": 0.01144181, "auxiliary_loss_mlp": 0.01061125, "balance_loss_clip": 1.04991364, "balance_loss_mlp": 1.03449368, "epoch": 0.06355027807004358, "flos": 26871775562880.0, "grad_norm": 2.2117272688527128, "language_loss": 0.81083393, "learning_rate": 3.988204658322426e-06, "loss": 0.83288693, "num_input_tokens_seen": 22605055, "step": 1057, "time_per_iteration": 2.866757392883301 }, { "auxiliary_loss_clip": 0.01139076, "auxiliary_loss_mlp": 0.01061742, "balance_loss_clip": 1.04970908, "balance_loss_mlp": 1.03918755, "epoch": 0.06361040132271156, "flos": 21396691140480.0, "grad_norm": 1.9636971972870172, "language_loss": 0.83353591, "learning_rate": 3.988162385021196e-06, "loss": 0.85554409, "num_input_tokens_seen": 22623760, "step": 1058, "time_per_iteration": 2.767024278640747 }, { "auxiliary_loss_clip": 0.0117752, "auxiliary_loss_mlp": 0.01059639, "balance_loss_clip": 1.0576936, "balance_loss_mlp": 1.03408027, "epoch": 0.06367052457537953, "flos": 25733765894400.0, "grad_norm": 2.137077300251244, "language_loss": 0.87556928, "learning_rate": 3.988120036328651e-06, "loss": 0.89794087, "num_input_tokens_seen": 22643000, "step": 1059, "time_per_iteration": 2.794734239578247 }, { "auxiliary_loss_clip": 0.01169658, "auxiliary_loss_mlp": 0.01063463, "balance_loss_clip": 1.06196678, "balance_loss_mlp": 1.0383693, "epoch": 0.0637306478280475, "flos": 17631420992640.0, "grad_norm": 2.543966627588717, "language_loss": 0.91561133, "learning_rate": 3.988077612246394e-06, "loss": 0.93794256, "num_input_tokens_seen": 22660460, "step": 1060, "time_per_iteration": 2.8223626613616943 }, { "auxiliary_loss_clip": 0.01173933, "auxiliary_loss_mlp": 0.01065151, "balance_loss_clip": 1.05715585, "balance_loss_mlp": 1.03981876, "epoch": 0.06379077108071547, "flos": 13662610427520.0, "grad_norm": 1.9401711052692647, "language_loss": 0.87242293, "learning_rate": 3.988035112776035e-06, "loss": 0.89481378, "num_input_tokens_seen": 22679270, "step": 1061, "time_per_iteration": 2.7783865928649902 }, { "auxiliary_loss_clip": 0.01190039, "auxiliary_loss_mlp": 0.01059971, "balance_loss_clip": 1.05976009, "balance_loss_mlp": 1.03388786, "epoch": 0.06385089433338344, "flos": 28478849961600.0, "grad_norm": 5.360593029379932, "language_loss": 0.77407908, "learning_rate": 3.987992537919185e-06, "loss": 0.79657912, "num_input_tokens_seen": 22699330, "step": 1062, "time_per_iteration": 2.872587203979492 }, { "auxiliary_loss_clip": 0.01172912, "auxiliary_loss_mlp": 0.01061175, "balance_loss_clip": 1.05884075, "balance_loss_mlp": 1.03798842, "epoch": 0.0639110175860514, "flos": 24311057028480.0, "grad_norm": 2.2658654128491436, "language_loss": 0.86522883, "learning_rate": 3.987949887677459e-06, "loss": 0.88756967, "num_input_tokens_seen": 22717945, "step": 1063, "time_per_iteration": 2.7915029525756836 }, { "auxiliary_loss_clip": 0.01207773, "auxiliary_loss_mlp": 0.01062698, "balance_loss_clip": 1.05969334, "balance_loss_mlp": 1.03846335, "epoch": 0.06397114083871938, "flos": 22090772620800.0, "grad_norm": 2.302236346678267, "language_loss": 0.79908657, "learning_rate": 3.9879071620524744e-06, "loss": 0.82179129, "num_input_tokens_seen": 22736790, "step": 1064, "time_per_iteration": 2.6880991458892822 }, { "auxiliary_loss_clip": 0.01198826, "auxiliary_loss_mlp": 0.01066465, "balance_loss_clip": 1.0603801, "balance_loss_mlp": 1.04149103, "epoch": 0.06403126409138735, "flos": 19572824206080.0, "grad_norm": 3.1552731138796215, "language_loss": 0.84327948, "learning_rate": 3.987864361045851e-06, "loss": 0.8659324, "num_input_tokens_seen": 22754745, "step": 1065, "time_per_iteration": 2.6956398487091064 }, { "auxiliary_loss_clip": 0.01168098, "auxiliary_loss_mlp": 0.01054905, "balance_loss_clip": 1.0597136, "balance_loss_mlp": 1.03162324, "epoch": 0.06409138734405531, "flos": 40807413267840.0, "grad_norm": 1.52830872536012, "language_loss": 0.68177885, "learning_rate": 3.987821484659211e-06, "loss": 0.70400894, "num_input_tokens_seen": 22776780, "step": 1066, "time_per_iteration": 2.9867773056030273 }, { "auxiliary_loss_clip": 0.01214184, "auxiliary_loss_mlp": 0.01070649, "balance_loss_clip": 1.06780005, "balance_loss_mlp": 1.04609215, "epoch": 0.06415151059672328, "flos": 20441610460800.0, "grad_norm": 1.8546001537284342, "language_loss": 0.90349269, "learning_rate": 3.987778532894181e-06, "loss": 0.926341, "num_input_tokens_seen": 22793915, "step": 1067, "time_per_iteration": 2.685896873474121 }, { "auxiliary_loss_clip": 0.01188134, "auxiliary_loss_mlp": 0.01063022, "balance_loss_clip": 1.0623709, "balance_loss_mlp": 1.03969264, "epoch": 0.06421163384939126, "flos": 18072045129600.0, "grad_norm": 2.189788428445167, "language_loss": 0.83437371, "learning_rate": 3.987735505752391e-06, "loss": 0.85688531, "num_input_tokens_seen": 22812670, "step": 1068, "time_per_iteration": 2.851602554321289 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01057745, "balance_loss_clip": 1.05909026, "balance_loss_mlp": 1.03426039, "epoch": 0.06427175710205922, "flos": 25119442563840.0, "grad_norm": 3.045176948020938, "language_loss": 0.89311272, "learning_rate": 3.987692403235471e-06, "loss": 0.9154799, "num_input_tokens_seen": 22832440, "step": 1069, "time_per_iteration": 2.7825255393981934 }, { "auxiliary_loss_clip": 0.01185672, "auxiliary_loss_mlp": 0.01071834, "balance_loss_clip": 1.06158304, "balance_loss_mlp": 1.04663301, "epoch": 0.06433188035472719, "flos": 17380549428480.0, "grad_norm": 2.7038488706194808, "language_loss": 0.95759481, "learning_rate": 3.987649225345056e-06, "loss": 0.98016989, "num_input_tokens_seen": 22845495, "step": 1070, "time_per_iteration": 2.715296506881714 }, { "auxiliary_loss_clip": 0.01140792, "auxiliary_loss_mlp": 0.01056718, "balance_loss_clip": 1.05607581, "balance_loss_mlp": 1.03027749, "epoch": 0.06439200360739517, "flos": 23546267625600.0, "grad_norm": 1.630790580283393, "language_loss": 0.8811003, "learning_rate": 3.987605972082782e-06, "loss": 0.90307534, "num_input_tokens_seen": 22865390, "step": 1071, "time_per_iteration": 2.8445394039154053 }, { "auxiliary_loss_clip": 0.01155172, "auxiliary_loss_mlp": 0.01054986, "balance_loss_clip": 1.05483651, "balance_loss_mlp": 1.03102481, "epoch": 0.06445212686006313, "flos": 21979772616960.0, "grad_norm": 1.8349443396730127, "language_loss": 0.76116478, "learning_rate": 3.987562643450292e-06, "loss": 0.78326637, "num_input_tokens_seen": 22885495, "step": 1072, "time_per_iteration": 2.8330819606781006 }, { "auxiliary_loss_clip": 0.01172997, "auxiliary_loss_mlp": 0.01070104, "balance_loss_clip": 1.05975842, "balance_loss_mlp": 1.04362798, "epoch": 0.0645122501127311, "flos": 25921291824000.0, "grad_norm": 2.724283900767911, "language_loss": 0.80849886, "learning_rate": 3.987519239449226e-06, "loss": 0.83092993, "num_input_tokens_seen": 22904845, "step": 1073, "time_per_iteration": 2.748286247253418 }, { "auxiliary_loss_clip": 0.01194712, "auxiliary_loss_mlp": 0.01062452, "balance_loss_clip": 1.06345201, "balance_loss_mlp": 1.03825283, "epoch": 0.06457237336539907, "flos": 25626034028160.0, "grad_norm": 5.0538746884234245, "language_loss": 0.80282539, "learning_rate": 3.987475760081233e-06, "loss": 0.82539707, "num_input_tokens_seen": 22925940, "step": 1074, "time_per_iteration": 2.7482337951660156 }, { "auxiliary_loss_clip": 0.01173366, "auxiliary_loss_mlp": 0.01057774, "balance_loss_clip": 1.05920076, "balance_loss_mlp": 1.03256142, "epoch": 0.06463249661806704, "flos": 19463979018240.0, "grad_norm": 2.0209756517373707, "language_loss": 0.79249811, "learning_rate": 3.987432205347958e-06, "loss": 0.8148095, "num_input_tokens_seen": 22944375, "step": 1075, "time_per_iteration": 2.6937224864959717 }, { "auxiliary_loss_clip": 0.01171297, "auxiliary_loss_mlp": 0.01063569, "balance_loss_clip": 1.05735481, "balance_loss_mlp": 1.04025126, "epoch": 0.064692619870735, "flos": 24498044254080.0, "grad_norm": 2.9028991302223357, "language_loss": 0.88208115, "learning_rate": 3.987388575251055e-06, "loss": 0.90442967, "num_input_tokens_seen": 22959145, "step": 1076, "time_per_iteration": 2.878103256225586 }, { "auxiliary_loss_clip": 0.01192915, "auxiliary_loss_mlp": 0.01052877, "balance_loss_clip": 1.06164443, "balance_loss_mlp": 1.0288558, "epoch": 0.06475274312340297, "flos": 17018677860480.0, "grad_norm": 2.225760792581628, "language_loss": 0.80876106, "learning_rate": 3.98734486979218e-06, "loss": 0.83121902, "num_input_tokens_seen": 22978100, "step": 1077, "time_per_iteration": 2.7221076488494873 }, { "auxiliary_loss_clip": 0.01200466, "auxiliary_loss_mlp": 0.01064019, "balance_loss_clip": 1.0656153, "balance_loss_mlp": 1.03866291, "epoch": 0.06481286637607095, "flos": 24572379450240.0, "grad_norm": 2.256787147683815, "language_loss": 0.91727465, "learning_rate": 3.987301088972986e-06, "loss": 0.93991947, "num_input_tokens_seen": 22997285, "step": 1078, "time_per_iteration": 2.862365484237671 }, { "auxiliary_loss_clip": 0.0122435, "auxiliary_loss_mlp": 0.01060225, "balance_loss_clip": 1.06826639, "balance_loss_mlp": 1.03552508, "epoch": 0.06487298962873891, "flos": 21105635235840.0, "grad_norm": 2.080056711608912, "language_loss": 0.78349572, "learning_rate": 3.987257232795137e-06, "loss": 0.80634147, "num_input_tokens_seen": 23016285, "step": 1079, "time_per_iteration": 2.6435368061065674 }, { "auxiliary_loss_clip": 0.01156927, "auxiliary_loss_mlp": 0.01063794, "balance_loss_clip": 1.05512071, "balance_loss_mlp": 1.03899896, "epoch": 0.06493311288140688, "flos": 24608182331520.0, "grad_norm": 2.274862403364013, "language_loss": 0.68702769, "learning_rate": 3.987213301260294e-06, "loss": 0.70923495, "num_input_tokens_seen": 23036420, "step": 1080, "time_per_iteration": 2.7782626152038574 }, { "auxiliary_loss_clip": 0.01175684, "auxiliary_loss_mlp": 0.01062351, "balance_loss_clip": 1.06640029, "balance_loss_mlp": 1.03610086, "epoch": 0.06499323613407486, "flos": 25337994865920.0, "grad_norm": 1.886196453243775, "language_loss": 0.72291583, "learning_rate": 3.987169294370123e-06, "loss": 0.74529618, "num_input_tokens_seen": 23056945, "step": 1081, "time_per_iteration": 2.7983880043029785 }, { "auxiliary_loss_clip": 0.01139671, "auxiliary_loss_mlp": 0.01066686, "balance_loss_clip": 1.0504055, "balance_loss_mlp": 1.04076982, "epoch": 0.06505335938674282, "flos": 20375714960640.0, "grad_norm": 3.3093934650613566, "language_loss": 0.84059012, "learning_rate": 3.987125212126294e-06, "loss": 0.86265367, "num_input_tokens_seen": 23074940, "step": 1082, "time_per_iteration": 2.8351900577545166 }, { "auxiliary_loss_clip": 0.01204185, "auxiliary_loss_mlp": 0.01063692, "balance_loss_clip": 1.06306195, "balance_loss_mlp": 1.03809738, "epoch": 0.06511348263941079, "flos": 25337923038720.0, "grad_norm": 2.894360492506304, "language_loss": 0.82550305, "learning_rate": 3.987081054530478e-06, "loss": 0.84818184, "num_input_tokens_seen": 23093420, "step": 1083, "time_per_iteration": 2.866729974746704 }, { "auxiliary_loss_clip": 0.01168245, "auxiliary_loss_mlp": 0.01062938, "balance_loss_clip": 1.06021011, "balance_loss_mlp": 1.03655696, "epoch": 0.06517360589207877, "flos": 20332801186560.0, "grad_norm": 2.468736383036802, "language_loss": 0.79289383, "learning_rate": 3.987036821584348e-06, "loss": 0.81520569, "num_input_tokens_seen": 23111550, "step": 1084, "time_per_iteration": 2.816601276397705 }, { "auxiliary_loss_clip": 0.01174068, "auxiliary_loss_mlp": 0.0106167, "balance_loss_clip": 1.05854714, "balance_loss_mlp": 1.03667152, "epoch": 0.06523372914474673, "flos": 31681650061440.0, "grad_norm": 2.571590277205686, "language_loss": 0.66443276, "learning_rate": 3.986992513289584e-06, "loss": 0.68679011, "num_input_tokens_seen": 23130335, "step": 1085, "time_per_iteration": 2.8260092735290527 }, { "auxiliary_loss_clip": 0.01170818, "auxiliary_loss_mlp": 0.01062435, "balance_loss_clip": 1.0600934, "balance_loss_mlp": 1.03833067, "epoch": 0.0652938523974147, "flos": 20778165918720.0, "grad_norm": 2.0478791529086977, "language_loss": 0.76548934, "learning_rate": 3.9869481296478645e-06, "loss": 0.78782183, "num_input_tokens_seen": 23152380, "step": 1086, "time_per_iteration": 2.7937023639678955 }, { "auxiliary_loss_clip": 0.01198609, "auxiliary_loss_mlp": 0.01059288, "balance_loss_clip": 1.06335294, "balance_loss_mlp": 1.03519547, "epoch": 0.06535397565008266, "flos": 16690993061760.0, "grad_norm": 2.1629448601391017, "language_loss": 0.85109925, "learning_rate": 3.986903670660872e-06, "loss": 0.87367821, "num_input_tokens_seen": 23171630, "step": 1087, "time_per_iteration": 2.7510013580322266 }, { "auxiliary_loss_clip": 0.01184978, "auxiliary_loss_mlp": 0.01059017, "balance_loss_clip": 1.06293821, "balance_loss_mlp": 1.03510392, "epoch": 0.06541409890275064, "flos": 26868220116480.0, "grad_norm": 1.7886353193129139, "language_loss": 0.77776635, "learning_rate": 3.9868591363302945e-06, "loss": 0.80020636, "num_input_tokens_seen": 23192520, "step": 1088, "time_per_iteration": 2.7792751789093018 }, { "auxiliary_loss_clip": 0.01192707, "auxiliary_loss_mlp": 0.01067634, "balance_loss_clip": 1.06569457, "balance_loss_mlp": 1.04498422, "epoch": 0.06547422215541861, "flos": 20521620005760.0, "grad_norm": 3.0334087154373375, "language_loss": 0.71050513, "learning_rate": 3.9868145266578186e-06, "loss": 0.73310852, "num_input_tokens_seen": 23210710, "step": 1089, "time_per_iteration": 2.8832852840423584 }, { "auxiliary_loss_clip": 0.01173663, "auxiliary_loss_mlp": 0.00781529, "balance_loss_clip": 1.06159782, "balance_loss_mlp": 1.00019014, "epoch": 0.06553434540808657, "flos": 22016616992640.0, "grad_norm": 2.02973275746688, "language_loss": 0.85650897, "learning_rate": 3.9867698416451366e-06, "loss": 0.87606084, "num_input_tokens_seen": 23230305, "step": 1090, "time_per_iteration": 2.7933149337768555 }, { "auxiliary_loss_clip": 0.01214666, "auxiliary_loss_mlp": 0.0105885, "balance_loss_clip": 1.06735325, "balance_loss_mlp": 1.03460288, "epoch": 0.06559446866075455, "flos": 24608649208320.0, "grad_norm": 2.137212216289862, "language_loss": 0.71829313, "learning_rate": 3.9867250812939434e-06, "loss": 0.74102825, "num_input_tokens_seen": 23249015, "step": 1091, "time_per_iteration": 2.646592855453491 }, { "auxiliary_loss_clip": 0.01121055, "auxiliary_loss_mlp": 0.0106405, "balance_loss_clip": 1.05242276, "balance_loss_mlp": 1.03961205, "epoch": 0.06565459191342252, "flos": 24274679529600.0, "grad_norm": 2.2773849385721956, "language_loss": 0.82839823, "learning_rate": 3.986680245605936e-06, "loss": 0.85024923, "num_input_tokens_seen": 23265105, "step": 1092, "time_per_iteration": 4.799649715423584 }, { "auxiliary_loss_clip": 0.01215092, "auxiliary_loss_mlp": 0.01059151, "balance_loss_clip": 1.0640471, "balance_loss_mlp": 1.03352082, "epoch": 0.06571471516609048, "flos": 24787124910720.0, "grad_norm": 2.268968080418226, "language_loss": 0.71134168, "learning_rate": 3.986635334582814e-06, "loss": 0.73408413, "num_input_tokens_seen": 23283950, "step": 1093, "time_per_iteration": 5.3356239795684814 }, { "auxiliary_loss_clip": 0.01190682, "auxiliary_loss_mlp": 0.01064498, "balance_loss_clip": 1.06751943, "balance_loss_mlp": 1.0392611, "epoch": 0.06577483841875846, "flos": 26214071581440.0, "grad_norm": 3.829837904337144, "language_loss": 0.87996346, "learning_rate": 3.986590348226282e-06, "loss": 0.90251523, "num_input_tokens_seen": 23305005, "step": 1094, "time_per_iteration": 2.853489637374878 }, { "auxiliary_loss_clip": 0.01192742, "auxiliary_loss_mlp": 0.01065068, "balance_loss_clip": 1.06367433, "balance_loss_mlp": 1.03843689, "epoch": 0.06583496167142643, "flos": 25080802508160.0, "grad_norm": 1.6736216436017588, "language_loss": 0.81483954, "learning_rate": 3.986545286538044e-06, "loss": 0.8374176, "num_input_tokens_seen": 23323220, "step": 1095, "time_per_iteration": 5.1613922119140625 }, { "auxiliary_loss_clip": 0.01166049, "auxiliary_loss_mlp": 0.01058945, "balance_loss_clip": 1.06295943, "balance_loss_mlp": 1.03598547, "epoch": 0.06589508492409439, "flos": 25629804956160.0, "grad_norm": 2.0200125290673068, "language_loss": 0.69789279, "learning_rate": 3.986500149519811e-06, "loss": 0.72014272, "num_input_tokens_seen": 23342235, "step": 1096, "time_per_iteration": 2.804025173187256 }, { "auxiliary_loss_clip": 0.01201939, "auxiliary_loss_mlp": 0.01070786, "balance_loss_clip": 1.06405246, "balance_loss_mlp": 1.04614568, "epoch": 0.06595520817676236, "flos": 23621249266560.0, "grad_norm": 1.7011375462517908, "language_loss": 0.77430046, "learning_rate": 3.986454937173292e-06, "loss": 0.79702777, "num_input_tokens_seen": 23363680, "step": 1097, "time_per_iteration": 2.7658958435058594 }, { "auxiliary_loss_clip": 0.01215996, "auxiliary_loss_mlp": 0.01063445, "balance_loss_clip": 1.06707537, "balance_loss_mlp": 1.03959155, "epoch": 0.06601533142943034, "flos": 33801708545280.0, "grad_norm": 1.8316558452843608, "language_loss": 0.78217584, "learning_rate": 3.986409649500203e-06, "loss": 0.80497026, "num_input_tokens_seen": 23385590, "step": 1098, "time_per_iteration": 2.865684747695923 }, { "auxiliary_loss_clip": 0.01197349, "auxiliary_loss_mlp": 0.01069192, "balance_loss_clip": 1.06328607, "balance_loss_mlp": 1.04443276, "epoch": 0.0660754546820983, "flos": 20259184262400.0, "grad_norm": 1.9237510259783663, "language_loss": 0.81525648, "learning_rate": 3.986364286502261e-06, "loss": 0.83792192, "num_input_tokens_seen": 23402945, "step": 1099, "time_per_iteration": 2.690377950668335 }, { "auxiliary_loss_clip": 0.01179995, "auxiliary_loss_mlp": 0.0105819, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03428841, "epoch": 0.06613557793476627, "flos": 19354164163200.0, "grad_norm": 1.9906927310803755, "language_loss": 0.82793295, "learning_rate": 3.986318848181186e-06, "loss": 0.8503148, "num_input_tokens_seen": 23421410, "step": 1100, "time_per_iteration": 2.7613909244537354 }, { "auxiliary_loss_clip": 0.01191263, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.06985724, "balance_loss_mlp": 1.03529549, "epoch": 0.06619570118743424, "flos": 13772568936960.0, "grad_norm": 2.079994286400427, "language_loss": 0.73502243, "learning_rate": 3.986273334538702e-06, "loss": 0.75752538, "num_input_tokens_seen": 23438870, "step": 1101, "time_per_iteration": 2.7795870304107666 }, { "auxiliary_loss_clip": 0.01199256, "auxiliary_loss_mlp": 0.01061171, "balance_loss_clip": 1.06278944, "balance_loss_mlp": 1.03773487, "epoch": 0.06625582444010221, "flos": 17857874286720.0, "grad_norm": 2.875757629612747, "language_loss": 0.85861301, "learning_rate": 3.986227745576533e-06, "loss": 0.88121736, "num_input_tokens_seen": 23456975, "step": 1102, "time_per_iteration": 2.737269401550293 }, { "auxiliary_loss_clip": 0.01191982, "auxiliary_loss_mlp": 0.01058639, "balance_loss_clip": 1.06898165, "balance_loss_mlp": 1.03410578, "epoch": 0.06631594769277017, "flos": 11838707579520.0, "grad_norm": 2.8924251757501778, "language_loss": 0.81655926, "learning_rate": 3.98618208129641e-06, "loss": 0.83906543, "num_input_tokens_seen": 23473440, "step": 1103, "time_per_iteration": 2.9345293045043945 }, { "auxiliary_loss_clip": 0.01203522, "auxiliary_loss_mlp": 0.00780451, "balance_loss_clip": 1.06721628, "balance_loss_mlp": 1.00042021, "epoch": 0.06637607094543815, "flos": 19793351756160.0, "grad_norm": 5.176370819061919, "language_loss": 0.81749105, "learning_rate": 3.986136341700063e-06, "loss": 0.83733076, "num_input_tokens_seen": 23493880, "step": 1104, "time_per_iteration": 2.753657102584839 }, { "auxiliary_loss_clip": 0.0116508, "auxiliary_loss_mlp": 0.01050687, "balance_loss_clip": 1.0576005, "balance_loss_mlp": 1.02608228, "epoch": 0.06643619419810612, "flos": 25485659677440.0, "grad_norm": 1.5448539486038575, "language_loss": 0.80422902, "learning_rate": 3.986090526789227e-06, "loss": 0.82638663, "num_input_tokens_seen": 23514920, "step": 1105, "time_per_iteration": 2.8904521465301514 }, { "auxiliary_loss_clip": 0.01179397, "auxiliary_loss_mlp": 0.0106197, "balance_loss_clip": 1.06348729, "balance_loss_mlp": 1.0391891, "epoch": 0.06649631745077408, "flos": 16946533393920.0, "grad_norm": 2.7426455725749896, "language_loss": 0.96762037, "learning_rate": 3.986044636565639e-06, "loss": 0.99003398, "num_input_tokens_seen": 23531635, "step": 1106, "time_per_iteration": 2.890073299407959 }, { "auxiliary_loss_clip": 0.01198065, "auxiliary_loss_mlp": 0.01059975, "balance_loss_clip": 1.06069684, "balance_loss_mlp": 1.03511953, "epoch": 0.06655644070344206, "flos": 17858592558720.0, "grad_norm": 1.9297768479693453, "language_loss": 0.82528949, "learning_rate": 3.985998671031039e-06, "loss": 0.84786987, "num_input_tokens_seen": 23551020, "step": 1107, "time_per_iteration": 2.778857469558716 }, { "auxiliary_loss_clip": 0.01104176, "auxiliary_loss_mlp": 0.01010935, "balance_loss_clip": 1.04708242, "balance_loss_mlp": 1.0072155, "epoch": 0.06661656395611003, "flos": 61419350021760.0, "grad_norm": 0.7967940032222198, "language_loss": 0.56789279, "learning_rate": 3.9859526301871705e-06, "loss": 0.58904392, "num_input_tokens_seen": 23610675, "step": 1108, "time_per_iteration": 3.2717819213867188 }, { "auxiliary_loss_clip": 0.0118327, "auxiliary_loss_mlp": 0.01062625, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.0376507, "epoch": 0.066676687208778, "flos": 20662856282880.0, "grad_norm": 2.682842555407744, "language_loss": 0.7287578, "learning_rate": 3.9859065140357795e-06, "loss": 0.75121677, "num_input_tokens_seen": 23628710, "step": 1109, "time_per_iteration": 2.829623222351074 }, { "auxiliary_loss_clip": 0.01148971, "auxiliary_loss_mlp": 0.01071895, "balance_loss_clip": 1.05459642, "balance_loss_mlp": 1.04714715, "epoch": 0.06673681046144596, "flos": 20923280864640.0, "grad_norm": 1.7914435942805436, "language_loss": 0.78140426, "learning_rate": 3.985860322578614e-06, "loss": 0.80361295, "num_input_tokens_seen": 23649160, "step": 1110, "time_per_iteration": 2.892786741256714 }, { "auxiliary_loss_clip": 0.01153553, "auxiliary_loss_mlp": 0.0106147, "balance_loss_clip": 1.05590594, "balance_loss_mlp": 1.03700781, "epoch": 0.06679693371411394, "flos": 31065818359680.0, "grad_norm": 2.5260725451831805, "language_loss": 0.71425366, "learning_rate": 3.985814055817427e-06, "loss": 0.73640382, "num_input_tokens_seen": 23671995, "step": 1111, "time_per_iteration": 2.9349052906036377 }, { "auxiliary_loss_clip": 0.01170538, "auxiliary_loss_mlp": 0.01066103, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.04199934, "epoch": 0.0668570569667819, "flos": 21726135705600.0, "grad_norm": 1.8396663794990693, "language_loss": 0.78767776, "learning_rate": 3.985767713753971e-06, "loss": 0.81004417, "num_input_tokens_seen": 23690705, "step": 1112, "time_per_iteration": 2.8676345348358154 }, { "auxiliary_loss_clip": 0.01153291, "auxiliary_loss_mlp": 0.01065421, "balance_loss_clip": 1.05340791, "balance_loss_mlp": 1.04163861, "epoch": 0.06691718021944987, "flos": 22747255539840.0, "grad_norm": 2.071048188460824, "language_loss": 0.78481978, "learning_rate": 3.985721296390005e-06, "loss": 0.80700684, "num_input_tokens_seen": 23709990, "step": 1113, "time_per_iteration": 2.8688411712646484 }, { "auxiliary_loss_clip": 0.0114872, "auxiliary_loss_mlp": 0.01057074, "balance_loss_clip": 1.05157375, "balance_loss_mlp": 1.03376842, "epoch": 0.06697730347211785, "flos": 16545626720640.0, "grad_norm": 1.7560007918285245, "language_loss": 0.82399213, "learning_rate": 3.985674803727289e-06, "loss": 0.84605002, "num_input_tokens_seen": 23728485, "step": 1114, "time_per_iteration": 2.832458019256592 }, { "auxiliary_loss_clip": 0.01075626, "auxiliary_loss_mlp": 0.01006906, "balance_loss_clip": 1.04995251, "balance_loss_mlp": 1.00271022, "epoch": 0.06703742672478581, "flos": 59782326658560.0, "grad_norm": 0.8370646888074905, "language_loss": 0.58147323, "learning_rate": 3.985628235767584e-06, "loss": 0.60229862, "num_input_tokens_seen": 23786650, "step": 1115, "time_per_iteration": 3.550837755203247 }, { "auxiliary_loss_clip": 0.01177193, "auxiliary_loss_mlp": 0.01059174, "balance_loss_clip": 1.05986214, "balance_loss_mlp": 1.03381801, "epoch": 0.06709754997745378, "flos": 16800197385600.0, "grad_norm": 2.8944873563712235, "language_loss": 0.91280693, "learning_rate": 3.985581592512658e-06, "loss": 0.93517065, "num_input_tokens_seen": 23802555, "step": 1116, "time_per_iteration": 2.994608163833618 }, { "auxiliary_loss_clip": 0.01169376, "auxiliary_loss_mlp": 0.0078227, "balance_loss_clip": 1.05839634, "balance_loss_mlp": 1.00045347, "epoch": 0.06715767323012176, "flos": 22123917895680.0, "grad_norm": 1.9249158333763592, "language_loss": 0.87154609, "learning_rate": 3.985534873964279e-06, "loss": 0.89106256, "num_input_tokens_seen": 23822945, "step": 1117, "time_per_iteration": 2.794400453567505 }, { "auxiliary_loss_clip": 0.01095782, "auxiliary_loss_mlp": 0.01003785, "balance_loss_clip": 1.0387876, "balance_loss_mlp": 0.99963647, "epoch": 0.06721779648278972, "flos": 66618100137600.0, "grad_norm": 0.8644388721740246, "language_loss": 0.5981611, "learning_rate": 3.985488080124218e-06, "loss": 0.61915678, "num_input_tokens_seen": 23874075, "step": 1118, "time_per_iteration": 3.1695809364318848 }, { "auxiliary_loss_clip": 0.01178972, "auxiliary_loss_mlp": 0.01051993, "balance_loss_clip": 1.05301392, "balance_loss_mlp": 1.02780545, "epoch": 0.06727791973545769, "flos": 22382474970240.0, "grad_norm": 3.6923711141076447, "language_loss": 0.83045954, "learning_rate": 3.985441210994251e-06, "loss": 0.85276914, "num_input_tokens_seen": 23889720, "step": 1119, "time_per_iteration": 2.7538814544677734 }, { "auxiliary_loss_clip": 0.01182384, "auxiliary_loss_mlp": 0.01058422, "balance_loss_clip": 1.06102347, "balance_loss_mlp": 1.03566504, "epoch": 0.06733804298812565, "flos": 24280210224000.0, "grad_norm": 4.541743494234462, "language_loss": 0.8451674, "learning_rate": 3.9853942665761545e-06, "loss": 0.86757541, "num_input_tokens_seen": 23909385, "step": 1120, "time_per_iteration": 2.76581072807312 }, { "auxiliary_loss_clip": 0.0121565, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.06757379, "balance_loss_mlp": 1.04028773, "epoch": 0.06739816624079363, "flos": 15918230839680.0, "grad_norm": 2.503866645162978, "language_loss": 0.78722781, "learning_rate": 3.985347246871708e-06, "loss": 0.81003344, "num_input_tokens_seen": 23926830, "step": 1121, "time_per_iteration": 2.651175022125244 }, { "auxiliary_loss_clip": 0.01080914, "auxiliary_loss_mlp": 0.01011889, "balance_loss_clip": 1.03108025, "balance_loss_mlp": 1.00802636, "epoch": 0.0674582894934616, "flos": 71398567353600.0, "grad_norm": 0.7540288133642103, "language_loss": 0.58320796, "learning_rate": 3.985300151882694e-06, "loss": 0.60413599, "num_input_tokens_seen": 23992640, "step": 1122, "time_per_iteration": 3.3794541358947754 }, { "auxiliary_loss_clip": 0.01145486, "auxiliary_loss_mlp": 0.01066136, "balance_loss_clip": 1.05581403, "balance_loss_mlp": 1.04167438, "epoch": 0.06751841274612956, "flos": 25264952559360.0, "grad_norm": 2.3361170394687076, "language_loss": 0.71965349, "learning_rate": 3.985252981610901e-06, "loss": 0.74176967, "num_input_tokens_seen": 24011135, "step": 1123, "time_per_iteration": 2.8049354553222656 }, { "auxiliary_loss_clip": 0.01144994, "auxiliary_loss_mlp": 0.01064196, "balance_loss_clip": 1.05373979, "balance_loss_mlp": 1.03612232, "epoch": 0.06757853599879754, "flos": 23802741711360.0, "grad_norm": 1.7380479869896208, "language_loss": 0.78987843, "learning_rate": 3.985205736058114e-06, "loss": 0.81197035, "num_input_tokens_seen": 24030695, "step": 1124, "time_per_iteration": 2.8595056533813477 }, { "auxiliary_loss_clip": 0.01189686, "auxiliary_loss_mlp": 0.01055169, "balance_loss_clip": 1.05663013, "balance_loss_mlp": 1.03200674, "epoch": 0.0676386592514655, "flos": 21033742164480.0, "grad_norm": 3.1450673626590793, "language_loss": 0.70999855, "learning_rate": 3.985158415226128e-06, "loss": 0.73244709, "num_input_tokens_seen": 24050680, "step": 1125, "time_per_iteration": 2.726163625717163 }, { "auxiliary_loss_clip": 0.01165518, "auxiliary_loss_mlp": 0.01068918, "balance_loss_clip": 1.05826426, "balance_loss_mlp": 1.04290628, "epoch": 0.06769878250413347, "flos": 25556331686400.0, "grad_norm": 3.340323364887528, "language_loss": 0.81440383, "learning_rate": 3.985111019116736e-06, "loss": 0.83674812, "num_input_tokens_seen": 24067205, "step": 1126, "time_per_iteration": 2.7356598377227783 }, { "auxiliary_loss_clip": 0.0107201, "auxiliary_loss_mlp": 0.01004999, "balance_loss_clip": 1.0293622, "balance_loss_mlp": 1.00092208, "epoch": 0.06775890575680145, "flos": 70655251305600.0, "grad_norm": 0.77802311726495, "language_loss": 0.59720373, "learning_rate": 3.985063547731735e-06, "loss": 0.6179738, "num_input_tokens_seen": 24131320, "step": 1127, "time_per_iteration": 3.2627320289611816 }, { "auxiliary_loss_clip": 0.01206438, "auxiliary_loss_mlp": 0.01055509, "balance_loss_clip": 1.06308687, "balance_loss_mlp": 1.03189397, "epoch": 0.06781902900946941, "flos": 24235500769920.0, "grad_norm": 2.2535941175889054, "language_loss": 0.81097019, "learning_rate": 3.985016001072925e-06, "loss": 0.83358967, "num_input_tokens_seen": 24149930, "step": 1128, "time_per_iteration": 2.6652371883392334 }, { "auxiliary_loss_clip": 0.01158345, "auxiliary_loss_mlp": 0.01052658, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.02804112, "epoch": 0.06787915226213738, "flos": 22417523665920.0, "grad_norm": 2.24200367657907, "language_loss": 0.75559127, "learning_rate": 3.984968379142109e-06, "loss": 0.77770138, "num_input_tokens_seen": 24169590, "step": 1129, "time_per_iteration": 2.7023732662200928 }, { "auxiliary_loss_clip": 0.01117595, "auxiliary_loss_mlp": 0.01053995, "balance_loss_clip": 1.04627228, "balance_loss_mlp": 1.03006983, "epoch": 0.06793927551480534, "flos": 37706922080640.0, "grad_norm": 1.890559803272908, "language_loss": 0.71710479, "learning_rate": 3.984920681941094e-06, "loss": 0.73882067, "num_input_tokens_seen": 24189965, "step": 1130, "time_per_iteration": 3.0757689476013184 }, { "auxiliary_loss_clip": 0.01158117, "auxiliary_loss_mlp": 0.010592, "balance_loss_clip": 1.05734515, "balance_loss_mlp": 1.03481019, "epoch": 0.06799939876747332, "flos": 20631398947200.0, "grad_norm": 2.24421862356218, "language_loss": 0.80776262, "learning_rate": 3.984872909471688e-06, "loss": 0.82993579, "num_input_tokens_seen": 24208045, "step": 1131, "time_per_iteration": 5.00832724571228 }, { "auxiliary_loss_clip": 0.01195331, "auxiliary_loss_mlp": 0.01070142, "balance_loss_clip": 1.06155944, "balance_loss_mlp": 1.04614532, "epoch": 0.06805952202014129, "flos": 14864755829760.0, "grad_norm": 2.0533244923502463, "language_loss": 0.80371779, "learning_rate": 3.984825061735701e-06, "loss": 0.8263725, "num_input_tokens_seen": 24223805, "step": 1132, "time_per_iteration": 4.487931251525879 }, { "auxiliary_loss_clip": 0.01170581, "auxiliary_loss_mlp": 0.01061867, "balance_loss_clip": 1.05438542, "balance_loss_mlp": 1.03756022, "epoch": 0.06811964527280925, "flos": 48909434947200.0, "grad_norm": 1.7182324226465766, "language_loss": 0.6341064, "learning_rate": 3.9847771387349495e-06, "loss": 0.65643084, "num_input_tokens_seen": 24249475, "step": 1133, "time_per_iteration": 4.48089337348938 }, { "auxiliary_loss_clip": 0.01125599, "auxiliary_loss_mlp": 0.01055984, "balance_loss_clip": 1.04700482, "balance_loss_mlp": 1.02973366, "epoch": 0.06817976852547723, "flos": 15377273038080.0, "grad_norm": 1.9264963116598819, "language_loss": 0.74771935, "learning_rate": 3.9847291404712506e-06, "loss": 0.76953518, "num_input_tokens_seen": 24267980, "step": 1134, "time_per_iteration": 5.287277936935425 }, { "auxiliary_loss_clip": 0.01169269, "auxiliary_loss_mlp": 0.00782536, "balance_loss_clip": 1.05878353, "balance_loss_mlp": 1.00042605, "epoch": 0.0682398917781452, "flos": 20155690200960.0, "grad_norm": 2.151108605399924, "language_loss": 0.86871451, "learning_rate": 3.984681066946423e-06, "loss": 0.88823259, "num_input_tokens_seen": 24286805, "step": 1135, "time_per_iteration": 2.8024110794067383 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.007818, "balance_loss_clip": 1.0543226, "balance_loss_mlp": 1.00046515, "epoch": 0.06830001503081316, "flos": 23440618748160.0, "grad_norm": 2.521942237810997, "language_loss": 0.78131735, "learning_rate": 3.984632918162291e-06, "loss": 0.80090094, "num_input_tokens_seen": 24305855, "step": 1136, "time_per_iteration": 2.7595040798187256 }, { "auxiliary_loss_clip": 0.01185832, "auxiliary_loss_mlp": 0.01063587, "balance_loss_clip": 1.05952621, "balance_loss_mlp": 1.03868449, "epoch": 0.06836013828348114, "flos": 34349813153280.0, "grad_norm": 2.275643110468061, "language_loss": 0.83968467, "learning_rate": 3.984584694120679e-06, "loss": 0.86217892, "num_input_tokens_seen": 24326535, "step": 1137, "time_per_iteration": 2.7738285064697266 }, { "auxiliary_loss_clip": 0.01153105, "auxiliary_loss_mlp": 0.01059471, "balance_loss_clip": 1.05239427, "balance_loss_mlp": 1.0348897, "epoch": 0.06842026153614911, "flos": 23148844571520.0, "grad_norm": 2.068206081593879, "language_loss": 0.788486, "learning_rate": 3.984536394823418e-06, "loss": 0.81061178, "num_input_tokens_seen": 24345810, "step": 1138, "time_per_iteration": 2.804537296295166 }, { "auxiliary_loss_clip": 0.01209658, "auxiliary_loss_mlp": 0.01058353, "balance_loss_clip": 1.06288362, "balance_loss_mlp": 1.03415346, "epoch": 0.06848038478881707, "flos": 24608972430720.0, "grad_norm": 2.3335265924104096, "language_loss": 0.85507643, "learning_rate": 3.984488020272336e-06, "loss": 0.87775654, "num_input_tokens_seen": 24366095, "step": 1139, "time_per_iteration": 2.746884822845459 }, { "auxiliary_loss_clip": 0.01153855, "auxiliary_loss_mlp": 0.01063721, "balance_loss_clip": 1.05325532, "balance_loss_mlp": 1.03679228, "epoch": 0.06854050804148504, "flos": 40880994278400.0, "grad_norm": 1.9254794009430078, "language_loss": 0.74899161, "learning_rate": 3.984439570469271e-06, "loss": 0.7711674, "num_input_tokens_seen": 24388665, "step": 1140, "time_per_iteration": 2.938143253326416 }, { "auxiliary_loss_clip": 0.01186218, "auxiliary_loss_mlp": 0.00782227, "balance_loss_clip": 1.06101704, "balance_loss_mlp": 1.00036597, "epoch": 0.06860063129415302, "flos": 31686354743040.0, "grad_norm": 2.1250887020504767, "language_loss": 0.68258876, "learning_rate": 3.9843910454160574e-06, "loss": 0.70227319, "num_input_tokens_seen": 24407705, "step": 1141, "time_per_iteration": 2.8180530071258545 }, { "auxiliary_loss_clip": 0.01197117, "auxiliary_loss_mlp": 0.01067748, "balance_loss_clip": 1.05978489, "balance_loss_mlp": 1.04266596, "epoch": 0.06866075454682098, "flos": 26542007775360.0, "grad_norm": 1.8460768582410394, "language_loss": 0.78959155, "learning_rate": 3.984342445114538e-06, "loss": 0.81224018, "num_input_tokens_seen": 24428390, "step": 1142, "time_per_iteration": 2.712876558303833 }, { "auxiliary_loss_clip": 0.01186915, "auxiliary_loss_mlp": 0.01060882, "balance_loss_clip": 1.06245089, "balance_loss_mlp": 1.03702831, "epoch": 0.06872087779948895, "flos": 29789768724480.0, "grad_norm": 1.7867268614306446, "language_loss": 0.68287402, "learning_rate": 3.984293769566553e-06, "loss": 0.70535195, "num_input_tokens_seen": 24450810, "step": 1143, "time_per_iteration": 2.752659320831299 }, { "auxiliary_loss_clip": 0.01177843, "auxiliary_loss_mlp": 0.01059894, "balance_loss_clip": 1.05798244, "balance_loss_mlp": 1.03773308, "epoch": 0.06878100105215693, "flos": 26941118768640.0, "grad_norm": 1.7582250309313294, "language_loss": 0.74307454, "learning_rate": 3.98424501877395e-06, "loss": 0.76545191, "num_input_tokens_seen": 24469965, "step": 1144, "time_per_iteration": 2.6448662281036377 }, { "auxiliary_loss_clip": 0.01189197, "auxiliary_loss_mlp": 0.0106544, "balance_loss_clip": 1.0565474, "balance_loss_mlp": 1.04039407, "epoch": 0.06884112430482489, "flos": 10670748946560.0, "grad_norm": 2.699041414372958, "language_loss": 0.91755033, "learning_rate": 3.984196192738577e-06, "loss": 0.94009674, "num_input_tokens_seen": 24486370, "step": 1145, "time_per_iteration": 2.6621482372283936 }, { "auxiliary_loss_clip": 0.01212189, "auxiliary_loss_mlp": 0.0106819, "balance_loss_clip": 1.06225932, "balance_loss_mlp": 1.04258406, "epoch": 0.06890124755749286, "flos": 20193647898240.0, "grad_norm": 2.2014676012481487, "language_loss": 0.81726635, "learning_rate": 3.984147291462285e-06, "loss": 0.84007025, "num_input_tokens_seen": 24503780, "step": 1146, "time_per_iteration": 2.623964548110962 }, { "auxiliary_loss_clip": 0.01204602, "auxiliary_loss_mlp": 0.01065301, "balance_loss_clip": 1.06215203, "balance_loss_mlp": 1.04191244, "epoch": 0.06896137081016084, "flos": 20449224144000.0, "grad_norm": 2.1265245828428108, "language_loss": 0.84968954, "learning_rate": 3.98409831494693e-06, "loss": 0.8723886, "num_input_tokens_seen": 24522320, "step": 1147, "time_per_iteration": 2.5898265838623047 }, { "auxiliary_loss_clip": 0.01156886, "auxiliary_loss_mlp": 0.01064453, "balance_loss_clip": 1.05563867, "balance_loss_mlp": 1.03949046, "epoch": 0.0690214940628288, "flos": 18368703555840.0, "grad_norm": 1.7557033260323716, "language_loss": 0.86094105, "learning_rate": 3.984049263194367e-06, "loss": 0.88315445, "num_input_tokens_seen": 24540445, "step": 1148, "time_per_iteration": 2.748782157897949 }, { "auxiliary_loss_clip": 0.01173365, "auxiliary_loss_mlp": 0.01060047, "balance_loss_clip": 1.05569541, "balance_loss_mlp": 1.03370178, "epoch": 0.06908161731549677, "flos": 20558033418240.0, "grad_norm": 2.322434023005448, "language_loss": 0.69602191, "learning_rate": 3.9840001362064575e-06, "loss": 0.71835601, "num_input_tokens_seen": 24557105, "step": 1149, "time_per_iteration": 2.741854429244995 }, { "auxiliary_loss_clip": 0.01207871, "auxiliary_loss_mlp": 0.01051245, "balance_loss_clip": 1.06034219, "balance_loss_mlp": 1.02692604, "epoch": 0.06914174056816474, "flos": 27563666313600.0, "grad_norm": 1.9440351937259064, "language_loss": 0.8374452, "learning_rate": 3.983950933985064e-06, "loss": 0.86003637, "num_input_tokens_seen": 24578240, "step": 1150, "time_per_iteration": 2.6919586658477783 }, { "auxiliary_loss_clip": 0.01181406, "auxiliary_loss_mlp": 0.01058015, "balance_loss_clip": 1.06063652, "balance_loss_mlp": 1.03380394, "epoch": 0.06920186382083271, "flos": 15304015249920.0, "grad_norm": 4.11905785776886, "language_loss": 0.81464434, "learning_rate": 3.983901656532052e-06, "loss": 0.83703858, "num_input_tokens_seen": 24593585, "step": 1151, "time_per_iteration": 2.7979934215545654 }, { "auxiliary_loss_clip": 0.01206831, "auxiliary_loss_mlp": 0.01058184, "balance_loss_clip": 1.06409955, "balance_loss_mlp": 1.03434169, "epoch": 0.06926198707350067, "flos": 25191227894400.0, "grad_norm": 2.0324362571668724, "language_loss": 0.85408235, "learning_rate": 3.983852303849291e-06, "loss": 0.87673247, "num_input_tokens_seen": 24613110, "step": 1152, "time_per_iteration": 2.686021089553833 }, { "auxiliary_loss_clip": 0.01190935, "auxiliary_loss_mlp": 0.01062076, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.03866374, "epoch": 0.06932211032616864, "flos": 13256137146240.0, "grad_norm": 2.182544196511779, "language_loss": 0.90594423, "learning_rate": 3.983802875938651e-06, "loss": 0.92847437, "num_input_tokens_seen": 24628795, "step": 1153, "time_per_iteration": 2.58366060256958 }, { "auxiliary_loss_clip": 0.01169877, "auxiliary_loss_mlp": 0.01055253, "balance_loss_clip": 1.05681062, "balance_loss_mlp": 1.03088629, "epoch": 0.06938223357883662, "flos": 24827381078400.0, "grad_norm": 2.1214794624630846, "language_loss": 0.81526846, "learning_rate": 3.983753372802008e-06, "loss": 0.83751976, "num_input_tokens_seen": 24645480, "step": 1154, "time_per_iteration": 2.696794271469116 }, { "auxiliary_loss_clip": 0.01188774, "auxiliary_loss_mlp": 0.01066335, "balance_loss_clip": 1.0691216, "balance_loss_mlp": 1.04200506, "epoch": 0.06944235683150458, "flos": 27267977554560.0, "grad_norm": 2.102018399986892, "language_loss": 0.75022292, "learning_rate": 3.983703794441237e-06, "loss": 0.77277398, "num_input_tokens_seen": 24664630, "step": 1155, "time_per_iteration": 2.7718143463134766 }, { "auxiliary_loss_clip": 0.01180696, "auxiliary_loss_mlp": 0.00782152, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.00041056, "epoch": 0.06950248008417255, "flos": 25808065176960.0, "grad_norm": 1.7459449483933205, "language_loss": 0.7110405, "learning_rate": 3.98365414085822e-06, "loss": 0.73066902, "num_input_tokens_seen": 24684210, "step": 1156, "time_per_iteration": 2.7014200687408447 }, { "auxiliary_loss_clip": 0.01179101, "auxiliary_loss_mlp": 0.00782674, "balance_loss_clip": 1.0593586, "balance_loss_mlp": 1.00037348, "epoch": 0.06956260333684053, "flos": 22271546793600.0, "grad_norm": 2.067241397655847, "language_loss": 0.74882817, "learning_rate": 3.98360441205484e-06, "loss": 0.76844591, "num_input_tokens_seen": 24702490, "step": 1157, "time_per_iteration": 2.7571897506713867 }, { "auxiliary_loss_clip": 0.01178249, "auxiliary_loss_mlp": 0.01061737, "balance_loss_clip": 1.05653787, "balance_loss_mlp": 1.03697729, "epoch": 0.0696227265895085, "flos": 29681390413440.0, "grad_norm": 1.9827644507913538, "language_loss": 0.7165724, "learning_rate": 3.983554608032982e-06, "loss": 0.73897225, "num_input_tokens_seen": 24724340, "step": 1158, "time_per_iteration": 2.839745044708252 }, { "auxiliary_loss_clip": 0.01207855, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.0605582, "balance_loss_mlp": 1.03370285, "epoch": 0.06968284984217646, "flos": 25523545547520.0, "grad_norm": 1.9692207215605615, "language_loss": 0.79595017, "learning_rate": 3.983504728794533e-06, "loss": 0.8186143, "num_input_tokens_seen": 24745550, "step": 1159, "time_per_iteration": 2.7535817623138428 }, { "auxiliary_loss_clip": 0.01212717, "auxiliary_loss_mlp": 0.01068535, "balance_loss_clip": 1.06535673, "balance_loss_mlp": 1.04094958, "epoch": 0.06974297309484444, "flos": 20698192287360.0, "grad_norm": 3.5530789367722373, "language_loss": 0.80517769, "learning_rate": 3.983454774341387e-06, "loss": 0.82799017, "num_input_tokens_seen": 24762575, "step": 1160, "time_per_iteration": 2.7455785274505615 }, { "auxiliary_loss_clip": 0.0119075, "auxiliary_loss_mlp": 0.01057887, "balance_loss_clip": 1.05680609, "balance_loss_mlp": 1.03294837, "epoch": 0.0698030963475124, "flos": 26505199313280.0, "grad_norm": 1.6303409062485206, "language_loss": 0.7607069, "learning_rate": 3.983404744675437e-06, "loss": 0.78319323, "num_input_tokens_seen": 24782605, "step": 1161, "time_per_iteration": 2.773775100708008 }, { "auxiliary_loss_clip": 0.01175787, "auxiliary_loss_mlp": 0.01062083, "balance_loss_clip": 1.05773759, "balance_loss_mlp": 1.03673923, "epoch": 0.06986321960018037, "flos": 23040430346880.0, "grad_norm": 1.6605796421434038, "language_loss": 0.82758528, "learning_rate": 3.9833546397985794e-06, "loss": 0.84996402, "num_input_tokens_seen": 24802910, "step": 1162, "time_per_iteration": 2.7426044940948486 }, { "auxiliary_loss_clip": 0.01182513, "auxiliary_loss_mlp": 0.01058124, "balance_loss_clip": 1.05717576, "balance_loss_mlp": 1.03092098, "epoch": 0.06992334285284833, "flos": 28584822061440.0, "grad_norm": 1.9523155091610094, "language_loss": 0.79563475, "learning_rate": 3.983304459712716e-06, "loss": 0.81804121, "num_input_tokens_seen": 24823305, "step": 1163, "time_per_iteration": 2.720947742462158 }, { "auxiliary_loss_clip": 0.01190519, "auxiliary_loss_mlp": 0.01063375, "balance_loss_clip": 1.05861616, "balance_loss_mlp": 1.03722012, "epoch": 0.06998346610551631, "flos": 20595344670720.0, "grad_norm": 2.213365660843382, "language_loss": 0.79187214, "learning_rate": 3.983254204419749e-06, "loss": 0.81441104, "num_input_tokens_seen": 24842155, "step": 1164, "time_per_iteration": 2.6554183959960938 }, { "auxiliary_loss_clip": 0.01143916, "auxiliary_loss_mlp": 0.01067459, "balance_loss_clip": 1.05240798, "balance_loss_mlp": 1.03875315, "epoch": 0.07004358935818428, "flos": 22528810978560.0, "grad_norm": 1.421930435008642, "language_loss": 0.72855628, "learning_rate": 3.983203873921583e-06, "loss": 0.75067008, "num_input_tokens_seen": 24862080, "step": 1165, "time_per_iteration": 2.753063440322876 }, { "auxiliary_loss_clip": 0.01183824, "auxiliary_loss_mlp": 0.01059612, "balance_loss_clip": 1.06135893, "balance_loss_mlp": 1.03522193, "epoch": 0.07010371261085224, "flos": 28949997680640.0, "grad_norm": 2.453348821242437, "language_loss": 0.81136239, "learning_rate": 3.983153468220128e-06, "loss": 0.83379674, "num_input_tokens_seen": 24886165, "step": 1166, "time_per_iteration": 2.802016496658325 }, { "auxiliary_loss_clip": 0.011718, "auxiliary_loss_mlp": 0.01053529, "balance_loss_clip": 1.05450797, "balance_loss_mlp": 1.02754176, "epoch": 0.07016383586352022, "flos": 23659171050240.0, "grad_norm": 2.457667377154448, "language_loss": 0.84640259, "learning_rate": 3.983102987317295e-06, "loss": 0.86865586, "num_input_tokens_seen": 24905775, "step": 1167, "time_per_iteration": 2.7066097259521484 }, { "auxiliary_loss_clip": 0.01193446, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.06136739, "balance_loss_mlp": 1.03887713, "epoch": 0.07022395911618819, "flos": 19792130693760.0, "grad_norm": 2.6158204436543, "language_loss": 0.89524722, "learning_rate": 3.983052431214997e-06, "loss": 0.91782373, "num_input_tokens_seen": 24924295, "step": 1168, "time_per_iteration": 2.6258392333984375 }, { "auxiliary_loss_clip": 0.01190821, "auxiliary_loss_mlp": 0.01065905, "balance_loss_clip": 1.06090224, "balance_loss_mlp": 1.03705645, "epoch": 0.07028408236885615, "flos": 21689147675520.0, "grad_norm": 2.6445150319591035, "language_loss": 0.89008862, "learning_rate": 3.983001799915153e-06, "loss": 0.91265589, "num_input_tokens_seen": 24943210, "step": 1169, "time_per_iteration": 2.6858527660369873 }, { "auxiliary_loss_clip": 0.01211063, "auxiliary_loss_mlp": 0.01065533, "balance_loss_clip": 1.06400895, "balance_loss_mlp": 1.03950977, "epoch": 0.07034420562152413, "flos": 25630271832960.0, "grad_norm": 1.9672897290124218, "language_loss": 0.83834457, "learning_rate": 3.982951093419681e-06, "loss": 0.86111057, "num_input_tokens_seen": 24960360, "step": 1170, "time_per_iteration": 2.6278069019317627 }, { "auxiliary_loss_clip": 0.01180333, "auxiliary_loss_mlp": 0.00782328, "balance_loss_clip": 1.0613637, "balance_loss_mlp": 1.00041986, "epoch": 0.0704043288741921, "flos": 20810449267200.0, "grad_norm": 1.8542795171503503, "language_loss": 0.75687242, "learning_rate": 3.982900311730506e-06, "loss": 0.77649903, "num_input_tokens_seen": 24978290, "step": 1171, "time_per_iteration": 5.806530475616455 }, { "auxiliary_loss_clip": 0.01179645, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06133175, "balance_loss_mlp": 1.03919196, "epoch": 0.07046445212686006, "flos": 25593176062080.0, "grad_norm": 2.482864122539831, "language_loss": 0.88865125, "learning_rate": 3.9828494548495514e-06, "loss": 0.91108704, "num_input_tokens_seen": 24997055, "step": 1172, "time_per_iteration": 4.371561288833618 }, { "auxiliary_loss_clip": 0.01197698, "auxiliary_loss_mlp": 0.01054991, "balance_loss_clip": 1.06532764, "balance_loss_mlp": 1.02858603, "epoch": 0.07052457537952803, "flos": 25556978131200.0, "grad_norm": 1.6816354314161714, "language_loss": 0.82075119, "learning_rate": 3.982798522778748e-06, "loss": 0.84327805, "num_input_tokens_seen": 25017490, "step": 1173, "time_per_iteration": 4.611542463302612 }, { "auxiliary_loss_clip": 0.01200886, "auxiliary_loss_mlp": 0.01060851, "balance_loss_clip": 1.06317592, "balance_loss_mlp": 1.03503036, "epoch": 0.070584698632196, "flos": 17968515154560.0, "grad_norm": 2.007232853627583, "language_loss": 0.82071686, "learning_rate": 3.9827475155200245e-06, "loss": 0.8433342, "num_input_tokens_seen": 25035660, "step": 1174, "time_per_iteration": 2.6334969997406006 }, { "auxiliary_loss_clip": 0.01180907, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.05857778, "balance_loss_mlp": 1.03473568, "epoch": 0.07064482188486397, "flos": 25370888745600.0, "grad_norm": 2.09222115072597, "language_loss": 0.85013211, "learning_rate": 3.982696433075317e-06, "loss": 0.87254095, "num_input_tokens_seen": 25054785, "step": 1175, "time_per_iteration": 2.861591339111328 }, { "auxiliary_loss_clip": 0.01196955, "auxiliary_loss_mlp": 0.01069941, "balance_loss_clip": 1.06447482, "balance_loss_mlp": 1.04605186, "epoch": 0.07070494513753194, "flos": 24899848767360.0, "grad_norm": 1.7270820646539309, "language_loss": 0.83103871, "learning_rate": 3.982645275446563e-06, "loss": 0.85370767, "num_input_tokens_seen": 25075180, "step": 1176, "time_per_iteration": 2.754521608352661 }, { "auxiliary_loss_clip": 0.01152261, "auxiliary_loss_mlp": 0.01062154, "balance_loss_clip": 1.05370057, "balance_loss_mlp": 1.0352838, "epoch": 0.07076506839019991, "flos": 22338447874560.0, "grad_norm": 3.4939498355716996, "language_loss": 0.74409902, "learning_rate": 3.982594042635701e-06, "loss": 0.7662431, "num_input_tokens_seen": 25093035, "step": 1177, "time_per_iteration": 2.692426919937134 }, { "auxiliary_loss_clip": 0.01188551, "auxiliary_loss_mlp": 0.0106394, "balance_loss_clip": 1.06080353, "balance_loss_mlp": 1.03801203, "epoch": 0.07082519164286788, "flos": 18660800954880.0, "grad_norm": 1.8240190288677762, "language_loss": 0.85965598, "learning_rate": 3.982542734644673e-06, "loss": 0.88218087, "num_input_tokens_seen": 25112520, "step": 1178, "time_per_iteration": 2.7197048664093018 }, { "auxiliary_loss_clip": 0.01082521, "auxiliary_loss_mlp": 0.01013999, "balance_loss_clip": 1.03661168, "balance_loss_mlp": 1.01023197, "epoch": 0.07088531489553584, "flos": 63654107610240.0, "grad_norm": 0.8453670789764802, "language_loss": 0.63256603, "learning_rate": 3.982491351475427e-06, "loss": 0.65353125, "num_input_tokens_seen": 25177760, "step": 1179, "time_per_iteration": 3.3419978618621826 }, { "auxiliary_loss_clip": 0.01211274, "auxiliary_loss_mlp": 0.01073372, "balance_loss_clip": 1.06935215, "balance_loss_mlp": 1.04858887, "epoch": 0.07094543814820382, "flos": 21572688804480.0, "grad_norm": 3.2714198066984177, "language_loss": 0.83388901, "learning_rate": 3.98243989312991e-06, "loss": 0.85673553, "num_input_tokens_seen": 25195260, "step": 1180, "time_per_iteration": 2.631992816925049 }, { "auxiliary_loss_clip": 0.01182661, "auxiliary_loss_mlp": 0.01071326, "balance_loss_clip": 1.06119037, "balance_loss_mlp": 1.04624391, "epoch": 0.07100556140087179, "flos": 22089946608000.0, "grad_norm": 2.0409456536886386, "language_loss": 0.88649988, "learning_rate": 3.982388359610074e-06, "loss": 0.90903974, "num_input_tokens_seen": 25212740, "step": 1181, "time_per_iteration": 2.696789264678955 }, { "auxiliary_loss_clip": 0.01180377, "auxiliary_loss_mlp": 0.01070036, "balance_loss_clip": 1.06187141, "balance_loss_mlp": 1.04516935, "epoch": 0.07106568465353975, "flos": 47922286400640.0, "grad_norm": 1.8294049229574356, "language_loss": 0.83244783, "learning_rate": 3.9823367509178725e-06, "loss": 0.85495198, "num_input_tokens_seen": 25236420, "step": 1182, "time_per_iteration": 2.9415605068206787 }, { "auxiliary_loss_clip": 0.01193669, "auxiliary_loss_mlp": 0.01067019, "balance_loss_clip": 1.0641923, "balance_loss_mlp": 1.04150808, "epoch": 0.07112580790620772, "flos": 23440798316160.0, "grad_norm": 3.5892595189310903, "language_loss": 0.79067838, "learning_rate": 3.982285067055262e-06, "loss": 0.81328523, "num_input_tokens_seen": 25255120, "step": 1183, "time_per_iteration": 2.7284862995147705 }, { "auxiliary_loss_clip": 0.01211976, "auxiliary_loss_mlp": 0.01064792, "balance_loss_clip": 1.06126475, "balance_loss_mlp": 1.03866172, "epoch": 0.0711859311588757, "flos": 31868888682240.0, "grad_norm": 2.5463322111759354, "language_loss": 0.788867, "learning_rate": 3.982233308024204e-06, "loss": 0.81163466, "num_input_tokens_seen": 25275150, "step": 1184, "time_per_iteration": 2.7531635761260986 }, { "auxiliary_loss_clip": 0.01152059, "auxiliary_loss_mlp": 0.01062006, "balance_loss_clip": 1.05961919, "balance_loss_mlp": 1.03752065, "epoch": 0.07124605441154366, "flos": 19610315026560.0, "grad_norm": 1.904751850318294, "language_loss": 0.76806915, "learning_rate": 3.98218147382666e-06, "loss": 0.79020983, "num_input_tokens_seen": 25293680, "step": 1185, "time_per_iteration": 2.732539176940918 }, { "auxiliary_loss_clip": 0.01208288, "auxiliary_loss_mlp": 0.01073792, "balance_loss_clip": 1.06328642, "balance_loss_mlp": 1.04903185, "epoch": 0.07130617766421163, "flos": 14684448533760.0, "grad_norm": 2.1301142092644696, "language_loss": 0.65472758, "learning_rate": 3.982129564464596e-06, "loss": 0.67754835, "num_input_tokens_seen": 25310050, "step": 1186, "time_per_iteration": 2.757812261581421 }, { "auxiliary_loss_clip": 0.01195497, "auxiliary_loss_mlp": 0.01057322, "balance_loss_clip": 1.06479859, "balance_loss_mlp": 1.03274107, "epoch": 0.07136630091687961, "flos": 26067915141120.0, "grad_norm": 2.1671481434625894, "language_loss": 0.69743419, "learning_rate": 3.98207757993998e-06, "loss": 0.71996236, "num_input_tokens_seen": 25331020, "step": 1187, "time_per_iteration": 2.746615409851074 }, { "auxiliary_loss_clip": 0.01151827, "auxiliary_loss_mlp": 0.01067347, "balance_loss_clip": 1.05412316, "balance_loss_mlp": 1.04367232, "epoch": 0.07142642416954757, "flos": 15669190869120.0, "grad_norm": 2.8037131445876597, "language_loss": 0.7861973, "learning_rate": 3.9820255202547845e-06, "loss": 0.80838895, "num_input_tokens_seen": 25347875, "step": 1188, "time_per_iteration": 2.738281726837158 }, { "auxiliary_loss_clip": 0.01203626, "auxiliary_loss_mlp": 0.01059966, "balance_loss_clip": 1.06304908, "balance_loss_mlp": 1.03530121, "epoch": 0.07148654742221554, "flos": 19755322231680.0, "grad_norm": 1.8909260147246576, "language_loss": 0.84754103, "learning_rate": 3.981973385410981e-06, "loss": 0.87017697, "num_input_tokens_seen": 25366715, "step": 1189, "time_per_iteration": 2.5770246982574463 }, { "auxiliary_loss_clip": 0.01173135, "auxiliary_loss_mlp": 0.0078213, "balance_loss_clip": 1.06234396, "balance_loss_mlp": 1.00041807, "epoch": 0.07154667067488352, "flos": 23471824688640.0, "grad_norm": 5.212083930118342, "language_loss": 0.76932275, "learning_rate": 3.9819211754105494e-06, "loss": 0.78887534, "num_input_tokens_seen": 25385450, "step": 1190, "time_per_iteration": 2.7057712078094482 }, { "auxiliary_loss_clip": 0.01208346, "auxiliary_loss_mlp": 0.01074705, "balance_loss_clip": 1.06283545, "balance_loss_mlp": 1.04751348, "epoch": 0.07160679392755148, "flos": 18332936588160.0, "grad_norm": 2.5312098602102084, "language_loss": 0.75201792, "learning_rate": 3.981868890255468e-06, "loss": 0.7748484, "num_input_tokens_seen": 25403940, "step": 1191, "time_per_iteration": 2.6071674823760986 }, { "auxiliary_loss_clip": 0.01162268, "auxiliary_loss_mlp": 0.01063437, "balance_loss_clip": 1.0519917, "balance_loss_mlp": 1.03649545, "epoch": 0.07166691718021945, "flos": 17747017937280.0, "grad_norm": 2.470839013019174, "language_loss": 0.74334443, "learning_rate": 3.981816529947719e-06, "loss": 0.76560152, "num_input_tokens_seen": 25420410, "step": 1192, "time_per_iteration": 2.661078453063965 }, { "auxiliary_loss_clip": 0.01202036, "auxiliary_loss_mlp": 0.01054727, "balance_loss_clip": 1.05904579, "balance_loss_mlp": 1.03099298, "epoch": 0.07172704043288743, "flos": 22451925916800.0, "grad_norm": 2.443309122344248, "language_loss": 0.78010541, "learning_rate": 3.9817640944892896e-06, "loss": 0.8026731, "num_input_tokens_seen": 25439415, "step": 1193, "time_per_iteration": 2.5603158473968506 }, { "auxiliary_loss_clip": 0.01186747, "auxiliary_loss_mlp": 0.01059465, "balance_loss_clip": 1.06358278, "balance_loss_mlp": 1.03319085, "epoch": 0.07178716368555539, "flos": 23222210100480.0, "grad_norm": 2.1011663585924585, "language_loss": 0.85497916, "learning_rate": 3.981711583882166e-06, "loss": 0.87744129, "num_input_tokens_seen": 25458715, "step": 1194, "time_per_iteration": 2.6819851398468018 }, { "auxiliary_loss_clip": 0.01184191, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05706751, "balance_loss_mlp": 1.04135609, "epoch": 0.07184728693822336, "flos": 25150828072320.0, "grad_norm": 2.0205668140023185, "language_loss": 0.8183766, "learning_rate": 3.981658998128341e-06, "loss": 0.84089589, "num_input_tokens_seen": 25477985, "step": 1195, "time_per_iteration": 2.6646647453308105 }, { "auxiliary_loss_clip": 0.01165951, "auxiliary_loss_mlp": 0.01063438, "balance_loss_clip": 1.0578239, "balance_loss_mlp": 1.03976321, "epoch": 0.07190741019089132, "flos": 22711237176960.0, "grad_norm": 2.161995064372768, "language_loss": 0.80093575, "learning_rate": 3.981606337229808e-06, "loss": 0.82322967, "num_input_tokens_seen": 25497110, "step": 1196, "time_per_iteration": 2.7217979431152344 }, { "auxiliary_loss_clip": 0.01176131, "auxiliary_loss_mlp": 0.00784114, "balance_loss_clip": 1.06106043, "balance_loss_mlp": 1.00034249, "epoch": 0.0719675334435593, "flos": 29349791032320.0, "grad_norm": 2.5905261146074263, "language_loss": 0.71339291, "learning_rate": 3.9815536011885655e-06, "loss": 0.73299539, "num_input_tokens_seen": 25516555, "step": 1197, "time_per_iteration": 2.7931766510009766 }, { "auxiliary_loss_clip": 0.01157444, "auxiliary_loss_mlp": 0.01055247, "balance_loss_clip": 1.06130266, "balance_loss_mlp": 1.03074968, "epoch": 0.07202765669622727, "flos": 17639788861440.0, "grad_norm": 3.074283933156949, "language_loss": 0.85951984, "learning_rate": 3.98150079000661e-06, "loss": 0.88164675, "num_input_tokens_seen": 25533895, "step": 1198, "time_per_iteration": 2.7241532802581787 }, { "auxiliary_loss_clip": 0.01160083, "auxiliary_loss_mlp": 0.0106501, "balance_loss_clip": 1.0597434, "balance_loss_mlp": 1.03944004, "epoch": 0.07208777994889523, "flos": 21434038306560.0, "grad_norm": 2.052617638295489, "language_loss": 0.83840948, "learning_rate": 3.981447903685947e-06, "loss": 0.86066043, "num_input_tokens_seen": 25554195, "step": 1199, "time_per_iteration": 2.71362566947937 }, { "auxiliary_loss_clip": 0.01212755, "auxiliary_loss_mlp": 0.01060557, "balance_loss_clip": 1.06877887, "balance_loss_mlp": 1.03709614, "epoch": 0.07214790320156321, "flos": 26940867373440.0, "grad_norm": 3.1601590133124837, "language_loss": 0.7623595, "learning_rate": 3.981394942228581e-06, "loss": 0.78509259, "num_input_tokens_seen": 25574155, "step": 1200, "time_per_iteration": 2.6913061141967773 }, { "auxiliary_loss_clip": 0.0119008, "auxiliary_loss_mlp": 0.010701, "balance_loss_clip": 1.06442261, "balance_loss_mlp": 1.04487491, "epoch": 0.07220802645423118, "flos": 23879949995520.0, "grad_norm": 2.2017873087036226, "language_loss": 0.83013475, "learning_rate": 3.98134190563652e-06, "loss": 0.85273659, "num_input_tokens_seen": 25592735, "step": 1201, "time_per_iteration": 2.6983115673065186 }, { "auxiliary_loss_clip": 0.01196941, "auxiliary_loss_mlp": 0.01065672, "balance_loss_clip": 1.06197119, "balance_loss_mlp": 1.03952968, "epoch": 0.07226814970689914, "flos": 19243631036160.0, "grad_norm": 20.835065187143087, "language_loss": 0.68601412, "learning_rate": 3.981288793911775e-06, "loss": 0.70864022, "num_input_tokens_seen": 25611510, "step": 1202, "time_per_iteration": 2.691742420196533 }, { "auxiliary_loss_clip": 0.01182684, "auxiliary_loss_mlp": 0.00782201, "balance_loss_clip": 1.06256962, "balance_loss_mlp": 1.00038218, "epoch": 0.07232827295956712, "flos": 19172025273600.0, "grad_norm": 1.9661831136137597, "language_loss": 0.87487721, "learning_rate": 3.98123560705636e-06, "loss": 0.89452606, "num_input_tokens_seen": 25629560, "step": 1203, "time_per_iteration": 2.7832019329071045 }, { "auxiliary_loss_clip": 0.01154778, "auxiliary_loss_mlp": 0.01065748, "balance_loss_clip": 1.05210066, "balance_loss_mlp": 1.04065442, "epoch": 0.07238839621223508, "flos": 17639752947840.0, "grad_norm": 1.731721557525142, "language_loss": 0.78053147, "learning_rate": 3.981182345072293e-06, "loss": 0.80273676, "num_input_tokens_seen": 25648330, "step": 1204, "time_per_iteration": 2.7754547595977783 }, { "auxiliary_loss_clip": 0.01191832, "auxiliary_loss_mlp": 0.01065794, "balance_loss_clip": 1.06211591, "balance_loss_mlp": 1.04084373, "epoch": 0.07244851946490305, "flos": 28292401440000.0, "grad_norm": 1.5043252978087258, "language_loss": 0.82094097, "learning_rate": 3.981129007961593e-06, "loss": 0.84351724, "num_input_tokens_seen": 25669470, "step": 1205, "time_per_iteration": 2.680457353591919 }, { "auxiliary_loss_clip": 0.01180244, "auxiliary_loss_mlp": 0.00782807, "balance_loss_clip": 1.06221068, "balance_loss_mlp": 1.00036049, "epoch": 0.07250864271757101, "flos": 22564829341440.0, "grad_norm": 1.6438962430217685, "language_loss": 0.76715982, "learning_rate": 3.981075595726283e-06, "loss": 0.78679025, "num_input_tokens_seen": 25690470, "step": 1206, "time_per_iteration": 2.7028439044952393 }, { "auxiliary_loss_clip": 0.01188223, "auxiliary_loss_mlp": 0.01059861, "balance_loss_clip": 1.06262684, "balance_loss_mlp": 1.03442228, "epoch": 0.072568765970239, "flos": 21762405463680.0, "grad_norm": 1.9378198243304647, "language_loss": 0.77272987, "learning_rate": 3.981022108368387e-06, "loss": 0.79521072, "num_input_tokens_seen": 25709205, "step": 1207, "time_per_iteration": 2.779289960861206 }, { "auxiliary_loss_clip": 0.01185538, "auxiliary_loss_mlp": 0.01053693, "balance_loss_clip": 1.05844951, "balance_loss_mlp": 1.03062558, "epoch": 0.07262888922290696, "flos": 25519702792320.0, "grad_norm": 1.8716528383816402, "language_loss": 0.79480875, "learning_rate": 3.9809685458899345e-06, "loss": 0.81720108, "num_input_tokens_seen": 25728485, "step": 1208, "time_per_iteration": 2.682965040206909 }, { "auxiliary_loss_clip": 0.01184899, "auxiliary_loss_mlp": 0.01054862, "balance_loss_clip": 1.05801737, "balance_loss_mlp": 1.03198612, "epoch": 0.07268901247557492, "flos": 21246548290560.0, "grad_norm": 2.5612886109689765, "language_loss": 0.78537548, "learning_rate": 3.980914908292955e-06, "loss": 0.80777311, "num_input_tokens_seen": 25747730, "step": 1209, "time_per_iteration": 2.6582658290863037 }, { "auxiliary_loss_clip": 0.01191905, "auxiliary_loss_mlp": 0.01067741, "balance_loss_clip": 1.05931175, "balance_loss_mlp": 1.04408956, "epoch": 0.0727491357282429, "flos": 25479302970240.0, "grad_norm": 2.351303434522043, "language_loss": 0.80920583, "learning_rate": 3.980861195579486e-06, "loss": 0.83180225, "num_input_tokens_seen": 25768050, "step": 1210, "time_per_iteration": 4.241993427276611 }, { "auxiliary_loss_clip": 0.0117493, "auxiliary_loss_mlp": 0.01063711, "balance_loss_clip": 1.06087565, "balance_loss_mlp": 1.03891551, "epoch": 0.07280925898091087, "flos": 24462169545600.0, "grad_norm": 1.875347829314158, "language_loss": 0.84302205, "learning_rate": 3.98080740775156e-06, "loss": 0.86540848, "num_input_tokens_seen": 25787985, "step": 1211, "time_per_iteration": 4.289919853210449 }, { "auxiliary_loss_clip": 0.01162055, "auxiliary_loss_mlp": 0.01060218, "balance_loss_clip": 1.05356658, "balance_loss_mlp": 1.03629231, "epoch": 0.07286938223357883, "flos": 18288191220480.0, "grad_norm": 2.991110515222773, "language_loss": 0.90684664, "learning_rate": 3.98075354481122e-06, "loss": 0.92906934, "num_input_tokens_seen": 25803620, "step": 1212, "time_per_iteration": 2.660780906677246 }, { "auxiliary_loss_clip": 0.01202443, "auxiliary_loss_mlp": 0.01058817, "balance_loss_clip": 1.0623759, "balance_loss_mlp": 1.03490353, "epoch": 0.07292950548624681, "flos": 21214803646080.0, "grad_norm": 1.7918815842724805, "language_loss": 0.72358596, "learning_rate": 3.9806996067605055e-06, "loss": 0.74619853, "num_input_tokens_seen": 25823315, "step": 1213, "time_per_iteration": 4.303524017333984 }, { "auxiliary_loss_clip": 0.01153662, "auxiliary_loss_mlp": 0.01055706, "balance_loss_clip": 1.05658662, "balance_loss_mlp": 1.03089869, "epoch": 0.07298962873891478, "flos": 24642009964800.0, "grad_norm": 1.8655932637344164, "language_loss": 0.84356117, "learning_rate": 3.980645593601465e-06, "loss": 0.86565483, "num_input_tokens_seen": 25842605, "step": 1214, "time_per_iteration": 2.7505569458007812 }, { "auxiliary_loss_clip": 0.01208881, "auxiliary_loss_mlp": 0.01062075, "balance_loss_clip": 1.06484771, "balance_loss_mlp": 1.03723145, "epoch": 0.07304975199158274, "flos": 27052765217280.0, "grad_norm": 2.025651344907852, "language_loss": 0.84113681, "learning_rate": 3.980591505336144e-06, "loss": 0.86384636, "num_input_tokens_seen": 25863030, "step": 1215, "time_per_iteration": 2.7235965728759766 }, { "auxiliary_loss_clip": 0.01149957, "auxiliary_loss_mlp": 0.01062992, "balance_loss_clip": 1.05138278, "balance_loss_mlp": 1.03744531, "epoch": 0.07310987524425071, "flos": 33549544091520.0, "grad_norm": 1.9312816725096997, "language_loss": 0.80926049, "learning_rate": 3.980537341966595e-06, "loss": 0.83139002, "num_input_tokens_seen": 25888015, "step": 1216, "time_per_iteration": 2.9129130840301514 }, { "auxiliary_loss_clip": 0.01167944, "auxiliary_loss_mlp": 0.01060276, "balance_loss_clip": 1.05619049, "balance_loss_mlp": 1.03680408, "epoch": 0.07316999849691869, "flos": 28110944908800.0, "grad_norm": 3.2846247291101975, "language_loss": 0.75949144, "learning_rate": 3.980483103494872e-06, "loss": 0.78177369, "num_input_tokens_seen": 25908660, "step": 1217, "time_per_iteration": 2.7106521129608154 }, { "auxiliary_loss_clip": 0.01169026, "auxiliary_loss_mlp": 0.01056631, "balance_loss_clip": 1.06182647, "balance_loss_mlp": 1.03477991, "epoch": 0.07323012174958665, "flos": 14392602529920.0, "grad_norm": 1.9658490798069863, "language_loss": 0.86455309, "learning_rate": 3.98042878992303e-06, "loss": 0.88680959, "num_input_tokens_seen": 25927215, "step": 1218, "time_per_iteration": 2.5911786556243896 }, { "auxiliary_loss_clip": 0.01192266, "auxiliary_loss_mlp": 0.0106258, "balance_loss_clip": 1.06015348, "balance_loss_mlp": 1.03916681, "epoch": 0.07329024500225462, "flos": 21616428591360.0, "grad_norm": 2.2310702082820675, "language_loss": 0.86782354, "learning_rate": 3.9803744012531305e-06, "loss": 0.89037204, "num_input_tokens_seen": 25945500, "step": 1219, "time_per_iteration": 2.608562707901001 }, { "auxiliary_loss_clip": 0.01201545, "auxiliary_loss_mlp": 0.01058282, "balance_loss_clip": 1.06024373, "balance_loss_mlp": 1.03539419, "epoch": 0.0733503682549226, "flos": 13224141106560.0, "grad_norm": 2.095886373367052, "language_loss": 0.84608674, "learning_rate": 3.980319937487235e-06, "loss": 0.86868501, "num_input_tokens_seen": 25963105, "step": 1220, "time_per_iteration": 2.469189405441284 }, { "auxiliary_loss_clip": 0.01158855, "auxiliary_loss_mlp": 0.01063399, "balance_loss_clip": 1.05358922, "balance_loss_mlp": 1.03942597, "epoch": 0.07341049150759056, "flos": 20886975192960.0, "grad_norm": 2.648884311755534, "language_loss": 0.77114344, "learning_rate": 3.98026539862741e-06, "loss": 0.79336596, "num_input_tokens_seen": 25981690, "step": 1221, "time_per_iteration": 2.671762466430664 }, { "auxiliary_loss_clip": 0.01158201, "auxiliary_loss_mlp": 0.01064916, "balance_loss_clip": 1.05726743, "balance_loss_mlp": 1.04082406, "epoch": 0.07347061476025853, "flos": 15413614623360.0, "grad_norm": 2.5357389392469942, "language_loss": 0.91631913, "learning_rate": 3.980210784675722e-06, "loss": 0.93855029, "num_input_tokens_seen": 25999890, "step": 1222, "time_per_iteration": 2.6973063945770264 }, { "auxiliary_loss_clip": 0.01135907, "auxiliary_loss_mlp": 0.01064872, "balance_loss_clip": 1.05333126, "balance_loss_mlp": 1.04169726, "epoch": 0.0735307380129265, "flos": 11108859131520.0, "grad_norm": 2.8024324299253047, "language_loss": 0.90976465, "learning_rate": 3.980156095634242e-06, "loss": 0.93177247, "num_input_tokens_seen": 26016445, "step": 1223, "time_per_iteration": 2.8141093254089355 }, { "auxiliary_loss_clip": 0.01202875, "auxiliary_loss_mlp": 0.01077185, "balance_loss_clip": 1.06232905, "balance_loss_mlp": 1.05341494, "epoch": 0.07359086126559447, "flos": 23732392924800.0, "grad_norm": 1.9348534518871447, "language_loss": 0.82161939, "learning_rate": 3.980101331505045e-06, "loss": 0.84442002, "num_input_tokens_seen": 26036080, "step": 1224, "time_per_iteration": 2.640432119369507 }, { "auxiliary_loss_clip": 0.01200329, "auxiliary_loss_mlp": 0.01057586, "balance_loss_clip": 1.05987597, "balance_loss_mlp": 1.03229022, "epoch": 0.07365098451826244, "flos": 20993270515200.0, "grad_norm": 2.31744406237409, "language_loss": 0.83194047, "learning_rate": 3.9800464922902076e-06, "loss": 0.85451961, "num_input_tokens_seen": 26055805, "step": 1225, "time_per_iteration": 2.6159210205078125 }, { "auxiliary_loss_clip": 0.01170115, "auxiliary_loss_mlp": 0.01056068, "balance_loss_clip": 1.05743551, "balance_loss_mlp": 1.03190422, "epoch": 0.0737111077709304, "flos": 19933582452480.0, "grad_norm": 2.2959030425986544, "language_loss": 0.90388274, "learning_rate": 3.979991577991808e-06, "loss": 0.9261446, "num_input_tokens_seen": 26073905, "step": 1226, "time_per_iteration": 2.6527435779571533 }, { "auxiliary_loss_clip": 0.01207799, "auxiliary_loss_mlp": 0.0104599, "balance_loss_clip": 1.05913424, "balance_loss_mlp": 1.02080154, "epoch": 0.07377123102359838, "flos": 16581537342720.0, "grad_norm": 2.579592162134606, "language_loss": 0.76626784, "learning_rate": 3.97993658861193e-06, "loss": 0.78880572, "num_input_tokens_seen": 26091700, "step": 1227, "time_per_iteration": 2.596151351928711 }, { "auxiliary_loss_clip": 0.0118909, "auxiliary_loss_mlp": 0.01053386, "balance_loss_clip": 1.06296694, "balance_loss_mlp": 1.02954459, "epoch": 0.07383135427626634, "flos": 28328563457280.0, "grad_norm": 7.788838200212175, "language_loss": 0.8555491, "learning_rate": 3.9798815241526575e-06, "loss": 0.87797379, "num_input_tokens_seen": 26114105, "step": 1228, "time_per_iteration": 2.6955716609954834 }, { "auxiliary_loss_clip": 0.01191175, "auxiliary_loss_mlp": 0.01062669, "balance_loss_clip": 1.05897212, "balance_loss_mlp": 1.03860044, "epoch": 0.07389147752893431, "flos": 20047168235520.0, "grad_norm": 2.2575099517148898, "language_loss": 0.79598552, "learning_rate": 3.97982638461608e-06, "loss": 0.818524, "num_input_tokens_seen": 26131165, "step": 1229, "time_per_iteration": 2.6544861793518066 }, { "auxiliary_loss_clip": 0.01192886, "auxiliary_loss_mlp": 0.00782044, "balance_loss_clip": 1.05966699, "balance_loss_mlp": 1.00032902, "epoch": 0.07395160078160229, "flos": 18114132890880.0, "grad_norm": 2.2881874382496377, "language_loss": 0.78209347, "learning_rate": 3.979771170004287e-06, "loss": 0.80184281, "num_input_tokens_seen": 26150040, "step": 1230, "time_per_iteration": 2.6001133918762207 }, { "auxiliary_loss_clip": 0.0120142, "auxiliary_loss_mlp": 0.01052342, "balance_loss_clip": 1.06209648, "balance_loss_mlp": 1.02739108, "epoch": 0.07401172403427025, "flos": 23586918842880.0, "grad_norm": 2.038847041772147, "language_loss": 0.8136946, "learning_rate": 3.979715880319372e-06, "loss": 0.83623219, "num_input_tokens_seen": 26169380, "step": 1231, "time_per_iteration": 2.6364073753356934 }, { "auxiliary_loss_clip": 0.01179975, "auxiliary_loss_mlp": 0.01070917, "balance_loss_clip": 1.05690873, "balance_loss_mlp": 1.04599047, "epoch": 0.07407184728693822, "flos": 26359904799360.0, "grad_norm": 2.096832924731062, "language_loss": 0.95204866, "learning_rate": 3.979660515563434e-06, "loss": 0.97455758, "num_input_tokens_seen": 26189420, "step": 1232, "time_per_iteration": 2.7929203510284424 }, { "auxiliary_loss_clip": 0.01187282, "auxiliary_loss_mlp": 0.01059661, "balance_loss_clip": 1.06202245, "balance_loss_mlp": 1.03733301, "epoch": 0.0741319705396062, "flos": 22200443821440.0, "grad_norm": 1.7778448126368063, "language_loss": 0.80695188, "learning_rate": 3.979605075738569e-06, "loss": 0.82942128, "num_input_tokens_seen": 26209300, "step": 1233, "time_per_iteration": 2.7945051193237305 }, { "auxiliary_loss_clip": 0.01209245, "auxiliary_loss_mlp": 0.0106207, "balance_loss_clip": 1.06238747, "balance_loss_mlp": 1.03602231, "epoch": 0.07419209379227416, "flos": 39200482523520.0, "grad_norm": 2.136728864247421, "language_loss": 0.70708907, "learning_rate": 3.979549560846883e-06, "loss": 0.72980225, "num_input_tokens_seen": 26228110, "step": 1234, "time_per_iteration": 2.9646782875061035 }, { "auxiliary_loss_clip": 0.01167486, "auxiliary_loss_mlp": 0.01068879, "balance_loss_clip": 1.0542618, "balance_loss_mlp": 1.04265285, "epoch": 0.07425221704494213, "flos": 22781657790720.0, "grad_norm": 1.7921102377369336, "language_loss": 0.76852918, "learning_rate": 3.979493970890478e-06, "loss": 0.79089284, "num_input_tokens_seen": 26247020, "step": 1235, "time_per_iteration": 2.820577621459961 }, { "auxiliary_loss_clip": 0.01198028, "auxiliary_loss_mlp": 0.01055883, "balance_loss_clip": 1.05918813, "balance_loss_mlp": 1.0321244, "epoch": 0.0743123402976101, "flos": 22272983337600.0, "grad_norm": 2.3018318065058097, "language_loss": 0.82748145, "learning_rate": 3.979438305871464e-06, "loss": 0.85002053, "num_input_tokens_seen": 26265750, "step": 1236, "time_per_iteration": 2.6302287578582764 }, { "auxiliary_loss_clip": 0.01154783, "auxiliary_loss_mlp": 0.00782014, "balance_loss_clip": 1.05519629, "balance_loss_mlp": 1.00039148, "epoch": 0.07437246355027807, "flos": 29315029645440.0, "grad_norm": 1.7985383717833268, "language_loss": 0.7595011, "learning_rate": 3.979382565791951e-06, "loss": 0.77886909, "num_input_tokens_seen": 26287905, "step": 1237, "time_per_iteration": 2.721931219100952 }, { "auxiliary_loss_clip": 0.01135551, "auxiliary_loss_mlp": 0.00783311, "balance_loss_clip": 1.0505693, "balance_loss_mlp": 1.00031757, "epoch": 0.07443258680294604, "flos": 31944732249600.0, "grad_norm": 1.6915170784810407, "language_loss": 0.77458763, "learning_rate": 3.979326750654053e-06, "loss": 0.79377621, "num_input_tokens_seen": 26311795, "step": 1238, "time_per_iteration": 2.831620931625366 }, { "auxiliary_loss_clip": 0.01177529, "auxiliary_loss_mlp": 0.01057762, "balance_loss_clip": 1.05673254, "balance_loss_mlp": 1.03311002, "epoch": 0.074492710055614, "flos": 22675290641280.0, "grad_norm": 1.9053364150897723, "language_loss": 0.867737, "learning_rate": 3.9792708604598854e-06, "loss": 0.89008987, "num_input_tokens_seen": 26330330, "step": 1239, "time_per_iteration": 2.6697263717651367 }, { "auxiliary_loss_clip": 0.01159844, "auxiliary_loss_mlp": 0.01050954, "balance_loss_clip": 1.05222142, "balance_loss_mlp": 1.02532458, "epoch": 0.07455283330828198, "flos": 21284901037440.0, "grad_norm": 26.978042105238785, "language_loss": 0.89356089, "learning_rate": 3.979214895211569e-06, "loss": 0.91566885, "num_input_tokens_seen": 26348865, "step": 1240, "time_per_iteration": 2.846013069152832 }, { "auxiliary_loss_clip": 0.01174117, "auxiliary_loss_mlp": 0.01063539, "balance_loss_clip": 1.05857158, "balance_loss_mlp": 1.03713393, "epoch": 0.07461295656094995, "flos": 24388408967040.0, "grad_norm": 1.9346624045484253, "language_loss": 0.88873678, "learning_rate": 3.979158854911225e-06, "loss": 0.91111326, "num_input_tokens_seen": 26368210, "step": 1241, "time_per_iteration": 2.6926562786102295 }, { "auxiliary_loss_clip": 0.01079637, "auxiliary_loss_mlp": 0.01009562, "balance_loss_clip": 1.03489435, "balance_loss_mlp": 1.00405502, "epoch": 0.07467307981361791, "flos": 62109660574080.0, "grad_norm": 0.8973011136706247, "language_loss": 0.63067901, "learning_rate": 3.979102739560979e-06, "loss": 0.65157104, "num_input_tokens_seen": 26424890, "step": 1242, "time_per_iteration": 3.298609972000122 }, { "auxiliary_loss_clip": 0.01164269, "auxiliary_loss_mlp": 0.01068833, "balance_loss_clip": 1.05246222, "balance_loss_mlp": 1.03819644, "epoch": 0.07473320306628589, "flos": 24863148046080.0, "grad_norm": 3.87499965477456, "language_loss": 0.62926078, "learning_rate": 3.9790465491629595e-06, "loss": 0.65159178, "num_input_tokens_seen": 26446405, "step": 1243, "time_per_iteration": 2.7774572372436523 }, { "auxiliary_loss_clip": 0.01188864, "auxiliary_loss_mlp": 0.01059918, "balance_loss_clip": 1.05716145, "balance_loss_mlp": 1.03499091, "epoch": 0.07479332631895386, "flos": 24897442556160.0, "grad_norm": 1.6252135866538246, "language_loss": 0.76259589, "learning_rate": 3.978990283719296e-06, "loss": 0.78508377, "num_input_tokens_seen": 26466070, "step": 1244, "time_per_iteration": 2.714459180831909 }, { "auxiliary_loss_clip": 0.01184345, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.0611167, "balance_loss_mlp": 1.00038469, "epoch": 0.07485344957162182, "flos": 17815247821440.0, "grad_norm": 5.636002853507256, "language_loss": 0.69419599, "learning_rate": 3.978933943232123e-06, "loss": 0.71387023, "num_input_tokens_seen": 26479350, "step": 1245, "time_per_iteration": 2.640895366668701 }, { "auxiliary_loss_clip": 0.01203955, "auxiliary_loss_mlp": 0.01062684, "balance_loss_clip": 1.06098139, "balance_loss_mlp": 1.0372088, "epoch": 0.0749135728242898, "flos": 25010202326400.0, "grad_norm": 2.5525245798098757, "language_loss": 0.88635457, "learning_rate": 3.978877527703576e-06, "loss": 0.90902102, "num_input_tokens_seen": 26498255, "step": 1246, "time_per_iteration": 2.747765302658081 }, { "auxiliary_loss_clip": 0.01212369, "auxiliary_loss_mlp": 0.01077452, "balance_loss_clip": 1.06102896, "balance_loss_mlp": 1.049402, "epoch": 0.07497369607695777, "flos": 17822071405440.0, "grad_norm": 2.675073323546491, "language_loss": 0.8825295, "learning_rate": 3.9788210371357945e-06, "loss": 0.90542769, "num_input_tokens_seen": 26515375, "step": 1247, "time_per_iteration": 2.6810224056243896 }, { "auxiliary_loss_clip": 0.0118495, "auxiliary_loss_mlp": 0.01069489, "balance_loss_clip": 1.06058884, "balance_loss_mlp": 1.04383492, "epoch": 0.07503381932962573, "flos": 15121086261120.0, "grad_norm": 2.620559853720615, "language_loss": 0.64849806, "learning_rate": 3.978764471530921e-06, "loss": 0.67104244, "num_input_tokens_seen": 26533595, "step": 1248, "time_per_iteration": 2.706862449645996 }, { "auxiliary_loss_clip": 0.01181878, "auxiliary_loss_mlp": 0.00782677, "balance_loss_clip": 1.0575974, "balance_loss_mlp": 1.0004611, "epoch": 0.0750939425822937, "flos": 12816734071680.0, "grad_norm": 2.872208543000993, "language_loss": 0.74216163, "learning_rate": 3.978707830891102e-06, "loss": 0.7618072, "num_input_tokens_seen": 26549405, "step": 1249, "time_per_iteration": 4.309665679931641 }, { "auxiliary_loss_clip": 0.01168375, "auxiliary_loss_mlp": 0.01079691, "balance_loss_clip": 1.0579834, "balance_loss_mlp": 1.05296445, "epoch": 0.07515406583496168, "flos": 24206844695040.0, "grad_norm": 2.679176110316805, "language_loss": 0.82353318, "learning_rate": 3.978651115218482e-06, "loss": 0.84601378, "num_input_tokens_seen": 26567200, "step": 1250, "time_per_iteration": 4.367432594299316 }, { "auxiliary_loss_clip": 0.011507, "auxiliary_loss_mlp": 0.01064103, "balance_loss_clip": 1.05736125, "balance_loss_mlp": 1.0380677, "epoch": 0.07521418908762964, "flos": 26688164215680.0, "grad_norm": 2.015636709873133, "language_loss": 0.6679548, "learning_rate": 3.978594324515215e-06, "loss": 0.69010288, "num_input_tokens_seen": 26586190, "step": 1251, "time_per_iteration": 4.339111089706421 }, { "auxiliary_loss_clip": 0.01061099, "auxiliary_loss_mlp": 0.01007289, "balance_loss_clip": 1.02992618, "balance_loss_mlp": 1.00314093, "epoch": 0.0752743123402976, "flos": 59095140589440.0, "grad_norm": 0.9014655793512963, "language_loss": 0.7038399, "learning_rate": 3.9785374587834515e-06, "loss": 0.72452378, "num_input_tokens_seen": 26650710, "step": 1252, "time_per_iteration": 4.984445333480835 }, { "auxiliary_loss_clip": 0.0120348, "auxiliary_loss_mlp": 0.01071343, "balance_loss_clip": 1.06016684, "balance_loss_mlp": 1.04651129, "epoch": 0.07533443559296558, "flos": 23477032160640.0, "grad_norm": 2.2789224049077226, "language_loss": 0.79936707, "learning_rate": 3.97848051802535e-06, "loss": 0.82211524, "num_input_tokens_seen": 26669000, "step": 1253, "time_per_iteration": 2.613696575164795 }, { "auxiliary_loss_clip": 0.01165402, "auxiliary_loss_mlp": 0.01062493, "balance_loss_clip": 1.05703712, "balance_loss_mlp": 1.03758967, "epoch": 0.07539455884563355, "flos": 20879110114560.0, "grad_norm": 3.1057458778243263, "language_loss": 0.93360364, "learning_rate": 3.978423502243069e-06, "loss": 0.95588255, "num_input_tokens_seen": 26683075, "step": 1254, "time_per_iteration": 2.7332606315612793 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01064454, "balance_loss_clip": 1.06050682, "balance_loss_mlp": 1.03958726, "epoch": 0.07545468209830151, "flos": 27672906551040.0, "grad_norm": 2.090631066181037, "language_loss": 0.88087487, "learning_rate": 3.97836641143877e-06, "loss": 0.90325236, "num_input_tokens_seen": 26701875, "step": 1255, "time_per_iteration": 2.713636875152588 }, { "auxiliary_loss_clip": 0.01202338, "auxiliary_loss_mlp": 0.01071467, "balance_loss_clip": 1.06138325, "balance_loss_mlp": 1.04531264, "epoch": 0.0755148053509695, "flos": 14136990370560.0, "grad_norm": 1.9772348994273161, "language_loss": 0.79305708, "learning_rate": 3.978309245614618e-06, "loss": 0.81579506, "num_input_tokens_seen": 26719050, "step": 1256, "time_per_iteration": 2.688812255859375 }, { "auxiliary_loss_clip": 0.01064506, "auxiliary_loss_mlp": 0.01008663, "balance_loss_clip": 1.0281384, "balance_loss_mlp": 1.0043, "epoch": 0.07557492860363746, "flos": 58235257929600.0, "grad_norm": 0.7721513084275832, "language_loss": 0.58031851, "learning_rate": 3.9782520047727825e-06, "loss": 0.6010502, "num_input_tokens_seen": 26780650, "step": 1257, "time_per_iteration": 3.290971517562866 }, { "auxiliary_loss_clip": 0.01154091, "auxiliary_loss_mlp": 0.01065293, "balance_loss_clip": 1.06175375, "balance_loss_mlp": 1.04035461, "epoch": 0.07563505185630542, "flos": 24644380262400.0, "grad_norm": 2.5700283098608026, "language_loss": 0.90029764, "learning_rate": 3.978194688915432e-06, "loss": 0.92249143, "num_input_tokens_seen": 26798725, "step": 1258, "time_per_iteration": 2.800297975540161 }, { "auxiliary_loss_clip": 0.01169581, "auxiliary_loss_mlp": 0.01064585, "balance_loss_clip": 1.06184185, "balance_loss_mlp": 1.03797793, "epoch": 0.07569517510897339, "flos": 15522998515200.0, "grad_norm": 2.1868972302346377, "language_loss": 0.81404132, "learning_rate": 3.978137298044741e-06, "loss": 0.83638299, "num_input_tokens_seen": 26817005, "step": 1259, "time_per_iteration": 2.767717123031616 }, { "auxiliary_loss_clip": 0.01194891, "auxiliary_loss_mlp": 0.01062022, "balance_loss_clip": 1.06317782, "balance_loss_mlp": 1.03766739, "epoch": 0.07575529836164137, "flos": 22928532503040.0, "grad_norm": 1.8876128491153832, "language_loss": 0.7609086, "learning_rate": 3.978079832162885e-06, "loss": 0.78347778, "num_input_tokens_seen": 26836655, "step": 1260, "time_per_iteration": 2.859339714050293 }, { "auxiliary_loss_clip": 0.01160098, "auxiliary_loss_mlp": 0.01068568, "balance_loss_clip": 1.05432057, "balance_loss_mlp": 1.04222322, "epoch": 0.07581542161430933, "flos": 19500428344320.0, "grad_norm": 1.7028037437197219, "language_loss": 0.84734851, "learning_rate": 3.978022291272044e-06, "loss": 0.86963522, "num_input_tokens_seen": 26854925, "step": 1261, "time_per_iteration": 2.773087978363037 }, { "auxiliary_loss_clip": 0.01212087, "auxiliary_loss_mlp": 0.0106726, "balance_loss_clip": 1.06821966, "balance_loss_mlp": 1.04273915, "epoch": 0.0758755448669773, "flos": 24973465691520.0, "grad_norm": 1.8668314773439494, "language_loss": 0.82578814, "learning_rate": 3.977964675374399e-06, "loss": 0.84858155, "num_input_tokens_seen": 26876170, "step": 1262, "time_per_iteration": 2.681764841079712 }, { "auxiliary_loss_clip": 0.01206367, "auxiliary_loss_mlp": 0.0106285, "balance_loss_clip": 1.06333947, "balance_loss_mlp": 1.03685009, "epoch": 0.07593566811964528, "flos": 22747973811840.0, "grad_norm": 2.501362251414687, "language_loss": 0.82448232, "learning_rate": 3.977906984472136e-06, "loss": 0.84717447, "num_input_tokens_seen": 26895005, "step": 1263, "time_per_iteration": 2.6262786388397217 }, { "auxiliary_loss_clip": 0.01166059, "auxiliary_loss_mlp": 0.01068738, "balance_loss_clip": 1.06484997, "balance_loss_mlp": 1.04334641, "epoch": 0.07599579137231324, "flos": 23112395245440.0, "grad_norm": 2.171520639750579, "language_loss": 0.76149648, "learning_rate": 3.977849218567442e-06, "loss": 0.78384447, "num_input_tokens_seen": 26913930, "step": 1264, "time_per_iteration": 2.7735466957092285 }, { "auxiliary_loss_clip": 0.01181777, "auxiliary_loss_mlp": 0.01061673, "balance_loss_clip": 1.06183577, "balance_loss_mlp": 1.03704381, "epoch": 0.07605591462498121, "flos": 14502058248960.0, "grad_norm": 2.252731793921747, "language_loss": 0.80919051, "learning_rate": 3.977791377662507e-06, "loss": 0.83162498, "num_input_tokens_seen": 26931485, "step": 1265, "time_per_iteration": 2.6076793670654297 }, { "auxiliary_loss_clip": 0.01143593, "auxiliary_loss_mlp": 0.01068856, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.0411638, "epoch": 0.07611603787764919, "flos": 23514199758720.0, "grad_norm": 2.117217065332582, "language_loss": 0.65244937, "learning_rate": 3.977733461759524e-06, "loss": 0.67457378, "num_input_tokens_seen": 26951670, "step": 1266, "time_per_iteration": 2.714848041534424 }, { "auxiliary_loss_clip": 0.0116364, "auxiliary_loss_mlp": 0.01066982, "balance_loss_clip": 1.05869627, "balance_loss_mlp": 1.04194832, "epoch": 0.07617616113031715, "flos": 21507188353920.0, "grad_norm": 2.0157381540709416, "language_loss": 0.79570109, "learning_rate": 3.977675470860691e-06, "loss": 0.81800735, "num_input_tokens_seen": 26970335, "step": 1267, "time_per_iteration": 2.692220687866211 }, { "auxiliary_loss_clip": 0.01186526, "auxiliary_loss_mlp": 0.01060572, "balance_loss_clip": 1.06368709, "balance_loss_mlp": 1.03644359, "epoch": 0.07623628438298512, "flos": 14573161221120.0, "grad_norm": 2.573855585409162, "language_loss": 0.72936547, "learning_rate": 3.977617404968205e-06, "loss": 0.75183642, "num_input_tokens_seen": 26986025, "step": 1268, "time_per_iteration": 2.666487216949463 }, { "auxiliary_loss_clip": 0.01189272, "auxiliary_loss_mlp": 0.01056943, "balance_loss_clip": 1.05925119, "balance_loss_mlp": 1.03146791, "epoch": 0.07629640763565308, "flos": 14720395069440.0, "grad_norm": 2.3531002902867018, "language_loss": 0.82087409, "learning_rate": 3.977559264084269e-06, "loss": 0.84333622, "num_input_tokens_seen": 27004045, "step": 1269, "time_per_iteration": 2.6196024417877197 }, { "auxiliary_loss_clip": 0.01198264, "auxiliary_loss_mlp": 0.01062408, "balance_loss_clip": 1.06528163, "balance_loss_mlp": 1.03656352, "epoch": 0.07635653088832106, "flos": 14902929008640.0, "grad_norm": 2.6660741307472424, "language_loss": 0.88614184, "learning_rate": 3.977501048211088e-06, "loss": 0.90874851, "num_input_tokens_seen": 27022070, "step": 1270, "time_per_iteration": 2.6423919200897217 }, { "auxiliary_loss_clip": 0.01195764, "auxiliary_loss_mlp": 0.01062092, "balance_loss_clip": 1.06443572, "balance_loss_mlp": 1.0371294, "epoch": 0.07641665414098903, "flos": 26651571235200.0, "grad_norm": 2.486841045046768, "language_loss": 0.7104162, "learning_rate": 3.977442757350869e-06, "loss": 0.73299474, "num_input_tokens_seen": 27041755, "step": 1271, "time_per_iteration": 2.6679437160491943 }, { "auxiliary_loss_clip": 0.01157818, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05973268, "balance_loss_mlp": 1.04282308, "epoch": 0.07647677739365699, "flos": 25192808092800.0, "grad_norm": 1.5691807400142836, "language_loss": 0.82570392, "learning_rate": 3.977384391505823e-06, "loss": 0.84796339, "num_input_tokens_seen": 27061540, "step": 1272, "time_per_iteration": 2.7613680362701416 }, { "auxiliary_loss_clip": 0.01176176, "auxiliary_loss_mlp": 0.00782751, "balance_loss_clip": 1.05822372, "balance_loss_mlp": 1.00051665, "epoch": 0.07653690064632497, "flos": 20558141159040.0, "grad_norm": 1.811509476700225, "language_loss": 0.79854733, "learning_rate": 3.977325950678162e-06, "loss": 0.81813657, "num_input_tokens_seen": 27081395, "step": 1273, "time_per_iteration": 2.696317434310913 }, { "auxiliary_loss_clip": 0.01185133, "auxiliary_loss_mlp": 0.01064308, "balance_loss_clip": 1.06556833, "balance_loss_mlp": 1.03910685, "epoch": 0.07659702389899294, "flos": 22269320150400.0, "grad_norm": 1.7399681078894738, "language_loss": 0.81519866, "learning_rate": 3.977267434870103e-06, "loss": 0.83769304, "num_input_tokens_seen": 27101175, "step": 1274, "time_per_iteration": 2.8570950031280518 }, { "auxiliary_loss_clip": 0.0118748, "auxiliary_loss_mlp": 0.01078696, "balance_loss_clip": 1.06516898, "balance_loss_mlp": 1.05164731, "epoch": 0.0766571471516609, "flos": 32636120209920.0, "grad_norm": 2.6845981005996453, "language_loss": 0.73083639, "learning_rate": 3.977208844083865e-06, "loss": 0.75349814, "num_input_tokens_seen": 27124505, "step": 1275, "time_per_iteration": 2.75947904586792 }, { "auxiliary_loss_clip": 0.0121081, "auxiliary_loss_mlp": 0.01063745, "balance_loss_clip": 1.06740415, "balance_loss_mlp": 1.03694642, "epoch": 0.07671727040432888, "flos": 15267386355840.0, "grad_norm": 2.828157953752124, "language_loss": 0.79507053, "learning_rate": 3.9771501783216685e-06, "loss": 0.81781602, "num_input_tokens_seen": 27140960, "step": 1276, "time_per_iteration": 2.626683473587036 }, { "auxiliary_loss_clip": 0.01198279, "auxiliary_loss_mlp": 0.01058719, "balance_loss_clip": 1.06486118, "balance_loss_mlp": 1.03485298, "epoch": 0.07677739365699685, "flos": 28184094956160.0, "grad_norm": 2.406514987231471, "language_loss": 0.58915478, "learning_rate": 3.97709143758574e-06, "loss": 0.61172473, "num_input_tokens_seen": 27160985, "step": 1277, "time_per_iteration": 2.6684958934783936 }, { "auxiliary_loss_clip": 0.01201282, "auxiliary_loss_mlp": 0.01064396, "balance_loss_clip": 1.06430948, "balance_loss_mlp": 1.03919542, "epoch": 0.07683751690966481, "flos": 18296128126080.0, "grad_norm": 2.8024245322836046, "language_loss": 0.74957907, "learning_rate": 3.977032621878305e-06, "loss": 0.77223587, "num_input_tokens_seen": 27178390, "step": 1278, "time_per_iteration": 2.723675012588501 }, { "auxiliary_loss_clip": 0.01160972, "auxiliary_loss_mlp": 0.01063133, "balance_loss_clip": 1.0584681, "balance_loss_mlp": 1.0390408, "epoch": 0.07689764016233278, "flos": 21981101420160.0, "grad_norm": 5.339853944094037, "language_loss": 0.88594604, "learning_rate": 3.976973731201596e-06, "loss": 0.90818715, "num_input_tokens_seen": 27197505, "step": 1279, "time_per_iteration": 2.655036211013794 }, { "auxiliary_loss_clip": 0.01172627, "auxiliary_loss_mlp": 0.01066586, "balance_loss_clip": 1.06065845, "balance_loss_mlp": 1.04077685, "epoch": 0.07695776341500075, "flos": 22235995307520.0, "grad_norm": 2.4937131241937256, "language_loss": 0.8300451, "learning_rate": 3.976914765557845e-06, "loss": 0.85243726, "num_input_tokens_seen": 27214260, "step": 1280, "time_per_iteration": 2.7717065811157227 }, { "auxiliary_loss_clip": 0.01194022, "auxiliary_loss_mlp": 0.01066533, "balance_loss_clip": 1.06593037, "balance_loss_mlp": 1.04104638, "epoch": 0.07701788666766872, "flos": 16143750380160.0, "grad_norm": 2.044864943195716, "language_loss": 0.7581439, "learning_rate": 3.9768557249492875e-06, "loss": 0.78074944, "num_input_tokens_seen": 27232525, "step": 1281, "time_per_iteration": 2.7444865703582764 }, { "auxiliary_loss_clip": 0.01170775, "auxiliary_loss_mlp": 0.01062526, "balance_loss_clip": 1.05879402, "balance_loss_mlp": 1.03669322, "epoch": 0.07707800992033668, "flos": 19463045264640.0, "grad_norm": 1.8925477349429178, "language_loss": 0.75091648, "learning_rate": 3.9767966093781634e-06, "loss": 0.77324951, "num_input_tokens_seen": 27249800, "step": 1282, "time_per_iteration": 2.829145908355713 }, { "auxiliary_loss_clip": 0.01213222, "auxiliary_loss_mlp": 0.01071082, "balance_loss_clip": 1.07007408, "balance_loss_mlp": 1.04549992, "epoch": 0.07713813317300466, "flos": 18990281433600.0, "grad_norm": 2.1558853998977527, "language_loss": 0.83863324, "learning_rate": 3.976737418846713e-06, "loss": 0.8614763, "num_input_tokens_seen": 27268895, "step": 1283, "time_per_iteration": 2.6955173015594482 }, { "auxiliary_loss_clip": 0.0119621, "auxiliary_loss_mlp": 0.01066889, "balance_loss_clip": 1.06603825, "balance_loss_mlp": 1.03925657, "epoch": 0.07719825642567263, "flos": 18113953322880.0, "grad_norm": 2.520477290704422, "language_loss": 0.75147104, "learning_rate": 3.976678153357181e-06, "loss": 0.77410209, "num_input_tokens_seen": 27288180, "step": 1284, "time_per_iteration": 2.6589291095733643 }, { "auxiliary_loss_clip": 0.01182212, "auxiliary_loss_mlp": 0.01068485, "balance_loss_clip": 1.06304765, "balance_loss_mlp": 1.0438329, "epoch": 0.0772583796783406, "flos": 42194426993280.0, "grad_norm": 5.2953301239297295, "language_loss": 0.76224041, "learning_rate": 3.976618812911817e-06, "loss": 0.78474742, "num_input_tokens_seen": 27311815, "step": 1285, "time_per_iteration": 2.847702741622925 }, { "auxiliary_loss_clip": 0.01216302, "auxiliary_loss_mlp": 0.01071451, "balance_loss_clip": 1.07193899, "balance_loss_mlp": 1.04729891, "epoch": 0.07731850293100857, "flos": 24753692327040.0, "grad_norm": 2.0564733507641, "language_loss": 0.84193194, "learning_rate": 3.9765593975128685e-06, "loss": 0.86480945, "num_input_tokens_seen": 27331890, "step": 1286, "time_per_iteration": 2.713963270187378 }, { "auxiliary_loss_clip": 0.01180469, "auxiliary_loss_mlp": 0.01061062, "balance_loss_clip": 1.06331325, "balance_loss_mlp": 1.03646958, "epoch": 0.07737862618367654, "flos": 17565884628480.0, "grad_norm": 2.810253293244863, "language_loss": 0.76899689, "learning_rate": 3.97649990716259e-06, "loss": 0.79141217, "num_input_tokens_seen": 27348320, "step": 1287, "time_per_iteration": 2.669168472290039 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05891848, "balance_loss_mlp": 1.03696775, "epoch": 0.0774387494363445, "flos": 25627147349760.0, "grad_norm": 1.6525652726351308, "language_loss": 0.84699571, "learning_rate": 3.976440341863237e-06, "loss": 0.86936986, "num_input_tokens_seen": 27367670, "step": 1288, "time_per_iteration": 2.7794599533081055 }, { "auxiliary_loss_clip": 0.01206182, "auxiliary_loss_mlp": 0.0106604, "balance_loss_clip": 1.06214797, "balance_loss_mlp": 1.04203176, "epoch": 0.07749887268901248, "flos": 12239865648000.0, "grad_norm": 2.0424090794957523, "language_loss": 0.85576034, "learning_rate": 3.976380701617068e-06, "loss": 0.87848258, "num_input_tokens_seen": 27385485, "step": 1289, "time_per_iteration": 4.232934236526489 }, { "auxiliary_loss_clip": 0.01207527, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.06487668, "balance_loss_mlp": 1.0291574, "epoch": 0.07755899594168045, "flos": 25081736261760.0, "grad_norm": 2.840721047922519, "language_loss": 0.85548425, "learning_rate": 3.976320986426344e-06, "loss": 0.87808931, "num_input_tokens_seen": 27405110, "step": 1290, "time_per_iteration": 4.218302965164185 }, { "auxiliary_loss_clip": 0.0117374, "auxiliary_loss_mlp": 0.01066698, "balance_loss_clip": 1.06411862, "balance_loss_mlp": 1.04041266, "epoch": 0.07761911919434841, "flos": 14246410176000.0, "grad_norm": 2.3756178078405976, "language_loss": 0.91390574, "learning_rate": 3.9762611962933315e-06, "loss": 0.93631011, "num_input_tokens_seen": 27422855, "step": 1291, "time_per_iteration": 4.468304395675659 }, { "auxiliary_loss_clip": 0.01081301, "auxiliary_loss_mlp": 0.01043026, "balance_loss_clip": 1.04092944, "balance_loss_mlp": 1.03894901, "epoch": 0.07767924244701638, "flos": 67237202954880.0, "grad_norm": 0.8973948861970446, "language_loss": 0.65065891, "learning_rate": 3.9762013312202955e-06, "loss": 0.67190224, "num_input_tokens_seen": 27487190, "step": 1292, "time_per_iteration": 3.3142755031585693 }, { "auxiliary_loss_clip": 0.01195822, "auxiliary_loss_mlp": 0.01062751, "balance_loss_clip": 1.06527543, "balance_loss_mlp": 1.03846776, "epoch": 0.07773936569968436, "flos": 28550635292160.0, "grad_norm": 1.7595227960044768, "language_loss": 0.87530363, "learning_rate": 3.9761413912095075e-06, "loss": 0.89788938, "num_input_tokens_seen": 27510465, "step": 1293, "time_per_iteration": 2.801603078842163 }, { "auxiliary_loss_clip": 0.01116633, "auxiliary_loss_mlp": 0.01078659, "balance_loss_clip": 1.05041039, "balance_loss_mlp": 1.05012059, "epoch": 0.07779948895235232, "flos": 27490264871040.0, "grad_norm": 2.2898991349098528, "language_loss": 0.84518278, "learning_rate": 3.976081376263239e-06, "loss": 0.8671357, "num_input_tokens_seen": 27528645, "step": 1294, "time_per_iteration": 2.898597002029419 }, { "auxiliary_loss_clip": 0.01158796, "auxiliary_loss_mlp": 0.01059505, "balance_loss_clip": 1.05967593, "balance_loss_mlp": 1.0342207, "epoch": 0.07785961220502029, "flos": 18223301301120.0, "grad_norm": 2.7292442592472073, "language_loss": 0.79365373, "learning_rate": 3.976021286383768e-06, "loss": 0.81583679, "num_input_tokens_seen": 27546165, "step": 1295, "time_per_iteration": 2.8481552600860596 }, { "auxiliary_loss_clip": 0.01155886, "auxiliary_loss_mlp": 0.01061351, "balance_loss_clip": 1.06015158, "balance_loss_mlp": 1.0356493, "epoch": 0.07791973545768827, "flos": 24608218245120.0, "grad_norm": 3.472740252224496, "language_loss": 0.88351864, "learning_rate": 3.975961121573371e-06, "loss": 0.90569103, "num_input_tokens_seen": 27566520, "step": 1296, "time_per_iteration": 2.697831392288208 }, { "auxiliary_loss_clip": 0.0120756, "auxiliary_loss_mlp": 0.01074146, "balance_loss_clip": 1.06552935, "balance_loss_mlp": 1.04791999, "epoch": 0.07797985871035623, "flos": 14282069402880.0, "grad_norm": 2.384603846473911, "language_loss": 0.9625901, "learning_rate": 3.9759008818343305e-06, "loss": 0.98540717, "num_input_tokens_seen": 27581960, "step": 1297, "time_per_iteration": 2.62660551071167 }, { "auxiliary_loss_clip": 0.01175852, "auxiliary_loss_mlp": 0.01069298, "balance_loss_clip": 1.06147313, "balance_loss_mlp": 1.04517019, "epoch": 0.0780399819630242, "flos": 26610453141120.0, "grad_norm": 2.15152040651991, "language_loss": 0.7600193, "learning_rate": 3.97584056716893e-06, "loss": 0.78247076, "num_input_tokens_seen": 27601415, "step": 1298, "time_per_iteration": 2.8040499687194824 }, { "auxiliary_loss_clip": 0.0114505, "auxiliary_loss_mlp": 0.00783981, "balance_loss_clip": 1.05864501, "balance_loss_mlp": 1.0006063, "epoch": 0.07810010521569218, "flos": 21834514016640.0, "grad_norm": 1.6697657327886877, "language_loss": 0.8097105, "learning_rate": 3.9757801775794575e-06, "loss": 0.82900077, "num_input_tokens_seen": 27621490, "step": 1299, "time_per_iteration": 2.7667653560638428 }, { "auxiliary_loss_clip": 0.01162638, "auxiliary_loss_mlp": 0.01064395, "balance_loss_clip": 1.06191885, "balance_loss_mlp": 1.0393368, "epoch": 0.07816022846836014, "flos": 25081233471360.0, "grad_norm": 1.9748762517467437, "language_loss": 0.86755943, "learning_rate": 3.975719713068202e-06, "loss": 0.8898297, "num_input_tokens_seen": 27640600, "step": 1300, "time_per_iteration": 2.7819204330444336 }, { "auxiliary_loss_clip": 0.0120807, "auxiliary_loss_mlp": 0.01056805, "balance_loss_clip": 1.06663537, "balance_loss_mlp": 1.03180683, "epoch": 0.0782203517210281, "flos": 40917515431680.0, "grad_norm": 3.040560411644486, "language_loss": 0.71822268, "learning_rate": 3.975659173637458e-06, "loss": 0.74087137, "num_input_tokens_seen": 27663070, "step": 1301, "time_per_iteration": 2.845107316970825 }, { "auxiliary_loss_clip": 0.01196566, "auxiliary_loss_mlp": 0.01075534, "balance_loss_clip": 1.06426311, "balance_loss_mlp": 1.05100083, "epoch": 0.07828047497369607, "flos": 41172014269440.0, "grad_norm": 1.6425838754876312, "language_loss": 0.70782864, "learning_rate": 3.97559855928952e-06, "loss": 0.73054957, "num_input_tokens_seen": 27686425, "step": 1302, "time_per_iteration": 2.898069381713867 }, { "auxiliary_loss_clip": 0.01162032, "auxiliary_loss_mlp": 0.00783256, "balance_loss_clip": 1.06019354, "balance_loss_mlp": 1.00062823, "epoch": 0.07834059822636405, "flos": 23508130360320.0, "grad_norm": 2.067506704059933, "language_loss": 0.82100385, "learning_rate": 3.9755378700266864e-06, "loss": 0.84045678, "num_input_tokens_seen": 27704900, "step": 1303, "time_per_iteration": 2.7862839698791504 }, { "auxiliary_loss_clip": 0.01191742, "auxiliary_loss_mlp": 0.01074585, "balance_loss_clip": 1.06583321, "balance_loss_mlp": 1.04908574, "epoch": 0.07840072147903202, "flos": 20193899293440.0, "grad_norm": 1.8830773419754625, "language_loss": 0.75206572, "learning_rate": 3.9754771058512585e-06, "loss": 0.77472901, "num_input_tokens_seen": 27724890, "step": 1304, "time_per_iteration": 2.7380170822143555 }, { "auxiliary_loss_clip": 0.01211207, "auxiliary_loss_mlp": 0.01074343, "balance_loss_clip": 1.07114935, "balance_loss_mlp": 1.04922605, "epoch": 0.07846084473169998, "flos": 21360816432000.0, "grad_norm": 1.6118444643214749, "language_loss": 0.76141047, "learning_rate": 3.975416266765542e-06, "loss": 0.784266, "num_input_tokens_seen": 27743115, "step": 1305, "time_per_iteration": 2.6788928508758545 }, { "auxiliary_loss_clip": 0.01137547, "auxiliary_loss_mlp": 0.01064795, "balance_loss_clip": 1.05611205, "balance_loss_mlp": 1.04021358, "epoch": 0.07852096798436796, "flos": 25410965345280.0, "grad_norm": 1.9541638070229452, "language_loss": 0.85011744, "learning_rate": 3.975355352771841e-06, "loss": 0.87214082, "num_input_tokens_seen": 27763570, "step": 1306, "time_per_iteration": 3.048137903213501 }, { "auxiliary_loss_clip": 0.01194779, "auxiliary_loss_mlp": 0.01049822, "balance_loss_clip": 1.06754708, "balance_loss_mlp": 1.02668333, "epoch": 0.07858109123703592, "flos": 24571481610240.0, "grad_norm": 6.108459548145404, "language_loss": 0.90882134, "learning_rate": 3.975294363872468e-06, "loss": 0.93126732, "num_input_tokens_seen": 27780030, "step": 1307, "time_per_iteration": 3.1597135066986084 }, { "auxiliary_loss_clip": 0.01145989, "auxiliary_loss_mlp": 0.01060478, "balance_loss_clip": 1.05529833, "balance_loss_mlp": 1.034729, "epoch": 0.07864121448970389, "flos": 20698874645760.0, "grad_norm": 3.4991416096159136, "language_loss": 0.83695096, "learning_rate": 3.975233300069735e-06, "loss": 0.85901558, "num_input_tokens_seen": 27796225, "step": 1308, "time_per_iteration": 2.749174118041992 }, { "auxiliary_loss_clip": 0.01151044, "auxiliary_loss_mlp": 0.01061966, "balance_loss_clip": 1.05445218, "balance_loss_mlp": 1.03789735, "epoch": 0.07870133774237187, "flos": 22966526113920.0, "grad_norm": 1.7092634116882437, "language_loss": 0.77521002, "learning_rate": 3.975172161365958e-06, "loss": 0.7973401, "num_input_tokens_seen": 27815975, "step": 1309, "time_per_iteration": 2.752854108810425 }, { "auxiliary_loss_clip": 0.01200102, "auxiliary_loss_mlp": 0.01070583, "balance_loss_clip": 1.06396675, "balance_loss_mlp": 1.04449987, "epoch": 0.07876146099503983, "flos": 18842832103680.0, "grad_norm": 1.8729662604656268, "language_loss": 0.80561006, "learning_rate": 3.975110947763453e-06, "loss": 0.82831693, "num_input_tokens_seen": 27832255, "step": 1310, "time_per_iteration": 2.6966710090637207 }, { "auxiliary_loss_clip": 0.01173381, "auxiliary_loss_mlp": 0.0078245, "balance_loss_clip": 1.06193507, "balance_loss_mlp": 1.00060987, "epoch": 0.0788215842477078, "flos": 23805794367360.0, "grad_norm": 1.796715978968241, "language_loss": 0.73187977, "learning_rate": 3.9750496592645435e-06, "loss": 0.75143808, "num_input_tokens_seen": 27852180, "step": 1311, "time_per_iteration": 2.7588090896606445 }, { "auxiliary_loss_clip": 0.01188438, "auxiliary_loss_mlp": 0.01078546, "balance_loss_clip": 1.06358969, "balance_loss_mlp": 1.05342865, "epoch": 0.07888170750037576, "flos": 21579907438080.0, "grad_norm": 1.7490617386556226, "language_loss": 0.86002982, "learning_rate": 3.974988295871553e-06, "loss": 0.88269973, "num_input_tokens_seen": 27871435, "step": 1312, "time_per_iteration": 2.6969683170318604 }, { "auxiliary_loss_clip": 0.01178338, "auxiliary_loss_mlp": 0.01059112, "balance_loss_clip": 1.06324685, "balance_loss_mlp": 1.03633142, "epoch": 0.07894183075304374, "flos": 19864849777920.0, "grad_norm": 1.825664315845032, "language_loss": 0.82087892, "learning_rate": 3.9749268575868085e-06, "loss": 0.84325337, "num_input_tokens_seen": 27890625, "step": 1313, "time_per_iteration": 2.6936304569244385 }, { "auxiliary_loss_clip": 0.01184798, "auxiliary_loss_mlp": 0.00783631, "balance_loss_clip": 1.06229842, "balance_loss_mlp": 1.00053823, "epoch": 0.07900195400571171, "flos": 16143463071360.0, "grad_norm": 2.837190319075622, "language_loss": 0.73569417, "learning_rate": 3.97486534441264e-06, "loss": 0.75537837, "num_input_tokens_seen": 27906530, "step": 1314, "time_per_iteration": 2.653505325317383 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.00782352, "balance_loss_clip": 1.05730104, "balance_loss_mlp": 1.00044668, "epoch": 0.07906207725837967, "flos": 23730417676800.0, "grad_norm": 1.6153694611764058, "language_loss": 0.79490477, "learning_rate": 3.974803756351379e-06, "loss": 0.81427419, "num_input_tokens_seen": 27926725, "step": 1315, "time_per_iteration": 2.797306776046753 }, { "auxiliary_loss_clip": 0.01189107, "auxiliary_loss_mlp": 0.01060743, "balance_loss_clip": 1.05841756, "balance_loss_mlp": 1.03487444, "epoch": 0.07912220051104765, "flos": 24315905364480.0, "grad_norm": 1.6362349035659796, "language_loss": 0.73546493, "learning_rate": 3.974742093405362e-06, "loss": 0.75796348, "num_input_tokens_seen": 27947875, "step": 1316, "time_per_iteration": 2.688997507095337 }, { "auxiliary_loss_clip": 0.01162651, "auxiliary_loss_mlp": 0.01066617, "balance_loss_clip": 1.05845332, "balance_loss_mlp": 1.0418098, "epoch": 0.07918232376371562, "flos": 18880035615360.0, "grad_norm": 2.157376902111077, "language_loss": 0.65540409, "learning_rate": 3.974680355576927e-06, "loss": 0.67769682, "num_input_tokens_seen": 27965040, "step": 1317, "time_per_iteration": 2.6998519897460938 }, { "auxiliary_loss_clip": 0.01177674, "auxiliary_loss_mlp": 0.01068635, "balance_loss_clip": 1.06280386, "balance_loss_mlp": 1.0428021, "epoch": 0.07924244701638358, "flos": 27376284038400.0, "grad_norm": 2.382161374765057, "language_loss": 0.73105192, "learning_rate": 3.974618542868415e-06, "loss": 0.75351495, "num_input_tokens_seen": 27985330, "step": 1318, "time_per_iteration": 2.8350789546966553 }, { "auxiliary_loss_clip": 0.01139638, "auxiliary_loss_mlp": 0.01058798, "balance_loss_clip": 1.05582452, "balance_loss_mlp": 1.03515935, "epoch": 0.07930257026905156, "flos": 25120340403840.0, "grad_norm": 2.635941883481154, "language_loss": 0.90381306, "learning_rate": 3.97455665528217e-06, "loss": 0.92579746, "num_input_tokens_seen": 28007615, "step": 1319, "time_per_iteration": 2.8553895950317383 }, { "auxiliary_loss_clip": 0.01175059, "auxiliary_loss_mlp": 0.01055333, "balance_loss_clip": 1.05662942, "balance_loss_mlp": 1.03122926, "epoch": 0.07936269352171953, "flos": 21834478103040.0, "grad_norm": 1.9449065990449943, "language_loss": 0.80134505, "learning_rate": 3.974494692820539e-06, "loss": 0.82364893, "num_input_tokens_seen": 28027765, "step": 1320, "time_per_iteration": 2.6651997566223145 }, { "auxiliary_loss_clip": 0.01181808, "auxiliary_loss_mlp": 0.01060151, "balance_loss_clip": 1.06380332, "balance_loss_mlp": 1.03657198, "epoch": 0.07942281677438749, "flos": 16939889377920.0, "grad_norm": 2.1078540484546746, "language_loss": 0.6901226, "learning_rate": 3.974432655485872e-06, "loss": 0.71254218, "num_input_tokens_seen": 28044225, "step": 1321, "time_per_iteration": 2.6500401496887207 }, { "auxiliary_loss_clip": 0.01189002, "auxiliary_loss_mlp": 0.01060598, "balance_loss_clip": 1.06469131, "balance_loss_mlp": 1.03688753, "epoch": 0.07948294002705546, "flos": 18986941468800.0, "grad_norm": 1.9310950096267907, "language_loss": 0.8359012, "learning_rate": 3.9743705432805195e-06, "loss": 0.85839725, "num_input_tokens_seen": 28062915, "step": 1322, "time_per_iteration": 2.684978723526001 }, { "auxiliary_loss_clip": 0.01202147, "auxiliary_loss_mlp": 0.01057117, "balance_loss_clip": 1.06135976, "balance_loss_mlp": 1.03304851, "epoch": 0.07954306327972344, "flos": 21653452535040.0, "grad_norm": 2.128262121046283, "language_loss": 0.90555447, "learning_rate": 3.974308356206838e-06, "loss": 0.92814714, "num_input_tokens_seen": 28082175, "step": 1323, "time_per_iteration": 2.6192240715026855 }, { "auxiliary_loss_clip": 0.01164151, "auxiliary_loss_mlp": 0.01062303, "balance_loss_clip": 1.06272292, "balance_loss_mlp": 1.03809166, "epoch": 0.0796031865323914, "flos": 23220270766080.0, "grad_norm": 1.8373443631598505, "language_loss": 0.82521075, "learning_rate": 3.974246094267187e-06, "loss": 0.84747529, "num_input_tokens_seen": 28102645, "step": 1324, "time_per_iteration": 2.8283956050872803 }, { "auxiliary_loss_clip": 0.01180787, "auxiliary_loss_mlp": 0.01053463, "balance_loss_clip": 1.06256735, "balance_loss_mlp": 1.02834535, "epoch": 0.07966330978505937, "flos": 23294534135040.0, "grad_norm": 2.119290865165494, "language_loss": 0.79162025, "learning_rate": 3.974183757463925e-06, "loss": 0.8139627, "num_input_tokens_seen": 28122805, "step": 1325, "time_per_iteration": 2.6996092796325684 }, { "auxiliary_loss_clip": 0.01119286, "auxiliary_loss_mlp": 0.00785175, "balance_loss_clip": 1.04844928, "balance_loss_mlp": 1.00035501, "epoch": 0.07972343303772735, "flos": 18363783392640.0, "grad_norm": 2.2621745256944448, "language_loss": 0.88038248, "learning_rate": 3.974121345799418e-06, "loss": 0.89942712, "num_input_tokens_seen": 28140530, "step": 1326, "time_per_iteration": 2.881410837173462 }, { "auxiliary_loss_clip": 0.012, "auxiliary_loss_mlp": 0.01056877, "balance_loss_clip": 1.06257951, "balance_loss_mlp": 1.03168797, "epoch": 0.07978355629039531, "flos": 21762513204480.0, "grad_norm": 1.8538865301137586, "language_loss": 0.8328709, "learning_rate": 3.974058859276032e-06, "loss": 0.85543966, "num_input_tokens_seen": 28159640, "step": 1327, "time_per_iteration": 2.7277982234954834 }, { "auxiliary_loss_clip": 0.01207207, "auxiliary_loss_mlp": 0.01056886, "balance_loss_clip": 1.06532371, "balance_loss_mlp": 1.03223395, "epoch": 0.07984367954306328, "flos": 18551309322240.0, "grad_norm": 2.3216818645515636, "language_loss": 0.78599, "learning_rate": 3.9739962978961354e-06, "loss": 0.80863088, "num_input_tokens_seen": 28177050, "step": 1328, "time_per_iteration": 4.2137157917022705 }, { "auxiliary_loss_clip": 0.01201442, "auxiliary_loss_mlp": 0.01052053, "balance_loss_clip": 1.06778932, "balance_loss_mlp": 1.02722156, "epoch": 0.07990380279573125, "flos": 16904050583040.0, "grad_norm": 4.209530911932697, "language_loss": 0.73918134, "learning_rate": 3.973933661662101e-06, "loss": 0.76171625, "num_input_tokens_seen": 28193245, "step": 1329, "time_per_iteration": 5.853717565536499 }, { "auxiliary_loss_clip": 0.01169795, "auxiliary_loss_mlp": 0.01064631, "balance_loss_clip": 1.06039059, "balance_loss_mlp": 1.04069376, "epoch": 0.07996392604839922, "flos": 24098358643200.0, "grad_norm": 1.6102544328312476, "language_loss": 0.81743932, "learning_rate": 3.973870950576305e-06, "loss": 0.83978355, "num_input_tokens_seen": 28213570, "step": 1330, "time_per_iteration": 4.307915687561035 }, { "auxiliary_loss_clip": 0.01205148, "auxiliary_loss_mlp": 0.00780735, "balance_loss_clip": 1.06445098, "balance_loss_mlp": 1.00030971, "epoch": 0.08002404930106718, "flos": 14278729438080.0, "grad_norm": 3.0935981151455865, "language_loss": 0.88962448, "learning_rate": 3.9738081646411255e-06, "loss": 0.90948325, "num_input_tokens_seen": 28229980, "step": 1331, "time_per_iteration": 2.645198345184326 }, { "auxiliary_loss_clip": 0.01196019, "auxiliary_loss_mlp": 0.00781409, "balance_loss_clip": 1.05950165, "balance_loss_mlp": 1.00032377, "epoch": 0.08008417255373516, "flos": 40406219285760.0, "grad_norm": 1.8933982437719925, "language_loss": 0.7335732, "learning_rate": 3.973745303858942e-06, "loss": 0.75334752, "num_input_tokens_seen": 28253840, "step": 1332, "time_per_iteration": 2.792128562927246 }, { "auxiliary_loss_clip": 0.01180359, "auxiliary_loss_mlp": 0.01055118, "balance_loss_clip": 1.06217384, "balance_loss_mlp": 1.03216982, "epoch": 0.08014429580640313, "flos": 18478913460480.0, "grad_norm": 1.7464568676953767, "language_loss": 0.82765031, "learning_rate": 3.973682368232138e-06, "loss": 0.85000509, "num_input_tokens_seen": 28271675, "step": 1333, "time_per_iteration": 2.635579824447632 }, { "auxiliary_loss_clip": 0.01160554, "auxiliary_loss_mlp": 0.01059025, "balance_loss_clip": 1.05944169, "balance_loss_mlp": 1.03502798, "epoch": 0.0802044190590711, "flos": 22053461368320.0, "grad_norm": 2.677615191761892, "language_loss": 0.74862051, "learning_rate": 3.9736193577631015e-06, "loss": 0.77081633, "num_input_tokens_seen": 28291850, "step": 1334, "time_per_iteration": 2.8150298595428467 }, { "auxiliary_loss_clip": 0.01176175, "auxiliary_loss_mlp": 0.01063593, "balance_loss_clip": 1.06460369, "balance_loss_mlp": 1.04010868, "epoch": 0.08026454231173906, "flos": 24572128055040.0, "grad_norm": 1.8723728369534094, "language_loss": 0.79970533, "learning_rate": 3.973556272454221e-06, "loss": 0.82210302, "num_input_tokens_seen": 28310780, "step": 1335, "time_per_iteration": 2.6858503818511963 }, { "auxiliary_loss_clip": 0.01068232, "auxiliary_loss_mlp": 0.01020395, "balance_loss_clip": 1.04101062, "balance_loss_mlp": 1.01693749, "epoch": 0.08032466556440704, "flos": 52581841459200.0, "grad_norm": 0.7491611763509133, "language_loss": 0.56056821, "learning_rate": 3.973493112307889e-06, "loss": 0.58145452, "num_input_tokens_seen": 28369985, "step": 1336, "time_per_iteration": 3.324230670928955 }, { "auxiliary_loss_clip": 0.01179495, "auxiliary_loss_mlp": 0.01064433, "balance_loss_clip": 1.06005239, "balance_loss_mlp": 1.04149771, "epoch": 0.080384788817075, "flos": 23842602829440.0, "grad_norm": 2.8990759307469256, "language_loss": 0.67587668, "learning_rate": 3.9734298773265005e-06, "loss": 0.69831598, "num_input_tokens_seen": 28388670, "step": 1337, "time_per_iteration": 2.755451202392578 }, { "auxiliary_loss_clip": 0.01171763, "auxiliary_loss_mlp": 0.0107788, "balance_loss_clip": 1.06270492, "balance_loss_mlp": 1.05304837, "epoch": 0.08044491206974297, "flos": 25300719527040.0, "grad_norm": 1.9421039451316542, "language_loss": 0.86847901, "learning_rate": 3.973366567512453e-06, "loss": 0.89097536, "num_input_tokens_seen": 28411845, "step": 1338, "time_per_iteration": 2.758418560028076 }, { "auxiliary_loss_clip": 0.01136344, "auxiliary_loss_mlp": 0.01082295, "balance_loss_clip": 1.04883683, "balance_loss_mlp": 1.05596161, "epoch": 0.08050503532241095, "flos": 22376549226240.0, "grad_norm": 2.4557709650828157, "language_loss": 0.87217385, "learning_rate": 3.973303182868147e-06, "loss": 0.89436018, "num_input_tokens_seen": 28427875, "step": 1339, "time_per_iteration": 2.72682785987854 }, { "auxiliary_loss_clip": 0.01188632, "auxiliary_loss_mlp": 0.01055953, "balance_loss_clip": 1.06334567, "balance_loss_mlp": 1.03417385, "epoch": 0.08056515857507891, "flos": 18369421827840.0, "grad_norm": 10.603370056653041, "language_loss": 0.89504963, "learning_rate": 3.973239723395988e-06, "loss": 0.91749549, "num_input_tokens_seen": 28446615, "step": 1340, "time_per_iteration": 2.639601469039917 }, { "auxiliary_loss_clip": 0.01080107, "auxiliary_loss_mlp": 0.01012224, "balance_loss_clip": 1.02943289, "balance_loss_mlp": 1.00850451, "epoch": 0.08062528182774688, "flos": 51348130980480.0, "grad_norm": 0.8861598592181924, "language_loss": 0.64834231, "learning_rate": 3.97317618909838e-06, "loss": 0.66926563, "num_input_tokens_seen": 28505290, "step": 1341, "time_per_iteration": 3.0625648498535156 }, { "auxiliary_loss_clip": 0.01197538, "auxiliary_loss_mlp": 0.01061885, "balance_loss_clip": 1.0628854, "balance_loss_mlp": 1.0364095, "epoch": 0.08068540508041486, "flos": 17599712261760.0, "grad_norm": 3.3156125209451286, "language_loss": 0.89471233, "learning_rate": 3.973112579977733e-06, "loss": 0.9173066, "num_input_tokens_seen": 28522735, "step": 1342, "time_per_iteration": 2.6123783588409424 }, { "auxiliary_loss_clip": 0.01177687, "auxiliary_loss_mlp": 0.01062063, "balance_loss_clip": 1.0644995, "balance_loss_mlp": 1.03818512, "epoch": 0.08074552833308282, "flos": 10561185486720.0, "grad_norm": 2.2904075751929365, "language_loss": 0.76354575, "learning_rate": 3.973048896036459e-06, "loss": 0.78594327, "num_input_tokens_seen": 28539460, "step": 1343, "time_per_iteration": 2.7564918994903564 }, { "auxiliary_loss_clip": 0.01064182, "auxiliary_loss_mlp": 0.01010488, "balance_loss_clip": 1.02542567, "balance_loss_mlp": 1.0066731, "epoch": 0.08080565158575079, "flos": 60840254954880.0, "grad_norm": 0.8071281523255156, "language_loss": 0.57418531, "learning_rate": 3.972985137276974e-06, "loss": 0.59493202, "num_input_tokens_seen": 28599855, "step": 1344, "time_per_iteration": 3.170443058013916 }, { "auxiliary_loss_clip": 0.01158029, "auxiliary_loss_mlp": 0.01063108, "balance_loss_clip": 1.05839872, "balance_loss_mlp": 1.03846788, "epoch": 0.08086577483841875, "flos": 18332361970560.0, "grad_norm": 2.5953739346171676, "language_loss": 0.86569476, "learning_rate": 3.972921303701695e-06, "loss": 0.88790607, "num_input_tokens_seen": 28617585, "step": 1345, "time_per_iteration": 2.765254497528076 }, { "auxiliary_loss_clip": 0.01203428, "auxiliary_loss_mlp": 0.01057879, "balance_loss_clip": 1.06629944, "balance_loss_mlp": 1.03603959, "epoch": 0.08092589809108673, "flos": 21543601766400.0, "grad_norm": 1.8653844332842058, "language_loss": 0.87646407, "learning_rate": 3.972857395313042e-06, "loss": 0.89907712, "num_input_tokens_seen": 28636355, "step": 1346, "time_per_iteration": 2.655611991882324 }, { "auxiliary_loss_clip": 0.01191822, "auxiliary_loss_mlp": 0.0105414, "balance_loss_clip": 1.06450033, "balance_loss_mlp": 1.03047693, "epoch": 0.0809860213437547, "flos": 22128012046080.0, "grad_norm": 1.7047476553504466, "language_loss": 0.9298563, "learning_rate": 3.972793412113439e-06, "loss": 0.95231593, "num_input_tokens_seen": 28656260, "step": 1347, "time_per_iteration": 2.718355417251587 }, { "auxiliary_loss_clip": 0.01188696, "auxiliary_loss_mlp": 0.01066703, "balance_loss_clip": 1.06260633, "balance_loss_mlp": 1.04144263, "epoch": 0.08104614459642266, "flos": 21725489260800.0, "grad_norm": 1.9307860049130865, "language_loss": 0.89506733, "learning_rate": 3.972729354105312e-06, "loss": 0.91762137, "num_input_tokens_seen": 28675865, "step": 1348, "time_per_iteration": 2.763735771179199 }, { "auxiliary_loss_clip": 0.01137961, "auxiliary_loss_mlp": 0.01059733, "balance_loss_clip": 1.06026649, "balance_loss_mlp": 1.03730989, "epoch": 0.08110626784909064, "flos": 23951878980480.0, "grad_norm": 1.6214351378274148, "language_loss": 0.76906884, "learning_rate": 3.97266522129109e-06, "loss": 0.79104578, "num_input_tokens_seen": 28696255, "step": 1349, "time_per_iteration": 2.778050661087036 }, { "auxiliary_loss_clip": 0.01202122, "auxiliary_loss_mlp": 0.01065092, "balance_loss_clip": 1.06290889, "balance_loss_mlp": 1.04144049, "epoch": 0.0811663911017586, "flos": 19025689265280.0, "grad_norm": 1.777484449358279, "language_loss": 0.8877703, "learning_rate": 3.972601013673205e-06, "loss": 0.91044247, "num_input_tokens_seen": 28713905, "step": 1350, "time_per_iteration": 2.5871450901031494 }, { "auxiliary_loss_clip": 0.01164889, "auxiliary_loss_mlp": 0.00780958, "balance_loss_clip": 1.06011164, "balance_loss_mlp": 1.00028801, "epoch": 0.08122651435442657, "flos": 15341290588800.0, "grad_norm": 2.7472756845793156, "language_loss": 0.82298493, "learning_rate": 3.972536731254092e-06, "loss": 0.84244347, "num_input_tokens_seen": 28732075, "step": 1351, "time_per_iteration": 2.840271234512329 }, { "auxiliary_loss_clip": 0.01198177, "auxiliary_loss_mlp": 0.01055773, "balance_loss_clip": 1.06010592, "balance_loss_mlp": 1.03090644, "epoch": 0.08128663760709455, "flos": 23221563655680.0, "grad_norm": 2.2808101252466724, "language_loss": 0.75274944, "learning_rate": 3.972472374036189e-06, "loss": 0.775289, "num_input_tokens_seen": 28751150, "step": 1352, "time_per_iteration": 2.733644485473633 }, { "auxiliary_loss_clip": 0.01194643, "auxiliary_loss_mlp": 0.00783595, "balance_loss_clip": 1.06613326, "balance_loss_mlp": 1.00036311, "epoch": 0.08134676085976252, "flos": 22965628273920.0, "grad_norm": 1.678520960707938, "language_loss": 0.82936156, "learning_rate": 3.972407942021935e-06, "loss": 0.84914398, "num_input_tokens_seen": 28773360, "step": 1353, "time_per_iteration": 2.742149829864502 }, { "auxiliary_loss_clip": 0.01068236, "auxiliary_loss_mlp": 0.01015932, "balance_loss_clip": 1.02440155, "balance_loss_mlp": 1.01242769, "epoch": 0.08140688411243048, "flos": 64322115816960.0, "grad_norm": 0.8516312511934722, "language_loss": 0.59741521, "learning_rate": 3.972343435213775e-06, "loss": 0.61825693, "num_input_tokens_seen": 28833390, "step": 1354, "time_per_iteration": 3.1912426948547363 }, { "auxiliary_loss_clip": 0.01150343, "auxiliary_loss_mlp": 0.01058874, "balance_loss_clip": 1.0546236, "balance_loss_mlp": 1.03583086, "epoch": 0.08146700736509845, "flos": 22491858862080.0, "grad_norm": 2.1234068486581643, "language_loss": 0.82310611, "learning_rate": 3.972278853614154e-06, "loss": 0.84519827, "num_input_tokens_seen": 28852430, "step": 1355, "time_per_iteration": 2.782442808151245 }, { "auxiliary_loss_clip": 0.01186948, "auxiliary_loss_mlp": 0.01062856, "balance_loss_clip": 1.0600667, "balance_loss_mlp": 1.03801262, "epoch": 0.08152713061776642, "flos": 20447823513600.0, "grad_norm": 1.8366299277102565, "language_loss": 0.7135247, "learning_rate": 3.972214197225521e-06, "loss": 0.73602271, "num_input_tokens_seen": 28870685, "step": 1356, "time_per_iteration": 2.7777554988861084 }, { "auxiliary_loss_clip": 0.01194666, "auxiliary_loss_mlp": 0.01056522, "balance_loss_clip": 1.06462216, "balance_loss_mlp": 1.03259718, "epoch": 0.08158725387043439, "flos": 23550218121600.0, "grad_norm": 2.050923525150184, "language_loss": 0.70426142, "learning_rate": 3.972149466050329e-06, "loss": 0.72677326, "num_input_tokens_seen": 28889860, "step": 1357, "time_per_iteration": 2.852046012878418 }, { "auxiliary_loss_clip": 0.01186996, "auxiliary_loss_mlp": 0.01054475, "balance_loss_clip": 1.06138206, "balance_loss_mlp": 1.03070426, "epoch": 0.08164737712310235, "flos": 22017335264640.0, "grad_norm": 2.634204556872777, "language_loss": 0.84203482, "learning_rate": 3.97208466009103e-06, "loss": 0.8644495, "num_input_tokens_seen": 28905865, "step": 1358, "time_per_iteration": 2.7127115726470947 }, { "auxiliary_loss_clip": 0.01176629, "auxiliary_loss_mlp": 0.010566, "balance_loss_clip": 1.06037402, "balance_loss_mlp": 1.03154182, "epoch": 0.08170750037577033, "flos": 23367827836800.0, "grad_norm": 2.1726272773281097, "language_loss": 1.02781308, "learning_rate": 3.972019779350084e-06, "loss": 1.05014539, "num_input_tokens_seen": 28925250, "step": 1359, "time_per_iteration": 2.7171826362609863 }, { "auxiliary_loss_clip": 0.01128357, "auxiliary_loss_mlp": 0.01056774, "balance_loss_clip": 1.05009234, "balance_loss_mlp": 1.03263426, "epoch": 0.0817676236284383, "flos": 28397978490240.0, "grad_norm": 2.0494617207464945, "language_loss": 0.8313604, "learning_rate": 3.971954823829951e-06, "loss": 0.85321164, "num_input_tokens_seen": 28943445, "step": 1360, "time_per_iteration": 2.9020919799804688 }, { "auxiliary_loss_clip": 0.01202956, "auxiliary_loss_mlp": 0.0106887, "balance_loss_clip": 1.06274688, "balance_loss_mlp": 1.04469395, "epoch": 0.08182774688110626, "flos": 19208905562880.0, "grad_norm": 5.2377005088202075, "language_loss": 0.72322488, "learning_rate": 3.971889793533093e-06, "loss": 0.74594313, "num_input_tokens_seen": 28962695, "step": 1361, "time_per_iteration": 2.6643178462982178 }, { "auxiliary_loss_clip": 0.01166556, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.0552367, "balance_loss_mlp": 1.03184962, "epoch": 0.08188787013377424, "flos": 22784099915520.0, "grad_norm": 28.302545492028134, "language_loss": 0.76657653, "learning_rate": 3.971824688461976e-06, "loss": 0.78880513, "num_input_tokens_seen": 28982120, "step": 1362, "time_per_iteration": 2.7439064979553223 }, { "auxiliary_loss_clip": 0.01199728, "auxiliary_loss_mlp": 0.01053492, "balance_loss_clip": 1.06350708, "balance_loss_mlp": 1.03104496, "epoch": 0.08194799338644221, "flos": 16468095214080.0, "grad_norm": 2.1850191919210338, "language_loss": 0.72384715, "learning_rate": 3.971759508619069e-06, "loss": 0.74637932, "num_input_tokens_seen": 28998100, "step": 1363, "time_per_iteration": 2.7082791328430176 }, { "auxiliary_loss_clip": 0.01202887, "auxiliary_loss_mlp": 0.01066374, "balance_loss_clip": 1.06580126, "balance_loss_mlp": 1.04083955, "epoch": 0.08200811663911017, "flos": 23913633974400.0, "grad_norm": 2.142285699657122, "language_loss": 0.7726444, "learning_rate": 3.971694254006844e-06, "loss": 0.79533696, "num_input_tokens_seen": 29017095, "step": 1364, "time_per_iteration": 2.777156114578247 }, { "auxiliary_loss_clip": 0.01135428, "auxiliary_loss_mlp": 0.01063854, "balance_loss_clip": 1.05182433, "balance_loss_mlp": 1.03645968, "epoch": 0.08206823989177814, "flos": 17896550256000.0, "grad_norm": 1.85589982882842, "language_loss": 0.82242119, "learning_rate": 3.971628924627776e-06, "loss": 0.844414, "num_input_tokens_seen": 29037240, "step": 1365, "time_per_iteration": 2.8192803859710693 }, { "auxiliary_loss_clip": 0.01196582, "auxiliary_loss_mlp": 0.01059945, "balance_loss_clip": 1.07006347, "balance_loss_mlp": 1.03706884, "epoch": 0.08212836314444612, "flos": 22088186841600.0, "grad_norm": 1.7803424706125983, "language_loss": 0.82062519, "learning_rate": 3.97156352048434e-06, "loss": 0.84319043, "num_input_tokens_seen": 29056250, "step": 1366, "time_per_iteration": 2.7482311725616455 }, { "auxiliary_loss_clip": 0.01153262, "auxiliary_loss_mlp": 0.0107233, "balance_loss_clip": 1.05320215, "balance_loss_mlp": 1.04779685, "epoch": 0.08218848639711408, "flos": 17597485618560.0, "grad_norm": 2.010209091244133, "language_loss": 0.81944495, "learning_rate": 3.97149804157902e-06, "loss": 0.84170091, "num_input_tokens_seen": 29073380, "step": 1367, "time_per_iteration": 4.352729797363281 }, { "auxiliary_loss_clip": 0.01206125, "auxiliary_loss_mlp": 0.01066888, "balance_loss_clip": 1.06541765, "balance_loss_mlp": 1.04241478, "epoch": 0.08224860964978205, "flos": 17857838373120.0, "grad_norm": 2.518996379768439, "language_loss": 0.8331567, "learning_rate": 3.9714324879142946e-06, "loss": 0.85588682, "num_input_tokens_seen": 29091330, "step": 1368, "time_per_iteration": 6.077457666397095 }, { "auxiliary_loss_clip": 0.01159992, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.06314564, "balance_loss_mlp": 1.02790344, "epoch": 0.08230873290245003, "flos": 25227533566080.0, "grad_norm": 3.198110530618569, "language_loss": 0.81336468, "learning_rate": 3.971366859492653e-06, "loss": 0.8354634, "num_input_tokens_seen": 29110375, "step": 1369, "time_per_iteration": 2.769972085952759 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.05438268, "balance_loss_mlp": 1.00027657, "epoch": 0.08236885615511799, "flos": 31759935753600.0, "grad_norm": 2.610758273724768, "language_loss": 0.74818152, "learning_rate": 3.971301156316582e-06, "loss": 0.76737428, "num_input_tokens_seen": 29129395, "step": 1370, "time_per_iteration": 4.497304201126099 }, { "auxiliary_loss_clip": 0.0115498, "auxiliary_loss_mlp": 0.01064278, "balance_loss_clip": 1.06403351, "balance_loss_mlp": 1.03987551, "epoch": 0.08242897940778596, "flos": 23185832601600.0, "grad_norm": 1.5246391685186451, "language_loss": 0.7398203, "learning_rate": 3.971235378388573e-06, "loss": 0.76201284, "num_input_tokens_seen": 29148650, "step": 1371, "time_per_iteration": 2.758089065551758 }, { "auxiliary_loss_clip": 0.01097162, "auxiliary_loss_mlp": 0.0106614, "balance_loss_clip": 1.05124569, "balance_loss_mlp": 1.04098701, "epoch": 0.08248910266045394, "flos": 34491480393600.0, "grad_norm": 1.9670948823939327, "language_loss": 0.70851803, "learning_rate": 3.971169525711122e-06, "loss": 0.73015106, "num_input_tokens_seen": 29170785, "step": 1372, "time_per_iteration": 4.069301605224609 }, { "auxiliary_loss_clip": 0.01162292, "auxiliary_loss_mlp": 0.01056859, "balance_loss_clip": 1.0571332, "balance_loss_mlp": 1.03261209, "epoch": 0.0825492259131219, "flos": 13436228960640.0, "grad_norm": 2.750431245604494, "language_loss": 0.88363653, "learning_rate": 3.9711035982867246e-06, "loss": 0.905828, "num_input_tokens_seen": 29185210, "step": 1373, "time_per_iteration": 3.9346964359283447 }, { "auxiliary_loss_clip": 0.01147291, "auxiliary_loss_mlp": 0.01062343, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.03878665, "epoch": 0.08260934916578987, "flos": 25812446636160.0, "grad_norm": 2.128923272573014, "language_loss": 0.82465184, "learning_rate": 3.971037596117882e-06, "loss": 0.84674811, "num_input_tokens_seen": 29205210, "step": 1374, "time_per_iteration": 2.933377981185913 }, { "auxiliary_loss_clip": 0.01044322, "auxiliary_loss_mlp": 0.01017124, "balance_loss_clip": 1.03154135, "balance_loss_mlp": 1.0135479, "epoch": 0.08266947241845783, "flos": 63460009491840.0, "grad_norm": 0.8272339650193923, "language_loss": 0.60641956, "learning_rate": 3.970971519207095e-06, "loss": 0.62703401, "num_input_tokens_seen": 29265350, "step": 1375, "time_per_iteration": 3.3287038803100586 }, { "auxiliary_loss_clip": 0.01060461, "auxiliary_loss_mlp": 0.01013653, "balance_loss_clip": 1.02398169, "balance_loss_mlp": 1.01017237, "epoch": 0.08272959567112581, "flos": 69993704568960.0, "grad_norm": 0.9162492148708097, "language_loss": 0.62171799, "learning_rate": 3.970905367556871e-06, "loss": 0.64245915, "num_input_tokens_seen": 29321475, "step": 1376, "time_per_iteration": 3.218834161758423 }, { "auxiliary_loss_clip": 0.01159103, "auxiliary_loss_mlp": 0.0106347, "balance_loss_clip": 1.06229186, "balance_loss_mlp": 1.03942561, "epoch": 0.08278971892379378, "flos": 20413205781120.0, "grad_norm": 1.9191670647860084, "language_loss": 0.82577401, "learning_rate": 3.970839141169718e-06, "loss": 0.84799975, "num_input_tokens_seen": 29341405, "step": 1377, "time_per_iteration": 2.8763558864593506 }, { "auxiliary_loss_clip": 0.01176967, "auxiliary_loss_mlp": 0.01054072, "balance_loss_clip": 1.06486619, "balance_loss_mlp": 1.03011107, "epoch": 0.08284984217646174, "flos": 26250233598720.0, "grad_norm": 1.915539507671093, "language_loss": 0.84923226, "learning_rate": 3.970772840048147e-06, "loss": 0.87154263, "num_input_tokens_seen": 29361955, "step": 1378, "time_per_iteration": 2.8232595920562744 }, { "auxiliary_loss_clip": 0.01185329, "auxiliary_loss_mlp": 0.01058999, "balance_loss_clip": 1.06043923, "balance_loss_mlp": 1.0344305, "epoch": 0.08290996542912972, "flos": 27194683852800.0, "grad_norm": 6.4689921779024795, "language_loss": 0.87319231, "learning_rate": 3.970706464194672e-06, "loss": 0.8956356, "num_input_tokens_seen": 29382395, "step": 1379, "time_per_iteration": 2.756082534790039 }, { "auxiliary_loss_clip": 0.01158173, "auxiliary_loss_mlp": 0.01061479, "balance_loss_clip": 1.05779433, "balance_loss_mlp": 1.03829277, "epoch": 0.08297008868179769, "flos": 38618191146240.0, "grad_norm": 2.078993196749275, "language_loss": 0.78545237, "learning_rate": 3.970640013611812e-06, "loss": 0.8076489, "num_input_tokens_seen": 29404460, "step": 1380, "time_per_iteration": 2.9525601863861084 }, { "auxiliary_loss_clip": 0.01183492, "auxiliary_loss_mlp": 0.01059448, "balance_loss_clip": 1.06308961, "balance_loss_mlp": 1.0344255, "epoch": 0.08303021193446565, "flos": 19974736460160.0, "grad_norm": 2.6608111668609697, "language_loss": 0.86125714, "learning_rate": 3.970573488302083e-06, "loss": 0.88368654, "num_input_tokens_seen": 29422675, "step": 1381, "time_per_iteration": 2.735203742980957 }, { "auxiliary_loss_clip": 0.01197152, "auxiliary_loss_mlp": 0.00781814, "balance_loss_clip": 1.06611753, "balance_loss_mlp": 1.00034571, "epoch": 0.08309033518713363, "flos": 13662646341120.0, "grad_norm": 2.9433398182948203, "language_loss": 0.87471211, "learning_rate": 3.970506888268011e-06, "loss": 0.89450181, "num_input_tokens_seen": 29439840, "step": 1382, "time_per_iteration": 2.6392617225646973 }, { "auxiliary_loss_clip": 0.0115996, "auxiliary_loss_mlp": 0.01055463, "balance_loss_clip": 1.06138313, "balance_loss_mlp": 1.03337312, "epoch": 0.0831504584398016, "flos": 17968551068160.0, "grad_norm": 1.9901989904031434, "language_loss": 0.77085757, "learning_rate": 3.970440213512121e-06, "loss": 0.79301178, "num_input_tokens_seen": 29457360, "step": 1383, "time_per_iteration": 2.756565809249878 }, { "auxiliary_loss_clip": 0.01191549, "auxiliary_loss_mlp": 0.01058014, "balance_loss_clip": 1.06211782, "balance_loss_mlp": 1.03395748, "epoch": 0.08321058169246956, "flos": 22601386408320.0, "grad_norm": 1.818236548161018, "language_loss": 0.82858944, "learning_rate": 3.97037346403694e-06, "loss": 0.85108507, "num_input_tokens_seen": 29477040, "step": 1384, "time_per_iteration": 2.7848587036132812 }, { "auxiliary_loss_clip": 0.01148661, "auxiliary_loss_mlp": 0.01063605, "balance_loss_clip": 1.05671442, "balance_loss_mlp": 1.03610373, "epoch": 0.08327070494513754, "flos": 22850426378880.0, "grad_norm": 3.9982776391866346, "language_loss": 0.85219657, "learning_rate": 3.970306639845e-06, "loss": 0.8743192, "num_input_tokens_seen": 29492010, "step": 1385, "time_per_iteration": 2.803893566131592 }, { "auxiliary_loss_clip": 0.01157001, "auxiliary_loss_mlp": 0.01061891, "balance_loss_clip": 1.05823874, "balance_loss_mlp": 1.03750122, "epoch": 0.0833308281978055, "flos": 22782986593920.0, "grad_norm": 1.7071515381676081, "language_loss": 0.69195282, "learning_rate": 3.970239740938835e-06, "loss": 0.71414173, "num_input_tokens_seen": 29511850, "step": 1386, "time_per_iteration": 3.004786252975464 }, { "auxiliary_loss_clip": 0.01172803, "auxiliary_loss_mlp": 0.01058809, "balance_loss_clip": 1.05489016, "balance_loss_mlp": 1.03483546, "epoch": 0.08339095145047347, "flos": 20812604083200.0, "grad_norm": 1.672791522425571, "language_loss": 0.81894958, "learning_rate": 3.97017276732098e-06, "loss": 0.84126568, "num_input_tokens_seen": 29531415, "step": 1387, "time_per_iteration": 2.7678542137145996 }, { "auxiliary_loss_clip": 0.01179554, "auxiliary_loss_mlp": 0.01074251, "balance_loss_clip": 1.06179345, "balance_loss_mlp": 1.04817975, "epoch": 0.08345107470314143, "flos": 18515326872960.0, "grad_norm": 2.071322011459688, "language_loss": 0.77205479, "learning_rate": 3.970105718993978e-06, "loss": 0.7945928, "num_input_tokens_seen": 29549525, "step": 1388, "time_per_iteration": 2.8246304988861084 }, { "auxiliary_loss_clip": 0.01130856, "auxiliary_loss_mlp": 0.01062414, "balance_loss_clip": 1.05684018, "balance_loss_mlp": 1.03742766, "epoch": 0.08351119795580941, "flos": 18807567926400.0, "grad_norm": 2.0255270252506636, "language_loss": 0.79527366, "learning_rate": 3.970038595960369e-06, "loss": 0.81720638, "num_input_tokens_seen": 29568705, "step": 1389, "time_per_iteration": 2.8606414794921875 }, { "auxiliary_loss_clip": 0.01172785, "auxiliary_loss_mlp": 0.01064077, "balance_loss_clip": 1.05787444, "balance_loss_mlp": 1.03923428, "epoch": 0.08357132120847738, "flos": 18441817689600.0, "grad_norm": 2.546615132743645, "language_loss": 0.87427586, "learning_rate": 3.969971398222699e-06, "loss": 0.89664447, "num_input_tokens_seen": 29585855, "step": 1390, "time_per_iteration": 2.795931577682495 }, { "auxiliary_loss_clip": 0.01160426, "auxiliary_loss_mlp": 0.01067723, "balance_loss_clip": 1.05447149, "balance_loss_mlp": 1.04082966, "epoch": 0.08363144446114534, "flos": 25922333318400.0, "grad_norm": 1.8703157168219726, "language_loss": 0.86833143, "learning_rate": 3.969904125783517e-06, "loss": 0.89061296, "num_input_tokens_seen": 29607280, "step": 1391, "time_per_iteration": 2.811598062515259 }, { "auxiliary_loss_clip": 0.01156119, "auxiliary_loss_mlp": 0.01076482, "balance_loss_clip": 1.05575848, "balance_loss_mlp": 1.05180562, "epoch": 0.08369156771381332, "flos": 18041306065920.0, "grad_norm": 3.7979396758909263, "language_loss": 0.87688571, "learning_rate": 3.969836778645371e-06, "loss": 0.89921176, "num_input_tokens_seen": 29624130, "step": 1392, "time_per_iteration": 2.776819944381714 }, { "auxiliary_loss_clip": 0.01183316, "auxiliary_loss_mlp": 0.01058545, "balance_loss_clip": 1.05830503, "balance_loss_mlp": 1.03500128, "epoch": 0.08375169096648129, "flos": 22675111073280.0, "grad_norm": 8.95243370865895, "language_loss": 0.80574775, "learning_rate": 3.969769356810819e-06, "loss": 0.82816637, "num_input_tokens_seen": 29643210, "step": 1393, "time_per_iteration": 2.735761880874634 }, { "auxiliary_loss_clip": 0.01197686, "auxiliary_loss_mlp": 0.01058125, "balance_loss_clip": 1.06329441, "balance_loss_mlp": 1.03466487, "epoch": 0.08381181421914925, "flos": 26103215232000.0, "grad_norm": 1.7485261130451684, "language_loss": 0.85064757, "learning_rate": 3.969701860282415e-06, "loss": 0.87320572, "num_input_tokens_seen": 29663920, "step": 1394, "time_per_iteration": 2.950211524963379 }, { "auxiliary_loss_clip": 0.01145594, "auxiliary_loss_mlp": 0.01058123, "balance_loss_clip": 1.05994248, "balance_loss_mlp": 1.03432918, "epoch": 0.08387193747181723, "flos": 20629782835200.0, "grad_norm": 1.782466846937859, "language_loss": 0.82979721, "learning_rate": 3.969634289062719e-06, "loss": 0.85183442, "num_input_tokens_seen": 29683825, "step": 1395, "time_per_iteration": 2.883977174758911 }, { "auxiliary_loss_clip": 0.01187279, "auxiliary_loss_mlp": 0.00782865, "balance_loss_clip": 1.06065941, "balance_loss_mlp": 1.00028706, "epoch": 0.0839320607244852, "flos": 13443196199040.0, "grad_norm": 3.330409107955743, "language_loss": 0.82481396, "learning_rate": 3.969566643154293e-06, "loss": 0.84451544, "num_input_tokens_seen": 29698775, "step": 1396, "time_per_iteration": 2.6729378700256348 }, { "auxiliary_loss_clip": 0.0118605, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.06378388, "balance_loss_mlp": 1.03475475, "epoch": 0.08399218397715316, "flos": 23477247642240.0, "grad_norm": 1.780410555630689, "language_loss": 0.76843297, "learning_rate": 3.969498922559703e-06, "loss": 0.79089814, "num_input_tokens_seen": 29719430, "step": 1397, "time_per_iteration": 2.64888334274292 }, { "auxiliary_loss_clip": 0.01153742, "auxiliary_loss_mlp": 0.01050759, "balance_loss_clip": 1.05790138, "balance_loss_mlp": 1.02621412, "epoch": 0.08405230722982113, "flos": 25920717206400.0, "grad_norm": 2.1323769932413184, "language_loss": 0.77941638, "learning_rate": 3.969431127281516e-06, "loss": 0.8014614, "num_input_tokens_seen": 29739685, "step": 1398, "time_per_iteration": 2.8302125930786133 }, { "auxiliary_loss_clip": 0.01191086, "auxiliary_loss_mlp": 0.01052374, "balance_loss_clip": 1.05962944, "balance_loss_mlp": 1.02943766, "epoch": 0.0841124304824891, "flos": 17967437746560.0, "grad_norm": 2.150764713624159, "language_loss": 0.94635069, "learning_rate": 3.969363257322304e-06, "loss": 0.96878529, "num_input_tokens_seen": 29756165, "step": 1399, "time_per_iteration": 2.650517702102661 }, { "auxiliary_loss_clip": 0.01172403, "auxiliary_loss_mlp": 0.0106738, "balance_loss_clip": 1.0562712, "balance_loss_mlp": 1.04168999, "epoch": 0.08417255373515707, "flos": 25629661301760.0, "grad_norm": 3.6141849657848137, "language_loss": 0.81904209, "learning_rate": 3.96929531268464e-06, "loss": 0.8414399, "num_input_tokens_seen": 29776425, "step": 1400, "time_per_iteration": 2.777369260787964 }, { "auxiliary_loss_clip": 0.01170173, "auxiliary_loss_mlp": 0.01064292, "balance_loss_clip": 1.05968165, "balance_loss_mlp": 1.03957999, "epoch": 0.08423267698782504, "flos": 26249730808320.0, "grad_norm": 8.998651919840762, "language_loss": 0.8642807, "learning_rate": 3.969227293371099e-06, "loss": 0.88662529, "num_input_tokens_seen": 29796440, "step": 1401, "time_per_iteration": 2.91375732421875 }, { "auxiliary_loss_clip": 0.01196, "auxiliary_loss_mlp": 0.01066109, "balance_loss_clip": 1.05935979, "balance_loss_mlp": 1.04053831, "epoch": 0.08429280024049302, "flos": 20119707751680.0, "grad_norm": 2.9792515680869114, "language_loss": 0.87500131, "learning_rate": 3.969159199384263e-06, "loss": 0.89762247, "num_input_tokens_seen": 29814755, "step": 1402, "time_per_iteration": 2.7827296257019043 }, { "auxiliary_loss_clip": 0.01144907, "auxiliary_loss_mlp": 0.00781428, "balance_loss_clip": 1.05105817, "balance_loss_mlp": 1.00033188, "epoch": 0.08435292349316098, "flos": 42924526836480.0, "grad_norm": 2.1517994230241566, "language_loss": 0.8905524, "learning_rate": 3.9690910307267125e-06, "loss": 0.90981579, "num_input_tokens_seen": 29834785, "step": 1403, "time_per_iteration": 2.931666374206543 }, { "auxiliary_loss_clip": 0.01165276, "auxiliary_loss_mlp": 0.01061696, "balance_loss_clip": 1.05570936, "balance_loss_mlp": 1.03715038, "epoch": 0.08441304674582895, "flos": 22857285876480.0, "grad_norm": 1.790271378285476, "language_loss": 0.80321431, "learning_rate": 3.969022787401033e-06, "loss": 0.82548404, "num_input_tokens_seen": 29854695, "step": 1404, "time_per_iteration": 2.7397725582122803 }, { "auxiliary_loss_clip": 0.01181709, "auxiliary_loss_mlp": 0.01071408, "balance_loss_clip": 1.06211567, "balance_loss_mlp": 1.04649353, "epoch": 0.08447316999849692, "flos": 18697501676160.0, "grad_norm": 2.0849305916509193, "language_loss": 0.83557045, "learning_rate": 3.968954469409811e-06, "loss": 0.85810155, "num_input_tokens_seen": 29872180, "step": 1405, "time_per_iteration": 2.8052847385406494 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01058347, "balance_loss_clip": 1.05636072, "balance_loss_mlp": 1.03588748, "epoch": 0.08453329325116489, "flos": 25483971738240.0, "grad_norm": 1.5225846020503528, "language_loss": 0.7991904, "learning_rate": 3.968886076755639e-06, "loss": 0.82159847, "num_input_tokens_seen": 29893205, "step": 1406, "time_per_iteration": 4.301243305206299 }, { "auxiliary_loss_clip": 0.0117117, "auxiliary_loss_mlp": 0.01068275, "balance_loss_clip": 1.05790758, "balance_loss_mlp": 1.04406369, "epoch": 0.08459341650383286, "flos": 20920048640640.0, "grad_norm": 1.717770739318623, "language_loss": 0.79441547, "learning_rate": 3.96881760944111e-06, "loss": 0.81680995, "num_input_tokens_seen": 29911970, "step": 1407, "time_per_iteration": 2.6535613536834717 }, { "auxiliary_loss_clip": 0.01186501, "auxiliary_loss_mlp": 0.01057881, "balance_loss_clip": 1.05982685, "balance_loss_mlp": 1.03409886, "epoch": 0.08465353975650082, "flos": 13043079624960.0, "grad_norm": 2.191354041218588, "language_loss": 0.91799384, "learning_rate": 3.968749067468819e-06, "loss": 0.94043779, "num_input_tokens_seen": 29929925, "step": 1408, "time_per_iteration": 5.774486064910889 }, { "auxiliary_loss_clip": 0.01058217, "auxiliary_loss_mlp": 0.01015213, "balance_loss_clip": 1.0231359, "balance_loss_mlp": 1.01139832, "epoch": 0.0847136630091688, "flos": 60877422552960.0, "grad_norm": 0.9559717259642487, "language_loss": 0.61891782, "learning_rate": 3.968680450841368e-06, "loss": 0.63965201, "num_input_tokens_seen": 29985950, "step": 1409, "time_per_iteration": 4.9455225467681885 }, { "auxiliary_loss_clip": 0.01188186, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05840743, "balance_loss_mlp": 1.03878236, "epoch": 0.08477378626183676, "flos": 22046530043520.0, "grad_norm": 1.6980375913788566, "language_loss": 0.86357373, "learning_rate": 3.968611759561355e-06, "loss": 0.88607281, "num_input_tokens_seen": 30004330, "step": 1410, "time_per_iteration": 2.640355110168457 }, { "auxiliary_loss_clip": 0.01181512, "auxiliary_loss_mlp": 0.01053874, "balance_loss_clip": 1.0583061, "balance_loss_mlp": 1.02870846, "epoch": 0.08483390951450473, "flos": 16690059308160.0, "grad_norm": 2.248971712939306, "language_loss": 0.74384397, "learning_rate": 3.968542993631388e-06, "loss": 0.7661978, "num_input_tokens_seen": 30022555, "step": 1411, "time_per_iteration": 2.6200830936431885 }, { "auxiliary_loss_clip": 0.01077929, "auxiliary_loss_mlp": 0.01003535, "balance_loss_clip": 1.02317524, "balance_loss_mlp": 0.99991113, "epoch": 0.08489403276717271, "flos": 51584640082560.0, "grad_norm": 0.9014663966204861, "language_loss": 0.56748837, "learning_rate": 3.968474153054073e-06, "loss": 0.58830309, "num_input_tokens_seen": 30077220, "step": 1412, "time_per_iteration": 3.0746512413024902 }, { "auxiliary_loss_clip": 0.01156137, "auxiliary_loss_mlp": 0.01067795, "balance_loss_clip": 1.05325568, "balance_loss_mlp": 1.04265356, "epoch": 0.08495415601984067, "flos": 17092330698240.0, "grad_norm": 2.2757293876932945, "language_loss": 0.88754624, "learning_rate": 3.96840523783202e-06, "loss": 0.90978551, "num_input_tokens_seen": 30094600, "step": 1413, "time_per_iteration": 2.7309420108795166 }, { "auxiliary_loss_clip": 0.01164895, "auxiliary_loss_mlp": 0.01057479, "balance_loss_clip": 1.05780244, "balance_loss_mlp": 1.03295755, "epoch": 0.08501427927250864, "flos": 23148413608320.0, "grad_norm": 1.9781781646219805, "language_loss": 0.87963474, "learning_rate": 3.968336247967844e-06, "loss": 0.90185857, "num_input_tokens_seen": 30114475, "step": 1414, "time_per_iteration": 2.692030668258667 }, { "auxiliary_loss_clip": 0.01168145, "auxiliary_loss_mlp": 0.01063751, "balance_loss_clip": 1.05704033, "balance_loss_mlp": 1.04170966, "epoch": 0.08507440252517662, "flos": 19063467394560.0, "grad_norm": 1.9706021333256292, "language_loss": 0.77636635, "learning_rate": 3.96826718346416e-06, "loss": 0.79868531, "num_input_tokens_seen": 30133350, "step": 1415, "time_per_iteration": 2.8435540199279785 }, { "auxiliary_loss_clip": 0.01182108, "auxiliary_loss_mlp": 0.01059478, "balance_loss_clip": 1.0588963, "balance_loss_mlp": 1.03701878, "epoch": 0.08513452577784458, "flos": 60182296600320.0, "grad_norm": 1.7170282174092708, "language_loss": 0.70545506, "learning_rate": 3.968198044323587e-06, "loss": 0.72787094, "num_input_tokens_seen": 30159005, "step": 1416, "time_per_iteration": 3.021360158920288 }, { "auxiliary_loss_clip": 0.01174166, "auxiliary_loss_mlp": 0.01066487, "balance_loss_clip": 1.05930233, "balance_loss_mlp": 1.04131043, "epoch": 0.08519464903051255, "flos": 27308485117440.0, "grad_norm": 2.8159853289102053, "language_loss": 0.74938154, "learning_rate": 3.968128830548748e-06, "loss": 0.771788, "num_input_tokens_seen": 30179450, "step": 1417, "time_per_iteration": 2.738301992416382 }, { "auxiliary_loss_clip": 0.01171292, "auxiliary_loss_mlp": 0.01057092, "balance_loss_clip": 1.05715823, "balance_loss_mlp": 1.03313112, "epoch": 0.08525477228318051, "flos": 20266438809600.0, "grad_norm": 2.4132423968154635, "language_loss": 0.8258723, "learning_rate": 3.968059542142265e-06, "loss": 0.84815615, "num_input_tokens_seen": 30197235, "step": 1418, "time_per_iteration": 2.671574831008911 }, { "auxiliary_loss_clip": 0.0104499, "auxiliary_loss_mlp": 0.01004818, "balance_loss_clip": 1.02242994, "balance_loss_mlp": 1.0004549, "epoch": 0.08531489553584849, "flos": 67615017183360.0, "grad_norm": 0.8667411864001444, "language_loss": 0.56638753, "learning_rate": 3.9679901791067685e-06, "loss": 0.58688557, "num_input_tokens_seen": 30257410, "step": 1419, "time_per_iteration": 3.199730396270752 }, { "auxiliary_loss_clip": 0.01192231, "auxiliary_loss_mlp": 0.01067737, "balance_loss_clip": 1.05757999, "balance_loss_mlp": 1.04369283, "epoch": 0.08537501878851646, "flos": 27526965592320.0, "grad_norm": 2.2357492693560466, "language_loss": 0.70111859, "learning_rate": 3.967920741444886e-06, "loss": 0.72371829, "num_input_tokens_seen": 30277865, "step": 1420, "time_per_iteration": 2.7176027297973633 }, { "auxiliary_loss_clip": 0.01155207, "auxiliary_loss_mlp": 0.01050755, "balance_loss_clip": 1.05377483, "balance_loss_mlp": 1.02692556, "epoch": 0.08543514204118442, "flos": 22784243569920.0, "grad_norm": 1.5975069204011494, "language_loss": 0.88011539, "learning_rate": 3.967851229159252e-06, "loss": 0.90217495, "num_input_tokens_seen": 30298545, "step": 1421, "time_per_iteration": 2.7552106380462646 }, { "auxiliary_loss_clip": 0.01077473, "auxiliary_loss_mlp": 0.01013517, "balance_loss_clip": 1.02364218, "balance_loss_mlp": 1.01020324, "epoch": 0.0854952652938524, "flos": 60990721027200.0, "grad_norm": 0.9142209544576306, "language_loss": 0.63506877, "learning_rate": 3.967781642252502e-06, "loss": 0.65597868, "num_input_tokens_seen": 30361725, "step": 1422, "time_per_iteration": 3.134183168411255 }, { "auxiliary_loss_clip": 0.01152948, "auxiliary_loss_mlp": 0.01063847, "balance_loss_clip": 1.05932307, "balance_loss_mlp": 1.0406723, "epoch": 0.08555538854652037, "flos": 28038046256640.0, "grad_norm": 1.8757015124159093, "language_loss": 0.82691669, "learning_rate": 3.967711980727276e-06, "loss": 0.84908462, "num_input_tokens_seen": 30382180, "step": 1423, "time_per_iteration": 2.789393424987793 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01064169, "balance_loss_clip": 1.0526228, "balance_loss_mlp": 1.04089928, "epoch": 0.08561551179918833, "flos": 23509279595520.0, "grad_norm": 1.6593534429066656, "language_loss": 0.75424892, "learning_rate": 3.967642244586213e-06, "loss": 0.776425, "num_input_tokens_seen": 30402980, "step": 1424, "time_per_iteration": 2.7805826663970947 }, { "auxiliary_loss_clip": 0.01139579, "auxiliary_loss_mlp": 0.01060342, "balance_loss_clip": 1.05769765, "balance_loss_mlp": 1.03751373, "epoch": 0.08567563505185631, "flos": 17926930183680.0, "grad_norm": 1.7999307606718091, "language_loss": 0.75948423, "learning_rate": 3.96757243383196e-06, "loss": 0.78148341, "num_input_tokens_seen": 30420800, "step": 1425, "time_per_iteration": 2.677889823913574 }, { "auxiliary_loss_clip": 0.0118966, "auxiliary_loss_mlp": 0.01055231, "balance_loss_clip": 1.05982256, "balance_loss_mlp": 1.03230715, "epoch": 0.08573575830452428, "flos": 19719519350400.0, "grad_norm": 2.1792756220437743, "language_loss": 0.93362999, "learning_rate": 3.9675025484671624e-06, "loss": 0.95607889, "num_input_tokens_seen": 30439620, "step": 1426, "time_per_iteration": 2.6270906925201416 }, { "auxiliary_loss_clip": 0.01145994, "auxiliary_loss_mlp": 0.01066219, "balance_loss_clip": 1.05707717, "balance_loss_mlp": 1.0406251, "epoch": 0.08579588155719224, "flos": 17931563038080.0, "grad_norm": 2.3679064075186553, "language_loss": 0.75424731, "learning_rate": 3.967432588494471e-06, "loss": 0.77636945, "num_input_tokens_seen": 30457300, "step": 1427, "time_per_iteration": 2.84614634513855 }, { "auxiliary_loss_clip": 0.01190697, "auxiliary_loss_mlp": 0.01052992, "balance_loss_clip": 1.06006169, "balance_loss_mlp": 1.0305804, "epoch": 0.08585600480986022, "flos": 16033324993920.0, "grad_norm": 3.503048788198607, "language_loss": 0.82108849, "learning_rate": 3.96736255391654e-06, "loss": 0.84352541, "num_input_tokens_seen": 30471580, "step": 1428, "time_per_iteration": 2.5882396697998047 }, { "auxiliary_loss_clip": 0.01173688, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.05633736, "balance_loss_mlp": 1.03832793, "epoch": 0.08591612806252819, "flos": 28657433404800.0, "grad_norm": 2.088481658755078, "language_loss": 0.79929984, "learning_rate": 3.967292444736023e-06, "loss": 0.82166648, "num_input_tokens_seen": 30492720, "step": 1429, "time_per_iteration": 2.720500946044922 }, { "auxiliary_loss_clip": 0.01169119, "auxiliary_loss_mlp": 0.010606, "balance_loss_clip": 1.05971265, "balance_loss_mlp": 1.0379504, "epoch": 0.08597625131519615, "flos": 20959119659520.0, "grad_norm": 1.9029222975672677, "language_loss": 0.87716508, "learning_rate": 3.967222260955578e-06, "loss": 0.89946228, "num_input_tokens_seen": 30509535, "step": 1430, "time_per_iteration": 2.6914596557617188 }, { "auxiliary_loss_clip": 0.01144304, "auxiliary_loss_mlp": 0.01074633, "balance_loss_clip": 1.05802035, "balance_loss_mlp": 1.05125606, "epoch": 0.08603637456786412, "flos": 23256360956160.0, "grad_norm": 1.6366623508781384, "language_loss": 0.81859726, "learning_rate": 3.96715200257787e-06, "loss": 0.84078664, "num_input_tokens_seen": 30529490, "step": 1431, "time_per_iteration": 2.834402322769165 }, { "auxiliary_loss_clip": 0.01148362, "auxiliary_loss_mlp": 0.01054323, "balance_loss_clip": 1.05620182, "balance_loss_mlp": 1.03132737, "epoch": 0.0860964978205321, "flos": 28694170039680.0, "grad_norm": 1.5497375505717568, "language_loss": 0.78109461, "learning_rate": 3.967081669605559e-06, "loss": 0.80312145, "num_input_tokens_seen": 30550205, "step": 1432, "time_per_iteration": 2.767860174179077 }, { "auxiliary_loss_clip": 0.01167351, "auxiliary_loss_mlp": 0.0106333, "balance_loss_clip": 1.0540905, "balance_loss_mlp": 1.03914225, "epoch": 0.08615662107320006, "flos": 19318397195520.0, "grad_norm": 1.9631692713893694, "language_loss": 0.73365706, "learning_rate": 3.967011262041315e-06, "loss": 0.75596392, "num_input_tokens_seen": 30568830, "step": 1433, "time_per_iteration": 2.6930699348449707 }, { "auxiliary_loss_clip": 0.01150098, "auxiliary_loss_mlp": 0.00781967, "balance_loss_clip": 1.05335927, "balance_loss_mlp": 1.00044179, "epoch": 0.08621674432586802, "flos": 15851688894720.0, "grad_norm": 2.468588778716135, "language_loss": 0.85340321, "learning_rate": 3.9669407798878065e-06, "loss": 0.87272388, "num_input_tokens_seen": 30585730, "step": 1434, "time_per_iteration": 2.735690116882324 }, { "auxiliary_loss_clip": 0.01170363, "auxiliary_loss_mlp": 0.01057659, "balance_loss_clip": 1.05604434, "balance_loss_mlp": 1.0344249, "epoch": 0.086276867578536, "flos": 14100648785280.0, "grad_norm": 2.160640509122794, "language_loss": 0.7870298, "learning_rate": 3.966870223147707e-06, "loss": 0.80931008, "num_input_tokens_seen": 30603180, "step": 1435, "time_per_iteration": 2.776567220687866 }, { "auxiliary_loss_clip": 0.01047768, "auxiliary_loss_mlp": 0.01015597, "balance_loss_clip": 1.023893, "balance_loss_mlp": 1.01206815, "epoch": 0.08633699083120397, "flos": 70184857772160.0, "grad_norm": 0.8900716332014227, "language_loss": 0.57975936, "learning_rate": 3.96679959182369e-06, "loss": 0.60039294, "num_input_tokens_seen": 30668895, "step": 1436, "time_per_iteration": 3.344207763671875 }, { "auxiliary_loss_clip": 0.0117372, "auxiliary_loss_mlp": 0.01056829, "balance_loss_clip": 1.05617976, "balance_loss_mlp": 1.03153312, "epoch": 0.08639711408387193, "flos": 30298874140800.0, "grad_norm": 2.240343996649645, "language_loss": 0.69169062, "learning_rate": 3.966728885918437e-06, "loss": 0.71399617, "num_input_tokens_seen": 30688955, "step": 1437, "time_per_iteration": 2.7171547412872314 }, { "auxiliary_loss_clip": 0.01121044, "auxiliary_loss_mlp": 0.01055264, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.03223276, "epoch": 0.08645723733653991, "flos": 20297680663680.0, "grad_norm": 2.1340571114707245, "language_loss": 0.72624576, "learning_rate": 3.966658105434627e-06, "loss": 0.74800885, "num_input_tokens_seen": 30706095, "step": 1438, "time_per_iteration": 2.7815651893615723 }, { "auxiliary_loss_clip": 0.01179626, "auxiliary_loss_mlp": 0.01052578, "balance_loss_clip": 1.06052637, "balance_loss_mlp": 1.02872419, "epoch": 0.08651736058920788, "flos": 32890583134080.0, "grad_norm": 1.5339762166114281, "language_loss": 0.64377135, "learning_rate": 3.966587250374945e-06, "loss": 0.66609335, "num_input_tokens_seen": 30729025, "step": 1439, "time_per_iteration": 2.8935797214508057 }, { "auxiliary_loss_clip": 0.01153286, "auxiliary_loss_mlp": 0.01056452, "balance_loss_clip": 1.05530453, "balance_loss_mlp": 1.03213322, "epoch": 0.08657748384187584, "flos": 22637368857600.0, "grad_norm": 5.193932354158579, "language_loss": 0.87521696, "learning_rate": 3.966516320742077e-06, "loss": 0.89731431, "num_input_tokens_seen": 30746155, "step": 1440, "time_per_iteration": 2.731531858444214 }, { "auxiliary_loss_clip": 0.01155923, "auxiliary_loss_mlp": 0.00782787, "balance_loss_clip": 1.05752945, "balance_loss_mlp": 1.00043201, "epoch": 0.08663760709454381, "flos": 23658380951040.0, "grad_norm": 2.023462963415533, "language_loss": 0.83434939, "learning_rate": 3.9664453165387124e-06, "loss": 0.85373652, "num_input_tokens_seen": 30761410, "step": 1441, "time_per_iteration": 2.7126500606536865 }, { "auxiliary_loss_clip": 0.01074667, "auxiliary_loss_mlp": 0.01004602, "balance_loss_clip": 1.0222367, "balance_loss_mlp": 1.00100195, "epoch": 0.08669773034721179, "flos": 62686564911360.0, "grad_norm": 0.8541685426878655, "language_loss": 0.60479522, "learning_rate": 3.966374237767545e-06, "loss": 0.62558794, "num_input_tokens_seen": 30823010, "step": 1442, "time_per_iteration": 3.25555157661438 }, { "auxiliary_loss_clip": 0.0116729, "auxiliary_loss_mlp": 0.01054262, "balance_loss_clip": 1.05768681, "balance_loss_mlp": 1.03075421, "epoch": 0.08675785359987975, "flos": 20667489137280.0, "grad_norm": 2.8449103562639073, "language_loss": 0.79304373, "learning_rate": 3.96630308443127e-06, "loss": 0.81525922, "num_input_tokens_seen": 30841980, "step": 1443, "time_per_iteration": 2.7314631938934326 }, { "auxiliary_loss_clip": 0.01180858, "auxiliary_loss_mlp": 0.01051075, "balance_loss_clip": 1.05780149, "balance_loss_mlp": 1.02755547, "epoch": 0.08681797685254772, "flos": 26941118768640.0, "grad_norm": 1.6739262813835734, "language_loss": 0.82399666, "learning_rate": 3.966231856532584e-06, "loss": 0.84631598, "num_input_tokens_seen": 30863280, "step": 1444, "time_per_iteration": 2.7341418266296387 }, { "auxiliary_loss_clip": 0.01196759, "auxiliary_loss_mlp": 0.01051473, "balance_loss_clip": 1.06044626, "balance_loss_mlp": 1.02810788, "epoch": 0.0868781001052157, "flos": 17712831168000.0, "grad_norm": 2.3015461969915747, "language_loss": 0.87354827, "learning_rate": 3.966160554074189e-06, "loss": 0.8960306, "num_input_tokens_seen": 30881710, "step": 1445, "time_per_iteration": 4.25179386138916 }, { "auxiliary_loss_clip": 0.01180784, "auxiliary_loss_mlp": 0.01055896, "balance_loss_clip": 1.06094933, "balance_loss_mlp": 1.03446186, "epoch": 0.08693822335788366, "flos": 19896522595200.0, "grad_norm": 1.8066650797875201, "language_loss": 0.81863767, "learning_rate": 3.96608917705879e-06, "loss": 0.84100449, "num_input_tokens_seen": 30900225, "step": 1446, "time_per_iteration": 4.197181940078735 }, { "auxiliary_loss_clip": 0.01056056, "auxiliary_loss_mlp": 0.01004371, "balance_loss_clip": 1.01782191, "balance_loss_mlp": 1.00031781, "epoch": 0.08699834661055163, "flos": 67023747406080.0, "grad_norm": 0.7255245569613363, "language_loss": 0.54762936, "learning_rate": 3.966017725489091e-06, "loss": 0.56823361, "num_input_tokens_seen": 30959580, "step": 1447, "time_per_iteration": 3.2158126831054688 }, { "auxiliary_loss_clip": 0.0114861, "auxiliary_loss_mlp": 0.01056824, "balance_loss_clip": 1.05373001, "balance_loss_mlp": 1.03518772, "epoch": 0.0870584698632196, "flos": 13480507451520.0, "grad_norm": 2.1586118179593696, "language_loss": 0.84592307, "learning_rate": 3.965946199367804e-06, "loss": 0.86797738, "num_input_tokens_seen": 30976775, "step": 1448, "time_per_iteration": 4.262767314910889 }, { "auxiliary_loss_clip": 0.01194173, "auxiliary_loss_mlp": 0.01050219, "balance_loss_clip": 1.05891991, "balance_loss_mlp": 1.02768826, "epoch": 0.08711859311588757, "flos": 16107013745280.0, "grad_norm": 3.4326906921347096, "language_loss": 0.80644608, "learning_rate": 3.965874598697638e-06, "loss": 0.82888997, "num_input_tokens_seen": 30990495, "step": 1449, "time_per_iteration": 4.553676128387451 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01052142, "balance_loss_clip": 1.05437374, "balance_loss_mlp": 1.02946854, "epoch": 0.08717871636855554, "flos": 38472357928320.0, "grad_norm": 1.5251600336566102, "language_loss": 0.70971417, "learning_rate": 3.965802923481313e-06, "loss": 0.73162109, "num_input_tokens_seen": 31014080, "step": 1450, "time_per_iteration": 2.9082705974578857 }, { "auxiliary_loss_clip": 0.01124466, "auxiliary_loss_mlp": 0.01054883, "balance_loss_clip": 1.05164719, "balance_loss_mlp": 1.03207827, "epoch": 0.0872388396212235, "flos": 17600574188160.0, "grad_norm": 1.9392114767205617, "language_loss": 0.83684897, "learning_rate": 3.965731173721542e-06, "loss": 0.85864246, "num_input_tokens_seen": 31031210, "step": 1451, "time_per_iteration": 2.809880495071411 }, { "auxiliary_loss_clip": 0.01134251, "auxiliary_loss_mlp": 0.00780873, "balance_loss_clip": 1.05147851, "balance_loss_mlp": 1.00039482, "epoch": 0.08729896287389148, "flos": 25259385951360.0, "grad_norm": 2.5160845512367773, "language_loss": 0.74654591, "learning_rate": 3.965659349421049e-06, "loss": 0.76569718, "num_input_tokens_seen": 31049710, "step": 1452, "time_per_iteration": 2.88580060005188 }, { "auxiliary_loss_clip": 0.01157134, "auxiliary_loss_mlp": 0.01063328, "balance_loss_clip": 1.05607891, "balance_loss_mlp": 1.0388428, "epoch": 0.08735908612655945, "flos": 15632454234240.0, "grad_norm": 4.56941406999875, "language_loss": 0.80543101, "learning_rate": 3.965587450582556e-06, "loss": 0.82763565, "num_input_tokens_seen": 31066160, "step": 1453, "time_per_iteration": 2.733632802963257 }, { "auxiliary_loss_clip": 0.01169707, "auxiliary_loss_mlp": 0.01059533, "balance_loss_clip": 1.05905569, "balance_loss_mlp": 1.03625154, "epoch": 0.08741920937922741, "flos": 20339660684160.0, "grad_norm": 2.0102093196988102, "language_loss": 0.71041977, "learning_rate": 3.96551547720879e-06, "loss": 0.73271215, "num_input_tokens_seen": 31085270, "step": 1454, "time_per_iteration": 2.7568745613098145 }, { "auxiliary_loss_clip": 0.0106426, "auxiliary_loss_mlp": 0.01008112, "balance_loss_clip": 1.0215131, "balance_loss_mlp": 1.00463128, "epoch": 0.08747933263189539, "flos": 62819795433600.0, "grad_norm": 0.7713706503543015, "language_loss": 0.5859946, "learning_rate": 3.96544342930248e-06, "loss": 0.6067183, "num_input_tokens_seen": 31148445, "step": 1455, "time_per_iteration": 3.2372186183929443 }, { "auxiliary_loss_clip": 0.01189404, "auxiliary_loss_mlp": 0.01060742, "balance_loss_clip": 1.05742884, "balance_loss_mlp": 1.03688788, "epoch": 0.08753945588456336, "flos": 33035877648000.0, "grad_norm": 1.6485208275358016, "language_loss": 0.77564865, "learning_rate": 3.965371306866359e-06, "loss": 0.79815018, "num_input_tokens_seen": 31168770, "step": 1456, "time_per_iteration": 2.790663003921509 }, { "auxiliary_loss_clip": 0.01127959, "auxiliary_loss_mlp": 0.01054526, "balance_loss_clip": 1.04962158, "balance_loss_mlp": 1.03071976, "epoch": 0.08759957913723132, "flos": 35547182046720.0, "grad_norm": 1.83889407784057, "language_loss": 0.72420907, "learning_rate": 3.96529910990316e-06, "loss": 0.74603397, "num_input_tokens_seen": 31189270, "step": 1457, "time_per_iteration": 2.9099740982055664 }, { "auxiliary_loss_clip": 0.01176549, "auxiliary_loss_mlp": 0.0104866, "balance_loss_clip": 1.05627227, "balance_loss_mlp": 1.02633214, "epoch": 0.0876597023898993, "flos": 23911120022400.0, "grad_norm": 1.5250401870177361, "language_loss": 0.86412215, "learning_rate": 3.965226838415622e-06, "loss": 0.88637424, "num_input_tokens_seen": 31210385, "step": 1458, "time_per_iteration": 2.7517166137695312 }, { "auxiliary_loss_clip": 0.01169535, "auxiliary_loss_mlp": 0.01061413, "balance_loss_clip": 1.05884266, "balance_loss_mlp": 1.03825045, "epoch": 0.08771982564256726, "flos": 18114025150080.0, "grad_norm": 1.7412813512419094, "language_loss": 0.80268395, "learning_rate": 3.965154492406486e-06, "loss": 0.82499349, "num_input_tokens_seen": 31229745, "step": 1459, "time_per_iteration": 2.71455717086792 }, { "auxiliary_loss_clip": 0.01130491, "auxiliary_loss_mlp": 0.01054334, "balance_loss_clip": 1.05256546, "balance_loss_mlp": 1.03018188, "epoch": 0.08777994889523523, "flos": 17712005155200.0, "grad_norm": 2.1450339680450714, "language_loss": 0.84538847, "learning_rate": 3.9650820718784945e-06, "loss": 0.86723673, "num_input_tokens_seen": 31248280, "step": 1460, "time_per_iteration": 2.8737733364105225 }, { "auxiliary_loss_clip": 0.01177787, "auxiliary_loss_mlp": 0.01057974, "balance_loss_clip": 1.0572983, "balance_loss_mlp": 1.03640938, "epoch": 0.0878400721479032, "flos": 12819930382080.0, "grad_norm": 4.917361835698274, "language_loss": 0.79993135, "learning_rate": 3.965009576834394e-06, "loss": 0.82228899, "num_input_tokens_seen": 31262190, "step": 1461, "time_per_iteration": 2.8436062335968018 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.01058947, "balance_loss_clip": 1.05800629, "balance_loss_mlp": 1.03704822, "epoch": 0.08790019540057117, "flos": 26392690938240.0, "grad_norm": 1.566202508611165, "language_loss": 0.76571167, "learning_rate": 3.964937007276932e-06, "loss": 0.78799284, "num_input_tokens_seen": 31283690, "step": 1462, "time_per_iteration": 2.7895474433898926 }, { "auxiliary_loss_clip": 0.0117563, "auxiliary_loss_mlp": 0.01060064, "balance_loss_clip": 1.05839491, "balance_loss_mlp": 1.03580475, "epoch": 0.08796031865323914, "flos": 19134031662720.0, "grad_norm": 2.89717114041641, "language_loss": 0.74710488, "learning_rate": 3.9648643632088634e-06, "loss": 0.76946187, "num_input_tokens_seen": 31302505, "step": 1463, "time_per_iteration": 2.760404348373413 }, { "auxiliary_loss_clip": 0.01191543, "auxiliary_loss_mlp": 0.01061609, "balance_loss_clip": 1.06145048, "balance_loss_mlp": 1.03680158, "epoch": 0.0880204419059071, "flos": 26064287867520.0, "grad_norm": 2.431514195311041, "language_loss": 0.83797103, "learning_rate": 3.964791644632941e-06, "loss": 0.8605026, "num_input_tokens_seen": 31323070, "step": 1464, "time_per_iteration": 2.7417759895324707 }, { "auxiliary_loss_clip": 0.011733, "auxiliary_loss_mlp": 0.01063475, "balance_loss_clip": 1.05683231, "balance_loss_mlp": 1.04093289, "epoch": 0.08808056515857508, "flos": 22377842115840.0, "grad_norm": 2.1775753375634963, "language_loss": 0.78104752, "learning_rate": 3.964718851551923e-06, "loss": 0.8034153, "num_input_tokens_seen": 31341880, "step": 1465, "time_per_iteration": 2.6852309703826904 }, { "auxiliary_loss_clip": 0.01199489, "auxiliary_loss_mlp": 0.01059873, "balance_loss_clip": 1.0619812, "balance_loss_mlp": 1.03791499, "epoch": 0.08814068841124305, "flos": 23185293897600.0, "grad_norm": 2.412657222564686, "language_loss": 0.85187089, "learning_rate": 3.9646459839685675e-06, "loss": 0.87446451, "num_input_tokens_seen": 31361995, "step": 1466, "time_per_iteration": 2.706264019012451 }, { "auxiliary_loss_clip": 0.01120627, "auxiliary_loss_mlp": 0.00782645, "balance_loss_clip": 1.04989958, "balance_loss_mlp": 1.00037241, "epoch": 0.08820081166391101, "flos": 25155281358720.0, "grad_norm": 1.9900601596102498, "language_loss": 0.84168816, "learning_rate": 3.964573041885641e-06, "loss": 0.86072087, "num_input_tokens_seen": 31381515, "step": 1467, "time_per_iteration": 2.8636934757232666 }, { "auxiliary_loss_clip": 0.01178935, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.05910301, "balance_loss_mlp": 1.03219247, "epoch": 0.08826093491657899, "flos": 22231685675520.0, "grad_norm": 1.660218686828999, "language_loss": 0.75506544, "learning_rate": 3.964500025305907e-06, "loss": 0.77740854, "num_input_tokens_seen": 31400345, "step": 1468, "time_per_iteration": 2.661501884460449 }, { "auxiliary_loss_clip": 0.01181261, "auxiliary_loss_mlp": 0.01054252, "balance_loss_clip": 1.0629456, "balance_loss_mlp": 1.03266358, "epoch": 0.08832105816924696, "flos": 22126826897280.0, "grad_norm": 4.868504388441724, "language_loss": 0.80322379, "learning_rate": 3.9644269342321355e-06, "loss": 0.82557893, "num_input_tokens_seen": 31419620, "step": 1469, "time_per_iteration": 2.7473137378692627 }, { "auxiliary_loss_clip": 0.01198542, "auxiliary_loss_mlp": 0.01059353, "balance_loss_clip": 1.0627017, "balance_loss_mlp": 1.03677487, "epoch": 0.08838118142191492, "flos": 17566495159680.0, "grad_norm": 2.0179242193855806, "language_loss": 0.77437651, "learning_rate": 3.9643537686670974e-06, "loss": 0.79695547, "num_input_tokens_seen": 31437970, "step": 1470, "time_per_iteration": 2.7672410011291504 }, { "auxiliary_loss_clip": 0.01193825, "auxiliary_loss_mlp": 0.01067102, "balance_loss_clip": 1.06180143, "balance_loss_mlp": 1.04281926, "epoch": 0.0884413046745829, "flos": 20777196251520.0, "grad_norm": 1.6812425162011504, "language_loss": 0.84297001, "learning_rate": 3.964280528613569e-06, "loss": 0.86557925, "num_input_tokens_seen": 31457040, "step": 1471, "time_per_iteration": 2.7584216594696045 }, { "auxiliary_loss_clip": 0.01156315, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.05682266, "balance_loss_mlp": 1.03342199, "epoch": 0.08850142792725087, "flos": 22125462180480.0, "grad_norm": 1.6938350729430058, "language_loss": 0.83321345, "learning_rate": 3.964207214074324e-06, "loss": 0.85531968, "num_input_tokens_seen": 31477520, "step": 1472, "time_per_iteration": 2.7895469665527344 }, { "auxiliary_loss_clip": 0.01176151, "auxiliary_loss_mlp": 0.01058616, "balance_loss_clip": 1.06106544, "balance_loss_mlp": 1.03529835, "epoch": 0.08856155117991883, "flos": 22418744728320.0, "grad_norm": 2.3638705809965, "language_loss": 0.82781172, "learning_rate": 3.964133825052146e-06, "loss": 0.85015941, "num_input_tokens_seen": 31495575, "step": 1473, "time_per_iteration": 2.7361483573913574 }, { "auxiliary_loss_clip": 0.01129906, "auxiliary_loss_mlp": 0.01064148, "balance_loss_clip": 1.05552769, "balance_loss_mlp": 1.04263091, "epoch": 0.0886216744325868, "flos": 29937002572800.0, "grad_norm": 1.6022277785896435, "language_loss": 0.78712153, "learning_rate": 3.964060361549816e-06, "loss": 0.80906206, "num_input_tokens_seen": 31520020, "step": 1474, "time_per_iteration": 2.894319534301758 }, { "auxiliary_loss_clip": 0.01146238, "auxiliary_loss_mlp": 0.01068131, "balance_loss_clip": 1.05575764, "balance_loss_mlp": 1.04175043, "epoch": 0.08868179768525478, "flos": 23982833525760.0, "grad_norm": 1.6120869011213488, "language_loss": 0.79030406, "learning_rate": 3.963986823570121e-06, "loss": 0.81244779, "num_input_tokens_seen": 31539265, "step": 1475, "time_per_iteration": 2.8806042671203613 }, { "auxiliary_loss_clip": 0.01191986, "auxiliary_loss_mlp": 0.01047451, "balance_loss_clip": 1.05980015, "balance_loss_mlp": 1.02478909, "epoch": 0.08874192093792274, "flos": 43177553216640.0, "grad_norm": 1.4679464237421194, "language_loss": 0.74202317, "learning_rate": 3.963913211115848e-06, "loss": 0.76441753, "num_input_tokens_seen": 31563425, "step": 1476, "time_per_iteration": 2.8381049633026123 }, { "auxiliary_loss_clip": 0.01174628, "auxiliary_loss_mlp": 0.01059934, "balance_loss_clip": 1.06217527, "balance_loss_mlp": 1.03678358, "epoch": 0.0888020441905907, "flos": 32852445868800.0, "grad_norm": 1.712954575149443, "language_loss": 0.74220836, "learning_rate": 3.9638395241897895e-06, "loss": 0.76455402, "num_input_tokens_seen": 31584525, "step": 1477, "time_per_iteration": 2.8452210426330566 }, { "auxiliary_loss_clip": 0.01191865, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.06062829, "balance_loss_mlp": 1.0278163, "epoch": 0.08886216744325869, "flos": 23149347361920.0, "grad_norm": 1.95844459768748, "language_loss": 0.87194049, "learning_rate": 3.963765762794739e-06, "loss": 0.89437139, "num_input_tokens_seen": 31603325, "step": 1478, "time_per_iteration": 2.644918203353882 }, { "auxiliary_loss_clip": 0.01176299, "auxiliary_loss_mlp": 0.01058069, "balance_loss_clip": 1.0572443, "balance_loss_mlp": 1.03546739, "epoch": 0.08892229069592665, "flos": 23331593992320.0, "grad_norm": 1.6306868156426517, "language_loss": 0.77571511, "learning_rate": 3.963691926933495e-06, "loss": 0.79805881, "num_input_tokens_seen": 31624820, "step": 1479, "time_per_iteration": 2.738168954849243 }, { "auxiliary_loss_clip": 0.01164179, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05629039, "balance_loss_mlp": 1.02801871, "epoch": 0.08898241394859462, "flos": 26213784272640.0, "grad_norm": 2.199164032289915, "language_loss": 0.77797234, "learning_rate": 3.9636180166088555e-06, "loss": 0.80014014, "num_input_tokens_seen": 31646080, "step": 1480, "time_per_iteration": 2.837562322616577 }, { "auxiliary_loss_clip": 0.01180168, "auxiliary_loss_mlp": 0.01060894, "balance_loss_clip": 1.05762577, "balance_loss_mlp": 1.03656292, "epoch": 0.0890425372012626, "flos": 23550613171200.0, "grad_norm": 2.9471668635954273, "language_loss": 0.66437578, "learning_rate": 3.963544031823624e-06, "loss": 0.68678641, "num_input_tokens_seen": 31665770, "step": 1481, "time_per_iteration": 2.742422580718994 }, { "auxiliary_loss_clip": 0.01143445, "auxiliary_loss_mlp": 0.01055318, "balance_loss_clip": 1.05510306, "balance_loss_mlp": 1.03273988, "epoch": 0.08910266045393056, "flos": 23002795872000.0, "grad_norm": 2.124586862599894, "language_loss": 0.96630967, "learning_rate": 3.9634699725806065e-06, "loss": 0.9882974, "num_input_tokens_seen": 31683805, "step": 1482, "time_per_iteration": 2.8150243759155273 }, { "auxiliary_loss_clip": 0.0115336, "auxiliary_loss_mlp": 0.01057266, "balance_loss_clip": 1.05521989, "balance_loss_mlp": 1.03353167, "epoch": 0.08916278370659853, "flos": 31936508035200.0, "grad_norm": 1.7904792435575492, "language_loss": 0.78683239, "learning_rate": 3.96339583888261e-06, "loss": 0.80893862, "num_input_tokens_seen": 31704630, "step": 1483, "time_per_iteration": 2.869084119796753 }, { "auxiliary_loss_clip": 0.0116904, "auxiliary_loss_mlp": 0.01082082, "balance_loss_clip": 1.05540919, "balance_loss_mlp": 1.05829978, "epoch": 0.08922290695926649, "flos": 17530404969600.0, "grad_norm": 2.2229749189835677, "language_loss": 0.85424453, "learning_rate": 3.963321630732448e-06, "loss": 0.87675571, "num_input_tokens_seen": 31723255, "step": 1484, "time_per_iteration": 4.280332326889038 }, { "auxiliary_loss_clip": 0.01199312, "auxiliary_loss_mlp": 0.01060639, "balance_loss_clip": 1.06350458, "balance_loss_mlp": 1.03701186, "epoch": 0.08928303021193447, "flos": 32125075459200.0, "grad_norm": 1.7208139316694195, "language_loss": 0.80205405, "learning_rate": 3.963247348132932e-06, "loss": 0.82465357, "num_input_tokens_seen": 31747045, "step": 1485, "time_per_iteration": 2.761733055114746 }, { "auxiliary_loss_clip": 0.01173167, "auxiliary_loss_mlp": 0.01056554, "balance_loss_clip": 1.0563333, "balance_loss_mlp": 1.03228331, "epoch": 0.08934315346460243, "flos": 22125210785280.0, "grad_norm": 1.8969438127775513, "language_loss": 0.82859123, "learning_rate": 3.96317299108688e-06, "loss": 0.85088843, "num_input_tokens_seen": 31766615, "step": 1486, "time_per_iteration": 4.144649028778076 }, { "auxiliary_loss_clip": 0.01144509, "auxiliary_loss_mlp": 0.01063805, "balance_loss_clip": 1.05592823, "balance_loss_mlp": 1.04021382, "epoch": 0.0894032767172704, "flos": 22565583527040.0, "grad_norm": 2.1520807598980185, "language_loss": 0.76365155, "learning_rate": 3.963098559597111e-06, "loss": 0.78573477, "num_input_tokens_seen": 31785855, "step": 1487, "time_per_iteration": 4.432489395141602 }, { "auxiliary_loss_clip": 0.01157327, "auxiliary_loss_mlp": 0.01060261, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.03542995, "epoch": 0.08946339996993838, "flos": 20193396503040.0, "grad_norm": 3.851280697857004, "language_loss": 0.83030224, "learning_rate": 3.963024053666449e-06, "loss": 0.85247803, "num_input_tokens_seen": 31804210, "step": 1488, "time_per_iteration": 2.7262001037597656 }, { "auxiliary_loss_clip": 0.01171869, "auxiliary_loss_mlp": 0.01051875, "balance_loss_clip": 1.05546355, "balance_loss_mlp": 1.02916527, "epoch": 0.08952352322260634, "flos": 48360181104000.0, "grad_norm": 1.7759111472560039, "language_loss": 0.71783459, "learning_rate": 3.962949473297718e-06, "loss": 0.74007201, "num_input_tokens_seen": 31826150, "step": 1489, "time_per_iteration": 4.562536954879761 }, { "auxiliary_loss_clip": 0.01150585, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.05190349, "balance_loss_mlp": 1.02830291, "epoch": 0.08958364647527431, "flos": 31793081028480.0, "grad_norm": 1.6999724957706692, "language_loss": 0.89717221, "learning_rate": 3.962874818493745e-06, "loss": 0.91919196, "num_input_tokens_seen": 31848060, "step": 1490, "time_per_iteration": 2.838327646255493 }, { "auxiliary_loss_clip": 0.01184278, "auxiliary_loss_mlp": 0.01064168, "balance_loss_clip": 1.05656135, "balance_loss_mlp": 1.04102957, "epoch": 0.08964376972794229, "flos": 23368186972800.0, "grad_norm": 3.9062133325383126, "language_loss": 0.73075998, "learning_rate": 3.9628000892573635e-06, "loss": 0.7532444, "num_input_tokens_seen": 31870040, "step": 1491, "time_per_iteration": 2.7007367610931396 }, { "auxiliary_loss_clip": 0.01189564, "auxiliary_loss_mlp": 0.00780167, "balance_loss_clip": 1.05968356, "balance_loss_mlp": 1.00023544, "epoch": 0.08970389298061025, "flos": 23294785530240.0, "grad_norm": 1.7021050418948058, "language_loss": 0.77235049, "learning_rate": 3.9627252855914055e-06, "loss": 0.79204774, "num_input_tokens_seen": 31890400, "step": 1492, "time_per_iteration": 2.7799623012542725 }, { "auxiliary_loss_clip": 0.01187114, "auxiliary_loss_mlp": 0.01057952, "balance_loss_clip": 1.05902028, "balance_loss_mlp": 1.03512359, "epoch": 0.08976401623327822, "flos": 33761703772800.0, "grad_norm": 1.9236790530591625, "language_loss": 0.71429193, "learning_rate": 3.962650407498707e-06, "loss": 0.73674262, "num_input_tokens_seen": 31913435, "step": 1493, "time_per_iteration": 2.8479840755462646 }, { "auxiliary_loss_clip": 0.01188796, "auxiliary_loss_mlp": 0.01057103, "balance_loss_clip": 1.05757976, "balance_loss_mlp": 1.03371406, "epoch": 0.08982413948594618, "flos": 23911335504000.0, "grad_norm": 2.6977604073852053, "language_loss": 0.87175488, "learning_rate": 3.962575454982109e-06, "loss": 0.8942138, "num_input_tokens_seen": 31932435, "step": 1494, "time_per_iteration": 2.855658769607544 }, { "auxiliary_loss_clip": 0.0108466, "auxiliary_loss_mlp": 0.01070478, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.04551601, "epoch": 0.08988426273861416, "flos": 16837544551680.0, "grad_norm": 1.6162523894431247, "language_loss": 0.82929438, "learning_rate": 3.962500428044454e-06, "loss": 0.85084569, "num_input_tokens_seen": 31950125, "step": 1495, "time_per_iteration": 2.9265449047088623 }, { "auxiliary_loss_clip": 0.01171464, "auxiliary_loss_mlp": 0.01059756, "balance_loss_clip": 1.05779243, "balance_loss_mlp": 1.03682017, "epoch": 0.08994438599128213, "flos": 14793365548800.0, "grad_norm": 9.387255385257733, "language_loss": 0.70191383, "learning_rate": 3.962425326688585e-06, "loss": 0.72422606, "num_input_tokens_seen": 31968050, "step": 1496, "time_per_iteration": 2.773693799972534 }, { "auxiliary_loss_clip": 0.01164171, "auxiliary_loss_mlp": 0.01049454, "balance_loss_clip": 1.05397439, "balance_loss_mlp": 1.02888989, "epoch": 0.09000450924395009, "flos": 17384320356480.0, "grad_norm": 1.6327835891742186, "language_loss": 0.79752576, "learning_rate": 3.962350150917351e-06, "loss": 0.81966203, "num_input_tokens_seen": 31985675, "step": 1497, "time_per_iteration": 2.6850852966308594 }, { "auxiliary_loss_clip": 0.01129609, "auxiliary_loss_mlp": 0.01054903, "balance_loss_clip": 1.05307686, "balance_loss_mlp": 1.03146648, "epoch": 0.09006463249661807, "flos": 24280317964800.0, "grad_norm": 8.517000212139891, "language_loss": 0.82940567, "learning_rate": 3.9622749007336035e-06, "loss": 0.85125089, "num_input_tokens_seen": 32005180, "step": 1498, "time_per_iteration": 2.786205768585205 }, { "auxiliary_loss_clip": 0.01170006, "auxiliary_loss_mlp": 0.01059397, "balance_loss_clip": 1.0577898, "balance_loss_mlp": 1.03718853, "epoch": 0.09012475574928604, "flos": 13661928069120.0, "grad_norm": 2.220597323082783, "language_loss": 0.78609937, "learning_rate": 3.962199576140195e-06, "loss": 0.80839342, "num_input_tokens_seen": 32022970, "step": 1499, "time_per_iteration": 2.71785831451416 }, { "auxiliary_loss_clip": 0.01161539, "auxiliary_loss_mlp": 0.00780528, "balance_loss_clip": 1.05444527, "balance_loss_mlp": 1.00024021, "epoch": 0.090184879001954, "flos": 23327751237120.0, "grad_norm": 2.049001350461653, "language_loss": 0.93337607, "learning_rate": 3.962124177139981e-06, "loss": 0.95279682, "num_input_tokens_seen": 32043055, "step": 1500, "time_per_iteration": 2.7077536582946777 }, { "auxiliary_loss_clip": 0.01148009, "auxiliary_loss_mlp": 0.01055246, "balance_loss_clip": 1.05371249, "balance_loss_mlp": 1.0308435, "epoch": 0.09024500225462198, "flos": 23002688131200.0, "grad_norm": 3.0778515668575492, "language_loss": 0.74595469, "learning_rate": 3.962048703735822e-06, "loss": 0.76798725, "num_input_tokens_seen": 32061900, "step": 1501, "time_per_iteration": 2.7073416709899902 }, { "auxiliary_loss_clip": 0.01056535, "auxiliary_loss_mlp": 0.01013118, "balance_loss_clip": 1.03392363, "balance_loss_mlp": 1.00963676, "epoch": 0.09030512550728995, "flos": 62189203242240.0, "grad_norm": 0.7274487593473578, "language_loss": 0.58316052, "learning_rate": 3.96197315593058e-06, "loss": 0.60385704, "num_input_tokens_seen": 32122745, "step": 1502, "time_per_iteration": 3.274049997329712 }, { "auxiliary_loss_clip": 0.0114469, "auxiliary_loss_mlp": 0.01062533, "balance_loss_clip": 1.04626393, "balance_loss_mlp": 1.03896546, "epoch": 0.09036524875995791, "flos": 38800689171840.0, "grad_norm": 2.1727281711500095, "language_loss": 0.69501173, "learning_rate": 3.961897533727119e-06, "loss": 0.71708393, "num_input_tokens_seen": 32145125, "step": 1503, "time_per_iteration": 2.87554669380188 }, { "auxiliary_loss_clip": 0.01133108, "auxiliary_loss_mlp": 0.0105903, "balance_loss_clip": 1.04783726, "balance_loss_mlp": 1.03660655, "epoch": 0.09042537201262588, "flos": 21690081429120.0, "grad_norm": 2.169205134580129, "language_loss": 0.86124271, "learning_rate": 3.961821837128306e-06, "loss": 0.88316405, "num_input_tokens_seen": 32166255, "step": 1504, "time_per_iteration": 2.844688892364502 }, { "auxiliary_loss_clip": 0.01146301, "auxiliary_loss_mlp": 0.01069714, "balance_loss_clip": 1.05341232, "balance_loss_mlp": 1.04261804, "epoch": 0.09048549526529386, "flos": 22267021680000.0, "grad_norm": 2.178155372989796, "language_loss": 0.7233696, "learning_rate": 3.961746066137014e-06, "loss": 0.74552977, "num_input_tokens_seen": 32184010, "step": 1505, "time_per_iteration": 2.7992677688598633 }, { "auxiliary_loss_clip": 0.01137399, "auxiliary_loss_mlp": 0.01056414, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.03302479, "epoch": 0.09054561851796182, "flos": 14610939350400.0, "grad_norm": 2.5107188210784526, "language_loss": 0.80730999, "learning_rate": 3.961670220756114e-06, "loss": 0.82924813, "num_input_tokens_seen": 32201635, "step": 1506, "time_per_iteration": 2.7458760738372803 }, { "auxiliary_loss_clip": 0.01140643, "auxiliary_loss_mlp": 0.01053315, "balance_loss_clip": 1.05161858, "balance_loss_mlp": 1.03197718, "epoch": 0.09060574177062979, "flos": 27636169916160.0, "grad_norm": 2.166956120197676, "language_loss": 0.75915337, "learning_rate": 3.961594300988482e-06, "loss": 0.78109294, "num_input_tokens_seen": 32221940, "step": 1507, "time_per_iteration": 2.873826742172241 }, { "auxiliary_loss_clip": 0.01051873, "auxiliary_loss_mlp": 0.01005715, "balance_loss_clip": 1.02043629, "balance_loss_mlp": 1.00175714, "epoch": 0.09066586502329776, "flos": 66085797513600.0, "grad_norm": 0.7272435825555993, "language_loss": 0.57699698, "learning_rate": 3.961518306836998e-06, "loss": 0.59757286, "num_input_tokens_seen": 32276495, "step": 1508, "time_per_iteration": 3.064926862716675 }, { "auxiliary_loss_clip": 0.01165416, "auxiliary_loss_mlp": 0.01054804, "balance_loss_clip": 1.055233, "balance_loss_mlp": 1.03155804, "epoch": 0.09072598827596573, "flos": 18916449027840.0, "grad_norm": 1.7601330807914457, "language_loss": 0.85090744, "learning_rate": 3.961442238304543e-06, "loss": 0.87310958, "num_input_tokens_seen": 32294130, "step": 1509, "time_per_iteration": 2.6664113998413086 }, { "auxiliary_loss_clip": 0.01168837, "auxiliary_loss_mlp": 0.01064138, "balance_loss_clip": 1.05745769, "balance_loss_mlp": 1.03949761, "epoch": 0.0907861115286337, "flos": 24821742643200.0, "grad_norm": 2.3794507710009203, "language_loss": 0.84110659, "learning_rate": 3.961366095394002e-06, "loss": 0.8634364, "num_input_tokens_seen": 32313555, "step": 1510, "time_per_iteration": 2.783484697341919 }, { "auxiliary_loss_clip": 0.01153141, "auxiliary_loss_mlp": 0.01058569, "balance_loss_clip": 1.05423617, "balance_loss_mlp": 1.03482211, "epoch": 0.09084623478130167, "flos": 21652842003840.0, "grad_norm": 1.8490761573484715, "language_loss": 0.85247588, "learning_rate": 3.961289878108262e-06, "loss": 0.87459302, "num_input_tokens_seen": 32331430, "step": 1511, "time_per_iteration": 2.714620351791382 }, { "auxiliary_loss_clip": 0.01145395, "auxiliary_loss_mlp": 0.01052919, "balance_loss_clip": 1.05182219, "balance_loss_mlp": 1.02983987, "epoch": 0.09090635803396964, "flos": 27639258485760.0, "grad_norm": 1.5734326837562458, "language_loss": 0.84977764, "learning_rate": 3.9612135864502135e-06, "loss": 0.87176073, "num_input_tokens_seen": 32353705, "step": 1512, "time_per_iteration": 2.75361704826355 }, { "auxiliary_loss_clip": 0.01155239, "auxiliary_loss_mlp": 0.01053669, "balance_loss_clip": 1.05740952, "balance_loss_mlp": 1.03185391, "epoch": 0.0909664812866376, "flos": 17669127294720.0, "grad_norm": 3.0235926431973654, "language_loss": 0.87346804, "learning_rate": 3.961137220422749e-06, "loss": 0.89555705, "num_input_tokens_seen": 32370520, "step": 1513, "time_per_iteration": 2.6864211559295654 }, { "auxiliary_loss_clip": 0.01168585, "auxiliary_loss_mlp": 0.01049408, "balance_loss_clip": 1.05562937, "balance_loss_mlp": 1.02841544, "epoch": 0.09102660453930557, "flos": 23951448017280.0, "grad_norm": 1.7883280971870592, "language_loss": 0.86802679, "learning_rate": 3.961060780028764e-06, "loss": 0.89020675, "num_input_tokens_seen": 32389105, "step": 1514, "time_per_iteration": 2.6788065433502197 }, { "auxiliary_loss_clip": 0.01134005, "auxiliary_loss_mlp": 0.01064386, "balance_loss_clip": 1.05571628, "balance_loss_mlp": 1.04252315, "epoch": 0.09108672779197355, "flos": 25812949426560.0, "grad_norm": 1.7666120550996132, "language_loss": 0.89944756, "learning_rate": 3.960984265271159e-06, "loss": 0.92143154, "num_input_tokens_seen": 32408065, "step": 1515, "time_per_iteration": 2.757390022277832 }, { "auxiliary_loss_clip": 0.01162518, "auxiliary_loss_mlp": 0.01056937, "balance_loss_clip": 1.05547726, "balance_loss_mlp": 1.03360808, "epoch": 0.09114685104464151, "flos": 29639482220160.0, "grad_norm": 2.1090985009837646, "language_loss": 0.85576892, "learning_rate": 3.9609076761528335e-06, "loss": 0.87796342, "num_input_tokens_seen": 32427225, "step": 1516, "time_per_iteration": 2.704784870147705 }, { "auxiliary_loss_clip": 0.01158781, "auxiliary_loss_mlp": 0.01057165, "balance_loss_clip": 1.05135357, "balance_loss_mlp": 1.03451526, "epoch": 0.09120697429730948, "flos": 33729635905920.0, "grad_norm": 2.086405156201108, "language_loss": 0.81167233, "learning_rate": 3.960831012676692e-06, "loss": 0.83383185, "num_input_tokens_seen": 32450510, "step": 1517, "time_per_iteration": 2.8586854934692383 }, { "auxiliary_loss_clip": 0.0117857, "auxiliary_loss_mlp": 0.01065492, "balance_loss_clip": 1.05741739, "balance_loss_mlp": 1.04280686, "epoch": 0.09126709754997746, "flos": 18401381953920.0, "grad_norm": 2.104468567304263, "language_loss": 0.78067243, "learning_rate": 3.960754274845642e-06, "loss": 0.80311304, "num_input_tokens_seen": 32468425, "step": 1518, "time_per_iteration": 2.7862088680267334 }, { "auxiliary_loss_clip": 0.01165395, "auxiliary_loss_mlp": 0.01061371, "balance_loss_clip": 1.05285823, "balance_loss_mlp": 1.03900695, "epoch": 0.09132722080264542, "flos": 22091957769600.0, "grad_norm": 1.6816479812467473, "language_loss": 0.86124098, "learning_rate": 3.960677462662594e-06, "loss": 0.88350856, "num_input_tokens_seen": 32487510, "step": 1519, "time_per_iteration": 2.723714828491211 }, { "auxiliary_loss_clip": 0.01163599, "auxiliary_loss_mlp": 0.01052792, "balance_loss_clip": 1.05454183, "balance_loss_mlp": 1.02914131, "epoch": 0.09138734405531339, "flos": 21033131633280.0, "grad_norm": 1.9681293960876167, "language_loss": 0.73279071, "learning_rate": 3.96060057613046e-06, "loss": 0.75495458, "num_input_tokens_seen": 32507250, "step": 1520, "time_per_iteration": 2.8098628520965576 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.01058035, "balance_loss_clip": 1.05696058, "balance_loss_mlp": 1.03469419, "epoch": 0.09144746730798137, "flos": 20083940784000.0, "grad_norm": 2.6988457876937066, "language_loss": 0.85236609, "learning_rate": 3.960523615252156e-06, "loss": 0.87455815, "num_input_tokens_seen": 32526045, "step": 1521, "time_per_iteration": 2.7134172916412354 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01063979, "balance_loss_clip": 1.05189717, "balance_loss_mlp": 1.03991079, "epoch": 0.09150759056064933, "flos": 22778210085120.0, "grad_norm": 1.6991603177293335, "language_loss": 0.83933008, "learning_rate": 3.960446580030599e-06, "loss": 0.8611716, "num_input_tokens_seen": 32546575, "step": 1522, "time_per_iteration": 2.93745493888855 }, { "auxiliary_loss_clip": 0.01182362, "auxiliary_loss_mlp": 0.01064589, "balance_loss_clip": 1.05630755, "balance_loss_mlp": 1.04153395, "epoch": 0.0915677138133173, "flos": 27564205017600.0, "grad_norm": 1.647915064434875, "language_loss": 0.81012994, "learning_rate": 3.960369470468711e-06, "loss": 0.8325994, "num_input_tokens_seen": 32568795, "step": 1523, "time_per_iteration": 4.378152847290039 }, { "auxiliary_loss_clip": 0.01157976, "auxiliary_loss_mlp": 0.00781395, "balance_loss_clip": 1.05422449, "balance_loss_mlp": 1.00037968, "epoch": 0.09162783706598528, "flos": 17674765729920.0, "grad_norm": 2.106497620262502, "language_loss": 0.7460072, "learning_rate": 3.960292286569418e-06, "loss": 0.76540089, "num_input_tokens_seen": 32587010, "step": 1524, "time_per_iteration": 2.7146124839782715 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.0106119, "balance_loss_clip": 1.05092478, "balance_loss_mlp": 1.03782487, "epoch": 0.09168796031865324, "flos": 18478195188480.0, "grad_norm": 2.0992608845945413, "language_loss": 0.86498803, "learning_rate": 3.960215028335644e-06, "loss": 0.88695222, "num_input_tokens_seen": 32602375, "step": 1525, "time_per_iteration": 4.314826965332031 }, { "auxiliary_loss_clip": 0.01164396, "auxiliary_loss_mlp": 0.01049506, "balance_loss_clip": 1.05688822, "balance_loss_mlp": 1.0263319, "epoch": 0.0917480835713212, "flos": 29387605075200.0, "grad_norm": 2.1146348399758237, "language_loss": 0.74512708, "learning_rate": 3.96013769577032e-06, "loss": 0.76726609, "num_input_tokens_seen": 32621460, "step": 1526, "time_per_iteration": 5.878855466842651 }, { "auxiliary_loss_clip": 0.01186002, "auxiliary_loss_mlp": 0.01055817, "balance_loss_clip": 1.05732703, "balance_loss_mlp": 1.03392982, "epoch": 0.09180820682398917, "flos": 19829262378240.0, "grad_norm": 2.5135282962071215, "language_loss": 0.77581728, "learning_rate": 3.960060288876378e-06, "loss": 0.79823542, "num_input_tokens_seen": 32640440, "step": 1527, "time_per_iteration": 2.693847179412842 }, { "auxiliary_loss_clip": 0.01173605, "auxiliary_loss_mlp": 0.01052264, "balance_loss_clip": 1.0534333, "balance_loss_mlp": 1.02868414, "epoch": 0.09186833007665715, "flos": 23841848643840.0, "grad_norm": 2.655631139677705, "language_loss": 0.78546697, "learning_rate": 3.959982807656753e-06, "loss": 0.80772561, "num_input_tokens_seen": 32660020, "step": 1528, "time_per_iteration": 2.774219512939453 }, { "auxiliary_loss_clip": 0.01146017, "auxiliary_loss_mlp": 0.01050376, "balance_loss_clip": 1.0499053, "balance_loss_mlp": 1.02827477, "epoch": 0.09192845332932512, "flos": 12932726065920.0, "grad_norm": 2.682547324044482, "language_loss": 0.76732361, "learning_rate": 3.959905252114384e-06, "loss": 0.78928751, "num_input_tokens_seen": 32678170, "step": 1529, "time_per_iteration": 4.603156089782715 }, { "auxiliary_loss_clip": 0.01186538, "auxiliary_loss_mlp": 0.00780856, "balance_loss_clip": 1.05415928, "balance_loss_mlp": 1.00045025, "epoch": 0.09198857658199308, "flos": 24568177559040.0, "grad_norm": 1.7410660090049153, "language_loss": 0.82906747, "learning_rate": 3.959827622252211e-06, "loss": 0.84874141, "num_input_tokens_seen": 32697540, "step": 1530, "time_per_iteration": 2.7118582725524902 }, { "auxiliary_loss_clip": 0.01130108, "auxiliary_loss_mlp": 0.0106509, "balance_loss_clip": 1.04975331, "balance_loss_mlp": 1.04220152, "epoch": 0.09204869983466106, "flos": 20266941600000.0, "grad_norm": 2.182960664479704, "language_loss": 0.84001881, "learning_rate": 3.959749918073179e-06, "loss": 0.86197078, "num_input_tokens_seen": 32716805, "step": 1531, "time_per_iteration": 2.791947603225708 }, { "auxiliary_loss_clip": 0.0113655, "auxiliary_loss_mlp": 0.01051554, "balance_loss_clip": 1.04906452, "balance_loss_mlp": 1.02853465, "epoch": 0.09210882308732903, "flos": 20885646389760.0, "grad_norm": 1.7570281394880602, "language_loss": 0.81253195, "learning_rate": 3.959672139580233e-06, "loss": 0.83441293, "num_input_tokens_seen": 32736385, "step": 1532, "time_per_iteration": 2.737739324569702 }, { "auxiliary_loss_clip": 0.01157728, "auxiliary_loss_mlp": 0.01056753, "balance_loss_clip": 1.052163, "balance_loss_mlp": 1.03385305, "epoch": 0.09216894633999699, "flos": 30956326727040.0, "grad_norm": 2.2821036564882182, "language_loss": 0.84194255, "learning_rate": 3.9595942867763235e-06, "loss": 0.86408734, "num_input_tokens_seen": 32757140, "step": 1533, "time_per_iteration": 2.7542598247528076 }, { "auxiliary_loss_clip": 0.01149262, "auxiliary_loss_mlp": 0.01053623, "balance_loss_clip": 1.05813503, "balance_loss_mlp": 1.03190327, "epoch": 0.09222906959266497, "flos": 13151565676800.0, "grad_norm": 1.9396914937933663, "language_loss": 0.9009546, "learning_rate": 3.959516359664402e-06, "loss": 0.92298347, "num_input_tokens_seen": 32774860, "step": 1534, "time_per_iteration": 2.6450984477996826 }, { "auxiliary_loss_clip": 0.01150273, "auxiliary_loss_mlp": 0.0106298, "balance_loss_clip": 1.0495038, "balance_loss_mlp": 1.03849435, "epoch": 0.09228919284533293, "flos": 25994477784960.0, "grad_norm": 5.065477266086046, "language_loss": 0.75779241, "learning_rate": 3.959438358247424e-06, "loss": 0.77992499, "num_input_tokens_seen": 32795250, "step": 1535, "time_per_iteration": 2.730915069580078 }, { "auxiliary_loss_clip": 0.01168283, "auxiliary_loss_mlp": 0.01045276, "balance_loss_clip": 1.05278873, "balance_loss_mlp": 1.02403271, "epoch": 0.0923493160980009, "flos": 18660800954880.0, "grad_norm": 1.8085584532497372, "language_loss": 0.81631637, "learning_rate": 3.959360282528346e-06, "loss": 0.83845198, "num_input_tokens_seen": 32813805, "step": 1536, "time_per_iteration": 2.7326817512512207 }, { "auxiliary_loss_clip": 0.01181977, "auxiliary_loss_mlp": 0.01053699, "balance_loss_clip": 1.05431938, "balance_loss_mlp": 1.03224182, "epoch": 0.09240943935066886, "flos": 21140576190720.0, "grad_norm": 2.0929096884707556, "language_loss": 0.89092755, "learning_rate": 3.959282132510131e-06, "loss": 0.9132843, "num_input_tokens_seen": 32830960, "step": 1537, "time_per_iteration": 2.675771713256836 }, { "auxiliary_loss_clip": 0.01157238, "auxiliary_loss_mlp": 0.01058647, "balance_loss_clip": 1.05114293, "balance_loss_mlp": 1.03605688, "epoch": 0.09246956260333684, "flos": 20592435669120.0, "grad_norm": 1.9480116987165197, "language_loss": 0.80702311, "learning_rate": 3.959203908195741e-06, "loss": 0.82918191, "num_input_tokens_seen": 32848275, "step": 1538, "time_per_iteration": 2.71618390083313 }, { "auxiliary_loss_clip": 0.01060495, "auxiliary_loss_mlp": 0.0101237, "balance_loss_clip": 1.03095436, "balance_loss_mlp": 1.00872231, "epoch": 0.09252968585600481, "flos": 67558710614400.0, "grad_norm": 0.7534074452314953, "language_loss": 0.57429332, "learning_rate": 3.959125609588142e-06, "loss": 0.59502202, "num_input_tokens_seen": 32917730, "step": 1539, "time_per_iteration": 3.3933441638946533 }, { "auxiliary_loss_clip": 0.01159831, "auxiliary_loss_mlp": 0.01050602, "balance_loss_clip": 1.05638027, "balance_loss_mlp": 1.02863121, "epoch": 0.09258980910867277, "flos": 17383853479680.0, "grad_norm": 2.849299216868502, "language_loss": 0.67554641, "learning_rate": 3.959047236690304e-06, "loss": 0.69765073, "num_input_tokens_seen": 32934910, "step": 1540, "time_per_iteration": 2.757084608078003 }, { "auxiliary_loss_clip": 0.01144239, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.04954028, "balance_loss_mlp": 1.026438, "epoch": 0.09264993236134075, "flos": 19865927185920.0, "grad_norm": 2.044335478602743, "language_loss": 0.83917534, "learning_rate": 3.958968789505198e-06, "loss": 0.86110216, "num_input_tokens_seen": 32953840, "step": 1541, "time_per_iteration": 2.8497180938720703 }, { "auxiliary_loss_clip": 0.01077839, "auxiliary_loss_mlp": 0.01013078, "balance_loss_clip": 1.02602255, "balance_loss_mlp": 1.0097636, "epoch": 0.09271005561400872, "flos": 62284401262080.0, "grad_norm": 0.8790732834061692, "language_loss": 0.61881655, "learning_rate": 3.9588902680358e-06, "loss": 0.63972563, "num_input_tokens_seen": 33011410, "step": 1542, "time_per_iteration": 3.3079330921173096 }, { "auxiliary_loss_clip": 0.01161232, "auxiliary_loss_mlp": 0.01059438, "balance_loss_clip": 1.05441117, "balance_loss_mlp": 1.03808808, "epoch": 0.09277017886667668, "flos": 23329870139520.0, "grad_norm": 1.6256118826429122, "language_loss": 0.82802349, "learning_rate": 3.958811672285086e-06, "loss": 0.85023022, "num_input_tokens_seen": 33031675, "step": 1543, "time_per_iteration": 2.7408807277679443 }, { "auxiliary_loss_clip": 0.01135873, "auxiliary_loss_mlp": 0.01060295, "balance_loss_clip": 1.04848838, "balance_loss_mlp": 1.03863442, "epoch": 0.09283030211934466, "flos": 54745169875200.0, "grad_norm": 1.706948475246468, "language_loss": 0.72265279, "learning_rate": 3.958733002256038e-06, "loss": 0.74461448, "num_input_tokens_seen": 33056355, "step": 1544, "time_per_iteration": 3.104156255722046 }, { "auxiliary_loss_clip": 0.01166071, "auxiliary_loss_mlp": 0.01055881, "balance_loss_clip": 1.05165935, "balance_loss_mlp": 1.03138375, "epoch": 0.09289042537201263, "flos": 30334784762880.0, "grad_norm": 1.7720844214030114, "language_loss": 0.77286768, "learning_rate": 3.958654257951637e-06, "loss": 0.79508722, "num_input_tokens_seen": 33079520, "step": 1545, "time_per_iteration": 2.808180570602417 }, { "auxiliary_loss_clip": 0.01140161, "auxiliary_loss_mlp": 0.01050495, "balance_loss_clip": 1.0526737, "balance_loss_mlp": 1.02872682, "epoch": 0.09295054862468059, "flos": 17746838369280.0, "grad_norm": 2.7089619481030076, "language_loss": 0.74396008, "learning_rate": 3.9585754393748706e-06, "loss": 0.76586664, "num_input_tokens_seen": 33096135, "step": 1546, "time_per_iteration": 2.7634081840515137 }, { "auxiliary_loss_clip": 0.01163775, "auxiliary_loss_mlp": 0.0105305, "balance_loss_clip": 1.05357957, "balance_loss_mlp": 1.02956545, "epoch": 0.09301067187734856, "flos": 23658021815040.0, "grad_norm": 1.9423225100503794, "language_loss": 0.84200966, "learning_rate": 3.9584965465287275e-06, "loss": 0.86417794, "num_input_tokens_seen": 33115245, "step": 1547, "time_per_iteration": 2.790003776550293 }, { "auxiliary_loss_clip": 0.01141839, "auxiliary_loss_mlp": 0.01053941, "balance_loss_clip": 1.04740989, "balance_loss_mlp": 1.03195918, "epoch": 0.09307079513001654, "flos": 27527719777920.0, "grad_norm": 2.6545433694843488, "language_loss": 0.67698336, "learning_rate": 3.958417579416199e-06, "loss": 0.69894123, "num_input_tokens_seen": 33136640, "step": 1548, "time_per_iteration": 2.8367013931274414 }, { "auxiliary_loss_clip": 0.01123899, "auxiliary_loss_mlp": 0.01059885, "balance_loss_clip": 1.04744387, "balance_loss_mlp": 1.03754544, "epoch": 0.0931309183826845, "flos": 20627340710400.0, "grad_norm": 1.6829727803454704, "language_loss": 0.8326273, "learning_rate": 3.9583385380402795e-06, "loss": 0.85446513, "num_input_tokens_seen": 33155060, "step": 1549, "time_per_iteration": 2.8462016582489014 }, { "auxiliary_loss_clip": 0.01176243, "auxiliary_loss_mlp": 0.0104617, "balance_loss_clip": 1.05815506, "balance_loss_mlp": 1.02473652, "epoch": 0.09319104163535247, "flos": 29020921084800.0, "grad_norm": 1.5528514681372962, "language_loss": 0.75838119, "learning_rate": 3.958259422403966e-06, "loss": 0.78060532, "num_input_tokens_seen": 33175420, "step": 1550, "time_per_iteration": 2.7325351238250732 }, { "auxiliary_loss_clip": 0.01150315, "auxiliary_loss_mlp": 0.01069257, "balance_loss_clip": 1.05249369, "balance_loss_mlp": 1.04483092, "epoch": 0.09325116488802045, "flos": 25301545539840.0, "grad_norm": 2.1922696027472233, "language_loss": 0.82828665, "learning_rate": 3.95818023251026e-06, "loss": 0.85048234, "num_input_tokens_seen": 33194120, "step": 1551, "time_per_iteration": 2.852602481842041 }, { "auxiliary_loss_clip": 0.01064371, "auxiliary_loss_mlp": 0.00760109, "balance_loss_clip": 1.02203059, "balance_loss_mlp": 0.99984246, "epoch": 0.09331128814068841, "flos": 61536203942400.0, "grad_norm": 0.7384225982202158, "language_loss": 0.61837572, "learning_rate": 3.958100968362163e-06, "loss": 0.63662052, "num_input_tokens_seen": 33261080, "step": 1552, "time_per_iteration": 3.3453099727630615 }, { "auxiliary_loss_clip": 0.01059175, "auxiliary_loss_mlp": 0.01016654, "balance_loss_clip": 1.02415061, "balance_loss_mlp": 1.01338792, "epoch": 0.09337141139335638, "flos": 53293700171520.0, "grad_norm": 0.8524917480784928, "language_loss": 0.58986926, "learning_rate": 3.958021629962681e-06, "loss": 0.61062753, "num_input_tokens_seen": 33330235, "step": 1553, "time_per_iteration": 3.37673282623291 }, { "auxiliary_loss_clip": 0.01146955, "auxiliary_loss_mlp": 0.01056683, "balance_loss_clip": 1.05026984, "balance_loss_mlp": 1.03336585, "epoch": 0.09343153464602436, "flos": 23476852592640.0, "grad_norm": 2.3365109182487, "language_loss": 0.87665397, "learning_rate": 3.957942217314823e-06, "loss": 0.8986904, "num_input_tokens_seen": 33349035, "step": 1554, "time_per_iteration": 2.8098127841949463 }, { "auxiliary_loss_clip": 0.01153047, "auxiliary_loss_mlp": 0.01057257, "balance_loss_clip": 1.05439448, "balance_loss_mlp": 1.03393972, "epoch": 0.09349165789869232, "flos": 19353481804800.0, "grad_norm": 4.388884220182432, "language_loss": 0.81678319, "learning_rate": 3.957862730421599e-06, "loss": 0.83888626, "num_input_tokens_seen": 33368060, "step": 1555, "time_per_iteration": 2.726207971572876 }, { "auxiliary_loss_clip": 0.01058869, "auxiliary_loss_mlp": 0.01003892, "balance_loss_clip": 1.0202632, "balance_loss_mlp": 1.00045919, "epoch": 0.09355178115136029, "flos": 67502580635520.0, "grad_norm": 0.8683826280274983, "language_loss": 0.59606886, "learning_rate": 3.957783169286024e-06, "loss": 0.61669648, "num_input_tokens_seen": 33430825, "step": 1556, "time_per_iteration": 3.209326982498169 }, { "auxiliary_loss_clip": 0.01174249, "auxiliary_loss_mlp": 0.01059741, "balance_loss_clip": 1.05518138, "balance_loss_mlp": 1.03727031, "epoch": 0.09361190440402825, "flos": 37341638720640.0, "grad_norm": 1.6803158790244075, "language_loss": 0.84290808, "learning_rate": 3.9577035339111155e-06, "loss": 0.86524796, "num_input_tokens_seen": 33454855, "step": 1557, "time_per_iteration": 2.831650733947754 }, { "auxiliary_loss_clip": 0.01110857, "auxiliary_loss_mlp": 0.01065156, "balance_loss_clip": 1.04900038, "balance_loss_mlp": 1.04112351, "epoch": 0.09367202765669623, "flos": 24899705112960.0, "grad_norm": 1.6725809358966677, "language_loss": 0.780913, "learning_rate": 3.957623824299893e-06, "loss": 0.8026731, "num_input_tokens_seen": 33476000, "step": 1558, "time_per_iteration": 3.0111780166625977 }, { "auxiliary_loss_clip": 0.01164994, "auxiliary_loss_mlp": 0.01051229, "balance_loss_clip": 1.0558666, "balance_loss_mlp": 1.02881753, "epoch": 0.0937321509093642, "flos": 15705568368000.0, "grad_norm": 2.0141986314124414, "language_loss": 0.80066288, "learning_rate": 3.957544040455379e-06, "loss": 0.82282507, "num_input_tokens_seen": 33493845, "step": 1559, "time_per_iteration": 3.024117946624756 }, { "auxiliary_loss_clip": 0.01141277, "auxiliary_loss_mlp": 0.01061718, "balance_loss_clip": 1.05060387, "balance_loss_mlp": 1.04012942, "epoch": 0.09379227416203216, "flos": 20483698222080.0, "grad_norm": 1.8358373674042003, "language_loss": 0.76418209, "learning_rate": 3.957464182380599e-06, "loss": 0.78621197, "num_input_tokens_seen": 33510850, "step": 1560, "time_per_iteration": 2.68558406829834 }, { "auxiliary_loss_clip": 0.01137939, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.05014277, "balance_loss_mlp": 1.03213274, "epoch": 0.09385239741470014, "flos": 24352498344960.0, "grad_norm": 3.575155933252121, "language_loss": 0.80784953, "learning_rate": 3.95738425007858e-06, "loss": 0.82977819, "num_input_tokens_seen": 33530430, "step": 1561, "time_per_iteration": 2.759148359298706 }, { "auxiliary_loss_clip": 0.01173652, "auxiliary_loss_mlp": 0.01052448, "balance_loss_clip": 1.05276573, "balance_loss_mlp": 1.02989376, "epoch": 0.0939125206673681, "flos": 33291489807360.0, "grad_norm": 2.448664627367939, "language_loss": 0.6140722, "learning_rate": 3.957304243552354e-06, "loss": 0.63633323, "num_input_tokens_seen": 33551975, "step": 1562, "time_per_iteration": 2.9014978408813477 }, { "auxiliary_loss_clip": 0.01162693, "auxiliary_loss_mlp": 0.0106374, "balance_loss_clip": 1.05719543, "balance_loss_mlp": 1.04213953, "epoch": 0.09397264392003607, "flos": 19244923925760.0, "grad_norm": 3.5098220300578555, "language_loss": 0.8496151, "learning_rate": 3.957224162804956e-06, "loss": 0.87187934, "num_input_tokens_seen": 33569850, "step": 1563, "time_per_iteration": 4.404061555862427 }, { "auxiliary_loss_clip": 0.01164811, "auxiliary_loss_mlp": 0.01047932, "balance_loss_clip": 1.05775142, "balance_loss_mlp": 1.02652228, "epoch": 0.09403276717270405, "flos": 19317930318720.0, "grad_norm": 1.6765528861156813, "language_loss": 0.76511294, "learning_rate": 3.9571440078394205e-06, "loss": 0.78724039, "num_input_tokens_seen": 33590510, "step": 1564, "time_per_iteration": 4.255565166473389 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01063256, "balance_loss_clip": 1.05196142, "balance_loss_mlp": 1.04172707, "epoch": 0.09409289042537201, "flos": 23583471137280.0, "grad_norm": 1.9762038777899962, "language_loss": 0.80134326, "learning_rate": 3.9570637786587895e-06, "loss": 0.82348871, "num_input_tokens_seen": 33608810, "step": 1565, "time_per_iteration": 2.8548545837402344 }, { "auxiliary_loss_clip": 0.01158602, "auxiliary_loss_mlp": 0.01063767, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.04233313, "epoch": 0.09415301367803998, "flos": 20078446003200.0, "grad_norm": 1.6810250981626251, "language_loss": 0.75134379, "learning_rate": 3.956983475266103e-06, "loss": 0.77356744, "num_input_tokens_seen": 33627265, "step": 1566, "time_per_iteration": 4.889045715332031 }, { "auxiliary_loss_clip": 0.01145856, "auxiliary_loss_mlp": 0.00780689, "balance_loss_clip": 1.05168366, "balance_loss_mlp": 1.00022864, "epoch": 0.09421313693070796, "flos": 21062075016960.0, "grad_norm": 1.6828919748843199, "language_loss": 0.77958012, "learning_rate": 3.956903097664407e-06, "loss": 0.79884553, "num_input_tokens_seen": 33644810, "step": 1567, "time_per_iteration": 4.445765972137451 }, { "auxiliary_loss_clip": 0.01156815, "auxiliary_loss_mlp": 0.01056228, "balance_loss_clip": 1.05256855, "balance_loss_mlp": 1.03591454, "epoch": 0.09427326018337592, "flos": 24316156759680.0, "grad_norm": 2.008686295040646, "language_loss": 0.82608044, "learning_rate": 3.956822645856749e-06, "loss": 0.84821093, "num_input_tokens_seen": 33665665, "step": 1568, "time_per_iteration": 2.881535768508911 }, { "auxiliary_loss_clip": 0.01187915, "auxiliary_loss_mlp": 0.01051731, "balance_loss_clip": 1.05717778, "balance_loss_mlp": 1.02927184, "epoch": 0.09433338343604389, "flos": 20263888944000.0, "grad_norm": 1.9573151026586577, "language_loss": 0.76943743, "learning_rate": 3.9567421198461814e-06, "loss": 0.79183388, "num_input_tokens_seen": 33684760, "step": 1569, "time_per_iteration": 2.6097726821899414 }, { "auxiliary_loss_clip": 0.01120191, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04771852, "balance_loss_mlp": 1.03625941, "epoch": 0.09439350668871185, "flos": 12742973493120.0, "grad_norm": 3.3813700161908917, "language_loss": 0.85488856, "learning_rate": 3.956661519635756e-06, "loss": 0.87669849, "num_input_tokens_seen": 33700750, "step": 1570, "time_per_iteration": 2.7571377754211426 }, { "auxiliary_loss_clip": 0.01122458, "auxiliary_loss_mlp": 0.01055939, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.03183508, "epoch": 0.09445362994137983, "flos": 25962266263680.0, "grad_norm": 1.540414635950846, "language_loss": 0.76415235, "learning_rate": 3.95658084522853e-06, "loss": 0.7859363, "num_input_tokens_seen": 33724430, "step": 1571, "time_per_iteration": 2.913569211959839 }, { "auxiliary_loss_clip": 0.01135683, "auxiliary_loss_mlp": 0.01057111, "balance_loss_clip": 1.0490278, "balance_loss_mlp": 1.0349735, "epoch": 0.0945137531940478, "flos": 19715353372800.0, "grad_norm": 1.6745378641752047, "language_loss": 0.79397607, "learning_rate": 3.956500096627561e-06, "loss": 0.81590402, "num_input_tokens_seen": 33743455, "step": 1572, "time_per_iteration": 2.813410758972168 }, { "auxiliary_loss_clip": 0.01148251, "auxiliary_loss_mlp": 0.0106927, "balance_loss_clip": 1.05619979, "balance_loss_mlp": 1.04524922, "epoch": 0.09457387644671576, "flos": 23617047375360.0, "grad_norm": 1.7559396294879055, "language_loss": 0.87707287, "learning_rate": 3.956419273835913e-06, "loss": 0.89924812, "num_input_tokens_seen": 33763435, "step": 1573, "time_per_iteration": 2.776535987854004 }, { "auxiliary_loss_clip": 0.01161183, "auxiliary_loss_mlp": 0.01063326, "balance_loss_clip": 1.05485129, "balance_loss_mlp": 1.03804219, "epoch": 0.09463399969938374, "flos": 26907291135360.0, "grad_norm": 2.9707854698090097, "language_loss": 0.81982428, "learning_rate": 3.95633837685665e-06, "loss": 0.84206939, "num_input_tokens_seen": 33784325, "step": 1574, "time_per_iteration": 2.7604806423187256 }, { "auxiliary_loss_clip": 0.01156287, "auxiliary_loss_mlp": 0.01055594, "balance_loss_clip": 1.05234718, "balance_loss_mlp": 1.0344342, "epoch": 0.0946941229520517, "flos": 23659566099840.0, "grad_norm": 1.7178511535677499, "language_loss": 0.80855322, "learning_rate": 3.95625740569284e-06, "loss": 0.83067203, "num_input_tokens_seen": 33802510, "step": 1575, "time_per_iteration": 2.713247299194336 }, { "auxiliary_loss_clip": 0.01182326, "auxiliary_loss_mlp": 0.01068689, "balance_loss_clip": 1.05578864, "balance_loss_mlp": 1.04581285, "epoch": 0.09475424620471967, "flos": 24134053783680.0, "grad_norm": 1.9110861379460222, "language_loss": 0.86483347, "learning_rate": 3.956176360347553e-06, "loss": 0.88734365, "num_input_tokens_seen": 33819980, "step": 1576, "time_per_iteration": 2.682644844055176 }, { "auxiliary_loss_clip": 0.01056441, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.0225811, "balance_loss_mlp": 1.02344561, "epoch": 0.09481436945738765, "flos": 68426168065920.0, "grad_norm": 0.9789918611127905, "language_loss": 0.6582402, "learning_rate": 3.956095240823862e-06, "loss": 0.67907751, "num_input_tokens_seen": 33878925, "step": 1577, "time_per_iteration": 3.2106685638427734 }, { "auxiliary_loss_clip": 0.01147668, "auxiliary_loss_mlp": 0.01051958, "balance_loss_clip": 1.05218005, "balance_loss_mlp": 1.03098869, "epoch": 0.09487449271005562, "flos": 16654076858880.0, "grad_norm": 1.8223175005615506, "language_loss": 0.79152733, "learning_rate": 3.956014047124844e-06, "loss": 0.81352365, "num_input_tokens_seen": 33897600, "step": 1578, "time_per_iteration": 2.820089340209961 }, { "auxiliary_loss_clip": 0.01185941, "auxiliary_loss_mlp": 0.01066432, "balance_loss_clip": 1.05838132, "balance_loss_mlp": 1.04437804, "epoch": 0.09493461596272358, "flos": 24275685110400.0, "grad_norm": 3.480730999818176, "language_loss": 0.78161818, "learning_rate": 3.955932779253578e-06, "loss": 0.80414188, "num_input_tokens_seen": 33917365, "step": 1579, "time_per_iteration": 2.6518983840942383 }, { "auxiliary_loss_clip": 0.01128319, "auxiliary_loss_mlp": 0.01065633, "balance_loss_clip": 1.04771328, "balance_loss_mlp": 1.04001498, "epoch": 0.09499473921539155, "flos": 21870173243520.0, "grad_norm": 2.0084876987684526, "language_loss": 0.73410392, "learning_rate": 3.955851437213144e-06, "loss": 0.75604343, "num_input_tokens_seen": 33936680, "step": 1580, "time_per_iteration": 2.679461717605591 }, { "auxiliary_loss_clip": 0.01157568, "auxiliary_loss_mlp": 0.01062628, "balance_loss_clip": 1.05573344, "balance_loss_mlp": 1.04095626, "epoch": 0.09505486246805953, "flos": 33547137880320.0, "grad_norm": 14.809542792179553, "language_loss": 0.77565914, "learning_rate": 3.955770021006627e-06, "loss": 0.7978611, "num_input_tokens_seen": 33960685, "step": 1581, "time_per_iteration": 2.765394449234009 }, { "auxiliary_loss_clip": 0.01144835, "auxiliary_loss_mlp": 0.0106468, "balance_loss_clip": 1.05426359, "balance_loss_mlp": 1.04276967, "epoch": 0.09511498572072749, "flos": 21215342350080.0, "grad_norm": 1.8617167187056045, "language_loss": 0.87230825, "learning_rate": 3.955688530637116e-06, "loss": 0.89440346, "num_input_tokens_seen": 33980015, "step": 1582, "time_per_iteration": 2.691364288330078 }, { "auxiliary_loss_clip": 0.01174295, "auxiliary_loss_mlp": 0.0106431, "balance_loss_clip": 1.05508888, "balance_loss_mlp": 1.04039705, "epoch": 0.09517510897339546, "flos": 14611262572800.0, "grad_norm": 1.8512060219658202, "language_loss": 0.67043924, "learning_rate": 3.955606966107699e-06, "loss": 0.69282532, "num_input_tokens_seen": 33997705, "step": 1583, "time_per_iteration": 2.6693732738494873 }, { "auxiliary_loss_clip": 0.01177751, "auxiliary_loss_mlp": 0.01053743, "balance_loss_clip": 1.0593859, "balance_loss_mlp": 1.03035378, "epoch": 0.09523523222606343, "flos": 27817339138560.0, "grad_norm": 2.144216926782962, "language_loss": 0.70752859, "learning_rate": 3.95552532742147e-06, "loss": 0.7298435, "num_input_tokens_seen": 34017465, "step": 1584, "time_per_iteration": 2.7164390087127686 }, { "auxiliary_loss_clip": 0.01138507, "auxiliary_loss_mlp": 0.0105762, "balance_loss_clip": 1.05243039, "balance_loss_mlp": 1.03584039, "epoch": 0.0952953554787314, "flos": 20706272847360.0, "grad_norm": 1.4654737580846544, "language_loss": 0.8080442, "learning_rate": 3.955443614581525e-06, "loss": 0.83000553, "num_input_tokens_seen": 34038550, "step": 1585, "time_per_iteration": 2.879831314086914 }, { "auxiliary_loss_clip": 0.01159374, "auxiliary_loss_mlp": 0.01057717, "balance_loss_clip": 1.05387473, "balance_loss_mlp": 1.03355336, "epoch": 0.09535547873139937, "flos": 24787627701120.0, "grad_norm": 1.638250735795891, "language_loss": 0.71921158, "learning_rate": 3.955361827590961e-06, "loss": 0.74138248, "num_input_tokens_seen": 34058665, "step": 1586, "time_per_iteration": 2.750436544418335 }, { "auxiliary_loss_clip": 0.01048565, "auxiliary_loss_mlp": 0.01003302, "balance_loss_clip": 1.03115988, "balance_loss_mlp": 0.99901009, "epoch": 0.09541560198406734, "flos": 71912194905600.0, "grad_norm": 0.8099482252624973, "language_loss": 0.55475175, "learning_rate": 3.955279966452883e-06, "loss": 0.57527041, "num_input_tokens_seen": 34109655, "step": 1587, "time_per_iteration": 3.0975699424743652 }, { "auxiliary_loss_clip": 0.01128884, "auxiliary_loss_mlp": 0.0105965, "balance_loss_clip": 1.04768586, "balance_loss_mlp": 1.03661847, "epoch": 0.09547572523673531, "flos": 28982604251520.0, "grad_norm": 1.708481785076906, "language_loss": 0.81062275, "learning_rate": 3.955198031170391e-06, "loss": 0.83250809, "num_input_tokens_seen": 34131115, "step": 1588, "time_per_iteration": 2.7718451023101807 }, { "auxiliary_loss_clip": 0.01131602, "auxiliary_loss_mlp": 0.01056117, "balance_loss_clip": 1.04894614, "balance_loss_mlp": 1.03438473, "epoch": 0.09553584848940327, "flos": 24133910129280.0, "grad_norm": 1.5119879232668088, "language_loss": 0.81481898, "learning_rate": 3.955116021746594e-06, "loss": 0.83669615, "num_input_tokens_seen": 34151925, "step": 1589, "time_per_iteration": 2.782468795776367 }, { "auxiliary_loss_clip": 0.0112194, "auxiliary_loss_mlp": 0.00780573, "balance_loss_clip": 1.0508883, "balance_loss_mlp": 1.00013089, "epoch": 0.09559597174207124, "flos": 42851376789120.0, "grad_norm": 1.525287399882202, "language_loss": 0.64882791, "learning_rate": 3.955033938184601e-06, "loss": 0.667853, "num_input_tokens_seen": 34175395, "step": 1590, "time_per_iteration": 3.0783450603485107 }, { "auxiliary_loss_clip": 0.01143501, "auxiliary_loss_mlp": 0.01058399, "balance_loss_clip": 1.05087948, "balance_loss_mlp": 1.0358206, "epoch": 0.09565609499473922, "flos": 32670845683200.0, "grad_norm": 2.0745314237741916, "language_loss": 0.83290577, "learning_rate": 3.954951780487526e-06, "loss": 0.85492468, "num_input_tokens_seen": 34197760, "step": 1591, "time_per_iteration": 2.8393962383270264 }, { "auxiliary_loss_clip": 0.01163486, "auxiliary_loss_mlp": 0.01065588, "balance_loss_clip": 1.0522387, "balance_loss_mlp": 1.04266405, "epoch": 0.09571621824740718, "flos": 18478410670080.0, "grad_norm": 2.825705290827541, "language_loss": 0.74087322, "learning_rate": 3.9548695486584835e-06, "loss": 0.76316392, "num_input_tokens_seen": 34215330, "step": 1592, "time_per_iteration": 2.6828882694244385 }, { "auxiliary_loss_clip": 0.01169239, "auxiliary_loss_mlp": 0.01055073, "balance_loss_clip": 1.05161428, "balance_loss_mlp": 1.03337741, "epoch": 0.09577634150007515, "flos": 29387497334400.0, "grad_norm": 2.18277080043521, "language_loss": 0.74483889, "learning_rate": 3.954787242700592e-06, "loss": 0.76708198, "num_input_tokens_seen": 34237745, "step": 1593, "time_per_iteration": 2.7193498611450195 }, { "auxiliary_loss_clip": 0.01177343, "auxiliary_loss_mlp": 0.01055096, "balance_loss_clip": 1.05910873, "balance_loss_mlp": 1.03307831, "epoch": 0.09583646475274313, "flos": 22747830157440.0, "grad_norm": 1.887493467708827, "language_loss": 0.69782627, "learning_rate": 3.954704862616971e-06, "loss": 0.72015071, "num_input_tokens_seen": 34256565, "step": 1594, "time_per_iteration": 2.635383367538452 }, { "auxiliary_loss_clip": 0.01173222, "auxiliary_loss_mlp": 0.01051806, "balance_loss_clip": 1.05618978, "balance_loss_mlp": 1.03037214, "epoch": 0.0958965880054111, "flos": 23218367345280.0, "grad_norm": 2.1411006117727682, "language_loss": 0.82780552, "learning_rate": 3.954622408410747e-06, "loss": 0.85005581, "num_input_tokens_seen": 34275970, "step": 1595, "time_per_iteration": 2.7158257961273193 }, { "auxiliary_loss_clip": 0.01153253, "auxiliary_loss_mlp": 0.01054246, "balance_loss_clip": 1.05143809, "balance_loss_mlp": 1.0301652, "epoch": 0.09595671125807906, "flos": 21324438933120.0, "grad_norm": 1.7751890788987925, "language_loss": 0.84513396, "learning_rate": 3.954539880085045e-06, "loss": 0.86720896, "num_input_tokens_seen": 34295490, "step": 1596, "time_per_iteration": 2.710228204727173 }, { "auxiliary_loss_clip": 0.01166586, "auxiliary_loss_mlp": 0.0105804, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.03376901, "epoch": 0.09601683451074704, "flos": 39603472185600.0, "grad_norm": 1.8335529067237837, "language_loss": 0.69328064, "learning_rate": 3.9544572776429945e-06, "loss": 0.71552688, "num_input_tokens_seen": 34319990, "step": 1597, "time_per_iteration": 2.802959442138672 }, { "auxiliary_loss_clip": 0.01167235, "auxiliary_loss_mlp": 0.00780978, "balance_loss_clip": 1.0503217, "balance_loss_mlp": 1.00010371, "epoch": 0.096076957763415, "flos": 23732716147200.0, "grad_norm": 2.0491570740921885, "language_loss": 0.7486403, "learning_rate": 3.954374601087729e-06, "loss": 0.76812243, "num_input_tokens_seen": 34339225, "step": 1598, "time_per_iteration": 2.6502270698547363 }, { "auxiliary_loss_clip": 0.01176661, "auxiliary_loss_mlp": 0.01053936, "balance_loss_clip": 1.05745888, "balance_loss_mlp": 1.03009462, "epoch": 0.09613708101608297, "flos": 34678108483200.0, "grad_norm": 1.6831440826618358, "language_loss": 0.68804371, "learning_rate": 3.954291850422382e-06, "loss": 0.71034968, "num_input_tokens_seen": 34361020, "step": 1599, "time_per_iteration": 2.74243426322937 }, { "auxiliary_loss_clip": 0.01157322, "auxiliary_loss_mlp": 0.01059883, "balance_loss_clip": 1.05754852, "balance_loss_mlp": 1.0371263, "epoch": 0.09619720426875093, "flos": 20740028653440.0, "grad_norm": 2.9774251326108367, "language_loss": 0.83950365, "learning_rate": 3.954209025650093e-06, "loss": 0.86167574, "num_input_tokens_seen": 34378630, "step": 1600, "time_per_iteration": 2.702907085418701 }, { "auxiliary_loss_clip": 0.01150263, "auxiliary_loss_mlp": 0.01054168, "balance_loss_clip": 1.05129707, "balance_loss_mlp": 1.03093433, "epoch": 0.09625732752141891, "flos": 13042720488960.0, "grad_norm": 2.287254549480118, "language_loss": 0.80520785, "learning_rate": 3.954126126774001e-06, "loss": 0.82725215, "num_input_tokens_seen": 34397110, "step": 1601, "time_per_iteration": 2.693399429321289 }, { "auxiliary_loss_clip": 0.01181247, "auxiliary_loss_mlp": 0.01054578, "balance_loss_clip": 1.05711937, "balance_loss_mlp": 1.03133249, "epoch": 0.09631745077408688, "flos": 22273629782400.0, "grad_norm": 2.4356926646094954, "language_loss": 0.81959623, "learning_rate": 3.954043153797251e-06, "loss": 0.84195447, "num_input_tokens_seen": 34414165, "step": 1602, "time_per_iteration": 2.639479875564575 }, { "auxiliary_loss_clip": 0.01137855, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.05295444, "balance_loss_mlp": 1.02681863, "epoch": 0.09637757402675484, "flos": 24754266944640.0, "grad_norm": 3.099164686790191, "language_loss": 0.62498438, "learning_rate": 3.953960106722989e-06, "loss": 0.64687788, "num_input_tokens_seen": 34434445, "step": 1603, "time_per_iteration": 4.341834306716919 }, { "auxiliary_loss_clip": 0.01189954, "auxiliary_loss_mlp": 0.01054376, "balance_loss_clip": 1.05902839, "balance_loss_mlp": 1.02918696, "epoch": 0.09643769727942282, "flos": 22525758322560.0, "grad_norm": 3.121905357886113, "language_loss": 0.70996022, "learning_rate": 3.953876985554364e-06, "loss": 0.73240346, "num_input_tokens_seen": 34453095, "step": 1604, "time_per_iteration": 2.6520893573760986 }, { "auxiliary_loss_clip": 0.01176446, "auxiliary_loss_mlp": 0.01055314, "balance_loss_clip": 1.0570209, "balance_loss_mlp": 1.03358221, "epoch": 0.09649782053209079, "flos": 30921026636160.0, "grad_norm": 2.082890345500055, "language_loss": 0.7993719, "learning_rate": 3.953793790294527e-06, "loss": 0.82168949, "num_input_tokens_seen": 34473680, "step": 1605, "time_per_iteration": 4.5557661056518555 }, { "auxiliary_loss_clip": 0.01161047, "auxiliary_loss_mlp": 0.01047918, "balance_loss_clip": 1.05455577, "balance_loss_mlp": 1.0245893, "epoch": 0.09655794378475875, "flos": 25337635729920.0, "grad_norm": 1.990204665194141, "language_loss": 0.74550986, "learning_rate": 3.953710520946634e-06, "loss": 0.76759952, "num_input_tokens_seen": 34492610, "step": 1606, "time_per_iteration": 2.7172651290893555 }, { "auxiliary_loss_clip": 0.01172416, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.05834222, "balance_loss_mlp": 1.02378857, "epoch": 0.09661806703742673, "flos": 22346061557760.0, "grad_norm": 1.6403710807101601, "language_loss": 0.7571919, "learning_rate": 3.953627177513843e-06, "loss": 0.77938372, "num_input_tokens_seen": 34511855, "step": 1607, "time_per_iteration": 4.302686452865601 }, { "auxiliary_loss_clip": 0.01139491, "auxiliary_loss_mlp": 0.01051546, "balance_loss_clip": 1.04833579, "balance_loss_mlp": 1.0289799, "epoch": 0.0966781902900947, "flos": 17457578144640.0, "grad_norm": 1.975850982703557, "language_loss": 0.86756283, "learning_rate": 3.953543759999312e-06, "loss": 0.88947326, "num_input_tokens_seen": 34528905, "step": 1608, "time_per_iteration": 2.6280455589294434 }, { "auxiliary_loss_clip": 0.01126253, "auxiliary_loss_mlp": 0.01064704, "balance_loss_clip": 1.05433142, "balance_loss_mlp": 1.03940821, "epoch": 0.09673831354276266, "flos": 36903995412480.0, "grad_norm": 2.3082762386200266, "language_loss": 0.71363097, "learning_rate": 3.953460268406207e-06, "loss": 0.73554057, "num_input_tokens_seen": 34548480, "step": 1609, "time_per_iteration": 2.9116146564483643 }, { "auxiliary_loss_clip": 0.01149353, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.0546515, "balance_loss_mlp": 1.03606534, "epoch": 0.09679843679543064, "flos": 20701388597760.0, "grad_norm": 1.9988414994799784, "language_loss": 0.84810984, "learning_rate": 3.953376702737693e-06, "loss": 0.87018514, "num_input_tokens_seen": 34565410, "step": 1610, "time_per_iteration": 2.8005051612854004 }, { "auxiliary_loss_clip": 0.01161389, "auxiliary_loss_mlp": 0.01056267, "balance_loss_clip": 1.05790925, "balance_loss_mlp": 1.03228188, "epoch": 0.0968585600480986, "flos": 23514415240320.0, "grad_norm": 2.176236379770122, "language_loss": 0.6696198, "learning_rate": 3.953293062996939e-06, "loss": 0.69179636, "num_input_tokens_seen": 34584840, "step": 1611, "time_per_iteration": 2.731931447982788 }, { "auxiliary_loss_clip": 0.01125259, "auxiliary_loss_mlp": 0.01057116, "balance_loss_clip": 1.04740572, "balance_loss_mlp": 1.03385806, "epoch": 0.09691868330076657, "flos": 20121072468480.0, "grad_norm": 1.6508278294088392, "language_loss": 0.81067657, "learning_rate": 3.953209349187115e-06, "loss": 0.83250034, "num_input_tokens_seen": 34603360, "step": 1612, "time_per_iteration": 2.7998390197753906 }, { "auxiliary_loss_clip": 0.01182404, "auxiliary_loss_mlp": 0.01069551, "balance_loss_clip": 1.06046534, "balance_loss_mlp": 1.04600716, "epoch": 0.09697880655343454, "flos": 16544692967040.0, "grad_norm": 3.304939197664143, "language_loss": 0.80836105, "learning_rate": 3.953125561311398e-06, "loss": 0.83088064, "num_input_tokens_seen": 34620760, "step": 1613, "time_per_iteration": 2.624218702316284 }, { "auxiliary_loss_clip": 0.01148565, "auxiliary_loss_mlp": 0.01054743, "balance_loss_clip": 1.05542159, "balance_loss_mlp": 1.03047192, "epoch": 0.09703892980610251, "flos": 26104184899200.0, "grad_norm": 1.7164386274315457, "language_loss": 0.84289789, "learning_rate": 3.953041699372964e-06, "loss": 0.86493099, "num_input_tokens_seen": 34640695, "step": 1614, "time_per_iteration": 2.744340419769287 }, { "auxiliary_loss_clip": 0.01066618, "auxiliary_loss_mlp": 0.00759744, "balance_loss_clip": 1.02654934, "balance_loss_mlp": 1.00008702, "epoch": 0.09709905305877048, "flos": 60443622000000.0, "grad_norm": 0.7127167896900892, "language_loss": 0.54629624, "learning_rate": 3.952957763374992e-06, "loss": 0.56455994, "num_input_tokens_seen": 34702395, "step": 1615, "time_per_iteration": 3.1547679901123047 }, { "auxiliary_loss_clip": 0.01033143, "auxiliary_loss_mlp": 0.01017555, "balance_loss_clip": 1.02384067, "balance_loss_mlp": 1.01381195, "epoch": 0.09715917631143844, "flos": 57639932893440.0, "grad_norm": 0.7689373847786285, "language_loss": 0.58190405, "learning_rate": 3.952873753320666e-06, "loss": 0.60241103, "num_input_tokens_seen": 34768910, "step": 1616, "time_per_iteration": 3.3940556049346924 }, { "auxiliary_loss_clip": 0.01155533, "auxiliary_loss_mlp": 0.01067983, "balance_loss_clip": 1.05504358, "balance_loss_mlp": 1.04205465, "epoch": 0.09721929956410642, "flos": 20558212986240.0, "grad_norm": 1.8932449927934136, "language_loss": 0.69031835, "learning_rate": 3.952789669213172e-06, "loss": 0.7125535, "num_input_tokens_seen": 34787680, "step": 1617, "time_per_iteration": 2.714629888534546 }, { "auxiliary_loss_clip": 0.01152637, "auxiliary_loss_mlp": 0.01057882, "balance_loss_clip": 1.05386162, "balance_loss_mlp": 1.03127456, "epoch": 0.09727942281677439, "flos": 27344359825920.0, "grad_norm": 1.755493071880773, "language_loss": 0.80910909, "learning_rate": 3.952705511055698e-06, "loss": 0.83121431, "num_input_tokens_seen": 34808330, "step": 1618, "time_per_iteration": 2.8081507682800293 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01058179, "balance_loss_clip": 1.06048679, "balance_loss_mlp": 1.03678131, "epoch": 0.09733954606944235, "flos": 24900028335360.0, "grad_norm": 1.667659488760432, "language_loss": 0.92901695, "learning_rate": 3.952621278851435e-06, "loss": 0.95128226, "num_input_tokens_seen": 34830020, "step": 1619, "time_per_iteration": 2.7752275466918945 }, { "auxiliary_loss_clip": 0.01175515, "auxiliary_loss_mlp": 0.01058252, "balance_loss_clip": 1.05952573, "balance_loss_mlp": 1.03512526, "epoch": 0.09739966932211033, "flos": 31503928544640.0, "grad_norm": 2.1973967195348902, "language_loss": 0.88978708, "learning_rate": 3.9525369726035784e-06, "loss": 0.91212475, "num_input_tokens_seen": 34850330, "step": 1620, "time_per_iteration": 2.771176338195801 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01065329, "balance_loss_clip": 1.05353975, "balance_loss_mlp": 1.0397464, "epoch": 0.0974597925747783, "flos": 23878764846720.0, "grad_norm": 2.154793183838835, "language_loss": 0.77331412, "learning_rate": 3.952452592315324e-06, "loss": 0.79550499, "num_input_tokens_seen": 34871640, "step": 1621, "time_per_iteration": 2.6740832328796387 }, { "auxiliary_loss_clip": 0.01131342, "auxiliary_loss_mlp": 0.01082359, "balance_loss_clip": 1.04798269, "balance_loss_mlp": 1.05640674, "epoch": 0.09751991582744626, "flos": 17019575700480.0, "grad_norm": 1.9420195171733425, "language_loss": 0.77671158, "learning_rate": 3.952368137989871e-06, "loss": 0.79884863, "num_input_tokens_seen": 34888100, "step": 1622, "time_per_iteration": 2.7247347831726074 }, { "auxiliary_loss_clip": 0.01150185, "auxiliary_loss_mlp": 0.01064277, "balance_loss_clip": 1.05335355, "balance_loss_mlp": 1.04025626, "epoch": 0.09758003908011423, "flos": 28402826826240.0, "grad_norm": 1.8603109065807166, "language_loss": 0.85784447, "learning_rate": 3.9522836096304225e-06, "loss": 0.87998909, "num_input_tokens_seen": 34910485, "step": 1623, "time_per_iteration": 2.785388469696045 }, { "auxiliary_loss_clip": 0.0117659, "auxiliary_loss_mlp": 0.01064102, "balance_loss_clip": 1.05769634, "balance_loss_mlp": 1.04043913, "epoch": 0.09764016233278221, "flos": 18144297336960.0, "grad_norm": 2.39630116599036, "language_loss": 0.80534065, "learning_rate": 3.952199007240184e-06, "loss": 0.82774758, "num_input_tokens_seen": 34928615, "step": 1624, "time_per_iteration": 2.6818184852600098 }, { "auxiliary_loss_clip": 0.01176335, "auxiliary_loss_mlp": 0.01056788, "balance_loss_clip": 1.05616927, "balance_loss_mlp": 1.03465128, "epoch": 0.09770028558545017, "flos": 15265842071040.0, "grad_norm": 2.44379144971104, "language_loss": 0.85556966, "learning_rate": 3.952114330822364e-06, "loss": 0.8779009, "num_input_tokens_seen": 34946045, "step": 1625, "time_per_iteration": 2.6594324111938477 }, { "auxiliary_loss_clip": 0.01181411, "auxiliary_loss_mlp": 0.0106682, "balance_loss_clip": 1.06004012, "balance_loss_mlp": 1.04411101, "epoch": 0.09776040883811814, "flos": 23472435219840.0, "grad_norm": 2.058269503362464, "language_loss": 0.85431635, "learning_rate": 3.952029580380172e-06, "loss": 0.87679869, "num_input_tokens_seen": 34962865, "step": 1626, "time_per_iteration": 2.7384841442108154 }, { "auxiliary_loss_clip": 0.01165311, "auxiliary_loss_mlp": 0.007823, "balance_loss_clip": 1.05467701, "balance_loss_mlp": 1.000211, "epoch": 0.09782053209078612, "flos": 24499480798080.0, "grad_norm": 2.0701580273163036, "language_loss": 0.83370024, "learning_rate": 3.9519447559168234e-06, "loss": 0.85317636, "num_input_tokens_seen": 34983505, "step": 1627, "time_per_iteration": 2.8269948959350586 }, { "auxiliary_loss_clip": 0.01168188, "auxiliary_loss_mlp": 0.01065332, "balance_loss_clip": 1.05557203, "balance_loss_mlp": 1.04275417, "epoch": 0.09788065534345408, "flos": 21580158833280.0, "grad_norm": 1.8143281262319713, "language_loss": 0.84674478, "learning_rate": 3.951859857435534e-06, "loss": 0.86907995, "num_input_tokens_seen": 35001825, "step": 1628, "time_per_iteration": 2.6151821613311768 }, { "auxiliary_loss_clip": 0.01170257, "auxiliary_loss_mlp": 0.01058367, "balance_loss_clip": 1.05374515, "balance_loss_mlp": 1.03558636, "epoch": 0.09794077859612205, "flos": 23842459175040.0, "grad_norm": 1.5658807312485334, "language_loss": 0.75531614, "learning_rate": 3.951774884939523e-06, "loss": 0.77760237, "num_input_tokens_seen": 35023075, "step": 1629, "time_per_iteration": 2.6794557571411133 }, { "auxiliary_loss_clip": 0.01129604, "auxiliary_loss_mlp": 0.01056904, "balance_loss_clip": 1.0577755, "balance_loss_mlp": 1.03169131, "epoch": 0.09800090184879003, "flos": 23659889322240.0, "grad_norm": 1.6755762488260617, "language_loss": 0.78487194, "learning_rate": 3.951689838432013e-06, "loss": 0.80673707, "num_input_tokens_seen": 35043480, "step": 1630, "time_per_iteration": 2.7986228466033936 }, { "auxiliary_loss_clip": 0.01167766, "auxiliary_loss_mlp": 0.01063441, "balance_loss_clip": 1.05938148, "balance_loss_mlp": 1.03804946, "epoch": 0.09806102510145799, "flos": 17055773631360.0, "grad_norm": 1.8175370389297836, "language_loss": 0.86677933, "learning_rate": 3.951604717916228e-06, "loss": 0.88909143, "num_input_tokens_seen": 35061490, "step": 1631, "time_per_iteration": 2.6350157260894775 }, { "auxiliary_loss_clip": 0.01171369, "auxiliary_loss_mlp": 0.01058643, "balance_loss_clip": 1.0610745, "balance_loss_mlp": 1.03625536, "epoch": 0.09812114835412596, "flos": 23878477537920.0, "grad_norm": 2.2030333753544773, "language_loss": 0.82996809, "learning_rate": 3.9515195233953975e-06, "loss": 0.85226822, "num_input_tokens_seen": 35079670, "step": 1632, "time_per_iteration": 2.7990314960479736 }, { "auxiliary_loss_clip": 0.01148453, "auxiliary_loss_mlp": 0.01064004, "balance_loss_clip": 1.05554819, "balance_loss_mlp": 1.04102039, "epoch": 0.09818127160679392, "flos": 20595488325120.0, "grad_norm": 1.531777801288569, "language_loss": 0.7882973, "learning_rate": 3.951434254872751e-06, "loss": 0.81042188, "num_input_tokens_seen": 35099205, "step": 1633, "time_per_iteration": 2.735353708267212 }, { "auxiliary_loss_clip": 0.01170992, "auxiliary_loss_mlp": 0.01061681, "balance_loss_clip": 1.05558002, "balance_loss_mlp": 1.03731489, "epoch": 0.0982413948594619, "flos": 15487339288320.0, "grad_norm": 2.4037572513069687, "language_loss": 0.73209554, "learning_rate": 3.951348912351521e-06, "loss": 0.75442231, "num_input_tokens_seen": 35115270, "step": 1634, "time_per_iteration": 2.688596248626709 }, { "auxiliary_loss_clip": 0.01162743, "auxiliary_loss_mlp": 0.01071164, "balance_loss_clip": 1.05591321, "balance_loss_mlp": 1.04672611, "epoch": 0.09830151811212987, "flos": 24207958016640.0, "grad_norm": 3.2244021303311405, "language_loss": 0.72553629, "learning_rate": 3.951263495834947e-06, "loss": 0.74787533, "num_input_tokens_seen": 35134065, "step": 1635, "time_per_iteration": 2.720266342163086 }, { "auxiliary_loss_clip": 0.01154765, "auxiliary_loss_mlp": 0.01068349, "balance_loss_clip": 1.05526268, "balance_loss_mlp": 1.04177701, "epoch": 0.09836164136479783, "flos": 20594590485120.0, "grad_norm": 1.7699592352066487, "language_loss": 0.78026646, "learning_rate": 3.951178005326264e-06, "loss": 0.80249763, "num_input_tokens_seen": 35154870, "step": 1636, "time_per_iteration": 2.9618239402770996 }, { "auxiliary_loss_clip": 0.01162744, "auxiliary_loss_mlp": 0.01060716, "balance_loss_clip": 1.05561686, "balance_loss_mlp": 1.0368979, "epoch": 0.09842176461746581, "flos": 19934157070080.0, "grad_norm": 1.8332710343018006, "language_loss": 0.69524407, "learning_rate": 3.951092440828715e-06, "loss": 0.71747863, "num_input_tokens_seen": 35171850, "step": 1637, "time_per_iteration": 2.671178102493286 }, { "auxiliary_loss_clip": 0.01188316, "auxiliary_loss_mlp": 0.01058851, "balance_loss_clip": 1.05926394, "balance_loss_mlp": 1.03500926, "epoch": 0.09848188787013377, "flos": 21214659991680.0, "grad_norm": 2.3775286970935503, "language_loss": 0.77050996, "learning_rate": 3.951006802345545e-06, "loss": 0.79298162, "num_input_tokens_seen": 35188795, "step": 1638, "time_per_iteration": 2.62457537651062 }, { "auxiliary_loss_clip": 0.01140265, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.05538166, "balance_loss_mlp": 1.02941203, "epoch": 0.09854201112280174, "flos": 30154226071680.0, "grad_norm": 1.4014263071342075, "language_loss": 0.72620296, "learning_rate": 3.950921089880003e-06, "loss": 0.74812591, "num_input_tokens_seen": 35212100, "step": 1639, "time_per_iteration": 2.7499618530273438 }, { "auxiliary_loss_clip": 0.01173752, "auxiliary_loss_mlp": 0.01051382, "balance_loss_clip": 1.0582087, "balance_loss_mlp": 1.02831531, "epoch": 0.09860213437546972, "flos": 21795730306560.0, "grad_norm": 1.7213189449892274, "language_loss": 0.88679075, "learning_rate": 3.950835303435337e-06, "loss": 0.90904212, "num_input_tokens_seen": 35230390, "step": 1640, "time_per_iteration": 2.664133071899414 }, { "auxiliary_loss_clip": 0.01177786, "auxiliary_loss_mlp": 0.01044457, "balance_loss_clip": 1.05981517, "balance_loss_mlp": 1.02130616, "epoch": 0.09866225762813768, "flos": 21835555511040.0, "grad_norm": 2.0701766566296915, "language_loss": 0.80567038, "learning_rate": 3.950749443014801e-06, "loss": 0.82789278, "num_input_tokens_seen": 35250405, "step": 1641, "time_per_iteration": 2.645353317260742 }, { "auxiliary_loss_clip": 0.011756, "auxiliary_loss_mlp": 0.01062641, "balance_loss_clip": 1.05896795, "balance_loss_mlp": 1.03742838, "epoch": 0.09872238088080565, "flos": 17599855916160.0, "grad_norm": 2.64335263522248, "language_loss": 0.86117625, "learning_rate": 3.95066350862165e-06, "loss": 0.88355863, "num_input_tokens_seen": 35262820, "step": 1642, "time_per_iteration": 5.81004524230957 }, { "auxiliary_loss_clip": 0.01151329, "auxiliary_loss_mlp": 0.01056693, "balance_loss_clip": 1.05857074, "balance_loss_mlp": 1.03404331, "epoch": 0.09878250413347361, "flos": 27636134002560.0, "grad_norm": 2.7092208079201607, "language_loss": 0.8058275, "learning_rate": 3.950577500259144e-06, "loss": 0.82790768, "num_input_tokens_seen": 35284490, "step": 1643, "time_per_iteration": 2.7235090732574463 }, { "auxiliary_loss_clip": 0.01174075, "auxiliary_loss_mlp": 0.01077435, "balance_loss_clip": 1.05761337, "balance_loss_mlp": 1.05470192, "epoch": 0.0988426273861416, "flos": 16544728880640.0, "grad_norm": 2.0561742686210676, "language_loss": 0.82546467, "learning_rate": 3.950491417930543e-06, "loss": 0.84797978, "num_input_tokens_seen": 35302815, "step": 1644, "time_per_iteration": 4.318823575973511 }, { "auxiliary_loss_clip": 0.01163142, "auxiliary_loss_mlp": 0.00782463, "balance_loss_clip": 1.05607629, "balance_loss_mlp": 1.00010633, "epoch": 0.09890275063880956, "flos": 21215270522880.0, "grad_norm": 1.6945489721625269, "language_loss": 0.68219113, "learning_rate": 3.9504052616391124e-06, "loss": 0.70164716, "num_input_tokens_seen": 35321175, "step": 1645, "time_per_iteration": 2.6626670360565186 }, { "auxiliary_loss_clip": 0.01059795, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.02852345, "balance_loss_mlp": 1.04404068, "epoch": 0.09896287389147752, "flos": 59379372910080.0, "grad_norm": 0.8512889940087613, "language_loss": 0.60885167, "learning_rate": 3.950319031388119e-06, "loss": 0.62992585, "num_input_tokens_seen": 35381740, "step": 1646, "time_per_iteration": 4.752669095993042 }, { "auxiliary_loss_clip": 0.01147006, "auxiliary_loss_mlp": 0.0105976, "balance_loss_clip": 1.0574733, "balance_loss_mlp": 1.03464222, "epoch": 0.0990229971441455, "flos": 29642678530560.0, "grad_norm": 5.785751121573768, "language_loss": 0.73211443, "learning_rate": 3.950232727180833e-06, "loss": 0.7541821, "num_input_tokens_seen": 35403760, "step": 1647, "time_per_iteration": 2.783442974090576 }, { "auxiliary_loss_clip": 0.01161789, "auxiliary_loss_mlp": 0.01066314, "balance_loss_clip": 1.06016421, "balance_loss_mlp": 1.04445136, "epoch": 0.09908312039681347, "flos": 21834873152640.0, "grad_norm": 1.828428298130997, "language_loss": 0.84094375, "learning_rate": 3.950146349020525e-06, "loss": 0.86322474, "num_input_tokens_seen": 35424050, "step": 1648, "time_per_iteration": 2.709559679031372 }, { "auxiliary_loss_clip": 0.01065954, "auxiliary_loss_mlp": 0.01020799, "balance_loss_clip": 1.02565169, "balance_loss_mlp": 1.01722264, "epoch": 0.09914324364948143, "flos": 57564304807680.0, "grad_norm": 0.7317434537206132, "language_loss": 0.55672908, "learning_rate": 3.950059896910473e-06, "loss": 0.5775966, "num_input_tokens_seen": 35481690, "step": 1649, "time_per_iteration": 3.0944156646728516 }, { "auxiliary_loss_clip": 0.0117133, "auxiliary_loss_mlp": 0.01049543, "balance_loss_clip": 1.05603158, "balance_loss_mlp": 1.02723897, "epoch": 0.09920336690214941, "flos": 34123934476800.0, "grad_norm": 2.195431109372502, "language_loss": 0.8975327, "learning_rate": 3.949973370853954e-06, "loss": 0.91974139, "num_input_tokens_seen": 35498635, "step": 1650, "time_per_iteration": 2.7438554763793945 }, { "auxiliary_loss_clip": 0.01033978, "auxiliary_loss_mlp": 0.00758727, "balance_loss_clip": 1.02943921, "balance_loss_mlp": 0.9997822, "epoch": 0.09926349015481738, "flos": 71216428464000.0, "grad_norm": 0.8036050505402587, "language_loss": 0.63734978, "learning_rate": 3.94988677085425e-06, "loss": 0.65527683, "num_input_tokens_seen": 35565720, "step": 1651, "time_per_iteration": 3.40269136428833 }, { "auxiliary_loss_clip": 0.01170347, "auxiliary_loss_mlp": 0.01062486, "balance_loss_clip": 1.05790281, "balance_loss_mlp": 1.03842974, "epoch": 0.09932361340748534, "flos": 23148700917120.0, "grad_norm": 1.9744130417114842, "language_loss": 0.88115525, "learning_rate": 3.949800096914643e-06, "loss": 0.90348363, "num_input_tokens_seen": 35586000, "step": 1652, "time_per_iteration": 2.6695117950439453 }, { "auxiliary_loss_clip": 0.0116773, "auxiliary_loss_mlp": 0.01062073, "balance_loss_clip": 1.06095552, "balance_loss_mlp": 1.03895831, "epoch": 0.09938373666015332, "flos": 19828651847040.0, "grad_norm": 2.166773052437996, "language_loss": 0.81789082, "learning_rate": 3.949713349038422e-06, "loss": 0.84018886, "num_input_tokens_seen": 35604355, "step": 1653, "time_per_iteration": 2.7136831283569336 }, { "auxiliary_loss_clip": 0.01173152, "auxiliary_loss_mlp": 0.00780466, "balance_loss_clip": 1.05683279, "balance_loss_mlp": 1.00016594, "epoch": 0.09944385991282129, "flos": 22090664880000.0, "grad_norm": 1.662037391605293, "language_loss": 0.79489207, "learning_rate": 3.949626527228875e-06, "loss": 0.81442821, "num_input_tokens_seen": 35625495, "step": 1654, "time_per_iteration": 2.645875930786133 }, { "auxiliary_loss_clip": 0.01187918, "auxiliary_loss_mlp": 0.01056849, "balance_loss_clip": 1.06405056, "balance_loss_mlp": 1.03561759, "epoch": 0.09950398316548925, "flos": 19828867328640.0, "grad_norm": 1.7263610037420916, "language_loss": 0.81038272, "learning_rate": 3.949539631489295e-06, "loss": 0.83283037, "num_input_tokens_seen": 35645030, "step": 1655, "time_per_iteration": 2.630404233932495 }, { "auxiliary_loss_clip": 0.01181205, "auxiliary_loss_mlp": 0.01055977, "balance_loss_clip": 1.05679035, "balance_loss_mlp": 1.03294599, "epoch": 0.09956410641815722, "flos": 25003701964800.0, "grad_norm": 2.426795421082641, "language_loss": 0.80429518, "learning_rate": 3.9494526618229765e-06, "loss": 0.82666701, "num_input_tokens_seen": 35664305, "step": 1656, "time_per_iteration": 2.6283950805664062 }, { "auxiliary_loss_clip": 0.01170003, "auxiliary_loss_mlp": 0.01061881, "balance_loss_clip": 1.05787742, "balance_loss_mlp": 1.03870714, "epoch": 0.0996242296708252, "flos": 19317714837120.0, "grad_norm": 1.4960238412267362, "language_loss": 0.89040691, "learning_rate": 3.949365618233217e-06, "loss": 0.91272575, "num_input_tokens_seen": 35684060, "step": 1657, "time_per_iteration": 2.653674602508545 }, { "auxiliary_loss_clip": 0.01165842, "auxiliary_loss_mlp": 0.01057352, "balance_loss_clip": 1.05830753, "balance_loss_mlp": 1.0329144, "epoch": 0.09968435292349316, "flos": 21871609787520.0, "grad_norm": 2.1866084372248062, "language_loss": 0.84684521, "learning_rate": 3.9492785007233195e-06, "loss": 0.86907715, "num_input_tokens_seen": 35703250, "step": 1658, "time_per_iteration": 2.6897473335266113 }, { "auxiliary_loss_clip": 0.01069806, "auxiliary_loss_mlp": 0.01015844, "balance_loss_clip": 1.02042234, "balance_loss_mlp": 1.01292348, "epoch": 0.09974447617616113, "flos": 65384533313280.0, "grad_norm": 0.9123227767672076, "language_loss": 0.60828507, "learning_rate": 3.949191309296585e-06, "loss": 0.62914157, "num_input_tokens_seen": 35762165, "step": 1659, "time_per_iteration": 3.273890495300293 }, { "auxiliary_loss_clip": 0.01152432, "auxiliary_loss_mlp": 0.01051829, "balance_loss_clip": 1.05082798, "balance_loss_mlp": 1.02814245, "epoch": 0.0998045994288291, "flos": 23659817495040.0, "grad_norm": 1.9344290476513741, "language_loss": 0.84892076, "learning_rate": 3.949104043956321e-06, "loss": 0.87096334, "num_input_tokens_seen": 35781520, "step": 1660, "time_per_iteration": 2.788018226623535 }, { "auxiliary_loss_clip": 0.01149163, "auxiliary_loss_mlp": 0.01060092, "balance_loss_clip": 1.05374026, "balance_loss_mlp": 1.03514171, "epoch": 0.09986472268149707, "flos": 19609704495360.0, "grad_norm": 1.9493882663610318, "language_loss": 0.80024737, "learning_rate": 3.949016704705836e-06, "loss": 0.82234001, "num_input_tokens_seen": 35799565, "step": 1661, "time_per_iteration": 2.6537399291992188 }, { "auxiliary_loss_clip": 0.01172787, "auxiliary_loss_mlp": 0.01055532, "balance_loss_clip": 1.05715156, "balance_loss_mlp": 1.03153503, "epoch": 0.09992484593416504, "flos": 26213317395840.0, "grad_norm": 2.0152235709188377, "language_loss": 0.83560598, "learning_rate": 3.948929291548443e-06, "loss": 0.85788912, "num_input_tokens_seen": 35821085, "step": 1662, "time_per_iteration": 2.753807783126831 }, { "auxiliary_loss_clip": 0.01154838, "auxiliary_loss_mlp": 0.01061466, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03616929, "epoch": 0.09998496918683301, "flos": 17493632421120.0, "grad_norm": 1.9355779644050557, "language_loss": 0.88865256, "learning_rate": 3.9488418044874546e-06, "loss": 0.91081554, "num_input_tokens_seen": 35839840, "step": 1663, "time_per_iteration": 2.6829047203063965 }, { "auxiliary_loss_clip": 0.0118246, "auxiliary_loss_mlp": 0.01061692, "balance_loss_clip": 1.06228638, "balance_loss_mlp": 1.03825521, "epoch": 0.10004509243950098, "flos": 22784925928320.0, "grad_norm": 1.7925330820671084, "language_loss": 0.70140731, "learning_rate": 3.948754243526191e-06, "loss": 0.72384882, "num_input_tokens_seen": 35861545, "step": 1664, "time_per_iteration": 2.809300184249878 }, { "auxiliary_loss_clip": 0.01142878, "auxiliary_loss_mlp": 0.01055306, "balance_loss_clip": 1.05475903, "balance_loss_mlp": 1.03312087, "epoch": 0.10010521569216894, "flos": 16253385667200.0, "grad_norm": 2.4978474602303895, "language_loss": 0.78981555, "learning_rate": 3.94866660866797e-06, "loss": 0.81179744, "num_input_tokens_seen": 35878295, "step": 1665, "time_per_iteration": 2.7010488510131836 }, { "auxiliary_loss_clip": 0.01175861, "auxiliary_loss_mlp": 0.01070341, "balance_loss_clip": 1.06286561, "balance_loss_mlp": 1.04742861, "epoch": 0.10016533894483691, "flos": 23402589223680.0, "grad_norm": 3.1438625724360945, "language_loss": 0.70054829, "learning_rate": 3.9485788999161165e-06, "loss": 0.7230103, "num_input_tokens_seen": 35898990, "step": 1666, "time_per_iteration": 2.689879894256592 }, { "auxiliary_loss_clip": 0.01110848, "auxiliary_loss_mlp": 0.01074593, "balance_loss_clip": 1.05082703, "balance_loss_mlp": 1.04946339, "epoch": 0.10022546219750489, "flos": 19354164163200.0, "grad_norm": 1.7583449522195267, "language_loss": 0.78647351, "learning_rate": 3.948491117273956e-06, "loss": 0.80832791, "num_input_tokens_seen": 35916225, "step": 1667, "time_per_iteration": 2.8973352909088135 }, { "auxiliary_loss_clip": 0.01153352, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.05452693, "balance_loss_mlp": 1.03752255, "epoch": 0.10028558545017285, "flos": 27085766837760.0, "grad_norm": 2.4011089045072187, "language_loss": 0.77357388, "learning_rate": 3.948403260744817e-06, "loss": 0.7957356, "num_input_tokens_seen": 35934630, "step": 1668, "time_per_iteration": 3.2600321769714355 }, { "auxiliary_loss_clip": 0.01184879, "auxiliary_loss_mlp": 0.01059367, "balance_loss_clip": 1.05833495, "balance_loss_mlp": 1.03523922, "epoch": 0.10034570870284082, "flos": 25847136195840.0, "grad_norm": 1.7407668002390366, "language_loss": 0.77520061, "learning_rate": 3.948315330332031e-06, "loss": 0.79764307, "num_input_tokens_seen": 35953855, "step": 1669, "time_per_iteration": 2.6899471282958984 }, { "auxiliary_loss_clip": 0.0118887, "auxiliary_loss_mlp": 0.01067842, "balance_loss_clip": 1.05948365, "balance_loss_mlp": 1.04416728, "epoch": 0.1004058319555088, "flos": 26249587153920.0, "grad_norm": 5.441134829238958, "language_loss": 0.85160148, "learning_rate": 3.948227326038933e-06, "loss": 0.87416857, "num_input_tokens_seen": 35974555, "step": 1670, "time_per_iteration": 2.616867780685425 }, { "auxiliary_loss_clip": 0.011763, "auxiliary_loss_mlp": 0.01055607, "balance_loss_clip": 1.05584121, "balance_loss_mlp": 1.03354108, "epoch": 0.10046595520817676, "flos": 25374480105600.0, "grad_norm": 1.4849262119454174, "language_loss": 0.76836258, "learning_rate": 3.9481392478688586e-06, "loss": 0.79068166, "num_input_tokens_seen": 35996830, "step": 1671, "time_per_iteration": 2.658254384994507 }, { "auxiliary_loss_clip": 0.01061447, "auxiliary_loss_mlp": 0.01017561, "balance_loss_clip": 1.02178144, "balance_loss_mlp": 1.01454473, "epoch": 0.10052607846084473, "flos": 67461821677440.0, "grad_norm": 0.7781454358921105, "language_loss": 0.60718858, "learning_rate": 3.948051095825149e-06, "loss": 0.62797856, "num_input_tokens_seen": 36054465, "step": 1672, "time_per_iteration": 3.1269097328186035 }, { "auxiliary_loss_clip": 0.01143177, "auxiliary_loss_mlp": 0.01063346, "balance_loss_clip": 1.05112922, "balance_loss_mlp": 1.04055333, "epoch": 0.10058620171351271, "flos": 21360493209600.0, "grad_norm": 2.433278134910662, "language_loss": 0.7711426, "learning_rate": 3.947962869911147e-06, "loss": 0.79320776, "num_input_tokens_seen": 36073480, "step": 1673, "time_per_iteration": 2.6931638717651367 }, { "auxiliary_loss_clip": 0.01132094, "auxiliary_loss_mlp": 0.01056611, "balance_loss_clip": 1.04989302, "balance_loss_mlp": 1.03262639, "epoch": 0.10064632496618067, "flos": 16800125558400.0, "grad_norm": 2.074683072839241, "language_loss": 0.73173523, "learning_rate": 3.947874570130197e-06, "loss": 0.75362229, "num_input_tokens_seen": 36091830, "step": 1674, "time_per_iteration": 2.7188127040863037 }, { "auxiliary_loss_clip": 0.01172389, "auxiliary_loss_mlp": 0.00779533, "balance_loss_clip": 1.0556165, "balance_loss_mlp": 1.00024796, "epoch": 0.10070644821884864, "flos": 23624445576960.0, "grad_norm": 2.1982379565146872, "language_loss": 0.79456973, "learning_rate": 3.947786196485649e-06, "loss": 0.81408894, "num_input_tokens_seen": 36111400, "step": 1675, "time_per_iteration": 2.712090253829956 }, { "auxiliary_loss_clip": 0.01182659, "auxiliary_loss_mlp": 0.01063327, "balance_loss_clip": 1.05801332, "balance_loss_mlp": 1.04239404, "epoch": 0.1007665714715166, "flos": 24462564595200.0, "grad_norm": 2.408955682155161, "language_loss": 0.8120935, "learning_rate": 3.947697748980853e-06, "loss": 0.83455336, "num_input_tokens_seen": 36129345, "step": 1676, "time_per_iteration": 2.685472249984741 }, { "auxiliary_loss_clip": 0.01175397, "auxiliary_loss_mlp": 0.01057105, "balance_loss_clip": 1.05950332, "balance_loss_mlp": 1.03546858, "epoch": 0.10082669472418458, "flos": 16799119977600.0, "grad_norm": 2.008035557658629, "language_loss": 0.86132157, "learning_rate": 3.947609227619163e-06, "loss": 0.88364655, "num_input_tokens_seen": 36146255, "step": 1677, "time_per_iteration": 2.6589157581329346 }, { "auxiliary_loss_clip": 0.01162997, "auxiliary_loss_mlp": 0.010508, "balance_loss_clip": 1.05363441, "balance_loss_mlp": 1.02896047, "epoch": 0.10088681797685255, "flos": 13553513844480.0, "grad_norm": 2.160847391025828, "language_loss": 0.86006588, "learning_rate": 3.947520632403936e-06, "loss": 0.88220382, "num_input_tokens_seen": 36164050, "step": 1678, "time_per_iteration": 2.694347858428955 }, { "auxiliary_loss_clip": 0.0116292, "auxiliary_loss_mlp": 0.01056376, "balance_loss_clip": 1.0587275, "balance_loss_mlp": 1.03406048, "epoch": 0.10094694122952051, "flos": 25265706744960.0, "grad_norm": 12.700254532531051, "language_loss": 0.89978886, "learning_rate": 3.947431963338532e-06, "loss": 0.92198181, "num_input_tokens_seen": 36183530, "step": 1679, "time_per_iteration": 2.6741397380828857 }, { "auxiliary_loss_clip": 0.01071086, "auxiliary_loss_mlp": 0.0101685, "balance_loss_clip": 1.02328789, "balance_loss_mlp": 1.01360798, "epoch": 0.10100706448218849, "flos": 69854299885440.0, "grad_norm": 0.7882499243548835, "language_loss": 0.52985126, "learning_rate": 3.947343220426312e-06, "loss": 0.55073065, "num_input_tokens_seen": 36248550, "step": 1680, "time_per_iteration": 3.169893503189087 }, { "auxiliary_loss_clip": 0.01185252, "auxiliary_loss_mlp": 0.00779951, "balance_loss_clip": 1.06022644, "balance_loss_mlp": 1.00017488, "epoch": 0.10106718773485646, "flos": 20007163463040.0, "grad_norm": 1.6642182724084642, "language_loss": 0.76869059, "learning_rate": 3.947254403670641e-06, "loss": 0.7883426, "num_input_tokens_seen": 36266065, "step": 1681, "time_per_iteration": 4.146950006484985 }, { "auxiliary_loss_clip": 0.01156046, "auxiliary_loss_mlp": 0.01059972, "balance_loss_clip": 1.0539515, "balance_loss_mlp": 1.03469992, "epoch": 0.10112731098752442, "flos": 13479825093120.0, "grad_norm": 2.3884003317971225, "language_loss": 0.93957508, "learning_rate": 3.947165513074889e-06, "loss": 0.96173531, "num_input_tokens_seen": 36280960, "step": 1682, "time_per_iteration": 4.220505237579346 }, { "auxiliary_loss_clip": 0.01173183, "auxiliary_loss_mlp": 0.01053261, "balance_loss_clip": 1.05487084, "balance_loss_mlp": 1.03133821, "epoch": 0.1011874342401924, "flos": 18515901490560.0, "grad_norm": 3.5300660189263917, "language_loss": 0.87618893, "learning_rate": 3.947076548642425e-06, "loss": 0.89845335, "num_input_tokens_seen": 36299010, "step": 1683, "time_per_iteration": 2.635636329650879 }, { "auxiliary_loss_clip": 0.01128888, "auxiliary_loss_mlp": 0.01063089, "balance_loss_clip": 1.04814756, "balance_loss_mlp": 1.04008126, "epoch": 0.10124755749286037, "flos": 20702861055360.0, "grad_norm": 2.3337760241024923, "language_loss": 0.74566805, "learning_rate": 3.946987510376624e-06, "loss": 0.76758784, "num_input_tokens_seen": 36318400, "step": 1684, "time_per_iteration": 4.417364835739136 }, { "auxiliary_loss_clip": 0.01053031, "auxiliary_loss_mlp": 0.0101182, "balance_loss_clip": 1.02547038, "balance_loss_mlp": 1.00853014, "epoch": 0.10130768074552833, "flos": 56109456247680.0, "grad_norm": 0.7564631726021327, "language_loss": 0.61085057, "learning_rate": 3.9468983982808615e-06, "loss": 0.63149905, "num_input_tokens_seen": 36381815, "step": 1685, "time_per_iteration": 4.87179970741272 }, { "auxiliary_loss_clip": 0.01157045, "auxiliary_loss_mlp": 0.01056064, "balance_loss_clip": 1.05233479, "balance_loss_mlp": 1.0341655, "epoch": 0.1013678039981963, "flos": 33402346156800.0, "grad_norm": 4.297801792672815, "language_loss": 0.61381406, "learning_rate": 3.946809212358516e-06, "loss": 0.6359452, "num_input_tokens_seen": 36404320, "step": 1686, "time_per_iteration": 2.8289108276367188 }, { "auxiliary_loss_clip": 0.01144631, "auxiliary_loss_mlp": 0.01059888, "balance_loss_clip": 1.05645001, "balance_loss_mlp": 1.03678524, "epoch": 0.10142792725086427, "flos": 31905338008320.0, "grad_norm": 2.21923850158845, "language_loss": 0.81216162, "learning_rate": 3.946719952612972e-06, "loss": 0.83420682, "num_input_tokens_seen": 36427510, "step": 1687, "time_per_iteration": 2.947535276412964 }, { "auxiliary_loss_clip": 0.0117612, "auxiliary_loss_mlp": 0.0105614, "balance_loss_clip": 1.05933213, "balance_loss_mlp": 1.03403926, "epoch": 0.10148805050353224, "flos": 28475905046400.0, "grad_norm": 1.7955898786084035, "language_loss": 0.71943259, "learning_rate": 3.94663061904761e-06, "loss": 0.74175525, "num_input_tokens_seen": 36448230, "step": 1688, "time_per_iteration": 2.693249225616455 }, { "auxiliary_loss_clip": 0.01151953, "auxiliary_loss_mlp": 0.01063362, "balance_loss_clip": 1.05288756, "balance_loss_mlp": 1.04079556, "epoch": 0.1015481737562002, "flos": 25148888737920.0, "grad_norm": 2.636795901714516, "language_loss": 0.86876953, "learning_rate": 3.94654121166582e-06, "loss": 0.89092261, "num_input_tokens_seen": 36464395, "step": 1689, "time_per_iteration": 2.677992820739746 }, { "auxiliary_loss_clip": 0.01172188, "auxiliary_loss_mlp": 0.01057982, "balance_loss_clip": 1.05476904, "balance_loss_mlp": 1.0378834, "epoch": 0.10160829700886818, "flos": 30882781630080.0, "grad_norm": 2.2211105929909696, "language_loss": 0.88170946, "learning_rate": 3.946451730470993e-06, "loss": 0.90401113, "num_input_tokens_seen": 36486475, "step": 1690, "time_per_iteration": 2.707209348678589 }, { "auxiliary_loss_clip": 0.01158767, "auxiliary_loss_mlp": 0.01052386, "balance_loss_clip": 1.05507553, "balance_loss_mlp": 1.02973664, "epoch": 0.10166842026153615, "flos": 20412020632320.0, "grad_norm": 2.08291471600754, "language_loss": 0.83348423, "learning_rate": 3.946362175466521e-06, "loss": 0.85559577, "num_input_tokens_seen": 36505310, "step": 1691, "time_per_iteration": 2.6521170139312744 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.01051716, "balance_loss_clip": 1.05550599, "balance_loss_mlp": 1.03016281, "epoch": 0.10172854351420411, "flos": 33476968661760.0, "grad_norm": 1.704519528530946, "language_loss": 0.66773653, "learning_rate": 3.946272546655801e-06, "loss": 0.68987525, "num_input_tokens_seen": 36529820, "step": 1692, "time_per_iteration": 2.799353837966919 }, { "auxiliary_loss_clip": 0.01144502, "auxiliary_loss_mlp": 0.0107473, "balance_loss_clip": 1.05057836, "balance_loss_mlp": 1.05258095, "epoch": 0.1017886667668721, "flos": 23550325862400.0, "grad_norm": 1.8345924563029705, "language_loss": 0.75939322, "learning_rate": 3.94618284404223e-06, "loss": 0.78158557, "num_input_tokens_seen": 36549000, "step": 1693, "time_per_iteration": 2.6711113452911377 }, { "auxiliary_loss_clip": 0.01132621, "auxiliary_loss_mlp": 0.01057162, "balance_loss_clip": 1.04893303, "balance_loss_mlp": 1.03289056, "epoch": 0.10184879001954006, "flos": 23296078419840.0, "grad_norm": 1.7027745569702395, "language_loss": 0.87503564, "learning_rate": 3.9460930676292105e-06, "loss": 0.89693356, "num_input_tokens_seen": 36567515, "step": 1694, "time_per_iteration": 2.749119520187378 }, { "auxiliary_loss_clip": 0.01130673, "auxiliary_loss_mlp": 0.01058451, "balance_loss_clip": 1.04954553, "balance_loss_mlp": 1.033095, "epoch": 0.10190891327220802, "flos": 18333116156160.0, "grad_norm": 1.7649462193878245, "language_loss": 0.79299057, "learning_rate": 3.946003217420147e-06, "loss": 0.8148818, "num_input_tokens_seen": 36586190, "step": 1695, "time_per_iteration": 2.839081048965454 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.01061103, "balance_loss_clip": 1.04818296, "balance_loss_mlp": 1.03772628, "epoch": 0.10196903652487599, "flos": 26465374108800.0, "grad_norm": 2.7190993931598446, "language_loss": 0.86494684, "learning_rate": 3.945913293418447e-06, "loss": 0.88683105, "num_input_tokens_seen": 36607495, "step": 1696, "time_per_iteration": 2.7802348136901855 }, { "auxiliary_loss_clip": 0.01168675, "auxiliary_loss_mlp": 0.01054661, "balance_loss_clip": 1.05711746, "balance_loss_mlp": 1.03315568, "epoch": 0.10202915977754397, "flos": 21869526798720.0, "grad_norm": 1.7889048836535288, "language_loss": 0.82350796, "learning_rate": 3.945823295627519e-06, "loss": 0.84574133, "num_input_tokens_seen": 36628555, "step": 1697, "time_per_iteration": 2.667962074279785 }, { "auxiliary_loss_clip": 0.01184333, "auxiliary_loss_mlp": 0.01055548, "balance_loss_clip": 1.05680871, "balance_loss_mlp": 1.033149, "epoch": 0.10208928303021193, "flos": 22309755886080.0, "grad_norm": 2.0464291543972006, "language_loss": 0.81198204, "learning_rate": 3.9457332240507775e-06, "loss": 0.83438087, "num_input_tokens_seen": 36646250, "step": 1698, "time_per_iteration": 2.6484432220458984 }, { "auxiliary_loss_clip": 0.01150498, "auxiliary_loss_mlp": 0.01053546, "balance_loss_clip": 1.05696845, "balance_loss_mlp": 1.03226686, "epoch": 0.1021494062828799, "flos": 22125569921280.0, "grad_norm": 2.3020250981163226, "language_loss": 0.75612724, "learning_rate": 3.945643078691637e-06, "loss": 0.77816761, "num_input_tokens_seen": 36666675, "step": 1699, "time_per_iteration": 2.8040614128112793 }, { "auxiliary_loss_clip": 0.01162088, "auxiliary_loss_mlp": 0.01050379, "balance_loss_clip": 1.06041551, "balance_loss_mlp": 1.02827764, "epoch": 0.10220952953554788, "flos": 19646728439040.0, "grad_norm": 1.6839869206777538, "language_loss": 0.80395639, "learning_rate": 3.945552859553516e-06, "loss": 0.8260811, "num_input_tokens_seen": 36685225, "step": 1700, "time_per_iteration": 2.6701290607452393 }, { "auxiliary_loss_clip": 0.0117076, "auxiliary_loss_mlp": 0.0104804, "balance_loss_clip": 1.05714083, "balance_loss_mlp": 1.02653444, "epoch": 0.10226965278821584, "flos": 29787290686080.0, "grad_norm": 2.102621975458346, "language_loss": 0.76877582, "learning_rate": 3.945462566639836e-06, "loss": 0.79096377, "num_input_tokens_seen": 36705985, "step": 1701, "time_per_iteration": 2.748201847076416 }, { "auxiliary_loss_clip": 0.01182259, "auxiliary_loss_mlp": 0.01050364, "balance_loss_clip": 1.06157088, "balance_loss_mlp": 1.02852523, "epoch": 0.10232977604088381, "flos": 27016818681600.0, "grad_norm": 2.1099726588763965, "language_loss": 0.77922845, "learning_rate": 3.945372199954019e-06, "loss": 0.80155474, "num_input_tokens_seen": 36725815, "step": 1702, "time_per_iteration": 2.6703274250030518 }, { "auxiliary_loss_clip": 0.01156323, "auxiliary_loss_mlp": 0.01052524, "balance_loss_clip": 1.05596721, "balance_loss_mlp": 1.03126872, "epoch": 0.10238989929355179, "flos": 20777519473920.0, "grad_norm": 2.2326457826946293, "language_loss": 0.94093609, "learning_rate": 3.945281759499494e-06, "loss": 0.96302462, "num_input_tokens_seen": 36742345, "step": 1703, "time_per_iteration": 2.6712698936462402 }, { "auxiliary_loss_clip": 0.01034483, "auxiliary_loss_mlp": 0.01037784, "balance_loss_clip": 1.02765131, "balance_loss_mlp": 1.03315914, "epoch": 0.10245002254621975, "flos": 57698322451200.0, "grad_norm": 0.8815387598011586, "language_loss": 0.55096036, "learning_rate": 3.94519124527969e-06, "loss": 0.57168299, "num_input_tokens_seen": 36798775, "step": 1704, "time_per_iteration": 3.2863855361938477 }, { "auxiliary_loss_clip": 0.01186822, "auxiliary_loss_mlp": 0.01053701, "balance_loss_clip": 1.06026638, "balance_loss_mlp": 1.03088403, "epoch": 0.10251014579888772, "flos": 16800125558400.0, "grad_norm": 2.051901555713709, "language_loss": 0.84025991, "learning_rate": 3.945100657298039e-06, "loss": 0.86266518, "num_input_tokens_seen": 36816295, "step": 1705, "time_per_iteration": 2.8991851806640625 }, { "auxiliary_loss_clip": 0.01045354, "auxiliary_loss_mlp": 0.01018361, "balance_loss_clip": 1.02622223, "balance_loss_mlp": 1.01526153, "epoch": 0.1025702690515557, "flos": 68565500922240.0, "grad_norm": 0.7692746082941451, "language_loss": 0.60408181, "learning_rate": 3.9450099955579765e-06, "loss": 0.62471896, "num_input_tokens_seen": 36882030, "step": 1706, "time_per_iteration": 3.2174558639526367 }, { "auxiliary_loss_clip": 0.01149922, "auxiliary_loss_mlp": 0.01051211, "balance_loss_clip": 1.05388391, "balance_loss_mlp": 1.02812052, "epoch": 0.10263039230422366, "flos": 14866623336960.0, "grad_norm": 2.201796189576969, "language_loss": 0.85937822, "learning_rate": 3.94491926006294e-06, "loss": 0.88138962, "num_input_tokens_seen": 36899245, "step": 1707, "time_per_iteration": 2.689208507537842 }, { "auxiliary_loss_clip": 0.01169165, "auxiliary_loss_mlp": 0.0105297, "balance_loss_clip": 1.05941081, "balance_loss_mlp": 1.03114319, "epoch": 0.10269051555689163, "flos": 25337599816320.0, "grad_norm": 1.471109036018689, "language_loss": 0.73299325, "learning_rate": 3.944828450816369e-06, "loss": 0.75521457, "num_input_tokens_seen": 36920950, "step": 1708, "time_per_iteration": 2.679760456085205 }, { "auxiliary_loss_clip": 0.01155833, "auxiliary_loss_mlp": 0.00780571, "balance_loss_clip": 1.05718231, "balance_loss_mlp": 1.00042295, "epoch": 0.10275063880955959, "flos": 21068826773760.0, "grad_norm": 1.7051644476897239, "language_loss": 0.91616452, "learning_rate": 3.944737567821709e-06, "loss": 0.93552846, "num_input_tokens_seen": 36938900, "step": 1709, "time_per_iteration": 2.6754679679870605 }, { "auxiliary_loss_clip": 0.01124911, "auxiliary_loss_mlp": 0.01057008, "balance_loss_clip": 1.05144072, "balance_loss_mlp": 1.0343945, "epoch": 0.10281076206222757, "flos": 30366780802560.0, "grad_norm": 2.1056252966717275, "language_loss": 0.88004494, "learning_rate": 3.944646611082406e-06, "loss": 0.90186411, "num_input_tokens_seen": 36957010, "step": 1710, "time_per_iteration": 2.708723306655884 }, { "auxiliary_loss_clip": 0.01171004, "auxiliary_loss_mlp": 0.0105967, "balance_loss_clip": 1.05658317, "balance_loss_mlp": 1.036973, "epoch": 0.10287088531489554, "flos": 22418313765120.0, "grad_norm": 1.7046493271202992, "language_loss": 0.79370153, "learning_rate": 3.944555580601908e-06, "loss": 0.81600821, "num_input_tokens_seen": 36977690, "step": 1711, "time_per_iteration": 2.631908416748047 }, { "auxiliary_loss_clip": 0.01156003, "auxiliary_loss_mlp": 0.01055126, "balance_loss_clip": 1.05841637, "balance_loss_mlp": 1.03189242, "epoch": 0.1029310085675635, "flos": 25115994858240.0, "grad_norm": 3.2168061349371135, "language_loss": 0.73666596, "learning_rate": 3.944464476383668e-06, "loss": 0.75877726, "num_input_tokens_seen": 36997300, "step": 1712, "time_per_iteration": 2.7107467651367188 }, { "auxiliary_loss_clip": 0.01133407, "auxiliary_loss_mlp": 0.01056055, "balance_loss_clip": 1.05496907, "balance_loss_mlp": 1.03334546, "epoch": 0.10299113182023148, "flos": 19865639877120.0, "grad_norm": 1.974447377126898, "language_loss": 0.87049067, "learning_rate": 3.94437329843114e-06, "loss": 0.89238536, "num_input_tokens_seen": 37016110, "step": 1713, "time_per_iteration": 2.6532411575317383 }, { "auxiliary_loss_clip": 0.0116832, "auxiliary_loss_mlp": 0.01060237, "balance_loss_clip": 1.05669498, "balance_loss_mlp": 1.03877962, "epoch": 0.10305125507289944, "flos": 20447608032000.0, "grad_norm": 1.57388574383124, "language_loss": 0.72406238, "learning_rate": 3.944282046747782e-06, "loss": 0.74634796, "num_input_tokens_seen": 37036405, "step": 1714, "time_per_iteration": 2.5987610816955566 }, { "auxiliary_loss_clip": 0.01174482, "auxiliary_loss_mlp": 0.01063165, "balance_loss_clip": 1.05715692, "balance_loss_mlp": 1.03934693, "epoch": 0.10311137832556741, "flos": 26250772302720.0, "grad_norm": 2.1959530175190434, "language_loss": 0.91065919, "learning_rate": 3.944190721337053e-06, "loss": 0.93303567, "num_input_tokens_seen": 37057580, "step": 1715, "time_per_iteration": 2.743833303451538 }, { "auxiliary_loss_clip": 0.01170297, "auxiliary_loss_mlp": 0.01054891, "balance_loss_clip": 1.05448914, "balance_loss_mlp": 1.03305221, "epoch": 0.10317150157823539, "flos": 35298932175360.0, "grad_norm": 1.8741123562687005, "language_loss": 0.75969976, "learning_rate": 3.944099322202418e-06, "loss": 0.78195167, "num_input_tokens_seen": 37079120, "step": 1716, "time_per_iteration": 2.748903274536133 }, { "auxiliary_loss_clip": 0.01162664, "auxiliary_loss_mlp": 0.01061895, "balance_loss_clip": 1.05617428, "balance_loss_mlp": 1.03804111, "epoch": 0.10323162483090335, "flos": 25739943033600.0, "grad_norm": 3.178190042364093, "language_loss": 0.85308528, "learning_rate": 3.944007849347342e-06, "loss": 0.87533092, "num_input_tokens_seen": 37099710, "step": 1717, "time_per_iteration": 2.690772533416748 }, { "auxiliary_loss_clip": 0.01127019, "auxiliary_loss_mlp": 0.01067935, "balance_loss_clip": 1.05048633, "balance_loss_mlp": 1.04436755, "epoch": 0.10329174808357132, "flos": 16289870906880.0, "grad_norm": 1.8474438265561113, "language_loss": 0.82945001, "learning_rate": 3.943916302775292e-06, "loss": 0.85139954, "num_input_tokens_seen": 37117775, "step": 1718, "time_per_iteration": 2.7029476165771484 }, { "auxiliary_loss_clip": 0.01171184, "auxiliary_loss_mlp": 0.01049869, "balance_loss_clip": 1.05912328, "balance_loss_mlp": 1.02701616, "epoch": 0.10335187133623928, "flos": 36687166963200.0, "grad_norm": 1.7728224248964342, "language_loss": 0.73396438, "learning_rate": 3.943824682489742e-06, "loss": 0.75617492, "num_input_tokens_seen": 37140280, "step": 1719, "time_per_iteration": 2.7653820514678955 }, { "auxiliary_loss_clip": 0.01168859, "auxiliary_loss_mlp": 0.01048444, "balance_loss_clip": 1.05861163, "balance_loss_mlp": 1.02786827, "epoch": 0.10341199458890726, "flos": 14975648092800.0, "grad_norm": 1.7819459058763836, "language_loss": 0.92692196, "learning_rate": 3.9437329884941665e-06, "loss": 0.94909501, "num_input_tokens_seen": 37158350, "step": 1720, "time_per_iteration": 4.1962480545043945 }, { "auxiliary_loss_clip": 0.01139894, "auxiliary_loss_mlp": 0.01051033, "balance_loss_clip": 1.05092323, "balance_loss_mlp": 1.02827597, "epoch": 0.10347211784157523, "flos": 21031587348480.0, "grad_norm": 1.6861044154168399, "language_loss": 0.79497123, "learning_rate": 3.943641220792039e-06, "loss": 0.81688046, "num_input_tokens_seen": 37177120, "step": 1721, "time_per_iteration": 4.524151802062988 }, { "auxiliary_loss_clip": 0.01130482, "auxiliary_loss_mlp": 0.01067754, "balance_loss_clip": 1.05380797, "balance_loss_mlp": 1.04109859, "epoch": 0.1035322410942432, "flos": 19792094780160.0, "grad_norm": 1.951940775381607, "language_loss": 0.80707669, "learning_rate": 3.9435493793868434e-06, "loss": 0.829059, "num_input_tokens_seen": 37195895, "step": 1722, "time_per_iteration": 2.7972562313079834 }, { "auxiliary_loss_clip": 0.01059018, "auxiliary_loss_mlp": 0.01038991, "balance_loss_clip": 1.02668202, "balance_loss_mlp": 1.03536737, "epoch": 0.10359236434691117, "flos": 52698874947840.0, "grad_norm": 0.9413879826908518, "language_loss": 0.67161834, "learning_rate": 3.943457464282059e-06, "loss": 0.69259846, "num_input_tokens_seen": 37247270, "step": 1723, "time_per_iteration": 4.899553060531616 }, { "auxiliary_loss_clip": 0.01169875, "auxiliary_loss_mlp": 0.01062977, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.04193664, "epoch": 0.10365248759957914, "flos": 18405404277120.0, "grad_norm": 2.8641520576523116, "language_loss": 0.77715755, "learning_rate": 3.9433654754811745e-06, "loss": 0.7994861, "num_input_tokens_seen": 37265595, "step": 1724, "time_per_iteration": 2.7613437175750732 }, { "auxiliary_loss_clip": 0.01151829, "auxiliary_loss_mlp": 0.01069246, "balance_loss_clip": 1.05667496, "balance_loss_mlp": 1.04753852, "epoch": 0.1037126108522471, "flos": 47553555335040.0, "grad_norm": 2.6433978354033543, "language_loss": 0.74533165, "learning_rate": 3.943273412987676e-06, "loss": 0.76754242, "num_input_tokens_seen": 37286660, "step": 1725, "time_per_iteration": 4.557274580001831 }, { "auxiliary_loss_clip": 0.01137065, "auxiliary_loss_mlp": 0.01081067, "balance_loss_clip": 1.05264461, "balance_loss_mlp": 1.05832207, "epoch": 0.10377273410491508, "flos": 22816670572800.0, "grad_norm": 2.2241153649877865, "language_loss": 0.75043738, "learning_rate": 3.943181276805054e-06, "loss": 0.77261865, "num_input_tokens_seen": 37304915, "step": 1726, "time_per_iteration": 2.7098495960235596 }, { "auxiliary_loss_clip": 0.01150932, "auxiliary_loss_mlp": 0.0107864, "balance_loss_clip": 1.05345368, "balance_loss_mlp": 1.05610991, "epoch": 0.10383285735758305, "flos": 26138694890880.0, "grad_norm": 2.783771441956431, "language_loss": 0.73243797, "learning_rate": 3.9430890669368035e-06, "loss": 0.75473368, "num_input_tokens_seen": 37325265, "step": 1727, "time_per_iteration": 2.74774169921875 }, { "auxiliary_loss_clip": 0.01157922, "auxiliary_loss_mlp": 0.01068007, "balance_loss_clip": 1.05303776, "balance_loss_mlp": 1.04625082, "epoch": 0.10389298061025101, "flos": 17091791994240.0, "grad_norm": 2.172978726198527, "language_loss": 0.84373868, "learning_rate": 3.942996783386422e-06, "loss": 0.86599791, "num_input_tokens_seen": 37341650, "step": 1728, "time_per_iteration": 2.675724744796753 }, { "auxiliary_loss_clip": 0.01154897, "auxiliary_loss_mlp": 0.01060505, "balance_loss_clip": 1.0545603, "balance_loss_mlp": 1.0393219, "epoch": 0.10395310386291898, "flos": 20776513893120.0, "grad_norm": 2.1406499008555513, "language_loss": 0.70776087, "learning_rate": 3.942904426157406e-06, "loss": 0.7299149, "num_input_tokens_seen": 37360270, "step": 1729, "time_per_iteration": 2.6885008811950684 }, { "auxiliary_loss_clip": 0.01158623, "auxiliary_loss_mlp": 0.01068311, "balance_loss_clip": 1.05437422, "balance_loss_mlp": 1.04520774, "epoch": 0.10401322711558696, "flos": 12820540913280.0, "grad_norm": 2.4133379049648283, "language_loss": 0.81237471, "learning_rate": 3.9428119952532605e-06, "loss": 0.83464402, "num_input_tokens_seen": 37375225, "step": 1730, "time_per_iteration": 2.6659536361694336 }, { "auxiliary_loss_clip": 0.01085856, "auxiliary_loss_mlp": 0.01063394, "balance_loss_clip": 1.04733562, "balance_loss_mlp": 1.04314065, "epoch": 0.10407335036825492, "flos": 23184683366400.0, "grad_norm": 1.6634499611984725, "language_loss": 0.75829297, "learning_rate": 3.942719490677489e-06, "loss": 0.77978551, "num_input_tokens_seen": 37395165, "step": 1731, "time_per_iteration": 3.043125629425049 }, { "auxiliary_loss_clip": 0.01129913, "auxiliary_loss_mlp": 0.01065783, "balance_loss_clip": 1.0526607, "balance_loss_mlp": 1.04604149, "epoch": 0.10413347362092289, "flos": 26104184899200.0, "grad_norm": 1.8280179918091173, "language_loss": 0.8268069, "learning_rate": 3.9426269124336e-06, "loss": 0.84876388, "num_input_tokens_seen": 37414845, "step": 1732, "time_per_iteration": 2.96221661567688 }, { "auxiliary_loss_clip": 0.01141505, "auxiliary_loss_mlp": 0.01067805, "balance_loss_clip": 1.05805755, "balance_loss_mlp": 1.04852867, "epoch": 0.10419359687359087, "flos": 12641059630080.0, "grad_norm": 1.9919813178368582, "language_loss": 0.83320522, "learning_rate": 3.942534260525104e-06, "loss": 0.85529828, "num_input_tokens_seen": 37432490, "step": 1733, "time_per_iteration": 2.7364420890808105 }, { "auxiliary_loss_clip": 0.01153374, "auxiliary_loss_mlp": 0.0106675, "balance_loss_clip": 1.05592012, "balance_loss_mlp": 1.04654372, "epoch": 0.10425372012625883, "flos": 12125094716160.0, "grad_norm": 2.4441875881355597, "language_loss": 0.76683885, "learning_rate": 3.942441534955514e-06, "loss": 0.78904009, "num_input_tokens_seen": 37449435, "step": 1734, "time_per_iteration": 2.669623851776123 }, { "auxiliary_loss_clip": 0.0113597, "auxiliary_loss_mlp": 0.01052567, "balance_loss_clip": 1.05042601, "balance_loss_mlp": 1.03255177, "epoch": 0.1043138433789268, "flos": 25337563902720.0, "grad_norm": 1.6775801166329647, "language_loss": 0.74826896, "learning_rate": 3.9423487357283465e-06, "loss": 0.7701543, "num_input_tokens_seen": 37469105, "step": 1735, "time_per_iteration": 2.8477160930633545 }, { "auxiliary_loss_clip": 0.01167698, "auxiliary_loss_mlp": 0.01055716, "balance_loss_clip": 1.05678105, "balance_loss_mlp": 1.0344727, "epoch": 0.10437396663159478, "flos": 29167149352320.0, "grad_norm": 1.7228393064183538, "language_loss": 0.78835273, "learning_rate": 3.94225586284712e-06, "loss": 0.81058681, "num_input_tokens_seen": 37490540, "step": 1736, "time_per_iteration": 2.690453052520752 }, { "auxiliary_loss_clip": 0.0116734, "auxiliary_loss_mlp": 0.01064692, "balance_loss_clip": 1.05800533, "balance_loss_mlp": 1.04357982, "epoch": 0.10443408988426274, "flos": 25080946162560.0, "grad_norm": 1.8549131823334455, "language_loss": 0.7058785, "learning_rate": 3.942162916315356e-06, "loss": 0.72819883, "num_input_tokens_seen": 37511905, "step": 1737, "time_per_iteration": 2.6296744346618652 }, { "auxiliary_loss_clip": 0.01150138, "auxiliary_loss_mlp": 0.01059407, "balance_loss_clip": 1.04806042, "balance_loss_mlp": 1.03600669, "epoch": 0.1044942131369307, "flos": 26759662237440.0, "grad_norm": 2.415613377802324, "language_loss": 0.81624997, "learning_rate": 3.942069896136581e-06, "loss": 0.83834541, "num_input_tokens_seen": 37533635, "step": 1738, "time_per_iteration": 2.7436723709106445 }, { "auxiliary_loss_clip": 0.01181471, "auxiliary_loss_mlp": 0.01062035, "balance_loss_clip": 1.05579174, "balance_loss_mlp": 1.03950453, "epoch": 0.10455433638959867, "flos": 18442571875200.0, "grad_norm": 2.1004590024567897, "language_loss": 0.75419426, "learning_rate": 3.9419768023143196e-06, "loss": 0.77662933, "num_input_tokens_seen": 37552035, "step": 1739, "time_per_iteration": 2.585538148880005 }, { "auxiliary_loss_clip": 0.01146716, "auxiliary_loss_mlp": 0.01054893, "balance_loss_clip": 1.05417264, "balance_loss_mlp": 1.03348303, "epoch": 0.10461445964226665, "flos": 23218977876480.0, "grad_norm": 1.586314706443492, "language_loss": 0.77523744, "learning_rate": 3.941883634852104e-06, "loss": 0.79725355, "num_input_tokens_seen": 37571540, "step": 1740, "time_per_iteration": 2.8947789669036865 }, { "auxiliary_loss_clip": 0.01152077, "auxiliary_loss_mlp": 0.01049503, "balance_loss_clip": 1.05725431, "balance_loss_mlp": 1.0288676, "epoch": 0.10467458289493461, "flos": 24345243797760.0, "grad_norm": 1.964868695493703, "language_loss": 0.85976374, "learning_rate": 3.941790393753467e-06, "loss": 0.88177955, "num_input_tokens_seen": 37588265, "step": 1741, "time_per_iteration": 2.7706260681152344 }, { "auxiliary_loss_clip": 0.01158134, "auxiliary_loss_mlp": 0.01056311, "balance_loss_clip": 1.05614483, "balance_loss_mlp": 1.03350592, "epoch": 0.10473470614760258, "flos": 21287953693440.0, "grad_norm": 5.197245251055922, "language_loss": 0.75592613, "learning_rate": 3.941697079021942e-06, "loss": 0.77807057, "num_input_tokens_seen": 37606860, "step": 1742, "time_per_iteration": 2.784748077392578 }, { "auxiliary_loss_clip": 0.0113066, "auxiliary_loss_mlp": 0.01057571, "balance_loss_clip": 1.05678856, "balance_loss_mlp": 1.03735304, "epoch": 0.10479482940027056, "flos": 21687208341120.0, "grad_norm": 2.1426857583950416, "language_loss": 0.87614191, "learning_rate": 3.94160369066107e-06, "loss": 0.89802414, "num_input_tokens_seen": 37625210, "step": 1743, "time_per_iteration": 2.819350004196167 }, { "auxiliary_loss_clip": 0.01139959, "auxiliary_loss_mlp": 0.01048534, "balance_loss_clip": 1.0552268, "balance_loss_mlp": 1.0254786, "epoch": 0.10485495265293852, "flos": 21573694385280.0, "grad_norm": 2.060686178474056, "language_loss": 0.75927812, "learning_rate": 3.941510228674391e-06, "loss": 0.7811631, "num_input_tokens_seen": 37644110, "step": 1744, "time_per_iteration": 2.7817211151123047 }, { "auxiliary_loss_clip": 0.01170232, "auxiliary_loss_mlp": 0.01054483, "balance_loss_clip": 1.05992889, "balance_loss_mlp": 1.03442037, "epoch": 0.10491507590560649, "flos": 37961923708800.0, "grad_norm": 1.9689383181633062, "language_loss": 0.78905094, "learning_rate": 3.941416693065451e-06, "loss": 0.81129813, "num_input_tokens_seen": 37665800, "step": 1745, "time_per_iteration": 2.88080096244812 }, { "auxiliary_loss_clip": 0.01180482, "auxiliary_loss_mlp": 0.01060479, "balance_loss_clip": 1.05740213, "balance_loss_mlp": 1.03920031, "epoch": 0.10497519915827447, "flos": 26396282298240.0, "grad_norm": 2.64819141351011, "language_loss": 0.82568693, "learning_rate": 3.941323083837794e-06, "loss": 0.84809649, "num_input_tokens_seen": 37685095, "step": 1746, "time_per_iteration": 2.7068004608154297 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.0105595, "balance_loss_clip": 1.05737162, "balance_loss_mlp": 1.03448033, "epoch": 0.10503532241094243, "flos": 40662190581120.0, "grad_norm": 1.6274602877205533, "language_loss": 0.70573747, "learning_rate": 3.941229400994971e-06, "loss": 0.7278806, "num_input_tokens_seen": 37707445, "step": 1747, "time_per_iteration": 2.8689963817596436 }, { "auxiliary_loss_clip": 0.01159389, "auxiliary_loss_mlp": 0.01056346, "balance_loss_clip": 1.06035507, "balance_loss_mlp": 1.03492367, "epoch": 0.1050954456636104, "flos": 29789409588480.0, "grad_norm": 2.386885173400054, "language_loss": 0.8447504, "learning_rate": 3.941135644540535e-06, "loss": 0.86690772, "num_input_tokens_seen": 37728325, "step": 1748, "time_per_iteration": 2.8022749423980713 }, { "auxiliary_loss_clip": 0.01175489, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05471563, "balance_loss_mlp": 1.02701974, "epoch": 0.10515556891627838, "flos": 23948754497280.0, "grad_norm": 1.759895679837136, "language_loss": 0.71681082, "learning_rate": 3.941041814478041e-06, "loss": 0.73905981, "num_input_tokens_seen": 37748910, "step": 1749, "time_per_iteration": 2.6568849086761475 }, { "auxiliary_loss_clip": 0.01158221, "auxiliary_loss_mlp": 0.01058697, "balance_loss_clip": 1.05427456, "balance_loss_mlp": 1.03590393, "epoch": 0.10521569216894634, "flos": 18259606972800.0, "grad_norm": 2.95022560634889, "language_loss": 0.81510806, "learning_rate": 3.940947910811047e-06, "loss": 0.83727717, "num_input_tokens_seen": 37765745, "step": 1750, "time_per_iteration": 2.6282739639282227 }, { "auxiliary_loss_clip": 0.01156475, "auxiliary_loss_mlp": 0.01062657, "balance_loss_clip": 1.06022298, "balance_loss_mlp": 1.03973269, "epoch": 0.10527581542161431, "flos": 15630909949440.0, "grad_norm": 2.2218325288878953, "language_loss": 0.92364043, "learning_rate": 3.940853933543114e-06, "loss": 0.94583178, "num_input_tokens_seen": 37780520, "step": 1751, "time_per_iteration": 2.703376531600952 }, { "auxiliary_loss_clip": 0.01165779, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.0570029, "balance_loss_mlp": 1.03171563, "epoch": 0.10533593867428227, "flos": 18296559089280.0, "grad_norm": 2.0356912608722877, "language_loss": 0.79293752, "learning_rate": 3.940759882677805e-06, "loss": 0.81512833, "num_input_tokens_seen": 37799515, "step": 1752, "time_per_iteration": 2.6501150131225586 }, { "auxiliary_loss_clip": 0.01116865, "auxiliary_loss_mlp": 0.01055489, "balance_loss_clip": 1.05116987, "balance_loss_mlp": 1.03264856, "epoch": 0.10539606192695025, "flos": 29023219555200.0, "grad_norm": 2.022904639316529, "language_loss": 0.75978744, "learning_rate": 3.940665758218686e-06, "loss": 0.78151095, "num_input_tokens_seen": 37818695, "step": 1753, "time_per_iteration": 2.871335744857788 }, { "auxiliary_loss_clip": 0.01141721, "auxiliary_loss_mlp": 0.01057356, "balance_loss_clip": 1.05547547, "balance_loss_mlp": 1.03415775, "epoch": 0.10545618517961822, "flos": 19969313506560.0, "grad_norm": 2.0563919939847914, "language_loss": 0.83969283, "learning_rate": 3.940571560169328e-06, "loss": 0.86168355, "num_input_tokens_seen": 37837860, "step": 1754, "time_per_iteration": 2.685591459274292 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01053577, "balance_loss_clip": 1.05587101, "balance_loss_mlp": 1.03034329, "epoch": 0.10551630843228618, "flos": 16143427157760.0, "grad_norm": 2.7567281016961087, "language_loss": 0.68732727, "learning_rate": 3.940477288533302e-06, "loss": 0.70923102, "num_input_tokens_seen": 37856260, "step": 1755, "time_per_iteration": 2.754117727279663 }, { "auxiliary_loss_clip": 0.01161626, "auxiliary_loss_mlp": 0.010623, "balance_loss_clip": 1.05367684, "balance_loss_mlp": 1.040187, "epoch": 0.10557643168495416, "flos": 23440115957760.0, "grad_norm": 2.26658946748733, "language_loss": 0.76382339, "learning_rate": 3.940382943314182e-06, "loss": 0.7860626, "num_input_tokens_seen": 37876960, "step": 1756, "time_per_iteration": 2.686790943145752 }, { "auxiliary_loss_clip": 0.01182062, "auxiliary_loss_mlp": 0.01062906, "balance_loss_clip": 1.05688286, "balance_loss_mlp": 1.04203284, "epoch": 0.10563655493762213, "flos": 21799034357760.0, "grad_norm": 1.5917029795724482, "language_loss": 0.79926664, "learning_rate": 3.940288524515547e-06, "loss": 0.82171631, "num_input_tokens_seen": 37897070, "step": 1757, "time_per_iteration": 2.6543681621551514 }, { "auxiliary_loss_clip": 0.01149304, "auxiliary_loss_mlp": 0.01057523, "balance_loss_clip": 1.0524838, "balance_loss_mlp": 1.03563643, "epoch": 0.10569667819029009, "flos": 53800863275520.0, "grad_norm": 1.6583181970862437, "language_loss": 0.78714895, "learning_rate": 3.940194032140976e-06, "loss": 0.80921721, "num_input_tokens_seen": 37923635, "step": 1758, "time_per_iteration": 3.013157367706299 }, { "auxiliary_loss_clip": 0.01165597, "auxiliary_loss_mlp": 0.01054919, "balance_loss_clip": 1.05894113, "balance_loss_mlp": 1.03347349, "epoch": 0.10575680144295807, "flos": 22925515760640.0, "grad_norm": 1.870482409236857, "language_loss": 0.91388202, "learning_rate": 3.940099466194054e-06, "loss": 0.93608713, "num_input_tokens_seen": 37942650, "step": 1759, "time_per_iteration": 4.1841137409210205 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.01056708, "balance_loss_clip": 1.05242109, "balance_loss_mlp": 1.03346229, "epoch": 0.10581692469562604, "flos": 14136667148160.0, "grad_norm": 2.509404173865799, "language_loss": 0.77406812, "learning_rate": 3.940004826678365e-06, "loss": 0.79618067, "num_input_tokens_seen": 37960660, "step": 1760, "time_per_iteration": 4.476959228515625 }, { "auxiliary_loss_clip": 0.01161737, "auxiliary_loss_mlp": 0.01064522, "balance_loss_clip": 1.0536418, "balance_loss_mlp": 1.04053712, "epoch": 0.105877047948294, "flos": 25958674903680.0, "grad_norm": 2.27300461956159, "language_loss": 0.88896096, "learning_rate": 3.939910113597498e-06, "loss": 0.91122353, "num_input_tokens_seen": 37978625, "step": 1761, "time_per_iteration": 2.6907520294189453 }, { "auxiliary_loss_clip": 0.01110571, "auxiliary_loss_mlp": 0.00782389, "balance_loss_clip": 1.04964042, "balance_loss_mlp": 1.00012767, "epoch": 0.10593717120096197, "flos": 30664768032000.0, "grad_norm": 2.010693315376097, "language_loss": 0.7809304, "learning_rate": 3.9398153269550464e-06, "loss": 0.79986, "num_input_tokens_seen": 38000005, "step": 1762, "time_per_iteration": 2.869051456451416 }, { "auxiliary_loss_clip": 0.01053171, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.02694225, "balance_loss_mlp": 1.05056334, "epoch": 0.10599729445362994, "flos": 66436682497920.0, "grad_norm": 0.8956567750819878, "language_loss": 0.60503203, "learning_rate": 3.939720466754602e-06, "loss": 0.6261009, "num_input_tokens_seen": 38066165, "step": 1763, "time_per_iteration": 5.049196720123291 }, { "auxiliary_loss_clip": 0.01156865, "auxiliary_loss_mlp": 0.01048706, "balance_loss_clip": 1.05424261, "balance_loss_mlp": 1.02708137, "epoch": 0.10605741770629791, "flos": 23948179879680.0, "grad_norm": 2.0510547250099633, "language_loss": 0.80232942, "learning_rate": 3.939625532999763e-06, "loss": 0.82438517, "num_input_tokens_seen": 38086150, "step": 1764, "time_per_iteration": 4.288762807846069 }, { "auxiliary_loss_clip": 0.01136032, "auxiliary_loss_mlp": 0.01055975, "balance_loss_clip": 1.04879069, "balance_loss_mlp": 1.03218043, "epoch": 0.10611754095896588, "flos": 19387524919680.0, "grad_norm": 1.693202084864273, "language_loss": 0.801691, "learning_rate": 3.9395305256941314e-06, "loss": 0.82361102, "num_input_tokens_seen": 38104205, "step": 1765, "time_per_iteration": 2.931269407272339 }, { "auxiliary_loss_clip": 0.01163261, "auxiliary_loss_mlp": 0.01058956, "balance_loss_clip": 1.05457163, "balance_loss_mlp": 1.0367949, "epoch": 0.10617766421163385, "flos": 22237755073920.0, "grad_norm": 1.7665774264343403, "language_loss": 0.76864165, "learning_rate": 3.939435444841306e-06, "loss": 0.79086387, "num_input_tokens_seen": 38122005, "step": 1766, "time_per_iteration": 2.5976176261901855 }, { "auxiliary_loss_clip": 0.01182495, "auxiliary_loss_mlp": 0.01059246, "balance_loss_clip": 1.05923963, "balance_loss_mlp": 1.03766894, "epoch": 0.10623778746430182, "flos": 28404407024640.0, "grad_norm": 1.6265727447650185, "language_loss": 0.77311498, "learning_rate": 3.939340290444895e-06, "loss": 0.79553241, "num_input_tokens_seen": 38143365, "step": 1767, "time_per_iteration": 2.6356630325317383 }, { "auxiliary_loss_clip": 0.01006515, "auxiliary_loss_mlp": 0.01018751, "balance_loss_clip": 1.03004837, "balance_loss_mlp": 1.0151509, "epoch": 0.10629791071696978, "flos": 64234639221120.0, "grad_norm": 0.9172341423433896, "language_loss": 0.57889944, "learning_rate": 3.939245062508506e-06, "loss": 0.59915209, "num_input_tokens_seen": 38210035, "step": 1768, "time_per_iteration": 3.6866471767425537 }, { "auxiliary_loss_clip": 0.01144481, "auxiliary_loss_mlp": 0.01047419, "balance_loss_clip": 1.0546546, "balance_loss_mlp": 1.02687907, "epoch": 0.10635803396963776, "flos": 22747578762240.0, "grad_norm": 1.4529696494540971, "language_loss": 0.86711109, "learning_rate": 3.939149761035749e-06, "loss": 0.8890301, "num_input_tokens_seen": 38231230, "step": 1769, "time_per_iteration": 3.936905860900879 }, { "auxiliary_loss_clip": 0.01141219, "auxiliary_loss_mlp": 0.00780338, "balance_loss_clip": 1.05321527, "balance_loss_mlp": 1.00008726, "epoch": 0.10641815722230573, "flos": 31395586147200.0, "grad_norm": 1.8275276693890916, "language_loss": 0.61906171, "learning_rate": 3.9390543860302395e-06, "loss": 0.63827729, "num_input_tokens_seen": 38253890, "step": 1770, "time_per_iteration": 2.8926138877868652 }, { "auxiliary_loss_clip": 0.01057689, "auxiliary_loss_mlp": 0.01010808, "balance_loss_clip": 1.02007711, "balance_loss_mlp": 1.00775671, "epoch": 0.1064782804749737, "flos": 58552527784320.0, "grad_norm": 0.9163874753670794, "language_loss": 0.57049137, "learning_rate": 3.9389589374955925e-06, "loss": 0.59117633, "num_input_tokens_seen": 38304290, "step": 1771, "time_per_iteration": 3.0783088207244873 }, { "auxiliary_loss_clip": 0.01146276, "auxiliary_loss_mlp": 0.01065918, "balance_loss_clip": 1.05574095, "balance_loss_mlp": 1.04465103, "epoch": 0.10653840372764166, "flos": 23987825516160.0, "grad_norm": 12.794881398939157, "language_loss": 0.88265753, "learning_rate": 3.938863415435429e-06, "loss": 0.90477949, "num_input_tokens_seen": 38324725, "step": 1772, "time_per_iteration": 2.770202159881592 }, { "auxiliary_loss_clip": 0.0118421, "auxiliary_loss_mlp": 0.01058161, "balance_loss_clip": 1.05697048, "balance_loss_mlp": 1.03497458, "epoch": 0.10659852698030964, "flos": 18294655668480.0, "grad_norm": 2.576940958490313, "language_loss": 0.76030588, "learning_rate": 3.93876781985337e-06, "loss": 0.78272957, "num_input_tokens_seen": 38340735, "step": 1773, "time_per_iteration": 2.6177070140838623 }, { "auxiliary_loss_clip": 0.01122733, "auxiliary_loss_mlp": 0.01067657, "balance_loss_clip": 1.04691553, "balance_loss_mlp": 1.04205084, "epoch": 0.1066586502329776, "flos": 32160591031680.0, "grad_norm": 1.868288871406422, "language_loss": 0.8330853, "learning_rate": 3.938672150753041e-06, "loss": 0.85498923, "num_input_tokens_seen": 38361315, "step": 1774, "time_per_iteration": 2.7396061420440674 }, { "auxiliary_loss_clip": 0.01156305, "auxiliary_loss_mlp": 0.00780518, "balance_loss_clip": 1.05627465, "balance_loss_mlp": 1.00011277, "epoch": 0.10671877348564557, "flos": 17785155202560.0, "grad_norm": 2.73383407032925, "language_loss": 0.76446521, "learning_rate": 3.9385764081380704e-06, "loss": 0.78383344, "num_input_tokens_seen": 38377425, "step": 1775, "time_per_iteration": 2.624208927154541 }, { "auxiliary_loss_clip": 0.01063199, "auxiliary_loss_mlp": 0.01007654, "balance_loss_clip": 1.01726675, "balance_loss_mlp": 1.00443542, "epoch": 0.10677889673831355, "flos": 63510177813120.0, "grad_norm": 0.8200823962511624, "language_loss": 0.57477289, "learning_rate": 3.9384805920120876e-06, "loss": 0.5954814, "num_input_tokens_seen": 38440275, "step": 1776, "time_per_iteration": 3.1782386302948 }, { "auxiliary_loss_clip": 0.01150087, "auxiliary_loss_mlp": 0.01066244, "balance_loss_clip": 1.05192852, "balance_loss_mlp": 1.0407691, "epoch": 0.10683901999098151, "flos": 22017694400640.0, "grad_norm": 1.4232532718517703, "language_loss": 0.83442962, "learning_rate": 3.938384702378727e-06, "loss": 0.85659301, "num_input_tokens_seen": 38461820, "step": 1777, "time_per_iteration": 2.7342305183410645 }, { "auxiliary_loss_clip": 0.01113855, "auxiliary_loss_mlp": 0.00780712, "balance_loss_clip": 1.04919302, "balance_loss_mlp": 1.00015831, "epoch": 0.10689914324364948, "flos": 25042952551680.0, "grad_norm": 1.8326039994575831, "language_loss": 0.87207437, "learning_rate": 3.938288739241625e-06, "loss": 0.89102006, "num_input_tokens_seen": 38482235, "step": 1778, "time_per_iteration": 2.859834671020508 }, { "auxiliary_loss_clip": 0.01152509, "auxiliary_loss_mlp": 0.00780436, "balance_loss_clip": 1.06804752, "balance_loss_mlp": 1.00019765, "epoch": 0.10695926649631746, "flos": 16435129507200.0, "grad_norm": 2.4525249429301823, "language_loss": 0.84165859, "learning_rate": 3.938192702604417e-06, "loss": 0.86098808, "num_input_tokens_seen": 38500690, "step": 1779, "time_per_iteration": 2.81423020362854 }, { "auxiliary_loss_clip": 0.01141718, "auxiliary_loss_mlp": 0.00779857, "balance_loss_clip": 1.05215359, "balance_loss_mlp": 1.0001775, "epoch": 0.10701938974898542, "flos": 16979211792000.0, "grad_norm": 1.9378348403129941, "language_loss": 0.66915894, "learning_rate": 3.9380965924707495e-06, "loss": 0.68837464, "num_input_tokens_seen": 38518405, "step": 1780, "time_per_iteration": 2.616684913635254 }, { "auxiliary_loss_clip": 0.01166288, "auxiliary_loss_mlp": 0.01054109, "balance_loss_clip": 1.05843914, "balance_loss_mlp": 1.03268683, "epoch": 0.10707951300165339, "flos": 15888102307200.0, "grad_norm": 1.9168180254288365, "language_loss": 0.92058647, "learning_rate": 3.938000408844265e-06, "loss": 0.94279045, "num_input_tokens_seen": 38535060, "step": 1781, "time_per_iteration": 2.6167802810668945 }, { "auxiliary_loss_clip": 0.0113109, "auxiliary_loss_mlp": 0.01064554, "balance_loss_clip": 1.0531441, "balance_loss_mlp": 1.04344225, "epoch": 0.10713963625432135, "flos": 14247164361600.0, "grad_norm": 1.8357670097294174, "language_loss": 0.79336482, "learning_rate": 3.9379041517286105e-06, "loss": 0.81532121, "num_input_tokens_seen": 38552855, "step": 1782, "time_per_iteration": 2.7669336795806885 }, { "auxiliary_loss_clip": 0.01158369, "auxiliary_loss_mlp": 0.01061646, "balance_loss_clip": 1.05510604, "balance_loss_mlp": 1.04016423, "epoch": 0.10719975950698933, "flos": 16756780821120.0, "grad_norm": 2.0914095256513945, "language_loss": 0.79086542, "learning_rate": 3.937807821127436e-06, "loss": 0.81306553, "num_input_tokens_seen": 38570075, "step": 1783, "time_per_iteration": 2.6349542140960693 }, { "auxiliary_loss_clip": 0.01164267, "auxiliary_loss_mlp": 0.01065333, "balance_loss_clip": 1.0570296, "balance_loss_mlp": 1.04299295, "epoch": 0.1072598827596573, "flos": 22710626645760.0, "grad_norm": 2.1874612027367806, "language_loss": 0.86421812, "learning_rate": 3.937711417044395e-06, "loss": 0.88651407, "num_input_tokens_seen": 38587970, "step": 1784, "time_per_iteration": 2.8452541828155518 }, { "auxiliary_loss_clip": 0.01153461, "auxiliary_loss_mlp": 0.01055605, "balance_loss_clip": 1.05502176, "balance_loss_mlp": 1.03321707, "epoch": 0.10732000601232526, "flos": 23258264376960.0, "grad_norm": 2.4649130783319553, "language_loss": 1.01192284, "learning_rate": 3.937614939483143e-06, "loss": 1.03401351, "num_input_tokens_seen": 38605840, "step": 1785, "time_per_iteration": 2.690018653869629 }, { "auxiliary_loss_clip": 0.01168517, "auxiliary_loss_mlp": 0.01060763, "balance_loss_clip": 1.05854678, "balance_loss_mlp": 1.03984189, "epoch": 0.10738012926499324, "flos": 24207060176640.0, "grad_norm": 1.397915549237645, "language_loss": 0.84951413, "learning_rate": 3.937518388447339e-06, "loss": 0.87180698, "num_input_tokens_seen": 38627070, "step": 1786, "time_per_iteration": 2.637430191040039 }, { "auxiliary_loss_clip": 0.01183118, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05716729, "balance_loss_mlp": 1.03520155, "epoch": 0.1074402525176612, "flos": 20923065383040.0, "grad_norm": 1.7951357311742837, "language_loss": 0.78861409, "learning_rate": 3.937421763940642e-06, "loss": 0.81103605, "num_input_tokens_seen": 38645840, "step": 1787, "time_per_iteration": 2.54508900642395 }, { "auxiliary_loss_clip": 0.01174896, "auxiliary_loss_mlp": 0.01047406, "balance_loss_clip": 1.05971575, "balance_loss_mlp": 1.02528071, "epoch": 0.10750037577032917, "flos": 16946928443520.0, "grad_norm": 1.8536072321218278, "language_loss": 0.82307518, "learning_rate": 3.937325065966719e-06, "loss": 0.84529817, "num_input_tokens_seen": 38664770, "step": 1788, "time_per_iteration": 2.706247568130493 }, { "auxiliary_loss_clip": 0.01180896, "auxiliary_loss_mlp": 0.01064682, "balance_loss_clip": 1.05843878, "balance_loss_mlp": 1.04427314, "epoch": 0.10756049902299715, "flos": 20266546550400.0, "grad_norm": 2.110245519520894, "language_loss": 0.77840686, "learning_rate": 3.9372282945292335e-06, "loss": 0.80086267, "num_input_tokens_seen": 38683865, "step": 1789, "time_per_iteration": 2.6274654865264893 }, { "auxiliary_loss_clip": 0.01185566, "auxiliary_loss_mlp": 0.01065099, "balance_loss_clip": 1.0604099, "balance_loss_mlp": 1.04049408, "epoch": 0.10762062227566511, "flos": 23586523793280.0, "grad_norm": 2.7248977042722524, "language_loss": 0.74817526, "learning_rate": 3.937131449631859e-06, "loss": 0.77068192, "num_input_tokens_seen": 38702485, "step": 1790, "time_per_iteration": 2.624382972717285 }, { "auxiliary_loss_clip": 0.01178128, "auxiliary_loss_mlp": 0.00780572, "balance_loss_clip": 1.06110644, "balance_loss_mlp": 1.00021124, "epoch": 0.10768074552833308, "flos": 24310626065280.0, "grad_norm": 2.350797373347828, "language_loss": 0.78764236, "learning_rate": 3.9370345312782645e-06, "loss": 0.80722934, "num_input_tokens_seen": 38722475, "step": 1791, "time_per_iteration": 2.696162223815918 }, { "auxiliary_loss_clip": 0.01134133, "auxiliary_loss_mlp": 0.01065057, "balance_loss_clip": 1.05280125, "balance_loss_mlp": 1.04117918, "epoch": 0.10774086878100106, "flos": 25299965341440.0, "grad_norm": 1.5879424734455678, "language_loss": 0.70638013, "learning_rate": 3.936937539472126e-06, "loss": 0.7283721, "num_input_tokens_seen": 38743285, "step": 1792, "time_per_iteration": 2.770874261856079 }, { "auxiliary_loss_clip": 0.01149934, "auxiliary_loss_mlp": 0.01051019, "balance_loss_clip": 1.05610943, "balance_loss_mlp": 1.02764249, "epoch": 0.10780099203366902, "flos": 22054035985920.0, "grad_norm": 1.920104493539276, "language_loss": 0.76565266, "learning_rate": 3.9368404742171236e-06, "loss": 0.78766215, "num_input_tokens_seen": 38763035, "step": 1793, "time_per_iteration": 2.7218761444091797 }, { "auxiliary_loss_clip": 0.01116412, "auxiliary_loss_mlp": 0.01064574, "balance_loss_clip": 1.05029237, "balance_loss_mlp": 1.0414238, "epoch": 0.10786111528633699, "flos": 22747471021440.0, "grad_norm": 1.7475786500241859, "language_loss": 0.85103315, "learning_rate": 3.936743335516936e-06, "loss": 0.87284303, "num_input_tokens_seen": 38784900, "step": 1794, "time_per_iteration": 2.7590620517730713 }, { "auxiliary_loss_clip": 0.01115198, "auxiliary_loss_mlp": 0.01055294, "balance_loss_clip": 1.04807687, "balance_loss_mlp": 1.03146446, "epoch": 0.10792123853900495, "flos": 20851064570880.0, "grad_norm": 2.5236234593460924, "language_loss": 0.74585378, "learning_rate": 3.936646123375246e-06, "loss": 0.76755869, "num_input_tokens_seen": 38804695, "step": 1795, "time_per_iteration": 2.8500585556030273 }, { "auxiliary_loss_clip": 0.01124895, "auxiliary_loss_mlp": 0.01058294, "balance_loss_clip": 1.04831553, "balance_loss_mlp": 1.03479767, "epoch": 0.10798136179167293, "flos": 17748705876480.0, "grad_norm": 2.842374039298248, "language_loss": 0.81653619, "learning_rate": 3.936548837795741e-06, "loss": 0.83836806, "num_input_tokens_seen": 38822395, "step": 1796, "time_per_iteration": 2.7549750804901123 }, { "auxiliary_loss_clip": 0.01140492, "auxiliary_loss_mlp": 0.01083966, "balance_loss_clip": 1.05246449, "balance_loss_mlp": 1.05721593, "epoch": 0.1080414850443409, "flos": 13589639948160.0, "grad_norm": 2.59635455269928, "language_loss": 0.74233043, "learning_rate": 3.936451478782111e-06, "loss": 0.764575, "num_input_tokens_seen": 38839865, "step": 1797, "time_per_iteration": 2.6396753787994385 }, { "auxiliary_loss_clip": 0.01160286, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.05505061, "balance_loss_mlp": 1.02874684, "epoch": 0.10810160829700886, "flos": 16253421580800.0, "grad_norm": 2.0852339617015025, "language_loss": 0.81855786, "learning_rate": 3.936354046338046e-06, "loss": 0.84066033, "num_input_tokens_seen": 38857300, "step": 1798, "time_per_iteration": 2.7105324268341064 }, { "auxiliary_loss_clip": 0.01142859, "auxiliary_loss_mlp": 0.01054502, "balance_loss_clip": 1.05379176, "balance_loss_mlp": 1.03117299, "epoch": 0.10816173154967684, "flos": 15158002464000.0, "grad_norm": 2.4443000829323687, "language_loss": 0.85516405, "learning_rate": 3.936256540467242e-06, "loss": 0.87713766, "num_input_tokens_seen": 38874960, "step": 1799, "time_per_iteration": 4.159978628158569 }, { "auxiliary_loss_clip": 0.01154352, "auxiliary_loss_mlp": 0.01062903, "balance_loss_clip": 1.05493283, "balance_loss_mlp": 1.04114687, "epoch": 0.10822185480234481, "flos": 17785334770560.0, "grad_norm": 2.7405734706827825, "language_loss": 0.77434146, "learning_rate": 3.9361589611733955e-06, "loss": 0.79651403, "num_input_tokens_seen": 38893610, "step": 1800, "time_per_iteration": 4.52047872543335 }, { "auxiliary_loss_clip": 0.01178634, "auxiliary_loss_mlp": 0.0104758, "balance_loss_clip": 1.05722904, "balance_loss_mlp": 1.02689719, "epoch": 0.10828197805501277, "flos": 25556654908800.0, "grad_norm": 1.582468034859118, "language_loss": 0.72897375, "learning_rate": 3.9360613084602075e-06, "loss": 0.75123584, "num_input_tokens_seen": 38913485, "step": 1801, "time_per_iteration": 4.291400909423828 }, { "auxiliary_loss_clip": 0.01190595, "auxiliary_loss_mlp": 0.01056056, "balance_loss_clip": 1.06095624, "balance_loss_mlp": 1.03478956, "epoch": 0.10834210130768075, "flos": 28984435845120.0, "grad_norm": 1.951139287607183, "language_loss": 0.6634692, "learning_rate": 3.935963582331381e-06, "loss": 0.68593562, "num_input_tokens_seen": 38935650, "step": 1802, "time_per_iteration": 2.722628355026245 }, { "auxiliary_loss_clip": 0.01155661, "auxiliary_loss_mlp": 0.01059375, "balance_loss_clip": 1.05326533, "balance_loss_mlp": 1.03695142, "epoch": 0.10840222456034872, "flos": 20264212166400.0, "grad_norm": 2.084551157592464, "language_loss": 0.81612957, "learning_rate": 3.935865782790621e-06, "loss": 0.8382799, "num_input_tokens_seen": 38954130, "step": 1803, "time_per_iteration": 4.239379167556763 }, { "auxiliary_loss_clip": 0.01163104, "auxiliary_loss_mlp": 0.01061781, "balance_loss_clip": 1.0567112, "balance_loss_mlp": 1.03921473, "epoch": 0.10846234781301668, "flos": 19863054097920.0, "grad_norm": 1.9102934552723363, "language_loss": 0.91127038, "learning_rate": 3.9357679098416365e-06, "loss": 0.93351918, "num_input_tokens_seen": 38972905, "step": 1804, "time_per_iteration": 2.5836737155914307 }, { "auxiliary_loss_clip": 0.01136188, "auxiliary_loss_mlp": 0.01060133, "balance_loss_clip": 1.05617714, "balance_loss_mlp": 1.03718543, "epoch": 0.10852247106568465, "flos": 26469037296000.0, "grad_norm": 2.5742522317806262, "language_loss": 0.76198906, "learning_rate": 3.935669963488139e-06, "loss": 0.78395224, "num_input_tokens_seen": 38993255, "step": 1805, "time_per_iteration": 2.783137321472168 }, { "auxiliary_loss_clip": 0.01149468, "auxiliary_loss_mlp": 0.01050946, "balance_loss_clip": 1.05419612, "balance_loss_mlp": 1.03050184, "epoch": 0.10858259431835263, "flos": 30081506987520.0, "grad_norm": 1.7049574807827799, "language_loss": 0.85876733, "learning_rate": 3.935571943733843e-06, "loss": 0.88077152, "num_input_tokens_seen": 39012610, "step": 1806, "time_per_iteration": 2.8148701190948486 }, { "auxiliary_loss_clip": 0.01168733, "auxiliary_loss_mlp": 0.00779888, "balance_loss_clip": 1.05462408, "balance_loss_mlp": 1.00006652, "epoch": 0.10864271757102059, "flos": 19063180085760.0, "grad_norm": 2.554050049117878, "language_loss": 0.8108198, "learning_rate": 3.9354738505824635e-06, "loss": 0.83030605, "num_input_tokens_seen": 39030120, "step": 1807, "time_per_iteration": 2.6275649070739746 }, { "auxiliary_loss_clip": 0.01139085, "auxiliary_loss_mlp": 0.01055438, "balance_loss_clip": 1.05193985, "balance_loss_mlp": 1.03522038, "epoch": 0.10870284082368856, "flos": 24715052271360.0, "grad_norm": 1.834914777588586, "language_loss": 0.78910971, "learning_rate": 3.9353756840377225e-06, "loss": 0.81105494, "num_input_tokens_seen": 39049875, "step": 1808, "time_per_iteration": 2.722910165786743 }, { "auxiliary_loss_clip": 0.01157997, "auxiliary_loss_mlp": 0.01056971, "balance_loss_clip": 1.05918014, "balance_loss_mlp": 1.03548992, "epoch": 0.10876296407635654, "flos": 20627663932800.0, "grad_norm": 1.6201371380093192, "language_loss": 0.79013431, "learning_rate": 3.935277444103342e-06, "loss": 0.81228393, "num_input_tokens_seen": 39068935, "step": 1809, "time_per_iteration": 2.7261481285095215 }, { "auxiliary_loss_clip": 0.01180468, "auxiliary_loss_mlp": 0.01057915, "balance_loss_clip": 1.0568099, "balance_loss_mlp": 1.03705359, "epoch": 0.1088230873290245, "flos": 21579835610880.0, "grad_norm": 1.9004896030263678, "language_loss": 0.85129547, "learning_rate": 3.935179130783046e-06, "loss": 0.87367928, "num_input_tokens_seen": 39087370, "step": 1810, "time_per_iteration": 2.672696828842163 }, { "auxiliary_loss_clip": 0.01124301, "auxiliary_loss_mlp": 0.01057363, "balance_loss_clip": 1.04580724, "balance_loss_mlp": 1.0335803, "epoch": 0.10888321058169247, "flos": 26469037296000.0, "grad_norm": 1.5993643379141278, "language_loss": 0.63822675, "learning_rate": 3.935080744080564e-06, "loss": 0.66004336, "num_input_tokens_seen": 39106635, "step": 1811, "time_per_iteration": 2.7731611728668213 }, { "auxiliary_loss_clip": 0.01151891, "auxiliary_loss_mlp": 0.01050225, "balance_loss_clip": 1.05335796, "balance_loss_mlp": 1.02836192, "epoch": 0.10894333383436045, "flos": 25848608653440.0, "grad_norm": 1.9284151803363307, "language_loss": 0.74238706, "learning_rate": 3.934982283999626e-06, "loss": 0.76440823, "num_input_tokens_seen": 39126335, "step": 1812, "time_per_iteration": 2.727743625640869 }, { "auxiliary_loss_clip": 0.01142498, "auxiliary_loss_mlp": 0.01057826, "balance_loss_clip": 1.05199611, "balance_loss_mlp": 1.03546214, "epoch": 0.10900345708702841, "flos": 19537093152000.0, "grad_norm": 1.5783196636767667, "language_loss": 0.72746086, "learning_rate": 3.934883750543966e-06, "loss": 0.74946409, "num_input_tokens_seen": 39144820, "step": 1813, "time_per_iteration": 2.798297166824341 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.01056639, "balance_loss_clip": 1.0511452, "balance_loss_mlp": 1.03515792, "epoch": 0.10906358033969638, "flos": 23623296341760.0, "grad_norm": 1.635228619121262, "language_loss": 0.82981038, "learning_rate": 3.93478514371732e-06, "loss": 0.85176599, "num_input_tokens_seen": 39165945, "step": 1814, "time_per_iteration": 2.7120048999786377 }, { "auxiliary_loss_clip": 0.01141958, "auxiliary_loss_mlp": 0.01058857, "balance_loss_clip": 1.0537864, "balance_loss_mlp": 1.03787625, "epoch": 0.10912370359236434, "flos": 21214731818880.0, "grad_norm": 1.9556743991494996, "language_loss": 0.84310579, "learning_rate": 3.934686463523429e-06, "loss": 0.86511397, "num_input_tokens_seen": 39183520, "step": 1815, "time_per_iteration": 2.788870096206665 }, { "auxiliary_loss_clip": 0.01146878, "auxiliary_loss_mlp": 0.01055141, "balance_loss_clip": 1.05443966, "balance_loss_mlp": 1.03182411, "epoch": 0.10918382684503232, "flos": 13553190622080.0, "grad_norm": 2.5374826422013195, "language_loss": 0.71670222, "learning_rate": 3.9345877099660315e-06, "loss": 0.73872244, "num_input_tokens_seen": 39201190, "step": 1816, "time_per_iteration": 2.8424103260040283 }, { "auxiliary_loss_clip": 0.01164173, "auxiliary_loss_mlp": 0.01064184, "balance_loss_clip": 1.05216932, "balance_loss_mlp": 1.04052126, "epoch": 0.10924395009770028, "flos": 27964321591680.0, "grad_norm": 2.016899555923086, "language_loss": 0.72880268, "learning_rate": 3.9344888830488744e-06, "loss": 0.75108624, "num_input_tokens_seen": 39221210, "step": 1817, "time_per_iteration": 2.7320947647094727 }, { "auxiliary_loss_clip": 0.01116915, "auxiliary_loss_mlp": 0.01057856, "balance_loss_clip": 1.05173278, "balance_loss_mlp": 1.03517008, "epoch": 0.10930407335036825, "flos": 25593750679680.0, "grad_norm": 1.5988628345308824, "language_loss": 0.67275256, "learning_rate": 3.934389982775706e-06, "loss": 0.69450033, "num_input_tokens_seen": 39242025, "step": 1818, "time_per_iteration": 2.8700790405273438 }, { "auxiliary_loss_clip": 0.01155804, "auxiliary_loss_mlp": 0.01065952, "balance_loss_clip": 1.05673873, "balance_loss_mlp": 1.04313517, "epoch": 0.10936419660303623, "flos": 18406194376320.0, "grad_norm": 3.593580913512793, "language_loss": 0.73149616, "learning_rate": 3.934291009150275e-06, "loss": 0.75371373, "num_input_tokens_seen": 39259870, "step": 1819, "time_per_iteration": 2.7091007232666016 }, { "auxiliary_loss_clip": 0.01142955, "auxiliary_loss_mlp": 0.00779155, "balance_loss_clip": 1.05341268, "balance_loss_mlp": 1.00027704, "epoch": 0.1094243198557042, "flos": 23840052963840.0, "grad_norm": 4.531598275817935, "language_loss": 0.73764241, "learning_rate": 3.934191962176335e-06, "loss": 0.75686359, "num_input_tokens_seen": 39278500, "step": 1820, "time_per_iteration": 2.6513099670410156 }, { "auxiliary_loss_clip": 0.01179358, "auxiliary_loss_mlp": 0.01056073, "balance_loss_clip": 1.05747604, "balance_loss_mlp": 1.03297031, "epoch": 0.10948444310837216, "flos": 14643940970880.0, "grad_norm": 2.2567103978329337, "language_loss": 0.82532805, "learning_rate": 3.934092841857642e-06, "loss": 0.84768236, "num_input_tokens_seen": 39294800, "step": 1821, "time_per_iteration": 2.5348384380340576 }, { "auxiliary_loss_clip": 0.01148016, "auxiliary_loss_mlp": 0.01052031, "balance_loss_clip": 1.05133605, "balance_loss_mlp": 1.03077567, "epoch": 0.10954456636104014, "flos": 27818811596160.0, "grad_norm": 2.0770330480401578, "language_loss": 0.76271641, "learning_rate": 3.933993648197955e-06, "loss": 0.7847169, "num_input_tokens_seen": 39314625, "step": 1822, "time_per_iteration": 2.730079174041748 }, { "auxiliary_loss_clip": 0.01142446, "auxiliary_loss_mlp": 0.01049259, "balance_loss_clip": 1.04849207, "balance_loss_mlp": 1.02856421, "epoch": 0.1096046896137081, "flos": 33620934372480.0, "grad_norm": 1.734419613996414, "language_loss": 0.79309607, "learning_rate": 3.933894381201034e-06, "loss": 0.81501311, "num_input_tokens_seen": 39336465, "step": 1823, "time_per_iteration": 2.756969928741455 }, { "auxiliary_loss_clip": 0.01148165, "auxiliary_loss_mlp": 0.01049595, "balance_loss_clip": 1.05160606, "balance_loss_mlp": 1.02745807, "epoch": 0.10966481286637607, "flos": 26980010219520.0, "grad_norm": 1.4318009514182364, "language_loss": 0.79590744, "learning_rate": 3.933795040870645e-06, "loss": 0.81788504, "num_input_tokens_seen": 39357930, "step": 1824, "time_per_iteration": 2.798168182373047 }, { "auxiliary_loss_clip": 0.01142146, "auxiliary_loss_mlp": 0.01055513, "balance_loss_clip": 1.05104232, "balance_loss_mlp": 1.03381693, "epoch": 0.10972493611904403, "flos": 23036551678080.0, "grad_norm": 2.127143421089703, "language_loss": 0.88138539, "learning_rate": 3.933695627210554e-06, "loss": 0.90336192, "num_input_tokens_seen": 39376380, "step": 1825, "time_per_iteration": 2.6804513931274414 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01056127, "balance_loss_clip": 1.04586983, "balance_loss_mlp": 1.03439498, "epoch": 0.10978505937171201, "flos": 38104632443520.0, "grad_norm": 1.721192594935189, "language_loss": 0.76441038, "learning_rate": 3.933596140224532e-06, "loss": 0.78625786, "num_input_tokens_seen": 39399935, "step": 1826, "time_per_iteration": 2.8315086364746094 }, { "auxiliary_loss_clip": 0.01063155, "auxiliary_loss_mlp": 0.01016957, "balance_loss_clip": 1.02709544, "balance_loss_mlp": 1.01409554, "epoch": 0.10984518262437998, "flos": 59849694616320.0, "grad_norm": 0.8518463216820418, "language_loss": 0.54997343, "learning_rate": 3.93349657991635e-06, "loss": 0.57077461, "num_input_tokens_seen": 39460685, "step": 1827, "time_per_iteration": 3.1425766944885254 }, { "auxiliary_loss_clip": 0.01072651, "auxiliary_loss_mlp": 0.01010167, "balance_loss_clip": 1.02693772, "balance_loss_mlp": 1.00717473, "epoch": 0.10990530587704794, "flos": 66719837410560.0, "grad_norm": 0.7375455878808789, "language_loss": 0.55382878, "learning_rate": 3.933396946289784e-06, "loss": 0.57465696, "num_input_tokens_seen": 39524765, "step": 1828, "time_per_iteration": 3.168165922164917 }, { "auxiliary_loss_clip": 0.01156998, "auxiliary_loss_mlp": 0.01059335, "balance_loss_clip": 1.05407059, "balance_loss_mlp": 1.03618491, "epoch": 0.10996542912971592, "flos": 25447199189760.0, "grad_norm": 2.250827401167328, "language_loss": 0.84010404, "learning_rate": 3.933297239348612e-06, "loss": 0.86226743, "num_input_tokens_seen": 39543640, "step": 1829, "time_per_iteration": 2.7341628074645996 }, { "auxiliary_loss_clip": 0.01130747, "auxiliary_loss_mlp": 0.01053464, "balance_loss_clip": 1.0547024, "balance_loss_mlp": 1.03036165, "epoch": 0.11002555238238389, "flos": 44018186186880.0, "grad_norm": 2.342204785330024, "language_loss": 0.88880253, "learning_rate": 3.933197459096614e-06, "loss": 0.91064465, "num_input_tokens_seen": 39567525, "step": 1830, "time_per_iteration": 2.9093260765075684 }, { "auxiliary_loss_clip": 0.01049643, "auxiliary_loss_mlp": 0.01009685, "balance_loss_clip": 1.02618647, "balance_loss_mlp": 1.00681162, "epoch": 0.11008567563505185, "flos": 54065133590400.0, "grad_norm": 0.6882192363357665, "language_loss": 0.55566543, "learning_rate": 3.9330976055375756e-06, "loss": 0.57625872, "num_input_tokens_seen": 39628470, "step": 1831, "time_per_iteration": 3.1713974475860596 }, { "auxiliary_loss_clip": 0.01156783, "auxiliary_loss_mlp": 0.01073931, "balance_loss_clip": 1.05708003, "balance_loss_mlp": 1.04965997, "epoch": 0.11014579888771983, "flos": 24243150366720.0, "grad_norm": 2.4937725361201495, "language_loss": 0.90836191, "learning_rate": 3.932997678675282e-06, "loss": 0.93066907, "num_input_tokens_seen": 39646670, "step": 1832, "time_per_iteration": 2.6786489486694336 }, { "auxiliary_loss_clip": 0.0106111, "auxiliary_loss_mlp": 0.01010664, "balance_loss_clip": 1.02332854, "balance_loss_mlp": 1.00769615, "epoch": 0.1102059221403878, "flos": 57743965658880.0, "grad_norm": 0.7154576595208243, "language_loss": 0.59911001, "learning_rate": 3.932897678513523e-06, "loss": 0.61982775, "num_input_tokens_seen": 39712915, "step": 1833, "time_per_iteration": 3.1802401542663574 }, { "auxiliary_loss_clip": 0.01167201, "auxiliary_loss_mlp": 0.0105502, "balance_loss_clip": 1.05312014, "balance_loss_mlp": 1.03285873, "epoch": 0.11026604539305576, "flos": 16795923667200.0, "grad_norm": 2.6772934272606923, "language_loss": 0.80799395, "learning_rate": 3.93279760505609e-06, "loss": 0.83021617, "num_input_tokens_seen": 39730650, "step": 1834, "time_per_iteration": 2.591374635696411 }, { "auxiliary_loss_clip": 0.01141662, "auxiliary_loss_mlp": 0.01054827, "balance_loss_clip": 1.05557871, "balance_loss_mlp": 1.03004324, "epoch": 0.11032616864572373, "flos": 23988076911360.0, "grad_norm": 2.4853906687508247, "language_loss": 0.89856094, "learning_rate": 3.932697458306779e-06, "loss": 0.92052579, "num_input_tokens_seen": 39751065, "step": 1835, "time_per_iteration": 2.742330312728882 }, { "auxiliary_loss_clip": 0.01131787, "auxiliary_loss_mlp": 0.01063812, "balance_loss_clip": 1.0524013, "balance_loss_mlp": 1.03758645, "epoch": 0.1103862918983917, "flos": 19683141851520.0, "grad_norm": 2.2754442269720023, "language_loss": 0.63256055, "learning_rate": 3.932597238269386e-06, "loss": 0.65451658, "num_input_tokens_seen": 39769245, "step": 1836, "time_per_iteration": 2.6935038566589355 }, { "auxiliary_loss_clip": 0.01138919, "auxiliary_loss_mlp": 0.01061469, "balance_loss_clip": 1.05021358, "balance_loss_mlp": 1.03954661, "epoch": 0.11044641515105967, "flos": 32160878340480.0, "grad_norm": 1.6726289784191204, "language_loss": 0.72792488, "learning_rate": 3.932496944947711e-06, "loss": 0.74992871, "num_input_tokens_seen": 39790830, "step": 1837, "time_per_iteration": 2.7790510654449463 }, { "auxiliary_loss_clip": 0.01165472, "auxiliary_loss_mlp": 0.01057035, "balance_loss_clip": 1.05463088, "balance_loss_mlp": 1.03551781, "epoch": 0.11050653840372764, "flos": 16689233295360.0, "grad_norm": 2.027055787194766, "language_loss": 0.78489268, "learning_rate": 3.93239657834556e-06, "loss": 0.8071177, "num_input_tokens_seen": 39809475, "step": 1838, "time_per_iteration": 4.098532438278198 }, { "auxiliary_loss_clip": 0.01154042, "auxiliary_loss_mlp": 0.01062407, "balance_loss_clip": 1.05542612, "balance_loss_mlp": 1.03970969, "epoch": 0.11056666165639562, "flos": 21208877902080.0, "grad_norm": 2.046221888979386, "language_loss": 0.71451718, "learning_rate": 3.932296138466736e-06, "loss": 0.7366817, "num_input_tokens_seen": 39826355, "step": 1839, "time_per_iteration": 4.205714464187622 }, { "auxiliary_loss_clip": 0.01187588, "auxiliary_loss_mlp": 0.00781104, "balance_loss_clip": 1.06183171, "balance_loss_mlp": 1.00018013, "epoch": 0.11062678490906358, "flos": 19165488998400.0, "grad_norm": 2.623062836625425, "language_loss": 0.79027873, "learning_rate": 3.93219562531505e-06, "loss": 0.80996567, "num_input_tokens_seen": 39845335, "step": 1840, "time_per_iteration": 2.6023378372192383 }, { "auxiliary_loss_clip": 0.01156508, "auxiliary_loss_mlp": 0.01052512, "balance_loss_clip": 1.05206251, "balance_loss_mlp": 1.02887261, "epoch": 0.11068690816173155, "flos": 24895287740160.0, "grad_norm": 1.7551987843009527, "language_loss": 0.88083529, "learning_rate": 3.932095038894311e-06, "loss": 0.90292549, "num_input_tokens_seen": 39865065, "step": 1841, "time_per_iteration": 4.3361639976501465 }, { "auxiliary_loss_clip": 0.01130203, "auxiliary_loss_mlp": 0.01067683, "balance_loss_clip": 1.05036247, "balance_loss_mlp": 1.04453301, "epoch": 0.11074703141439952, "flos": 16472368932480.0, "grad_norm": 3.1603067125494126, "language_loss": 0.90521991, "learning_rate": 3.931994379208334e-06, "loss": 0.92719877, "num_input_tokens_seen": 39882780, "step": 1842, "time_per_iteration": 2.7086760997772217 }, { "auxiliary_loss_clip": 0.01152506, "auxiliary_loss_mlp": 0.01061227, "balance_loss_clip": 1.05065131, "balance_loss_mlp": 1.03982854, "epoch": 0.11080715466706749, "flos": 19172420323200.0, "grad_norm": 2.112801816568727, "language_loss": 0.85845053, "learning_rate": 3.931893646260937e-06, "loss": 0.88058788, "num_input_tokens_seen": 39900295, "step": 1843, "time_per_iteration": 4.263117790222168 }, { "auxiliary_loss_clip": 0.01119254, "auxiliary_loss_mlp": 0.00783076, "balance_loss_clip": 1.05050898, "balance_loss_mlp": 1.00012159, "epoch": 0.11086727791973545, "flos": 27704687109120.0, "grad_norm": 1.4511349711086798, "language_loss": 0.74735641, "learning_rate": 3.931792840055941e-06, "loss": 0.76637971, "num_input_tokens_seen": 39922075, "step": 1844, "time_per_iteration": 2.7999000549316406 }, { "auxiliary_loss_clip": 0.01180395, "auxiliary_loss_mlp": 0.01055824, "balance_loss_clip": 1.05662274, "balance_loss_mlp": 1.03238785, "epoch": 0.11092740117240343, "flos": 18514967736960.0, "grad_norm": 2.017286766878137, "language_loss": 0.7566812, "learning_rate": 3.931691960597165e-06, "loss": 0.77904338, "num_input_tokens_seen": 39940115, "step": 1845, "time_per_iteration": 2.5305535793304443 }, { "auxiliary_loss_clip": 0.01153403, "auxiliary_loss_mlp": 0.01058911, "balance_loss_clip": 1.05442989, "balance_loss_mlp": 1.03807366, "epoch": 0.1109875244250714, "flos": 20522446018560.0, "grad_norm": 1.9628359583393364, "language_loss": 0.75953126, "learning_rate": 3.9315910078884375e-06, "loss": 0.78165436, "num_input_tokens_seen": 39959920, "step": 1846, "time_per_iteration": 2.719325542449951 }, { "auxiliary_loss_clip": 0.01173899, "auxiliary_loss_mlp": 0.01059369, "balance_loss_clip": 1.05823123, "balance_loss_mlp": 1.03717244, "epoch": 0.11104764767773936, "flos": 14098601710080.0, "grad_norm": 2.612459533347621, "language_loss": 0.8620472, "learning_rate": 3.931489981933584e-06, "loss": 0.88437986, "num_input_tokens_seen": 39974755, "step": 1847, "time_per_iteration": 2.7705559730529785 }, { "auxiliary_loss_clip": 0.01181158, "auxiliary_loss_mlp": 0.01055145, "balance_loss_clip": 1.05562854, "balance_loss_mlp": 1.0322808, "epoch": 0.11110777093040733, "flos": 20594518657920.0, "grad_norm": 1.8452742714770096, "language_loss": 0.76981926, "learning_rate": 3.931388882736438e-06, "loss": 0.79218227, "num_input_tokens_seen": 39993355, "step": 1848, "time_per_iteration": 2.605933666229248 }, { "auxiliary_loss_clip": 0.01172398, "auxiliary_loss_mlp": 0.01056349, "balance_loss_clip": 1.06262445, "balance_loss_mlp": 1.03455794, "epoch": 0.11116789418307531, "flos": 21870065502720.0, "grad_norm": 1.6943193134392138, "language_loss": 0.77621841, "learning_rate": 3.931287710300832e-06, "loss": 0.7985059, "num_input_tokens_seen": 40012410, "step": 1849, "time_per_iteration": 2.678415536880493 }, { "auxiliary_loss_clip": 0.01138995, "auxiliary_loss_mlp": 0.00781122, "balance_loss_clip": 1.05277848, "balance_loss_mlp": 1.00010324, "epoch": 0.11122801743574327, "flos": 15523106256000.0, "grad_norm": 3.3234972538165066, "language_loss": 0.72098577, "learning_rate": 3.931186464630601e-06, "loss": 0.74018693, "num_input_tokens_seen": 40029315, "step": 1850, "time_per_iteration": 2.7763028144836426 }, { "auxiliary_loss_clip": 0.01170569, "auxiliary_loss_mlp": 0.01061108, "balance_loss_clip": 1.05759382, "balance_loss_mlp": 1.03874469, "epoch": 0.11128814068841124, "flos": 14392279307520.0, "grad_norm": 2.0638339407107873, "language_loss": 0.81499028, "learning_rate": 3.931085145729588e-06, "loss": 0.83730704, "num_input_tokens_seen": 40045765, "step": 1851, "time_per_iteration": 2.688854694366455 }, { "auxiliary_loss_clip": 0.01164692, "auxiliary_loss_mlp": 0.01061301, "balance_loss_clip": 1.05789042, "balance_loss_mlp": 1.04027295, "epoch": 0.11134826394107922, "flos": 16653933204480.0, "grad_norm": 2.365035468310974, "language_loss": 0.88270009, "learning_rate": 3.930983753601631e-06, "loss": 0.90496004, "num_input_tokens_seen": 40061660, "step": 1852, "time_per_iteration": 2.659914493560791 }, { "auxiliary_loss_clip": 0.01166772, "auxiliary_loss_mlp": 0.01060698, "balance_loss_clip": 1.05489326, "balance_loss_mlp": 1.03791702, "epoch": 0.11140838719374718, "flos": 16690993061760.0, "grad_norm": 2.1825610274136054, "language_loss": 0.72492862, "learning_rate": 3.930882288250578e-06, "loss": 0.74720335, "num_input_tokens_seen": 40080180, "step": 1853, "time_per_iteration": 2.7840964794158936 }, { "auxiliary_loss_clip": 0.01069898, "auxiliary_loss_mlp": 0.01019902, "balance_loss_clip": 1.02549517, "balance_loss_mlp": 1.01701725, "epoch": 0.11146851044641515, "flos": 60976355587200.0, "grad_norm": 0.772231443606995, "language_loss": 0.53664064, "learning_rate": 3.930780749680273e-06, "loss": 0.55753863, "num_input_tokens_seen": 40138910, "step": 1854, "time_per_iteration": 3.089354991912842 }, { "auxiliary_loss_clip": 0.01159576, "auxiliary_loss_mlp": 0.0105585, "balance_loss_clip": 1.05390525, "balance_loss_mlp": 1.03184092, "epoch": 0.11152863369908313, "flos": 22193835719040.0, "grad_norm": 1.863523240792578, "language_loss": 0.8468501, "learning_rate": 3.9306791378945705e-06, "loss": 0.86900431, "num_input_tokens_seen": 40157745, "step": 1855, "time_per_iteration": 2.7361156940460205 }, { "auxiliary_loss_clip": 0.01147504, "auxiliary_loss_mlp": 0.01064479, "balance_loss_clip": 1.05225825, "balance_loss_mlp": 1.0424726, "epoch": 0.11158875695175109, "flos": 19537524115200.0, "grad_norm": 2.1217067547931756, "language_loss": 0.81187081, "learning_rate": 3.9305774528973205e-06, "loss": 0.83399057, "num_input_tokens_seen": 40175375, "step": 1856, "time_per_iteration": 2.7158002853393555 }, { "auxiliary_loss_clip": 0.01168288, "auxiliary_loss_mlp": 0.01052259, "balance_loss_clip": 1.05843937, "balance_loss_mlp": 1.02957392, "epoch": 0.11164888020441906, "flos": 25442709989760.0, "grad_norm": 2.0555738298465314, "language_loss": 0.82761133, "learning_rate": 3.93047569469238e-06, "loss": 0.8498168, "num_input_tokens_seen": 40195715, "step": 1857, "time_per_iteration": 2.647184133529663 }, { "auxiliary_loss_clip": 0.01144196, "auxiliary_loss_mlp": 0.01044915, "balance_loss_clip": 1.05255508, "balance_loss_mlp": 1.02395833, "epoch": 0.11170900345708702, "flos": 15632741543040.0, "grad_norm": 2.3199985887988914, "language_loss": 0.83131742, "learning_rate": 3.930373863283608e-06, "loss": 0.85320854, "num_input_tokens_seen": 40213975, "step": 1858, "time_per_iteration": 2.726905107498169 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01067658, "balance_loss_clip": 1.04900265, "balance_loss_mlp": 1.04350638, "epoch": 0.111769126709755, "flos": 23039424766080.0, "grad_norm": 2.0395414997027657, "language_loss": 0.9133389, "learning_rate": 3.930271958674866e-06, "loss": 0.93536508, "num_input_tokens_seen": 40233905, "step": 1859, "time_per_iteration": 3.0006766319274902 }, { "auxiliary_loss_clip": 0.01167289, "auxiliary_loss_mlp": 0.01049698, "balance_loss_clip": 1.05445409, "balance_loss_mlp": 1.02751315, "epoch": 0.11182924996242297, "flos": 20850705434880.0, "grad_norm": 2.048197345879043, "language_loss": 0.81528586, "learning_rate": 3.930169980870018e-06, "loss": 0.83745575, "num_input_tokens_seen": 40252810, "step": 1860, "time_per_iteration": 2.7216553688049316 }, { "auxiliary_loss_clip": 0.01154007, "auxiliary_loss_mlp": 0.01060885, "balance_loss_clip": 1.05737674, "balance_loss_mlp": 1.03920078, "epoch": 0.11188937321509093, "flos": 17455315587840.0, "grad_norm": 2.00330439318394, "language_loss": 0.75250578, "learning_rate": 3.930067929872931e-06, "loss": 0.77465475, "num_input_tokens_seen": 40272000, "step": 1861, "time_per_iteration": 2.6878490447998047 }, { "auxiliary_loss_clip": 0.01177651, "auxiliary_loss_mlp": 0.01054452, "balance_loss_clip": 1.0565964, "balance_loss_mlp": 1.03360212, "epoch": 0.11194949646775891, "flos": 24095916518400.0, "grad_norm": 1.9427039767358767, "language_loss": 0.88888168, "learning_rate": 3.929965805687474e-06, "loss": 0.91120267, "num_input_tokens_seen": 40290660, "step": 1862, "time_per_iteration": 2.615057945251465 }, { "auxiliary_loss_clip": 0.01164251, "auxiliary_loss_mlp": 0.01062894, "balance_loss_clip": 1.05994737, "balance_loss_mlp": 1.04086459, "epoch": 0.11200961972042688, "flos": 25153880728320.0, "grad_norm": 2.2273555113866847, "language_loss": 0.87719512, "learning_rate": 3.92986360831752e-06, "loss": 0.89946657, "num_input_tokens_seen": 40307820, "step": 1863, "time_per_iteration": 2.6778175830841064 }, { "auxiliary_loss_clip": 0.01158667, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05455208, "balance_loss_mlp": 1.03071773, "epoch": 0.11206974297309484, "flos": 21288312829440.0, "grad_norm": 2.8013407816012226, "language_loss": 0.64245486, "learning_rate": 3.929761337766945e-06, "loss": 0.66459453, "num_input_tokens_seen": 40327430, "step": 1864, "time_per_iteration": 2.724076509475708 }, { "auxiliary_loss_clip": 0.01110154, "auxiliary_loss_mlp": 0.01047933, "balance_loss_clip": 1.04924703, "balance_loss_mlp": 1.02672601, "epoch": 0.11212986622576282, "flos": 18915982151040.0, "grad_norm": 2.0303098144917135, "language_loss": 0.74043733, "learning_rate": 3.929658994039627e-06, "loss": 0.7620182, "num_input_tokens_seen": 40344545, "step": 1865, "time_per_iteration": 2.8119356632232666 }, { "auxiliary_loss_clip": 0.01114683, "auxiliary_loss_mlp": 0.01070203, "balance_loss_clip": 1.05348182, "balance_loss_mlp": 1.04483545, "epoch": 0.11218998947843078, "flos": 22054754257920.0, "grad_norm": 2.7389427033573375, "language_loss": 0.84692436, "learning_rate": 3.929556577139446e-06, "loss": 0.86877316, "num_input_tokens_seen": 40362300, "step": 1866, "time_per_iteration": 2.8022067546844482 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.00781014, "balance_loss_clip": 1.04227424, "balance_loss_mlp": 1.00006938, "epoch": 0.11225011273109875, "flos": 24571697091840.0, "grad_norm": 1.704208120094955, "language_loss": 0.8104012, "learning_rate": 3.929454087070286e-06, "loss": 0.82913494, "num_input_tokens_seen": 40384720, "step": 1867, "time_per_iteration": 2.915989875793457 }, { "auxiliary_loss_clip": 0.01179505, "auxiliary_loss_mlp": 0.01060529, "balance_loss_clip": 1.05720687, "balance_loss_mlp": 1.03959608, "epoch": 0.11231023598376672, "flos": 28438665621120.0, "grad_norm": 2.0811636681692844, "language_loss": 0.86840278, "learning_rate": 3.929351523836035e-06, "loss": 0.8908031, "num_input_tokens_seen": 40404000, "step": 1868, "time_per_iteration": 2.6855647563934326 }, { "auxiliary_loss_clip": 0.01161412, "auxiliary_loss_mlp": 0.00779977, "balance_loss_clip": 1.06005311, "balance_loss_mlp": 1.00010097, "epoch": 0.1123703592364347, "flos": 14426466076800.0, "grad_norm": 2.1491178409138376, "language_loss": 0.68308532, "learning_rate": 3.9292488874405795e-06, "loss": 0.70249927, "num_input_tokens_seen": 40418665, "step": 1869, "time_per_iteration": 2.7404487133026123 }, { "auxiliary_loss_clip": 0.01133783, "auxiliary_loss_mlp": 0.01066188, "balance_loss_clip": 1.04932964, "balance_loss_mlp": 1.04225063, "epoch": 0.11243048248910266, "flos": 22236282616320.0, "grad_norm": 1.5255545896853626, "language_loss": 0.76943326, "learning_rate": 3.929146177887814e-06, "loss": 0.79143298, "num_input_tokens_seen": 40437870, "step": 1870, "time_per_iteration": 2.809734344482422 }, { "auxiliary_loss_clip": 0.01129358, "auxiliary_loss_mlp": 0.01056867, "balance_loss_clip": 1.0509038, "balance_loss_mlp": 1.03300166, "epoch": 0.11249060574177062, "flos": 18584167288320.0, "grad_norm": 1.8186132867503446, "language_loss": 0.76056099, "learning_rate": 3.929043395181631e-06, "loss": 0.78242326, "num_input_tokens_seen": 40455570, "step": 1871, "time_per_iteration": 2.727161169052124 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01051114, "balance_loss_clip": 1.04993379, "balance_loss_mlp": 1.03026426, "epoch": 0.1125507289944386, "flos": 22856567604480.0, "grad_norm": 1.9425066802508644, "language_loss": 0.81811988, "learning_rate": 3.928940539325929e-06, "loss": 0.83968765, "num_input_tokens_seen": 40473600, "step": 1872, "time_per_iteration": 2.851868152618408 }, { "auxiliary_loss_clip": 0.01179923, "auxiliary_loss_mlp": 0.01055722, "balance_loss_clip": 1.05722499, "balance_loss_mlp": 1.03359652, "epoch": 0.11261085224710657, "flos": 19676390094720.0, "grad_norm": 2.186176467187071, "language_loss": 0.8361913, "learning_rate": 3.9288376103246095e-06, "loss": 0.85854775, "num_input_tokens_seen": 40490025, "step": 1873, "time_per_iteration": 2.6668763160705566 }, { "auxiliary_loss_clip": 0.01144862, "auxiliary_loss_mlp": 0.01054726, "balance_loss_clip": 1.0525465, "balance_loss_mlp": 1.03196871, "epoch": 0.11267097549977453, "flos": 26063246373120.0, "grad_norm": 1.8822875514234196, "language_loss": 0.92342389, "learning_rate": 3.928734608181575e-06, "loss": 0.94541967, "num_input_tokens_seen": 40511580, "step": 1874, "time_per_iteration": 2.700533866882324 }, { "auxiliary_loss_clip": 0.01140327, "auxiliary_loss_mlp": 0.01056402, "balance_loss_clip": 1.05100179, "balance_loss_mlp": 1.03509891, "epoch": 0.11273109875244251, "flos": 21068036674560.0, "grad_norm": 1.6564425098873434, "language_loss": 0.75359404, "learning_rate": 3.928631532900729e-06, "loss": 0.77556133, "num_input_tokens_seen": 40530155, "step": 1875, "time_per_iteration": 2.7642719745635986 }, { "auxiliary_loss_clip": 0.01167091, "auxiliary_loss_mlp": 0.01055271, "balance_loss_clip": 1.05893159, "balance_loss_mlp": 1.0348264, "epoch": 0.11279122200511048, "flos": 27088999061760.0, "grad_norm": 2.12758140825061, "language_loss": 0.71578634, "learning_rate": 3.928528384485984e-06, "loss": 0.73800993, "num_input_tokens_seen": 40549500, "step": 1876, "time_per_iteration": 2.8505096435546875 }, { "auxiliary_loss_clip": 0.01147417, "auxiliary_loss_mlp": 0.01054094, "balance_loss_clip": 1.05223966, "balance_loss_mlp": 1.03200495, "epoch": 0.11285134525777844, "flos": 20187901722240.0, "grad_norm": 1.8103612630164048, "language_loss": 0.76795971, "learning_rate": 3.9284251629412475e-06, "loss": 0.78997481, "num_input_tokens_seen": 40567475, "step": 1877, "time_per_iteration": 2.6972849369049072 }, { "auxiliary_loss_clip": 0.01168106, "auxiliary_loss_mlp": 0.01063056, "balance_loss_clip": 1.05518627, "balance_loss_mlp": 1.04026341, "epoch": 0.11291146851044641, "flos": 12458453863680.0, "grad_norm": 2.1601834607000368, "language_loss": 0.87843502, "learning_rate": 3.928321868270436e-06, "loss": 0.90074658, "num_input_tokens_seen": 40583280, "step": 1878, "time_per_iteration": 5.6992692947387695 }, { "auxiliary_loss_clip": 0.01140682, "auxiliary_loss_mlp": 0.01054902, "balance_loss_clip": 1.05420399, "balance_loss_mlp": 1.03333724, "epoch": 0.11297159176311439, "flos": 23842315520640.0, "grad_norm": 2.151084139284284, "language_loss": 0.81623232, "learning_rate": 3.928218500477466e-06, "loss": 0.83818817, "num_input_tokens_seen": 40603080, "step": 1879, "time_per_iteration": 2.8688366413116455 }, { "auxiliary_loss_clip": 0.01155904, "auxiliary_loss_mlp": 0.01059079, "balance_loss_clip": 1.05238748, "balance_loss_mlp": 1.03609526, "epoch": 0.11303171501578235, "flos": 29930538124800.0, "grad_norm": 1.941623939252122, "language_loss": 0.70234305, "learning_rate": 3.928115059566259e-06, "loss": 0.72449279, "num_input_tokens_seen": 40623255, "step": 1880, "time_per_iteration": 5.567574739456177 }, { "auxiliary_loss_clip": 0.01155691, "auxiliary_loss_mlp": 0.01052309, "balance_loss_clip": 1.05585837, "balance_loss_mlp": 1.0306015, "epoch": 0.11309183826845032, "flos": 16180558842240.0, "grad_norm": 1.6696082535169858, "language_loss": 0.72690225, "learning_rate": 3.928011545540734e-06, "loss": 0.74898225, "num_input_tokens_seen": 40641570, "step": 1881, "time_per_iteration": 2.792428493499756 }, { "auxiliary_loss_clip": 0.011425, "auxiliary_loss_mlp": 0.00781179, "balance_loss_clip": 1.05046606, "balance_loss_mlp": 1.00008667, "epoch": 0.1131519615211183, "flos": 12020702814720.0, "grad_norm": 2.2964043184115783, "language_loss": 0.74205768, "learning_rate": 3.927907958404819e-06, "loss": 0.76129448, "num_input_tokens_seen": 40658775, "step": 1882, "time_per_iteration": 4.414916515350342 }, { "auxiliary_loss_clip": 0.01177281, "auxiliary_loss_mlp": 0.01054815, "balance_loss_clip": 1.05680335, "balance_loss_mlp": 1.03203452, "epoch": 0.11321208477378626, "flos": 26250125857920.0, "grad_norm": 2.4326158086005965, "language_loss": 0.7923016, "learning_rate": 3.92780429816244e-06, "loss": 0.81462252, "num_input_tokens_seen": 40679555, "step": 1883, "time_per_iteration": 2.762615919113159 }, { "auxiliary_loss_clip": 0.01140926, "auxiliary_loss_mlp": 0.01058465, "balance_loss_clip": 1.05226314, "balance_loss_mlp": 1.03520727, "epoch": 0.11327220802645423, "flos": 13626376583040.0, "grad_norm": 2.2898863699254974, "language_loss": 0.77047318, "learning_rate": 3.927700564817529e-06, "loss": 0.79246712, "num_input_tokens_seen": 40697295, "step": 1884, "time_per_iteration": 2.835468292236328 }, { "auxiliary_loss_clip": 0.01074478, "auxiliary_loss_mlp": 0.01009476, "balance_loss_clip": 1.03993821, "balance_loss_mlp": 1.00620937, "epoch": 0.1133323312791222, "flos": 57191802814080.0, "grad_norm": 0.8138652948403053, "language_loss": 0.55151373, "learning_rate": 3.927596758374019e-06, "loss": 0.5723533, "num_input_tokens_seen": 40758095, "step": 1885, "time_per_iteration": 3.179532289505005 }, { "auxiliary_loss_clip": 0.01083888, "auxiliary_loss_mlp": 0.01050751, "balance_loss_clip": 1.04415166, "balance_loss_mlp": 1.02910316, "epoch": 0.11339245453179017, "flos": 24351708245760.0, "grad_norm": 1.9836288003076585, "language_loss": 0.90384823, "learning_rate": 3.927492878835848e-06, "loss": 0.92519462, "num_input_tokens_seen": 40777140, "step": 1886, "time_per_iteration": 3.038928747177124 }, { "auxiliary_loss_clip": 0.01116325, "auxiliary_loss_mlp": 0.01057697, "balance_loss_clip": 1.05137897, "balance_loss_mlp": 1.03634632, "epoch": 0.11345257778445814, "flos": 22670693700480.0, "grad_norm": 2.0132756022974023, "language_loss": 0.84852886, "learning_rate": 3.927388926206953e-06, "loss": 0.87026906, "num_input_tokens_seen": 40797505, "step": 1887, "time_per_iteration": 3.178863048553467 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.01056557, "balance_loss_clip": 1.05091035, "balance_loss_mlp": 1.03549314, "epoch": 0.11351270103712612, "flos": 20988242611200.0, "grad_norm": 2.847610033990257, "language_loss": 0.75826252, "learning_rate": 3.927284900491277e-06, "loss": 0.78019381, "num_input_tokens_seen": 40812970, "step": 1888, "time_per_iteration": 2.7349846363067627 }, { "auxiliary_loss_clip": 0.0113463, "auxiliary_loss_mlp": 0.01062359, "balance_loss_clip": 1.05614805, "balance_loss_mlp": 1.03892243, "epoch": 0.11357282428979408, "flos": 37347923600640.0, "grad_norm": 2.0598279187313624, "language_loss": 0.68104899, "learning_rate": 3.927180801692764e-06, "loss": 0.7030189, "num_input_tokens_seen": 40837745, "step": 1889, "time_per_iteration": 3.144444465637207 }, { "auxiliary_loss_clip": 0.01177206, "auxiliary_loss_mlp": 0.01049162, "balance_loss_clip": 1.05653095, "balance_loss_mlp": 1.02694094, "epoch": 0.11363294754246205, "flos": 21757018423680.0, "grad_norm": 1.7896678692754837, "language_loss": 0.83947051, "learning_rate": 3.927076629815362e-06, "loss": 0.86173415, "num_input_tokens_seen": 40856490, "step": 1890, "time_per_iteration": 2.73126482963562 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01056017, "balance_loss_clip": 1.05039728, "balance_loss_mlp": 1.03395164, "epoch": 0.11369307079513001, "flos": 22601637803520.0, "grad_norm": 2.1678723202845256, "language_loss": 0.64663875, "learning_rate": 3.926972384863022e-06, "loss": 0.66865045, "num_input_tokens_seen": 40874070, "step": 1891, "time_per_iteration": 2.7474160194396973 }, { "auxiliary_loss_clip": 0.01145505, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05395687, "balance_loss_mlp": 1.02773631, "epoch": 0.11375319404779799, "flos": 21944257044480.0, "grad_norm": 2.126575023047711, "language_loss": 0.87889415, "learning_rate": 3.9268680668396956e-06, "loss": 0.90083933, "num_input_tokens_seen": 40892425, "step": 1892, "time_per_iteration": 2.795269250869751 }, { "auxiliary_loss_clip": 0.01119535, "auxiliary_loss_mlp": 0.01079586, "balance_loss_clip": 1.05541015, "balance_loss_mlp": 1.05461168, "epoch": 0.11381331730046595, "flos": 26395456285440.0, "grad_norm": 3.1806920305576973, "language_loss": 0.72902197, "learning_rate": 3.926763675749339e-06, "loss": 0.75101316, "num_input_tokens_seen": 40912190, "step": 1893, "time_per_iteration": 2.890289306640625 }, { "auxiliary_loss_clip": 0.01175698, "auxiliary_loss_mlp": 0.0106591, "balance_loss_clip": 1.05438137, "balance_loss_mlp": 1.04290223, "epoch": 0.11387344055313392, "flos": 23804716959360.0, "grad_norm": 1.8842571229841023, "language_loss": 0.79247093, "learning_rate": 3.92665921159591e-06, "loss": 0.81488699, "num_input_tokens_seen": 40928395, "step": 1894, "time_per_iteration": 2.6820743083953857 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01061956, "balance_loss_clip": 1.05356526, "balance_loss_mlp": 1.03944933, "epoch": 0.1139335638058019, "flos": 34522865902080.0, "grad_norm": 3.429237983174195, "language_loss": 0.79718482, "learning_rate": 3.926554674383371e-06, "loss": 0.81930667, "num_input_tokens_seen": 40946555, "step": 1895, "time_per_iteration": 2.829946994781494 }, { "auxiliary_loss_clip": 0.01075529, "auxiliary_loss_mlp": 0.01018518, "balance_loss_clip": 1.03062391, "balance_loss_mlp": 1.0155375, "epoch": 0.11399368705846986, "flos": 70587811520640.0, "grad_norm": 0.8041110638842961, "language_loss": 0.63357508, "learning_rate": 3.926450064115686e-06, "loss": 0.65451556, "num_input_tokens_seen": 41004910, "step": 1896, "time_per_iteration": 3.3087315559387207 }, { "auxiliary_loss_clip": 0.01147265, "auxiliary_loss_mlp": 0.0106086, "balance_loss_clip": 1.05560398, "balance_loss_mlp": 1.03663635, "epoch": 0.11405381031113783, "flos": 21324259365120.0, "grad_norm": 1.5952307342327186, "language_loss": 0.85055745, "learning_rate": 3.926345380796821e-06, "loss": 0.8726387, "num_input_tokens_seen": 41026385, "step": 1897, "time_per_iteration": 2.8522274494171143 }, { "auxiliary_loss_clip": 0.0117836, "auxiliary_loss_mlp": 0.00780276, "balance_loss_clip": 1.05591989, "balance_loss_mlp": 1.0001986, "epoch": 0.11411393356380581, "flos": 19719627091200.0, "grad_norm": 3.3624139627125587, "language_loss": 0.79675245, "learning_rate": 3.9262406244307465e-06, "loss": 0.81633884, "num_input_tokens_seen": 41045315, "step": 1898, "time_per_iteration": 2.760057210922241 }, { "auxiliary_loss_clip": 0.01115338, "auxiliary_loss_mlp": 0.01064417, "balance_loss_clip": 1.04594529, "balance_loss_mlp": 1.03965724, "epoch": 0.11417405681647377, "flos": 17530440883200.0, "grad_norm": 2.0191769665152903, "language_loss": 0.73251313, "learning_rate": 3.926135795021435e-06, "loss": 0.75431061, "num_input_tokens_seen": 41063390, "step": 1899, "time_per_iteration": 2.7363204956054688 }, { "auxiliary_loss_clip": 0.01042449, "auxiliary_loss_mlp": 0.01003313, "balance_loss_clip": 1.03643703, "balance_loss_mlp": 1.0003922, "epoch": 0.11423418006914174, "flos": 59674666619520.0, "grad_norm": 0.9089505356695228, "language_loss": 0.63434029, "learning_rate": 3.92603089257286e-06, "loss": 0.65479791, "num_input_tokens_seen": 41124180, "step": 1900, "time_per_iteration": 3.2045955657958984 }, { "auxiliary_loss_clip": 0.01113626, "auxiliary_loss_mlp": 0.01066815, "balance_loss_clip": 1.04929233, "balance_loss_mlp": 1.04378414, "epoch": 0.1142943033218097, "flos": 22963114321920.0, "grad_norm": 1.577500478750639, "language_loss": 0.77943742, "learning_rate": 3.925925917089001e-06, "loss": 0.80124187, "num_input_tokens_seen": 41143485, "step": 1901, "time_per_iteration": 2.745089530944824 }, { "auxiliary_loss_clip": 0.01171621, "auxiliary_loss_mlp": 0.01057834, "balance_loss_clip": 1.05803061, "balance_loss_mlp": 1.0359118, "epoch": 0.11435442657447768, "flos": 18256267008000.0, "grad_norm": 2.175933638179557, "language_loss": 0.84158623, "learning_rate": 3.925820868573839e-06, "loss": 0.86388075, "num_input_tokens_seen": 41161695, "step": 1902, "time_per_iteration": 2.6433799266815186 }, { "auxiliary_loss_clip": 0.01159941, "auxiliary_loss_mlp": 0.01056662, "balance_loss_clip": 1.05280399, "balance_loss_mlp": 1.03122306, "epoch": 0.11441454982714565, "flos": 24061191045120.0, "grad_norm": 1.7702735053047673, "language_loss": 0.77720451, "learning_rate": 3.925715747031356e-06, "loss": 0.79937053, "num_input_tokens_seen": 41181715, "step": 1903, "time_per_iteration": 2.6385905742645264 }, { "auxiliary_loss_clip": 0.01145143, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.05293322, "balance_loss_mlp": 1.02174175, "epoch": 0.11447467307981361, "flos": 25337707557120.0, "grad_norm": 2.212790565732917, "language_loss": 0.75751555, "learning_rate": 3.925610552465539e-06, "loss": 0.77938658, "num_input_tokens_seen": 41201770, "step": 1904, "time_per_iteration": 2.632152557373047 }, { "auxiliary_loss_clip": 0.01149375, "auxiliary_loss_mlp": 0.01056532, "balance_loss_clip": 1.05207586, "balance_loss_mlp": 1.03279781, "epoch": 0.11453479633248159, "flos": 21726063878400.0, "grad_norm": 2.4422699353972006, "language_loss": 0.91853034, "learning_rate": 3.9255052848803764e-06, "loss": 0.94058943, "num_input_tokens_seen": 41220590, "step": 1905, "time_per_iteration": 2.7421486377716064 }, { "auxiliary_loss_clip": 0.01161686, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.04978943, "balance_loss_mlp": 1.02612448, "epoch": 0.11459491958514956, "flos": 12969714096000.0, "grad_norm": 2.5117992419356066, "language_loss": 0.77484202, "learning_rate": 3.925399944279861e-06, "loss": 0.79696143, "num_input_tokens_seen": 41237250, "step": 1906, "time_per_iteration": 2.69333553314209 }, { "auxiliary_loss_clip": 0.0117911, "auxiliary_loss_mlp": 0.01055129, "balance_loss_clip": 1.05697322, "balance_loss_mlp": 1.03222847, "epoch": 0.11465504283781752, "flos": 22711273090560.0, "grad_norm": 2.0720467666322113, "language_loss": 0.81739306, "learning_rate": 3.925294530667986e-06, "loss": 0.83973539, "num_input_tokens_seen": 41256680, "step": 1907, "time_per_iteration": 2.6531317234039307 }, { "auxiliary_loss_clip": 0.0113647, "auxiliary_loss_mlp": 0.01065473, "balance_loss_clip": 1.05235374, "balance_loss_mlp": 1.04227471, "epoch": 0.1147151660904855, "flos": 23398387332480.0, "grad_norm": 2.1769364553121293, "language_loss": 0.84901214, "learning_rate": 3.92518904404875e-06, "loss": 0.87103164, "num_input_tokens_seen": 41270955, "step": 1908, "time_per_iteration": 2.8768258094787598 }, { "auxiliary_loss_clip": 0.01029536, "auxiliary_loss_mlp": 0.01020856, "balance_loss_clip": 1.02524137, "balance_loss_mlp": 1.01694632, "epoch": 0.11477528934315347, "flos": 63011843498880.0, "grad_norm": 0.9197306473097341, "language_loss": 0.61072773, "learning_rate": 3.925083484426153e-06, "loss": 0.63123173, "num_input_tokens_seen": 41319180, "step": 1909, "time_per_iteration": 3.0845727920532227 }, { "auxiliary_loss_clip": 0.01182744, "auxiliary_loss_mlp": 0.01054075, "balance_loss_clip": 1.06014562, "balance_loss_mlp": 1.03219986, "epoch": 0.11483541259582143, "flos": 16325601960960.0, "grad_norm": 7.319166590530674, "language_loss": 0.79170966, "learning_rate": 3.924977851804197e-06, "loss": 0.81407785, "num_input_tokens_seen": 41337480, "step": 1910, "time_per_iteration": 2.708704710006714 }, { "auxiliary_loss_clip": 0.01156489, "auxiliary_loss_mlp": 0.01052406, "balance_loss_clip": 1.0580864, "balance_loss_mlp": 1.03029275, "epoch": 0.1148955358484894, "flos": 21580410228480.0, "grad_norm": 2.117911712245717, "language_loss": 0.7702589, "learning_rate": 3.9248721461868875e-06, "loss": 0.79234779, "num_input_tokens_seen": 41354650, "step": 1911, "time_per_iteration": 2.7597720623016357 }, { "auxiliary_loss_clip": 0.01159986, "auxiliary_loss_mlp": 0.01054599, "balance_loss_clip": 1.05726957, "balance_loss_mlp": 1.03227139, "epoch": 0.11495565910115738, "flos": 27673696650240.0, "grad_norm": 1.677508784227342, "language_loss": 0.79177421, "learning_rate": 3.9247663675782336e-06, "loss": 0.81392002, "num_input_tokens_seen": 41376935, "step": 1912, "time_per_iteration": 2.8143310546875 }, { "auxiliary_loss_clip": 0.01183047, "auxiliary_loss_mlp": 0.00779659, "balance_loss_clip": 1.06065917, "balance_loss_mlp": 1.00014925, "epoch": 0.11501578235382534, "flos": 20632368614400.0, "grad_norm": 2.291252405113977, "language_loss": 0.77942276, "learning_rate": 3.924660515982246e-06, "loss": 0.79904979, "num_input_tokens_seen": 41396105, "step": 1913, "time_per_iteration": 2.696430206298828 }, { "auxiliary_loss_clip": 0.01166892, "auxiliary_loss_mlp": 0.01052769, "balance_loss_clip": 1.05442226, "balance_loss_mlp": 1.02953506, "epoch": 0.1150759056064933, "flos": 19829046896640.0, "grad_norm": 1.8145547055361753, "language_loss": 0.7003395, "learning_rate": 3.924554591402939e-06, "loss": 0.72253609, "num_input_tokens_seen": 41415600, "step": 1914, "time_per_iteration": 2.739251136779785 }, { "auxiliary_loss_clip": 0.01007182, "auxiliary_loss_mlp": 0.01004682, "balance_loss_clip": 1.02677619, "balance_loss_mlp": 1.00191641, "epoch": 0.11513602885916129, "flos": 70045776311040.0, "grad_norm": 0.7558771871458172, "language_loss": 0.61059874, "learning_rate": 3.92444859384433e-06, "loss": 0.6307174, "num_input_tokens_seen": 41478760, "step": 1915, "time_per_iteration": 3.56019926071167 }, { "auxiliary_loss_clip": 0.01166434, "auxiliary_loss_mlp": 0.01058573, "balance_loss_clip": 1.05994964, "balance_loss_mlp": 1.03595936, "epoch": 0.11519615211182925, "flos": 15741730385280.0, "grad_norm": 2.437201506258279, "language_loss": 0.93116963, "learning_rate": 3.924342523310436e-06, "loss": 0.95341969, "num_input_tokens_seen": 41495720, "step": 1916, "time_per_iteration": 3.244772434234619 }, { "auxiliary_loss_clip": 0.01161132, "auxiliary_loss_mlp": 0.01059827, "balance_loss_clip": 1.05798697, "balance_loss_mlp": 1.03470993, "epoch": 0.11525627536449722, "flos": 20667632791680.0, "grad_norm": 1.8909260082350545, "language_loss": 0.72560197, "learning_rate": 3.9242363798052806e-06, "loss": 0.74781156, "num_input_tokens_seen": 41513585, "step": 1917, "time_per_iteration": 4.502236843109131 }, { "auxiliary_loss_clip": 0.01138773, "auxiliary_loss_mlp": 0.0104964, "balance_loss_clip": 1.05739903, "balance_loss_mlp": 1.02700245, "epoch": 0.1153163986171652, "flos": 20303283185280.0, "grad_norm": 9.147356795176979, "language_loss": 0.74213129, "learning_rate": 3.92413016333289e-06, "loss": 0.76401544, "num_input_tokens_seen": 41533390, "step": 1918, "time_per_iteration": 4.344711065292358 }, { "auxiliary_loss_clip": 0.0114898, "auxiliary_loss_mlp": 0.010469, "balance_loss_clip": 1.05532503, "balance_loss_mlp": 1.02450073, "epoch": 0.11537652186983316, "flos": 17639321984640.0, "grad_norm": 3.182152136597976, "language_loss": 0.86367452, "learning_rate": 3.92402387389729e-06, "loss": 0.88563335, "num_input_tokens_seen": 41551015, "step": 1919, "time_per_iteration": 4.540036201477051 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.01067867, "balance_loss_clip": 1.0496366, "balance_loss_mlp": 1.04172444, "epoch": 0.11543664512250112, "flos": 21069401391360.0, "grad_norm": 1.93595243799445, "language_loss": 0.86735415, "learning_rate": 3.923917511502512e-06, "loss": 0.8893733, "num_input_tokens_seen": 41568055, "step": 1920, "time_per_iteration": 2.7719242572784424 }, { "auxiliary_loss_clip": 0.011686, "auxiliary_loss_mlp": 0.010528, "balance_loss_clip": 1.0593946, "balance_loss_mlp": 1.0302341, "epoch": 0.11549676837516909, "flos": 22747542848640.0, "grad_norm": 4.512761907267092, "language_loss": 0.79294932, "learning_rate": 3.923811076152589e-06, "loss": 0.81516337, "num_input_tokens_seen": 41587435, "step": 1921, "time_per_iteration": 2.798673629760742 }, { "auxiliary_loss_clip": 0.01174604, "auxiliary_loss_mlp": 0.01063526, "balance_loss_clip": 1.05685806, "balance_loss_mlp": 1.04007721, "epoch": 0.11555689162783707, "flos": 19168972617600.0, "grad_norm": 2.4057040360661484, "language_loss": 0.78464305, "learning_rate": 3.923704567851557e-06, "loss": 0.80702436, "num_input_tokens_seen": 41604975, "step": 1922, "time_per_iteration": 4.352341651916504 }, { "auxiliary_loss_clip": 0.01092284, "auxiliary_loss_mlp": 0.01064602, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.04229808, "epoch": 0.11561701488050503, "flos": 24572056227840.0, "grad_norm": 1.8560991769949675, "language_loss": 0.84293079, "learning_rate": 3.923597986603456e-06, "loss": 0.86449969, "num_input_tokens_seen": 41626155, "step": 1923, "time_per_iteration": 3.2956740856170654 }, { "auxiliary_loss_clip": 0.01171957, "auxiliary_loss_mlp": 0.01056739, "balance_loss_clip": 1.0600003, "balance_loss_mlp": 1.03317094, "epoch": 0.115677138133173, "flos": 17092546179840.0, "grad_norm": 1.944851076041885, "language_loss": 0.80890471, "learning_rate": 3.9234913324123264e-06, "loss": 0.83119166, "num_input_tokens_seen": 41644805, "step": 1924, "time_per_iteration": 3.0939247608184814 }, { "auxiliary_loss_clip": 0.01055916, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.03045607, "balance_loss_mlp": 1.02436543, "epoch": 0.11573726138584098, "flos": 62703875266560.0, "grad_norm": 0.8171642061509322, "language_loss": 0.61196578, "learning_rate": 3.923384605282212e-06, "loss": 0.63279623, "num_input_tokens_seen": 41709345, "step": 1925, "time_per_iteration": 3.3765265941619873 }, { "auxiliary_loss_clip": 0.01155845, "auxiliary_loss_mlp": 0.01079328, "balance_loss_clip": 1.05374098, "balance_loss_mlp": 1.0549382, "epoch": 0.11579738463850894, "flos": 22601135013120.0, "grad_norm": 1.7772533553430212, "language_loss": 0.74766397, "learning_rate": 3.923277805217161e-06, "loss": 0.77001572, "num_input_tokens_seen": 41730210, "step": 1926, "time_per_iteration": 2.754974126815796 }, { "auxiliary_loss_clip": 0.01116228, "auxiliary_loss_mlp": 0.00781701, "balance_loss_clip": 1.04683304, "balance_loss_mlp": 1.00016665, "epoch": 0.11585750789117691, "flos": 21726135705600.0, "grad_norm": 4.731879086182685, "language_loss": 0.71978599, "learning_rate": 3.923170932221222e-06, "loss": 0.7387653, "num_input_tokens_seen": 41750270, "step": 1927, "time_per_iteration": 2.9454004764556885 }, { "auxiliary_loss_clip": 0.01137955, "auxiliary_loss_mlp": 0.01058796, "balance_loss_clip": 1.05250621, "balance_loss_mlp": 1.03572917, "epoch": 0.11591763114384489, "flos": 26287544851200.0, "grad_norm": 1.5938674022456252, "language_loss": 0.86854041, "learning_rate": 3.92306398629845e-06, "loss": 0.89050794, "num_input_tokens_seen": 41772975, "step": 1928, "time_per_iteration": 2.832750082015991 }, { "auxiliary_loss_clip": 0.01129041, "auxiliary_loss_mlp": 0.01060836, "balance_loss_clip": 1.05032003, "balance_loss_mlp": 1.03706551, "epoch": 0.11597775439651285, "flos": 23000461488000.0, "grad_norm": 1.6639520350020578, "language_loss": 0.77450585, "learning_rate": 3.922956967452898e-06, "loss": 0.79640466, "num_input_tokens_seen": 41791765, "step": 1929, "time_per_iteration": 2.7876811027526855 }, { "auxiliary_loss_clip": 0.01176887, "auxiliary_loss_mlp": 0.01063611, "balance_loss_clip": 1.05667901, "balance_loss_mlp": 1.0424509, "epoch": 0.11603787764918082, "flos": 31941715507200.0, "grad_norm": 1.8085677541874856, "language_loss": 0.76831949, "learning_rate": 3.922849875688626e-06, "loss": 0.79072452, "num_input_tokens_seen": 41815615, "step": 1930, "time_per_iteration": 2.819934844970703 }, { "auxiliary_loss_clip": 0.01145781, "auxiliary_loss_mlp": 0.01054046, "balance_loss_clip": 1.05066586, "balance_loss_mlp": 1.03165817, "epoch": 0.1160980009018488, "flos": 22271654534400.0, "grad_norm": 1.9434791543130712, "language_loss": 0.72291863, "learning_rate": 3.922742711009693e-06, "loss": 0.74491692, "num_input_tokens_seen": 41834810, "step": 1931, "time_per_iteration": 2.8078088760375977 }, { "auxiliary_loss_clip": 0.01146409, "auxiliary_loss_mlp": 0.01061336, "balance_loss_clip": 1.05090261, "balance_loss_mlp": 1.03575325, "epoch": 0.11615812415451676, "flos": 22783633038720.0, "grad_norm": 1.7378937044391531, "language_loss": 0.8222791, "learning_rate": 3.922635473420164e-06, "loss": 0.8443566, "num_input_tokens_seen": 41854975, "step": 1932, "time_per_iteration": 2.7495200634002686 }, { "auxiliary_loss_clip": 0.01030493, "auxiliary_loss_mlp": 0.01018834, "balance_loss_clip": 1.02184403, "balance_loss_mlp": 1.01556778, "epoch": 0.11621824740718473, "flos": 67146096107520.0, "grad_norm": 0.7669378012870447, "language_loss": 0.61050332, "learning_rate": 3.922528162924105e-06, "loss": 0.63099658, "num_input_tokens_seen": 41911105, "step": 1933, "time_per_iteration": 3.256678581237793 }, { "auxiliary_loss_clip": 0.01108577, "auxiliary_loss_mlp": 0.00780156, "balance_loss_clip": 1.04764509, "balance_loss_mlp": 1.00006175, "epoch": 0.11627837065985269, "flos": 20375930442240.0, "grad_norm": 2.830760437639296, "language_loss": 0.85790741, "learning_rate": 3.922420779525586e-06, "loss": 0.8767947, "num_input_tokens_seen": 41931750, "step": 1934, "time_per_iteration": 2.9144253730773926 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01059839, "balance_loss_clip": 1.04929256, "balance_loss_mlp": 1.03453088, "epoch": 0.11633849391252067, "flos": 21725812483200.0, "grad_norm": 2.625764216143105, "language_loss": 0.66222906, "learning_rate": 3.9223133232286776e-06, "loss": 0.68400419, "num_input_tokens_seen": 41949400, "step": 1935, "time_per_iteration": 2.867152452468872 }, { "auxiliary_loss_clip": 0.01183991, "auxiliary_loss_mlp": 0.01052492, "balance_loss_clip": 1.05868936, "balance_loss_mlp": 1.03111792, "epoch": 0.11639861716518864, "flos": 18805341283200.0, "grad_norm": 2.025938843377603, "language_loss": 0.75678742, "learning_rate": 3.922205794037456e-06, "loss": 0.77915227, "num_input_tokens_seen": 41968100, "step": 1936, "time_per_iteration": 2.7282185554504395 }, { "auxiliary_loss_clip": 0.01179718, "auxiliary_loss_mlp": 0.01049532, "balance_loss_clip": 1.05632091, "balance_loss_mlp": 1.02639306, "epoch": 0.1164587404178566, "flos": 21214983214080.0, "grad_norm": 2.0032002399718905, "language_loss": 0.84086847, "learning_rate": 3.922098191955998e-06, "loss": 0.86316097, "num_input_tokens_seen": 41986375, "step": 1937, "time_per_iteration": 2.715386152267456 }, { "auxiliary_loss_clip": 0.01152084, "auxiliary_loss_mlp": 0.01048961, "balance_loss_clip": 1.05258632, "balance_loss_mlp": 1.0268234, "epoch": 0.11651886367052458, "flos": 27818632028160.0, "grad_norm": 3.0485930101216607, "language_loss": 0.7617709, "learning_rate": 3.921990516988384e-06, "loss": 0.78378135, "num_input_tokens_seen": 42006055, "step": 1938, "time_per_iteration": 2.7624804973602295 }, { "auxiliary_loss_clip": 0.01182576, "auxiliary_loss_mlp": 0.01055104, "balance_loss_clip": 1.05742419, "balance_loss_mlp": 1.03250146, "epoch": 0.11657898692319255, "flos": 22889569224960.0, "grad_norm": 1.7682499083089231, "language_loss": 0.79677606, "learning_rate": 3.921882769138696e-06, "loss": 0.81915289, "num_input_tokens_seen": 42024995, "step": 1939, "time_per_iteration": 2.71458101272583 }, { "auxiliary_loss_clip": 0.01148291, "auxiliary_loss_mlp": 0.01057951, "balance_loss_clip": 1.05209351, "balance_loss_mlp": 1.03508627, "epoch": 0.11663911017586051, "flos": 24315905364480.0, "grad_norm": 2.2281245193552475, "language_loss": 0.85916591, "learning_rate": 3.9217749484110215e-06, "loss": 0.88122833, "num_input_tokens_seen": 42042640, "step": 1940, "time_per_iteration": 2.7322728633880615 }, { "auxiliary_loss_clip": 0.01153746, "auxiliary_loss_mlp": 0.01056301, "balance_loss_clip": 1.05659437, "balance_loss_mlp": 1.03548717, "epoch": 0.11669923342852849, "flos": 42340152470400.0, "grad_norm": 1.4952807995381137, "language_loss": 0.75590646, "learning_rate": 3.921667054809449e-06, "loss": 0.77800703, "num_input_tokens_seen": 42067005, "step": 1941, "time_per_iteration": 2.9211390018463135 }, { "auxiliary_loss_clip": 0.01149585, "auxiliary_loss_mlp": 0.00780203, "balance_loss_clip": 1.05181897, "balance_loss_mlp": 1.00006557, "epoch": 0.11675935668119646, "flos": 14642288945280.0, "grad_norm": 2.277225749463833, "language_loss": 0.88847101, "learning_rate": 3.921559088338068e-06, "loss": 0.90776885, "num_input_tokens_seen": 42082295, "step": 1942, "time_per_iteration": 2.7145469188690186 }, { "auxiliary_loss_clip": 0.01165183, "auxiliary_loss_mlp": 0.01056257, "balance_loss_clip": 1.05553317, "balance_loss_mlp": 1.03552663, "epoch": 0.11681947993386442, "flos": 35116470063360.0, "grad_norm": 1.6547450593003057, "language_loss": 0.67979252, "learning_rate": 3.921451049000975e-06, "loss": 0.70200694, "num_input_tokens_seen": 42105295, "step": 1943, "time_per_iteration": 2.789701461791992 }, { "auxiliary_loss_clip": 0.01153022, "auxiliary_loss_mlp": 0.01047648, "balance_loss_clip": 1.05515063, "balance_loss_mlp": 1.02591634, "epoch": 0.11687960318653239, "flos": 38983259024640.0, "grad_norm": 1.9817763000300312, "language_loss": 0.69831288, "learning_rate": 3.921342936802265e-06, "loss": 0.72031963, "num_input_tokens_seen": 42125520, "step": 1944, "time_per_iteration": 2.827150583267212 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.05038309, "balance_loss_mlp": 1.03158641, "epoch": 0.11693972643920036, "flos": 25994980575360.0, "grad_norm": 1.4963028532298175, "language_loss": 0.82662582, "learning_rate": 3.921234751746038e-06, "loss": 0.84870374, "num_input_tokens_seen": 42146335, "step": 1945, "time_per_iteration": 2.7190194129943848 }, { "auxiliary_loss_clip": 0.01137101, "auxiliary_loss_mlp": 0.01062082, "balance_loss_clip": 1.04682803, "balance_loss_mlp": 1.04005265, "epoch": 0.11699984969186833, "flos": 27272107618560.0, "grad_norm": 2.3643045784637735, "language_loss": 0.76298034, "learning_rate": 3.9211264938363975e-06, "loss": 0.78497219, "num_input_tokens_seen": 42165320, "step": 1946, "time_per_iteration": 2.792555093765259 }, { "auxiliary_loss_clip": 0.01134728, "auxiliary_loss_mlp": 0.01056112, "balance_loss_clip": 1.0507704, "balance_loss_mlp": 1.03536999, "epoch": 0.1170599729445363, "flos": 15267853232640.0, "grad_norm": 2.058923240355934, "language_loss": 0.69014907, "learning_rate": 3.921018163077448e-06, "loss": 0.71205747, "num_input_tokens_seen": 42182955, "step": 1947, "time_per_iteration": 2.643807888031006 }, { "auxiliary_loss_clip": 0.01154759, "auxiliary_loss_mlp": 0.01067767, "balance_loss_clip": 1.05707347, "balance_loss_mlp": 1.04604673, "epoch": 0.11712009619720427, "flos": 17164439251200.0, "grad_norm": 2.0690991629011615, "language_loss": 0.85044622, "learning_rate": 3.920909759473295e-06, "loss": 0.87267148, "num_input_tokens_seen": 42200760, "step": 1948, "time_per_iteration": 2.6399292945861816 }, { "auxiliary_loss_clip": 0.01051031, "auxiliary_loss_mlp": 0.0075782, "balance_loss_clip": 1.0245688, "balance_loss_mlp": 0.99997467, "epoch": 0.11718021944987224, "flos": 70940991997440.0, "grad_norm": 0.8206821069070506, "language_loss": 0.65139282, "learning_rate": 3.920801283028054e-06, "loss": 0.66948134, "num_input_tokens_seen": 42265745, "step": 1949, "time_per_iteration": 3.3030900955200195 }, { "auxiliary_loss_clip": 0.01159399, "auxiliary_loss_mlp": 0.01061163, "balance_loss_clip": 1.05735683, "balance_loss_mlp": 1.04054022, "epoch": 0.1172403427025402, "flos": 27453456408960.0, "grad_norm": 1.512876015443777, "language_loss": 0.71746683, "learning_rate": 3.920692733745835e-06, "loss": 0.73967248, "num_input_tokens_seen": 42286245, "step": 1950, "time_per_iteration": 2.739341974258423 }, { "auxiliary_loss_clip": 0.01175731, "auxiliary_loss_mlp": 0.01061149, "balance_loss_clip": 1.06152189, "balance_loss_mlp": 1.03907192, "epoch": 0.11730046595520818, "flos": 15668723992320.0, "grad_norm": 2.1258853115079996, "language_loss": 0.76671386, "learning_rate": 3.920584111630755e-06, "loss": 0.78908259, "num_input_tokens_seen": 42302710, "step": 1951, "time_per_iteration": 2.624788999557495 }, { "auxiliary_loss_clip": 0.01129104, "auxiliary_loss_mlp": 0.0106562, "balance_loss_clip": 1.05285251, "balance_loss_mlp": 1.04435349, "epoch": 0.11736058920787615, "flos": 25630164092160.0, "grad_norm": 1.7264952730121887, "language_loss": 0.75964963, "learning_rate": 3.9204754166869325e-06, "loss": 0.7815969, "num_input_tokens_seen": 42324115, "step": 1952, "time_per_iteration": 2.824826955795288 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01065929, "balance_loss_clip": 1.04589534, "balance_loss_mlp": 1.04451907, "epoch": 0.11742071246054411, "flos": 21434289701760.0, "grad_norm": 2.2111022500713453, "language_loss": 0.72316217, "learning_rate": 3.920366648918491e-06, "loss": 0.74505818, "num_input_tokens_seen": 42342505, "step": 1953, "time_per_iteration": 2.7456531524658203 }, { "auxiliary_loss_clip": 0.01149214, "auxiliary_loss_mlp": 0.00781136, "balance_loss_clip": 1.0549686, "balance_loss_mlp": 1.0000577, "epoch": 0.11748083571321208, "flos": 15997845335040.0, "grad_norm": 2.1208802652878522, "language_loss": 0.79780388, "learning_rate": 3.920257808329552e-06, "loss": 0.81710744, "num_input_tokens_seen": 42360525, "step": 1954, "time_per_iteration": 2.653949737548828 }, { "auxiliary_loss_clip": 0.01112399, "auxiliary_loss_mlp": 0.01059787, "balance_loss_clip": 1.04880822, "balance_loss_mlp": 1.03763783, "epoch": 0.11754095896588006, "flos": 16180056051840.0, "grad_norm": 1.9673692595826442, "language_loss": 0.8553021, "learning_rate": 3.920148894924246e-06, "loss": 0.87702394, "num_input_tokens_seen": 42377045, "step": 1955, "time_per_iteration": 2.7987124919891357 }, { "auxiliary_loss_clip": 0.01163172, "auxiliary_loss_mlp": 0.00779783, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.00016606, "epoch": 0.11760108221854802, "flos": 13261596013440.0, "grad_norm": 2.12926288831445, "language_loss": 0.78105426, "learning_rate": 3.920039908706701e-06, "loss": 0.80048382, "num_input_tokens_seen": 42393960, "step": 1956, "time_per_iteration": 2.6247944831848145 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01058454, "balance_loss_clip": 1.05559933, "balance_loss_mlp": 1.03601909, "epoch": 0.11766120547121599, "flos": 24498439303680.0, "grad_norm": 2.264983200322237, "language_loss": 0.80487299, "learning_rate": 3.91993084968105e-06, "loss": 0.82704043, "num_input_tokens_seen": 42413160, "step": 1957, "time_per_iteration": 5.862411260604858 }, { "auxiliary_loss_clip": 0.01168294, "auxiliary_loss_mlp": 0.0105259, "balance_loss_clip": 1.05703866, "balance_loss_mlp": 1.0308696, "epoch": 0.11772132872388397, "flos": 17784005967360.0, "grad_norm": 4.8672025609093215, "language_loss": 0.77955222, "learning_rate": 3.919821717851428e-06, "loss": 0.80176103, "num_input_tokens_seen": 42432590, "step": 1958, "time_per_iteration": 4.4218549728393555 }, { "auxiliary_loss_clip": 0.01149976, "auxiliary_loss_mlp": 0.0105003, "balance_loss_clip": 1.05451894, "balance_loss_mlp": 1.02680755, "epoch": 0.11778145197655193, "flos": 13217030213760.0, "grad_norm": 1.7537692363765556, "language_loss": 0.77002251, "learning_rate": 3.919712513221976e-06, "loss": 0.79202259, "num_input_tokens_seen": 42450135, "step": 1959, "time_per_iteration": 2.674323558807373 }, { "auxiliary_loss_clip": 0.01162585, "auxiliary_loss_mlp": 0.01057019, "balance_loss_clip": 1.05857027, "balance_loss_mlp": 1.03484631, "epoch": 0.1178415752292199, "flos": 20230204965120.0, "grad_norm": 2.2026367524708927, "language_loss": 0.70078689, "learning_rate": 3.919603235796832e-06, "loss": 0.722983, "num_input_tokens_seen": 42470050, "step": 1960, "time_per_iteration": 2.7704508304595947 }, { "auxiliary_loss_clip": 0.01161089, "auxiliary_loss_mlp": 0.01055224, "balance_loss_clip": 1.05841374, "balance_loss_mlp": 1.03228831, "epoch": 0.11790169848188788, "flos": 13040134709760.0, "grad_norm": 2.663996374773888, "language_loss": 0.81045067, "learning_rate": 3.9194938855801406e-06, "loss": 0.83261371, "num_input_tokens_seen": 42484335, "step": 1961, "time_per_iteration": 4.67006778717041 }, { "auxiliary_loss_clip": 0.01163817, "auxiliary_loss_mlp": 0.00779643, "balance_loss_clip": 1.05658793, "balance_loss_mlp": 1.00009537, "epoch": 0.11796182173455584, "flos": 22265728790400.0, "grad_norm": 1.71345119244153, "language_loss": 0.92273545, "learning_rate": 3.919384462576049e-06, "loss": 0.94217002, "num_input_tokens_seen": 42502720, "step": 1962, "time_per_iteration": 2.6559524536132812 }, { "auxiliary_loss_clip": 0.01139826, "auxiliary_loss_mlp": 0.01058964, "balance_loss_clip": 1.05222392, "balance_loss_mlp": 1.03704107, "epoch": 0.1180219449872238, "flos": 10635017892480.0, "grad_norm": 2.157203116008796, "language_loss": 0.87635934, "learning_rate": 3.919274966788707e-06, "loss": 0.8983472, "num_input_tokens_seen": 42519460, "step": 1963, "time_per_iteration": 2.710042715072632 }, { "auxiliary_loss_clip": 0.0115823, "auxiliary_loss_mlp": 0.00779391, "balance_loss_clip": 1.05600929, "balance_loss_mlp": 1.00011134, "epoch": 0.11808206823989177, "flos": 20923532259840.0, "grad_norm": 2.8331529324994333, "language_loss": 0.83879703, "learning_rate": 3.919165398222265e-06, "loss": 0.85817325, "num_input_tokens_seen": 42539420, "step": 1964, "time_per_iteration": 2.734941244125366 }, { "auxiliary_loss_clip": 0.01122529, "auxiliary_loss_mlp": 0.01069054, "balance_loss_clip": 1.05171156, "balance_loss_mlp": 1.04628491, "epoch": 0.11814219149255975, "flos": 20777770869120.0, "grad_norm": 3.9132941826799543, "language_loss": 0.8313272, "learning_rate": 3.919055756880879e-06, "loss": 0.85324299, "num_input_tokens_seen": 42558225, "step": 1965, "time_per_iteration": 2.7427306175231934 }, { "auxiliary_loss_clip": 0.01178673, "auxiliary_loss_mlp": 0.01053338, "balance_loss_clip": 1.05815279, "balance_loss_mlp": 1.03163004, "epoch": 0.11820231474522772, "flos": 48759938542080.0, "grad_norm": 1.6720023918141877, "language_loss": 0.74227381, "learning_rate": 3.918946042768707e-06, "loss": 0.76459396, "num_input_tokens_seen": 42580790, "step": 1966, "time_per_iteration": 2.8265397548675537 }, { "auxiliary_loss_clip": 0.01163407, "auxiliary_loss_mlp": 0.0106081, "balance_loss_clip": 1.06309748, "balance_loss_mlp": 1.03836274, "epoch": 0.11826243799789568, "flos": 16690598012160.0, "grad_norm": 2.5628488285375397, "language_loss": 0.73137337, "learning_rate": 3.918836255889908e-06, "loss": 0.7536155, "num_input_tokens_seen": 42597355, "step": 1967, "time_per_iteration": 2.706193685531616 }, { "auxiliary_loss_clip": 0.01167052, "auxiliary_loss_mlp": 0.01053471, "balance_loss_clip": 1.05852592, "balance_loss_mlp": 1.03141701, "epoch": 0.11832256125056366, "flos": 16909868586240.0, "grad_norm": 5.332816815546028, "language_loss": 0.8831054, "learning_rate": 3.9187263962486456e-06, "loss": 0.90531063, "num_input_tokens_seen": 42616060, "step": 1968, "time_per_iteration": 2.6308343410491943 }, { "auxiliary_loss_clip": 0.01168356, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.06406927, "balance_loss_mlp": 1.0294776, "epoch": 0.11838268450323162, "flos": 22820405587200.0, "grad_norm": 2.252087054693662, "language_loss": 0.67010254, "learning_rate": 3.918616463849087e-06, "loss": 0.69230425, "num_input_tokens_seen": 42636285, "step": 1969, "time_per_iteration": 2.662480592727661 }, { "auxiliary_loss_clip": 0.01130071, "auxiliary_loss_mlp": 0.0106143, "balance_loss_clip": 1.05177045, "balance_loss_mlp": 1.03774357, "epoch": 0.11844280775589959, "flos": 33545844990720.0, "grad_norm": 2.153814675458072, "language_loss": 0.80455101, "learning_rate": 3.918506458695399e-06, "loss": 0.82646602, "num_input_tokens_seen": 42658320, "step": 1970, "time_per_iteration": 2.798050880432129 }, { "auxiliary_loss_clip": 0.01060284, "auxiliary_loss_mlp": 0.01021383, "balance_loss_clip": 1.02553701, "balance_loss_mlp": 1.01892686, "epoch": 0.11850293100856757, "flos": 66350998604160.0, "grad_norm": 0.8165911228106061, "language_loss": 0.66192186, "learning_rate": 3.918396380791754e-06, "loss": 0.68273854, "num_input_tokens_seen": 42721500, "step": 1971, "time_per_iteration": 3.167018413543701 }, { "auxiliary_loss_clip": 0.01151504, "auxiliary_loss_mlp": 0.0105629, "balance_loss_clip": 1.05294323, "balance_loss_mlp": 1.03422379, "epoch": 0.11856305426123553, "flos": 24681045070080.0, "grad_norm": 2.1839859106137554, "language_loss": 0.79782552, "learning_rate": 3.918286230142327e-06, "loss": 0.81990343, "num_input_tokens_seen": 42739825, "step": 1972, "time_per_iteration": 2.6908793449401855 }, { "auxiliary_loss_clip": 0.01133219, "auxiliary_loss_mlp": 0.00778766, "balance_loss_clip": 1.05341005, "balance_loss_mlp": 1.00005877, "epoch": 0.1186231775139035, "flos": 24280102483200.0, "grad_norm": 2.0473813607633384, "language_loss": 0.72843599, "learning_rate": 3.918176006751292e-06, "loss": 0.74755585, "num_input_tokens_seen": 42758695, "step": 1973, "time_per_iteration": 2.7801859378814697 }, { "auxiliary_loss_clip": 0.01138022, "auxiliary_loss_mlp": 0.01049764, "balance_loss_clip": 1.05580497, "balance_loss_mlp": 1.02707887, "epoch": 0.11868330076657148, "flos": 21757413473280.0, "grad_norm": 1.6449677647733996, "language_loss": 0.72019619, "learning_rate": 3.918065710622832e-06, "loss": 0.74207413, "num_input_tokens_seen": 42778510, "step": 1974, "time_per_iteration": 2.7337663173675537 }, { "auxiliary_loss_clip": 0.01129602, "auxiliary_loss_mlp": 0.01043161, "balance_loss_clip": 1.05265522, "balance_loss_mlp": 1.02086854, "epoch": 0.11874342401923944, "flos": 17193274894080.0, "grad_norm": 2.017372400194955, "language_loss": 0.77409399, "learning_rate": 3.917955341761128e-06, "loss": 0.79582161, "num_input_tokens_seen": 42793995, "step": 1975, "time_per_iteration": 2.669546604156494 }, { "auxiliary_loss_clip": 0.01131477, "auxiliary_loss_mlp": 0.01059968, "balance_loss_clip": 1.05880177, "balance_loss_mlp": 1.03908277, "epoch": 0.11880354727190741, "flos": 15229572312960.0, "grad_norm": 2.3842578575289, "language_loss": 0.75110453, "learning_rate": 3.917844900170364e-06, "loss": 0.77301902, "num_input_tokens_seen": 42809000, "step": 1976, "time_per_iteration": 2.8439090251922607 }, { "auxiliary_loss_clip": 0.0116819, "auxiliary_loss_mlp": 0.01049523, "balance_loss_clip": 1.05999744, "balance_loss_mlp": 1.02835166, "epoch": 0.11886367052457537, "flos": 27309706179840.0, "grad_norm": 1.8674311015318124, "language_loss": 0.74877423, "learning_rate": 3.91773438585473e-06, "loss": 0.77095133, "num_input_tokens_seen": 42831585, "step": 1977, "time_per_iteration": 2.6747169494628906 }, { "auxiliary_loss_clip": 0.01182095, "auxiliary_loss_mlp": 0.01059621, "balance_loss_clip": 1.05954552, "balance_loss_mlp": 1.03805614, "epoch": 0.11892379377724335, "flos": 21798280172160.0, "grad_norm": 2.1793079873879604, "language_loss": 0.74207634, "learning_rate": 3.9176237988184165e-06, "loss": 0.76449353, "num_input_tokens_seen": 42848420, "step": 1978, "time_per_iteration": 2.631664514541626 }, { "auxiliary_loss_clip": 0.01142323, "auxiliary_loss_mlp": 0.01050585, "balance_loss_clip": 1.06037045, "balance_loss_mlp": 1.0289247, "epoch": 0.11898391702991132, "flos": 13991013498240.0, "grad_norm": 1.7170872786869797, "language_loss": 0.73256385, "learning_rate": 3.917513139065616e-06, "loss": 0.754493, "num_input_tokens_seen": 42866645, "step": 1979, "time_per_iteration": 2.7442541122436523 }, { "auxiliary_loss_clip": 0.01137516, "auxiliary_loss_mlp": 0.01051378, "balance_loss_clip": 1.0566175, "balance_loss_mlp": 1.02968168, "epoch": 0.11904404028257928, "flos": 32234567091840.0, "grad_norm": 1.876224505386343, "language_loss": 0.98293436, "learning_rate": 3.917402406600525e-06, "loss": 1.00482333, "num_input_tokens_seen": 42888515, "step": 1980, "time_per_iteration": 2.787667989730835 }, { "auxiliary_loss_clip": 0.01153629, "auxiliary_loss_mlp": 0.01053612, "balance_loss_clip": 1.05595791, "balance_loss_mlp": 1.03077161, "epoch": 0.11910416353524726, "flos": 23586272398080.0, "grad_norm": 1.7507584506289393, "language_loss": 0.86265099, "learning_rate": 3.917291601427342e-06, "loss": 0.88472342, "num_input_tokens_seen": 42909035, "step": 1981, "time_per_iteration": 2.6680359840393066 }, { "auxiliary_loss_clip": 0.01158736, "auxiliary_loss_mlp": 0.01064978, "balance_loss_clip": 1.06144083, "balance_loss_mlp": 1.04214907, "epoch": 0.11916428678791523, "flos": 25333038789120.0, "grad_norm": 1.8908045276276995, "language_loss": 0.85375237, "learning_rate": 3.91718072355027e-06, "loss": 0.87598956, "num_input_tokens_seen": 42927555, "step": 1982, "time_per_iteration": 2.732797861099243 }, { "auxiliary_loss_clip": 0.01146432, "auxiliary_loss_mlp": 0.01050259, "balance_loss_clip": 1.05539966, "balance_loss_mlp": 1.02843213, "epoch": 0.11922441004058319, "flos": 19788431592960.0, "grad_norm": 2.3856086229742877, "language_loss": 0.85202634, "learning_rate": 3.917069772973513e-06, "loss": 0.87399322, "num_input_tokens_seen": 42945300, "step": 1983, "time_per_iteration": 2.6839804649353027 }, { "auxiliary_loss_clip": 0.01126589, "auxiliary_loss_mlp": 0.01056051, "balance_loss_clip": 1.05602145, "balance_loss_mlp": 1.03399742, "epoch": 0.11928453329325117, "flos": 21536347219200.0, "grad_norm": 3.6641824085676022, "language_loss": 0.7693429, "learning_rate": 3.916958749701277e-06, "loss": 0.79116929, "num_input_tokens_seen": 42961295, "step": 1984, "time_per_iteration": 2.7008767127990723 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.01055251, "balance_loss_clip": 1.05752373, "balance_loss_mlp": 1.0334003, "epoch": 0.11934465654591914, "flos": 20815010294400.0, "grad_norm": 1.917528093726237, "language_loss": 0.83058321, "learning_rate": 3.9168476537377745e-06, "loss": 0.85275191, "num_input_tokens_seen": 42980330, "step": 1985, "time_per_iteration": 2.6692728996276855 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0541923, "balance_loss_mlp": 1.02835393, "epoch": 0.1194047797985871, "flos": 19060486565760.0, "grad_norm": 1.8732848573733223, "language_loss": 0.74398553, "learning_rate": 3.916736485087216e-06, "loss": 0.76600474, "num_input_tokens_seen": 42996125, "step": 1986, "time_per_iteration": 2.722013473510742 }, { "auxiliary_loss_clip": 0.01146125, "auxiliary_loss_mlp": 0.01059008, "balance_loss_clip": 1.05472732, "balance_loss_mlp": 1.03791952, "epoch": 0.11946490305125507, "flos": 27190805184000.0, "grad_norm": 2.4724436343771083, "language_loss": 0.72123617, "learning_rate": 3.916625243753819e-06, "loss": 0.74328756, "num_input_tokens_seen": 43014180, "step": 1987, "time_per_iteration": 2.814481258392334 }, { "auxiliary_loss_clip": 0.01156854, "auxiliary_loss_mlp": 0.01054644, "balance_loss_clip": 1.05747938, "balance_loss_mlp": 1.03138638, "epoch": 0.11952502630392305, "flos": 21140791672320.0, "grad_norm": 1.9246234449532542, "language_loss": 0.72007513, "learning_rate": 3.916513929741799e-06, "loss": 0.74219012, "num_input_tokens_seen": 43032120, "step": 1988, "time_per_iteration": 2.7242019176483154 }, { "auxiliary_loss_clip": 0.0116348, "auxiliary_loss_mlp": 0.01062102, "balance_loss_clip": 1.05559146, "balance_loss_mlp": 1.03913057, "epoch": 0.11958514955659101, "flos": 22124241118080.0, "grad_norm": 1.7561483239324645, "language_loss": 0.81144297, "learning_rate": 3.91640254305538e-06, "loss": 0.83369875, "num_input_tokens_seen": 43052215, "step": 1989, "time_per_iteration": 2.6259546279907227 }, { "auxiliary_loss_clip": 0.01135956, "auxiliary_loss_mlp": 0.01057689, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.03325129, "epoch": 0.11964527280925898, "flos": 17421452040960.0, "grad_norm": 2.5516320258539795, "language_loss": 0.75881672, "learning_rate": 3.916291083698784e-06, "loss": 0.7807532, "num_input_tokens_seen": 43069720, "step": 1990, "time_per_iteration": 2.6779251098632812 }, { "auxiliary_loss_clip": 0.0105322, "auxiliary_loss_mlp": 0.01019112, "balance_loss_clip": 1.02816892, "balance_loss_mlp": 1.01647794, "epoch": 0.11970539606192696, "flos": 70679741402880.0, "grad_norm": 0.8628582727639288, "language_loss": 0.55184531, "learning_rate": 3.916179551676238e-06, "loss": 0.57256866, "num_input_tokens_seen": 43123130, "step": 1991, "time_per_iteration": 3.3713693618774414 }, { "auxiliary_loss_clip": 0.01136423, "auxiliary_loss_mlp": 0.01053959, "balance_loss_clip": 1.05748868, "balance_loss_mlp": 1.03326464, "epoch": 0.11976551931459492, "flos": 21215019127680.0, "grad_norm": 2.286300891386994, "language_loss": 0.78371406, "learning_rate": 3.916067946991971e-06, "loss": 0.80561793, "num_input_tokens_seen": 43140015, "step": 1992, "time_per_iteration": 2.6797914505004883 }, { "auxiliary_loss_clip": 0.0117949, "auxiliary_loss_mlp": 0.01056635, "balance_loss_clip": 1.05811, "balance_loss_mlp": 1.03453374, "epoch": 0.11982564256726289, "flos": 25989306226560.0, "grad_norm": 1.8481811043026504, "language_loss": 0.78911144, "learning_rate": 3.915956269650216e-06, "loss": 0.81147265, "num_input_tokens_seen": 43160105, "step": 1993, "time_per_iteration": 2.691301107406616 }, { "auxiliary_loss_clip": 0.01126423, "auxiliary_loss_mlp": 0.0106217, "balance_loss_clip": 1.05012226, "balance_loss_mlp": 1.04081941, "epoch": 0.11988576581993086, "flos": 21650866755840.0, "grad_norm": 1.644866568705103, "language_loss": 0.82088816, "learning_rate": 3.915844519655208e-06, "loss": 0.84277415, "num_input_tokens_seen": 43179835, "step": 1994, "time_per_iteration": 2.772905111312866 }, { "auxiliary_loss_clip": 0.0115068, "auxiliary_loss_mlp": 0.01063961, "balance_loss_clip": 1.05523098, "balance_loss_mlp": 1.0433259, "epoch": 0.11994588907259883, "flos": 17857407409920.0, "grad_norm": 2.0065598513575247, "language_loss": 0.88392794, "learning_rate": 3.915732697011183e-06, "loss": 0.9060744, "num_input_tokens_seen": 43197210, "step": 1995, "time_per_iteration": 4.206532716751099 }, { "auxiliary_loss_clip": 0.01153482, "auxiliary_loss_mlp": 0.01066415, "balance_loss_clip": 1.06005812, "balance_loss_mlp": 1.0441823, "epoch": 0.1200060123252668, "flos": 24462744163200.0, "grad_norm": 1.8775058007239456, "language_loss": 0.73949909, "learning_rate": 3.9156208017223825e-06, "loss": 0.76169801, "num_input_tokens_seen": 43215050, "step": 1996, "time_per_iteration": 2.7263944149017334 }, { "auxiliary_loss_clip": 0.01141484, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05754757, "balance_loss_mlp": 1.03808212, "epoch": 0.12006613557793476, "flos": 18732191235840.0, "grad_norm": 1.976051865072764, "language_loss": 0.88125587, "learning_rate": 3.915508833793048e-06, "loss": 0.90327179, "num_input_tokens_seen": 43233900, "step": 1997, "time_per_iteration": 4.29426383972168 }, { "auxiliary_loss_clip": 0.01165634, "auxiliary_loss_mlp": 0.00779568, "balance_loss_clip": 1.05701697, "balance_loss_mlp": 1.00001049, "epoch": 0.12012625883060274, "flos": 22267739952000.0, "grad_norm": 2.1091392562336018, "language_loss": 0.79031086, "learning_rate": 3.915396793227428e-06, "loss": 0.80976284, "num_input_tokens_seen": 43252105, "step": 1998, "time_per_iteration": 4.330955266952515 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.00779642, "balance_loss_clip": 1.0576719, "balance_loss_mlp": 1.00002396, "epoch": 0.1201863820832707, "flos": 21758885930880.0, "grad_norm": 1.799585336659533, "language_loss": 0.73583078, "learning_rate": 3.915284680029769e-06, "loss": 0.75529337, "num_input_tokens_seen": 43270315, "step": 1999, "time_per_iteration": 2.754770040512085 }, { "auxiliary_loss_clip": 0.01178966, "auxiliary_loss_mlp": 0.01073097, "balance_loss_clip": 1.0602119, "balance_loss_mlp": 1.05115068, "epoch": 0.12024650533593867, "flos": 21907987286400.0, "grad_norm": 2.916355473014409, "language_loss": 0.74854898, "learning_rate": 3.915172494204323e-06, "loss": 0.77106953, "num_input_tokens_seen": 43289935, "step": 2000, "time_per_iteration": 4.3900322914123535 }, { "auxiliary_loss_clip": 0.01149374, "auxiliary_loss_mlp": 0.01069735, "balance_loss_clip": 1.05375695, "balance_loss_mlp": 1.04763341, "epoch": 0.12030662858860665, "flos": 21689219502720.0, "grad_norm": 1.5203973891597686, "language_loss": 0.8496564, "learning_rate": 3.915060235755344e-06, "loss": 0.87184751, "num_input_tokens_seen": 43309325, "step": 2001, "time_per_iteration": 2.6912643909454346 }, { "auxiliary_loss_clip": 0.01154057, "auxiliary_loss_mlp": 0.01063637, "balance_loss_clip": 1.05600786, "balance_loss_mlp": 1.04265642, "epoch": 0.12036675184127461, "flos": 12933228856320.0, "grad_norm": 2.932264271186656, "language_loss": 0.74711967, "learning_rate": 3.91494790468709e-06, "loss": 0.76929653, "num_input_tokens_seen": 43327010, "step": 2002, "time_per_iteration": 2.6991024017333984 }, { "auxiliary_loss_clip": 0.01129169, "auxiliary_loss_mlp": 0.01066705, "balance_loss_clip": 1.05340302, "balance_loss_mlp": 1.0429939, "epoch": 0.12042687509394258, "flos": 20851028657280.0, "grad_norm": 2.117271428042382, "language_loss": 0.78029454, "learning_rate": 3.9148355010038185e-06, "loss": 0.80225325, "num_input_tokens_seen": 43345650, "step": 2003, "time_per_iteration": 2.731381416320801 }, { "auxiliary_loss_clip": 0.01163252, "auxiliary_loss_mlp": 0.01062886, "balance_loss_clip": 1.05728662, "balance_loss_mlp": 1.04073668, "epoch": 0.12048699834661056, "flos": 23878513451520.0, "grad_norm": 1.585850552088038, "language_loss": 0.72205627, "learning_rate": 3.914723024709793e-06, "loss": 0.74431765, "num_input_tokens_seen": 43365555, "step": 2004, "time_per_iteration": 2.725092649459839 }, { "auxiliary_loss_clip": 0.01160616, "auxiliary_loss_mlp": 0.01069457, "balance_loss_clip": 1.05870187, "balance_loss_mlp": 1.04645014, "epoch": 0.12054712159927852, "flos": 19756363726080.0, "grad_norm": 1.9357732467170252, "language_loss": 0.78415942, "learning_rate": 3.914610475809279e-06, "loss": 0.8064602, "num_input_tokens_seen": 43384990, "step": 2005, "time_per_iteration": 2.7232437133789062 }, { "auxiliary_loss_clip": 0.01073016, "auxiliary_loss_mlp": 0.00758901, "balance_loss_clip": 1.02995479, "balance_loss_mlp": 1.00011683, "epoch": 0.12060724485194649, "flos": 51672763123200.0, "grad_norm": 0.9264315537536937, "language_loss": 0.58087146, "learning_rate": 3.914497854306543e-06, "loss": 0.59919059, "num_input_tokens_seen": 43436335, "step": 2006, "time_per_iteration": 2.9570157527923584 }, { "auxiliary_loss_clip": 0.01155081, "auxiliary_loss_mlp": 0.01053472, "balance_loss_clip": 1.05803597, "balance_loss_mlp": 1.03299201, "epoch": 0.12066736810461445, "flos": 18990425088000.0, "grad_norm": 1.6109316320484448, "language_loss": 0.76524282, "learning_rate": 3.9143851602058575e-06, "loss": 0.78732836, "num_input_tokens_seen": 43456495, "step": 2007, "time_per_iteration": 2.763380289077759 }, { "auxiliary_loss_clip": 0.01147254, "auxiliary_loss_mlp": 0.01064209, "balance_loss_clip": 1.05931091, "balance_loss_mlp": 1.04177368, "epoch": 0.12072749135728243, "flos": 16471973882880.0, "grad_norm": 2.449779851562752, "language_loss": 0.83023942, "learning_rate": 3.914272393511494e-06, "loss": 0.85235405, "num_input_tokens_seen": 43473085, "step": 2008, "time_per_iteration": 2.7693119049072266 }, { "auxiliary_loss_clip": 0.01176157, "auxiliary_loss_mlp": 0.01052894, "balance_loss_clip": 1.0584172, "balance_loss_mlp": 1.03135288, "epoch": 0.1207876146099504, "flos": 18077108947200.0, "grad_norm": 2.203355340521787, "language_loss": 0.83835697, "learning_rate": 3.91415955422773e-06, "loss": 0.86064744, "num_input_tokens_seen": 43491135, "step": 2009, "time_per_iteration": 2.640944242477417 }, { "auxiliary_loss_clip": 0.01180076, "auxiliary_loss_mlp": 0.01053549, "balance_loss_clip": 1.06196725, "balance_loss_mlp": 1.02994514, "epoch": 0.12084773786261836, "flos": 21871573873920.0, "grad_norm": 1.6799099601218046, "language_loss": 0.83870012, "learning_rate": 3.914046642358844e-06, "loss": 0.8610363, "num_input_tokens_seen": 43510440, "step": 2010, "time_per_iteration": 2.716127634048462 }, { "auxiliary_loss_clip": 0.01145261, "auxiliary_loss_mlp": 0.00780804, "balance_loss_clip": 1.05555713, "balance_loss_mlp": 1.0000627, "epoch": 0.12090786111528634, "flos": 18333044328960.0, "grad_norm": 1.8933604390076018, "language_loss": 0.84194541, "learning_rate": 3.9139336579091174e-06, "loss": 0.86120605, "num_input_tokens_seen": 43530145, "step": 2011, "time_per_iteration": 2.73793625831604 }, { "auxiliary_loss_clip": 0.01148418, "auxiliary_loss_mlp": 0.01060974, "balance_loss_clip": 1.05480969, "balance_loss_mlp": 1.03905129, "epoch": 0.1209679843679543, "flos": 21105850717440.0, "grad_norm": 2.0524904800028154, "language_loss": 0.96236968, "learning_rate": 3.913820600882834e-06, "loss": 0.98446357, "num_input_tokens_seen": 43549315, "step": 2012, "time_per_iteration": 2.7269980907440186 }, { "auxiliary_loss_clip": 0.01146369, "auxiliary_loss_mlp": 0.01051396, "balance_loss_clip": 1.05808425, "balance_loss_mlp": 1.0289607, "epoch": 0.12102810762062227, "flos": 29241053585280.0, "grad_norm": 1.853151366811655, "language_loss": 0.80903435, "learning_rate": 3.913707471284283e-06, "loss": 0.83101201, "num_input_tokens_seen": 43569240, "step": 2013, "time_per_iteration": 2.740489959716797 }, { "auxiliary_loss_clip": 0.01124703, "auxiliary_loss_mlp": 0.0105341, "balance_loss_clip": 1.05300117, "balance_loss_mlp": 1.02962804, "epoch": 0.12108823087329025, "flos": 17930701111680.0, "grad_norm": 5.099975898232357, "language_loss": 0.77255923, "learning_rate": 3.9135942691177515e-06, "loss": 0.79434031, "num_input_tokens_seen": 43587710, "step": 2014, "time_per_iteration": 2.7361485958099365 }, { "auxiliary_loss_clip": 0.0116607, "auxiliary_loss_mlp": 0.01051056, "balance_loss_clip": 1.05832791, "balance_loss_mlp": 1.02791715, "epoch": 0.12114835412595822, "flos": 22091850028800.0, "grad_norm": 5.8570343294144465, "language_loss": 0.87169874, "learning_rate": 3.913480994387535e-06, "loss": 0.89387, "num_input_tokens_seen": 43606000, "step": 2015, "time_per_iteration": 2.6881515979766846 }, { "auxiliary_loss_clip": 0.01170382, "auxiliary_loss_mlp": 0.01051162, "balance_loss_clip": 1.05500197, "balance_loss_mlp": 1.0289886, "epoch": 0.12120847737862618, "flos": 20412343854720.0, "grad_norm": 2.087765239068409, "language_loss": 0.69146478, "learning_rate": 3.913367647097926e-06, "loss": 0.71368027, "num_input_tokens_seen": 43624815, "step": 2016, "time_per_iteration": 2.7096211910247803 }, { "auxiliary_loss_clip": 0.01152563, "auxiliary_loss_mlp": 0.0104714, "balance_loss_clip": 1.05737591, "balance_loss_mlp": 1.02390599, "epoch": 0.12126860063129415, "flos": 22309037614080.0, "grad_norm": 2.8043603396252865, "language_loss": 0.79858959, "learning_rate": 3.913254227253225e-06, "loss": 0.82058656, "num_input_tokens_seen": 43643960, "step": 2017, "time_per_iteration": 2.7042336463928223 }, { "auxiliary_loss_clip": 0.01156022, "auxiliary_loss_mlp": 0.0105052, "balance_loss_clip": 1.05479789, "balance_loss_mlp": 1.02740538, "epoch": 0.12132872388396213, "flos": 13699275235200.0, "grad_norm": 2.8700241463026654, "language_loss": 0.68828821, "learning_rate": 3.913140734857731e-06, "loss": 0.71035373, "num_input_tokens_seen": 43662650, "step": 2018, "time_per_iteration": 2.7015058994293213 }, { "auxiliary_loss_clip": 0.01136376, "auxiliary_loss_mlp": 0.01050749, "balance_loss_clip": 1.05524123, "balance_loss_mlp": 1.02873111, "epoch": 0.12138884713663009, "flos": 26466954307200.0, "grad_norm": 1.6132330771570709, "language_loss": 0.72476816, "learning_rate": 3.91302716991575e-06, "loss": 0.74663943, "num_input_tokens_seen": 43684205, "step": 2019, "time_per_iteration": 2.8956947326660156 }, { "auxiliary_loss_clip": 0.01107167, "auxiliary_loss_mlp": 0.01057916, "balance_loss_clip": 1.05286384, "balance_loss_mlp": 1.03482556, "epoch": 0.12144897038929806, "flos": 26141603892480.0, "grad_norm": 1.853626515444831, "language_loss": 0.92125106, "learning_rate": 3.912913532431586e-06, "loss": 0.94290185, "num_input_tokens_seen": 43706320, "step": 2020, "time_per_iteration": 2.9980764389038086 }, { "auxiliary_loss_clip": 0.0114145, "auxiliary_loss_mlp": 0.01055455, "balance_loss_clip": 1.05289125, "balance_loss_mlp": 1.03360391, "epoch": 0.12150909364196603, "flos": 24717530309760.0, "grad_norm": 1.9227427415613194, "language_loss": 0.7772885, "learning_rate": 3.912799822409549e-06, "loss": 0.79925752, "num_input_tokens_seen": 43724805, "step": 2021, "time_per_iteration": 3.01798939704895 }, { "auxiliary_loss_clip": 0.0117749, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.0610733, "balance_loss_mlp": 1.0277164, "epoch": 0.121569216894634, "flos": 25186990089600.0, "grad_norm": 2.054228820960504, "language_loss": 0.80712306, "learning_rate": 3.912686039853952e-06, "loss": 0.82938808, "num_input_tokens_seen": 43742320, "step": 2022, "time_per_iteration": 2.684309244155884 }, { "auxiliary_loss_clip": 0.01144749, "auxiliary_loss_mlp": 0.0106163, "balance_loss_clip": 1.055619, "balance_loss_mlp": 1.03697765, "epoch": 0.12162934014730196, "flos": 13444094039040.0, "grad_norm": 1.734031517866852, "language_loss": 0.84842217, "learning_rate": 3.912572184769108e-06, "loss": 0.87048596, "num_input_tokens_seen": 43760665, "step": 2023, "time_per_iteration": 2.6886441707611084 }, { "auxiliary_loss_clip": 0.01139348, "auxiliary_loss_mlp": 0.01053043, "balance_loss_clip": 1.05162323, "balance_loss_mlp": 1.03081048, "epoch": 0.12168946339996994, "flos": 16946138344320.0, "grad_norm": 2.3397199529221546, "language_loss": 0.85514021, "learning_rate": 3.912458257159335e-06, "loss": 0.87706411, "num_input_tokens_seen": 43779020, "step": 2024, "time_per_iteration": 2.8043718338012695 }, { "auxiliary_loss_clip": 0.01169767, "auxiliary_loss_mlp": 0.01055534, "balance_loss_clip": 1.05277538, "balance_loss_mlp": 1.03389716, "epoch": 0.12174958665263791, "flos": 29821585196160.0, "grad_norm": 1.8432491304976684, "language_loss": 0.72088945, "learning_rate": 3.912344257028954e-06, "loss": 0.74314243, "num_input_tokens_seen": 43798850, "step": 2025, "time_per_iteration": 2.704876184463501 }, { "auxiliary_loss_clip": 0.01148564, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.05486572, "balance_loss_mlp": 1.02555275, "epoch": 0.12180970990530587, "flos": 24641902224000.0, "grad_norm": 1.4969552271445652, "language_loss": 0.76075011, "learning_rate": 3.912230184382286e-06, "loss": 0.78271192, "num_input_tokens_seen": 43820130, "step": 2026, "time_per_iteration": 2.6957921981811523 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.01046261, "balance_loss_clip": 1.05086374, "balance_loss_mlp": 1.02474427, "epoch": 0.12186983315797385, "flos": 20521691832960.0, "grad_norm": 2.2064263994277478, "language_loss": 0.88769746, "learning_rate": 3.912116039223659e-06, "loss": 0.90963376, "num_input_tokens_seen": 43838485, "step": 2027, "time_per_iteration": 2.6847639083862305 }, { "auxiliary_loss_clip": 0.01143778, "auxiliary_loss_mlp": 0.01056715, "balance_loss_clip": 1.05258501, "balance_loss_mlp": 1.03667617, "epoch": 0.12192995641064182, "flos": 27818344719360.0, "grad_norm": 1.5725885574076592, "language_loss": 0.75544459, "learning_rate": 3.912001821557399e-06, "loss": 0.77744961, "num_input_tokens_seen": 43859080, "step": 2028, "time_per_iteration": 2.7706027030944824 }, { "auxiliary_loss_clip": 0.01123185, "auxiliary_loss_mlp": 0.01057136, "balance_loss_clip": 1.0518471, "balance_loss_mlp": 1.03554714, "epoch": 0.12199007966330978, "flos": 22017119783040.0, "grad_norm": 2.0550419223931193, "language_loss": 0.76802504, "learning_rate": 3.911887531387839e-06, "loss": 0.78982824, "num_input_tokens_seen": 43879030, "step": 2029, "time_per_iteration": 2.732637405395508 }, { "auxiliary_loss_clip": 0.01156591, "auxiliary_loss_mlp": 0.01052355, "balance_loss_clip": 1.05253625, "balance_loss_mlp": 1.03107572, "epoch": 0.12205020291597775, "flos": 23295216493440.0, "grad_norm": 1.707195979328818, "language_loss": 0.79164296, "learning_rate": 3.911773168719313e-06, "loss": 0.81373239, "num_input_tokens_seen": 43898505, "step": 2030, "time_per_iteration": 2.7254061698913574 }, { "auxiliary_loss_clip": 0.0116997, "auxiliary_loss_mlp": 0.01051357, "balance_loss_clip": 1.05618095, "balance_loss_mlp": 1.02930319, "epoch": 0.12211032616864573, "flos": 26031609469440.0, "grad_norm": 3.038077546298312, "language_loss": 0.74411637, "learning_rate": 3.911658733556155e-06, "loss": 0.76632965, "num_input_tokens_seen": 43917945, "step": 2031, "time_per_iteration": 2.6711080074310303 }, { "auxiliary_loss_clip": 0.01174332, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05888343, "balance_loss_mlp": 1.02545118, "epoch": 0.12217044942131369, "flos": 20410943224320.0, "grad_norm": 1.7636188348969384, "language_loss": 0.75230348, "learning_rate": 3.911544225902707e-06, "loss": 0.7745049, "num_input_tokens_seen": 43937385, "step": 2032, "time_per_iteration": 2.7134530544281006 }, { "auxiliary_loss_clip": 0.01152363, "auxiliary_loss_mlp": 0.01045735, "balance_loss_clip": 1.05129802, "balance_loss_mlp": 1.02538586, "epoch": 0.12223057267398166, "flos": 22857142222080.0, "grad_norm": 1.5809359138264147, "language_loss": 0.89502287, "learning_rate": 3.911429645763311e-06, "loss": 0.91700387, "num_input_tokens_seen": 43958130, "step": 2033, "time_per_iteration": 2.7105965614318848 }, { "auxiliary_loss_clip": 0.01155694, "auxiliary_loss_mlp": 0.01051169, "balance_loss_clip": 1.05740523, "balance_loss_mlp": 1.03005767, "epoch": 0.12229069592664964, "flos": 20047563285120.0, "grad_norm": 1.9580868921695649, "language_loss": 0.65195286, "learning_rate": 3.911314993142311e-06, "loss": 0.67402148, "num_input_tokens_seen": 43976800, "step": 2034, "time_per_iteration": 4.222668886184692 }, { "auxiliary_loss_clip": 0.01152239, "auxiliary_loss_mlp": 0.01055659, "balance_loss_clip": 1.05550218, "balance_loss_mlp": 1.0327704, "epoch": 0.1223508191793176, "flos": 22274240313600.0, "grad_norm": 1.6376942269871653, "language_loss": 0.76459455, "learning_rate": 3.911200268044055e-06, "loss": 0.78667355, "num_input_tokens_seen": 43996620, "step": 2035, "time_per_iteration": 2.7306556701660156 }, { "auxiliary_loss_clip": 0.01176703, "auxiliary_loss_mlp": 0.01050008, "balance_loss_clip": 1.0577215, "balance_loss_mlp": 1.02798975, "epoch": 0.12241094243198557, "flos": 21285978445440.0, "grad_norm": 1.8460180606974623, "language_loss": 0.71294892, "learning_rate": 3.911085470472892e-06, "loss": 0.73521602, "num_input_tokens_seen": 44016175, "step": 2036, "time_per_iteration": 2.7327258586883545 }, { "auxiliary_loss_clip": 0.01144473, "auxiliary_loss_mlp": 0.01058389, "balance_loss_clip": 1.05778408, "balance_loss_mlp": 1.03623962, "epoch": 0.12247106568465355, "flos": 17382381022080.0, "grad_norm": 1.5772021569883852, "language_loss": 0.83130831, "learning_rate": 3.910970600433178e-06, "loss": 0.85333693, "num_input_tokens_seen": 44035060, "step": 2037, "time_per_iteration": 4.248440742492676 }, { "auxiliary_loss_clip": 0.01153641, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.0556947, "balance_loss_mlp": 1.0366174, "epoch": 0.12253118893732151, "flos": 27045438842880.0, "grad_norm": 2.676780030246967, "language_loss": 0.79765236, "learning_rate": 3.910855657929267e-06, "loss": 0.81979132, "num_input_tokens_seen": 44053330, "step": 2038, "time_per_iteration": 2.7321341037750244 }, { "auxiliary_loss_clip": 0.010642, "auxiliary_loss_mlp": 0.00759248, "balance_loss_clip": 1.02961969, "balance_loss_mlp": 1.00006962, "epoch": 0.12259131218998948, "flos": 53861518368000.0, "grad_norm": 0.8248048644272604, "language_loss": 0.58659601, "learning_rate": 3.910740642965518e-06, "loss": 0.6048305, "num_input_tokens_seen": 44107575, "step": 2039, "time_per_iteration": 4.739040851593018 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01064411, "balance_loss_clip": 1.05292714, "balance_loss_mlp": 1.03912663, "epoch": 0.12265143544265744, "flos": 17891917401600.0, "grad_norm": 2.1548467753138136, "language_loss": 0.80099291, "learning_rate": 3.910625555546292e-06, "loss": 0.82291704, "num_input_tokens_seen": 44126075, "step": 2040, "time_per_iteration": 2.723247766494751 }, { "auxiliary_loss_clip": 0.01149343, "auxiliary_loss_mlp": 0.01058534, "balance_loss_clip": 1.05517352, "balance_loss_mlp": 1.03673029, "epoch": 0.12271155869532542, "flos": 21799932197760.0, "grad_norm": 1.8247690225218605, "language_loss": 0.82841176, "learning_rate": 3.910510395675953e-06, "loss": 0.85049051, "num_input_tokens_seen": 44145605, "step": 2041, "time_per_iteration": 2.699110984802246 }, { "auxiliary_loss_clip": 0.01136001, "auxiliary_loss_mlp": 0.01053451, "balance_loss_clip": 1.05120957, "balance_loss_mlp": 1.03061032, "epoch": 0.12277168194799339, "flos": 19828759587840.0, "grad_norm": 1.9386136063873771, "language_loss": 0.67272276, "learning_rate": 3.9103951633588694e-06, "loss": 0.69461727, "num_input_tokens_seen": 44164770, "step": 2042, "time_per_iteration": 2.7042133808135986 }, { "auxiliary_loss_clip": 0.01133115, "auxiliary_loss_mlp": 0.01056941, "balance_loss_clip": 1.05079007, "balance_loss_mlp": 1.03517294, "epoch": 0.12283180520066135, "flos": 23221024951680.0, "grad_norm": 1.912164915278887, "language_loss": 0.81765604, "learning_rate": 3.910279858599409e-06, "loss": 0.83955657, "num_input_tokens_seen": 44184025, "step": 2043, "time_per_iteration": 2.6942050457000732 }, { "auxiliary_loss_clip": 0.01146416, "auxiliary_loss_mlp": 0.01052365, "balance_loss_clip": 1.05161905, "balance_loss_mlp": 1.03040695, "epoch": 0.12289192845332933, "flos": 18588476920320.0, "grad_norm": 1.7894844734354058, "language_loss": 0.80192459, "learning_rate": 3.910164481401946e-06, "loss": 0.82391244, "num_input_tokens_seen": 44202950, "step": 2044, "time_per_iteration": 2.6227192878723145 }, { "auxiliary_loss_clip": 0.01116285, "auxiliary_loss_mlp": 0.01052013, "balance_loss_clip": 1.05284619, "balance_loss_mlp": 1.03055525, "epoch": 0.1229520517059973, "flos": 25769532862080.0, "grad_norm": 1.7152742607840916, "language_loss": 0.7794897, "learning_rate": 3.910049031770853e-06, "loss": 0.80117267, "num_input_tokens_seen": 44221115, "step": 2045, "time_per_iteration": 2.769017219543457 }, { "auxiliary_loss_clip": 0.01163545, "auxiliary_loss_mlp": 0.01060468, "balance_loss_clip": 1.05796146, "balance_loss_mlp": 1.03827095, "epoch": 0.12301217495866526, "flos": 20887154760960.0, "grad_norm": 1.852572781372854, "language_loss": 0.67284262, "learning_rate": 3.90993350971051e-06, "loss": 0.69508278, "num_input_tokens_seen": 44240575, "step": 2046, "time_per_iteration": 2.6377944946289062 }, { "auxiliary_loss_clip": 0.01173803, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.06010675, "balance_loss_mlp": 1.03202295, "epoch": 0.12307229821133324, "flos": 22378811783040.0, "grad_norm": 4.982373490718116, "language_loss": 0.72730684, "learning_rate": 3.909817915225297e-06, "loss": 0.74958241, "num_input_tokens_seen": 44257145, "step": 2047, "time_per_iteration": 2.5791239738464355 }, { "auxiliary_loss_clip": 0.01155159, "auxiliary_loss_mlp": 0.01060632, "balance_loss_clip": 1.05398846, "balance_loss_mlp": 1.03817296, "epoch": 0.1231324214640012, "flos": 23367396873600.0, "grad_norm": 1.8194194024321948, "language_loss": 0.76583183, "learning_rate": 3.909702248319597e-06, "loss": 0.78798974, "num_input_tokens_seen": 44278035, "step": 2048, "time_per_iteration": 2.6997592449188232 }, { "auxiliary_loss_clip": 0.01146796, "auxiliary_loss_mlp": 0.01047309, "balance_loss_clip": 1.05524468, "balance_loss_mlp": 1.02798486, "epoch": 0.12319254471666917, "flos": 23767154311680.0, "grad_norm": 1.8097490634569602, "language_loss": 0.85359102, "learning_rate": 3.909586508997797e-06, "loss": 0.87553203, "num_input_tokens_seen": 44296980, "step": 2049, "time_per_iteration": 2.739617109298706 }, { "auxiliary_loss_clip": 0.01120276, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.0533725, "balance_loss_mlp": 1.02887857, "epoch": 0.12325266796933713, "flos": 23550146294400.0, "grad_norm": 2.6582136339172724, "language_loss": 0.75563407, "learning_rate": 3.909470697264285e-06, "loss": 0.77733827, "num_input_tokens_seen": 44318005, "step": 2050, "time_per_iteration": 2.7814078330993652 }, { "auxiliary_loss_clip": 0.01138568, "auxiliary_loss_mlp": 0.01057939, "balance_loss_clip": 1.05428278, "balance_loss_mlp": 1.03608823, "epoch": 0.12331279122200511, "flos": 24423996366720.0, "grad_norm": 1.81408967902731, "language_loss": 0.81166679, "learning_rate": 3.909354813123452e-06, "loss": 0.83363187, "num_input_tokens_seen": 44335260, "step": 2051, "time_per_iteration": 2.7555224895477295 }, { "auxiliary_loss_clip": 0.01171646, "auxiliary_loss_mlp": 0.00779218, "balance_loss_clip": 1.05882978, "balance_loss_mlp": 0.99996465, "epoch": 0.12337291447467308, "flos": 25484294960640.0, "grad_norm": 1.8885516327307212, "language_loss": 0.80445349, "learning_rate": 3.909238856579693e-06, "loss": 0.82396215, "num_input_tokens_seen": 44355315, "step": 2052, "time_per_iteration": 2.7676405906677246 }, { "auxiliary_loss_clip": 0.01165489, "auxiliary_loss_mlp": 0.010569, "balance_loss_clip": 1.0581975, "balance_loss_mlp": 1.03537059, "epoch": 0.12343303772734104, "flos": 23550002640000.0, "grad_norm": 2.171205541070781, "language_loss": 0.73676848, "learning_rate": 3.909122827637406e-06, "loss": 0.75899243, "num_input_tokens_seen": 44373020, "step": 2053, "time_per_iteration": 2.648609161376953 }, { "auxiliary_loss_clip": 0.01168883, "auxiliary_loss_mlp": 0.00778478, "balance_loss_clip": 1.05302441, "balance_loss_mlp": 0.99995315, "epoch": 0.12349316098000902, "flos": 47557074867840.0, "grad_norm": 1.5051513438882418, "language_loss": 0.7413671, "learning_rate": 3.909006726300991e-06, "loss": 0.76084077, "num_input_tokens_seen": 44397525, "step": 2054, "time_per_iteration": 2.871469020843506 }, { "auxiliary_loss_clip": 0.01147607, "auxiliary_loss_mlp": 0.01044612, "balance_loss_clip": 1.05402803, "balance_loss_mlp": 1.02482307, "epoch": 0.12355328423267699, "flos": 25045969294080.0, "grad_norm": 4.50189877271012, "language_loss": 0.85417157, "learning_rate": 3.908890552574849e-06, "loss": 0.8760938, "num_input_tokens_seen": 44415890, "step": 2055, "time_per_iteration": 2.7136077880859375 }, { "auxiliary_loss_clip": 0.01133829, "auxiliary_loss_mlp": 0.01047458, "balance_loss_clip": 1.05999517, "balance_loss_mlp": 1.02802706, "epoch": 0.12361340748534495, "flos": 27709140395520.0, "grad_norm": 2.0629908776416688, "language_loss": 0.77506042, "learning_rate": 3.908774306463384e-06, "loss": 0.79687333, "num_input_tokens_seen": 44436625, "step": 2056, "time_per_iteration": 2.83107852935791 }, { "auxiliary_loss_clip": 0.01158234, "auxiliary_loss_mlp": 0.01055, "balance_loss_clip": 1.05444396, "balance_loss_mlp": 1.03405499, "epoch": 0.12367353073801293, "flos": 26140598311680.0, "grad_norm": 1.9893743253373262, "language_loss": 0.83361745, "learning_rate": 3.908657987971009e-06, "loss": 0.85574985, "num_input_tokens_seen": 44455265, "step": 2057, "time_per_iteration": 2.6987085342407227 }, { "auxiliary_loss_clip": 0.01141319, "auxiliary_loss_mlp": 0.01051708, "balance_loss_clip": 1.05057144, "balance_loss_mlp": 1.02991605, "epoch": 0.1237336539906809, "flos": 25156035544320.0, "grad_norm": 1.4905135493793764, "language_loss": 0.77818203, "learning_rate": 3.90854159710213e-06, "loss": 0.80011231, "num_input_tokens_seen": 44475815, "step": 2058, "time_per_iteration": 2.7149016857147217 }, { "auxiliary_loss_clip": 0.01138087, "auxiliary_loss_mlp": 0.01058134, "balance_loss_clip": 1.05117273, "balance_loss_mlp": 1.03482866, "epoch": 0.12379377724334886, "flos": 15304589867520.0, "grad_norm": 1.8387803476985631, "language_loss": 0.8342883, "learning_rate": 3.9084251338611624e-06, "loss": 0.85625052, "num_input_tokens_seen": 44494045, "step": 2059, "time_per_iteration": 2.7030091285705566 }, { "auxiliary_loss_clip": 0.01133517, "auxiliary_loss_mlp": 0.01057399, "balance_loss_clip": 1.05123472, "balance_loss_mlp": 1.03445077, "epoch": 0.12385390049601683, "flos": 21316717509120.0, "grad_norm": 2.7478129466394217, "language_loss": 0.81420219, "learning_rate": 3.908308598252523e-06, "loss": 0.83611137, "num_input_tokens_seen": 44509120, "step": 2060, "time_per_iteration": 2.738499402999878 }, { "auxiliary_loss_clip": 0.01150334, "auxiliary_loss_mlp": 0.01054424, "balance_loss_clip": 1.05367386, "balance_loss_mlp": 1.0315125, "epoch": 0.1239140237486848, "flos": 15116309752320.0, "grad_norm": 1.8699548955873522, "language_loss": 0.86224365, "learning_rate": 3.9081919902806306e-06, "loss": 0.88429129, "num_input_tokens_seen": 44525780, "step": 2061, "time_per_iteration": 2.6492960453033447 }, { "auxiliary_loss_clip": 0.0115523, "auxiliary_loss_mlp": 0.01050307, "balance_loss_clip": 1.05506253, "balance_loss_mlp": 1.03031528, "epoch": 0.12397414700135277, "flos": 21976791788160.0, "grad_norm": 2.006361909654615, "language_loss": 0.84949362, "learning_rate": 3.908075309949906e-06, "loss": 0.87154901, "num_input_tokens_seen": 44543125, "step": 2062, "time_per_iteration": 2.5925393104553223 }, { "auxiliary_loss_clip": 0.01124676, "auxiliary_loss_mlp": 0.01058304, "balance_loss_clip": 1.05198252, "balance_loss_mlp": 1.03498697, "epoch": 0.12403427025402074, "flos": 13400892956160.0, "grad_norm": 1.6181471799462952, "language_loss": 0.78765064, "learning_rate": 3.907958557264774e-06, "loss": 0.80948043, "num_input_tokens_seen": 44560275, "step": 2063, "time_per_iteration": 2.7551674842834473 }, { "auxiliary_loss_clip": 0.01124369, "auxiliary_loss_mlp": 0.01057465, "balance_loss_clip": 1.05492854, "balance_loss_mlp": 1.03450513, "epoch": 0.12409439350668872, "flos": 15304374385920.0, "grad_norm": 2.9315517002695017, "language_loss": 0.79452097, "learning_rate": 3.907841732229663e-06, "loss": 0.81633931, "num_input_tokens_seen": 44577640, "step": 2064, "time_per_iteration": 2.699711322784424 }, { "auxiliary_loss_clip": 0.01144709, "auxiliary_loss_mlp": 0.01058768, "balance_loss_clip": 1.05316699, "balance_loss_mlp": 1.03847849, "epoch": 0.12415451675935668, "flos": 25009376313600.0, "grad_norm": 2.5611248351266016, "language_loss": 0.92676973, "learning_rate": 3.907724834849002e-06, "loss": 0.9488045, "num_input_tokens_seen": 44594860, "step": 2065, "time_per_iteration": 2.7114996910095215 }, { "auxiliary_loss_clip": 0.01147841, "auxiliary_loss_mlp": 0.01052058, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.02943158, "epoch": 0.12421464001202465, "flos": 23659673840640.0, "grad_norm": 1.7498294279318665, "language_loss": 0.80540735, "learning_rate": 3.907607865127225e-06, "loss": 0.82740629, "num_input_tokens_seen": 44614780, "step": 2066, "time_per_iteration": 2.6958389282226562 }, { "auxiliary_loss_clip": 0.01030831, "auxiliary_loss_mlp": 0.01051436, "balance_loss_clip": 1.02768898, "balance_loss_mlp": 1.04884958, "epoch": 0.12427476326469263, "flos": 65732904345600.0, "grad_norm": 0.8715885531008962, "language_loss": 0.63299954, "learning_rate": 3.907490823068766e-06, "loss": 0.6538223, "num_input_tokens_seen": 44671240, "step": 2067, "time_per_iteration": 3.200000762939453 }, { "auxiliary_loss_clip": 0.01117858, "auxiliary_loss_mlp": 0.01057985, "balance_loss_clip": 1.04878855, "balance_loss_mlp": 1.0344646, "epoch": 0.12433488651736059, "flos": 24535427333760.0, "grad_norm": 1.9218217735084064, "language_loss": 0.93783462, "learning_rate": 3.907373708678063e-06, "loss": 0.959593, "num_input_tokens_seen": 44691050, "step": 2068, "time_per_iteration": 2.7631025314331055 }, { "auxiliary_loss_clip": 0.01166393, "auxiliary_loss_mlp": 0.0105657, "balance_loss_clip": 1.05994427, "balance_loss_mlp": 1.03697169, "epoch": 0.12439500977002856, "flos": 21031659175680.0, "grad_norm": 1.8717926968048342, "language_loss": 0.80861229, "learning_rate": 3.9072565219595596e-06, "loss": 0.83084196, "num_input_tokens_seen": 44709850, "step": 2069, "time_per_iteration": 2.6630098819732666 }, { "auxiliary_loss_clip": 0.01113262, "auxiliary_loss_mlp": 0.01062592, "balance_loss_clip": 1.04863238, "balance_loss_mlp": 1.03963184, "epoch": 0.12445513302269653, "flos": 26830621555200.0, "grad_norm": 1.5649570979854035, "language_loss": 0.777978, "learning_rate": 3.907139262917696e-06, "loss": 0.79973656, "num_input_tokens_seen": 44731475, "step": 2070, "time_per_iteration": 2.7750463485717773 }, { "auxiliary_loss_clip": 0.01156875, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05520415, "balance_loss_mlp": 1.03055048, "epoch": 0.1245152562753645, "flos": 18368919037440.0, "grad_norm": 2.2051981544638166, "language_loss": 0.80743957, "learning_rate": 3.907021931556922e-06, "loss": 0.8295334, "num_input_tokens_seen": 44749685, "step": 2071, "time_per_iteration": 2.654171943664551 }, { "auxiliary_loss_clip": 0.01154683, "auxiliary_loss_mlp": 0.01055767, "balance_loss_clip": 1.05492425, "balance_loss_mlp": 1.03405952, "epoch": 0.12457537952803246, "flos": 33107986200960.0, "grad_norm": 2.118828414072521, "language_loss": 0.78278041, "learning_rate": 3.906904527881684e-06, "loss": 0.80488491, "num_input_tokens_seen": 44772165, "step": 2072, "time_per_iteration": 2.753159284591675 }, { "auxiliary_loss_clip": 0.0114568, "auxiliary_loss_mlp": 0.01055287, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.03381729, "epoch": 0.12463550278070043, "flos": 22270217990400.0, "grad_norm": 7.360489773093417, "language_loss": 0.752267, "learning_rate": 3.9067870518964355e-06, "loss": 0.77427667, "num_input_tokens_seen": 44790580, "step": 2073, "time_per_iteration": 2.6561899185180664 }, { "auxiliary_loss_clip": 0.01096485, "auxiliary_loss_mlp": 0.01053193, "balance_loss_clip": 1.04471385, "balance_loss_mlp": 1.03086543, "epoch": 0.12469562603336841, "flos": 14679025580160.0, "grad_norm": 1.9234955386089483, "language_loss": 0.90560025, "learning_rate": 3.906669503605631e-06, "loss": 0.92709696, "num_input_tokens_seen": 44806730, "step": 2074, "time_per_iteration": 2.7846343517303467 }, { "auxiliary_loss_clip": 0.01105332, "auxiliary_loss_mlp": 0.01056651, "balance_loss_clip": 1.04977274, "balance_loss_mlp": 1.03346491, "epoch": 0.12475574928603637, "flos": 24644775312000.0, "grad_norm": 2.8321626325497493, "language_loss": 0.83836985, "learning_rate": 3.906551883013728e-06, "loss": 0.8599897, "num_input_tokens_seen": 44825550, "step": 2075, "time_per_iteration": 4.412928342819214 }, { "auxiliary_loss_clip": 0.01107078, "auxiliary_loss_mlp": 0.01062819, "balance_loss_clip": 1.04380202, "balance_loss_mlp": 1.03972864, "epoch": 0.12481587253870434, "flos": 21762980081280.0, "grad_norm": 2.042892519020311, "language_loss": 0.73648787, "learning_rate": 3.9064341901251865e-06, "loss": 0.75818682, "num_input_tokens_seen": 44844155, "step": 2076, "time_per_iteration": 5.925223112106323 }, { "auxiliary_loss_clip": 0.01101731, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.04774427, "balance_loss_mlp": 1.02751708, "epoch": 0.12487599579137232, "flos": 21432529935360.0, "grad_norm": 1.8779339700875872, "language_loss": 0.7622484, "learning_rate": 3.906316424944469e-06, "loss": 0.78374755, "num_input_tokens_seen": 44863780, "step": 2077, "time_per_iteration": 2.70566987991333 }, { "auxiliary_loss_clip": 0.01156274, "auxiliary_loss_mlp": 0.01062042, "balance_loss_clip": 1.05365288, "balance_loss_mlp": 1.04001164, "epoch": 0.12493611904404028, "flos": 16107624276480.0, "grad_norm": 2.022280968605665, "language_loss": 0.82290226, "learning_rate": 3.906198587476043e-06, "loss": 0.84508544, "num_input_tokens_seen": 44881480, "step": 2078, "time_per_iteration": 4.302385568618774 }, { "auxiliary_loss_clip": 0.01144821, "auxiliary_loss_mlp": 0.01050482, "balance_loss_clip": 1.05281842, "balance_loss_mlp": 1.02855957, "epoch": 0.12499624229670825, "flos": 21580266574080.0, "grad_norm": 1.6413520418295044, "language_loss": 0.75195324, "learning_rate": 3.906080677724374e-06, "loss": 0.77390629, "num_input_tokens_seen": 44900390, "step": 2079, "time_per_iteration": 2.6915946006774902 }, { "auxiliary_loss_clip": 0.01166758, "auxiliary_loss_mlp": 0.01058474, "balance_loss_clip": 1.05881989, "balance_loss_mlp": 1.03696847, "epoch": 0.1250563655493762, "flos": 25699040421120.0, "grad_norm": 6.733284446627088, "language_loss": 0.83874094, "learning_rate": 3.905962695693935e-06, "loss": 0.86099327, "num_input_tokens_seen": 44920375, "step": 2080, "time_per_iteration": 2.7467572689056396 }, { "auxiliary_loss_clip": 0.01156163, "auxiliary_loss_mlp": 0.01059409, "balance_loss_clip": 1.05525088, "balance_loss_mlp": 1.03885686, "epoch": 0.12511648880204418, "flos": 16909509450240.0, "grad_norm": 1.8581885454518776, "language_loss": 0.84644079, "learning_rate": 3.9058446413892e-06, "loss": 0.86859655, "num_input_tokens_seen": 44938415, "step": 2081, "time_per_iteration": 2.685875654220581 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046398, "balance_loss_clip": 1.05375946, "balance_loss_mlp": 1.02594149, "epoch": 0.12517661205471217, "flos": 17567500740480.0, "grad_norm": 1.8191819349610059, "language_loss": 0.76739037, "learning_rate": 3.905726514814646e-06, "loss": 0.78942269, "num_input_tokens_seen": 44957135, "step": 2082, "time_per_iteration": 2.6133053302764893 }, { "auxiliary_loss_clip": 0.01152911, "auxiliary_loss_mlp": 0.0104632, "balance_loss_clip": 1.05701911, "balance_loss_mlp": 1.02463615, "epoch": 0.12523673530738014, "flos": 16033791870720.0, "grad_norm": 2.5415589476696265, "language_loss": 0.79044539, "learning_rate": 3.9056083159747495e-06, "loss": 0.81243765, "num_input_tokens_seen": 44974480, "step": 2083, "time_per_iteration": 2.6963307857513428 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01047351, "balance_loss_clip": 1.05509973, "balance_loss_mlp": 1.02421284, "epoch": 0.1252968585600481, "flos": 18807747494400.0, "grad_norm": 2.1696249857299, "language_loss": 0.89831448, "learning_rate": 3.9054900448739966e-06, "loss": 0.92026675, "num_input_tokens_seen": 44990310, "step": 2084, "time_per_iteration": 2.6770403385162354 }, { "auxiliary_loss_clip": 0.01131068, "auxiliary_loss_mlp": 0.01048299, "balance_loss_clip": 1.05299771, "balance_loss_mlp": 1.02729464, "epoch": 0.12535698181271607, "flos": 27271568914560.0, "grad_norm": 1.8896331095253402, "language_loss": 0.80354226, "learning_rate": 3.905371701516869e-06, "loss": 0.82533598, "num_input_tokens_seen": 45010720, "step": 2085, "time_per_iteration": 2.749783515930176 }, { "auxiliary_loss_clip": 0.01170318, "auxiliary_loss_mlp": 0.01051018, "balance_loss_clip": 1.05725896, "balance_loss_mlp": 1.03001356, "epoch": 0.12541710506538403, "flos": 22054107813120.0, "grad_norm": 1.8300316094254767, "language_loss": 0.88228154, "learning_rate": 3.905253285907856e-06, "loss": 0.90449488, "num_input_tokens_seen": 45030360, "step": 2086, "time_per_iteration": 2.603515148162842 }, { "auxiliary_loss_clip": 0.01134598, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.05278981, "balance_loss_mlp": 1.02522027, "epoch": 0.125477228318052, "flos": 12603173760000.0, "grad_norm": 2.0471238132540344, "language_loss": 0.86819696, "learning_rate": 3.905134798051447e-06, "loss": 0.88999224, "num_input_tokens_seen": 45045085, "step": 2087, "time_per_iteration": 2.6265859603881836 }, { "auxiliary_loss_clip": 0.01146999, "auxiliary_loss_mlp": 0.01058875, "balance_loss_clip": 1.05599046, "balance_loss_mlp": 1.03651142, "epoch": 0.12553735157071996, "flos": 23878549365120.0, "grad_norm": 2.3362397674907758, "language_loss": 0.73027468, "learning_rate": 3.905016237952136e-06, "loss": 0.75233346, "num_input_tokens_seen": 45065145, "step": 2088, "time_per_iteration": 2.65324330329895 }, { "auxiliary_loss_clip": 0.01062529, "auxiliary_loss_mlp": 0.01013405, "balance_loss_clip": 1.02985716, "balance_loss_mlp": 1.01079392, "epoch": 0.12559747482338796, "flos": 69920841830400.0, "grad_norm": 0.7742255614948045, "language_loss": 0.61767036, "learning_rate": 3.904897605614418e-06, "loss": 0.6384297, "num_input_tokens_seen": 45126230, "step": 2089, "time_per_iteration": 3.1219804286956787 }, { "auxiliary_loss_clip": 0.01149606, "auxiliary_loss_mlp": 0.01060841, "balance_loss_clip": 1.05670094, "balance_loss_mlp": 1.0388943, "epoch": 0.12565759807605592, "flos": 24279563779200.0, "grad_norm": 1.817095421446176, "language_loss": 0.7781918, "learning_rate": 3.904778901042793e-06, "loss": 0.80029625, "num_input_tokens_seen": 45145545, "step": 2090, "time_per_iteration": 2.700425863265991 }, { "auxiliary_loss_clip": 0.01046946, "auxiliary_loss_mlp": 0.01013884, "balance_loss_clip": 1.03125095, "balance_loss_mlp": 1.01101136, "epoch": 0.12571772132872389, "flos": 56451180286080.0, "grad_norm": 0.760599485634597, "language_loss": 0.59434772, "learning_rate": 3.90466012424176e-06, "loss": 0.61495602, "num_input_tokens_seen": 45206845, "step": 2091, "time_per_iteration": 3.0814294815063477 }, { "auxiliary_loss_clip": 0.01159814, "auxiliary_loss_mlp": 0.01060546, "balance_loss_clip": 1.05760789, "balance_loss_mlp": 1.041067, "epoch": 0.12577784458139185, "flos": 41245846675200.0, "grad_norm": 1.6552462178493936, "language_loss": 0.62916517, "learning_rate": 3.904541275215825e-06, "loss": 0.6513688, "num_input_tokens_seen": 45228495, "step": 2092, "time_per_iteration": 2.7813880443573 }, { "auxiliary_loss_clip": 0.01147016, "auxiliary_loss_mlp": 0.01061963, "balance_loss_clip": 1.05395663, "balance_loss_mlp": 1.04069614, "epoch": 0.12583796783405982, "flos": 19755501799680.0, "grad_norm": 2.279616692029291, "language_loss": 0.80507946, "learning_rate": 3.904422353969493e-06, "loss": 0.82716924, "num_input_tokens_seen": 45245720, "step": 2093, "time_per_iteration": 2.6768014430999756 }, { "auxiliary_loss_clip": 0.01146976, "auxiliary_loss_mlp": 0.01075616, "balance_loss_clip": 1.0524025, "balance_loss_mlp": 1.05380058, "epoch": 0.12589809108672778, "flos": 22602104680320.0, "grad_norm": 1.7347385846840702, "language_loss": 0.76003867, "learning_rate": 3.904303360507276e-06, "loss": 0.78226459, "num_input_tokens_seen": 45265650, "step": 2094, "time_per_iteration": 2.6730611324310303 }, { "auxiliary_loss_clip": 0.01117887, "auxiliary_loss_mlp": 0.01069309, "balance_loss_clip": 1.0500071, "balance_loss_mlp": 1.04892457, "epoch": 0.12595821433939577, "flos": 45222845541120.0, "grad_norm": 1.5703706409155747, "language_loss": 0.76664734, "learning_rate": 3.9041842948336835e-06, "loss": 0.78851926, "num_input_tokens_seen": 45287790, "step": 2095, "time_per_iteration": 2.958367109298706 }, { "auxiliary_loss_clip": 0.01147751, "auxiliary_loss_mlp": 0.01058477, "balance_loss_clip": 1.05202031, "balance_loss_mlp": 1.03782988, "epoch": 0.12601833759206374, "flos": 14319811618560.0, "grad_norm": 2.2556524892449326, "language_loss": 0.83266854, "learning_rate": 3.904065156953232e-06, "loss": 0.85473078, "num_input_tokens_seen": 45305720, "step": 2096, "time_per_iteration": 2.7097342014312744 }, { "auxiliary_loss_clip": 0.01163652, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05806553, "balance_loss_mlp": 1.03577375, "epoch": 0.1260784608447317, "flos": 21288241002240.0, "grad_norm": 1.7589400475615893, "language_loss": 0.75478256, "learning_rate": 3.903945946870439e-06, "loss": 0.77698463, "num_input_tokens_seen": 45325290, "step": 2097, "time_per_iteration": 2.642056703567505 }, { "auxiliary_loss_clip": 0.01156719, "auxiliary_loss_mlp": 0.01063976, "balance_loss_clip": 1.05648863, "balance_loss_mlp": 1.04527175, "epoch": 0.12613858409739967, "flos": 26251311006720.0, "grad_norm": 1.8828235460619742, "language_loss": 0.87110066, "learning_rate": 3.9038266645898246e-06, "loss": 0.89330757, "num_input_tokens_seen": 45344465, "step": 2098, "time_per_iteration": 2.63826584815979 }, { "auxiliary_loss_clip": 0.01117414, "auxiliary_loss_mlp": 0.01058025, "balance_loss_clip": 1.04983974, "balance_loss_mlp": 1.03475559, "epoch": 0.12619870735006763, "flos": 21579979265280.0, "grad_norm": 1.8855647331078333, "language_loss": 0.69494271, "learning_rate": 3.903707310115912e-06, "loss": 0.7166971, "num_input_tokens_seen": 45362465, "step": 2099, "time_per_iteration": 2.7813057899475098 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01061431, "balance_loss_clip": 1.04979372, "balance_loss_mlp": 1.03923464, "epoch": 0.1262588306027356, "flos": 23367037737600.0, "grad_norm": 2.0457253500590498, "language_loss": 0.81949925, "learning_rate": 3.903587883453228e-06, "loss": 0.84154058, "num_input_tokens_seen": 45382700, "step": 2100, "time_per_iteration": 2.704871416091919 }, { "auxiliary_loss_clip": 0.01159613, "auxiliary_loss_mlp": 0.01055067, "balance_loss_clip": 1.0620985, "balance_loss_mlp": 1.03408623, "epoch": 0.12631895385540357, "flos": 23949185460480.0, "grad_norm": 1.7810176086536167, "language_loss": 0.80399859, "learning_rate": 3.903468384606302e-06, "loss": 0.82614541, "num_input_tokens_seen": 45401005, "step": 2101, "time_per_iteration": 2.7071452140808105 }, { "auxiliary_loss_clip": 0.0106985, "auxiliary_loss_mlp": 0.01010859, "balance_loss_clip": 1.02823138, "balance_loss_mlp": 1.00803375, "epoch": 0.12637907710807156, "flos": 70282138780800.0, "grad_norm": 0.7128618749962091, "language_loss": 0.57087427, "learning_rate": 3.903348813579662e-06, "loss": 0.59168136, "num_input_tokens_seen": 45466555, "step": 2102, "time_per_iteration": 3.20320987701416 }, { "auxiliary_loss_clip": 0.01140495, "auxiliary_loss_mlp": 0.01056574, "balance_loss_clip": 1.053671, "balance_loss_mlp": 1.03661788, "epoch": 0.12643920036073952, "flos": 18915084311040.0, "grad_norm": 2.0306165352193988, "language_loss": 0.93653679, "learning_rate": 3.903229170377845e-06, "loss": 0.95850742, "num_input_tokens_seen": 45485165, "step": 2103, "time_per_iteration": 2.6628894805908203 }, { "auxiliary_loss_clip": 0.01144405, "auxiliary_loss_mlp": 0.01040745, "balance_loss_clip": 1.04991472, "balance_loss_mlp": 1.02174282, "epoch": 0.1264993236134075, "flos": 27782470010880.0, "grad_norm": 1.5962316578756222, "language_loss": 0.7804662, "learning_rate": 3.903109455005387e-06, "loss": 0.80231774, "num_input_tokens_seen": 45504630, "step": 2104, "time_per_iteration": 2.6215474605560303 }, { "auxiliary_loss_clip": 0.01135927, "auxiliary_loss_mlp": 0.01056343, "balance_loss_clip": 1.05414486, "balance_loss_mlp": 1.03683996, "epoch": 0.12655944686607545, "flos": 24754697907840.0, "grad_norm": 1.7362499149688688, "language_loss": 0.80728614, "learning_rate": 3.902989667466828e-06, "loss": 0.82920885, "num_input_tokens_seen": 45524885, "step": 2105, "time_per_iteration": 2.74128794670105 }, { "auxiliary_loss_clip": 0.01162904, "auxiliary_loss_mlp": 0.01056367, "balance_loss_clip": 1.05482686, "balance_loss_mlp": 1.03514743, "epoch": 0.12661957011874342, "flos": 24133048202880.0, "grad_norm": 1.9810187943106816, "language_loss": 0.83402872, "learning_rate": 3.90286980776671e-06, "loss": 0.85622144, "num_input_tokens_seen": 45545000, "step": 2106, "time_per_iteration": 2.676694631576538 }, { "auxiliary_loss_clip": 0.01126632, "auxiliary_loss_mlp": 0.01052067, "balance_loss_clip": 1.05697966, "balance_loss_mlp": 1.03147984, "epoch": 0.12667969337141138, "flos": 24569614103040.0, "grad_norm": 1.6951691508845637, "language_loss": 0.73469931, "learning_rate": 3.902749875909578e-06, "loss": 0.7564863, "num_input_tokens_seen": 45564210, "step": 2107, "time_per_iteration": 2.7506372928619385 }, { "auxiliary_loss_clip": 0.01162931, "auxiliary_loss_mlp": 0.01044317, "balance_loss_clip": 1.05320692, "balance_loss_mlp": 1.02599406, "epoch": 0.12673981662407935, "flos": 22961677777920.0, "grad_norm": 2.0116792159666477, "language_loss": 0.79395336, "learning_rate": 3.90262987189998e-06, "loss": 0.81602579, "num_input_tokens_seen": 45583030, "step": 2108, "time_per_iteration": 2.6611146926879883 }, { "auxiliary_loss_clip": 0.01168073, "auxiliary_loss_mlp": 0.01049192, "balance_loss_clip": 1.05300844, "balance_loss_mlp": 1.02945089, "epoch": 0.12679993987674734, "flos": 17274864637440.0, "grad_norm": 1.9298328790617403, "language_loss": 0.7561394, "learning_rate": 3.902509795742467e-06, "loss": 0.77831209, "num_input_tokens_seen": 45602265, "step": 2109, "time_per_iteration": 2.5963573455810547 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.01053822, "balance_loss_clip": 1.04636049, "balance_loss_mlp": 1.0335331, "epoch": 0.1268600631294153, "flos": 17275080119040.0, "grad_norm": 1.6171901700648081, "language_loss": 0.82806516, "learning_rate": 3.902389647441592e-06, "loss": 0.84971368, "num_input_tokens_seen": 45620595, "step": 2110, "time_per_iteration": 2.6745550632476807 }, { "auxiliary_loss_clip": 0.01145969, "auxiliary_loss_mlp": 0.00778071, "balance_loss_clip": 1.05419564, "balance_loss_mlp": 0.99996144, "epoch": 0.12692018638208327, "flos": 24061047390720.0, "grad_norm": 1.6765217216011241, "language_loss": 0.78092968, "learning_rate": 3.90226942700191e-06, "loss": 0.80017006, "num_input_tokens_seen": 45641140, "step": 2111, "time_per_iteration": 2.65983510017395 }, { "auxiliary_loss_clip": 0.01130932, "auxiliary_loss_mlp": 0.01076547, "balance_loss_clip": 1.05490458, "balance_loss_mlp": 1.05352807, "epoch": 0.12698030963475124, "flos": 31831900652160.0, "grad_norm": 2.15738266202174, "language_loss": 0.77103376, "learning_rate": 3.902149134427982e-06, "loss": 0.79310858, "num_input_tokens_seen": 45662315, "step": 2112, "time_per_iteration": 2.870299816131592 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01074863, "balance_loss_clip": 1.05213726, "balance_loss_mlp": 1.05427516, "epoch": 0.1270404328874192, "flos": 25187744275200.0, "grad_norm": 1.9191529425470424, "language_loss": 0.85806453, "learning_rate": 3.902028769724367e-06, "loss": 0.88010758, "num_input_tokens_seen": 45680335, "step": 2113, "time_per_iteration": 4.26338267326355 }, { "auxiliary_loss_clip": 0.01137468, "auxiliary_loss_mlp": 0.01078067, "balance_loss_clip": 1.05511892, "balance_loss_mlp": 1.05670488, "epoch": 0.12710055614008717, "flos": 15997342544640.0, "grad_norm": 1.9721234476704599, "language_loss": 0.74027002, "learning_rate": 3.9019083328956315e-06, "loss": 0.7624253, "num_input_tokens_seen": 45696240, "step": 2114, "time_per_iteration": 2.7573230266571045 }, { "auxiliary_loss_clip": 0.01156713, "auxiliary_loss_mlp": 0.01060574, "balance_loss_clip": 1.05770111, "balance_loss_mlp": 1.03924704, "epoch": 0.12716067939275516, "flos": 15085642515840.0, "grad_norm": 1.7921743813213327, "language_loss": 0.83240676, "learning_rate": 3.901787823946341e-06, "loss": 0.85457963, "num_input_tokens_seen": 45713695, "step": 2115, "time_per_iteration": 4.1369829177856445 }, { "auxiliary_loss_clip": 0.01154653, "auxiliary_loss_mlp": 0.01065557, "balance_loss_clip": 1.05875492, "balance_loss_mlp": 1.04476702, "epoch": 0.12722080264542313, "flos": 28366736636160.0, "grad_norm": 1.4840591347809418, "language_loss": 0.87010503, "learning_rate": 3.901667242881065e-06, "loss": 0.89230716, "num_input_tokens_seen": 45736655, "step": 2116, "time_per_iteration": 2.73896861076355 }, { "auxiliary_loss_clip": 0.01139498, "auxiliary_loss_mlp": 0.00777066, "balance_loss_clip": 1.05413389, "balance_loss_mlp": 0.99995339, "epoch": 0.1272809258980911, "flos": 32379897519360.0, "grad_norm": 1.753205985010591, "language_loss": 0.70374918, "learning_rate": 3.9015465897043775e-06, "loss": 0.72291481, "num_input_tokens_seen": 45758195, "step": 2117, "time_per_iteration": 2.783156156539917 }, { "auxiliary_loss_clip": 0.01127455, "auxiliary_loss_mlp": 0.0106424, "balance_loss_clip": 1.04978406, "balance_loss_mlp": 1.04068434, "epoch": 0.12734104915075906, "flos": 16034402401920.0, "grad_norm": 1.9957647698478755, "language_loss": 0.86237884, "learning_rate": 3.901425864420852e-06, "loss": 0.8842957, "num_input_tokens_seen": 45774280, "step": 2118, "time_per_iteration": 4.322036266326904 }, { "auxiliary_loss_clip": 0.01161417, "auxiliary_loss_mlp": 0.01049008, "balance_loss_clip": 1.05827069, "balance_loss_mlp": 1.02951694, "epoch": 0.12740117240342702, "flos": 18260325244800.0, "grad_norm": 1.705293179953873, "language_loss": 0.87577266, "learning_rate": 3.901305067035068e-06, "loss": 0.89787692, "num_input_tokens_seen": 45792760, "step": 2119, "time_per_iteration": 2.6559741497039795 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.0077754, "balance_loss_clip": 1.05233431, "balance_loss_mlp": 0.99984539, "epoch": 0.127461295656095, "flos": 12121790664960.0, "grad_norm": 2.05013605026053, "language_loss": 0.87824571, "learning_rate": 3.901184197551605e-06, "loss": 0.89747536, "num_input_tokens_seen": 45804300, "step": 2120, "time_per_iteration": 2.6154048442840576 }, { "auxiliary_loss_clip": 0.01170497, "auxiliary_loss_mlp": 0.01046075, "balance_loss_clip": 1.05822706, "balance_loss_mlp": 1.02626204, "epoch": 0.12752141890876295, "flos": 23149095966720.0, "grad_norm": 1.9784951602308867, "language_loss": 0.75584805, "learning_rate": 3.901063255975046e-06, "loss": 0.77801377, "num_input_tokens_seen": 45823780, "step": 2121, "time_per_iteration": 2.579265832901001 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01047949, "balance_loss_clip": 1.04741263, "balance_loss_mlp": 1.02727842, "epoch": 0.12758154216143094, "flos": 21615997628160.0, "grad_norm": 2.0293629108662405, "language_loss": 0.82732606, "learning_rate": 3.900942242309978e-06, "loss": 0.84893048, "num_input_tokens_seen": 45840495, "step": 2122, "time_per_iteration": 2.793870210647583 }, { "auxiliary_loss_clip": 0.01151713, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05901408, "balance_loss_mlp": 1.02983987, "epoch": 0.1276416654140989, "flos": 15924874855680.0, "grad_norm": 1.7660235451894624, "language_loss": 0.78699338, "learning_rate": 3.90082115656099e-06, "loss": 0.80900776, "num_input_tokens_seen": 45857735, "step": 2123, "time_per_iteration": 2.70546293258667 }, { "auxiliary_loss_clip": 0.01172823, "auxiliary_loss_mlp": 0.01055328, "balance_loss_clip": 1.05931985, "balance_loss_mlp": 1.03478789, "epoch": 0.12770178866676687, "flos": 22382690451840.0, "grad_norm": 1.5643885422181942, "language_loss": 0.78931451, "learning_rate": 3.900699998732673e-06, "loss": 0.81159604, "num_input_tokens_seen": 45876485, "step": 2124, "time_per_iteration": 2.661712408065796 }, { "auxiliary_loss_clip": 0.01160474, "auxiliary_loss_mlp": 0.00776885, "balance_loss_clip": 1.05457389, "balance_loss_mlp": 0.99987447, "epoch": 0.12776191191943484, "flos": 21652482867840.0, "grad_norm": 1.9695028631977674, "language_loss": 0.75605726, "learning_rate": 3.900578768829623e-06, "loss": 0.7754308, "num_input_tokens_seen": 45894645, "step": 2125, "time_per_iteration": 2.696021556854248 }, { "auxiliary_loss_clip": 0.01158163, "auxiliary_loss_mlp": 0.00777059, "balance_loss_clip": 1.05398965, "balance_loss_mlp": 1.00002348, "epoch": 0.1278220351721028, "flos": 25735561574400.0, "grad_norm": 3.019802885219414, "language_loss": 0.78016824, "learning_rate": 3.900457466856434e-06, "loss": 0.79952049, "num_input_tokens_seen": 45913755, "step": 2126, "time_per_iteration": 2.721435308456421 }, { "auxiliary_loss_clip": 0.01124637, "auxiliary_loss_mlp": 0.010537, "balance_loss_clip": 1.05406642, "balance_loss_mlp": 1.03504348, "epoch": 0.12788215842477077, "flos": 41243224982400.0, "grad_norm": 1.3825945270792501, "language_loss": 0.6927852, "learning_rate": 3.9003360928177085e-06, "loss": 0.71456861, "num_input_tokens_seen": 45936095, "step": 2127, "time_per_iteration": 2.902101993560791 }, { "auxiliary_loss_clip": 0.01030231, "auxiliary_loss_mlp": 0.00759051, "balance_loss_clip": 1.02830005, "balance_loss_mlp": 1.00050259, "epoch": 0.12794228167743876, "flos": 70877430881280.0, "grad_norm": 0.853491438999862, "language_loss": 0.62831402, "learning_rate": 3.900214646718047e-06, "loss": 0.64620686, "num_input_tokens_seen": 46004655, "step": 2128, "time_per_iteration": 3.3387396335601807 }, { "auxiliary_loss_clip": 0.01145823, "auxiliary_loss_mlp": 0.01047815, "balance_loss_clip": 1.05080712, "balance_loss_mlp": 1.02599955, "epoch": 0.12800240493010673, "flos": 16289727252480.0, "grad_norm": 2.066959353069841, "language_loss": 0.77626479, "learning_rate": 3.900093128562056e-06, "loss": 0.7982012, "num_input_tokens_seen": 46023610, "step": 2129, "time_per_iteration": 2.611309766769409 }, { "auxiliary_loss_clip": 0.01122914, "auxiliary_loss_mlp": 0.01052577, "balance_loss_clip": 1.05058527, "balance_loss_mlp": 1.03029668, "epoch": 0.1280625281827747, "flos": 20631542601600.0, "grad_norm": 2.1214737401843893, "language_loss": 0.79263359, "learning_rate": 3.899971538354343e-06, "loss": 0.81438851, "num_input_tokens_seen": 46041725, "step": 2130, "time_per_iteration": 2.753243923187256 }, { "auxiliary_loss_clip": 0.01139626, "auxiliary_loss_mlp": 0.01052453, "balance_loss_clip": 1.05133748, "balance_loss_mlp": 1.03147244, "epoch": 0.12812265143544266, "flos": 22638230784000.0, "grad_norm": 1.7780274650921335, "language_loss": 0.70945668, "learning_rate": 3.899849876099518e-06, "loss": 0.73137754, "num_input_tokens_seen": 46061095, "step": 2131, "time_per_iteration": 2.6809306144714355 }, { "auxiliary_loss_clip": 0.01102824, "auxiliary_loss_mlp": 0.01052393, "balance_loss_clip": 1.04982638, "balance_loss_mlp": 1.03163886, "epoch": 0.12818277468811062, "flos": 34714701463680.0, "grad_norm": 2.2916674504462655, "language_loss": 0.72298968, "learning_rate": 3.899728141802197e-06, "loss": 0.74454176, "num_input_tokens_seen": 46082670, "step": 2132, "time_per_iteration": 2.8769233226776123 }, { "auxiliary_loss_clip": 0.01102594, "auxiliary_loss_mlp": 0.01055993, "balance_loss_clip": 1.04384947, "balance_loss_mlp": 1.03348672, "epoch": 0.1282428979407786, "flos": 23112107936640.0, "grad_norm": 2.0316054281953155, "language_loss": 0.82128644, "learning_rate": 3.8996063354669935e-06, "loss": 0.84287226, "num_input_tokens_seen": 46102410, "step": 2133, "time_per_iteration": 2.766897678375244 }, { "auxiliary_loss_clip": 0.01163396, "auxiliary_loss_mlp": 0.01057069, "balance_loss_clip": 1.05397773, "balance_loss_mlp": 1.03458595, "epoch": 0.12830302119344655, "flos": 20886508316160.0, "grad_norm": 3.232115826630309, "language_loss": 0.80001891, "learning_rate": 3.899484457098528e-06, "loss": 0.82222354, "num_input_tokens_seen": 46121145, "step": 2134, "time_per_iteration": 2.6347672939300537 }, { "auxiliary_loss_clip": 0.01159056, "auxiliary_loss_mlp": 0.01046209, "balance_loss_clip": 1.05907345, "balance_loss_mlp": 1.02614641, "epoch": 0.12836314444611455, "flos": 21397768548480.0, "grad_norm": 1.731952504909339, "language_loss": 0.82657921, "learning_rate": 3.899362506701421e-06, "loss": 0.84863198, "num_input_tokens_seen": 46140740, "step": 2135, "time_per_iteration": 2.6393656730651855 }, { "auxiliary_loss_clip": 0.0114208, "auxiliary_loss_mlp": 0.0105553, "balance_loss_clip": 1.05345035, "balance_loss_mlp": 1.03411996, "epoch": 0.1284232676987825, "flos": 13662466773120.0, "grad_norm": 2.1083924470752278, "language_loss": 0.7764526, "learning_rate": 3.899240484280298e-06, "loss": 0.79842871, "num_input_tokens_seen": 46156805, "step": 2136, "time_per_iteration": 2.7195920944213867 }, { "auxiliary_loss_clip": 0.01020946, "auxiliary_loss_mlp": 0.01003991, "balance_loss_clip": 1.01967573, "balance_loss_mlp": 1.00096273, "epoch": 0.12848339095145048, "flos": 59994737735040.0, "grad_norm": 0.8964253308146478, "language_loss": 0.59152198, "learning_rate": 3.899118389839785e-06, "loss": 0.61177135, "num_input_tokens_seen": 46222085, "step": 2137, "time_per_iteration": 3.416015625 }, { "auxiliary_loss_clip": 0.01153694, "auxiliary_loss_mlp": 0.01054623, "balance_loss_clip": 1.05178177, "balance_loss_mlp": 1.03483438, "epoch": 0.12854351420411844, "flos": 13881378211200.0, "grad_norm": 3.244493357011547, "language_loss": 0.82344306, "learning_rate": 3.898996223384512e-06, "loss": 0.84552622, "num_input_tokens_seen": 46239970, "step": 2138, "time_per_iteration": 2.65515398979187 }, { "auxiliary_loss_clip": 0.01159586, "auxiliary_loss_mlp": 0.01049293, "balance_loss_clip": 1.05592752, "balance_loss_mlp": 1.02665496, "epoch": 0.1286036374567864, "flos": 22637943475200.0, "grad_norm": 2.5417837252920323, "language_loss": 0.78691363, "learning_rate": 3.898873984919113e-06, "loss": 0.8090024, "num_input_tokens_seen": 46257740, "step": 2139, "time_per_iteration": 2.651132345199585 }, { "auxiliary_loss_clip": 0.01136892, "auxiliary_loss_mlp": 0.01045928, "balance_loss_clip": 1.05267286, "balance_loss_mlp": 1.02582908, "epoch": 0.12866376070945437, "flos": 16324775948160.0, "grad_norm": 1.9541049485452633, "language_loss": 0.85289955, "learning_rate": 3.8987516744482215e-06, "loss": 0.87472773, "num_input_tokens_seen": 46275445, "step": 2140, "time_per_iteration": 2.730156183242798 }, { "auxiliary_loss_clip": 0.01143134, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05203128, "balance_loss_mlp": 1.02482224, "epoch": 0.12872388396212234, "flos": 11874546374400.0, "grad_norm": 1.8185491602156885, "language_loss": 0.86268306, "learning_rate": 3.898629291976476e-06, "loss": 0.88455778, "num_input_tokens_seen": 46291710, "step": 2141, "time_per_iteration": 2.62223482131958 }, { "auxiliary_loss_clip": 0.01146971, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.0528295, "balance_loss_mlp": 1.02548814, "epoch": 0.12878400721479033, "flos": 28366700722560.0, "grad_norm": 3.1267362471736684, "language_loss": 0.68282312, "learning_rate": 3.898506837508518e-06, "loss": 0.70475101, "num_input_tokens_seen": 46311335, "step": 2142, "time_per_iteration": 2.71232271194458 }, { "auxiliary_loss_clip": 0.01165678, "auxiliary_loss_mlp": 0.0077895, "balance_loss_clip": 1.05764627, "balance_loss_mlp": 0.99990749, "epoch": 0.1288441304674583, "flos": 25885632597120.0, "grad_norm": 2.373838274123079, "language_loss": 0.83479214, "learning_rate": 3.89838431104899e-06, "loss": 0.85423845, "num_input_tokens_seen": 46330985, "step": 2143, "time_per_iteration": 2.677692174911499 }, { "auxiliary_loss_clip": 0.01175134, "auxiliary_loss_mlp": 0.00777405, "balance_loss_clip": 1.0598439, "balance_loss_mlp": 0.99994075, "epoch": 0.12890425372012626, "flos": 20813789232000.0, "grad_norm": 1.5662270309624111, "language_loss": 0.81703234, "learning_rate": 3.898261712602539e-06, "loss": 0.83655775, "num_input_tokens_seen": 46351295, "step": 2144, "time_per_iteration": 2.712620496749878 }, { "auxiliary_loss_clip": 0.01130321, "auxiliary_loss_mlp": 0.01053521, "balance_loss_clip": 1.04658103, "balance_loss_mlp": 1.03145528, "epoch": 0.12896437697279423, "flos": 22565870835840.0, "grad_norm": 1.8026346290528672, "language_loss": 0.78304374, "learning_rate": 3.898139042173813e-06, "loss": 0.80488217, "num_input_tokens_seen": 46368600, "step": 2145, "time_per_iteration": 2.6766605377197266 }, { "auxiliary_loss_clip": 0.01170585, "auxiliary_loss_mlp": 0.01047893, "balance_loss_clip": 1.0543592, "balance_loss_mlp": 1.02662635, "epoch": 0.1290245002254622, "flos": 17493776075520.0, "grad_norm": 2.147087506474235, "language_loss": 0.82865375, "learning_rate": 3.898016299767465e-06, "loss": 0.85083848, "num_input_tokens_seen": 46387370, "step": 2146, "time_per_iteration": 2.5860395431518555 }, { "auxiliary_loss_clip": 0.01141916, "auxiliary_loss_mlp": 0.0105138, "balance_loss_clip": 1.05367482, "balance_loss_mlp": 1.03062606, "epoch": 0.12908462347813016, "flos": 36315957859200.0, "grad_norm": 2.344626501147968, "language_loss": 0.71275079, "learning_rate": 3.897893485388149e-06, "loss": 0.73468375, "num_input_tokens_seen": 46409570, "step": 2147, "time_per_iteration": 2.7870359420776367 }, { "auxiliary_loss_clip": 0.01147238, "auxiliary_loss_mlp": 0.01052291, "balance_loss_clip": 1.05527067, "balance_loss_mlp": 1.03297925, "epoch": 0.12914474673079815, "flos": 22528703237760.0, "grad_norm": 2.120275205230366, "language_loss": 0.71432978, "learning_rate": 3.897770599040521e-06, "loss": 0.73632509, "num_input_tokens_seen": 46429320, "step": 2148, "time_per_iteration": 2.6865081787109375 }, { "auxiliary_loss_clip": 0.01168479, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.05762172, "balance_loss_mlp": 1.03016782, "epoch": 0.12920486998346611, "flos": 21471888263040.0, "grad_norm": 1.6388902851592406, "language_loss": 0.79064089, "learning_rate": 3.897647640729242e-06, "loss": 0.81282145, "num_input_tokens_seen": 46450155, "step": 2149, "time_per_iteration": 2.6041862964630127 }, { "auxiliary_loss_clip": 0.01159527, "auxiliary_loss_mlp": 0.01046069, "balance_loss_clip": 1.05377793, "balance_loss_mlp": 1.02531469, "epoch": 0.12926499323613408, "flos": 27308556944640.0, "grad_norm": 2.034796374339078, "language_loss": 0.75976646, "learning_rate": 3.897524610458975e-06, "loss": 0.78182244, "num_input_tokens_seen": 46470280, "step": 2150, "time_per_iteration": 2.647224187850952 }, { "auxiliary_loss_clip": 0.01155787, "auxiliary_loss_mlp": 0.01055192, "balance_loss_clip": 1.05445433, "balance_loss_mlp": 1.03491461, "epoch": 0.12932511648880204, "flos": 22091131756800.0, "grad_norm": 2.3830500835005592, "language_loss": 0.70986372, "learning_rate": 3.8974015082343835e-06, "loss": 0.73197353, "num_input_tokens_seen": 46487605, "step": 2151, "time_per_iteration": 2.7008492946624756 }, { "auxiliary_loss_clip": 0.01167835, "auxiliary_loss_mlp": 0.0104951, "balance_loss_clip": 1.05603719, "balance_loss_mlp": 1.03017378, "epoch": 0.12938523974147, "flos": 20302780394880.0, "grad_norm": 2.058334480733051, "language_loss": 0.83964819, "learning_rate": 3.897278334060137e-06, "loss": 0.86182165, "num_input_tokens_seen": 46505100, "step": 2152, "time_per_iteration": 2.6467373371124268 }, { "auxiliary_loss_clip": 0.01158553, "auxiliary_loss_mlp": 0.01058416, "balance_loss_clip": 1.05283821, "balance_loss_mlp": 1.03888893, "epoch": 0.12944536299413797, "flos": 19499961467520.0, "grad_norm": 1.5624811365269535, "language_loss": 0.78585124, "learning_rate": 3.897155087940906e-06, "loss": 0.80802095, "num_input_tokens_seen": 46524020, "step": 2153, "time_per_iteration": 4.286921262741089 }, { "auxiliary_loss_clip": 0.01113716, "auxiliary_loss_mlp": 0.00777812, "balance_loss_clip": 1.04707122, "balance_loss_mlp": 0.99989671, "epoch": 0.12950548624680594, "flos": 27707919333120.0, "grad_norm": 1.6189787343362376, "language_loss": 0.80253434, "learning_rate": 3.897031769881364e-06, "loss": 0.82144964, "num_input_tokens_seen": 46544640, "step": 2154, "time_per_iteration": 2.7602338790893555 }, { "auxiliary_loss_clip": 0.01149958, "auxiliary_loss_mlp": 0.0105188, "balance_loss_clip": 1.05262971, "balance_loss_mlp": 1.03099442, "epoch": 0.12956560949947393, "flos": 17565740974080.0, "grad_norm": 1.8080432584650143, "language_loss": 0.83717728, "learning_rate": 3.896908379886188e-06, "loss": 0.85919571, "num_input_tokens_seen": 46561395, "step": 2155, "time_per_iteration": 5.696707010269165 }, { "auxiliary_loss_clip": 0.01161999, "auxiliary_loss_mlp": 0.01056273, "balance_loss_clip": 1.05426383, "balance_loss_mlp": 1.03611445, "epoch": 0.1296257327521419, "flos": 20740711011840.0, "grad_norm": 2.4972858828122666, "language_loss": 0.76114857, "learning_rate": 3.896784917960055e-06, "loss": 0.78333133, "num_input_tokens_seen": 46579395, "step": 2156, "time_per_iteration": 2.6279313564300537 }, { "auxiliary_loss_clip": 0.01105089, "auxiliary_loss_mlp": 0.01056603, "balance_loss_clip": 1.0510118, "balance_loss_mlp": 1.03679013, "epoch": 0.12968585600480986, "flos": 16395735265920.0, "grad_norm": 1.6652476704410177, "language_loss": 0.86493659, "learning_rate": 3.896661384107648e-06, "loss": 0.88655347, "num_input_tokens_seen": 46597090, "step": 2157, "time_per_iteration": 4.4089202880859375 }, { "auxiliary_loss_clip": 0.01170107, "auxiliary_loss_mlp": 0.01055814, "balance_loss_clip": 1.05253935, "balance_loss_mlp": 1.0349642, "epoch": 0.12974597925747783, "flos": 28329533124480.0, "grad_norm": 2.5240136552338956, "language_loss": 0.80393612, "learning_rate": 3.896537778333651e-06, "loss": 0.8261953, "num_input_tokens_seen": 46617355, "step": 2158, "time_per_iteration": 2.702765703201294 }, { "auxiliary_loss_clip": 0.01177017, "auxiliary_loss_mlp": 0.01060365, "balance_loss_clip": 1.05905974, "balance_loss_mlp": 1.04050517, "epoch": 0.1298061025101458, "flos": 9683025782400.0, "grad_norm": 2.5307604694159607, "language_loss": 0.74881256, "learning_rate": 3.896414100642752e-06, "loss": 0.77118635, "num_input_tokens_seen": 46633130, "step": 2159, "time_per_iteration": 2.534163475036621 }, { "auxiliary_loss_clip": 0.01122909, "auxiliary_loss_mlp": 0.01058309, "balance_loss_clip": 1.04594469, "balance_loss_mlp": 1.03471708, "epoch": 0.12986622576281376, "flos": 27709535445120.0, "grad_norm": 1.954419432637739, "language_loss": 0.8259204, "learning_rate": 3.89629035103964e-06, "loss": 0.84773254, "num_input_tokens_seen": 46650575, "step": 2160, "time_per_iteration": 2.7358646392822266 }, { "auxiliary_loss_clip": 0.01154348, "auxiliary_loss_mlp": 0.01047243, "balance_loss_clip": 1.05873609, "balance_loss_mlp": 1.02732301, "epoch": 0.12992634901548175, "flos": 18802719590400.0, "grad_norm": 1.7252123805741888, "language_loss": 0.82310414, "learning_rate": 3.896166529529008e-06, "loss": 0.84512007, "num_input_tokens_seen": 46668780, "step": 2161, "time_per_iteration": 2.7029623985290527 }, { "auxiliary_loss_clip": 0.01145886, "auxiliary_loss_mlp": 0.01060381, "balance_loss_clip": 1.05145073, "balance_loss_mlp": 1.03911448, "epoch": 0.12998647226814972, "flos": 29127575543040.0, "grad_norm": 2.0780374068601253, "language_loss": 0.82668459, "learning_rate": 3.896042636115551e-06, "loss": 0.84874725, "num_input_tokens_seen": 46687550, "step": 2162, "time_per_iteration": 2.674825668334961 }, { "auxiliary_loss_clip": 0.0113921, "auxiliary_loss_mlp": 0.0105953, "balance_loss_clip": 1.05468941, "balance_loss_mlp": 1.03957474, "epoch": 0.13004659552081768, "flos": 19573686132480.0, "grad_norm": 3.928222506771022, "language_loss": 0.72579277, "learning_rate": 3.895918670803968e-06, "loss": 0.7477802, "num_input_tokens_seen": 46706730, "step": 2163, "time_per_iteration": 2.678394079208374 }, { "auxiliary_loss_clip": 0.01173873, "auxiliary_loss_mlp": 0.00778662, "balance_loss_clip": 1.05635965, "balance_loss_mlp": 0.99994016, "epoch": 0.13010671877348565, "flos": 22490709626880.0, "grad_norm": 2.0196348424542827, "language_loss": 0.81330699, "learning_rate": 3.895794633598958e-06, "loss": 0.83283234, "num_input_tokens_seen": 46724250, "step": 2164, "time_per_iteration": 2.6116931438446045 }, { "auxiliary_loss_clip": 0.01119834, "auxiliary_loss_mlp": 0.01050661, "balance_loss_clip": 1.04808033, "balance_loss_mlp": 1.03061032, "epoch": 0.1301668420261536, "flos": 23878226142720.0, "grad_norm": 2.274563635903502, "language_loss": 0.72262049, "learning_rate": 3.8956705245052256e-06, "loss": 0.74432552, "num_input_tokens_seen": 46744105, "step": 2165, "time_per_iteration": 2.7646515369415283 }, { "auxiliary_loss_clip": 0.01109832, "auxiliary_loss_mlp": 0.01048351, "balance_loss_clip": 1.05059505, "balance_loss_mlp": 1.02707219, "epoch": 0.13022696527882158, "flos": 23150065633920.0, "grad_norm": 2.8383873988269217, "language_loss": 0.74749964, "learning_rate": 3.8955463435274765e-06, "loss": 0.76908153, "num_input_tokens_seen": 46764250, "step": 2166, "time_per_iteration": 2.7939398288726807 }, { "auxiliary_loss_clip": 0.01170298, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.05364752, "balance_loss_mlp": 1.02827251, "epoch": 0.13028708853148954, "flos": 26908548111360.0, "grad_norm": 1.5379857106114436, "language_loss": 0.83098066, "learning_rate": 3.895422090670421e-06, "loss": 0.85316396, "num_input_tokens_seen": 46786865, "step": 2167, "time_per_iteration": 2.700505495071411 }, { "auxiliary_loss_clip": 0.01108628, "auxiliary_loss_mlp": 0.01059921, "balance_loss_clip": 1.04567361, "balance_loss_mlp": 1.03841531, "epoch": 0.13034721178415754, "flos": 21251468453760.0, "grad_norm": 1.6054044551173634, "language_loss": 0.83578718, "learning_rate": 3.89529776593877e-06, "loss": 0.85747266, "num_input_tokens_seen": 46807030, "step": 2168, "time_per_iteration": 2.839285135269165 }, { "auxiliary_loss_clip": 0.01079188, "auxiliary_loss_mlp": 0.01063413, "balance_loss_clip": 1.04247975, "balance_loss_mlp": 1.03861713, "epoch": 0.1304073350368255, "flos": 18767239931520.0, "grad_norm": 1.950315007602454, "language_loss": 0.79910588, "learning_rate": 3.8951733693372375e-06, "loss": 0.8205319, "num_input_tokens_seen": 46826280, "step": 2169, "time_per_iteration": 2.8150076866149902 }, { "auxiliary_loss_clip": 0.01174566, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.05822575, "balance_loss_mlp": 1.02339983, "epoch": 0.13046745828949347, "flos": 28364653647360.0, "grad_norm": 2.4117618540057766, "language_loss": 0.66804767, "learning_rate": 3.8950489008705406e-06, "loss": 0.69024229, "num_input_tokens_seen": 46846505, "step": 2170, "time_per_iteration": 2.722769021987915 }, { "auxiliary_loss_clip": 0.0114216, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05424142, "balance_loss_mlp": 1.02637053, "epoch": 0.13052758154216143, "flos": 29605044055680.0, "grad_norm": 1.9089846415842238, "language_loss": 0.66768706, "learning_rate": 3.8949243605434e-06, "loss": 0.68957549, "num_input_tokens_seen": 46867380, "step": 2171, "time_per_iteration": 2.7474682331085205 }, { "auxiliary_loss_clip": 0.01157431, "auxiliary_loss_mlp": 0.01049079, "balance_loss_clip": 1.05283058, "balance_loss_mlp": 1.02701378, "epoch": 0.1305877047948294, "flos": 19390864884480.0, "grad_norm": 2.103440896006443, "language_loss": 0.72157478, "learning_rate": 3.894799748360537e-06, "loss": 0.74363995, "num_input_tokens_seen": 46886810, "step": 2172, "time_per_iteration": 2.8062691688537598 }, { "auxiliary_loss_clip": 0.01131178, "auxiliary_loss_mlp": 0.01045812, "balance_loss_clip": 1.05676126, "balance_loss_mlp": 1.0248909, "epoch": 0.13064782804749736, "flos": 16873527000960.0, "grad_norm": 1.8662964619330822, "language_loss": 0.75331408, "learning_rate": 3.894675064326678e-06, "loss": 0.77508402, "num_input_tokens_seen": 46905620, "step": 2173, "time_per_iteration": 2.749630928039551 }, { "auxiliary_loss_clip": 0.01132129, "auxiliary_loss_mlp": 0.01056024, "balance_loss_clip": 1.05241716, "balance_loss_mlp": 1.03388715, "epoch": 0.13070795130016533, "flos": 24499085748480.0, "grad_norm": 2.8034072456055426, "language_loss": 0.70175481, "learning_rate": 3.894550308446551e-06, "loss": 0.72363639, "num_input_tokens_seen": 46925120, "step": 2174, "time_per_iteration": 2.723314046859741 }, { "auxiliary_loss_clip": 0.01047643, "auxiliary_loss_mlp": 0.01015006, "balance_loss_clip": 1.02629197, "balance_loss_mlp": 1.01260972, "epoch": 0.13076807455283332, "flos": 71054505953280.0, "grad_norm": 0.7998489021914615, "language_loss": 0.59026134, "learning_rate": 3.894425480724886e-06, "loss": 0.61088777, "num_input_tokens_seen": 46988195, "step": 2175, "time_per_iteration": 3.318049192428589 }, { "auxiliary_loss_clip": 0.01159762, "auxiliary_loss_mlp": 0.01053929, "balance_loss_clip": 1.05441868, "balance_loss_mlp": 1.03342521, "epoch": 0.13082819780550128, "flos": 20264499475200.0, "grad_norm": 2.2309284705459707, "language_loss": 0.80365628, "learning_rate": 3.894300581166417e-06, "loss": 0.82579315, "num_input_tokens_seen": 47004720, "step": 2176, "time_per_iteration": 2.631732702255249 }, { "auxiliary_loss_clip": 0.01169648, "auxiliary_loss_mlp": 0.01047517, "balance_loss_clip": 1.05513525, "balance_loss_mlp": 1.02529645, "epoch": 0.13088832105816925, "flos": 34203441231360.0, "grad_norm": 1.6906214681317566, "language_loss": 0.74661696, "learning_rate": 3.894175609775881e-06, "loss": 0.76878858, "num_input_tokens_seen": 47024255, "step": 2177, "time_per_iteration": 2.701422691345215 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.0105144, "balance_loss_clip": 1.051373, "balance_loss_mlp": 1.02905297, "epoch": 0.13094844431083721, "flos": 17894970057600.0, "grad_norm": 1.8043513019060269, "language_loss": 0.82266748, "learning_rate": 3.894050566558015e-06, "loss": 0.84449303, "num_input_tokens_seen": 47042465, "step": 2178, "time_per_iteration": 2.6934497356414795 }, { "auxiliary_loss_clip": 0.01170524, "auxiliary_loss_mlp": 0.01047895, "balance_loss_clip": 1.05729508, "balance_loss_mlp": 1.02705729, "epoch": 0.13100856756350518, "flos": 17311313963520.0, "grad_norm": 2.9251611149508276, "language_loss": 0.74291968, "learning_rate": 3.893925451517562e-06, "loss": 0.76510382, "num_input_tokens_seen": 47060370, "step": 2179, "time_per_iteration": 2.6111502647399902 }, { "auxiliary_loss_clip": 0.01128297, "auxiliary_loss_mlp": 0.01052407, "balance_loss_clip": 1.04917574, "balance_loss_mlp": 1.03184354, "epoch": 0.13106869081617314, "flos": 22200551562240.0, "grad_norm": 1.9805514150688242, "language_loss": 0.84366202, "learning_rate": 3.893800264659266e-06, "loss": 0.8654691, "num_input_tokens_seen": 47081415, "step": 2180, "time_per_iteration": 2.731229543685913 }, { "auxiliary_loss_clip": 0.01162028, "auxiliary_loss_mlp": 0.0105845, "balance_loss_clip": 1.05875921, "balance_loss_mlp": 1.03757644, "epoch": 0.13112881406884114, "flos": 21763123735680.0, "grad_norm": 1.8389866248015785, "language_loss": 0.89840436, "learning_rate": 3.8936750059878746e-06, "loss": 0.92060918, "num_input_tokens_seen": 47099860, "step": 2181, "time_per_iteration": 2.643890380859375 }, { "auxiliary_loss_clip": 0.01153771, "auxiliary_loss_mlp": 0.01051982, "balance_loss_clip": 1.05222976, "balance_loss_mlp": 1.03126323, "epoch": 0.1311889373215091, "flos": 23331091201920.0, "grad_norm": 2.117586475019142, "language_loss": 0.68813586, "learning_rate": 3.893549675508137e-06, "loss": 0.7101934, "num_input_tokens_seen": 47118540, "step": 2182, "time_per_iteration": 2.6198863983154297 }, { "auxiliary_loss_clip": 0.01123039, "auxiliary_loss_mlp": 0.01051411, "balance_loss_clip": 1.0502702, "balance_loss_mlp": 1.0292381, "epoch": 0.13124906057417707, "flos": 21467363149440.0, "grad_norm": 1.787500136217105, "language_loss": 0.78694725, "learning_rate": 3.893424273224806e-06, "loss": 0.8086918, "num_input_tokens_seen": 47136710, "step": 2183, "time_per_iteration": 2.715517520904541 }, { "auxiliary_loss_clip": 0.01169106, "auxiliary_loss_mlp": 0.01047098, "balance_loss_clip": 1.05452895, "balance_loss_mlp": 1.02586675, "epoch": 0.13130918382684503, "flos": 23255319461760.0, "grad_norm": 26.753588494231124, "language_loss": 0.85792655, "learning_rate": 3.893298799142636e-06, "loss": 0.88008863, "num_input_tokens_seen": 47157155, "step": 2184, "time_per_iteration": 2.632539987564087 }, { "auxiliary_loss_clip": 0.01138714, "auxiliary_loss_mlp": 0.01054657, "balance_loss_clip": 1.05349112, "balance_loss_mlp": 1.03230524, "epoch": 0.131369307079513, "flos": 20850274471680.0, "grad_norm": 2.50466124454056, "language_loss": 0.82703435, "learning_rate": 3.893173253266387e-06, "loss": 0.84896809, "num_input_tokens_seen": 47176820, "step": 2185, "time_per_iteration": 2.6809136867523193 }, { "auxiliary_loss_clip": 0.01144077, "auxiliary_loss_mlp": 0.01054121, "balance_loss_clip": 1.05262399, "balance_loss_mlp": 1.03236496, "epoch": 0.13142943033218096, "flos": 17858341163520.0, "grad_norm": 1.8949462712827352, "language_loss": 0.72956109, "learning_rate": 3.893047635600818e-06, "loss": 0.75154305, "num_input_tokens_seen": 47195855, "step": 2186, "time_per_iteration": 2.628096342086792 }, { "auxiliary_loss_clip": 0.01157778, "auxiliary_loss_mlp": 0.01050695, "balance_loss_clip": 1.05436552, "balance_loss_mlp": 1.02783096, "epoch": 0.13148955358484893, "flos": 20996035862400.0, "grad_norm": 1.9822444068613732, "language_loss": 0.80363685, "learning_rate": 3.892921946150693e-06, "loss": 0.82572162, "num_input_tokens_seen": 47214535, "step": 2187, "time_per_iteration": 2.762223720550537 }, { "auxiliary_loss_clip": 0.01027324, "auxiliary_loss_mlp": 0.0101023, "balance_loss_clip": 1.02364707, "balance_loss_mlp": 1.00792885, "epoch": 0.13154967683751692, "flos": 70172467580160.0, "grad_norm": 0.8471850380496847, "language_loss": 0.59082437, "learning_rate": 3.892796184920778e-06, "loss": 0.61119986, "num_input_tokens_seen": 47270300, "step": 2188, "time_per_iteration": 3.302457571029663 }, { "auxiliary_loss_clip": 0.01095126, "auxiliary_loss_mlp": 0.01059346, "balance_loss_clip": 1.04827487, "balance_loss_mlp": 1.03676724, "epoch": 0.1316098000901849, "flos": 20376145923840.0, "grad_norm": 1.7340345041340466, "language_loss": 0.74211109, "learning_rate": 3.892670351915842e-06, "loss": 0.76365584, "num_input_tokens_seen": 47290720, "step": 2189, "time_per_iteration": 2.7990496158599854 }, { "auxiliary_loss_clip": 0.01160124, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.05551052, "balance_loss_mlp": 1.02799821, "epoch": 0.13166992334285285, "flos": 23221132692480.0, "grad_norm": 1.8160574809616576, "language_loss": 0.73152113, "learning_rate": 3.892544447140657e-06, "loss": 0.75361335, "num_input_tokens_seen": 47311820, "step": 2190, "time_per_iteration": 2.6485326290130615 }, { "auxiliary_loss_clip": 0.01160351, "auxiliary_loss_mlp": 0.01058461, "balance_loss_clip": 1.05671644, "balance_loss_mlp": 1.03811169, "epoch": 0.13173004659552082, "flos": 23330947547520.0, "grad_norm": 1.8825588242208007, "language_loss": 0.74617779, "learning_rate": 3.892418470599996e-06, "loss": 0.76836598, "num_input_tokens_seen": 47331605, "step": 2191, "time_per_iteration": 2.644484281539917 }, { "auxiliary_loss_clip": 0.0112783, "auxiliary_loss_mlp": 0.01054712, "balance_loss_clip": 1.05129039, "balance_loss_mlp": 1.03356445, "epoch": 0.13179016984818878, "flos": 21251504367360.0, "grad_norm": 1.8823393822145031, "language_loss": 0.79093283, "learning_rate": 3.892292422298637e-06, "loss": 0.81275827, "num_input_tokens_seen": 47350455, "step": 2192, "time_per_iteration": 2.735225200653076 }, { "auxiliary_loss_clip": 0.0111282, "auxiliary_loss_mlp": 0.01051113, "balance_loss_clip": 1.04457211, "balance_loss_mlp": 1.02936912, "epoch": 0.13185029310085675, "flos": 17778690754560.0, "grad_norm": 1.7242105632860862, "language_loss": 0.85350716, "learning_rate": 3.892166302241361e-06, "loss": 0.87514639, "num_input_tokens_seen": 47368225, "step": 2193, "time_per_iteration": 4.262877941131592 }, { "auxiliary_loss_clip": 0.0104173, "auxiliary_loss_mlp": 0.01015651, "balance_loss_clip": 1.02609122, "balance_loss_mlp": 1.01280212, "epoch": 0.1319104163535247, "flos": 69851785933440.0, "grad_norm": 0.7746813180799224, "language_loss": 0.54112649, "learning_rate": 3.8920401104329475e-06, "loss": 0.56170022, "num_input_tokens_seen": 47427125, "step": 2194, "time_per_iteration": 6.223008394241333 }, { "auxiliary_loss_clip": 0.01168022, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.05420566, "balance_loss_mlp": 1.02828002, "epoch": 0.1319705396061927, "flos": 25193095401600.0, "grad_norm": 2.1079865649821925, "language_loss": 0.72433972, "learning_rate": 3.891913846878185e-06, "loss": 0.74650574, "num_input_tokens_seen": 47450275, "step": 2195, "time_per_iteration": 2.6357345581054688 }, { "auxiliary_loss_clip": 0.01136503, "auxiliary_loss_mlp": 0.00778731, "balance_loss_clip": 1.05176425, "balance_loss_mlp": 0.99996454, "epoch": 0.13203066285886067, "flos": 20740459616640.0, "grad_norm": 1.5737174748369949, "language_loss": 0.78126895, "learning_rate": 3.891787511581859e-06, "loss": 0.8004213, "num_input_tokens_seen": 47469155, "step": 2196, "time_per_iteration": 2.7118594646453857 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.010526, "balance_loss_clip": 1.05453539, "balance_loss_mlp": 1.03210831, "epoch": 0.13209078611152864, "flos": 22054395121920.0, "grad_norm": 1.9385650447291836, "language_loss": 0.74632496, "learning_rate": 3.89166110454876e-06, "loss": 0.76847541, "num_input_tokens_seen": 47488405, "step": 2197, "time_per_iteration": 4.270530939102173 }, { "auxiliary_loss_clip": 0.01173786, "auxiliary_loss_mlp": 0.01050846, "balance_loss_clip": 1.05440533, "balance_loss_mlp": 1.02947164, "epoch": 0.1321509093641966, "flos": 16284950743680.0, "grad_norm": 1.785688190112577, "language_loss": 0.79566747, "learning_rate": 3.891534625783685e-06, "loss": 0.81791383, "num_input_tokens_seen": 47505650, "step": 2198, "time_per_iteration": 2.6145474910736084 }, { "auxiliary_loss_clip": 0.01170264, "auxiliary_loss_mlp": 0.01057159, "balance_loss_clip": 1.05536175, "balance_loss_mlp": 1.03647637, "epoch": 0.13221103261686457, "flos": 16983018633600.0, "grad_norm": 2.56313218775589, "language_loss": 0.82932216, "learning_rate": 3.891408075291425e-06, "loss": 0.85159647, "num_input_tokens_seen": 47521540, "step": 2199, "time_per_iteration": 2.5715503692626953 }, { "auxiliary_loss_clip": 0.01122554, "auxiliary_loss_mlp": 0.01052148, "balance_loss_clip": 1.05047798, "balance_loss_mlp": 1.03045249, "epoch": 0.13227115586953253, "flos": 34233605677440.0, "grad_norm": 1.8710902505917797, "language_loss": 0.69579422, "learning_rate": 3.8912814530767826e-06, "loss": 0.71754128, "num_input_tokens_seen": 47543625, "step": 2200, "time_per_iteration": 2.8001365661621094 }, { "auxiliary_loss_clip": 0.01167798, "auxiliary_loss_mlp": 0.01058155, "balance_loss_clip": 1.05345917, "balance_loss_mlp": 1.03618431, "epoch": 0.13233127912220052, "flos": 20704656735360.0, "grad_norm": 1.647659287704997, "language_loss": 0.84624702, "learning_rate": 3.891154759144557e-06, "loss": 0.86850655, "num_input_tokens_seen": 47563740, "step": 2201, "time_per_iteration": 2.6485981941223145 }, { "auxiliary_loss_clip": 0.0117188, "auxiliary_loss_mlp": 0.01055627, "balance_loss_clip": 1.05427861, "balance_loss_mlp": 1.03431273, "epoch": 0.1323914023748685, "flos": 25805048434560.0, "grad_norm": 1.7446392584198542, "language_loss": 0.87088037, "learning_rate": 3.891027993499554e-06, "loss": 0.8931554, "num_input_tokens_seen": 47582655, "step": 2202, "time_per_iteration": 2.5921456813812256 }, { "auxiliary_loss_clip": 0.01139991, "auxiliary_loss_mlp": 0.01053413, "balance_loss_clip": 1.05299544, "balance_loss_mlp": 1.03267026, "epoch": 0.13245152562753645, "flos": 21251540280960.0, "grad_norm": 2.405254380671628, "language_loss": 0.72801507, "learning_rate": 3.89090115614658e-06, "loss": 0.7499491, "num_input_tokens_seen": 47600875, "step": 2203, "time_per_iteration": 2.6257405281066895 }, { "auxiliary_loss_clip": 0.01124508, "auxiliary_loss_mlp": 0.0105959, "balance_loss_clip": 1.05080879, "balance_loss_mlp": 1.03916979, "epoch": 0.13251164888020442, "flos": 26610955931520.0, "grad_norm": 2.044348475010678, "language_loss": 0.73170948, "learning_rate": 3.890774247090444e-06, "loss": 0.75355047, "num_input_tokens_seen": 47619250, "step": 2204, "time_per_iteration": 2.753830909729004 }, { "auxiliary_loss_clip": 0.01160826, "auxiliary_loss_mlp": 0.01054406, "balance_loss_clip": 1.05474758, "balance_loss_mlp": 1.03225708, "epoch": 0.13257177213287238, "flos": 29826541272960.0, "grad_norm": 2.094172729236468, "language_loss": 0.78377104, "learning_rate": 3.89064726633596e-06, "loss": 0.80592328, "num_input_tokens_seen": 47639445, "step": 2205, "time_per_iteration": 2.730682134628296 }, { "auxiliary_loss_clip": 0.01125154, "auxiliary_loss_mlp": 0.01048818, "balance_loss_clip": 1.04975629, "balance_loss_mlp": 1.02782559, "epoch": 0.13263189538554035, "flos": 21288456483840.0, "grad_norm": 1.8609089802832188, "language_loss": 0.78638101, "learning_rate": 3.890520213887941e-06, "loss": 0.80812073, "num_input_tokens_seen": 47658740, "step": 2206, "time_per_iteration": 2.691962718963623 }, { "auxiliary_loss_clip": 0.01124965, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.04958403, "balance_loss_mlp": 1.02649069, "epoch": 0.13269201863820831, "flos": 16874101618560.0, "grad_norm": 2.2777192787220066, "language_loss": 0.74672282, "learning_rate": 3.890393089751208e-06, "loss": 0.76843208, "num_input_tokens_seen": 47676880, "step": 2207, "time_per_iteration": 2.7062454223632812 }, { "auxiliary_loss_clip": 0.01143208, "auxiliary_loss_mlp": 0.01047941, "balance_loss_clip": 1.05257845, "balance_loss_mlp": 1.02672219, "epoch": 0.1327521418908763, "flos": 23768914078080.0, "grad_norm": 1.692212064021935, "language_loss": 0.84061795, "learning_rate": 3.890265893930578e-06, "loss": 0.8625294, "num_input_tokens_seen": 47696635, "step": 2208, "time_per_iteration": 2.687717914581299 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.0105274, "balance_loss_clip": 1.05847478, "balance_loss_mlp": 1.03411973, "epoch": 0.13281226514354427, "flos": 26505594362880.0, "grad_norm": 1.7032258459750478, "language_loss": 0.85587811, "learning_rate": 3.890138626430876e-06, "loss": 0.8779313, "num_input_tokens_seen": 47717760, "step": 2209, "time_per_iteration": 2.646015167236328 }, { "auxiliary_loss_clip": 0.01138084, "auxiliary_loss_mlp": 0.00778828, "balance_loss_clip": 1.05316806, "balance_loss_mlp": 1.00002563, "epoch": 0.13287238839621224, "flos": 24498762526080.0, "grad_norm": 2.237247968175465, "language_loss": 0.81797457, "learning_rate": 3.890011287256929e-06, "loss": 0.83714366, "num_input_tokens_seen": 47737685, "step": 2210, "time_per_iteration": 2.676262378692627 }, { "auxiliary_loss_clip": 0.0104445, "auxiliary_loss_mlp": 0.00757817, "balance_loss_clip": 1.03801322, "balance_loss_mlp": 1.00007725, "epoch": 0.1329325116488802, "flos": 67694344369920.0, "grad_norm": 0.7515252652740232, "language_loss": 0.58031559, "learning_rate": 3.889883876413563e-06, "loss": 0.59833825, "num_input_tokens_seen": 47802415, "step": 2211, "time_per_iteration": 3.3914146423339844 }, { "auxiliary_loss_clip": 0.01064712, "auxiliary_loss_mlp": 0.01012978, "balance_loss_clip": 1.04205871, "balance_loss_mlp": 1.01083231, "epoch": 0.13299263490154817, "flos": 72261894741120.0, "grad_norm": 0.8012428422082742, "language_loss": 0.55299425, "learning_rate": 3.889756393905611e-06, "loss": 0.57377112, "num_input_tokens_seen": 47871485, "step": 2212, "time_per_iteration": 3.2910914421081543 }, { "auxiliary_loss_clip": 0.01132433, "auxiliary_loss_mlp": 0.01054299, "balance_loss_clip": 1.05107963, "balance_loss_mlp": 1.0331986, "epoch": 0.13305275815421613, "flos": 17931275729280.0, "grad_norm": 2.484635795733661, "language_loss": 0.74228692, "learning_rate": 3.889628839737908e-06, "loss": 0.7641542, "num_input_tokens_seen": 47888315, "step": 2213, "time_per_iteration": 2.755777597427368 }, { "auxiliary_loss_clip": 0.01114671, "auxiliary_loss_mlp": 0.01051459, "balance_loss_clip": 1.04682255, "balance_loss_mlp": 1.03231359, "epoch": 0.13311288140688413, "flos": 22340889999360.0, "grad_norm": 1.850943077435394, "language_loss": 0.79699469, "learning_rate": 3.889501213915291e-06, "loss": 0.81865597, "num_input_tokens_seen": 47906600, "step": 2214, "time_per_iteration": 2.702603340148926 }, { "auxiliary_loss_clip": 0.01143494, "auxiliary_loss_mlp": 0.01052411, "balance_loss_clip": 1.05555344, "balance_loss_mlp": 1.03171659, "epoch": 0.1331730046595521, "flos": 31868888682240.0, "grad_norm": 1.8782588426913054, "language_loss": 0.69341159, "learning_rate": 3.889373516442597e-06, "loss": 0.71537066, "num_input_tokens_seen": 47927630, "step": 2215, "time_per_iteration": 2.769237518310547 }, { "auxiliary_loss_clip": 0.01167307, "auxiliary_loss_mlp": 0.01051423, "balance_loss_clip": 1.06098068, "balance_loss_mlp": 1.03132463, "epoch": 0.13323312791222006, "flos": 22566589107840.0, "grad_norm": 1.884566493826098, "language_loss": 0.81262428, "learning_rate": 3.889245747324671e-06, "loss": 0.83481157, "num_input_tokens_seen": 47947935, "step": 2216, "time_per_iteration": 2.7427120208740234 }, { "auxiliary_loss_clip": 0.01163681, "auxiliary_loss_mlp": 0.01056545, "balance_loss_clip": 1.06198788, "balance_loss_mlp": 1.03631544, "epoch": 0.13329325116488802, "flos": 15085319293440.0, "grad_norm": 3.783334161704178, "language_loss": 0.87299347, "learning_rate": 3.889117906566356e-06, "loss": 0.89519572, "num_input_tokens_seen": 47965515, "step": 2217, "time_per_iteration": 2.709527015686035 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01056364, "balance_loss_clip": 1.06054497, "balance_loss_mlp": 1.0343225, "epoch": 0.133353374417556, "flos": 27453671890560.0, "grad_norm": 4.412823416345162, "language_loss": 0.73105222, "learning_rate": 3.888989994172501e-06, "loss": 0.75314289, "num_input_tokens_seen": 47985675, "step": 2218, "time_per_iteration": 2.697733163833618 }, { "auxiliary_loss_clip": 0.01129106, "auxiliary_loss_mlp": 0.01051151, "balance_loss_clip": 1.0535965, "balance_loss_mlp": 1.02993202, "epoch": 0.13341349767022395, "flos": 24094695456000.0, "grad_norm": 1.7935349411013712, "language_loss": 0.86911142, "learning_rate": 3.8888620101479565e-06, "loss": 0.89091408, "num_input_tokens_seen": 48004985, "step": 2219, "time_per_iteration": 2.7641642093658447 }, { "auxiliary_loss_clip": 0.01141172, "auxiliary_loss_mlp": 0.0106326, "balance_loss_clip": 1.05751657, "balance_loss_mlp": 1.04406714, "epoch": 0.13347362092289192, "flos": 24133335511680.0, "grad_norm": 1.8604531362737113, "language_loss": 0.77244747, "learning_rate": 3.888733954497574e-06, "loss": 0.79449183, "num_input_tokens_seen": 48024965, "step": 2220, "time_per_iteration": 2.732160806655884 }, { "auxiliary_loss_clip": 0.01146487, "auxiliary_loss_mlp": 0.01048662, "balance_loss_clip": 1.05399704, "balance_loss_mlp": 1.03001785, "epoch": 0.1335337441755599, "flos": 18436538390400.0, "grad_norm": 2.3004113327688955, "language_loss": 0.79467338, "learning_rate": 3.888605827226212e-06, "loss": 0.81662482, "num_input_tokens_seen": 48040890, "step": 2221, "time_per_iteration": 2.685612440109253 }, { "auxiliary_loss_clip": 0.01062777, "auxiliary_loss_mlp": 0.01021711, "balance_loss_clip": 1.03293467, "balance_loss_mlp": 1.0194701, "epoch": 0.13359386742822787, "flos": 50611997652480.0, "grad_norm": 0.9755051104211709, "language_loss": 0.68938822, "learning_rate": 3.8884776283387275e-06, "loss": 0.71023309, "num_input_tokens_seen": 48091855, "step": 2222, "time_per_iteration": 3.0336835384368896 }, { "auxiliary_loss_clip": 0.01130152, "auxiliary_loss_mlp": 0.01058574, "balance_loss_clip": 1.05544209, "balance_loss_mlp": 1.03940475, "epoch": 0.13365399068089584, "flos": 22778569221120.0, "grad_norm": 2.1295993667823416, "language_loss": 0.67389107, "learning_rate": 3.888349357839982e-06, "loss": 0.69577825, "num_input_tokens_seen": 48111350, "step": 2223, "time_per_iteration": 2.7134146690368652 }, { "auxiliary_loss_clip": 0.01161386, "auxiliary_loss_mlp": 0.01060571, "balance_loss_clip": 1.05785358, "balance_loss_mlp": 1.04010296, "epoch": 0.1337141139335638, "flos": 12531603911040.0, "grad_norm": 4.277142483609355, "language_loss": 0.82505226, "learning_rate": 3.88822101573484e-06, "loss": 0.84727186, "num_input_tokens_seen": 48129840, "step": 2224, "time_per_iteration": 2.608372926712036 }, { "auxiliary_loss_clip": 0.01173412, "auxiliary_loss_mlp": 0.01050086, "balance_loss_clip": 1.0573926, "balance_loss_mlp": 1.0290221, "epoch": 0.13377423718623177, "flos": 23038957889280.0, "grad_norm": 1.9890294619132924, "language_loss": 0.66270435, "learning_rate": 3.888092602028167e-06, "loss": 0.68493932, "num_input_tokens_seen": 48149240, "step": 2225, "time_per_iteration": 2.6304945945739746 }, { "auxiliary_loss_clip": 0.01153626, "auxiliary_loss_mlp": 0.01051637, "balance_loss_clip": 1.05233717, "balance_loss_mlp": 1.03180075, "epoch": 0.13383436043889974, "flos": 16216397637120.0, "grad_norm": 2.2915668787246997, "language_loss": 0.89469218, "learning_rate": 3.887964116724835e-06, "loss": 0.91674477, "num_input_tokens_seen": 48166330, "step": 2226, "time_per_iteration": 2.6002328395843506 }, { "auxiliary_loss_clip": 0.01150395, "auxiliary_loss_mlp": 0.01054296, "balance_loss_clip": 1.0549798, "balance_loss_mlp": 1.03423262, "epoch": 0.1338944836915677, "flos": 24279671520000.0, "grad_norm": 1.7271512115821777, "language_loss": 0.73209751, "learning_rate": 3.887835559829712e-06, "loss": 0.75414443, "num_input_tokens_seen": 48187600, "step": 2227, "time_per_iteration": 2.706193447113037 }, { "auxiliary_loss_clip": 0.01157707, "auxiliary_loss_mlp": 0.01047387, "balance_loss_clip": 1.05518484, "balance_loss_mlp": 1.02683568, "epoch": 0.1339546069442357, "flos": 17598742594560.0, "grad_norm": 2.848999829625599, "language_loss": 0.85160232, "learning_rate": 3.8877069313476764e-06, "loss": 0.87365323, "num_input_tokens_seen": 48204400, "step": 2228, "time_per_iteration": 2.689209222793579 }, { "auxiliary_loss_clip": 0.01132803, "auxiliary_loss_mlp": 0.01052829, "balance_loss_clip": 1.04935181, "balance_loss_mlp": 1.03126431, "epoch": 0.13401473019690366, "flos": 18990065952000.0, "grad_norm": 1.909679794697233, "language_loss": 0.81460214, "learning_rate": 3.8875782312836054e-06, "loss": 0.83645844, "num_input_tokens_seen": 48222180, "step": 2229, "time_per_iteration": 2.6380228996276855 }, { "auxiliary_loss_clip": 0.0110557, "auxiliary_loss_mlp": 0.01052684, "balance_loss_clip": 1.04774594, "balance_loss_mlp": 1.03233457, "epoch": 0.13407485344957162, "flos": 26943812288640.0, "grad_norm": 1.7464076089691416, "language_loss": 0.73822236, "learning_rate": 3.887449459642378e-06, "loss": 0.7598049, "num_input_tokens_seen": 48243245, "step": 2230, "time_per_iteration": 2.7332983016967773 }, { "auxiliary_loss_clip": 0.01125236, "auxiliary_loss_mlp": 0.01058977, "balance_loss_clip": 1.05213606, "balance_loss_mlp": 1.03890252, "epoch": 0.1341349767022396, "flos": 20339373375360.0, "grad_norm": 1.6827882777998602, "language_loss": 0.80133682, "learning_rate": 3.8873206164288785e-06, "loss": 0.82317901, "num_input_tokens_seen": 48262600, "step": 2231, "time_per_iteration": 2.6759045124053955 }, { "auxiliary_loss_clip": 0.01111387, "auxiliary_loss_mlp": 0.01057582, "balance_loss_clip": 1.04997492, "balance_loss_mlp": 1.03499198, "epoch": 0.13419509995490755, "flos": 29862020931840.0, "grad_norm": 1.746756846769887, "language_loss": 0.72152746, "learning_rate": 3.887191701647992e-06, "loss": 0.74321723, "num_input_tokens_seen": 48285075, "step": 2232, "time_per_iteration": 4.391890048980713 }, { "auxiliary_loss_clip": 0.0112104, "auxiliary_loss_mlp": 0.01051805, "balance_loss_clip": 1.0481019, "balance_loss_mlp": 1.03039551, "epoch": 0.13425522320757552, "flos": 26942986275840.0, "grad_norm": 2.4719586176391686, "language_loss": 0.65116024, "learning_rate": 3.8870627153046066e-06, "loss": 0.67288864, "num_input_tokens_seen": 48301285, "step": 2233, "time_per_iteration": 4.234508037567139 }, { "auxiliary_loss_clip": 0.01167005, "auxiliary_loss_mlp": 0.0104461, "balance_loss_clip": 1.05189967, "balance_loss_mlp": 1.02421367, "epoch": 0.1343153464602435, "flos": 15777281871360.0, "grad_norm": 2.4864430088666656, "language_loss": 0.80878961, "learning_rate": 3.886933657403615e-06, "loss": 0.8309058, "num_input_tokens_seen": 48317835, "step": 2234, "time_per_iteration": 4.175215005874634 }, { "auxiliary_loss_clip": 0.01140761, "auxiliary_loss_mlp": 0.01054039, "balance_loss_clip": 1.05052733, "balance_loss_mlp": 1.03268874, "epoch": 0.13437546971291148, "flos": 24314756129280.0, "grad_norm": 2.0569321713284827, "language_loss": 0.82114553, "learning_rate": 3.886804527949909e-06, "loss": 0.84309351, "num_input_tokens_seen": 48335670, "step": 2235, "time_per_iteration": 2.6588025093078613 }, { "auxiliary_loss_clip": 0.01149093, "auxiliary_loss_mlp": 0.01052015, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02983022, "epoch": 0.13443559296557944, "flos": 26650673395200.0, "grad_norm": 1.6363146905087136, "language_loss": 0.86092007, "learning_rate": 3.8866753269483864e-06, "loss": 0.88293117, "num_input_tokens_seen": 48357805, "step": 2236, "time_per_iteration": 4.349383592605591 }, { "auxiliary_loss_clip": 0.01166751, "auxiliary_loss_mlp": 0.01047925, "balance_loss_clip": 1.05288053, "balance_loss_mlp": 1.02724242, "epoch": 0.1344957162182474, "flos": 21796197183360.0, "grad_norm": 1.82135056053112, "language_loss": 0.77258497, "learning_rate": 3.886546054403946e-06, "loss": 0.79473174, "num_input_tokens_seen": 48377845, "step": 2237, "time_per_iteration": 2.6398766040802 }, { "auxiliary_loss_clip": 0.01145425, "auxiliary_loss_mlp": 0.01051006, "balance_loss_clip": 1.05016851, "balance_loss_mlp": 1.02919102, "epoch": 0.13455583947091537, "flos": 19865568049920.0, "grad_norm": 2.440947698046141, "language_loss": 0.78772336, "learning_rate": 3.886416710321491e-06, "loss": 0.80968761, "num_input_tokens_seen": 48394735, "step": 2238, "time_per_iteration": 2.6556923389434814 }, { "auxiliary_loss_clip": 0.01141594, "auxiliary_loss_mlp": 0.01050085, "balance_loss_clip": 1.05123293, "balance_loss_mlp": 1.02878201, "epoch": 0.13461596272358334, "flos": 30846835094400.0, "grad_norm": 2.9136729194949735, "language_loss": 0.68486369, "learning_rate": 3.886287294705924e-06, "loss": 0.70678043, "num_input_tokens_seen": 48414200, "step": 2239, "time_per_iteration": 2.6778814792633057 }, { "auxiliary_loss_clip": 0.01147129, "auxiliary_loss_mlp": 0.01052633, "balance_loss_clip": 1.0515976, "balance_loss_mlp": 1.03197384, "epoch": 0.1346760859762513, "flos": 12494436312960.0, "grad_norm": 2.3763106012672925, "language_loss": 0.81277847, "learning_rate": 3.8861578075621555e-06, "loss": 0.8347761, "num_input_tokens_seen": 48431065, "step": 2240, "time_per_iteration": 2.5920939445495605 }, { "auxiliary_loss_clip": 0.01107793, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.04459488, "balance_loss_mlp": 1.02884459, "epoch": 0.1347362092289193, "flos": 21836022387840.0, "grad_norm": 1.7269080191231387, "language_loss": 0.77183759, "learning_rate": 3.886028248895093e-06, "loss": 0.79341465, "num_input_tokens_seen": 48450335, "step": 2241, "time_per_iteration": 2.7224419116973877 }, { "auxiliary_loss_clip": 0.0116331, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.05439126, "balance_loss_mlp": 1.02324009, "epoch": 0.13479633248158726, "flos": 23509459163520.0, "grad_norm": 2.0305903786470743, "language_loss": 0.83062387, "learning_rate": 3.88589861870965e-06, "loss": 0.85267115, "num_input_tokens_seen": 48468555, "step": 2242, "time_per_iteration": 2.5794169902801514 }, { "auxiliary_loss_clip": 0.01170048, "auxiliary_loss_mlp": 0.01056609, "balance_loss_clip": 1.05504107, "balance_loss_mlp": 1.03469825, "epoch": 0.13485645573425523, "flos": 29344332165120.0, "grad_norm": 2.465549548535016, "language_loss": 0.6498239, "learning_rate": 3.885768917010744e-06, "loss": 0.67209053, "num_input_tokens_seen": 48488515, "step": 2243, "time_per_iteration": 2.6709110736846924 }, { "auxiliary_loss_clip": 0.01125086, "auxiliary_loss_mlp": 0.01046786, "balance_loss_clip": 1.04593956, "balance_loss_mlp": 1.02618706, "epoch": 0.1349165789869232, "flos": 28037112503040.0, "grad_norm": 1.7770524512670738, "language_loss": 0.72633034, "learning_rate": 3.8856391438032895e-06, "loss": 0.74804902, "num_input_tokens_seen": 48510515, "step": 2244, "time_per_iteration": 2.713803768157959 }, { "auxiliary_loss_clip": 0.0115377, "auxiliary_loss_mlp": 0.0105148, "balance_loss_clip": 1.05312431, "balance_loss_mlp": 1.03209639, "epoch": 0.13497670223959116, "flos": 22853730430080.0, "grad_norm": 1.7564166456764931, "language_loss": 0.86023217, "learning_rate": 3.88550929909221e-06, "loss": 0.88228464, "num_input_tokens_seen": 48529940, "step": 2245, "time_per_iteration": 2.626560926437378 }, { "auxiliary_loss_clip": 0.01149467, "auxiliary_loss_mlp": 0.0105327, "balance_loss_clip": 1.05035663, "balance_loss_mlp": 1.03346968, "epoch": 0.13503682549225912, "flos": 16504580453760.0, "grad_norm": 1.7861449859595755, "language_loss": 0.78912753, "learning_rate": 3.88537938288243e-06, "loss": 0.8111549, "num_input_tokens_seen": 48548190, "step": 2246, "time_per_iteration": 2.6543703079223633 }, { "auxiliary_loss_clip": 0.010304, "auxiliary_loss_mlp": 0.01015407, "balance_loss_clip": 1.03666449, "balance_loss_mlp": 1.01285601, "epoch": 0.1350969487449271, "flos": 70756303242240.0, "grad_norm": 0.7509256694227144, "language_loss": 0.6054731, "learning_rate": 3.885249395178874e-06, "loss": 0.62593114, "num_input_tokens_seen": 48613165, "step": 2247, "time_per_iteration": 3.3349809646606445 }, { "auxiliary_loss_clip": 0.01162017, "auxiliary_loss_mlp": 0.01056869, "balance_loss_clip": 1.05492628, "balance_loss_mlp": 1.03470767, "epoch": 0.13515707199759508, "flos": 23075981832960.0, "grad_norm": 2.562042993856578, "language_loss": 0.80841738, "learning_rate": 3.885119335986473e-06, "loss": 0.83060622, "num_input_tokens_seen": 48631705, "step": 2248, "time_per_iteration": 2.6279287338256836 }, { "auxiliary_loss_clip": 0.0114073, "auxiliary_loss_mlp": 0.01049128, "balance_loss_clip": 1.05086231, "balance_loss_mlp": 1.03054309, "epoch": 0.13521719525026304, "flos": 23186371305600.0, "grad_norm": 1.9247838227480492, "language_loss": 0.77108699, "learning_rate": 3.884989205310157e-06, "loss": 0.79298556, "num_input_tokens_seen": 48649740, "step": 2249, "time_per_iteration": 2.7100210189819336 }, { "auxiliary_loss_clip": 0.0112733, "auxiliary_loss_mlp": 0.01057649, "balance_loss_clip": 1.05325472, "balance_loss_mlp": 1.03863478, "epoch": 0.135277318502931, "flos": 24790931752320.0, "grad_norm": 1.7403695434994237, "language_loss": 0.84457541, "learning_rate": 3.884859003154862e-06, "loss": 0.86642522, "num_input_tokens_seen": 48671565, "step": 2250, "time_per_iteration": 2.789350986480713 }, { "auxiliary_loss_clip": 0.01155547, "auxiliary_loss_mlp": 0.0105348, "balance_loss_clip": 1.05310512, "balance_loss_mlp": 1.03243995, "epoch": 0.13533744175559898, "flos": 21908525990400.0, "grad_norm": 3.018154510939524, "language_loss": 0.81796515, "learning_rate": 3.884728729525524e-06, "loss": 0.84005541, "num_input_tokens_seen": 48690425, "step": 2251, "time_per_iteration": 2.685617208480835 }, { "auxiliary_loss_clip": 0.01165433, "auxiliary_loss_mlp": 0.01060257, "balance_loss_clip": 1.05235004, "balance_loss_mlp": 1.03888273, "epoch": 0.13539756500826694, "flos": 21211643249280.0, "grad_norm": 1.7680273527580506, "language_loss": 0.86173487, "learning_rate": 3.884598384427084e-06, "loss": 0.88399172, "num_input_tokens_seen": 48707505, "step": 2252, "time_per_iteration": 2.597219467163086 }, { "auxiliary_loss_clip": 0.01052296, "auxiliary_loss_mlp": 0.01018557, "balance_loss_clip": 1.02446079, "balance_loss_mlp": 1.01632786, "epoch": 0.1354576882609349, "flos": 63242103634560.0, "grad_norm": 0.8028920055572067, "language_loss": 0.61837333, "learning_rate": 3.884467967864485e-06, "loss": 0.6390819, "num_input_tokens_seen": 48775895, "step": 2253, "time_per_iteration": 3.25115704536438 }, { "auxiliary_loss_clip": 0.01155107, "auxiliary_loss_mlp": 0.01055639, "balance_loss_clip": 1.0539906, "balance_loss_mlp": 1.03587449, "epoch": 0.1355178115136029, "flos": 25483037984640.0, "grad_norm": 1.6376691715964824, "language_loss": 0.89441288, "learning_rate": 3.884337479842671e-06, "loss": 0.91652036, "num_input_tokens_seen": 48798370, "step": 2254, "time_per_iteration": 2.6803932189941406 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01063066, "balance_loss_clip": 1.04506016, "balance_loss_mlp": 1.03872383, "epoch": 0.13557793476627086, "flos": 21616967295360.0, "grad_norm": 2.1104776784573787, "language_loss": 0.84626925, "learning_rate": 3.884206920366591e-06, "loss": 0.86821771, "num_input_tokens_seen": 48817955, "step": 2255, "time_per_iteration": 2.7074074745178223 }, { "auxiliary_loss_clip": 0.01165481, "auxiliary_loss_mlp": 0.01058458, "balance_loss_clip": 1.05211091, "balance_loss_mlp": 1.03767991, "epoch": 0.13563805801893883, "flos": 24928253447040.0, "grad_norm": 4.791676738707355, "language_loss": 0.74684238, "learning_rate": 3.884076289441196e-06, "loss": 0.76908177, "num_input_tokens_seen": 48836330, "step": 2256, "time_per_iteration": 2.590178966522217 }, { "auxiliary_loss_clip": 0.01127027, "auxiliary_loss_mlp": 0.01054317, "balance_loss_clip": 1.04977024, "balance_loss_mlp": 1.03338361, "epoch": 0.1356981812716068, "flos": 14750272206720.0, "grad_norm": 5.890843360804152, "language_loss": 0.8309083, "learning_rate": 3.88394558707144e-06, "loss": 0.85272169, "num_input_tokens_seen": 48851890, "step": 2257, "time_per_iteration": 2.642096519470215 }, { "auxiliary_loss_clip": 0.0114984, "auxiliary_loss_mlp": 0.00780177, "balance_loss_clip": 1.05128407, "balance_loss_mlp": 1.00013828, "epoch": 0.13575830452427476, "flos": 11108571822720.0, "grad_norm": 2.1957250492246505, "language_loss": 0.82045269, "learning_rate": 3.883814813262277e-06, "loss": 0.83975297, "num_input_tokens_seen": 48865510, "step": 2258, "time_per_iteration": 2.6279473304748535 }, { "auxiliary_loss_clip": 0.01155515, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.05172098, "balance_loss_mlp": 1.03152323, "epoch": 0.13581842777694272, "flos": 17960290940160.0, "grad_norm": 2.6364031487830464, "language_loss": 0.82694167, "learning_rate": 3.883683968018669e-06, "loss": 0.849042, "num_input_tokens_seen": 48882360, "step": 2259, "time_per_iteration": 2.677804708480835 }, { "auxiliary_loss_clip": 0.01127201, "auxiliary_loss_mlp": 0.01054646, "balance_loss_clip": 1.0495683, "balance_loss_mlp": 1.03547728, "epoch": 0.1358785510296107, "flos": 22857142222080.0, "grad_norm": 2.0790748617118853, "language_loss": 0.73916006, "learning_rate": 3.8835530513455755e-06, "loss": 0.76097858, "num_input_tokens_seen": 48902700, "step": 2260, "time_per_iteration": 2.7416799068450928 }, { "auxiliary_loss_clip": 0.01144177, "auxiliary_loss_mlp": 0.01056881, "balance_loss_clip": 1.05196047, "balance_loss_mlp": 1.03691387, "epoch": 0.13593867428227868, "flos": 25739404329600.0, "grad_norm": 3.546593987683097, "language_loss": 0.74799728, "learning_rate": 3.883422063247961e-06, "loss": 0.77000785, "num_input_tokens_seen": 48922525, "step": 2261, "time_per_iteration": 2.675342559814453 }, { "auxiliary_loss_clip": 0.01170469, "auxiliary_loss_mlp": 0.01050986, "balance_loss_clip": 1.05486035, "balance_loss_mlp": 1.03043413, "epoch": 0.13599879753494665, "flos": 31249214225280.0, "grad_norm": 2.967396076139427, "language_loss": 0.63602281, "learning_rate": 3.883291003730794e-06, "loss": 0.65823734, "num_input_tokens_seen": 48942510, "step": 2262, "time_per_iteration": 2.660538911819458 }, { "auxiliary_loss_clip": 0.01148004, "auxiliary_loss_mlp": 0.01052118, "balance_loss_clip": 1.0516696, "balance_loss_mlp": 1.03216195, "epoch": 0.1360589207876146, "flos": 23915034604800.0, "grad_norm": 2.301949377353301, "language_loss": 0.81810403, "learning_rate": 3.883159872799043e-06, "loss": 0.84010524, "num_input_tokens_seen": 48962625, "step": 2263, "time_per_iteration": 2.840043783187866 }, { "auxiliary_loss_clip": 0.01098888, "auxiliary_loss_mlp": 0.01064302, "balance_loss_clip": 1.04875195, "balance_loss_mlp": 1.0410558, "epoch": 0.13611904404028258, "flos": 19974197756160.0, "grad_norm": 1.7561035968690553, "language_loss": 0.87737143, "learning_rate": 3.8830286704576815e-06, "loss": 0.89900339, "num_input_tokens_seen": 48982525, "step": 2264, "time_per_iteration": 2.784648895263672 }, { "auxiliary_loss_clip": 0.01157618, "auxiliary_loss_mlp": 0.01049521, "balance_loss_clip": 1.05161715, "balance_loss_mlp": 1.02709746, "epoch": 0.13617916729295054, "flos": 15340644144000.0, "grad_norm": 3.151792845640157, "language_loss": 0.7115528, "learning_rate": 3.882897396711683e-06, "loss": 0.7336241, "num_input_tokens_seen": 48997605, "step": 2265, "time_per_iteration": 2.6108245849609375 }, { "auxiliary_loss_clip": 0.01111831, "auxiliary_loss_mlp": 0.01042545, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.02256525, "epoch": 0.1362392905456185, "flos": 27451445247360.0, "grad_norm": 4.918827494175735, "language_loss": 0.6671263, "learning_rate": 3.882766051566027e-06, "loss": 0.68867004, "num_input_tokens_seen": 49018535, "step": 2266, "time_per_iteration": 2.7810373306274414 }, { "auxiliary_loss_clip": 0.01127539, "auxiliary_loss_mlp": 0.01057589, "balance_loss_clip": 1.05683684, "balance_loss_mlp": 1.03739524, "epoch": 0.1362994137982865, "flos": 25009017177600.0, "grad_norm": 1.707924588861666, "language_loss": 0.7634865, "learning_rate": 3.882634635025694e-06, "loss": 0.78533769, "num_input_tokens_seen": 49038865, "step": 2267, "time_per_iteration": 2.7682721614837646 }, { "auxiliary_loss_clip": 0.01133448, "auxiliary_loss_mlp": 0.01048207, "balance_loss_clip": 1.04668903, "balance_loss_mlp": 1.02641535, "epoch": 0.13635953705095447, "flos": 20303031790080.0, "grad_norm": 2.9531688260339934, "language_loss": 0.81653506, "learning_rate": 3.882503147095667e-06, "loss": 0.83835161, "num_input_tokens_seen": 49058010, "step": 2268, "time_per_iteration": 2.645081043243408 }, { "auxiliary_loss_clip": 0.01155147, "auxiliary_loss_mlp": 0.01048448, "balance_loss_clip": 1.05424881, "balance_loss_mlp": 1.02738333, "epoch": 0.13641966030362243, "flos": 31358418549120.0, "grad_norm": 1.9923150848418427, "language_loss": 0.75975174, "learning_rate": 3.882371587780931e-06, "loss": 0.78178769, "num_input_tokens_seen": 49080330, "step": 2269, "time_per_iteration": 2.6764814853668213 }, { "auxiliary_loss_clip": 0.0113465, "auxiliary_loss_mlp": 0.01049702, "balance_loss_clip": 1.04941857, "balance_loss_mlp": 1.02844727, "epoch": 0.1364797835562904, "flos": 20478095700480.0, "grad_norm": 2.1475090354855473, "language_loss": 0.81328762, "learning_rate": 3.882239957086477e-06, "loss": 0.83513117, "num_input_tokens_seen": 49097035, "step": 2270, "time_per_iteration": 2.6801655292510986 }, { "auxiliary_loss_clip": 0.01142111, "auxiliary_loss_mlp": 0.010594, "balance_loss_clip": 1.04989171, "balance_loss_mlp": 1.03773928, "epoch": 0.13653990680895836, "flos": 13078343802240.0, "grad_norm": 3.2227070482893976, "language_loss": 0.75812757, "learning_rate": 3.882108255017295e-06, "loss": 0.78014266, "num_input_tokens_seen": 49113945, "step": 2271, "time_per_iteration": 4.197805166244507 }, { "auxiliary_loss_clip": 0.01156913, "auxiliary_loss_mlp": 0.01061846, "balance_loss_clip": 1.05097795, "balance_loss_mlp": 1.03921962, "epoch": 0.13660003006162633, "flos": 16946712961920.0, "grad_norm": 2.2800716885469754, "language_loss": 0.80251753, "learning_rate": 3.881976481578379e-06, "loss": 0.82470512, "num_input_tokens_seen": 49132855, "step": 2272, "time_per_iteration": 4.1461029052734375 }, { "auxiliary_loss_clip": 0.01055091, "auxiliary_loss_mlp": 0.01042701, "balance_loss_clip": 1.02539539, "balance_loss_mlp": 1.04001904, "epoch": 0.1366601533142943, "flos": 68682749892480.0, "grad_norm": 0.7097054685047118, "language_loss": 0.60739923, "learning_rate": 3.8818446367747255e-06, "loss": 0.62837708, "num_input_tokens_seen": 49198310, "step": 2273, "time_per_iteration": 4.731219530105591 }, { "auxiliary_loss_clip": 0.01165514, "auxiliary_loss_mlp": 0.00780474, "balance_loss_clip": 1.0523783, "balance_loss_mlp": 1.00008452, "epoch": 0.13672027656696228, "flos": 19244241567360.0, "grad_norm": 2.4844725334882583, "language_loss": 0.77506429, "learning_rate": 3.881712720611336e-06, "loss": 0.79452413, "num_input_tokens_seen": 49217250, "step": 2274, "time_per_iteration": 2.7122738361358643 }, { "auxiliary_loss_clip": 0.01154937, "auxiliary_loss_mlp": 0.01054542, "balance_loss_clip": 1.05082417, "balance_loss_mlp": 1.03271496, "epoch": 0.13678039981963025, "flos": 24534924543360.0, "grad_norm": 2.391437383339344, "language_loss": 0.78256011, "learning_rate": 3.881580733093211e-06, "loss": 0.8046549, "num_input_tokens_seen": 49236615, "step": 2275, "time_per_iteration": 2.6674444675445557 }, { "auxiliary_loss_clip": 0.01154585, "auxiliary_loss_mlp": 0.01044634, "balance_loss_clip": 1.05220842, "balance_loss_mlp": 1.02449977, "epoch": 0.13684052307229821, "flos": 15669334523520.0, "grad_norm": 2.271072834476717, "language_loss": 0.81682789, "learning_rate": 3.881448674225356e-06, "loss": 0.83882004, "num_input_tokens_seen": 49253935, "step": 2276, "time_per_iteration": 4.202202558517456 }, { "auxiliary_loss_clip": 0.01164941, "auxiliary_loss_mlp": 0.01060078, "balance_loss_clip": 1.05228245, "balance_loss_mlp": 1.03604531, "epoch": 0.13690064632496618, "flos": 28364689560960.0, "grad_norm": 5.063053962589045, "language_loss": 0.69948691, "learning_rate": 3.881316544012779e-06, "loss": 0.72173715, "num_input_tokens_seen": 49273605, "step": 2277, "time_per_iteration": 2.708591938018799 }, { "auxiliary_loss_clip": 0.01160044, "auxiliary_loss_mlp": 0.00780297, "balance_loss_clip": 1.05169702, "balance_loss_mlp": 1.00017083, "epoch": 0.13696076957763414, "flos": 23404779953280.0, "grad_norm": 2.062701620585305, "language_loss": 0.80197465, "learning_rate": 3.88118434246049e-06, "loss": 0.82137805, "num_input_tokens_seen": 49291785, "step": 2278, "time_per_iteration": 2.6916158199310303 }, { "auxiliary_loss_clip": 0.01159146, "auxiliary_loss_mlp": 0.01060686, "balance_loss_clip": 1.05954766, "balance_loss_mlp": 1.03925228, "epoch": 0.1370208928303021, "flos": 37196595601920.0, "grad_norm": 7.088344486179519, "language_loss": 0.75048816, "learning_rate": 3.881052069573502e-06, "loss": 0.77268648, "num_input_tokens_seen": 49311405, "step": 2279, "time_per_iteration": 2.7316977977752686 }, { "auxiliary_loss_clip": 0.01101952, "auxiliary_loss_mlp": 0.01066685, "balance_loss_clip": 1.04605758, "balance_loss_mlp": 1.04485774, "epoch": 0.13708101608297008, "flos": 26976311118720.0, "grad_norm": 2.5293116992138223, "language_loss": 0.76743513, "learning_rate": 3.880919725356831e-06, "loss": 0.78912151, "num_input_tokens_seen": 49331835, "step": 2280, "time_per_iteration": 2.813720941543579 }, { "auxiliary_loss_clip": 0.01108594, "auxiliary_loss_mlp": 0.01060805, "balance_loss_clip": 1.04457331, "balance_loss_mlp": 1.04022956, "epoch": 0.13714113933563807, "flos": 32556864850560.0, "grad_norm": 2.0597640944890325, "language_loss": 0.79657966, "learning_rate": 3.880787309815496e-06, "loss": 0.81827366, "num_input_tokens_seen": 49352290, "step": 2281, "time_per_iteration": 2.8325345516204834 }, { "auxiliary_loss_clip": 0.0117656, "auxiliary_loss_mlp": 0.0107773, "balance_loss_clip": 1.05715084, "balance_loss_mlp": 1.05671358, "epoch": 0.13720126258830603, "flos": 16101267569280.0, "grad_norm": 2.0769142230572877, "language_loss": 0.83383757, "learning_rate": 3.880654822954518e-06, "loss": 0.85638046, "num_input_tokens_seen": 49370285, "step": 2282, "time_per_iteration": 2.5988755226135254 }, { "auxiliary_loss_clip": 0.01142098, "auxiliary_loss_mlp": 0.01075909, "balance_loss_clip": 1.04898703, "balance_loss_mlp": 1.05583453, "epoch": 0.137261385840974, "flos": 18953544798720.0, "grad_norm": 1.5269487193470777, "language_loss": 0.73526621, "learning_rate": 3.8805222647789195e-06, "loss": 0.75744629, "num_input_tokens_seen": 49389610, "step": 2283, "time_per_iteration": 2.7099714279174805 }, { "auxiliary_loss_clip": 0.01160178, "auxiliary_loss_mlp": 0.01062577, "balance_loss_clip": 1.05577087, "balance_loss_mlp": 1.04173923, "epoch": 0.13732150909364196, "flos": 23295360147840.0, "grad_norm": 2.2306012559941455, "language_loss": 0.83934438, "learning_rate": 3.880389635293729e-06, "loss": 0.86157191, "num_input_tokens_seen": 49408390, "step": 2284, "time_per_iteration": 2.7315831184387207 }, { "auxiliary_loss_clip": 0.01151427, "auxiliary_loss_mlp": 0.01070288, "balance_loss_clip": 1.05204272, "balance_loss_mlp": 1.04779351, "epoch": 0.13738163234630993, "flos": 29351263489920.0, "grad_norm": 2.0900141273659223, "language_loss": 0.7557056, "learning_rate": 3.880256934503974e-06, "loss": 0.77792281, "num_input_tokens_seen": 49427725, "step": 2285, "time_per_iteration": 2.7257747650146484 }, { "auxiliary_loss_clip": 0.01144078, "auxiliary_loss_mlp": 0.01064539, "balance_loss_clip": 1.05233073, "balance_loss_mlp": 1.04392731, "epoch": 0.1374417555989779, "flos": 26651319840000.0, "grad_norm": 2.727019945657865, "language_loss": 0.74521589, "learning_rate": 3.880124162414689e-06, "loss": 0.76730204, "num_input_tokens_seen": 49449000, "step": 2286, "time_per_iteration": 2.742582082748413 }, { "auxiliary_loss_clip": 0.0112541, "auxiliary_loss_mlp": 0.01059198, "balance_loss_clip": 1.04906356, "balance_loss_mlp": 1.03659606, "epoch": 0.1375018788516459, "flos": 28403401443840.0, "grad_norm": 2.2168449035378357, "language_loss": 0.86683542, "learning_rate": 3.879991319030908e-06, "loss": 0.88868147, "num_input_tokens_seen": 49468360, "step": 2287, "time_per_iteration": 2.802088499069214 }, { "auxiliary_loss_clip": 0.01124712, "auxiliary_loss_mlp": 0.01064517, "balance_loss_clip": 1.04803944, "balance_loss_mlp": 1.04207003, "epoch": 0.13756200210431385, "flos": 37413783187200.0, "grad_norm": 2.0592152854463106, "language_loss": 0.68410838, "learning_rate": 3.879858404357666e-06, "loss": 0.70600063, "num_input_tokens_seen": 49493450, "step": 2288, "time_per_iteration": 2.861175537109375 }, { "auxiliary_loss_clip": 0.01112106, "auxiliary_loss_mlp": 0.01071262, "balance_loss_clip": 1.05062151, "balance_loss_mlp": 1.04666936, "epoch": 0.13762212535698182, "flos": 22711021695360.0, "grad_norm": 2.3933568244149357, "language_loss": 0.87090456, "learning_rate": 3.879725418400005e-06, "loss": 0.89273822, "num_input_tokens_seen": 49511220, "step": 2289, "time_per_iteration": 2.7185773849487305 }, { "auxiliary_loss_clip": 0.01130193, "auxiliary_loss_mlp": 0.00781167, "balance_loss_clip": 1.0480957, "balance_loss_mlp": 1.00019848, "epoch": 0.13768224860964978, "flos": 23952130375680.0, "grad_norm": 1.8106848287624444, "language_loss": 0.74668044, "learning_rate": 3.879592361162969e-06, "loss": 0.76579404, "num_input_tokens_seen": 49529820, "step": 2290, "time_per_iteration": 2.6751222610473633 }, { "auxiliary_loss_clip": 0.01039657, "auxiliary_loss_mlp": 0.01081332, "balance_loss_clip": 1.03094769, "balance_loss_mlp": 1.07881641, "epoch": 0.13774237186231775, "flos": 63590438753280.0, "grad_norm": 0.7179159366671727, "language_loss": 0.51597112, "learning_rate": 3.8794592326516015e-06, "loss": 0.53718102, "num_input_tokens_seen": 49595325, "step": 2291, "time_per_iteration": 3.2823359966278076 }, { "auxiliary_loss_clip": 0.01157406, "auxiliary_loss_mlp": 0.01052846, "balance_loss_clip": 1.05224037, "balance_loss_mlp": 1.03123331, "epoch": 0.1378024951149857, "flos": 24279456038400.0, "grad_norm": 1.9326408617769533, "language_loss": 0.71273667, "learning_rate": 3.879326032870952e-06, "loss": 0.7348392, "num_input_tokens_seen": 49615850, "step": 2292, "time_per_iteration": 2.74045729637146 }, { "auxiliary_loss_clip": 0.01156871, "auxiliary_loss_mlp": 0.01049315, "balance_loss_clip": 1.05427122, "balance_loss_mlp": 1.02931166, "epoch": 0.13786261836765368, "flos": 14021537080320.0, "grad_norm": 6.592759889378346, "language_loss": 0.8047784, "learning_rate": 3.879192761826071e-06, "loss": 0.82684022, "num_input_tokens_seen": 49631860, "step": 2293, "time_per_iteration": 2.587576389312744 }, { "auxiliary_loss_clip": 0.0115787, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.0554558, "balance_loss_mlp": 1.02921653, "epoch": 0.13792274162032167, "flos": 28878679226880.0, "grad_norm": 1.9082895606463517, "language_loss": 0.78440171, "learning_rate": 3.879059419522011e-06, "loss": 0.80647767, "num_input_tokens_seen": 49652145, "step": 2294, "time_per_iteration": 2.7152793407440186 }, { "auxiliary_loss_clip": 0.01126374, "auxiliary_loss_mlp": 0.01050648, "balance_loss_clip": 1.05281758, "balance_loss_mlp": 1.03104973, "epoch": 0.13798286487298964, "flos": 21141150808320.0, "grad_norm": 1.991103290125302, "language_loss": 0.80339509, "learning_rate": 3.878926005963831e-06, "loss": 0.82516527, "num_input_tokens_seen": 49669880, "step": 2295, "time_per_iteration": 2.7026021480560303 }, { "auxiliary_loss_clip": 0.01154693, "auxiliary_loss_mlp": 0.01052186, "balance_loss_clip": 1.05239046, "balance_loss_mlp": 1.03102624, "epoch": 0.1380429881256576, "flos": 22487477402880.0, "grad_norm": 1.7450624966187134, "language_loss": 0.78661883, "learning_rate": 3.878792521156588e-06, "loss": 0.80868757, "num_input_tokens_seen": 49687255, "step": 2296, "time_per_iteration": 2.566929340362549 }, { "auxiliary_loss_clip": 0.01153425, "auxiliary_loss_mlp": 0.01069343, "balance_loss_clip": 1.05437231, "balance_loss_mlp": 1.04811132, "epoch": 0.13810311137832557, "flos": 21393674398080.0, "grad_norm": 1.7434096141785573, "language_loss": 0.78663194, "learning_rate": 3.8786589651053446e-06, "loss": 0.80885959, "num_input_tokens_seen": 49706650, "step": 2297, "time_per_iteration": 2.6254489421844482 }, { "auxiliary_loss_clip": 0.01110905, "auxiliary_loss_mlp": 0.01059754, "balance_loss_clip": 1.05296302, "balance_loss_mlp": 1.03871369, "epoch": 0.13816323463099353, "flos": 25989844930560.0, "grad_norm": 1.929043788877404, "language_loss": 0.69199705, "learning_rate": 3.878525337815164e-06, "loss": 0.71370363, "num_input_tokens_seen": 49725715, "step": 2298, "time_per_iteration": 2.791301965713501 }, { "auxiliary_loss_clip": 0.01137772, "auxiliary_loss_mlp": 0.01061768, "balance_loss_clip": 1.0517292, "balance_loss_mlp": 1.04059684, "epoch": 0.1382233578836615, "flos": 19244313394560.0, "grad_norm": 1.7910922430646712, "language_loss": 0.86382294, "learning_rate": 3.878391639291116e-06, "loss": 0.88581836, "num_input_tokens_seen": 49744710, "step": 2299, "time_per_iteration": 2.6075453758239746 }, { "auxiliary_loss_clip": 0.01166817, "auxiliary_loss_mlp": 0.01054863, "balance_loss_clip": 1.05378175, "balance_loss_mlp": 1.03292871, "epoch": 0.1382834811363295, "flos": 25666290195840.0, "grad_norm": 2.2378660690879606, "language_loss": 0.75468475, "learning_rate": 3.878257869538267e-06, "loss": 0.77690154, "num_input_tokens_seen": 49764300, "step": 2300, "time_per_iteration": 2.663328170776367 }, { "auxiliary_loss_clip": 0.01130608, "auxiliary_loss_mlp": 0.01047248, "balance_loss_clip": 1.05274105, "balance_loss_mlp": 1.02664876, "epoch": 0.13834360438899745, "flos": 19784193788160.0, "grad_norm": 2.5571861214345963, "language_loss": 0.82463622, "learning_rate": 3.878124028561692e-06, "loss": 0.8464148, "num_input_tokens_seen": 49778380, "step": 2301, "time_per_iteration": 2.6705129146575928 }, { "auxiliary_loss_clip": 0.0113862, "auxiliary_loss_mlp": 0.00777879, "balance_loss_clip": 1.05323792, "balance_loss_mlp": 1.00021625, "epoch": 0.13840372764166542, "flos": 26651858544000.0, "grad_norm": 1.9612043619218924, "language_loss": 0.85957694, "learning_rate": 3.877990116366466e-06, "loss": 0.87874192, "num_input_tokens_seen": 49797460, "step": 2302, "time_per_iteration": 2.679797410964966 }, { "auxiliary_loss_clip": 0.01059341, "auxiliary_loss_mlp": 0.01025212, "balance_loss_clip": 1.03226125, "balance_loss_mlp": 1.02244604, "epoch": 0.13846385089433338, "flos": 70510998286080.0, "grad_norm": 0.7598813547967705, "language_loss": 0.65591633, "learning_rate": 3.877856132957667e-06, "loss": 0.67676187, "num_input_tokens_seen": 49868005, "step": 2303, "time_per_iteration": 3.3249399662017822 }, { "auxiliary_loss_clip": 0.01151443, "auxiliary_loss_mlp": 0.01046478, "balance_loss_clip": 1.05337632, "balance_loss_mlp": 1.02655792, "epoch": 0.13852397414700135, "flos": 17348732956800.0, "grad_norm": 3.141207945865242, "language_loss": 0.78663635, "learning_rate": 3.877722078340374e-06, "loss": 0.80861557, "num_input_tokens_seen": 49885825, "step": 2304, "time_per_iteration": 2.7364001274108887 }, { "auxiliary_loss_clip": 0.01157514, "auxiliary_loss_mlp": 0.01043253, "balance_loss_clip": 1.05566275, "balance_loss_mlp": 1.02385736, "epoch": 0.13858409739966931, "flos": 21543781334400.0, "grad_norm": 1.7487365854034607, "language_loss": 0.77559888, "learning_rate": 3.877587952519672e-06, "loss": 0.79760659, "num_input_tokens_seen": 49905975, "step": 2305, "time_per_iteration": 2.7814202308654785 }, { "auxiliary_loss_clip": 0.01074766, "auxiliary_loss_mlp": 0.01055718, "balance_loss_clip": 1.04160607, "balance_loss_mlp": 1.03473723, "epoch": 0.13864422065233728, "flos": 21579907438080.0, "grad_norm": 1.8207477060355044, "language_loss": 0.87737936, "learning_rate": 3.877453755500647e-06, "loss": 0.89868426, "num_input_tokens_seen": 49925800, "step": 2306, "time_per_iteration": 2.917616605758667 }, { "auxiliary_loss_clip": 0.01064826, "auxiliary_loss_mlp": 0.0101208, "balance_loss_clip": 1.02692199, "balance_loss_mlp": 1.0094099, "epoch": 0.13870434390500527, "flos": 53371156872960.0, "grad_norm": 0.8728538231155298, "language_loss": 0.59008431, "learning_rate": 3.877319487288387e-06, "loss": 0.61085337, "num_input_tokens_seen": 49977620, "step": 2307, "time_per_iteration": 3.4345149993896484 }, { "auxiliary_loss_clip": 0.01169624, "auxiliary_loss_mlp": 0.00778134, "balance_loss_clip": 1.05528641, "balance_loss_mlp": 1.00021303, "epoch": 0.13876446715767324, "flos": 22565906749440.0, "grad_norm": 1.8467673932802395, "language_loss": 0.79483795, "learning_rate": 3.877185147887984e-06, "loss": 0.81431556, "num_input_tokens_seen": 49996650, "step": 2308, "time_per_iteration": 2.7137296199798584 }, { "auxiliary_loss_clip": 0.01131024, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05118585, "balance_loss_mlp": 1.03054297, "epoch": 0.1388245904103412, "flos": 20705231352960.0, "grad_norm": 2.352128383160346, "language_loss": 0.78101134, "learning_rate": 3.877050737304533e-06, "loss": 0.80282485, "num_input_tokens_seen": 50015640, "step": 2309, "time_per_iteration": 2.9259471893310547 }, { "auxiliary_loss_clip": 0.01128109, "auxiliary_loss_mlp": 0.01057348, "balance_loss_clip": 1.04979932, "balance_loss_mlp": 1.03620028, "epoch": 0.13888471366300917, "flos": 20554729367040.0, "grad_norm": 3.914796791761399, "language_loss": 0.68133545, "learning_rate": 3.876916255543129e-06, "loss": 0.70318997, "num_input_tokens_seen": 50033500, "step": 2310, "time_per_iteration": 4.27877140045166 }, { "auxiliary_loss_clip": 0.01164985, "auxiliary_loss_mlp": 0.01062516, "balance_loss_clip": 1.05356944, "balance_loss_mlp": 1.04021168, "epoch": 0.13894483691567713, "flos": 13838033473920.0, "grad_norm": 1.934954545600412, "language_loss": 0.84295756, "learning_rate": 3.8767817026088725e-06, "loss": 0.86523259, "num_input_tokens_seen": 50050075, "step": 2311, "time_per_iteration": 2.5612359046936035 }, { "auxiliary_loss_clip": 0.01173749, "auxiliary_loss_mlp": 0.01055474, "balance_loss_clip": 1.05752683, "balance_loss_mlp": 1.0350771, "epoch": 0.1390049601683451, "flos": 28031186759040.0, "grad_norm": 2.9213009430481143, "language_loss": 0.82358992, "learning_rate": 3.876647078506866e-06, "loss": 0.84588212, "num_input_tokens_seen": 50070080, "step": 2312, "time_per_iteration": 5.737139701843262 }, { "auxiliary_loss_clip": 0.01129781, "auxiliary_loss_mlp": 0.00778347, "balance_loss_clip": 1.05464363, "balance_loss_mlp": 1.00023031, "epoch": 0.13906508342101306, "flos": 26756860976640.0, "grad_norm": 2.109799495913242, "language_loss": 0.86732674, "learning_rate": 3.876512383242215e-06, "loss": 0.88640809, "num_input_tokens_seen": 50090040, "step": 2313, "time_per_iteration": 2.8402304649353027 }, { "auxiliary_loss_clip": 0.01168088, "auxiliary_loss_mlp": 0.01061738, "balance_loss_clip": 1.05670547, "balance_loss_mlp": 1.04115057, "epoch": 0.13912520667368106, "flos": 24535104111360.0, "grad_norm": 1.784990717237318, "language_loss": 0.79935932, "learning_rate": 3.876377616820024e-06, "loss": 0.8216576, "num_input_tokens_seen": 50110595, "step": 2314, "time_per_iteration": 2.683448076248169 }, { "auxiliary_loss_clip": 0.01124732, "auxiliary_loss_mlp": 0.01061041, "balance_loss_clip": 1.04845023, "balance_loss_mlp": 1.04103708, "epoch": 0.13918532992634902, "flos": 19383215287680.0, "grad_norm": 2.585875079553688, "language_loss": 0.85367405, "learning_rate": 3.876242779245409e-06, "loss": 0.87553179, "num_input_tokens_seen": 50125430, "step": 2315, "time_per_iteration": 4.332594394683838 }, { "auxiliary_loss_clip": 0.01156122, "auxiliary_loss_mlp": 0.01058532, "balance_loss_clip": 1.05397022, "balance_loss_mlp": 1.0372889, "epoch": 0.139245453179017, "flos": 21323756574720.0, "grad_norm": 2.333331492160627, "language_loss": 0.77170396, "learning_rate": 3.876107870523477e-06, "loss": 0.79385042, "num_input_tokens_seen": 50144120, "step": 2316, "time_per_iteration": 2.654604911804199 }, { "auxiliary_loss_clip": 0.01163967, "auxiliary_loss_mlp": 0.00780027, "balance_loss_clip": 1.05353916, "balance_loss_mlp": 1.00024533, "epoch": 0.13930557643168495, "flos": 19500607912320.0, "grad_norm": 2.1485284032262086, "language_loss": 0.76820493, "learning_rate": 3.875972890659349e-06, "loss": 0.78764486, "num_input_tokens_seen": 50162500, "step": 2317, "time_per_iteration": 2.6501235961914062 }, { "auxiliary_loss_clip": 0.01144052, "auxiliary_loss_mlp": 0.01061042, "balance_loss_clip": 1.05156648, "balance_loss_mlp": 1.04074025, "epoch": 0.13936569968435292, "flos": 25410821690880.0, "grad_norm": 1.7797832869421444, "language_loss": 0.80185997, "learning_rate": 3.875837839658139e-06, "loss": 0.82391089, "num_input_tokens_seen": 50182415, "step": 2318, "time_per_iteration": 2.7097995281219482 }, { "auxiliary_loss_clip": 0.01049096, "auxiliary_loss_mlp": 0.01048478, "balance_loss_clip": 1.03358936, "balance_loss_mlp": 1.04518783, "epoch": 0.13942582293702088, "flos": 70771063731840.0, "grad_norm": 0.854553938374386, "language_loss": 0.59004617, "learning_rate": 3.87570271752497e-06, "loss": 0.61102188, "num_input_tokens_seen": 50245160, "step": 2319, "time_per_iteration": 3.2631640434265137 }, { "auxiliary_loss_clip": 0.0111484, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.04508984, "balance_loss_mlp": 1.03437412, "epoch": 0.13948594618968888, "flos": 35590885920000.0, "grad_norm": 2.3313836691947722, "language_loss": 0.64993447, "learning_rate": 3.875567524264967e-06, "loss": 0.67163646, "num_input_tokens_seen": 50268215, "step": 2320, "time_per_iteration": 2.8668782711029053 }, { "auxiliary_loss_clip": 0.01096421, "auxiliary_loss_mlp": 0.01056652, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 1.03521848, "epoch": 0.13954606944235684, "flos": 21105204272640.0, "grad_norm": 2.285151015895421, "language_loss": 0.70708811, "learning_rate": 3.875432259883256e-06, "loss": 0.72861886, "num_input_tokens_seen": 50288575, "step": 2321, "time_per_iteration": 2.8273603916168213 }, { "auxiliary_loss_clip": 0.01117698, "auxiliary_loss_mlp": 0.01061754, "balance_loss_clip": 1.04603076, "balance_loss_mlp": 1.03698206, "epoch": 0.1396061926950248, "flos": 25044425009280.0, "grad_norm": 1.7926270181208543, "language_loss": 0.85931206, "learning_rate": 3.875296924384965e-06, "loss": 0.88110662, "num_input_tokens_seen": 50308735, "step": 2322, "time_per_iteration": 2.833807945251465 }, { "auxiliary_loss_clip": 0.01120545, "auxiliary_loss_mlp": 0.01055036, "balance_loss_clip": 1.04616976, "balance_loss_mlp": 1.03568828, "epoch": 0.13966631594769277, "flos": 37634023428480.0, "grad_norm": 1.5963293576391182, "language_loss": 0.67159557, "learning_rate": 3.875161517775226e-06, "loss": 0.69335139, "num_input_tokens_seen": 50331025, "step": 2323, "time_per_iteration": 2.875265121459961 }, { "auxiliary_loss_clip": 0.01127992, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04900301, "balance_loss_mlp": 1.03432369, "epoch": 0.13972643920036074, "flos": 16690993061760.0, "grad_norm": 2.0757452253793485, "language_loss": 0.88878977, "learning_rate": 3.875026040059175e-06, "loss": 0.9106214, "num_input_tokens_seen": 50349725, "step": 2324, "time_per_iteration": 2.6841063499450684 }, { "auxiliary_loss_clip": 0.01154799, "auxiliary_loss_mlp": 0.01056834, "balance_loss_clip": 1.05145955, "balance_loss_mlp": 1.03541231, "epoch": 0.1397865624530287, "flos": 23331055288320.0, "grad_norm": 2.8450589371660526, "language_loss": 0.70621002, "learning_rate": 3.8748904912419485e-06, "loss": 0.72832638, "num_input_tokens_seen": 50367965, "step": 2325, "time_per_iteration": 2.694218397140503 }, { "auxiliary_loss_clip": 0.01134393, "auxiliary_loss_mlp": 0.00778751, "balance_loss_clip": 1.05273592, "balance_loss_mlp": 1.00028229, "epoch": 0.13984668570569667, "flos": 22778317825920.0, "grad_norm": 2.230299294128946, "language_loss": 0.81657004, "learning_rate": 3.874754871328688e-06, "loss": 0.83570141, "num_input_tokens_seen": 50385605, "step": 2326, "time_per_iteration": 2.715306282043457 }, { "auxiliary_loss_clip": 0.01151297, "auxiliary_loss_mlp": 0.01045813, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.02745473, "epoch": 0.13990680895836466, "flos": 19464553635840.0, "grad_norm": 1.729713540462037, "language_loss": 0.89241689, "learning_rate": 3.874619180324534e-06, "loss": 0.91438794, "num_input_tokens_seen": 50403985, "step": 2327, "time_per_iteration": 2.679626941680908 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01057397, "balance_loss_clip": 1.04873121, "balance_loss_mlp": 1.0352242, "epoch": 0.13996693221103262, "flos": 20303283185280.0, "grad_norm": 2.9217951598838363, "language_loss": 0.84760427, "learning_rate": 3.874483418234632e-06, "loss": 0.86937821, "num_input_tokens_seen": 50421590, "step": 2328, "time_per_iteration": 2.7277352809906006 }, { "auxiliary_loss_clip": 0.01151775, "auxiliary_loss_mlp": 0.0104443, "balance_loss_clip": 1.05300856, "balance_loss_mlp": 1.02421176, "epoch": 0.1400270554637006, "flos": 26617707688320.0, "grad_norm": 1.6116398320348613, "language_loss": 0.73835862, "learning_rate": 3.874347585064131e-06, "loss": 0.76032066, "num_input_tokens_seen": 50443945, "step": 2329, "time_per_iteration": 2.6911025047302246 }, { "auxiliary_loss_clip": 0.01153137, "auxiliary_loss_mlp": 0.01046755, "balance_loss_clip": 1.05254042, "balance_loss_mlp": 1.02644169, "epoch": 0.14008717871636855, "flos": 19391475415680.0, "grad_norm": 2.565670250114109, "language_loss": 0.78373277, "learning_rate": 3.874211680818183e-06, "loss": 0.80573165, "num_input_tokens_seen": 50462065, "step": 2330, "time_per_iteration": 2.703225612640381 }, { "auxiliary_loss_clip": 0.01144455, "auxiliary_loss_mlp": 0.01046085, "balance_loss_clip": 1.05247569, "balance_loss_mlp": 1.02692819, "epoch": 0.14014730196903652, "flos": 15304266645120.0, "grad_norm": 2.2215524337864143, "language_loss": 0.72115719, "learning_rate": 3.87407570550194e-06, "loss": 0.74306256, "num_input_tokens_seen": 50479565, "step": 2331, "time_per_iteration": 2.7044217586517334 }, { "auxiliary_loss_clip": 0.01159691, "auxiliary_loss_mlp": 0.01051771, "balance_loss_clip": 1.0558939, "balance_loss_mlp": 1.03234017, "epoch": 0.14020742522170448, "flos": 14939701557120.0, "grad_norm": 1.5806705357110964, "language_loss": 0.72634697, "learning_rate": 3.873939659120557e-06, "loss": 0.7484616, "num_input_tokens_seen": 50497305, "step": 2332, "time_per_iteration": 2.647564649581909 }, { "auxiliary_loss_clip": 0.01063058, "auxiliary_loss_mlp": 0.01022564, "balance_loss_clip": 1.03391051, "balance_loss_mlp": 1.01944101, "epoch": 0.14026754847437245, "flos": 48824580044160.0, "grad_norm": 0.8445516092095569, "language_loss": 0.56185365, "learning_rate": 3.873803541679196e-06, "loss": 0.58270991, "num_input_tokens_seen": 50549735, "step": 2333, "time_per_iteration": 3.038390636444092 }, { "auxiliary_loss_clip": 0.01127793, "auxiliary_loss_mlp": 0.01045888, "balance_loss_clip": 1.05246043, "balance_loss_mlp": 1.02587318, "epoch": 0.14032767172704044, "flos": 25773267876480.0, "grad_norm": 1.7702774265545234, "language_loss": 0.82728767, "learning_rate": 3.873667353183016e-06, "loss": 0.84902453, "num_input_tokens_seen": 50570100, "step": 2334, "time_per_iteration": 2.7205803394317627 }, { "auxiliary_loss_clip": 0.01129244, "auxiliary_loss_mlp": 0.01044663, "balance_loss_clip": 1.05110407, "balance_loss_mlp": 1.02593565, "epoch": 0.1403877949797084, "flos": 21216312017280.0, "grad_norm": 1.7790720657464538, "language_loss": 0.80958998, "learning_rate": 3.8735310936371825e-06, "loss": 0.83132899, "num_input_tokens_seen": 50589185, "step": 2335, "time_per_iteration": 2.7844314575195312 }, { "auxiliary_loss_clip": 0.01108373, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.04802513, "balance_loss_mlp": 1.02160311, "epoch": 0.14044791823237637, "flos": 22747973811840.0, "grad_norm": 1.739505291070366, "language_loss": 0.81987065, "learning_rate": 3.873394763046862e-06, "loss": 0.84139174, "num_input_tokens_seen": 50609645, "step": 2336, "time_per_iteration": 2.7787351608276367 }, { "auxiliary_loss_clip": 0.01150445, "auxiliary_loss_mlp": 0.01046319, "balance_loss_clip": 1.05603921, "balance_loss_mlp": 1.02709103, "epoch": 0.14050804148504434, "flos": 22964443125120.0, "grad_norm": 1.7584048007565314, "language_loss": 0.80606967, "learning_rate": 3.873258361417225e-06, "loss": 0.82803738, "num_input_tokens_seen": 50628385, "step": 2337, "time_per_iteration": 2.6119275093078613 }, { "auxiliary_loss_clip": 0.01150898, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.05363941, "balance_loss_mlp": 1.03202438, "epoch": 0.1405681647377123, "flos": 22200336080640.0, "grad_norm": 2.383737065589604, "language_loss": 0.78994334, "learning_rate": 3.873121888753442e-06, "loss": 0.81196302, "num_input_tokens_seen": 50647260, "step": 2338, "time_per_iteration": 2.672427177429199 }, { "auxiliary_loss_clip": 0.01158377, "auxiliary_loss_mlp": 0.01050168, "balance_loss_clip": 1.05894089, "balance_loss_mlp": 1.02919865, "epoch": 0.14062828799038027, "flos": 23732787974400.0, "grad_norm": 2.117725014058833, "language_loss": 0.79766536, "learning_rate": 3.87298534506069e-06, "loss": 0.81975079, "num_input_tokens_seen": 50666130, "step": 2339, "time_per_iteration": 2.68635892868042 }, { "auxiliary_loss_clip": 0.01097095, "auxiliary_loss_mlp": 0.01065327, "balance_loss_clip": 1.04686952, "balance_loss_mlp": 1.04463232, "epoch": 0.14068841124304826, "flos": 39202493685120.0, "grad_norm": 2.0269377249156793, "language_loss": 0.65632963, "learning_rate": 3.872848730344146e-06, "loss": 0.67795384, "num_input_tokens_seen": 50687440, "step": 2340, "time_per_iteration": 2.9426286220550537 }, { "auxiliary_loss_clip": 0.0114865, "auxiliary_loss_mlp": 0.01050723, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.0310297, "epoch": 0.14074853449571623, "flos": 20192283181440.0, "grad_norm": 2.8518792803213917, "language_loss": 0.78760445, "learning_rate": 3.87271204460899e-06, "loss": 0.80959821, "num_input_tokens_seen": 50704030, "step": 2341, "time_per_iteration": 2.8814899921417236 }, { "auxiliary_loss_clip": 0.01162758, "auxiliary_loss_mlp": 0.01057334, "balance_loss_clip": 1.0554986, "balance_loss_mlp": 1.03876162, "epoch": 0.1408086577483842, "flos": 18405871153920.0, "grad_norm": 2.2693198584224454, "language_loss": 0.80322361, "learning_rate": 3.8725752878604066e-06, "loss": 0.82542449, "num_input_tokens_seen": 50723305, "step": 2342, "time_per_iteration": 2.604814291000366 }, { "auxiliary_loss_clip": 0.01152048, "auxiliary_loss_mlp": 0.01056552, "balance_loss_clip": 1.05776191, "balance_loss_mlp": 1.03858757, "epoch": 0.14086878100105216, "flos": 25264593423360.0, "grad_norm": 2.4727499245104343, "language_loss": 0.77686632, "learning_rate": 3.87243846010358e-06, "loss": 0.79895234, "num_input_tokens_seen": 50743270, "step": 2343, "time_per_iteration": 2.676823854446411 }, { "auxiliary_loss_clip": 0.0105659, "auxiliary_loss_mlp": 0.01037584, "balance_loss_clip": 1.03650093, "balance_loss_mlp": 1.03438878, "epoch": 0.14092890425372012, "flos": 65978388869760.0, "grad_norm": 0.8521752699932517, "language_loss": 0.61553669, "learning_rate": 3.872301561343699e-06, "loss": 0.63647842, "num_input_tokens_seen": 50802710, "step": 2344, "time_per_iteration": 3.156792402267456 }, { "auxiliary_loss_clip": 0.01147637, "auxiliary_loss_mlp": 0.01049362, "balance_loss_clip": 1.05167484, "balance_loss_mlp": 1.03121877, "epoch": 0.1409890275063881, "flos": 23694973931520.0, "grad_norm": 1.558783678159347, "language_loss": 0.64331692, "learning_rate": 3.872164591585956e-06, "loss": 0.6652869, "num_input_tokens_seen": 50822625, "step": 2345, "time_per_iteration": 2.654100179672241 }, { "auxiliary_loss_clip": 0.01154879, "auxiliary_loss_mlp": 0.0104633, "balance_loss_clip": 1.05009735, "balance_loss_mlp": 1.02562308, "epoch": 0.14104915075905605, "flos": 23623152687360.0, "grad_norm": 2.26337760563351, "language_loss": 0.73892581, "learning_rate": 3.8720275508355435e-06, "loss": 0.76093793, "num_input_tokens_seen": 50842330, "step": 2346, "time_per_iteration": 2.7032830715179443 }, { "auxiliary_loss_clip": 0.0115447, "auxiliary_loss_mlp": 0.01048793, "balance_loss_clip": 1.0572027, "balance_loss_mlp": 1.02929008, "epoch": 0.14110927401172405, "flos": 20595165102720.0, "grad_norm": 1.7675181118684058, "language_loss": 0.7727294, "learning_rate": 3.8718904390976585e-06, "loss": 0.79476202, "num_input_tokens_seen": 50861035, "step": 2347, "time_per_iteration": 2.678647518157959 }, { "auxiliary_loss_clip": 0.01164131, "auxiliary_loss_mlp": 0.01052088, "balance_loss_clip": 1.05490732, "balance_loss_mlp": 1.03370619, "epoch": 0.141169397264392, "flos": 28548049512960.0, "grad_norm": 2.592464695784388, "language_loss": 0.76753062, "learning_rate": 3.8717532563775e-06, "loss": 0.78969282, "num_input_tokens_seen": 50880105, "step": 2348, "time_per_iteration": 2.7450597286224365 }, { "auxiliary_loss_clip": 0.01147264, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.05267334, "balance_loss_mlp": 1.02295136, "epoch": 0.14122952051705998, "flos": 17092258871040.0, "grad_norm": 1.8617784303344698, "language_loss": 0.86794335, "learning_rate": 3.871616002680272e-06, "loss": 0.8898412, "num_input_tokens_seen": 50897720, "step": 2349, "time_per_iteration": 2.662508964538574 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01048616, "balance_loss_clip": 1.05632985, "balance_loss_mlp": 1.02897048, "epoch": 0.14128964376972794, "flos": 28946801370240.0, "grad_norm": 2.650060051711467, "language_loss": 0.88758218, "learning_rate": 3.871478678011177e-06, "loss": 0.90957808, "num_input_tokens_seen": 50918385, "step": 2350, "time_per_iteration": 4.1697962284088135 }, { "auxiliary_loss_clip": 0.01142704, "auxiliary_loss_mlp": 0.01045134, "balance_loss_clip": 1.05369377, "balance_loss_mlp": 1.02442729, "epoch": 0.1413497670223959, "flos": 18989778643200.0, "grad_norm": 1.801090232061166, "language_loss": 0.8094542, "learning_rate": 3.871341282375423e-06, "loss": 0.83133256, "num_input_tokens_seen": 50938270, "step": 2351, "time_per_iteration": 2.6769907474517822 }, { "auxiliary_loss_clip": 0.01149546, "auxiliary_loss_mlp": 0.01040141, "balance_loss_clip": 1.05100775, "balance_loss_mlp": 1.02096045, "epoch": 0.14140989027506387, "flos": 29862236413440.0, "grad_norm": 2.590933181784672, "language_loss": 0.82796198, "learning_rate": 3.871203815778219e-06, "loss": 0.84985888, "num_input_tokens_seen": 50958155, "step": 2352, "time_per_iteration": 5.713203430175781 }, { "auxiliary_loss_clip": 0.01063742, "auxiliary_loss_mlp": 0.01009803, "balance_loss_clip": 1.03462291, "balance_loss_mlp": 1.0060122, "epoch": 0.14147001352773186, "flos": 62079532041600.0, "grad_norm": 0.9118003008214054, "language_loss": 0.61876011, "learning_rate": 3.87106627822478e-06, "loss": 0.63949555, "num_input_tokens_seen": 51020705, "step": 2353, "time_per_iteration": 3.1698319911956787 }, { "auxiliary_loss_clip": 0.01134069, "auxiliary_loss_mlp": 0.01049094, "balance_loss_clip": 1.0536828, "balance_loss_mlp": 1.03039002, "epoch": 0.14153013678039983, "flos": 22017514832640.0, "grad_norm": 1.5909284402791886, "language_loss": 0.87075388, "learning_rate": 3.8709286697203196e-06, "loss": 0.89258552, "num_input_tokens_seen": 51039995, "step": 2354, "time_per_iteration": 2.6781272888183594 }, { "auxiliary_loss_clip": 0.01124592, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.0527302, "balance_loss_mlp": 1.02562428, "epoch": 0.1415902600330678, "flos": 19720093968000.0, "grad_norm": 2.035812967878614, "language_loss": 0.74701214, "learning_rate": 3.870790990270057e-06, "loss": 0.76871634, "num_input_tokens_seen": 51059075, "step": 2355, "time_per_iteration": 4.464852571487427 }, { "auxiliary_loss_clip": 0.01062228, "auxiliary_loss_mlp": 0.01003337, "balance_loss_clip": 1.03320074, "balance_loss_mlp": 0.99947417, "epoch": 0.14165038328573576, "flos": 65900929190400.0, "grad_norm": 0.6801443738216844, "language_loss": 0.51819825, "learning_rate": 3.870653239879212e-06, "loss": 0.53885388, "num_input_tokens_seen": 51120380, "step": 2356, "time_per_iteration": 3.094026803970337 }, { "auxiliary_loss_clip": 0.01165635, "auxiliary_loss_mlp": 0.01057535, "balance_loss_clip": 1.05662966, "balance_loss_mlp": 1.0379492, "epoch": 0.14171050653840372, "flos": 12130158533760.0, "grad_norm": 1.9928903491175036, "language_loss": 0.70598352, "learning_rate": 3.8705154185530095e-06, "loss": 0.72821522, "num_input_tokens_seen": 51136950, "step": 2357, "time_per_iteration": 2.569486141204834 }, { "auxiliary_loss_clip": 0.01117022, "auxiliary_loss_mlp": 0.01054948, "balance_loss_clip": 1.04706419, "balance_loss_mlp": 1.0355413, "epoch": 0.1417706297910717, "flos": 20412487509120.0, "grad_norm": 2.1046358800035234, "language_loss": 0.82020235, "learning_rate": 3.870377526296674e-06, "loss": 0.84192204, "num_input_tokens_seen": 51155175, "step": 2358, "time_per_iteration": 2.719344139099121 }, { "auxiliary_loss_clip": 0.01145283, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.05257189, "balance_loss_mlp": 1.02932954, "epoch": 0.14183075304373965, "flos": 22380607463040.0, "grad_norm": 2.2336131404929787, "language_loss": 0.71575904, "learning_rate": 3.870239563115436e-06, "loss": 0.73771417, "num_input_tokens_seen": 51174500, "step": 2359, "time_per_iteration": 2.6914820671081543 }, { "auxiliary_loss_clip": 0.0111529, "auxiliary_loss_mlp": 0.007787, "balance_loss_clip": 1.0526464, "balance_loss_mlp": 1.00033379, "epoch": 0.14189087629640765, "flos": 21580913018880.0, "grad_norm": 2.4314273775499906, "language_loss": 0.7541784, "learning_rate": 3.870101529014526e-06, "loss": 0.77311832, "num_input_tokens_seen": 51194270, "step": 2360, "time_per_iteration": 2.803493022918701 }, { "auxiliary_loss_clip": 0.01108644, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.0491271, "balance_loss_mlp": 1.03136814, "epoch": 0.1419509995490756, "flos": 20008564093440.0, "grad_norm": 2.374719540518049, "language_loss": 0.81920552, "learning_rate": 3.869963423999178e-06, "loss": 0.84082878, "num_input_tokens_seen": 51211850, "step": 2361, "time_per_iteration": 2.8039920330047607 }, { "auxiliary_loss_clip": 0.0115065, "auxiliary_loss_mlp": 0.01057946, "balance_loss_clip": 1.05230403, "balance_loss_mlp": 1.03802609, "epoch": 0.14201112280174358, "flos": 31941464112000.0, "grad_norm": 1.9397979109407166, "language_loss": 0.74081504, "learning_rate": 3.86982524807463e-06, "loss": 0.76290095, "num_input_tokens_seen": 51233545, "step": 2362, "time_per_iteration": 2.7272114753723145 }, { "auxiliary_loss_clip": 0.0115354, "auxiliary_loss_mlp": 0.01048321, "balance_loss_clip": 1.05355787, "balance_loss_mlp": 1.02861547, "epoch": 0.14207124605441154, "flos": 41464147582080.0, "grad_norm": 1.7489521991344694, "language_loss": 0.74221587, "learning_rate": 3.869687001246122e-06, "loss": 0.76423442, "num_input_tokens_seen": 51257615, "step": 2363, "time_per_iteration": 2.789802312850952 }, { "auxiliary_loss_clip": 0.01128802, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.04769099, "balance_loss_mlp": 1.03180885, "epoch": 0.1421313693070795, "flos": 31905086613120.0, "grad_norm": 1.7832713632097879, "language_loss": 0.73034167, "learning_rate": 3.8695486835188946e-06, "loss": 0.75215018, "num_input_tokens_seen": 51279645, "step": 2364, "time_per_iteration": 2.8508312702178955 }, { "auxiliary_loss_clip": 0.01142769, "auxiliary_loss_mlp": 0.01049829, "balance_loss_clip": 1.05160844, "balance_loss_mlp": 1.03207827, "epoch": 0.14219149255974747, "flos": 26871165031680.0, "grad_norm": 1.875477198706701, "language_loss": 0.90395916, "learning_rate": 3.869410294898195e-06, "loss": 0.92588514, "num_input_tokens_seen": 51299775, "step": 2365, "time_per_iteration": 2.6807806491851807 }, { "auxiliary_loss_clip": 0.01127252, "auxiliary_loss_mlp": 0.01054912, "balance_loss_clip": 1.04759967, "balance_loss_mlp": 1.03394318, "epoch": 0.14225161581241544, "flos": 27454426076160.0, "grad_norm": 1.719218863067841, "language_loss": 0.65305161, "learning_rate": 3.869271835389268e-06, "loss": 0.67487329, "num_input_tokens_seen": 51319430, "step": 2366, "time_per_iteration": 2.7293641567230225 }, { "auxiliary_loss_clip": 0.01143576, "auxiliary_loss_mlp": 0.01051629, "balance_loss_clip": 1.05218709, "balance_loss_mlp": 1.03058839, "epoch": 0.14231173906508343, "flos": 10561436881920.0, "grad_norm": 2.3740196514966256, "language_loss": 0.80331928, "learning_rate": 3.8691333049973665e-06, "loss": 0.82527137, "num_input_tokens_seen": 51336045, "step": 2367, "time_per_iteration": 2.67529296875 }, { "auxiliary_loss_clip": 0.01138517, "auxiliary_loss_mlp": 0.01062653, "balance_loss_clip": 1.05117869, "balance_loss_mlp": 1.0402534, "epoch": 0.1423718623177514, "flos": 28360882719360.0, "grad_norm": 2.0081973718426283, "language_loss": 0.82346755, "learning_rate": 3.868994703727742e-06, "loss": 0.84547925, "num_input_tokens_seen": 51357030, "step": 2368, "time_per_iteration": 2.7447288036346436 }, { "auxiliary_loss_clip": 0.01122755, "auxiliary_loss_mlp": 0.01052229, "balance_loss_clip": 1.05180073, "balance_loss_mlp": 1.03065228, "epoch": 0.14243198557041936, "flos": 19354235990400.0, "grad_norm": 2.6586279461428144, "language_loss": 0.8711772, "learning_rate": 3.868856031585652e-06, "loss": 0.89292705, "num_input_tokens_seen": 51374890, "step": 2369, "time_per_iteration": 2.736872673034668 }, { "auxiliary_loss_clip": 0.01127301, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.05011857, "balance_loss_mlp": 1.02170992, "epoch": 0.14249210882308733, "flos": 28806857982720.0, "grad_norm": 1.7900856007188275, "language_loss": 0.75828248, "learning_rate": 3.868717288576354e-06, "loss": 0.77997375, "num_input_tokens_seen": 51398100, "step": 2370, "time_per_iteration": 2.762603998184204 }, { "auxiliary_loss_clip": 0.01158195, "auxiliary_loss_mlp": 0.00781098, "balance_loss_clip": 1.05268764, "balance_loss_mlp": 1.00028419, "epoch": 0.1425522320757553, "flos": 21835016807040.0, "grad_norm": 1.7770434161065212, "language_loss": 0.82934797, "learning_rate": 3.868578474705109e-06, "loss": 0.84874088, "num_input_tokens_seen": 51418745, "step": 2371, "time_per_iteration": 2.6224656105041504 }, { "auxiliary_loss_clip": 0.01173447, "auxiliary_loss_mlp": 0.0105718, "balance_loss_clip": 1.05837953, "balance_loss_mlp": 1.03638947, "epoch": 0.14261235532842326, "flos": 17311457617920.0, "grad_norm": 2.0431625041319825, "language_loss": 0.82982123, "learning_rate": 3.868439589977181e-06, "loss": 0.85212755, "num_input_tokens_seen": 51437455, "step": 2372, "time_per_iteration": 2.575690269470215 }, { "auxiliary_loss_clip": 0.01172196, "auxiliary_loss_mlp": 0.0105022, "balance_loss_clip": 1.0581125, "balance_loss_mlp": 1.0285356, "epoch": 0.14267247858109125, "flos": 18806741913600.0, "grad_norm": 3.3704326167450582, "language_loss": 0.8438468, "learning_rate": 3.868300634397836e-06, "loss": 0.86607099, "num_input_tokens_seen": 51455710, "step": 2373, "time_per_iteration": 2.7160356044769287 }, { "auxiliary_loss_clip": 0.01141742, "auxiliary_loss_mlp": 0.01055295, "balance_loss_clip": 1.05160809, "balance_loss_mlp": 1.03598261, "epoch": 0.14273260183375922, "flos": 11358904682880.0, "grad_norm": 3.5035356392631836, "language_loss": 0.86027539, "learning_rate": 3.8681616079723445e-06, "loss": 0.88224572, "num_input_tokens_seen": 51471270, "step": 2374, "time_per_iteration": 2.6845595836639404 }, { "auxiliary_loss_clip": 0.01164623, "auxiliary_loss_mlp": 0.01061957, "balance_loss_clip": 1.05515146, "balance_loss_mlp": 1.03996301, "epoch": 0.14279272508642718, "flos": 27567688636800.0, "grad_norm": 1.6059368749673757, "language_loss": 0.79169822, "learning_rate": 3.868022510705977e-06, "loss": 0.81396401, "num_input_tokens_seen": 51492705, "step": 2375, "time_per_iteration": 2.738156795501709 }, { "auxiliary_loss_clip": 0.01163115, "auxiliary_loss_mlp": 0.01058224, "balance_loss_clip": 1.05641222, "balance_loss_mlp": 1.0368259, "epoch": 0.14285284833909515, "flos": 16252559654400.0, "grad_norm": 2.559097553272684, "language_loss": 0.76907504, "learning_rate": 3.867883342604009e-06, "loss": 0.79128844, "num_input_tokens_seen": 51510780, "step": 2376, "time_per_iteration": 2.751178741455078 }, { "auxiliary_loss_clip": 0.01160115, "auxiliary_loss_mlp": 0.0105168, "balance_loss_clip": 1.054515, "balance_loss_mlp": 1.03040111, "epoch": 0.1429129715917631, "flos": 19755609540480.0, "grad_norm": 2.7331999261828592, "language_loss": 0.92795181, "learning_rate": 3.867744103671717e-06, "loss": 0.95006979, "num_input_tokens_seen": 51531400, "step": 2377, "time_per_iteration": 2.6584725379943848 }, { "auxiliary_loss_clip": 0.01147246, "auxiliary_loss_mlp": 0.01061419, "balance_loss_clip": 1.05362535, "balance_loss_mlp": 1.03793442, "epoch": 0.14297309484443108, "flos": 21137092571520.0, "grad_norm": 2.9252003733204894, "language_loss": 0.91754365, "learning_rate": 3.867604793914382e-06, "loss": 0.93963027, "num_input_tokens_seen": 51548215, "step": 2378, "time_per_iteration": 2.8107075691223145 }, { "auxiliary_loss_clip": 0.01164153, "auxiliary_loss_mlp": 0.0105303, "balance_loss_clip": 1.05712187, "balance_loss_mlp": 1.03092849, "epoch": 0.14303321809709904, "flos": 23586667447680.0, "grad_norm": 2.1292902842232966, "language_loss": 0.73961306, "learning_rate": 3.8674654133372864e-06, "loss": 0.76178491, "num_input_tokens_seen": 51566820, "step": 2379, "time_per_iteration": 2.7029881477355957 }, { "auxiliary_loss_clip": 0.01137551, "auxiliary_loss_mlp": 0.01055012, "balance_loss_clip": 1.05204058, "balance_loss_mlp": 1.0330174, "epoch": 0.14309334134976703, "flos": 15888281875200.0, "grad_norm": 2.1898245228218784, "language_loss": 0.78818595, "learning_rate": 3.867325961945714e-06, "loss": 0.81011152, "num_input_tokens_seen": 51585075, "step": 2380, "time_per_iteration": 2.7213294506073 }, { "auxiliary_loss_clip": 0.01126442, "auxiliary_loss_mlp": 0.01057409, "balance_loss_clip": 1.05457354, "balance_loss_mlp": 1.03580785, "epoch": 0.143153464602435, "flos": 16325601960960.0, "grad_norm": 4.699041640805274, "language_loss": 0.87895483, "learning_rate": 3.867186439744955e-06, "loss": 0.90079331, "num_input_tokens_seen": 51603185, "step": 2381, "time_per_iteration": 2.7144110202789307 }, { "auxiliary_loss_clip": 0.01141327, "auxiliary_loss_mlp": 0.01052708, "balance_loss_clip": 1.05200005, "balance_loss_mlp": 1.03088117, "epoch": 0.14321358785510296, "flos": 17092079303040.0, "grad_norm": 2.47508592106904, "language_loss": 0.76396096, "learning_rate": 3.867046846740299e-06, "loss": 0.78590137, "num_input_tokens_seen": 51620880, "step": 2382, "time_per_iteration": 2.6185953617095947 }, { "auxiliary_loss_clip": 0.01132222, "auxiliary_loss_mlp": 0.01054019, "balance_loss_clip": 1.05162048, "balance_loss_mlp": 1.03319359, "epoch": 0.14327371110777093, "flos": 26322916769280.0, "grad_norm": 4.3017095308344375, "language_loss": 0.76636785, "learning_rate": 3.866907182937039e-06, "loss": 0.7882303, "num_input_tokens_seen": 51640170, "step": 2383, "time_per_iteration": 2.7408525943756104 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01052888, "balance_loss_clip": 1.05078864, "balance_loss_mlp": 1.02926064, "epoch": 0.1433338343604389, "flos": 18076462502400.0, "grad_norm": 2.3526544982502284, "language_loss": 0.87649417, "learning_rate": 3.866767448340471e-06, "loss": 0.8984201, "num_input_tokens_seen": 51656580, "step": 2384, "time_per_iteration": 2.6798789501190186 }, { "auxiliary_loss_clip": 0.01164805, "auxiliary_loss_mlp": 0.01053206, "balance_loss_clip": 1.05644679, "balance_loss_mlp": 1.02985239, "epoch": 0.14339395761310686, "flos": 15522783033600.0, "grad_norm": 2.6134761315069284, "language_loss": 0.79340684, "learning_rate": 3.866627642955895e-06, "loss": 0.81558692, "num_input_tokens_seen": 51674645, "step": 2385, "time_per_iteration": 2.5856544971466064 }, { "auxiliary_loss_clip": 0.01156607, "auxiliary_loss_mlp": 0.01042784, "balance_loss_clip": 1.05148256, "balance_loss_mlp": 1.02182722, "epoch": 0.14345408086577485, "flos": 28548767784960.0, "grad_norm": 2.6990187663653247, "language_loss": 0.74960196, "learning_rate": 3.866487766788612e-06, "loss": 0.77159584, "num_input_tokens_seen": 51695770, "step": 2386, "time_per_iteration": 2.6670751571655273 }, { "auxiliary_loss_clip": 0.01171639, "auxiliary_loss_mlp": 0.01048096, "balance_loss_clip": 1.05699563, "balance_loss_mlp": 1.02733016, "epoch": 0.14351420411844282, "flos": 20230061310720.0, "grad_norm": 2.299870083842227, "language_loss": 0.78659731, "learning_rate": 3.866347819843925e-06, "loss": 0.80879462, "num_input_tokens_seen": 51714165, "step": 2387, "time_per_iteration": 2.5805532932281494 }, { "auxiliary_loss_clip": 0.01140581, "auxiliary_loss_mlp": 0.01055299, "balance_loss_clip": 1.05355716, "balance_loss_mlp": 1.03317428, "epoch": 0.14357432737111078, "flos": 19865029345920.0, "grad_norm": 6.554164509194222, "language_loss": 0.82492924, "learning_rate": 3.866207802127143e-06, "loss": 0.84688807, "num_input_tokens_seen": 51734440, "step": 2388, "time_per_iteration": 2.656609058380127 }, { "auxiliary_loss_clip": 0.01155007, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.0537287, "balance_loss_mlp": 1.02674508, "epoch": 0.14363445062377875, "flos": 28256814040320.0, "grad_norm": 2.5973624291758655, "language_loss": 0.82025754, "learning_rate": 3.866067713643573e-06, "loss": 0.84227914, "num_input_tokens_seen": 51753730, "step": 2389, "time_per_iteration": 4.21793794631958 }, { "auxiliary_loss_clip": 0.01145665, "auxiliary_loss_mlp": 0.01046852, "balance_loss_clip": 1.05107975, "balance_loss_mlp": 1.02513266, "epoch": 0.1436945738764467, "flos": 18186672407040.0, "grad_norm": 3.7970835440683097, "language_loss": 0.83056784, "learning_rate": 3.8659275543985285e-06, "loss": 0.85249299, "num_input_tokens_seen": 51771195, "step": 2390, "time_per_iteration": 2.6859514713287354 }, { "auxiliary_loss_clip": 0.01152608, "auxiliary_loss_mlp": 0.01054404, "balance_loss_clip": 1.05400729, "balance_loss_mlp": 1.0334475, "epoch": 0.14375469712911468, "flos": 27307910499840.0, "grad_norm": 1.8176612067028404, "language_loss": 0.75018179, "learning_rate": 3.865787324397324e-06, "loss": 0.77225184, "num_input_tokens_seen": 51792290, "step": 2391, "time_per_iteration": 5.726900577545166 }, { "auxiliary_loss_clip": 0.01045505, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.03226101, "balance_loss_mlp": 1.0303973, "epoch": 0.14381482038178264, "flos": 56891445287040.0, "grad_norm": 0.8787809928903102, "language_loss": 0.61848003, "learning_rate": 3.865647023645277e-06, "loss": 0.63926852, "num_input_tokens_seen": 51843675, "step": 2392, "time_per_iteration": 3.113558053970337 }, { "auxiliary_loss_clip": 0.01158698, "auxiliary_loss_mlp": 0.01058807, "balance_loss_clip": 1.05467868, "balance_loss_mlp": 1.03608608, "epoch": 0.14387494363445064, "flos": 14282177143680.0, "grad_norm": 2.718376715006273, "language_loss": 0.77346605, "learning_rate": 3.865506652147709e-06, "loss": 0.79564106, "num_input_tokens_seen": 51860285, "step": 2393, "time_per_iteration": 2.6578521728515625 }, { "auxiliary_loss_clip": 0.0116951, "auxiliary_loss_mlp": 0.01052986, "balance_loss_clip": 1.05671048, "balance_loss_mlp": 1.03287578, "epoch": 0.1439350668871186, "flos": 26761493831040.0, "grad_norm": 5.715284956255472, "language_loss": 0.76301813, "learning_rate": 3.865366209909941e-06, "loss": 0.78524309, "num_input_tokens_seen": 51880105, "step": 2394, "time_per_iteration": 4.345217943191528 }, { "auxiliary_loss_clip": 0.01165266, "auxiliary_loss_mlp": 0.01053501, "balance_loss_clip": 1.05325842, "balance_loss_mlp": 1.03365326, "epoch": 0.14399519013978657, "flos": 40700040537600.0, "grad_norm": 2.2496244390836893, "language_loss": 0.85859704, "learning_rate": 3.8652256969372994e-06, "loss": 0.88078463, "num_input_tokens_seen": 51905175, "step": 2395, "time_per_iteration": 2.739717483520508 }, { "auxiliary_loss_clip": 0.0112523, "auxiliary_loss_mlp": 0.01051092, "balance_loss_clip": 1.04946184, "balance_loss_mlp": 1.028669, "epoch": 0.14405531339245453, "flos": 20557530627840.0, "grad_norm": 4.117082508421602, "language_loss": 0.82894099, "learning_rate": 3.865085113235113e-06, "loss": 0.85070425, "num_input_tokens_seen": 51924490, "step": 2396, "time_per_iteration": 2.686732053756714 }, { "auxiliary_loss_clip": 0.01126754, "auxiliary_loss_mlp": 0.00779833, "balance_loss_clip": 1.04752374, "balance_loss_mlp": 1.00036597, "epoch": 0.1441154366451225, "flos": 19572931946880.0, "grad_norm": 6.956399779275871, "language_loss": 0.82801461, "learning_rate": 3.864944458808712e-06, "loss": 0.84708053, "num_input_tokens_seen": 51940490, "step": 2397, "time_per_iteration": 2.742809534072876 }, { "auxiliary_loss_clip": 0.01168871, "auxiliary_loss_mlp": 0.0104994, "balance_loss_clip": 1.05485702, "balance_loss_mlp": 1.02892387, "epoch": 0.14417555989779046, "flos": 18515721922560.0, "grad_norm": 8.355198005975433, "language_loss": 0.8001197, "learning_rate": 3.86480373366343e-06, "loss": 0.82230783, "num_input_tokens_seen": 51957910, "step": 2398, "time_per_iteration": 2.573267936706543 }, { "auxiliary_loss_clip": 0.01152449, "auxiliary_loss_mlp": 0.01053407, "balance_loss_clip": 1.05287588, "balance_loss_mlp": 1.03336823, "epoch": 0.14423568315045843, "flos": 26031681296640.0, "grad_norm": 3.294581575970509, "language_loss": 0.64690518, "learning_rate": 3.864662937804603e-06, "loss": 0.66896379, "num_input_tokens_seen": 51978010, "step": 2399, "time_per_iteration": 2.6831774711608887 }, { "auxiliary_loss_clip": 0.01134916, "auxiliary_loss_mlp": 0.01052493, "balance_loss_clip": 1.04998159, "balance_loss_mlp": 1.03119016, "epoch": 0.14429580640312642, "flos": 21288743792640.0, "grad_norm": 3.586256880371596, "language_loss": 0.82207137, "learning_rate": 3.864522071237571e-06, "loss": 0.84394544, "num_input_tokens_seen": 51998515, "step": 2400, "time_per_iteration": 2.6812663078308105 }, { "auxiliary_loss_clip": 0.01149983, "auxiliary_loss_mlp": 0.01051884, "balance_loss_clip": 1.0567503, "balance_loss_mlp": 1.02954376, "epoch": 0.14435592965579438, "flos": 25627865621760.0, "grad_norm": 2.3908005596579165, "language_loss": 0.74217784, "learning_rate": 3.864381133967676e-06, "loss": 0.76419652, "num_input_tokens_seen": 52019270, "step": 2401, "time_per_iteration": 2.773838520050049 }, { "auxiliary_loss_clip": 0.01137207, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.05065656, "balance_loss_mlp": 1.02671885, "epoch": 0.14441605290846235, "flos": 22965053656320.0, "grad_norm": 2.616063077702737, "language_loss": 0.80771816, "learning_rate": 3.86424012600026e-06, "loss": 0.82956612, "num_input_tokens_seen": 52039315, "step": 2402, "time_per_iteration": 2.786031723022461 }, { "auxiliary_loss_clip": 0.01120897, "auxiliary_loss_mlp": 0.01052115, "balance_loss_clip": 1.04718328, "balance_loss_mlp": 1.02988231, "epoch": 0.14447617616113032, "flos": 17347655548800.0, "grad_norm": 2.397935571801219, "language_loss": 0.84159613, "learning_rate": 3.864099047340673e-06, "loss": 0.86332625, "num_input_tokens_seen": 52056555, "step": 2403, "time_per_iteration": 2.8113911151885986 }, { "auxiliary_loss_clip": 0.01129082, "auxiliary_loss_mlp": 0.00783127, "balance_loss_clip": 1.04854488, "balance_loss_mlp": 1.00030184, "epoch": 0.14453629941379828, "flos": 24060185464320.0, "grad_norm": 2.224282169770823, "language_loss": 0.70142806, "learning_rate": 3.863957897994262e-06, "loss": 0.72055018, "num_input_tokens_seen": 52075800, "step": 2404, "time_per_iteration": 2.7748003005981445 }, { "auxiliary_loss_clip": 0.01144289, "auxiliary_loss_mlp": 0.01051404, "balance_loss_clip": 1.05279732, "balance_loss_mlp": 1.03099549, "epoch": 0.14459642266646625, "flos": 14429554646400.0, "grad_norm": 2.429117427076043, "language_loss": 0.73179376, "learning_rate": 3.863816677966381e-06, "loss": 0.75375068, "num_input_tokens_seen": 52092585, "step": 2405, "time_per_iteration": 2.7927868366241455 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01054584, "balance_loss_clip": 1.04661417, "balance_loss_mlp": 1.0326612, "epoch": 0.14465654591913424, "flos": 9867032179200.0, "grad_norm": 7.089523066959408, "language_loss": 0.73039794, "learning_rate": 3.863675387262386e-06, "loss": 0.75202763, "num_input_tokens_seen": 52108990, "step": 2406, "time_per_iteration": 2.742253303527832 }, { "auxiliary_loss_clip": 0.01157268, "auxiliary_loss_mlp": 0.01054465, "balance_loss_clip": 1.05420268, "balance_loss_mlp": 1.03198171, "epoch": 0.1447166691718022, "flos": 24972926987520.0, "grad_norm": 5.383630788916188, "language_loss": 0.75570732, "learning_rate": 3.8635340258876325e-06, "loss": 0.77782464, "num_input_tokens_seen": 52125385, "step": 2407, "time_per_iteration": 2.654636859893799 }, { "auxiliary_loss_clip": 0.0116674, "auxiliary_loss_mlp": 0.01054642, "balance_loss_clip": 1.05440819, "balance_loss_mlp": 1.03392315, "epoch": 0.14477679242447017, "flos": 21908023200000.0, "grad_norm": 2.0240540465866146, "language_loss": 0.79426706, "learning_rate": 3.8633925938474826e-06, "loss": 0.81648088, "num_input_tokens_seen": 52144985, "step": 2408, "time_per_iteration": 2.663611650466919 }, { "auxiliary_loss_clip": 0.01155332, "auxiliary_loss_mlp": 0.01053557, "balance_loss_clip": 1.05411625, "balance_loss_mlp": 1.03107429, "epoch": 0.14483691567713813, "flos": 20740746925440.0, "grad_norm": 2.249858190268702, "language_loss": 0.82188261, "learning_rate": 3.863251091147299e-06, "loss": 0.84397143, "num_input_tokens_seen": 52163885, "step": 2409, "time_per_iteration": 2.6218342781066895 }, { "auxiliary_loss_clip": 0.01116852, "auxiliary_loss_mlp": 0.01065498, "balance_loss_clip": 1.04859877, "balance_loss_mlp": 1.04340839, "epoch": 0.1448970389298061, "flos": 35407705536000.0, "grad_norm": 3.918408886138166, "language_loss": 0.74477464, "learning_rate": 3.863109517792446e-06, "loss": 0.76659817, "num_input_tokens_seen": 52184325, "step": 2410, "time_per_iteration": 2.8525002002716064 }, { "auxiliary_loss_clip": 0.01166422, "auxiliary_loss_mlp": 0.0105028, "balance_loss_clip": 1.05447876, "balance_loss_mlp": 1.0300622, "epoch": 0.14495716218247406, "flos": 15414368808960.0, "grad_norm": 2.976325973684052, "language_loss": 0.81616414, "learning_rate": 3.8629678737882945e-06, "loss": 0.8383311, "num_input_tokens_seen": 52202740, "step": 2411, "time_per_iteration": 2.580059051513672 }, { "auxiliary_loss_clip": 0.01143671, "auxiliary_loss_mlp": 0.01055066, "balance_loss_clip": 1.05553794, "balance_loss_mlp": 1.03366852, "epoch": 0.14501728543514203, "flos": 33693222493440.0, "grad_norm": 2.049708152728223, "language_loss": 0.69947547, "learning_rate": 3.862826159140214e-06, "loss": 0.72146285, "num_input_tokens_seen": 52223100, "step": 2412, "time_per_iteration": 2.792389392852783 }, { "auxiliary_loss_clip": 0.01153861, "auxiliary_loss_mlp": 0.01047504, "balance_loss_clip": 1.05600309, "balance_loss_mlp": 1.02669024, "epoch": 0.14507740868781002, "flos": 15596112648960.0, "grad_norm": 1.9741671649406984, "language_loss": 0.76655865, "learning_rate": 3.862684373853579e-06, "loss": 0.78857231, "num_input_tokens_seen": 52239690, "step": 2413, "time_per_iteration": 2.6535370349884033 }, { "auxiliary_loss_clip": 0.01072879, "auxiliary_loss_mlp": 0.01028499, "balance_loss_clip": 1.04041791, "balance_loss_mlp": 1.0252564, "epoch": 0.145137531940478, "flos": 66675343438080.0, "grad_norm": 0.9047547971056389, "language_loss": 0.58883119, "learning_rate": 3.8625425179337656e-06, "loss": 0.60984492, "num_input_tokens_seen": 52296705, "step": 2414, "time_per_iteration": 3.1230342388153076 }, { "auxiliary_loss_clip": 0.01059489, "auxiliary_loss_mlp": 0.01009718, "balance_loss_clip": 1.03874373, "balance_loss_mlp": 1.00692892, "epoch": 0.14519765519314595, "flos": 67521578929920.0, "grad_norm": 0.8422279258983576, "language_loss": 0.62171185, "learning_rate": 3.862400591386154e-06, "loss": 0.64240396, "num_input_tokens_seen": 52361830, "step": 2415, "time_per_iteration": 3.1932270526885986 }, { "auxiliary_loss_clip": 0.01151643, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.05383611, "balance_loss_mlp": 1.02500319, "epoch": 0.14525777844581392, "flos": 17198913329280.0, "grad_norm": 2.2913061581681036, "language_loss": 0.71468806, "learning_rate": 3.8622585942161245e-06, "loss": 0.73667121, "num_input_tokens_seen": 52379420, "step": 2416, "time_per_iteration": 2.5892374515533447 }, { "auxiliary_loss_clip": 0.01050816, "auxiliary_loss_mlp": 0.010049, "balance_loss_clip": 1.03675056, "balance_loss_mlp": 1.00211036, "epoch": 0.14531790169848188, "flos": 65404609015680.0, "grad_norm": 0.7147623603004897, "language_loss": 0.6037569, "learning_rate": 3.8621165264290635e-06, "loss": 0.62431407, "num_input_tokens_seen": 52446290, "step": 2417, "time_per_iteration": 3.3065359592437744 }, { "auxiliary_loss_clip": 0.01168766, "auxiliary_loss_mlp": 0.01053548, "balance_loss_clip": 1.05357766, "balance_loss_mlp": 1.03275824, "epoch": 0.14537802495114985, "flos": 32562467372160.0, "grad_norm": 3.7032433533234346, "language_loss": 0.78014368, "learning_rate": 3.861974388030356e-06, "loss": 0.80236679, "num_input_tokens_seen": 52467295, "step": 2418, "time_per_iteration": 2.887986183166504 }, { "auxiliary_loss_clip": 0.01114137, "auxiliary_loss_mlp": 0.01049779, "balance_loss_clip": 1.04354823, "balance_loss_mlp": 1.02911985, "epoch": 0.1454381482038178, "flos": 20226685432320.0, "grad_norm": 2.096300480609688, "language_loss": 0.71208847, "learning_rate": 3.861832179025394e-06, "loss": 0.73372757, "num_input_tokens_seen": 52487295, "step": 2419, "time_per_iteration": 2.764268636703491 }, { "auxiliary_loss_clip": 0.01142427, "auxiliary_loss_mlp": 0.01054976, "balance_loss_clip": 1.05351484, "balance_loss_mlp": 1.03300607, "epoch": 0.1454982714564858, "flos": 22893124671360.0, "grad_norm": 2.414673655978061, "language_loss": 0.89847761, "learning_rate": 3.861689899419569e-06, "loss": 0.92045164, "num_input_tokens_seen": 52504220, "step": 2420, "time_per_iteration": 2.7500016689300537 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01060929, "balance_loss_clip": 1.05202007, "balance_loss_mlp": 1.04072309, "epoch": 0.14555839470915377, "flos": 20229845829120.0, "grad_norm": 2.0953123539002383, "language_loss": 0.82278717, "learning_rate": 3.861547549218276e-06, "loss": 0.8449465, "num_input_tokens_seen": 52521900, "step": 2421, "time_per_iteration": 2.672722816467285 }, { "auxiliary_loss_clip": 0.01099277, "auxiliary_loss_mlp": 0.01056793, "balance_loss_clip": 1.04282439, "balance_loss_mlp": 1.03507352, "epoch": 0.14561851796182174, "flos": 22236282616320.0, "grad_norm": 1.667429152986229, "language_loss": 0.81741488, "learning_rate": 3.861405128426914e-06, "loss": 0.83897555, "num_input_tokens_seen": 52540495, "step": 2422, "time_per_iteration": 2.739992141723633 }, { "auxiliary_loss_clip": 0.01031842, "auxiliary_loss_mlp": 0.00760413, "balance_loss_clip": 1.0271318, "balance_loss_mlp": 1.00019872, "epoch": 0.1456786412144897, "flos": 52636786289280.0, "grad_norm": 0.9102961670465963, "language_loss": 0.63342595, "learning_rate": 3.861262637050883e-06, "loss": 0.65134847, "num_input_tokens_seen": 52603305, "step": 2423, "time_per_iteration": 3.2704036235809326 }, { "auxiliary_loss_clip": 0.01112855, "auxiliary_loss_mlp": 0.00780065, "balance_loss_clip": 1.05457556, "balance_loss_mlp": 1.00038898, "epoch": 0.14573876446715767, "flos": 23221671396480.0, "grad_norm": 2.2239460229896206, "language_loss": 0.82163274, "learning_rate": 3.861120075095585e-06, "loss": 0.84056193, "num_input_tokens_seen": 52623435, "step": 2424, "time_per_iteration": 2.7993249893188477 }, { "auxiliary_loss_clip": 0.01141208, "auxiliary_loss_mlp": 0.01069468, "balance_loss_clip": 1.0535512, "balance_loss_mlp": 1.0496788, "epoch": 0.14579888771982563, "flos": 18114384286080.0, "grad_norm": 2.769336045727131, "language_loss": 0.78602695, "learning_rate": 3.860977442566429e-06, "loss": 0.80813372, "num_input_tokens_seen": 52642255, "step": 2425, "time_per_iteration": 2.698594093322754 }, { "auxiliary_loss_clip": 0.01156078, "auxiliary_loss_mlp": 0.01062133, "balance_loss_clip": 1.05603778, "balance_loss_mlp": 1.04148602, "epoch": 0.14585901097249362, "flos": 23001107932800.0, "grad_norm": 50.77412231982301, "language_loss": 0.83184898, "learning_rate": 3.860834739468821e-06, "loss": 0.85403109, "num_input_tokens_seen": 52658700, "step": 2426, "time_per_iteration": 2.6948676109313965 }, { "auxiliary_loss_clip": 0.01166642, "auxiliary_loss_mlp": 0.01060596, "balance_loss_clip": 1.05706, "balance_loss_mlp": 1.04040194, "epoch": 0.1459191342251616, "flos": 21908669644800.0, "grad_norm": 3.7420612082917475, "language_loss": 0.87215799, "learning_rate": 3.860691965808173e-06, "loss": 0.8944304, "num_input_tokens_seen": 52678140, "step": 2427, "time_per_iteration": 2.6479666233062744 }, { "auxiliary_loss_clip": 0.01128634, "auxiliary_loss_mlp": 0.01064346, "balance_loss_clip": 1.04835391, "balance_loss_mlp": 1.0405997, "epoch": 0.14597925747782955, "flos": 14975504438400.0, "grad_norm": 1.9221483903926033, "language_loss": 0.66815829, "learning_rate": 3.8605491215899e-06, "loss": 0.69008809, "num_input_tokens_seen": 52696825, "step": 2428, "time_per_iteration": 2.6971306800842285 }, { "auxiliary_loss_clip": 0.01155557, "auxiliary_loss_mlp": 0.01059343, "balance_loss_clip": 1.05335426, "balance_loss_mlp": 1.03842235, "epoch": 0.14603938073049752, "flos": 21068898600960.0, "grad_norm": 2.0918238083564242, "language_loss": 0.83231717, "learning_rate": 3.860406206819417e-06, "loss": 0.8544662, "num_input_tokens_seen": 52715125, "step": 2429, "time_per_iteration": 4.283279895782471 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.01053505, "balance_loss_clip": 1.04625869, "balance_loss_mlp": 1.03446746, "epoch": 0.14609950398316549, "flos": 19864777950720.0, "grad_norm": 2.4559042296603746, "language_loss": 0.79087842, "learning_rate": 3.860263221502145e-06, "loss": 0.81262159, "num_input_tokens_seen": 52734015, "step": 2430, "time_per_iteration": 4.197890758514404 }, { "auxiliary_loss_clip": 0.01170782, "auxiliary_loss_mlp": 0.01061965, "balance_loss_clip": 1.05820751, "balance_loss_mlp": 1.04179525, "epoch": 0.14615962723583345, "flos": 22418852469120.0, "grad_norm": 2.4376691278662506, "language_loss": 0.82910693, "learning_rate": 3.860120165643504e-06, "loss": 0.85143435, "num_input_tokens_seen": 52753025, "step": 2431, "time_per_iteration": 4.162708282470703 }, { "auxiliary_loss_clip": 0.011607, "auxiliary_loss_mlp": 0.01060112, "balance_loss_clip": 1.05553937, "balance_loss_mlp": 1.03853524, "epoch": 0.14621975048850142, "flos": 22346241125760.0, "grad_norm": 2.881661839068268, "language_loss": 0.78330141, "learning_rate": 3.859977039248921e-06, "loss": 0.80550951, "num_input_tokens_seen": 52773420, "step": 2432, "time_per_iteration": 2.6907777786254883 }, { "auxiliary_loss_clip": 0.01165399, "auxiliary_loss_mlp": 0.00782861, "balance_loss_clip": 1.05517077, "balance_loss_mlp": 1.00040507, "epoch": 0.1462798737411694, "flos": 24389163152640.0, "grad_norm": 2.3488382544651887, "language_loss": 0.79515982, "learning_rate": 3.859833842323822e-06, "loss": 0.81464243, "num_input_tokens_seen": 52792870, "step": 2433, "time_per_iteration": 2.719841241836548 }, { "auxiliary_loss_clip": 0.01124303, "auxiliary_loss_mlp": 0.01055776, "balance_loss_clip": 1.05385411, "balance_loss_mlp": 1.03484273, "epoch": 0.14633999699383737, "flos": 19244672530560.0, "grad_norm": 2.0782880949269926, "language_loss": 0.77905983, "learning_rate": 3.859690574873638e-06, "loss": 0.80086064, "num_input_tokens_seen": 52811615, "step": 2434, "time_per_iteration": 4.371506929397583 }, { "auxiliary_loss_clip": 0.01066282, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.05327988, "balance_loss_mlp": 1.03022039, "epoch": 0.14640012024650534, "flos": 62660638270080.0, "grad_norm": 0.8566726319617045, "language_loss": 0.58453119, "learning_rate": 3.8595472369038e-06, "loss": 0.60552537, "num_input_tokens_seen": 52873230, "step": 2435, "time_per_iteration": 3.229882001876831 }, { "auxiliary_loss_clip": 0.01160087, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05263698, "balance_loss_mlp": 1.0257076, "epoch": 0.1464602434991733, "flos": 12276243146880.0, "grad_norm": 3.775553645712452, "language_loss": 0.88436592, "learning_rate": 3.859403828419744e-06, "loss": 0.90641725, "num_input_tokens_seen": 52889325, "step": 2436, "time_per_iteration": 2.568624973297119 }, { "auxiliary_loss_clip": 0.011561, "auxiliary_loss_mlp": 0.00780257, "balance_loss_clip": 1.05587268, "balance_loss_mlp": 1.00041819, "epoch": 0.14652036675184127, "flos": 20922311197440.0, "grad_norm": 2.028718201913856, "language_loss": 0.74904168, "learning_rate": 3.85926034942691e-06, "loss": 0.7684052, "num_input_tokens_seen": 52909705, "step": 2437, "time_per_iteration": 2.6361188888549805 }, { "auxiliary_loss_clip": 0.01165187, "auxiliary_loss_mlp": 0.01050068, "balance_loss_clip": 1.05295086, "balance_loss_mlp": 1.02729869, "epoch": 0.14658049000450923, "flos": 27703681528320.0, "grad_norm": 3.0822234004311033, "language_loss": 0.73914421, "learning_rate": 3.859116799930736e-06, "loss": 0.76129669, "num_input_tokens_seen": 52930300, "step": 2438, "time_per_iteration": 2.7590928077697754 }, { "auxiliary_loss_clip": 0.01154571, "auxiliary_loss_mlp": 0.01046509, "balance_loss_clip": 1.05747688, "balance_loss_mlp": 1.02708936, "epoch": 0.14664061325717723, "flos": 24936513575040.0, "grad_norm": 4.476318678757457, "language_loss": 0.74410725, "learning_rate": 3.858973179936668e-06, "loss": 0.76611805, "num_input_tokens_seen": 52949955, "step": 2439, "time_per_iteration": 2.627037763595581 }, { "auxiliary_loss_clip": 0.01152452, "auxiliary_loss_mlp": 0.01051294, "balance_loss_clip": 1.05477583, "balance_loss_mlp": 1.0309453, "epoch": 0.1467007365098452, "flos": 40297661406720.0, "grad_norm": 2.1583973700525343, "language_loss": 0.74123728, "learning_rate": 3.85882948945015e-06, "loss": 0.76327467, "num_input_tokens_seen": 52972905, "step": 2440, "time_per_iteration": 2.79715633392334 }, { "auxiliary_loss_clip": 0.01160843, "auxiliary_loss_mlp": 0.01044034, "balance_loss_clip": 1.05471611, "balance_loss_mlp": 1.02493691, "epoch": 0.14676085976251316, "flos": 26541074021760.0, "grad_norm": 1.9756103236146798, "language_loss": 0.82730794, "learning_rate": 3.85868572847663e-06, "loss": 0.84935671, "num_input_tokens_seen": 52994850, "step": 2441, "time_per_iteration": 2.6505653858184814 }, { "auxiliary_loss_clip": 0.01152605, "auxiliary_loss_mlp": 0.01049175, "balance_loss_clip": 1.05408478, "balance_loss_mlp": 1.02796757, "epoch": 0.14682098301518112, "flos": 23550110380800.0, "grad_norm": 2.582118236216862, "language_loss": 0.71455544, "learning_rate": 3.858541897021563e-06, "loss": 0.73657322, "num_input_tokens_seen": 53014740, "step": 2442, "time_per_iteration": 2.772648572921753 }, { "auxiliary_loss_clip": 0.0113053, "auxiliary_loss_mlp": 0.0104246, "balance_loss_clip": 1.05283213, "balance_loss_mlp": 1.02224207, "epoch": 0.1468811062678491, "flos": 11651073909120.0, "grad_norm": 3.6780587187273155, "language_loss": 0.81992352, "learning_rate": 3.8583979950904e-06, "loss": 0.84165335, "num_input_tokens_seen": 53029780, "step": 2443, "time_per_iteration": 2.6979780197143555 }, { "auxiliary_loss_clip": 0.01147138, "auxiliary_loss_mlp": 0.0105693, "balance_loss_clip": 1.05402422, "balance_loss_mlp": 1.03474557, "epoch": 0.14694122952051705, "flos": 23002616304000.0, "grad_norm": 3.190851099873364, "language_loss": 0.83093917, "learning_rate": 3.858254022688599e-06, "loss": 0.85297978, "num_input_tokens_seen": 53048620, "step": 2444, "time_per_iteration": 2.7177255153656006 }, { "auxiliary_loss_clip": 0.01134628, "auxiliary_loss_mlp": 0.01051986, "balance_loss_clip": 1.05385137, "balance_loss_mlp": 1.03213811, "epoch": 0.14700135277318502, "flos": 26502972670080.0, "grad_norm": 3.1425569240832414, "language_loss": 0.71183646, "learning_rate": 3.85810997982162e-06, "loss": 0.7337026, "num_input_tokens_seen": 53070055, "step": 2445, "time_per_iteration": 2.735361099243164 }, { "auxiliary_loss_clip": 0.01095177, "auxiliary_loss_mlp": 0.01023118, "balance_loss_clip": 1.05335557, "balance_loss_mlp": 1.01999438, "epoch": 0.147061476025853, "flos": 59449434387840.0, "grad_norm": 0.824990401786658, "language_loss": 0.63083708, "learning_rate": 3.857965866494923e-06, "loss": 0.65202004, "num_input_tokens_seen": 53126945, "step": 2446, "time_per_iteration": 3.0853025913238525 }, { "auxiliary_loss_clip": 0.01120664, "auxiliary_loss_mlp": 0.01045249, "balance_loss_clip": 1.05621576, "balance_loss_mlp": 1.02491164, "epoch": 0.14712159927852098, "flos": 28330897841280.0, "grad_norm": 2.813052009295296, "language_loss": 0.74895924, "learning_rate": 3.857821682713975e-06, "loss": 0.77061838, "num_input_tokens_seen": 53149130, "step": 2447, "time_per_iteration": 2.858643054962158 }, { "auxiliary_loss_clip": 0.01168929, "auxiliary_loss_mlp": 0.01042907, "balance_loss_clip": 1.0604012, "balance_loss_mlp": 1.02383327, "epoch": 0.14718172253118894, "flos": 27089825074560.0, "grad_norm": 2.2427639286159367, "language_loss": 0.8528471, "learning_rate": 3.857677428484242e-06, "loss": 0.87496543, "num_input_tokens_seen": 53167120, "step": 2448, "time_per_iteration": 2.699781894683838 }, { "auxiliary_loss_clip": 0.01092169, "auxiliary_loss_mlp": 0.01019616, "balance_loss_clip": 1.05051064, "balance_loss_mlp": 1.01654005, "epoch": 0.1472418457838569, "flos": 66706764860160.0, "grad_norm": 0.7683837313264128, "language_loss": 0.56829578, "learning_rate": 3.857533103811195e-06, "loss": 0.58941364, "num_input_tokens_seen": 53227945, "step": 2449, "time_per_iteration": 3.1478211879730225 }, { "auxiliary_loss_clip": 0.01135016, "auxiliary_loss_mlp": 0.01050801, "balance_loss_clip": 1.05464292, "balance_loss_mlp": 1.03023791, "epoch": 0.14730196903652487, "flos": 19573578391680.0, "grad_norm": 1.9048653074507311, "language_loss": 0.85067344, "learning_rate": 3.857388708700307e-06, "loss": 0.87253165, "num_input_tokens_seen": 53244615, "step": 2450, "time_per_iteration": 2.726008653640747 }, { "auxiliary_loss_clip": 0.01158708, "auxiliary_loss_mlp": 0.01049735, "balance_loss_clip": 1.05984712, "balance_loss_mlp": 1.02994645, "epoch": 0.14736209228919284, "flos": 16071031296000.0, "grad_norm": 2.306043539040143, "language_loss": 0.74523091, "learning_rate": 3.857244243157052e-06, "loss": 0.76731533, "num_input_tokens_seen": 53262205, "step": 2451, "time_per_iteration": 2.641082286834717 }, { "auxiliary_loss_clip": 0.01133915, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.05399728, "balance_loss_mlp": 1.02031422, "epoch": 0.1474222155418608, "flos": 23039460679680.0, "grad_norm": 1.8026547738986978, "language_loss": 0.82384264, "learning_rate": 3.85709970718691e-06, "loss": 0.84556639, "num_input_tokens_seen": 53282445, "step": 2452, "time_per_iteration": 2.7810096740722656 }, { "auxiliary_loss_clip": 0.01101553, "auxiliary_loss_mlp": 0.01041864, "balance_loss_clip": 1.05924153, "balance_loss_mlp": 1.0238874, "epoch": 0.1474823387945288, "flos": 17018641946880.0, "grad_norm": 1.6675065143572472, "language_loss": 0.74075705, "learning_rate": 3.856955100795361e-06, "loss": 0.76219124, "num_input_tokens_seen": 53299060, "step": 2453, "time_per_iteration": 2.7913167476654053 }, { "auxiliary_loss_clip": 0.01141798, "auxiliary_loss_mlp": 0.0104607, "balance_loss_clip": 1.05557632, "balance_loss_mlp": 1.026353, "epoch": 0.14754246204719676, "flos": 17895041884800.0, "grad_norm": 1.9958141581621542, "language_loss": 0.7558704, "learning_rate": 3.856810423987889e-06, "loss": 0.77774906, "num_input_tokens_seen": 53315970, "step": 2454, "time_per_iteration": 2.7199089527130127 }, { "auxiliary_loss_clip": 0.01147348, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.05733335, "balance_loss_mlp": 1.01864362, "epoch": 0.14760258529986472, "flos": 13079097987840.0, "grad_norm": 2.0858167958418674, "language_loss": 0.83077228, "learning_rate": 3.856665676769979e-06, "loss": 0.85262716, "num_input_tokens_seen": 53332940, "step": 2455, "time_per_iteration": 2.75616192817688 }, { "auxiliary_loss_clip": 0.01130504, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.05704689, "balance_loss_mlp": 1.02452159, "epoch": 0.1476627085525327, "flos": 30806399358720.0, "grad_norm": 2.3702229998953976, "language_loss": 0.83881497, "learning_rate": 3.85652085914712e-06, "loss": 0.86054951, "num_input_tokens_seen": 53353295, "step": 2456, "time_per_iteration": 2.7914254665374756 }, { "auxiliary_loss_clip": 0.01154014, "auxiliary_loss_mlp": 0.01043715, "balance_loss_clip": 1.05863023, "balance_loss_mlp": 1.02514231, "epoch": 0.14772283180520066, "flos": 21689434984320.0, "grad_norm": 2.4172359629848996, "language_loss": 0.84154665, "learning_rate": 3.856375971124805e-06, "loss": 0.86352402, "num_input_tokens_seen": 53373410, "step": 2457, "time_per_iteration": 2.688265323638916 }, { "auxiliary_loss_clip": 0.01155788, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.06250155, "balance_loss_mlp": 1.02529585, "epoch": 0.14778295505786862, "flos": 18770400328320.0, "grad_norm": 6.310680797376285, "language_loss": 0.75692672, "learning_rate": 3.856231012708527e-06, "loss": 0.77891362, "num_input_tokens_seen": 53391430, "step": 2458, "time_per_iteration": 2.698697805404663 }, { "auxiliary_loss_clip": 0.01117404, "auxiliary_loss_mlp": 0.01047753, "balance_loss_clip": 1.05451179, "balance_loss_mlp": 1.02718902, "epoch": 0.1478430783105366, "flos": 22893555634560.0, "grad_norm": 3.1268711361266393, "language_loss": 0.83348328, "learning_rate": 3.856085983903782e-06, "loss": 0.85513484, "num_input_tokens_seen": 53409960, "step": 2459, "time_per_iteration": 2.790552854537964 }, { "auxiliary_loss_clip": 0.01126767, "auxiliary_loss_mlp": 0.01042293, "balance_loss_clip": 1.05070424, "balance_loss_mlp": 1.02435231, "epoch": 0.14790320156320458, "flos": 15085319293440.0, "grad_norm": 3.1203941208753534, "language_loss": 0.7554391, "learning_rate": 3.855940884716071e-06, "loss": 0.77712965, "num_input_tokens_seen": 53426160, "step": 2460, "time_per_iteration": 2.815455675125122 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.01056838, "balance_loss_clip": 1.05845904, "balance_loss_mlp": 1.03770471, "epoch": 0.14796332481587254, "flos": 26504768350080.0, "grad_norm": 3.59241393994, "language_loss": 0.81227219, "learning_rate": 3.855795715150896e-06, "loss": 0.83418173, "num_input_tokens_seen": 53448530, "step": 2461, "time_per_iteration": 2.785569190979004 }, { "auxiliary_loss_clip": 0.01156748, "auxiliary_loss_mlp": 0.01051178, "balance_loss_clip": 1.05812359, "balance_loss_mlp": 1.03044713, "epoch": 0.1480234480685405, "flos": 17563191108480.0, "grad_norm": 3.2910626990147183, "language_loss": 0.66117477, "learning_rate": 3.855650475213761e-06, "loss": 0.683254, "num_input_tokens_seen": 53465915, "step": 2462, "time_per_iteration": 2.7222983837127686 }, { "auxiliary_loss_clip": 0.01136035, "auxiliary_loss_mlp": 0.01049537, "balance_loss_clip": 1.05622339, "balance_loss_mlp": 1.02965331, "epoch": 0.14808357132120847, "flos": 53582203232640.0, "grad_norm": 1.8120706772856114, "language_loss": 0.67226064, "learning_rate": 3.8555051649101745e-06, "loss": 0.69411635, "num_input_tokens_seen": 53496055, "step": 2463, "time_per_iteration": 3.0344398021698 }, { "auxiliary_loss_clip": 0.01153077, "auxiliary_loss_mlp": 0.01050435, "balance_loss_clip": 1.05550933, "balance_loss_mlp": 1.0307889, "epoch": 0.14814369457387644, "flos": 19829190551040.0, "grad_norm": 1.9881580745750587, "language_loss": 0.76870739, "learning_rate": 3.855359784245646e-06, "loss": 0.79074258, "num_input_tokens_seen": 53513790, "step": 2464, "time_per_iteration": 2.69480037689209 }, { "auxiliary_loss_clip": 0.01133748, "auxiliary_loss_mlp": 0.01057139, "balance_loss_clip": 1.05392432, "balance_loss_mlp": 1.03769565, "epoch": 0.1482038178265444, "flos": 23914962777600.0, "grad_norm": 1.8401367705559406, "language_loss": 0.79628456, "learning_rate": 3.855214333225688e-06, "loss": 0.81819344, "num_input_tokens_seen": 53533410, "step": 2465, "time_per_iteration": 2.6989939212799072 }, { "auxiliary_loss_clip": 0.01170385, "auxiliary_loss_mlp": 0.01054925, "balance_loss_clip": 1.06119514, "balance_loss_mlp": 1.03568494, "epoch": 0.1482639410792124, "flos": 24170503109760.0, "grad_norm": 2.005541134809237, "language_loss": 0.76272273, "learning_rate": 3.855068811855817e-06, "loss": 0.78497583, "num_input_tokens_seen": 53554775, "step": 2466, "time_per_iteration": 2.646245002746582 }, { "auxiliary_loss_clip": 0.01018939, "auxiliary_loss_mlp": 0.0114331, "balance_loss_clip": 1.03313899, "balance_loss_mlp": 1.14004362, "epoch": 0.14832406433188036, "flos": 66191051341440.0, "grad_norm": 0.8320983618395327, "language_loss": 0.6004858, "learning_rate": 3.854923220141551e-06, "loss": 0.62210834, "num_input_tokens_seen": 53609675, "step": 2467, "time_per_iteration": 3.33776593208313 }, { "auxiliary_loss_clip": 0.01141854, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.05437851, "balance_loss_mlp": 1.02509522, "epoch": 0.14838418758454833, "flos": 25411252654080.0, "grad_norm": 2.92694776694492, "language_loss": 0.87666196, "learning_rate": 3.85477755808841e-06, "loss": 0.89852077, "num_input_tokens_seen": 53626950, "step": 2468, "time_per_iteration": 4.266207456588745 }, { "auxiliary_loss_clip": 0.01130189, "auxiliary_loss_mlp": 0.01048186, "balance_loss_clip": 1.05255163, "balance_loss_mlp": 1.02782488, "epoch": 0.1484443108372163, "flos": 23289901280640.0, "grad_norm": 2.2284173124426223, "language_loss": 0.7598694, "learning_rate": 3.854631825701919e-06, "loss": 0.78165317, "num_input_tokens_seen": 53644200, "step": 2469, "time_per_iteration": 4.217481851577759 }, { "auxiliary_loss_clip": 0.01126269, "auxiliary_loss_mlp": 0.0104139, "balance_loss_clip": 1.05208421, "balance_loss_mlp": 1.02251911, "epoch": 0.14850443408988426, "flos": 14647675985280.0, "grad_norm": 6.591244267451795, "language_loss": 0.75895017, "learning_rate": 3.854486022987603e-06, "loss": 0.78062677, "num_input_tokens_seen": 53659650, "step": 2470, "time_per_iteration": 2.7157187461853027 }, { "auxiliary_loss_clip": 0.01161157, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.05831027, "balance_loss_mlp": 1.02571499, "epoch": 0.14856455734255222, "flos": 23548314700800.0, "grad_norm": 1.8610043660805562, "language_loss": 0.7215873, "learning_rate": 3.8543401499509905e-06, "loss": 0.74364614, "num_input_tokens_seen": 53680275, "step": 2471, "time_per_iteration": 4.162387132644653 }, { "auxiliary_loss_clip": 0.01135244, "auxiliary_loss_mlp": 0.01047611, "balance_loss_clip": 1.05438995, "balance_loss_mlp": 1.02717888, "epoch": 0.1486246805952202, "flos": 18077288515200.0, "grad_norm": 1.979025280241548, "language_loss": 0.89558828, "learning_rate": 3.854194206597615e-06, "loss": 0.91741687, "num_input_tokens_seen": 53698270, "step": 2472, "time_per_iteration": 2.739457607269287 }, { "auxiliary_loss_clip": 0.01134625, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.06334805, "balance_loss_mlp": 1.02964163, "epoch": 0.14868480384788818, "flos": 19353625459200.0, "grad_norm": 2.6029609251362764, "language_loss": 0.80801564, "learning_rate": 3.854048192933008e-06, "loss": 0.82985294, "num_input_tokens_seen": 53716845, "step": 2473, "time_per_iteration": 4.412883758544922 }, { "auxiliary_loss_clip": 0.01161034, "auxiliary_loss_mlp": 0.01051306, "balance_loss_clip": 1.0626657, "balance_loss_mlp": 1.03267312, "epoch": 0.14874492710055615, "flos": 22200192426240.0, "grad_norm": 3.426519274325147, "language_loss": 0.77372944, "learning_rate": 3.853902108962709e-06, "loss": 0.79585278, "num_input_tokens_seen": 53734970, "step": 2474, "time_per_iteration": 2.6879520416259766 }, { "auxiliary_loss_clip": 0.01124216, "auxiliary_loss_mlp": 0.01059785, "balance_loss_clip": 1.05597806, "balance_loss_mlp": 1.04041362, "epoch": 0.1488050503532241, "flos": 21103444506240.0, "grad_norm": 2.4771626433268734, "language_loss": 0.82151824, "learning_rate": 3.853755954692255e-06, "loss": 0.84335828, "num_input_tokens_seen": 53753415, "step": 2475, "time_per_iteration": 2.7828469276428223 }, { "auxiliary_loss_clip": 0.01115855, "auxiliary_loss_mlp": 0.01052322, "balance_loss_clip": 1.0614953, "balance_loss_mlp": 1.03341544, "epoch": 0.14886517360589208, "flos": 12786569625600.0, "grad_norm": 1.9349243252831771, "language_loss": 0.80917645, "learning_rate": 3.85360973012719e-06, "loss": 0.83085823, "num_input_tokens_seen": 53770305, "step": 2476, "time_per_iteration": 2.7227590084075928 }, { "auxiliary_loss_clip": 0.01156019, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.06338036, "balance_loss_mlp": 1.03216898, "epoch": 0.14892529685856004, "flos": 29022860419200.0, "grad_norm": 2.0032169897498346, "language_loss": 0.77659523, "learning_rate": 3.853463435273058e-06, "loss": 0.79865897, "num_input_tokens_seen": 53788895, "step": 2477, "time_per_iteration": 2.740241765975952 }, { "auxiliary_loss_clip": 0.0110234, "auxiliary_loss_mlp": 0.01092005, "balance_loss_clip": 1.07879949, "balance_loss_mlp": 1.08730817, "epoch": 0.148985420111228, "flos": 61926121054080.0, "grad_norm": 0.8188153224748298, "language_loss": 0.60153681, "learning_rate": 3.853317070135407e-06, "loss": 0.62348026, "num_input_tokens_seen": 53850260, "step": 2478, "time_per_iteration": 3.2467947006225586 }, { "auxiliary_loss_clip": 0.01107417, "auxiliary_loss_mlp": 0.01048452, "balance_loss_clip": 1.0516423, "balance_loss_mlp": 1.03041577, "epoch": 0.149045543363896, "flos": 23915106432000.0, "grad_norm": 2.666109649137694, "language_loss": 0.7139731, "learning_rate": 3.853170634719787e-06, "loss": 0.73553181, "num_input_tokens_seen": 53867520, "step": 2479, "time_per_iteration": 2.7973475456237793 }, { "auxiliary_loss_clip": 0.01140551, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.05563831, "balance_loss_mlp": 1.02407789, "epoch": 0.14910566661656396, "flos": 23654394541440.0, "grad_norm": 1.7687137634424535, "language_loss": 0.80758464, "learning_rate": 3.853024129031751e-06, "loss": 0.82942122, "num_input_tokens_seen": 53886620, "step": 2480, "time_per_iteration": 2.7238829135894775 }, { "auxiliary_loss_clip": 0.01138106, "auxiliary_loss_mlp": 0.0104537, "balance_loss_clip": 1.0584991, "balance_loss_mlp": 1.02627277, "epoch": 0.14916578986923193, "flos": 20515299212160.0, "grad_norm": 4.65741826395702, "language_loss": 0.84375542, "learning_rate": 3.852877553076854e-06, "loss": 0.86559021, "num_input_tokens_seen": 53902230, "step": 2481, "time_per_iteration": 2.791550874710083 }, { "auxiliary_loss_clip": 0.01149484, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.05772805, "balance_loss_mlp": 1.02948999, "epoch": 0.1492259131218999, "flos": 22491822948480.0, "grad_norm": 8.035113387353048, "language_loss": 0.77703977, "learning_rate": 3.8527309068606546e-06, "loss": 0.79903734, "num_input_tokens_seen": 53919475, "step": 2482, "time_per_iteration": 2.7310593128204346 }, { "auxiliary_loss_clip": 0.01133163, "auxiliary_loss_mlp": 0.01040426, "balance_loss_clip": 1.05452228, "balance_loss_mlp": 1.02032781, "epoch": 0.14928603637456786, "flos": 23185868515200.0, "grad_norm": 2.207731010812049, "language_loss": 0.78967929, "learning_rate": 3.852584190388713e-06, "loss": 0.81141514, "num_input_tokens_seen": 53939150, "step": 2483, "time_per_iteration": 2.749671220779419 }, { "auxiliary_loss_clip": 0.01154122, "auxiliary_loss_mlp": 0.00776708, "balance_loss_clip": 1.06144214, "balance_loss_mlp": 1.00029397, "epoch": 0.14934615962723582, "flos": 21653237053440.0, "grad_norm": 2.020127706544282, "language_loss": 0.70361555, "learning_rate": 3.852437403666595e-06, "loss": 0.72292387, "num_input_tokens_seen": 53958735, "step": 2484, "time_per_iteration": 2.737781524658203 }, { "auxiliary_loss_clip": 0.01141919, "auxiliary_loss_mlp": 0.00778215, "balance_loss_clip": 1.05718136, "balance_loss_mlp": 1.00030363, "epoch": 0.1494062828799038, "flos": 27010066924800.0, "grad_norm": 2.165877689982274, "language_loss": 0.84666765, "learning_rate": 3.852290546699863e-06, "loss": 0.86586899, "num_input_tokens_seen": 53975065, "step": 2485, "time_per_iteration": 2.697976589202881 }, { "auxiliary_loss_clip": 0.01145272, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.05639958, "balance_loss_mlp": 1.02257001, "epoch": 0.14946640613257178, "flos": 21214947300480.0, "grad_norm": 2.5229241908443023, "language_loss": 0.8476423, "learning_rate": 3.8521436194940894e-06, "loss": 0.86951739, "num_input_tokens_seen": 53993330, "step": 2486, "time_per_iteration": 2.6799628734588623 }, { "auxiliary_loss_clip": 0.01149031, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.05667424, "balance_loss_mlp": 1.0230875, "epoch": 0.14952652938523975, "flos": 13370872164480.0, "grad_norm": 2.1822908802725203, "language_loss": 0.74762607, "learning_rate": 3.851996622054842e-06, "loss": 0.76950949, "num_input_tokens_seen": 54010515, "step": 2487, "time_per_iteration": 2.8037290573120117 }, { "auxiliary_loss_clip": 0.01153097, "auxiliary_loss_mlp": 0.01044274, "balance_loss_clip": 1.05934322, "balance_loss_mlp": 1.02611899, "epoch": 0.1495866526379077, "flos": 35517699959040.0, "grad_norm": 16.320028017118723, "language_loss": 0.72210175, "learning_rate": 3.8518495543877e-06, "loss": 0.74407548, "num_input_tokens_seen": 54031315, "step": 2488, "time_per_iteration": 2.8031094074249268 }, { "auxiliary_loss_clip": 0.01137536, "auxiliary_loss_mlp": 0.01054916, "balance_loss_clip": 1.05569518, "balance_loss_mlp": 1.03636682, "epoch": 0.14964677589057568, "flos": 17632749795840.0, "grad_norm": 3.2458980886023143, "language_loss": 0.71352434, "learning_rate": 3.851702416498235e-06, "loss": 0.73544884, "num_input_tokens_seen": 54045965, "step": 2489, "time_per_iteration": 2.648883819580078 }, { "auxiliary_loss_clip": 0.0113767, "auxiliary_loss_mlp": 0.01052603, "balance_loss_clip": 1.05376494, "balance_loss_mlp": 1.03357768, "epoch": 0.14970689914324364, "flos": 20185280029440.0, "grad_norm": 3.893198448080141, "language_loss": 0.81559736, "learning_rate": 3.8515552083920295e-06, "loss": 0.8375001, "num_input_tokens_seen": 54059960, "step": 2490, "time_per_iteration": 2.702808380126953 }, { "auxiliary_loss_clip": 0.01125097, "auxiliary_loss_mlp": 0.01055928, "balance_loss_clip": 1.05606139, "balance_loss_mlp": 1.03803492, "epoch": 0.1497670223959116, "flos": 37228699382400.0, "grad_norm": 1.9071281232744548, "language_loss": 0.80057055, "learning_rate": 3.851407930074666e-06, "loss": 0.82238084, "num_input_tokens_seen": 54079330, "step": 2491, "time_per_iteration": 2.833272933959961 }, { "auxiliary_loss_clip": 0.01143407, "auxiliary_loss_mlp": 0.01052558, "balance_loss_clip": 1.05301452, "balance_loss_mlp": 1.03195894, "epoch": 0.1498271456485796, "flos": 24455848752000.0, "grad_norm": 2.3105790695512294, "language_loss": 0.90820229, "learning_rate": 3.851260581551727e-06, "loss": 0.93016195, "num_input_tokens_seen": 54097555, "step": 2492, "time_per_iteration": 2.684178352355957 }, { "auxiliary_loss_clip": 0.01152331, "auxiliary_loss_mlp": 0.01063543, "balance_loss_clip": 1.05835843, "balance_loss_mlp": 1.04508913, "epoch": 0.14988726890124757, "flos": 16253601148800.0, "grad_norm": 6.881290297472923, "language_loss": 0.79406559, "learning_rate": 3.851113162828802e-06, "loss": 0.81622434, "num_input_tokens_seen": 54115600, "step": 2493, "time_per_iteration": 2.6558918952941895 }, { "auxiliary_loss_clip": 0.0114858, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.05345511, "balance_loss_mlp": 1.03258693, "epoch": 0.14994739215391553, "flos": 20666555383680.0, "grad_norm": 2.3431247769189967, "language_loss": 0.79894584, "learning_rate": 3.85096567391148e-06, "loss": 0.82095182, "num_input_tokens_seen": 54135220, "step": 2494, "time_per_iteration": 2.6774168014526367 }, { "auxiliary_loss_clip": 0.01137216, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.05474579, "balance_loss_mlp": 1.03212965, "epoch": 0.1500075154065835, "flos": 70652375239680.0, "grad_norm": 1.928941284350508, "language_loss": 0.66480517, "learning_rate": 3.850818114805354e-06, "loss": 0.68668592, "num_input_tokens_seen": 54161065, "step": 2495, "time_per_iteration": 3.1090729236602783 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.06896818, "balance_loss_mlp": 1.03560257, "epoch": 0.15006763865925146, "flos": 68011937447040.0, "grad_norm": 0.9030283421527312, "language_loss": 0.59524739, "learning_rate": 3.850670485516019e-06, "loss": 0.61666763, "num_input_tokens_seen": 54225095, "step": 2496, "time_per_iteration": 3.2250726222991943 }, { "auxiliary_loss_clip": 0.01163934, "auxiliary_loss_mlp": 0.01055725, "balance_loss_clip": 1.05690169, "balance_loss_mlp": 1.0360074, "epoch": 0.15012776191191943, "flos": 18916269459840.0, "grad_norm": 3.063784198565679, "language_loss": 0.65276247, "learning_rate": 3.850522786049075e-06, "loss": 0.67495906, "num_input_tokens_seen": 54243750, "step": 2497, "time_per_iteration": 2.619946002960205 }, { "auxiliary_loss_clip": 0.01125657, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.05308235, "balance_loss_mlp": 1.03316998, "epoch": 0.1501878851645874, "flos": 23701330638720.0, "grad_norm": 1.5552670947231086, "language_loss": 0.75182658, "learning_rate": 3.850375016410121e-06, "loss": 0.77362406, "num_input_tokens_seen": 54266185, "step": 2498, "time_per_iteration": 2.778163433074951 }, { "auxiliary_loss_clip": 0.01132738, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.05919099, "balance_loss_mlp": 1.02701163, "epoch": 0.15024800841725539, "flos": 20412523422720.0, "grad_norm": 3.357364003851319, "language_loss": 0.71821117, "learning_rate": 3.850227176604761e-06, "loss": 0.74000776, "num_input_tokens_seen": 54283940, "step": 2499, "time_per_iteration": 2.6929259300231934 }, { "auxiliary_loss_clip": 0.01134239, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.0547812, "balance_loss_mlp": 1.03236222, "epoch": 0.15030813166992335, "flos": 31831002812160.0, "grad_norm": 2.1406696998963652, "language_loss": 0.7206136, "learning_rate": 3.850079266638601e-06, "loss": 0.7424742, "num_input_tokens_seen": 54304830, "step": 2500, "time_per_iteration": 2.769988536834717 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.0105021, "balance_loss_clip": 1.06063724, "balance_loss_mlp": 1.03181624, "epoch": 0.15036825492259132, "flos": 35657822914560.0, "grad_norm": 2.0251881980439306, "language_loss": 0.65127194, "learning_rate": 3.849931286517249e-06, "loss": 0.6731143, "num_input_tokens_seen": 54325595, "step": 2501, "time_per_iteration": 2.810945510864258 }, { "auxiliary_loss_clip": 0.01137877, "auxiliary_loss_mlp": 0.01055223, "balance_loss_clip": 1.0541079, "balance_loss_mlp": 1.03511274, "epoch": 0.15042837817525928, "flos": 18838163335680.0, "grad_norm": 2.209666371186328, "language_loss": 0.83401144, "learning_rate": 3.849783236246318e-06, "loss": 0.85594243, "num_input_tokens_seen": 54342180, "step": 2502, "time_per_iteration": 2.6780545711517334 }, { "auxiliary_loss_clip": 0.01122961, "auxiliary_loss_mlp": 0.01049887, "balance_loss_clip": 1.05318308, "balance_loss_mlp": 1.0323875, "epoch": 0.15048850142792725, "flos": 19535548867200.0, "grad_norm": 2.0319272128830947, "language_loss": 0.77134645, "learning_rate": 3.849635115831421e-06, "loss": 0.79307491, "num_input_tokens_seen": 54360255, "step": 2503, "time_per_iteration": 2.7579123973846436 }, { "auxiliary_loss_clip": 0.01159116, "auxiliary_loss_mlp": 0.01044094, "balance_loss_clip": 1.05766046, "balance_loss_mlp": 1.02692807, "epoch": 0.1505486246805952, "flos": 22017550746240.0, "grad_norm": 1.9852139459946199, "language_loss": 0.85514295, "learning_rate": 3.849486925278176e-06, "loss": 0.87717503, "num_input_tokens_seen": 54378260, "step": 2504, "time_per_iteration": 2.631882905960083 }, { "auxiliary_loss_clip": 0.01146113, "auxiliary_loss_mlp": 0.01048035, "balance_loss_clip": 1.05622697, "balance_loss_mlp": 1.03098798, "epoch": 0.15060874793326318, "flos": 20743153136640.0, "grad_norm": 1.8222645508164372, "language_loss": 0.83178544, "learning_rate": 3.8493386645922e-06, "loss": 0.85372692, "num_input_tokens_seen": 54399745, "step": 2505, "time_per_iteration": 2.7706007957458496 }, { "auxiliary_loss_clip": 0.01125699, "auxiliary_loss_mlp": 0.01053819, "balance_loss_clip": 1.05586648, "balance_loss_mlp": 1.03590202, "epoch": 0.15066887118593117, "flos": 16471902055680.0, "grad_norm": 2.0148067518000445, "language_loss": 0.76044405, "learning_rate": 3.849190333779117e-06, "loss": 0.7822392, "num_input_tokens_seen": 54417105, "step": 2506, "time_per_iteration": 2.70989990234375 }, { "auxiliary_loss_clip": 0.01165314, "auxiliary_loss_mlp": 0.01041911, "balance_loss_clip": 1.05785728, "balance_loss_mlp": 1.02305174, "epoch": 0.15072899443859913, "flos": 19859319083520.0, "grad_norm": 2.823460856599666, "language_loss": 0.76220375, "learning_rate": 3.849041932844552e-06, "loss": 0.78427601, "num_input_tokens_seen": 54433920, "step": 2507, "time_per_iteration": 2.5367634296417236 }, { "auxiliary_loss_clip": 0.01144479, "auxiliary_loss_mlp": 0.01041094, "balance_loss_clip": 1.05261898, "balance_loss_mlp": 1.02306986, "epoch": 0.1507891176912671, "flos": 20776226584320.0, "grad_norm": 2.5197772895304906, "language_loss": 0.68633789, "learning_rate": 3.848893461794131e-06, "loss": 0.70819366, "num_input_tokens_seen": 54451540, "step": 2508, "time_per_iteration": 4.303388833999634 }, { "auxiliary_loss_clip": 0.01130299, "auxiliary_loss_mlp": 0.01046507, "balance_loss_clip": 1.05477214, "balance_loss_mlp": 1.02835178, "epoch": 0.15084924094393506, "flos": 23586631534080.0, "grad_norm": 2.840517748098311, "language_loss": 0.77994299, "learning_rate": 3.8487449206334845e-06, "loss": 0.80171108, "num_input_tokens_seen": 54470800, "step": 2509, "time_per_iteration": 4.380200147628784 }, { "auxiliary_loss_clip": 0.01141335, "auxiliary_loss_mlp": 0.00776843, "balance_loss_clip": 1.05463386, "balance_loss_mlp": 1.00027037, "epoch": 0.15090936419660303, "flos": 18911313383040.0, "grad_norm": 2.53406994590866, "language_loss": 0.79959804, "learning_rate": 3.848596309368246e-06, "loss": 0.81877983, "num_input_tokens_seen": 54486525, "step": 2510, "time_per_iteration": 4.219487428665161 }, { "auxiliary_loss_clip": 0.01150641, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05529225, "balance_loss_mlp": 1.02794981, "epoch": 0.150969487449271, "flos": 17928223073280.0, "grad_norm": 1.8628702139594306, "language_loss": 0.73398602, "learning_rate": 3.8484476280040495e-06, "loss": 0.75596589, "num_input_tokens_seen": 54503795, "step": 2511, "time_per_iteration": 2.62237811088562 }, { "auxiliary_loss_clip": 0.01094269, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04747009, "balance_loss_mlp": 1.02365553, "epoch": 0.151029610701939, "flos": 24243078539520.0, "grad_norm": 2.20399257021602, "language_loss": 0.68716824, "learning_rate": 3.848298876546534e-06, "loss": 0.70853454, "num_input_tokens_seen": 54523025, "step": 2512, "time_per_iteration": 2.823359489440918 }, { "auxiliary_loss_clip": 0.01149398, "auxiliary_loss_mlp": 0.01043296, "balance_loss_clip": 1.05574036, "balance_loss_mlp": 1.02615356, "epoch": 0.15108973395460695, "flos": 30262496641920.0, "grad_norm": 2.6278607305338877, "language_loss": 0.73833561, "learning_rate": 3.84815005500134e-06, "loss": 0.76026255, "num_input_tokens_seen": 54545025, "step": 2513, "time_per_iteration": 4.386258602142334 }, { "auxiliary_loss_clip": 0.01059691, "auxiliary_loss_mlp": 0.01109321, "balance_loss_clip": 1.0685482, "balance_loss_mlp": 1.10529137, "epoch": 0.15114985720727492, "flos": 60437624428800.0, "grad_norm": 0.9017688875456507, "language_loss": 0.64720047, "learning_rate": 3.84800116337411e-06, "loss": 0.6688906, "num_input_tokens_seen": 54604545, "step": 2514, "time_per_iteration": 3.254983425140381 }, { "auxiliary_loss_clip": 0.01146323, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.05674648, "balance_loss_mlp": 1.02584124, "epoch": 0.15120998045994288, "flos": 20521691832960.0, "grad_norm": 3.178381755435586, "language_loss": 0.72995645, "learning_rate": 3.8478522016704916e-06, "loss": 0.7518549, "num_input_tokens_seen": 54620590, "step": 2515, "time_per_iteration": 2.67921781539917 }, { "auxiliary_loss_clip": 0.01133382, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.05675673, "balance_loss_mlp": 1.02120531, "epoch": 0.15127010371261085, "flos": 21178893024000.0, "grad_norm": 2.0712989062813243, "language_loss": 0.7773214, "learning_rate": 3.8477031698961325e-06, "loss": 0.79905832, "num_input_tokens_seen": 54640410, "step": 2516, "time_per_iteration": 2.763467788696289 }, { "auxiliary_loss_clip": 0.01087601, "auxiliary_loss_mlp": 0.01004779, "balance_loss_clip": 1.05344796, "balance_loss_mlp": 1.00160813, "epoch": 0.1513302269652788, "flos": 65320648974720.0, "grad_norm": 0.7270407819118658, "language_loss": 0.54622567, "learning_rate": 3.8475540680566835e-06, "loss": 0.56714946, "num_input_tokens_seen": 54701430, "step": 2517, "time_per_iteration": 3.2293660640716553 }, { "auxiliary_loss_clip": 0.01110142, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.04499209, "balance_loss_mlp": 1.02427244, "epoch": 0.15139035021794678, "flos": 19135827342720.0, "grad_norm": 3.035771526476276, "language_loss": 0.78264821, "learning_rate": 3.8474048961577995e-06, "loss": 0.80418587, "num_input_tokens_seen": 54720845, "step": 2518, "time_per_iteration": 2.8154754638671875 }, { "auxiliary_loss_clip": 0.01147342, "auxiliary_loss_mlp": 0.01056368, "balance_loss_clip": 1.05279088, "balance_loss_mlp": 1.03681803, "epoch": 0.15145047347061477, "flos": 26578564842240.0, "grad_norm": 2.1881526177791097, "language_loss": 0.70480245, "learning_rate": 3.847255654205137e-06, "loss": 0.72683954, "num_input_tokens_seen": 54740495, "step": 2519, "time_per_iteration": 2.7098515033721924 }, { "auxiliary_loss_clip": 0.01152463, "auxiliary_loss_mlp": 0.01056975, "balance_loss_clip": 1.05683672, "balance_loss_mlp": 1.03802037, "epoch": 0.15151059672328274, "flos": 20302959962880.0, "grad_norm": 1.9048594994100874, "language_loss": 0.78681207, "learning_rate": 3.847106342204354e-06, "loss": 0.80890644, "num_input_tokens_seen": 54758415, "step": 2520, "time_per_iteration": 2.664187431335449 }, { "auxiliary_loss_clip": 0.01140573, "auxiliary_loss_mlp": 0.01071607, "balance_loss_clip": 1.05435348, "balance_loss_mlp": 1.05244994, "epoch": 0.1515707199759507, "flos": 27228367831680.0, "grad_norm": 3.950911503454746, "language_loss": 0.74849677, "learning_rate": 3.846956960161114e-06, "loss": 0.77061862, "num_input_tokens_seen": 54779355, "step": 2521, "time_per_iteration": 2.7900772094726562 }, { "auxiliary_loss_clip": 0.01132038, "auxiliary_loss_mlp": 0.01055874, "balance_loss_clip": 1.05052209, "balance_loss_mlp": 1.0360136, "epoch": 0.15163084322861867, "flos": 23587349806080.0, "grad_norm": 4.620979243079986, "language_loss": 0.8253814, "learning_rate": 3.84680750808108e-06, "loss": 0.84726053, "num_input_tokens_seen": 54799465, "step": 2522, "time_per_iteration": 2.7216525077819824 }, { "auxiliary_loss_clip": 0.01051858, "auxiliary_loss_mlp": 0.01048797, "balance_loss_clip": 1.05645704, "balance_loss_mlp": 1.04595995, "epoch": 0.15169096648128663, "flos": 66889622021760.0, "grad_norm": 0.8362305181264502, "language_loss": 0.57885599, "learning_rate": 3.846657985969922e-06, "loss": 0.59986252, "num_input_tokens_seen": 54857665, "step": 2523, "time_per_iteration": 3.2375056743621826 }, { "auxiliary_loss_clip": 0.0114147, "auxiliary_loss_mlp": 0.01057964, "balance_loss_clip": 1.05213499, "balance_loss_mlp": 1.0368042, "epoch": 0.1517510897339546, "flos": 29095435848960.0, "grad_norm": 1.8054087157705183, "language_loss": 0.74795163, "learning_rate": 3.8465083938333066e-06, "loss": 0.76994598, "num_input_tokens_seen": 54879895, "step": 2524, "time_per_iteration": 2.711557388305664 }, { "auxiliary_loss_clip": 0.01138185, "auxiliary_loss_mlp": 0.01057236, "balance_loss_clip": 1.05304718, "balance_loss_mlp": 1.03865099, "epoch": 0.1518112129866226, "flos": 18406553512320.0, "grad_norm": 1.8255227790100423, "language_loss": 0.74631184, "learning_rate": 3.8463587316769085e-06, "loss": 0.76826608, "num_input_tokens_seen": 54898245, "step": 2525, "time_per_iteration": 2.6936984062194824 }, { "auxiliary_loss_clip": 0.01144047, "auxiliary_loss_mlp": 0.01057009, "balance_loss_clip": 1.05403006, "balance_loss_mlp": 1.03747034, "epoch": 0.15187133623929056, "flos": 19425410789760.0, "grad_norm": 1.8907352833287865, "language_loss": 0.79600316, "learning_rate": 3.846208999506402e-06, "loss": 0.81801373, "num_input_tokens_seen": 54917060, "step": 2526, "time_per_iteration": 2.651494264602661 }, { "auxiliary_loss_clip": 0.01135228, "auxiliary_loss_mlp": 0.01047798, "balance_loss_clip": 1.05538774, "balance_loss_mlp": 1.03056002, "epoch": 0.15193145949195852, "flos": 17566207850880.0, "grad_norm": 1.7677336965262924, "language_loss": 0.8443349, "learning_rate": 3.846059197327466e-06, "loss": 0.86616516, "num_input_tokens_seen": 54936365, "step": 2527, "time_per_iteration": 2.702683448791504 }, { "auxiliary_loss_clip": 0.01124925, "auxiliary_loss_mlp": 0.01049207, "balance_loss_clip": 1.04976487, "balance_loss_mlp": 1.02985954, "epoch": 0.15199158274462649, "flos": 36176265866880.0, "grad_norm": 1.85678489681458, "language_loss": 0.69361663, "learning_rate": 3.845909325145779e-06, "loss": 0.7153579, "num_input_tokens_seen": 54961365, "step": 2528, "time_per_iteration": 2.9250690937042236 }, { "auxiliary_loss_clip": 0.01134092, "auxiliary_loss_mlp": 0.01055056, "balance_loss_clip": 1.05266535, "balance_loss_mlp": 1.03587484, "epoch": 0.15205170599729445, "flos": 23074042498560.0, "grad_norm": 2.004144148858156, "language_loss": 0.86482549, "learning_rate": 3.845759382967026e-06, "loss": 0.88671696, "num_input_tokens_seen": 54980750, "step": 2529, "time_per_iteration": 2.7277863025665283 }, { "auxiliary_loss_clip": 0.01124798, "auxiliary_loss_mlp": 0.01041651, "balance_loss_clip": 1.05046487, "balance_loss_mlp": 1.02297091, "epoch": 0.15211182924996242, "flos": 21908382336000.0, "grad_norm": 2.544775548600603, "language_loss": 0.83399373, "learning_rate": 3.845609370796893e-06, "loss": 0.85565823, "num_input_tokens_seen": 54999675, "step": 2530, "time_per_iteration": 2.8717291355133057 }, { "auxiliary_loss_clip": 0.01125761, "auxiliary_loss_mlp": 0.01048121, "balance_loss_clip": 1.05035281, "balance_loss_mlp": 1.02940559, "epoch": 0.15217195250263038, "flos": 13881521865600.0, "grad_norm": 2.1410437006568723, "language_loss": 0.80404246, "learning_rate": 3.845459288641066e-06, "loss": 0.82578129, "num_input_tokens_seen": 55018295, "step": 2531, "time_per_iteration": 2.8444995880126953 }, { "auxiliary_loss_clip": 0.01143114, "auxiliary_loss_mlp": 0.01043494, "balance_loss_clip": 1.05216551, "balance_loss_mlp": 1.02613723, "epoch": 0.15223207575529837, "flos": 24535319592960.0, "grad_norm": 1.7922494378130023, "language_loss": 0.78874445, "learning_rate": 3.8453091365052394e-06, "loss": 0.81061059, "num_input_tokens_seen": 55037975, "step": 2532, "time_per_iteration": 2.9122390747070312 }, { "auxiliary_loss_clip": 0.01149502, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.05737543, "balance_loss_mlp": 1.02676702, "epoch": 0.15229219900796634, "flos": 25556798563200.0, "grad_norm": 1.9533698136575197, "language_loss": 0.87679356, "learning_rate": 3.845158914395105e-06, "loss": 0.89874816, "num_input_tokens_seen": 55057135, "step": 2533, "time_per_iteration": 2.7987985610961914 }, { "auxiliary_loss_clip": 0.01117955, "auxiliary_loss_mlp": 0.01048672, "balance_loss_clip": 1.05235386, "balance_loss_mlp": 1.02983665, "epoch": 0.1523523222606343, "flos": 18217806520320.0, "grad_norm": 2.391026063452041, "language_loss": 0.78886449, "learning_rate": 3.84500862231636e-06, "loss": 0.81053078, "num_input_tokens_seen": 55075525, "step": 2534, "time_per_iteration": 2.7587406635284424 }, { "auxiliary_loss_clip": 0.01164218, "auxiliary_loss_mlp": 0.0104722, "balance_loss_clip": 1.05609345, "balance_loss_mlp": 1.0270381, "epoch": 0.15241244551330227, "flos": 13260087642240.0, "grad_norm": 2.689732363294508, "language_loss": 0.76809752, "learning_rate": 3.844858260274702e-06, "loss": 0.79021192, "num_input_tokens_seen": 55090845, "step": 2535, "time_per_iteration": 2.7494406700134277 }, { "auxiliary_loss_clip": 0.01142628, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.05345285, "balance_loss_mlp": 1.02401042, "epoch": 0.15247256876597023, "flos": 19715568854400.0, "grad_norm": 2.2235871255319446, "language_loss": 0.78301942, "learning_rate": 3.844707828275835e-06, "loss": 0.80487478, "num_input_tokens_seen": 55108750, "step": 2536, "time_per_iteration": 2.738638401031494 }, { "auxiliary_loss_clip": 0.01128919, "auxiliary_loss_mlp": 0.0105368, "balance_loss_clip": 1.05349088, "balance_loss_mlp": 1.03497589, "epoch": 0.1525326920186382, "flos": 20375858615040.0, "grad_norm": 2.311649941233105, "language_loss": 0.75824189, "learning_rate": 3.844557326325461e-06, "loss": 0.78006792, "num_input_tokens_seen": 55126750, "step": 2537, "time_per_iteration": 2.632373809814453 }, { "auxiliary_loss_clip": 0.0114911, "auxiliary_loss_mlp": 0.01041421, "balance_loss_clip": 1.05675745, "balance_loss_mlp": 1.02331281, "epoch": 0.15259281527130616, "flos": 13589963170560.0, "grad_norm": 2.193148723631548, "language_loss": 0.77737647, "learning_rate": 3.8444067544292896e-06, "loss": 0.79928178, "num_input_tokens_seen": 55144690, "step": 2538, "time_per_iteration": 2.6835639476776123 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.05477905, "balance_loss_mlp": 1.02480412, "epoch": 0.15265293852397416, "flos": 22860374446080.0, "grad_norm": 2.951423477379744, "language_loss": 0.89502335, "learning_rate": 3.844256112593029e-06, "loss": 0.91653961, "num_input_tokens_seen": 55166055, "step": 2539, "time_per_iteration": 2.7825794219970703 }, { "auxiliary_loss_clip": 0.01142581, "auxiliary_loss_mlp": 0.01045856, "balance_loss_clip": 1.05367279, "balance_loss_mlp": 1.02721143, "epoch": 0.15271306177664212, "flos": 29238108670080.0, "grad_norm": 2.1073423273657044, "language_loss": 0.93423879, "learning_rate": 3.844105400822391e-06, "loss": 0.95612311, "num_input_tokens_seen": 55186285, "step": 2540, "time_per_iteration": 2.717541456222534 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01041863, "balance_loss_clip": 1.05122495, "balance_loss_mlp": 1.0240885, "epoch": 0.1527731850293101, "flos": 31246269310080.0, "grad_norm": 2.084754505375857, "language_loss": 0.75217843, "learning_rate": 3.843954619123092e-06, "loss": 0.77391309, "num_input_tokens_seen": 55207915, "step": 2541, "time_per_iteration": 2.8376123905181885 }, { "auxiliary_loss_clip": 0.01116303, "auxiliary_loss_mlp": 0.01045227, "balance_loss_clip": 1.04877007, "balance_loss_mlp": 1.0268805, "epoch": 0.15283330828197805, "flos": 22382079920640.0, "grad_norm": 2.037290364787748, "language_loss": 0.80996066, "learning_rate": 3.84380376750085e-06, "loss": 0.83157599, "num_input_tokens_seen": 55227860, "step": 2542, "time_per_iteration": 2.7110376358032227 }, { "auxiliary_loss_clip": 0.01160331, "auxiliary_loss_mlp": 0.01048661, "balance_loss_clip": 1.0566076, "balance_loss_mlp": 1.02992105, "epoch": 0.15289343153464602, "flos": 25520133755520.0, "grad_norm": 3.2152362880248857, "language_loss": 0.77796149, "learning_rate": 3.843652845961383e-06, "loss": 0.80005145, "num_input_tokens_seen": 55247330, "step": 2543, "time_per_iteration": 2.674131155014038 }, { "auxiliary_loss_clip": 0.01145565, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.05380869, "balance_loss_mlp": 1.02388239, "epoch": 0.15295355478731398, "flos": 22710016114560.0, "grad_norm": 2.4890924021550918, "language_loss": 0.85898137, "learning_rate": 3.843501854510416e-06, "loss": 0.88085836, "num_input_tokens_seen": 55266195, "step": 2544, "time_per_iteration": 2.685840606689453 }, { "auxiliary_loss_clip": 0.01149904, "auxiliary_loss_mlp": 0.01051141, "balance_loss_clip": 1.05162692, "balance_loss_mlp": 1.03061318, "epoch": 0.15301367803998198, "flos": 23251907669760.0, "grad_norm": 1.9817931887295275, "language_loss": 0.83159137, "learning_rate": 3.843350793153673e-06, "loss": 0.85360181, "num_input_tokens_seen": 55283305, "step": 2545, "time_per_iteration": 2.7415812015533447 }, { "auxiliary_loss_clip": 0.01158976, "auxiliary_loss_mlp": 0.01040888, "balance_loss_clip": 1.05556524, "balance_loss_mlp": 1.02257705, "epoch": 0.15307380129264994, "flos": 25886279041920.0, "grad_norm": 6.0131413628182, "language_loss": 0.71669161, "learning_rate": 3.843199661896884e-06, "loss": 0.73869026, "num_input_tokens_seen": 55303035, "step": 2546, "time_per_iteration": 2.6626265048980713 }, { "auxiliary_loss_clip": 0.01130357, "auxiliary_loss_mlp": 0.01047635, "balance_loss_clip": 1.05013335, "balance_loss_mlp": 1.02688098, "epoch": 0.1531339245453179, "flos": 46973239205760.0, "grad_norm": 1.6563553629779504, "language_loss": 0.77438712, "learning_rate": 3.843048460745779e-06, "loss": 0.79616702, "num_input_tokens_seen": 55327570, "step": 2547, "time_per_iteration": 4.451423168182373 }, { "auxiliary_loss_clip": 0.01107553, "auxiliary_loss_mlp": 0.01044692, "balance_loss_clip": 1.04845536, "balance_loss_mlp": 1.02517736, "epoch": 0.15319404779798587, "flos": 35882049565440.0, "grad_norm": 2.3544675813743834, "language_loss": 0.74357474, "learning_rate": 3.842897189706092e-06, "loss": 0.7650972, "num_input_tokens_seen": 55351090, "step": 2548, "time_per_iteration": 2.846991539001465 }, { "auxiliary_loss_clip": 0.01138346, "auxiliary_loss_mlp": 0.0105294, "balance_loss_clip": 1.05340147, "balance_loss_mlp": 1.03304434, "epoch": 0.15325417105065384, "flos": 25664638170240.0, "grad_norm": 1.446042531021912, "language_loss": 0.80296385, "learning_rate": 3.842745848783558e-06, "loss": 0.82487667, "num_input_tokens_seen": 55371050, "step": 2549, "time_per_iteration": 5.8849101066589355 }, { "auxiliary_loss_clip": 0.01144858, "auxiliary_loss_mlp": 0.01041292, "balance_loss_clip": 1.05108786, "balance_loss_mlp": 1.02255249, "epoch": 0.1533142943033218, "flos": 18770831291520.0, "grad_norm": 1.6149920159034452, "language_loss": 0.74602014, "learning_rate": 3.842594437983917e-06, "loss": 0.76788169, "num_input_tokens_seen": 55390375, "step": 2550, "time_per_iteration": 2.684868812561035 }, { "auxiliary_loss_clip": 0.01149823, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.05212283, "balance_loss_mlp": 1.02129996, "epoch": 0.15337441755598977, "flos": 23107367341440.0, "grad_norm": 2.33086854575276, "language_loss": 0.76910275, "learning_rate": 3.8424429573129115e-06, "loss": 0.79100841, "num_input_tokens_seen": 55408890, "step": 2551, "time_per_iteration": 4.415414333343506 }, { "auxiliary_loss_clip": 0.01086721, "auxiliary_loss_mlp": 0.01054065, "balance_loss_clip": 1.05333817, "balance_loss_mlp": 1.05116868, "epoch": 0.15343454080865776, "flos": 59861079227520.0, "grad_norm": 0.9493148205555214, "language_loss": 0.5665558, "learning_rate": 3.842291406776283e-06, "loss": 0.5879637, "num_input_tokens_seen": 55463815, "step": 2552, "time_per_iteration": 3.1105730533599854 }, { "auxiliary_loss_clip": 0.011128, "auxiliary_loss_mlp": 0.01039619, "balance_loss_clip": 1.05131924, "balance_loss_mlp": 1.0204618, "epoch": 0.15349466406132573, "flos": 11910887959680.0, "grad_norm": 2.183188616823757, "language_loss": 0.88550794, "learning_rate": 3.84213978637978e-06, "loss": 0.90703207, "num_input_tokens_seen": 55481050, "step": 2553, "time_per_iteration": 2.748298406600952 }, { "auxiliary_loss_clip": 0.01147024, "auxiliary_loss_mlp": 0.01042929, "balance_loss_clip": 1.05247378, "balance_loss_mlp": 1.0232954, "epoch": 0.1535547873139937, "flos": 24096922099200.0, "grad_norm": 1.8094820084348213, "language_loss": 0.7800495, "learning_rate": 3.841988096129152e-06, "loss": 0.80194902, "num_input_tokens_seen": 55500050, "step": 2554, "time_per_iteration": 2.6555569171905518 }, { "auxiliary_loss_clip": 0.01094445, "auxiliary_loss_mlp": 0.01053684, "balance_loss_clip": 1.04876757, "balance_loss_mlp": 1.03291798, "epoch": 0.15361491056666166, "flos": 17566459246080.0, "grad_norm": 2.372022486587551, "language_loss": 0.77472258, "learning_rate": 3.841836336030151e-06, "loss": 0.79620385, "num_input_tokens_seen": 55518125, "step": 2555, "time_per_iteration": 2.7507212162017822 }, { "auxiliary_loss_clip": 0.01129555, "auxiliary_loss_mlp": 0.01046723, "balance_loss_clip": 1.05400753, "balance_loss_mlp": 1.02873409, "epoch": 0.15367503381932962, "flos": 25046041121280.0, "grad_norm": 1.5517643759455655, "language_loss": 0.77453947, "learning_rate": 3.8416845060885305e-06, "loss": 0.79630232, "num_input_tokens_seen": 55540960, "step": 2556, "time_per_iteration": 2.7947654724121094 }, { "auxiliary_loss_clip": 0.01140725, "auxiliary_loss_mlp": 0.0077646, "balance_loss_clip": 1.05336452, "balance_loss_mlp": 1.00054574, "epoch": 0.15373515707199759, "flos": 21507332008320.0, "grad_norm": 1.8786460244833383, "language_loss": 0.90098578, "learning_rate": 3.84153260631005e-06, "loss": 0.92015761, "num_input_tokens_seen": 55559210, "step": 2557, "time_per_iteration": 2.702029228210449 }, { "auxiliary_loss_clip": 0.01137441, "auxiliary_loss_mlp": 0.01048546, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.02862656, "epoch": 0.15379528032466555, "flos": 25994729180160.0, "grad_norm": 2.4046585493240102, "language_loss": 0.7092281, "learning_rate": 3.841380636700468e-06, "loss": 0.73108798, "num_input_tokens_seen": 55578925, "step": 2558, "time_per_iteration": 2.815653085708618 }, { "auxiliary_loss_clip": 0.01131603, "auxiliary_loss_mlp": 0.01045983, "balance_loss_clip": 1.04937947, "balance_loss_mlp": 1.02659965, "epoch": 0.15385540357733354, "flos": 19277315015040.0, "grad_norm": 2.1050139676488535, "language_loss": 0.92165422, "learning_rate": 3.841228597265548e-06, "loss": 0.94343007, "num_input_tokens_seen": 55597255, "step": 2559, "time_per_iteration": 2.7363967895507812 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01057878, "balance_loss_clip": 1.05492043, "balance_loss_mlp": 1.03711152, "epoch": 0.1539155268300015, "flos": 28549126920960.0, "grad_norm": 2.149412909113977, "language_loss": 0.63330692, "learning_rate": 3.841076488011055e-06, "loss": 0.65526068, "num_input_tokens_seen": 55619515, "step": 2560, "time_per_iteration": 2.811800003051758 }, { "auxiliary_loss_clip": 0.01132154, "auxiliary_loss_mlp": 0.01043974, "balance_loss_clip": 1.04914606, "balance_loss_mlp": 1.02416182, "epoch": 0.15397565008266947, "flos": 23547883737600.0, "grad_norm": 2.066473237183783, "language_loss": 0.88155699, "learning_rate": 3.8409243089427574e-06, "loss": 0.90331829, "num_input_tokens_seen": 55640050, "step": 2561, "time_per_iteration": 2.7991089820861816 }, { "auxiliary_loss_clip": 0.0114054, "auxiliary_loss_mlp": 0.01041879, "balance_loss_clip": 1.05085099, "balance_loss_mlp": 1.02380693, "epoch": 0.15403577333533744, "flos": 17129821518720.0, "grad_norm": 1.906051405357337, "language_loss": 0.83117974, "learning_rate": 3.840772060066425e-06, "loss": 0.85300398, "num_input_tokens_seen": 55658695, "step": 2562, "time_per_iteration": 2.6410810947418213 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.00778205, "balance_loss_clip": 1.04988563, "balance_loss_mlp": 1.00058532, "epoch": 0.1540958965880054, "flos": 17894503180800.0, "grad_norm": 2.3547297997270906, "language_loss": 0.74647415, "learning_rate": 3.840619741387832e-06, "loss": 0.76554382, "num_input_tokens_seen": 55676340, "step": 2563, "time_per_iteration": 2.6813745498657227 }, { "auxiliary_loss_clip": 0.01116857, "auxiliary_loss_mlp": 0.0104411, "balance_loss_clip": 1.05126941, "balance_loss_mlp": 1.02444029, "epoch": 0.15415601984067337, "flos": 32161057908480.0, "grad_norm": 2.842824767177756, "language_loss": 0.7609179, "learning_rate": 3.8404673529127534e-06, "loss": 0.78252757, "num_input_tokens_seen": 55698890, "step": 2564, "time_per_iteration": 2.832885265350342 }, { "auxiliary_loss_clip": 0.01133461, "auxiliary_loss_mlp": 0.01052887, "balance_loss_clip": 1.05174518, "balance_loss_mlp": 1.03443313, "epoch": 0.15421614309334136, "flos": 24024418496640.0, "grad_norm": 2.0125869911748575, "language_loss": 0.70960921, "learning_rate": 3.840314894646969e-06, "loss": 0.73147273, "num_input_tokens_seen": 55718535, "step": 2565, "time_per_iteration": 2.7352514266967773 }, { "auxiliary_loss_clip": 0.01137766, "auxiliary_loss_mlp": 0.01046908, "balance_loss_clip": 1.04731965, "balance_loss_mlp": 1.02787066, "epoch": 0.15427626634600933, "flos": 24386290064640.0, "grad_norm": 2.1021891280826965, "language_loss": 0.71605748, "learning_rate": 3.840162366596259e-06, "loss": 0.73790431, "num_input_tokens_seen": 55738970, "step": 2566, "time_per_iteration": 2.681710720062256 }, { "auxiliary_loss_clip": 0.01150619, "auxiliary_loss_mlp": 0.01040725, "balance_loss_clip": 1.04834008, "balance_loss_mlp": 1.02271223, "epoch": 0.1543363895986773, "flos": 23331522165120.0, "grad_norm": 1.7167104030167524, "language_loss": 0.84746087, "learning_rate": 3.840009768766408e-06, "loss": 0.86937428, "num_input_tokens_seen": 55759585, "step": 2567, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.01104646, "auxiliary_loss_mlp": 0.01050344, "balance_loss_clip": 1.04447246, "balance_loss_mlp": 1.03164053, "epoch": 0.15439651285134526, "flos": 24274284480000.0, "grad_norm": 2.9101336164483014, "language_loss": 0.78074998, "learning_rate": 3.839857101163202e-06, "loss": 0.80229992, "num_input_tokens_seen": 55779250, "step": 2568, "time_per_iteration": 2.7385261058807373 }, { "auxiliary_loss_clip": 0.01121993, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.04715753, "balance_loss_mlp": 1.01684201, "epoch": 0.15445663610401322, "flos": 22456163721600.0, "grad_norm": 1.852436867559063, "language_loss": 0.6991998, "learning_rate": 3.83970436379243e-06, "loss": 0.72079051, "num_input_tokens_seen": 55800470, "step": 2569, "time_per_iteration": 2.746974229812622 }, { "auxiliary_loss_clip": 0.01124209, "auxiliary_loss_mlp": 0.01040299, "balance_loss_clip": 1.04695952, "balance_loss_mlp": 1.02178574, "epoch": 0.1545167593566812, "flos": 22049510872320.0, "grad_norm": 1.7212875994527412, "language_loss": 0.76482332, "learning_rate": 3.839551556659884e-06, "loss": 0.78646845, "num_input_tokens_seen": 55817795, "step": 2570, "time_per_iteration": 2.7470619678497314 }, { "auxiliary_loss_clip": 0.01137702, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.04993737, "balance_loss_mlp": 1.0192852, "epoch": 0.15457688260934915, "flos": 19318253541120.0, "grad_norm": 2.5033166184578066, "language_loss": 0.77997506, "learning_rate": 3.839398679771359e-06, "loss": 0.80173767, "num_input_tokens_seen": 55836125, "step": 2571, "time_per_iteration": 2.692863702774048 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0498451, "balance_loss_mlp": 1.02704597, "epoch": 0.15463700586201715, "flos": 24133981956480.0, "grad_norm": 4.3242380509309015, "language_loss": 0.82932413, "learning_rate": 3.839245733132652e-06, "loss": 0.85109681, "num_input_tokens_seen": 55855280, "step": 2572, "time_per_iteration": 2.8341822624206543 }, { "auxiliary_loss_clip": 0.01156188, "auxiliary_loss_mlp": 0.01042592, "balance_loss_clip": 1.05181205, "balance_loss_mlp": 1.02383995, "epoch": 0.1546971291146851, "flos": 22420935457920.0, "grad_norm": 1.5874704718869805, "language_loss": 0.90373385, "learning_rate": 3.839092716749563e-06, "loss": 0.92572165, "num_input_tokens_seen": 55875695, "step": 2573, "time_per_iteration": 2.740121364593506 }, { "auxiliary_loss_clip": 0.01088424, "auxiliary_loss_mlp": 0.01049893, "balance_loss_clip": 1.04328668, "balance_loss_mlp": 1.03003311, "epoch": 0.15475725236735308, "flos": 17530225401600.0, "grad_norm": 1.596795561637076, "language_loss": 0.70298707, "learning_rate": 3.838939630627893e-06, "loss": 0.72437024, "num_input_tokens_seen": 55894575, "step": 2574, "time_per_iteration": 2.7629144191741943 }, { "auxiliary_loss_clip": 0.01127537, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04714394, "balance_loss_mlp": 1.02509642, "epoch": 0.15481737562002104, "flos": 22561740771840.0, "grad_norm": 6.018921028505516, "language_loss": 0.82426423, "learning_rate": 3.838786474773448e-06, "loss": 0.84599686, "num_input_tokens_seen": 55912855, "step": 2575, "time_per_iteration": 2.656783103942871 }, { "auxiliary_loss_clip": 0.01127415, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.04681587, "balance_loss_mlp": 1.02584219, "epoch": 0.154877498872689, "flos": 24900567039360.0, "grad_norm": 1.8376318938002576, "language_loss": 0.85038638, "learning_rate": 3.838633249192036e-06, "loss": 0.87209404, "num_input_tokens_seen": 55932375, "step": 2576, "time_per_iteration": 2.648484230041504 }, { "auxiliary_loss_clip": 0.01152547, "auxiliary_loss_mlp": 0.01043401, "balance_loss_clip": 1.04872847, "balance_loss_mlp": 1.02499545, "epoch": 0.15493762212535697, "flos": 28147501975680.0, "grad_norm": 1.8027999188827728, "language_loss": 0.82271254, "learning_rate": 3.838479953889465e-06, "loss": 0.84467208, "num_input_tokens_seen": 55953970, "step": 2577, "time_per_iteration": 2.6355643272399902 }, { "auxiliary_loss_clip": 0.01126009, "auxiliary_loss_mlp": 0.01049018, "balance_loss_clip": 1.05147958, "balance_loss_mlp": 1.02984881, "epoch": 0.15499774537802496, "flos": 25411073086080.0, "grad_norm": 2.1677069711314463, "language_loss": 0.76556361, "learning_rate": 3.8383265888715525e-06, "loss": 0.78731394, "num_input_tokens_seen": 55973120, "step": 2578, "time_per_iteration": 2.649043560028076 }, { "auxiliary_loss_clip": 0.01123677, "auxiliary_loss_mlp": 0.01044461, "balance_loss_clip": 1.05155993, "balance_loss_mlp": 1.0253042, "epoch": 0.15505786863069293, "flos": 22091562720000.0, "grad_norm": 1.9614380224881987, "language_loss": 0.82443559, "learning_rate": 3.83817315414411e-06, "loss": 0.8461169, "num_input_tokens_seen": 55993260, "step": 2579, "time_per_iteration": 2.62631893157959 }, { "auxiliary_loss_clip": 0.01143904, "auxiliary_loss_mlp": 0.01044324, "balance_loss_clip": 1.05856657, "balance_loss_mlp": 1.02556014, "epoch": 0.1551179918833609, "flos": 18917131386240.0, "grad_norm": 2.610374735790095, "language_loss": 0.80465376, "learning_rate": 3.838019649712958e-06, "loss": 0.82653606, "num_input_tokens_seen": 56012130, "step": 2580, "time_per_iteration": 2.6512253284454346 }, { "auxiliary_loss_clip": 0.0107737, "auxiliary_loss_mlp": 0.01006304, "balance_loss_clip": 1.04551053, "balance_loss_mlp": 1.00360954, "epoch": 0.15517811513602886, "flos": 66239172587520.0, "grad_norm": 0.842131683094019, "language_loss": 0.58823448, "learning_rate": 3.8378660755839166e-06, "loss": 0.60907125, "num_input_tokens_seen": 56079045, "step": 2581, "time_per_iteration": 3.357855796813965 }, { "auxiliary_loss_clip": 0.01108206, "auxiliary_loss_mlp": 0.01047031, "balance_loss_clip": 1.04392648, "balance_loss_mlp": 1.0249418, "epoch": 0.15523823838869683, "flos": 24021078531840.0, "grad_norm": 1.9584677228939371, "language_loss": 0.84773678, "learning_rate": 3.8377124317628095e-06, "loss": 0.86928916, "num_input_tokens_seen": 56098745, "step": 2582, "time_per_iteration": 2.727062702178955 }, { "auxiliary_loss_clip": 0.01144131, "auxiliary_loss_mlp": 0.01051911, "balance_loss_clip": 1.05233002, "balance_loss_mlp": 1.03175235, "epoch": 0.1552983616413648, "flos": 20485062938880.0, "grad_norm": 2.466663791870015, "language_loss": 0.79050052, "learning_rate": 3.8375587182554625e-06, "loss": 0.81246096, "num_input_tokens_seen": 56117655, "step": 2583, "time_per_iteration": 2.664794683456421 }, { "auxiliary_loss_clip": 0.01139818, "auxiliary_loss_mlp": 0.01054771, "balance_loss_clip": 1.04957032, "balance_loss_mlp": 1.03252697, "epoch": 0.15535848489403276, "flos": 32123710742400.0, "grad_norm": 1.8743170599575527, "language_loss": 0.76320136, "learning_rate": 3.837404935067705e-06, "loss": 0.78514719, "num_input_tokens_seen": 56141960, "step": 2584, "time_per_iteration": 2.757392168045044 }, { "auxiliary_loss_clip": 0.01137324, "auxiliary_loss_mlp": 0.01042496, "balance_loss_clip": 1.04884958, "balance_loss_mlp": 1.02302885, "epoch": 0.15541860814670075, "flos": 19098444263040.0, "grad_norm": 1.6493041410587026, "language_loss": 0.75269651, "learning_rate": 3.837251082205368e-06, "loss": 0.77449471, "num_input_tokens_seen": 56161430, "step": 2585, "time_per_iteration": 2.6497461795806885 }, { "auxiliary_loss_clip": 0.01116144, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.04862189, "balance_loss_mlp": 1.02321053, "epoch": 0.1554787313993687, "flos": 19172097100800.0, "grad_norm": 2.068989677221064, "language_loss": 0.61187196, "learning_rate": 3.837097159674286e-06, "loss": 0.63345695, "num_input_tokens_seen": 56179390, "step": 2586, "time_per_iteration": 2.697852373123169 }, { "auxiliary_loss_clip": 0.01129408, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.04842281, "balance_loss_mlp": 1.02341127, "epoch": 0.15553885465203668, "flos": 16143822207360.0, "grad_norm": 1.8484108176722505, "language_loss": 0.81318939, "learning_rate": 3.836943167480296e-06, "loss": 0.83490539, "num_input_tokens_seen": 56198020, "step": 2587, "time_per_iteration": 4.212551593780518 }, { "auxiliary_loss_clip": 0.01160891, "auxiliary_loss_mlp": 0.01054822, "balance_loss_clip": 1.05309868, "balance_loss_mlp": 1.03325701, "epoch": 0.15559897790470464, "flos": 25337779384320.0, "grad_norm": 1.866779523391448, "language_loss": 0.88716942, "learning_rate": 3.836789105629236e-06, "loss": 0.90932655, "num_input_tokens_seen": 56218165, "step": 2588, "time_per_iteration": 4.192267894744873 }, { "auxiliary_loss_clip": 0.01094981, "auxiliary_loss_mlp": 0.01052123, "balance_loss_clip": 1.04558384, "balance_loss_mlp": 1.03164268, "epoch": 0.1556591011573726, "flos": 23148772744320.0, "grad_norm": 2.018423224363699, "language_loss": 0.64624381, "learning_rate": 3.83663497412695e-06, "loss": 0.66771483, "num_input_tokens_seen": 56237160, "step": 2589, "time_per_iteration": 4.303871154785156 }, { "auxiliary_loss_clip": 0.01104407, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.04520249, "balance_loss_mlp": 1.02123344, "epoch": 0.15571922441004057, "flos": 25370888745600.0, "grad_norm": 1.784618480549341, "language_loss": 0.82832813, "learning_rate": 3.836480772979281e-06, "loss": 0.84979194, "num_input_tokens_seen": 56257610, "step": 2590, "time_per_iteration": 4.460350751876831 }, { "auxiliary_loss_clip": 0.011248, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.05032134, "balance_loss_mlp": 1.02694952, "epoch": 0.15577934766270854, "flos": 14501375890560.0, "grad_norm": 2.6687659077907484, "language_loss": 0.78766, "learning_rate": 3.836326502192077e-06, "loss": 0.80938083, "num_input_tokens_seen": 56275215, "step": 2591, "time_per_iteration": 2.73305606842041 }, { "auxiliary_loss_clip": 0.01143879, "auxiliary_loss_mlp": 0.01049015, "balance_loss_clip": 1.05174232, "balance_loss_mlp": 1.03137255, "epoch": 0.15583947091537653, "flos": 37414537372800.0, "grad_norm": 2.0331558547393054, "language_loss": 0.65025747, "learning_rate": 3.836172161771189e-06, "loss": 0.67218637, "num_input_tokens_seen": 56297130, "step": 2592, "time_per_iteration": 2.8582632541656494 }, { "auxiliary_loss_clip": 0.01136043, "auxiliary_loss_mlp": 0.01052096, "balance_loss_clip": 1.05417228, "balance_loss_mlp": 1.0322001, "epoch": 0.1558995941680445, "flos": 21834729498240.0, "grad_norm": 2.311634250072179, "language_loss": 0.82506329, "learning_rate": 3.836017751722467e-06, "loss": 0.84694475, "num_input_tokens_seen": 56314995, "step": 2593, "time_per_iteration": 2.7230453491210938 }, { "auxiliary_loss_clip": 0.01142565, "auxiliary_loss_mlp": 0.01046037, "balance_loss_clip": 1.05237365, "balance_loss_mlp": 1.02676034, "epoch": 0.15595971742071246, "flos": 19792633484160.0, "grad_norm": 2.778410683125911, "language_loss": 0.73220694, "learning_rate": 3.8358632720517695e-06, "loss": 0.75409293, "num_input_tokens_seen": 56334005, "step": 2594, "time_per_iteration": 2.708063840866089 }, { "auxiliary_loss_clip": 0.01117989, "auxiliary_loss_mlp": 0.01040106, "balance_loss_clip": 1.0453043, "balance_loss_mlp": 1.02077007, "epoch": 0.15601984067338043, "flos": 26722135503360.0, "grad_norm": 2.1444704922101105, "language_loss": 0.81569934, "learning_rate": 3.835708722764952e-06, "loss": 0.83728027, "num_input_tokens_seen": 56353795, "step": 2595, "time_per_iteration": 2.716334581375122 }, { "auxiliary_loss_clip": 0.01155359, "auxiliary_loss_mlp": 0.01043269, "balance_loss_clip": 1.05093551, "balance_loss_mlp": 1.0238502, "epoch": 0.1560799639260484, "flos": 18369278173440.0, "grad_norm": 1.8943501893042642, "language_loss": 0.86674929, "learning_rate": 3.835554103867876e-06, "loss": 0.88873553, "num_input_tokens_seen": 56373195, "step": 2596, "time_per_iteration": 2.5947446823120117 }, { "auxiliary_loss_clip": 0.01144729, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.05225515, "balance_loss_mlp": 1.02360725, "epoch": 0.15614008717871636, "flos": 22598980197120.0, "grad_norm": 1.8059460934517404, "language_loss": 0.68772388, "learning_rate": 3.835399415366404e-06, "loss": 0.70959222, "num_input_tokens_seen": 56391525, "step": 2597, "time_per_iteration": 2.8101041316986084 }, { "auxiliary_loss_clip": 0.01130069, "auxiliary_loss_mlp": 0.01050835, "balance_loss_clip": 1.05409336, "balance_loss_mlp": 1.03165436, "epoch": 0.15620021043138435, "flos": 22746860490240.0, "grad_norm": 1.9103744906429732, "language_loss": 0.79860938, "learning_rate": 3.8352446572664035e-06, "loss": 0.82041842, "num_input_tokens_seen": 56410715, "step": 2598, "time_per_iteration": 2.695117950439453 }, { "auxiliary_loss_clip": 0.0112861, "auxiliary_loss_mlp": 0.00776118, "balance_loss_clip": 1.04750216, "balance_loss_mlp": 1.0006249, "epoch": 0.15626033368405232, "flos": 13114936782720.0, "grad_norm": 3.1104681024188827, "language_loss": 0.83092594, "learning_rate": 3.8350898295737405e-06, "loss": 0.84997326, "num_input_tokens_seen": 56429170, "step": 2599, "time_per_iteration": 2.665703773498535 }, { "auxiliary_loss_clip": 0.01160593, "auxiliary_loss_mlp": 0.0105002, "balance_loss_clip": 1.05274248, "balance_loss_mlp": 1.02924192, "epoch": 0.15632045693672028, "flos": 16472297105280.0, "grad_norm": 2.2910683048406266, "language_loss": 0.81530893, "learning_rate": 3.834934932294287e-06, "loss": 0.83741504, "num_input_tokens_seen": 56445685, "step": 2600, "time_per_iteration": 2.615651845932007 }, { "auxiliary_loss_clip": 0.01161023, "auxiliary_loss_mlp": 0.00776671, "balance_loss_clip": 1.05562234, "balance_loss_mlp": 1.00063944, "epoch": 0.15638058018938825, "flos": 20850346298880.0, "grad_norm": 1.7832591469657297, "language_loss": 0.88511437, "learning_rate": 3.834779965433917e-06, "loss": 0.90449131, "num_input_tokens_seen": 56465900, "step": 2601, "time_per_iteration": 2.6833529472351074 }, { "auxiliary_loss_clip": 0.0116257, "auxiliary_loss_mlp": 0.0106307, "balance_loss_clip": 1.05569744, "balance_loss_mlp": 1.04120743, "epoch": 0.1564407034420562, "flos": 21872220318720.0, "grad_norm": 1.9421054688538308, "language_loss": 0.78707534, "learning_rate": 3.834624928998508e-06, "loss": 0.80933177, "num_input_tokens_seen": 56485020, "step": 2602, "time_per_iteration": 2.6296608448028564 }, { "auxiliary_loss_clip": 0.01126653, "auxiliary_loss_mlp": 0.01043676, "balance_loss_clip": 1.05035329, "balance_loss_mlp": 1.02419758, "epoch": 0.15650082669472418, "flos": 21834549930240.0, "grad_norm": 1.8230718276715763, "language_loss": 0.74029547, "learning_rate": 3.8344698229939376e-06, "loss": 0.76199877, "num_input_tokens_seen": 56505205, "step": 2603, "time_per_iteration": 2.744508743286133 }, { "auxiliary_loss_clip": 0.01143305, "auxiliary_loss_mlp": 0.01051047, "balance_loss_clip": 1.04820418, "balance_loss_mlp": 1.03112721, "epoch": 0.15656094994739214, "flos": 13800542653440.0, "grad_norm": 4.041164356714064, "language_loss": 0.87723601, "learning_rate": 3.8343146474260865e-06, "loss": 0.89917958, "num_input_tokens_seen": 56521495, "step": 2604, "time_per_iteration": 2.682457447052002 }, { "auxiliary_loss_clip": 0.01145351, "auxiliary_loss_mlp": 0.01044759, "balance_loss_clip": 1.04976749, "balance_loss_mlp": 1.0256021, "epoch": 0.15662107320006013, "flos": 27308197808640.0, "grad_norm": 2.260429022209425, "language_loss": 0.8573193, "learning_rate": 3.834159402300841e-06, "loss": 0.87922043, "num_input_tokens_seen": 56540665, "step": 2605, "time_per_iteration": 2.7724974155426025 }, { "auxiliary_loss_clip": 0.0115108, "auxiliary_loss_mlp": 0.01047256, "balance_loss_clip": 1.05181313, "balance_loss_mlp": 1.02676356, "epoch": 0.1566811964527281, "flos": 26685075646080.0, "grad_norm": 1.7309636492693905, "language_loss": 0.73101914, "learning_rate": 3.834004087624087e-06, "loss": 0.75300246, "num_input_tokens_seen": 56560805, "step": 2606, "time_per_iteration": 2.7490081787109375 }, { "auxiliary_loss_clip": 0.01158388, "auxiliary_loss_mlp": 0.01049752, "balance_loss_clip": 1.0552665, "balance_loss_mlp": 1.03165627, "epoch": 0.15674131970539606, "flos": 16103422385280.0, "grad_norm": 2.968092109370304, "language_loss": 0.76497948, "learning_rate": 3.8338487034017145e-06, "loss": 0.78706092, "num_input_tokens_seen": 56576335, "step": 2607, "time_per_iteration": 2.6597230434417725 }, { "auxiliary_loss_clip": 0.01120645, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.05131412, "balance_loss_mlp": 1.0284934, "epoch": 0.15680144295806403, "flos": 19169690889600.0, "grad_norm": 1.7981763092074996, "language_loss": 0.82107675, "learning_rate": 3.833693249639615e-06, "loss": 0.84275496, "num_input_tokens_seen": 56595880, "step": 2608, "time_per_iteration": 2.7072103023529053 }, { "auxiliary_loss_clip": 0.0112834, "auxiliary_loss_mlp": 0.01045106, "balance_loss_clip": 1.04685056, "balance_loss_mlp": 1.02436399, "epoch": 0.156861566210732, "flos": 20813430096000.0, "grad_norm": 1.6817301031159713, "language_loss": 0.72335941, "learning_rate": 3.833537726343684e-06, "loss": 0.74509382, "num_input_tokens_seen": 56615130, "step": 2609, "time_per_iteration": 2.690690755844116 }, { "auxiliary_loss_clip": 0.01143972, "auxiliary_loss_mlp": 0.01036718, "balance_loss_clip": 1.04901087, "balance_loss_mlp": 1.01756072, "epoch": 0.15692168946339996, "flos": 20047922421120.0, "grad_norm": 5.132438477880424, "language_loss": 0.72317064, "learning_rate": 3.833382133519818e-06, "loss": 0.74497753, "num_input_tokens_seen": 56634005, "step": 2610, "time_per_iteration": 2.6515614986419678 }, { "auxiliary_loss_clip": 0.01159588, "auxiliary_loss_mlp": 0.01051513, "balance_loss_clip": 1.05216432, "balance_loss_mlp": 1.03063977, "epoch": 0.15698181271606793, "flos": 21398019943680.0, "grad_norm": 2.0600295188113935, "language_loss": 0.72915608, "learning_rate": 3.833226471173919e-06, "loss": 0.75126708, "num_input_tokens_seen": 56653480, "step": 2611, "time_per_iteration": 2.630988359451294 }, { "auxiliary_loss_clip": 0.01141924, "auxiliary_loss_mlp": 0.01042538, "balance_loss_clip": 1.04917872, "balance_loss_mlp": 1.0231905, "epoch": 0.15704193596873592, "flos": 20845785271680.0, "grad_norm": 2.0339762399532186, "language_loss": 0.70766544, "learning_rate": 3.833070739311887e-06, "loss": 0.72951007, "num_input_tokens_seen": 56672270, "step": 2612, "time_per_iteration": 2.6569461822509766 }, { "auxiliary_loss_clip": 0.01116284, "auxiliary_loss_mlp": 0.01051299, "balance_loss_clip": 1.04844582, "balance_loss_mlp": 1.03221321, "epoch": 0.15710205922140388, "flos": 21762908254080.0, "grad_norm": 1.9704781930994688, "language_loss": 0.76294881, "learning_rate": 3.83291493793963e-06, "loss": 0.78462464, "num_input_tokens_seen": 56691510, "step": 2613, "time_per_iteration": 2.7188539505004883 }, { "auxiliary_loss_clip": 0.01115155, "auxiliary_loss_mlp": 0.01049301, "balance_loss_clip": 1.04504919, "balance_loss_mlp": 1.02956033, "epoch": 0.15716218247407185, "flos": 25007760201600.0, "grad_norm": 2.137998057111896, "language_loss": 0.65944499, "learning_rate": 3.832759067063055e-06, "loss": 0.68108952, "num_input_tokens_seen": 56712230, "step": 2614, "time_per_iteration": 2.7550084590911865 }, { "auxiliary_loss_clip": 0.01151987, "auxiliary_loss_mlp": 0.01044173, "balance_loss_clip": 1.05387104, "balance_loss_mlp": 1.02374101, "epoch": 0.1572223057267398, "flos": 20191780391040.0, "grad_norm": 2.2755662506820915, "language_loss": 0.75204211, "learning_rate": 3.832603126688072e-06, "loss": 0.77400374, "num_input_tokens_seen": 56727490, "step": 2615, "time_per_iteration": 2.683225154876709 }, { "auxiliary_loss_clip": 0.01138545, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.05209839, "balance_loss_mlp": 1.03078008, "epoch": 0.15728242897940778, "flos": 20959514709120.0, "grad_norm": 2.581872009488739, "language_loss": 0.73064095, "learning_rate": 3.832447116820594e-06, "loss": 0.75253528, "num_input_tokens_seen": 56747385, "step": 2616, "time_per_iteration": 2.6660919189453125 }, { "auxiliary_loss_clip": 0.01130717, "auxiliary_loss_mlp": 0.01047511, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.02794933, "epoch": 0.15734255223207574, "flos": 23038275530880.0, "grad_norm": 2.813587490853999, "language_loss": 0.72425079, "learning_rate": 3.832291037466539e-06, "loss": 0.74603307, "num_input_tokens_seen": 56768055, "step": 2617, "time_per_iteration": 2.768561363220215 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.0104637, "balance_loss_clip": 1.04947805, "balance_loss_mlp": 1.02548432, "epoch": 0.15740267548474374, "flos": 20551281661440.0, "grad_norm": 2.3222819484870016, "language_loss": 0.74358094, "learning_rate": 3.8321348886318235e-06, "loss": 0.76542777, "num_input_tokens_seen": 56785110, "step": 2618, "time_per_iteration": 2.66121768951416 }, { "auxiliary_loss_clip": 0.01162954, "auxiliary_loss_mlp": 0.01046178, "balance_loss_clip": 1.05417252, "balance_loss_mlp": 1.02526867, "epoch": 0.1574627987374117, "flos": 22666922772480.0, "grad_norm": 1.8808629075569874, "language_loss": 0.78896272, "learning_rate": 3.8319786703223695e-06, "loss": 0.81105405, "num_input_tokens_seen": 56804975, "step": 2619, "time_per_iteration": 2.6743338108062744 }, { "auxiliary_loss_clip": 0.01126081, "auxiliary_loss_mlp": 0.01055551, "balance_loss_clip": 1.05046356, "balance_loss_mlp": 1.03576207, "epoch": 0.15752292199007967, "flos": 16800664262400.0, "grad_norm": 1.9082963728737496, "language_loss": 0.76517296, "learning_rate": 3.831822382544101e-06, "loss": 0.78698927, "num_input_tokens_seen": 56822470, "step": 2620, "time_per_iteration": 2.6481080055236816 }, { "auxiliary_loss_clip": 0.01136128, "auxiliary_loss_mlp": 0.0104575, "balance_loss_clip": 1.05097985, "balance_loss_mlp": 1.02488887, "epoch": 0.15758304524274763, "flos": 29826002568960.0, "grad_norm": 1.6603432400664486, "language_loss": 0.7136035, "learning_rate": 3.831666025302944e-06, "loss": 0.73542225, "num_input_tokens_seen": 56842100, "step": 2621, "time_per_iteration": 2.70985746383667 }, { "auxiliary_loss_clip": 0.01103274, "auxiliary_loss_mlp": 0.01052522, "balance_loss_clip": 1.04624665, "balance_loss_mlp": 1.02921629, "epoch": 0.1576431684954156, "flos": 53577426723840.0, "grad_norm": 2.1843515622778624, "language_loss": 0.72136736, "learning_rate": 3.831509598604828e-06, "loss": 0.74292529, "num_input_tokens_seen": 56865920, "step": 2622, "time_per_iteration": 3.024561643600464 }, { "auxiliary_loss_clip": 0.01095163, "auxiliary_loss_mlp": 0.01043948, "balance_loss_clip": 1.04474711, "balance_loss_mlp": 1.02464843, "epoch": 0.15770329174808356, "flos": 20813609664000.0, "grad_norm": 1.6586715789846178, "language_loss": 0.87637675, "learning_rate": 3.831353102455684e-06, "loss": 0.8977679, "num_input_tokens_seen": 56885265, "step": 2623, "time_per_iteration": 2.9600114822387695 }, { "auxiliary_loss_clip": 0.01158714, "auxiliary_loss_mlp": 0.01044337, "balance_loss_clip": 1.05476475, "balance_loss_mlp": 1.02564478, "epoch": 0.15776341500075153, "flos": 24974004395520.0, "grad_norm": 1.6915331173398198, "language_loss": 0.81600082, "learning_rate": 3.831196536861448e-06, "loss": 0.83803129, "num_input_tokens_seen": 56906710, "step": 2624, "time_per_iteration": 2.6621103286743164 }, { "auxiliary_loss_clip": 0.01122344, "auxiliary_loss_mlp": 0.01049423, "balance_loss_clip": 1.04776418, "balance_loss_mlp": 1.02990842, "epoch": 0.15782353825341952, "flos": 21907915459200.0, "grad_norm": 2.879465237309773, "language_loss": 0.79977828, "learning_rate": 3.831039901828054e-06, "loss": 0.82149595, "num_input_tokens_seen": 56924275, "step": 2625, "time_per_iteration": 2.7291064262390137 }, { "auxiliary_loss_clip": 0.01157938, "auxiliary_loss_mlp": 0.01046203, "balance_loss_clip": 1.05403268, "balance_loss_mlp": 1.02857196, "epoch": 0.15788366150608749, "flos": 26177191292160.0, "grad_norm": 2.133783972400447, "language_loss": 0.80332482, "learning_rate": 3.830883197361445e-06, "loss": 0.8253662, "num_input_tokens_seen": 56941525, "step": 2626, "time_per_iteration": 4.252760171890259 }, { "auxiliary_loss_clip": 0.01102762, "auxiliary_loss_mlp": 0.01057658, "balance_loss_clip": 1.05214024, "balance_loss_mlp": 1.03512752, "epoch": 0.15794378475875545, "flos": 27709822753920.0, "grad_norm": 3.9802810067864045, "language_loss": 0.73636395, "learning_rate": 3.830726423467561e-06, "loss": 0.75796819, "num_input_tokens_seen": 56962145, "step": 2627, "time_per_iteration": 4.328871250152588 }, { "auxiliary_loss_clip": 0.01117433, "auxiliary_loss_mlp": 0.01055032, "balance_loss_clip": 1.0503006, "balance_loss_mlp": 1.0351001, "epoch": 0.15800390801142342, "flos": 12130158533760.0, "grad_norm": 2.0211273696228216, "language_loss": 0.84589541, "learning_rate": 3.830569580152348e-06, "loss": 0.86762005, "num_input_tokens_seen": 56977505, "step": 2628, "time_per_iteration": 2.6785013675689697 }, { "auxiliary_loss_clip": 0.01129476, "auxiliary_loss_mlp": 0.01040858, "balance_loss_clip": 1.05065978, "balance_loss_mlp": 1.02308416, "epoch": 0.15806403126409138, "flos": 20704728562560.0, "grad_norm": 1.897214582222077, "language_loss": 0.76437485, "learning_rate": 3.830412667421752e-06, "loss": 0.78607821, "num_input_tokens_seen": 56996770, "step": 2629, "time_per_iteration": 4.2878499031066895 }, { "auxiliary_loss_clip": 0.01143973, "auxiliary_loss_mlp": 0.01046449, "balance_loss_clip": 1.0529623, "balance_loss_mlp": 1.02675569, "epoch": 0.15812415451675935, "flos": 17821712269440.0, "grad_norm": 2.252423233454998, "language_loss": 0.73337436, "learning_rate": 3.8302556852817245e-06, "loss": 0.75527859, "num_input_tokens_seen": 57014970, "step": 2630, "time_per_iteration": 4.253108263015747 }, { "auxiliary_loss_clip": 0.01156261, "auxiliary_loss_mlp": 0.01045602, "balance_loss_clip": 1.05644512, "balance_loss_mlp": 1.02615929, "epoch": 0.15818427776942734, "flos": 20084048524800.0, "grad_norm": 2.390369083551665, "language_loss": 0.83678091, "learning_rate": 3.8300986337382184e-06, "loss": 0.85879952, "num_input_tokens_seen": 57034045, "step": 2631, "time_per_iteration": 2.6145882606506348 }, { "auxiliary_loss_clip": 0.01159092, "auxiliary_loss_mlp": 0.01045772, "balance_loss_clip": 1.05313432, "balance_loss_mlp": 1.02746117, "epoch": 0.1582444010220953, "flos": 21214911386880.0, "grad_norm": 1.8755653224160422, "language_loss": 0.78415525, "learning_rate": 3.8299415127971895e-06, "loss": 0.80620384, "num_input_tokens_seen": 57053695, "step": 2632, "time_per_iteration": 2.656691551208496 }, { "auxiliary_loss_clip": 0.01151481, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05574381, "balance_loss_mlp": 1.03769732, "epoch": 0.15830452427476327, "flos": 17858341163520.0, "grad_norm": 2.079450153413421, "language_loss": 0.8301838, "learning_rate": 3.829784322464594e-06, "loss": 0.85227144, "num_input_tokens_seen": 57071290, "step": 2633, "time_per_iteration": 2.622725248336792 }, { "auxiliary_loss_clip": 0.01165069, "auxiliary_loss_mlp": 0.01041545, "balance_loss_clip": 1.05761647, "balance_loss_mlp": 1.02223265, "epoch": 0.15836464752743123, "flos": 24534960456960.0, "grad_norm": 2.1719104392782813, "language_loss": 0.77448404, "learning_rate": 3.829627062746394e-06, "loss": 0.79655015, "num_input_tokens_seen": 57091465, "step": 2634, "time_per_iteration": 2.6383235454559326 }, { "auxiliary_loss_clip": 0.01127407, "auxiliary_loss_mlp": 0.00777775, "balance_loss_clip": 1.05277348, "balance_loss_mlp": 1.00136137, "epoch": 0.1584247707800992, "flos": 20120821073280.0, "grad_norm": 3.5133527254089087, "language_loss": 0.88479185, "learning_rate": 3.829469733648552e-06, "loss": 0.90384364, "num_input_tokens_seen": 57110075, "step": 2635, "time_per_iteration": 2.725924491882324 }, { "auxiliary_loss_clip": 0.01096223, "auxiliary_loss_mlp": 0.01058885, "balance_loss_clip": 1.04816198, "balance_loss_mlp": 1.03847599, "epoch": 0.15848489403276717, "flos": 20375966355840.0, "grad_norm": 2.8627721083207627, "language_loss": 0.75762677, "learning_rate": 3.829312335177034e-06, "loss": 0.77917778, "num_input_tokens_seen": 57128945, "step": 2636, "time_per_iteration": 2.775310516357422 }, { "auxiliary_loss_clip": 0.01120174, "auxiliary_loss_mlp": 0.01043834, "balance_loss_clip": 1.05117822, "balance_loss_mlp": 1.02350879, "epoch": 0.15854501728543513, "flos": 39346890359040.0, "grad_norm": 2.388418559522659, "language_loss": 0.71977961, "learning_rate": 3.82915486733781e-06, "loss": 0.74141967, "num_input_tokens_seen": 57152385, "step": 2637, "time_per_iteration": 2.8375279903411865 }, { "auxiliary_loss_clip": 0.0115052, "auxiliary_loss_mlp": 0.01044842, "balance_loss_clip": 1.05661607, "balance_loss_mlp": 1.02640057, "epoch": 0.15860514053810312, "flos": 24864225454080.0, "grad_norm": 2.1640345554565057, "language_loss": 0.78352648, "learning_rate": 3.82899733013685e-06, "loss": 0.80548006, "num_input_tokens_seen": 57172620, "step": 2638, "time_per_iteration": 2.7298176288604736 }, { "auxiliary_loss_clip": 0.01129706, "auxiliary_loss_mlp": 0.01057375, "balance_loss_clip": 1.05311394, "balance_loss_mlp": 1.03715718, "epoch": 0.1586652637907711, "flos": 26177694082560.0, "grad_norm": 2.325769963269074, "language_loss": 0.75845039, "learning_rate": 3.828839723580128e-06, "loss": 0.78032124, "num_input_tokens_seen": 57194680, "step": 2639, "time_per_iteration": 2.7731449604034424 }, { "auxiliary_loss_clip": 0.01104856, "auxiliary_loss_mlp": 0.01057283, "balance_loss_clip": 1.05350864, "balance_loss_mlp": 1.03772068, "epoch": 0.15872538704343905, "flos": 19792058866560.0, "grad_norm": 2.173238447343554, "language_loss": 0.81319505, "learning_rate": 3.82868204767362e-06, "loss": 0.83481646, "num_input_tokens_seen": 57214675, "step": 2640, "time_per_iteration": 2.8024139404296875 }, { "auxiliary_loss_clip": 0.01135166, "auxiliary_loss_mlp": 0.01054673, "balance_loss_clip": 1.05492401, "balance_loss_mlp": 1.03426492, "epoch": 0.15878551029610702, "flos": 28475366342400.0, "grad_norm": 2.013499020988034, "language_loss": 0.66893363, "learning_rate": 3.828524302423306e-06, "loss": 0.69083202, "num_input_tokens_seen": 57235830, "step": 2641, "time_per_iteration": 2.7519116401672363 }, { "auxiliary_loss_clip": 0.01149448, "auxiliary_loss_mlp": 0.01051949, "balance_loss_clip": 1.05758858, "balance_loss_mlp": 1.0326376, "epoch": 0.15884563354877498, "flos": 24206701040640.0, "grad_norm": 2.139760259286454, "language_loss": 0.7552591, "learning_rate": 3.828366487835167e-06, "loss": 0.77727306, "num_input_tokens_seen": 57255970, "step": 2642, "time_per_iteration": 2.706136465072632 }, { "auxiliary_loss_clip": 0.01156917, "auxiliary_loss_mlp": 0.01042142, "balance_loss_clip": 1.06263423, "balance_loss_mlp": 1.02323556, "epoch": 0.15890575680144295, "flos": 23949795991680.0, "grad_norm": 1.9419610036505286, "language_loss": 0.70564604, "learning_rate": 3.828208603915186e-06, "loss": 0.72763658, "num_input_tokens_seen": 57274435, "step": 2643, "time_per_iteration": 2.682015895843506 }, { "auxiliary_loss_clip": 0.01161783, "auxiliary_loss_mlp": 0.01041643, "balance_loss_clip": 1.05891204, "balance_loss_mlp": 1.02389312, "epoch": 0.15896588005411091, "flos": 21215019127680.0, "grad_norm": 1.846517711414915, "language_loss": 0.78057045, "learning_rate": 3.828050650669353e-06, "loss": 0.80260473, "num_input_tokens_seen": 57293115, "step": 2644, "time_per_iteration": 2.683790922164917 }, { "auxiliary_loss_clip": 0.01151239, "auxiliary_loss_mlp": 0.01050105, "balance_loss_clip": 1.05701637, "balance_loss_mlp": 1.03154373, "epoch": 0.1590260033067789, "flos": 24352390604160.0, "grad_norm": 3.757920662841351, "language_loss": 0.81961924, "learning_rate": 3.827892628103657e-06, "loss": 0.84163266, "num_input_tokens_seen": 57312565, "step": 2645, "time_per_iteration": 2.698085069656372 }, { "auxiliary_loss_clip": 0.01162748, "auxiliary_loss_mlp": 0.01048492, "balance_loss_clip": 1.05487716, "balance_loss_mlp": 1.02854836, "epoch": 0.15908612655944687, "flos": 32048944583040.0, "grad_norm": 2.056693785790565, "language_loss": 0.69412929, "learning_rate": 3.827734536224087e-06, "loss": 0.71624172, "num_input_tokens_seen": 57333360, "step": 2646, "time_per_iteration": 2.7166528701782227 }, { "auxiliary_loss_clip": 0.01135067, "auxiliary_loss_mlp": 0.01040314, "balance_loss_clip": 1.05435526, "balance_loss_mlp": 1.02223015, "epoch": 0.15914624981211484, "flos": 17785370684160.0, "grad_norm": 2.5975497323405055, "language_loss": 0.62932581, "learning_rate": 3.827576375036642e-06, "loss": 0.65107965, "num_input_tokens_seen": 57350575, "step": 2647, "time_per_iteration": 2.7405354976654053 }, { "auxiliary_loss_clip": 0.01160144, "auxiliary_loss_mlp": 0.01047955, "balance_loss_clip": 1.05654776, "balance_loss_mlp": 1.02896523, "epoch": 0.1592063730647828, "flos": 17712507945600.0, "grad_norm": 2.2161421076431025, "language_loss": 0.89490473, "learning_rate": 3.827418144547318e-06, "loss": 0.91698575, "num_input_tokens_seen": 57367570, "step": 2648, "time_per_iteration": 2.6193346977233887 }, { "auxiliary_loss_clip": 0.01158791, "auxiliary_loss_mlp": 0.01048086, "balance_loss_clip": 1.05630398, "balance_loss_mlp": 1.03072906, "epoch": 0.15926649631745077, "flos": 18803545603200.0, "grad_norm": 1.9960039108301237, "language_loss": 0.91307199, "learning_rate": 3.827259844762114e-06, "loss": 0.93514073, "num_input_tokens_seen": 57383980, "step": 2649, "time_per_iteration": 2.6137378215789795 }, { "auxiliary_loss_clip": 0.01099661, "auxiliary_loss_mlp": 0.01044384, "balance_loss_clip": 1.05474401, "balance_loss_mlp": 1.02439272, "epoch": 0.15932661957011873, "flos": 17566243764480.0, "grad_norm": 2.3504548368335767, "language_loss": 0.71782613, "learning_rate": 3.827101475687033e-06, "loss": 0.73926663, "num_input_tokens_seen": 57400840, "step": 2650, "time_per_iteration": 2.8883376121520996 }, { "auxiliary_loss_clip": 0.01146809, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.05386841, "balance_loss_mlp": 1.02476835, "epoch": 0.15938674282278673, "flos": 13334351011200.0, "grad_norm": 1.8238326955956992, "language_loss": 0.71427429, "learning_rate": 3.826943037328082e-06, "loss": 0.73616046, "num_input_tokens_seen": 57419230, "step": 2651, "time_per_iteration": 2.607879638671875 }, { "auxiliary_loss_clip": 0.01118842, "auxiliary_loss_mlp": 0.00777496, "balance_loss_clip": 1.05154157, "balance_loss_mlp": 1.00132799, "epoch": 0.1594468660754547, "flos": 22488842119680.0, "grad_norm": 1.8928974850955373, "language_loss": 0.80185902, "learning_rate": 3.8267845296912674e-06, "loss": 0.82082248, "num_input_tokens_seen": 57439315, "step": 2652, "time_per_iteration": 2.718695640563965 }, { "auxiliary_loss_clip": 0.01138048, "auxiliary_loss_mlp": 0.00775, "balance_loss_clip": 1.0567826, "balance_loss_mlp": 1.00124729, "epoch": 0.15950698932812266, "flos": 15007320910080.0, "grad_norm": 2.6116065834427387, "language_loss": 0.69539076, "learning_rate": 3.826625952782601e-06, "loss": 0.71452117, "num_input_tokens_seen": 57454635, "step": 2653, "time_per_iteration": 2.7088639736175537 }, { "auxiliary_loss_clip": 0.01144826, "auxiliary_loss_mlp": 0.01038735, "balance_loss_clip": 1.05257821, "balance_loss_mlp": 1.02050805, "epoch": 0.15956711258079062, "flos": 30155052084480.0, "grad_norm": 2.1937273620657307, "language_loss": 0.76670635, "learning_rate": 3.826467306608095e-06, "loss": 0.78854191, "num_input_tokens_seen": 57476805, "step": 2654, "time_per_iteration": 2.79425048828125 }, { "auxiliary_loss_clip": 0.01114313, "auxiliary_loss_mlp": 0.01041134, "balance_loss_clip": 1.04714727, "balance_loss_mlp": 1.02248931, "epoch": 0.1596272358334586, "flos": 21032700670080.0, "grad_norm": 2.0572535633716247, "language_loss": 0.81873977, "learning_rate": 3.826308591173765e-06, "loss": 0.84029424, "num_input_tokens_seen": 57496400, "step": 2655, "time_per_iteration": 2.6990878582000732 }, { "auxiliary_loss_clip": 0.01112525, "auxiliary_loss_mlp": 0.01046346, "balance_loss_clip": 1.04670715, "balance_loss_mlp": 1.02849984, "epoch": 0.15968735908612655, "flos": 15268032800640.0, "grad_norm": 2.0964800101687486, "language_loss": 0.73768878, "learning_rate": 3.826149806485631e-06, "loss": 0.75927746, "num_input_tokens_seen": 57513700, "step": 2656, "time_per_iteration": 2.7409873008728027 }, { "auxiliary_loss_clip": 0.01111218, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.04749918, "balance_loss_mlp": 1.02220988, "epoch": 0.15974748233879452, "flos": 52665726695040.0, "grad_norm": 2.516351978408242, "language_loss": 0.77637637, "learning_rate": 3.825990952549713e-06, "loss": 0.79788804, "num_input_tokens_seen": 57536180, "step": 2657, "time_per_iteration": 2.984161376953125 }, { "auxiliary_loss_clip": 0.01142397, "auxiliary_loss_mlp": 0.01048058, "balance_loss_clip": 1.05276513, "balance_loss_mlp": 1.02984321, "epoch": 0.1598076055914625, "flos": 18733232730240.0, "grad_norm": 2.1741432296797303, "language_loss": 0.74654955, "learning_rate": 3.825832029372035e-06, "loss": 0.76845407, "num_input_tokens_seen": 57555025, "step": 2658, "time_per_iteration": 2.6795172691345215 }, { "auxiliary_loss_clip": 0.01137294, "auxiliary_loss_mlp": 0.01047097, "balance_loss_clip": 1.05887127, "balance_loss_mlp": 1.02581763, "epoch": 0.15986772884413047, "flos": 34349238535680.0, "grad_norm": 2.2676743120149916, "language_loss": 0.75164986, "learning_rate": 3.825673036958624e-06, "loss": 0.77349377, "num_input_tokens_seen": 57577660, "step": 2659, "time_per_iteration": 2.885744094848633 }, { "auxiliary_loss_clip": 0.01122752, "auxiliary_loss_mlp": 0.0105323, "balance_loss_clip": 1.0512991, "balance_loss_mlp": 1.0334295, "epoch": 0.15992785209679844, "flos": 22054969739520.0, "grad_norm": 2.181311046841435, "language_loss": 0.90998709, "learning_rate": 3.825513975315508e-06, "loss": 0.93174696, "num_input_tokens_seen": 57596335, "step": 2660, "time_per_iteration": 2.7562267780303955 }, { "auxiliary_loss_clip": 0.01114547, "auxiliary_loss_mlp": 0.01058378, "balance_loss_clip": 1.05538487, "balance_loss_mlp": 1.03590751, "epoch": 0.1599879753494664, "flos": 33066652625280.0, "grad_norm": 1.746468400789071, "language_loss": 0.77724659, "learning_rate": 3.82535484444872e-06, "loss": 0.79897583, "num_input_tokens_seen": 57616830, "step": 2661, "time_per_iteration": 2.9896914958953857 }, { "auxiliary_loss_clip": 0.0113781, "auxiliary_loss_mlp": 0.00777461, "balance_loss_clip": 1.05382478, "balance_loss_mlp": 1.00132632, "epoch": 0.16004809860213437, "flos": 28038010343040.0, "grad_norm": 2.0483033922540086, "language_loss": 0.74442393, "learning_rate": 3.825195644364292e-06, "loss": 0.76357663, "num_input_tokens_seen": 57635515, "step": 2662, "time_per_iteration": 2.7993714809417725 }, { "auxiliary_loss_clip": 0.01135674, "auxiliary_loss_mlp": 0.00780783, "balance_loss_clip": 1.05392313, "balance_loss_mlp": 1.0016191, "epoch": 0.16010822185480234, "flos": 22780113505920.0, "grad_norm": 2.9903694104875984, "language_loss": 0.82515085, "learning_rate": 3.825036375068263e-06, "loss": 0.84431541, "num_input_tokens_seen": 57654250, "step": 2663, "time_per_iteration": 2.678490161895752 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01044917, "balance_loss_clip": 1.05182636, "balance_loss_mlp": 1.02574801, "epoch": 0.16016834510747033, "flos": 20084012611200.0, "grad_norm": 2.06786422122115, "language_loss": 0.7951405, "learning_rate": 3.824877036566672e-06, "loss": 0.81671166, "num_input_tokens_seen": 57672645, "step": 2664, "time_per_iteration": 2.819880962371826 }, { "auxiliary_loss_clip": 0.01151449, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.05374622, "balance_loss_mlp": 1.02886605, "epoch": 0.1602284683601383, "flos": 21173829206400.0, "grad_norm": 1.6697703441146605, "language_loss": 0.93748474, "learning_rate": 3.824717628865561e-06, "loss": 0.95947945, "num_input_tokens_seen": 57691055, "step": 2665, "time_per_iteration": 2.697660446166992 }, { "auxiliary_loss_clip": 0.01127607, "auxiliary_loss_mlp": 0.01047415, "balance_loss_clip": 1.05185676, "balance_loss_mlp": 1.02774525, "epoch": 0.16028859161280626, "flos": 14647568244480.0, "grad_norm": 2.9655602739253095, "language_loss": 0.85237324, "learning_rate": 3.824558151970974e-06, "loss": 0.87412339, "num_input_tokens_seen": 57707235, "step": 2666, "time_per_iteration": 4.282273530960083 }, { "auxiliary_loss_clip": 0.01129818, "auxiliary_loss_mlp": 0.00777125, "balance_loss_clip": 1.05257225, "balance_loss_mlp": 1.00145936, "epoch": 0.16034871486547422, "flos": 20990325600000.0, "grad_norm": 1.8366839898970433, "language_loss": 0.81284773, "learning_rate": 3.8243986058889595e-06, "loss": 0.83191717, "num_input_tokens_seen": 57724190, "step": 2667, "time_per_iteration": 2.69508695602417 }, { "auxiliary_loss_clip": 0.0116556, "auxiliary_loss_mlp": 0.01046526, "balance_loss_clip": 1.06089485, "balance_loss_mlp": 1.02643883, "epoch": 0.1604088381181422, "flos": 21397732634880.0, "grad_norm": 1.958935842080623, "language_loss": 0.74031079, "learning_rate": 3.824238990625567e-06, "loss": 0.76243162, "num_input_tokens_seen": 57743620, "step": 2668, "time_per_iteration": 4.2559425830841064 }, { "auxiliary_loss_clip": 0.01148853, "auxiliary_loss_mlp": 0.01051992, "balance_loss_clip": 1.05547619, "balance_loss_mlp": 1.03240585, "epoch": 0.16046896137081015, "flos": 23877040993920.0, "grad_norm": 1.7737626564305047, "language_loss": 0.77495629, "learning_rate": 3.824079306186848e-06, "loss": 0.7969647, "num_input_tokens_seen": 57764810, "step": 2669, "time_per_iteration": 2.6424050331115723 }, { "auxiliary_loss_clip": 0.01097339, "auxiliary_loss_mlp": 0.01012737, "balance_loss_clip": 1.06351233, "balance_loss_mlp": 1.00986385, "epoch": 0.16052908462347812, "flos": 59806709015040.0, "grad_norm": 0.8041290684345284, "language_loss": 0.5549804, "learning_rate": 3.823919552578861e-06, "loss": 0.57608116, "num_input_tokens_seen": 57824390, "step": 2670, "time_per_iteration": 4.765664100646973 }, { "auxiliary_loss_clip": 0.01149639, "auxiliary_loss_mlp": 0.01043383, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02430916, "epoch": 0.1605892078761461, "flos": 18296559089280.0, "grad_norm": 2.6306224128650464, "language_loss": 0.77778888, "learning_rate": 3.82375972980766e-06, "loss": 0.7997191, "num_input_tokens_seen": 57843665, "step": 2671, "time_per_iteration": 2.6876416206359863 }, { "auxiliary_loss_clip": 0.01151164, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05529547, "balance_loss_mlp": 1.02503204, "epoch": 0.16064933112881408, "flos": 32160734686080.0, "grad_norm": 1.9167251889277674, "language_loss": 0.64766788, "learning_rate": 3.8235998378793086e-06, "loss": 0.66961908, "num_input_tokens_seen": 57863305, "step": 2672, "time_per_iteration": 2.7102553844451904 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01046785, "balance_loss_clip": 1.05674481, "balance_loss_mlp": 1.02554154, "epoch": 0.16070945438148204, "flos": 19828795501440.0, "grad_norm": 2.045175098484539, "language_loss": 0.85708207, "learning_rate": 3.8234398767998675e-06, "loss": 0.87905198, "num_input_tokens_seen": 57883025, "step": 2673, "time_per_iteration": 2.656360626220703 }, { "auxiliary_loss_clip": 0.01125542, "auxiliary_loss_mlp": 0.01055838, "balance_loss_clip": 1.05366015, "balance_loss_mlp": 1.03716969, "epoch": 0.16076957763415, "flos": 18913144976640.0, "grad_norm": 2.339006860757087, "language_loss": 0.7289716, "learning_rate": 3.823279846575403e-06, "loss": 0.75078535, "num_input_tokens_seen": 57901430, "step": 2674, "time_per_iteration": 2.7122414112091064 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.01045468, "balance_loss_clip": 1.05416465, "balance_loss_mlp": 1.02464211, "epoch": 0.16082970088681797, "flos": 16764358590720.0, "grad_norm": 1.9341682597436423, "language_loss": 0.84438515, "learning_rate": 3.823119747211986e-06, "loss": 0.86634052, "num_input_tokens_seen": 57919550, "step": 2675, "time_per_iteration": 2.6646435260772705 }, { "auxiliary_loss_clip": 0.01116221, "auxiliary_loss_mlp": 0.01049343, "balance_loss_clip": 1.05220723, "balance_loss_mlp": 1.02823126, "epoch": 0.16088982413948594, "flos": 35150261783040.0, "grad_norm": 1.871909119220515, "language_loss": 0.82216591, "learning_rate": 3.822959578715685e-06, "loss": 0.84382153, "num_input_tokens_seen": 57939890, "step": 2676, "time_per_iteration": 2.8457534313201904 }, { "auxiliary_loss_clip": 0.01151157, "auxiliary_loss_mlp": 0.01049874, "balance_loss_clip": 1.05746996, "balance_loss_mlp": 1.03162253, "epoch": 0.1609499473921539, "flos": 18625105814400.0, "grad_norm": 2.1166154816193923, "language_loss": 0.73485494, "learning_rate": 3.822799341092573e-06, "loss": 0.75686526, "num_input_tokens_seen": 57957410, "step": 2677, "time_per_iteration": 2.65387225151062 }, { "auxiliary_loss_clip": 0.01138188, "auxiliary_loss_mlp": 0.01044363, "balance_loss_clip": 1.05438483, "balance_loss_mlp": 1.02537322, "epoch": 0.1610100706448219, "flos": 33145728416640.0, "grad_norm": 3.229282061984371, "language_loss": 0.76305777, "learning_rate": 3.822639034348728e-06, "loss": 0.78488332, "num_input_tokens_seen": 57977900, "step": 2678, "time_per_iteration": 2.836071014404297 }, { "auxiliary_loss_clip": 0.01148252, "auxiliary_loss_mlp": 0.01047887, "balance_loss_clip": 1.05379987, "balance_loss_mlp": 1.02789569, "epoch": 0.16107019389748986, "flos": 34676707852800.0, "grad_norm": 8.295814069484678, "language_loss": 0.70340431, "learning_rate": 3.822478658490228e-06, "loss": 0.7253657, "num_input_tokens_seen": 57998210, "step": 2679, "time_per_iteration": 2.771185874938965 }, { "auxiliary_loss_clip": 0.01059502, "auxiliary_loss_mlp": 0.00758644, "balance_loss_clip": 1.04695845, "balance_loss_mlp": 1.00150955, "epoch": 0.16113031715015783, "flos": 65713403260800.0, "grad_norm": 0.7819629653273137, "language_loss": 0.51843339, "learning_rate": 3.822318213523154e-06, "loss": 0.53661484, "num_input_tokens_seen": 58059420, "step": 2680, "time_per_iteration": 3.3107378482818604 }, { "auxiliary_loss_clip": 0.01144342, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.05360317, "balance_loss_mlp": 1.02632904, "epoch": 0.1611904404028258, "flos": 20810413353600.0, "grad_norm": 1.6718368455031125, "language_loss": 0.8028667, "learning_rate": 3.8221576994535925e-06, "loss": 0.82478368, "num_input_tokens_seen": 58078370, "step": 2681, "time_per_iteration": 2.6986513137817383 }, { "auxiliary_loss_clip": 0.01139192, "auxiliary_loss_mlp": 0.01055518, "balance_loss_clip": 1.05603266, "balance_loss_mlp": 1.03602743, "epoch": 0.16125056365549376, "flos": 27013335062400.0, "grad_norm": 2.154781054673542, "language_loss": 0.68957973, "learning_rate": 3.821997116287627e-06, "loss": 0.71152687, "num_input_tokens_seen": 58097395, "step": 2682, "time_per_iteration": 2.794686794281006 }, { "auxiliary_loss_clip": 0.01139216, "auxiliary_loss_mlp": 0.01052349, "balance_loss_clip": 1.05670619, "balance_loss_mlp": 1.03195262, "epoch": 0.16131068690816172, "flos": 19276524915840.0, "grad_norm": 1.9802191055590168, "language_loss": 0.87362224, "learning_rate": 3.821836464031348e-06, "loss": 0.89553785, "num_input_tokens_seen": 58115630, "step": 2683, "time_per_iteration": 2.703634262084961 }, { "auxiliary_loss_clip": 0.01165497, "auxiliary_loss_mlp": 0.0105575, "balance_loss_clip": 1.05714059, "balance_loss_mlp": 1.03491259, "epoch": 0.16137081016082971, "flos": 35337931367040.0, "grad_norm": 1.939499216066865, "language_loss": 0.74143028, "learning_rate": 3.821675742690849e-06, "loss": 0.76364273, "num_input_tokens_seen": 58138655, "step": 2684, "time_per_iteration": 2.7890264987945557 }, { "auxiliary_loss_clip": 0.01136683, "auxiliary_loss_mlp": 0.00778989, "balance_loss_clip": 1.05435085, "balance_loss_mlp": 1.00176883, "epoch": 0.16143093341349768, "flos": 34235257703040.0, "grad_norm": 1.9009911635557044, "language_loss": 0.70506597, "learning_rate": 3.821514952272223e-06, "loss": 0.72422272, "num_input_tokens_seen": 58157440, "step": 2685, "time_per_iteration": 2.803942918777466 }, { "auxiliary_loss_clip": 0.01116315, "auxiliary_loss_mlp": 0.01059092, "balance_loss_clip": 1.05291295, "balance_loss_mlp": 1.03757524, "epoch": 0.16149105666616564, "flos": 27999262546560.0, "grad_norm": 2.295686008167468, "language_loss": 0.72060591, "learning_rate": 3.821354092781567e-06, "loss": 0.74236, "num_input_tokens_seen": 58176660, "step": 2686, "time_per_iteration": 2.850309133529663 }, { "auxiliary_loss_clip": 0.01153803, "auxiliary_loss_mlp": 0.01048887, "balance_loss_clip": 1.05603862, "balance_loss_mlp": 1.02922952, "epoch": 0.1615511799188336, "flos": 19422214479360.0, "grad_norm": 2.056921120199424, "language_loss": 0.81720114, "learning_rate": 3.821193164224981e-06, "loss": 0.83922803, "num_input_tokens_seen": 58195085, "step": 2687, "time_per_iteration": 2.7077832221984863 }, { "auxiliary_loss_clip": 0.01154388, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.05335689, "balance_loss_mlp": 1.02910483, "epoch": 0.16161130317150157, "flos": 22854915578880.0, "grad_norm": 1.6747986106054085, "language_loss": 0.71680355, "learning_rate": 3.821032166608568e-06, "loss": 0.73885429, "num_input_tokens_seen": 58213540, "step": 2688, "time_per_iteration": 2.700073480606079 }, { "auxiliary_loss_clip": 0.0112226, "auxiliary_loss_mlp": 0.0105252, "balance_loss_clip": 1.0517168, "balance_loss_mlp": 1.03330338, "epoch": 0.16167142642416954, "flos": 26110577520000.0, "grad_norm": 2.2887064413695253, "language_loss": 0.76168394, "learning_rate": 3.8208710999384325e-06, "loss": 0.78343177, "num_input_tokens_seen": 58236995, "step": 2689, "time_per_iteration": 2.846964120864868 }, { "auxiliary_loss_clip": 0.01166324, "auxiliary_loss_mlp": 0.01052979, "balance_loss_clip": 1.05979431, "balance_loss_mlp": 1.03308284, "epoch": 0.1617315496768375, "flos": 22779646629120.0, "grad_norm": 2.045037041298705, "language_loss": 0.87211925, "learning_rate": 3.820709964220683e-06, "loss": 0.89431226, "num_input_tokens_seen": 58257230, "step": 2690, "time_per_iteration": 2.704497814178467 }, { "auxiliary_loss_clip": 0.01143898, "auxiliary_loss_mlp": 0.01046571, "balance_loss_clip": 1.05318451, "balance_loss_mlp": 1.02890396, "epoch": 0.1617916729295055, "flos": 22017299351040.0, "grad_norm": 1.7518031225399346, "language_loss": 0.87899524, "learning_rate": 3.8205487594614284e-06, "loss": 0.90089989, "num_input_tokens_seen": 58277080, "step": 2691, "time_per_iteration": 2.6763153076171875 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01053114, "balance_loss_clip": 1.05237532, "balance_loss_mlp": 1.03142977, "epoch": 0.16185179618217346, "flos": 23438248450560.0, "grad_norm": 2.1723450057475313, "language_loss": 0.81989783, "learning_rate": 3.820387485666784e-06, "loss": 0.84193164, "num_input_tokens_seen": 58294815, "step": 2692, "time_per_iteration": 2.6381001472473145 }, { "auxiliary_loss_clip": 0.01167881, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.05555534, "balance_loss_mlp": 1.02499604, "epoch": 0.16191191943484143, "flos": 25666110627840.0, "grad_norm": 2.194958172554253, "language_loss": 0.81381011, "learning_rate": 3.820226142842862e-06, "loss": 0.83594954, "num_input_tokens_seen": 58313215, "step": 2693, "time_per_iteration": 2.6366944313049316 }, { "auxiliary_loss_clip": 0.01164466, "auxiliary_loss_mlp": 0.01058298, "balance_loss_clip": 1.0587461, "balance_loss_mlp": 1.03991616, "epoch": 0.1619720426875094, "flos": 23477355383040.0, "grad_norm": 2.778189532536263, "language_loss": 0.83837044, "learning_rate": 3.820064730995783e-06, "loss": 0.86059809, "num_input_tokens_seen": 58333215, "step": 2694, "time_per_iteration": 2.7802140712738037 }, { "auxiliary_loss_clip": 0.01116209, "auxiliary_loss_mlp": 0.0105764, "balance_loss_clip": 1.04927421, "balance_loss_mlp": 1.0366354, "epoch": 0.16203216594017736, "flos": 24133658734080.0, "grad_norm": 1.8201511645490482, "language_loss": 0.69709098, "learning_rate": 3.819903250131667e-06, "loss": 0.71882945, "num_input_tokens_seen": 58351160, "step": 2695, "time_per_iteration": 2.756904125213623 }, { "auxiliary_loss_clip": 0.01155526, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.05799723, "balance_loss_mlp": 1.03026128, "epoch": 0.16209228919284532, "flos": 22340889999360.0, "grad_norm": 2.1550523064219487, "language_loss": 0.82986331, "learning_rate": 3.819741700256637e-06, "loss": 0.85192692, "num_input_tokens_seen": 58368505, "step": 2696, "time_per_iteration": 2.651510238647461 }, { "auxiliary_loss_clip": 0.01174193, "auxiliary_loss_mlp": 0.01052819, "balance_loss_clip": 1.05826569, "balance_loss_mlp": 1.03095615, "epoch": 0.1621524124455133, "flos": 15815131827840.0, "grad_norm": 2.9267990143146503, "language_loss": 0.8862049, "learning_rate": 3.8195800813768194e-06, "loss": 0.90847504, "num_input_tokens_seen": 58385085, "step": 2697, "time_per_iteration": 2.5935380458831787 }, { "auxiliary_loss_clip": 0.01158945, "auxiliary_loss_mlp": 0.01045471, "balance_loss_clip": 1.0552485, "balance_loss_mlp": 1.02719641, "epoch": 0.16221253569818128, "flos": 30186688988160.0, "grad_norm": 1.7480298293719791, "language_loss": 0.80844599, "learning_rate": 3.819418393498343e-06, "loss": 0.83049017, "num_input_tokens_seen": 58406985, "step": 2698, "time_per_iteration": 2.6685965061187744 }, { "auxiliary_loss_clip": 0.01151678, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05785704, "balance_loss_mlp": 1.03060579, "epoch": 0.16227265895084925, "flos": 24605991601920.0, "grad_norm": 1.590231062064763, "language_loss": 0.77499473, "learning_rate": 3.819256636627339e-06, "loss": 0.79701245, "num_input_tokens_seen": 58426205, "step": 2699, "time_per_iteration": 2.7206287384033203 }, { "auxiliary_loss_clip": 0.01134482, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.0504272, "balance_loss_mlp": 1.02510071, "epoch": 0.1623327822035172, "flos": 19573326996480.0, "grad_norm": 2.299083669251571, "language_loss": 0.85903585, "learning_rate": 3.81909481076994e-06, "loss": 0.88081944, "num_input_tokens_seen": 58443830, "step": 2700, "time_per_iteration": 2.6440224647521973 }, { "auxiliary_loss_clip": 0.01150266, "auxiliary_loss_mlp": 0.00778348, "balance_loss_clip": 1.05360484, "balance_loss_mlp": 1.00180686, "epoch": 0.16239290545618518, "flos": 26468462678400.0, "grad_norm": 1.7679372116400307, "language_loss": 0.80424523, "learning_rate": 3.818932915932284e-06, "loss": 0.82353133, "num_input_tokens_seen": 58464405, "step": 2701, "time_per_iteration": 2.6943976879119873 }, { "auxiliary_loss_clip": 0.01144477, "auxiliary_loss_mlp": 0.01046291, "balance_loss_clip": 1.05771017, "balance_loss_mlp": 1.02664542, "epoch": 0.16245302870885314, "flos": 15851940289920.0, "grad_norm": 1.6539412057050027, "language_loss": 0.72777367, "learning_rate": 3.818770952120511e-06, "loss": 0.74968135, "num_input_tokens_seen": 58483295, "step": 2702, "time_per_iteration": 2.6914141178131104 }, { "auxiliary_loss_clip": 0.01156069, "auxiliary_loss_mlp": 0.01050141, "balance_loss_clip": 1.05802381, "balance_loss_mlp": 1.02896905, "epoch": 0.1625131519615211, "flos": 14756521173120.0, "grad_norm": 1.8265391375227176, "language_loss": 0.7273894, "learning_rate": 3.81860891934076e-06, "loss": 0.74945152, "num_input_tokens_seen": 58501205, "step": 2703, "time_per_iteration": 2.6301820278167725 }, { "auxiliary_loss_clip": 0.01165642, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.0553968, "balance_loss_mlp": 1.02942359, "epoch": 0.1625732752141891, "flos": 28220508368640.0, "grad_norm": 3.0329584489902666, "language_loss": 0.70018482, "learning_rate": 3.818446817599176e-06, "loss": 0.72234988, "num_input_tokens_seen": 58522315, "step": 2704, "time_per_iteration": 2.6667227745056152 }, { "auxiliary_loss_clip": 0.01034679, "auxiliary_loss_mlp": 0.01001657, "balance_loss_clip": 1.03343439, "balance_loss_mlp": 0.99865305, "epoch": 0.16263339846685707, "flos": 67327947688320.0, "grad_norm": 0.7801109588151329, "language_loss": 0.5336051, "learning_rate": 3.818284646901907e-06, "loss": 0.55396849, "num_input_tokens_seen": 58586695, "step": 2705, "time_per_iteration": 4.808594465255737 }, { "auxiliary_loss_clip": 0.01138628, "auxiliary_loss_mlp": 0.00781324, "balance_loss_clip": 1.0539608, "balance_loss_mlp": 1.00171995, "epoch": 0.16269352171952503, "flos": 14319165173760.0, "grad_norm": 2.3827832530074455, "language_loss": 0.7536028, "learning_rate": 3.818122407255102e-06, "loss": 0.77280229, "num_input_tokens_seen": 58602435, "step": 2706, "time_per_iteration": 4.126614570617676 }, { "auxiliary_loss_clip": 0.01130684, "auxiliary_loss_mlp": 0.01047489, "balance_loss_clip": 1.0523324, "balance_loss_mlp": 1.02859437, "epoch": 0.162753644972193, "flos": 28361205941760.0, "grad_norm": 2.2272392184651038, "language_loss": 0.72203928, "learning_rate": 3.817960098664914e-06, "loss": 0.74382102, "num_input_tokens_seen": 58621275, "step": 2707, "time_per_iteration": 4.2739410400390625 }, { "auxiliary_loss_clip": 0.01142142, "auxiliary_loss_mlp": 0.01047652, "balance_loss_clip": 1.05433679, "balance_loss_mlp": 1.02898431, "epoch": 0.16281376822486096, "flos": 19937856170880.0, "grad_norm": 3.192481802987827, "language_loss": 0.83481139, "learning_rate": 3.817797721137495e-06, "loss": 0.85670936, "num_input_tokens_seen": 58637550, "step": 2708, "time_per_iteration": 2.7163965702056885 }, { "auxiliary_loss_clip": 0.01101561, "auxiliary_loss_mlp": 0.00781217, "balance_loss_clip": 1.04896522, "balance_loss_mlp": 1.00177419, "epoch": 0.16287389147752893, "flos": 21251719848960.0, "grad_norm": 2.2850459718507654, "language_loss": 0.86162847, "learning_rate": 3.817635274679006e-06, "loss": 0.88045627, "num_input_tokens_seen": 58654135, "step": 2709, "time_per_iteration": 4.474989652633667 }, { "auxiliary_loss_clip": 0.0114031, "auxiliary_loss_mlp": 0.00777602, "balance_loss_clip": 1.05267572, "balance_loss_mlp": 1.00172114, "epoch": 0.1629340147301969, "flos": 19244672530560.0, "grad_norm": 2.581053296112052, "language_loss": 0.91410124, "learning_rate": 3.817472759295605e-06, "loss": 0.93328035, "num_input_tokens_seen": 58674320, "step": 2710, "time_per_iteration": 2.6951892375946045 }, { "auxiliary_loss_clip": 0.01118597, "auxiliary_loss_mlp": 0.01054854, "balance_loss_clip": 1.05254805, "balance_loss_mlp": 1.03451669, "epoch": 0.16299413798286488, "flos": 21249816428160.0, "grad_norm": 2.4322540773438437, "language_loss": 0.81690979, "learning_rate": 3.817310174993453e-06, "loss": 0.83864427, "num_input_tokens_seen": 58691000, "step": 2711, "time_per_iteration": 2.7854437828063965 }, { "auxiliary_loss_clip": 0.01146056, "auxiliary_loss_mlp": 0.01040648, "balance_loss_clip": 1.04954815, "balance_loss_mlp": 1.02107334, "epoch": 0.16305426123553285, "flos": 18770579896320.0, "grad_norm": 3.73256798888747, "language_loss": 0.8091476, "learning_rate": 3.817147521778719e-06, "loss": 0.83101463, "num_input_tokens_seen": 58710230, "step": 2712, "time_per_iteration": 2.834291458129883 }, { "auxiliary_loss_clip": 0.01171211, "auxiliary_loss_mlp": 0.01053015, "balance_loss_clip": 1.0590024, "balance_loss_mlp": 1.03273714, "epoch": 0.16311438448820081, "flos": 22087648137600.0, "grad_norm": 2.3460895846171996, "language_loss": 0.7681579, "learning_rate": 3.816984799657568e-06, "loss": 0.79040015, "num_input_tokens_seen": 58728610, "step": 2713, "time_per_iteration": 2.6188278198242188 }, { "auxiliary_loss_clip": 0.01156539, "auxiliary_loss_mlp": 0.0105792, "balance_loss_clip": 1.06240916, "balance_loss_mlp": 1.03832221, "epoch": 0.16317450774086878, "flos": 16467700164480.0, "grad_norm": 2.543173325075216, "language_loss": 0.79012156, "learning_rate": 3.8168220086361715e-06, "loss": 0.81226611, "num_input_tokens_seen": 58744385, "step": 2714, "time_per_iteration": 2.6534018516540527 }, { "auxiliary_loss_clip": 0.01149567, "auxiliary_loss_mlp": 0.01056152, "balance_loss_clip": 1.05467987, "balance_loss_mlp": 1.03724504, "epoch": 0.16323463099353674, "flos": 24352929308160.0, "grad_norm": 1.614702766215493, "language_loss": 0.77693665, "learning_rate": 3.816659148720702e-06, "loss": 0.79899377, "num_input_tokens_seen": 58763905, "step": 2715, "time_per_iteration": 2.856006383895874 }, { "auxiliary_loss_clip": 0.01129437, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.04810584, "balance_loss_mlp": 1.02525854, "epoch": 0.1632947542462047, "flos": 24900782520960.0, "grad_norm": 2.374975046722651, "language_loss": 0.81513858, "learning_rate": 3.816496219917336e-06, "loss": 0.83687335, "num_input_tokens_seen": 58785580, "step": 2716, "time_per_iteration": 2.6750845909118652 }, { "auxiliary_loss_clip": 0.01144393, "auxiliary_loss_mlp": 0.01055927, "balance_loss_clip": 1.05851114, "balance_loss_mlp": 1.03703237, "epoch": 0.1633548774988727, "flos": 24900279730560.0, "grad_norm": 1.8186679286330678, "language_loss": 0.86522418, "learning_rate": 3.816333222232251e-06, "loss": 0.88722742, "num_input_tokens_seen": 58806075, "step": 2717, "time_per_iteration": 2.761622428894043 }, { "auxiliary_loss_clip": 0.01135377, "auxiliary_loss_mlp": 0.01045964, "balance_loss_clip": 1.05334044, "balance_loss_mlp": 1.0274632, "epoch": 0.16341500075154067, "flos": 30441798357120.0, "grad_norm": 1.8799656187942837, "language_loss": 0.76924133, "learning_rate": 3.816170155671629e-06, "loss": 0.79105473, "num_input_tokens_seen": 58827405, "step": 2718, "time_per_iteration": 2.7946770191192627 }, { "auxiliary_loss_clip": 0.01145146, "auxiliary_loss_mlp": 0.01043682, "balance_loss_clip": 1.05553615, "balance_loss_mlp": 1.02566922, "epoch": 0.16347512400420863, "flos": 22784530878720.0, "grad_norm": 2.2449478392049906, "language_loss": 0.73827291, "learning_rate": 3.816007020241652e-06, "loss": 0.76016116, "num_input_tokens_seen": 58847205, "step": 2719, "time_per_iteration": 2.719980478286743 }, { "auxiliary_loss_clip": 0.01128361, "auxiliary_loss_mlp": 0.01045887, "balance_loss_clip": 1.04900515, "balance_loss_mlp": 1.02732563, "epoch": 0.1635352472568766, "flos": 22633274707200.0, "grad_norm": 1.7092252575708884, "language_loss": 0.72267497, "learning_rate": 3.815843815948507e-06, "loss": 0.74441749, "num_input_tokens_seen": 58866865, "step": 2720, "time_per_iteration": 2.8737292289733887 }, { "auxiliary_loss_clip": 0.01109456, "auxiliary_loss_mlp": 0.01049703, "balance_loss_clip": 1.05004287, "balance_loss_mlp": 1.02840054, "epoch": 0.16359537050954456, "flos": 15522998515200.0, "grad_norm": 2.1621365878543153, "language_loss": 0.75120997, "learning_rate": 3.8156805427983824e-06, "loss": 0.77280164, "num_input_tokens_seen": 58885200, "step": 2721, "time_per_iteration": 2.785296678543091 }, { "auxiliary_loss_clip": 0.01110342, "auxiliary_loss_mlp": 0.01059955, "balance_loss_clip": 1.04597676, "balance_loss_mlp": 1.03734064, "epoch": 0.16365549376221253, "flos": 22090162089600.0, "grad_norm": 1.9032438792006017, "language_loss": 0.79073942, "learning_rate": 3.8155172007974695e-06, "loss": 0.81244236, "num_input_tokens_seen": 58906385, "step": 2722, "time_per_iteration": 2.7850708961486816 }, { "auxiliary_loss_clip": 0.01149809, "auxiliary_loss_mlp": 0.00778798, "balance_loss_clip": 1.05395257, "balance_loss_mlp": 1.00171757, "epoch": 0.1637156170148805, "flos": 24060400945920.0, "grad_norm": 2.3019049903761215, "language_loss": 0.84954333, "learning_rate": 3.8153537899519624e-06, "loss": 0.86882937, "num_input_tokens_seen": 58925040, "step": 2723, "time_per_iteration": 2.7268764972686768 }, { "auxiliary_loss_clip": 0.01108328, "auxiliary_loss_mlp": 0.01044851, "balance_loss_clip": 1.04805517, "balance_loss_mlp": 1.02493143, "epoch": 0.1637757402675485, "flos": 26685362954880.0, "grad_norm": 1.8985615531712963, "language_loss": 0.71018666, "learning_rate": 3.815190310268058e-06, "loss": 0.73171842, "num_input_tokens_seen": 58944790, "step": 2724, "time_per_iteration": 2.7691783905029297 }, { "auxiliary_loss_clip": 0.01118053, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.05226958, "balance_loss_mlp": 1.02364373, "epoch": 0.16383586352021645, "flos": 16106941918080.0, "grad_norm": 2.1059770262776136, "language_loss": 0.70552838, "learning_rate": 3.815026761751955e-06, "loss": 0.72712779, "num_input_tokens_seen": 58962500, "step": 2725, "time_per_iteration": 2.6936957836151123 }, { "auxiliary_loss_clip": 0.01112368, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.04912174, "balance_loss_mlp": 1.028391, "epoch": 0.16389598677288442, "flos": 19165991788800.0, "grad_norm": 2.27810298992254, "language_loss": 0.88491893, "learning_rate": 3.814863144409855e-06, "loss": 0.90650856, "num_input_tokens_seen": 58980355, "step": 2726, "time_per_iteration": 2.7967143058776855 }, { "auxiliary_loss_clip": 0.01157668, "auxiliary_loss_mlp": 0.0105068, "balance_loss_clip": 1.06062055, "balance_loss_mlp": 1.03099847, "epoch": 0.16395611002555238, "flos": 21507008785920.0, "grad_norm": 2.0584475237926303, "language_loss": 0.7469939, "learning_rate": 3.814699458247963e-06, "loss": 0.7690773, "num_input_tokens_seen": 58999505, "step": 2727, "time_per_iteration": 2.6818623542785645 }, { "auxiliary_loss_clip": 0.01150971, "auxiliary_loss_mlp": 0.01052077, "balance_loss_clip": 1.0570507, "balance_loss_mlp": 1.03527999, "epoch": 0.16401623327822035, "flos": 21470918595840.0, "grad_norm": 1.6112579442237729, "language_loss": 0.83097756, "learning_rate": 3.8145357032724855e-06, "loss": 0.85300803, "num_input_tokens_seen": 59017930, "step": 2728, "time_per_iteration": 2.675360918045044 }, { "auxiliary_loss_clip": 0.01156153, "auxiliary_loss_mlp": 0.01045609, "balance_loss_clip": 1.05826735, "balance_loss_mlp": 1.02602315, "epoch": 0.1640763565308883, "flos": 13626232928640.0, "grad_norm": 2.5738755626941106, "language_loss": 0.84892929, "learning_rate": 3.814371879489633e-06, "loss": 0.87094688, "num_input_tokens_seen": 59035130, "step": 2729, "time_per_iteration": 2.7004599571228027 }, { "auxiliary_loss_clip": 0.01167293, "auxiliary_loss_mlp": 0.01048461, "balance_loss_clip": 1.0591594, "balance_loss_mlp": 1.03053224, "epoch": 0.16413647978355628, "flos": 15451464579840.0, "grad_norm": 1.9897225699042427, "language_loss": 0.72895479, "learning_rate": 3.814207986905616e-06, "loss": 0.75111228, "num_input_tokens_seen": 59053080, "step": 2730, "time_per_iteration": 2.593179702758789 }, { "auxiliary_loss_clip": 0.01142509, "auxiliary_loss_mlp": 0.01050071, "balance_loss_clip": 1.05208349, "balance_loss_mlp": 1.02908981, "epoch": 0.16419660303622427, "flos": 45878682015360.0, "grad_norm": 1.6754501336017709, "language_loss": 0.74384654, "learning_rate": 3.814044025526651e-06, "loss": 0.76577234, "num_input_tokens_seen": 59075610, "step": 2731, "time_per_iteration": 2.8702962398529053 }, { "auxiliary_loss_clip": 0.01122791, "auxiliary_loss_mlp": 0.01047176, "balance_loss_clip": 1.05006754, "balance_loss_mlp": 1.02650499, "epoch": 0.16425672628889224, "flos": 18952826526720.0, "grad_norm": 2.031351475505915, "language_loss": 0.79190683, "learning_rate": 3.8138799953589548e-06, "loss": 0.8136065, "num_input_tokens_seen": 59094555, "step": 2732, "time_per_iteration": 2.734529972076416 }, { "auxiliary_loss_clip": 0.01141118, "auxiliary_loss_mlp": 0.01047385, "balance_loss_clip": 1.05340672, "balance_loss_mlp": 1.02796555, "epoch": 0.1643168495415602, "flos": 24312996362880.0, "grad_norm": 2.250003976384769, "language_loss": 0.69526887, "learning_rate": 3.8137158964087473e-06, "loss": 0.71715385, "num_input_tokens_seen": 59113515, "step": 2733, "time_per_iteration": 2.672377109527588 }, { "auxiliary_loss_clip": 0.01143332, "auxiliary_loss_mlp": 0.01053232, "balance_loss_clip": 1.05603123, "balance_loss_mlp": 1.0325135, "epoch": 0.16437697279422817, "flos": 26428421992320.0, "grad_norm": 2.000873580428856, "language_loss": 0.80976766, "learning_rate": 3.8135517286822508e-06, "loss": 0.83173329, "num_input_tokens_seen": 59133275, "step": 2734, "time_per_iteration": 2.710293769836426 }, { "auxiliary_loss_clip": 0.01135758, "auxiliary_loss_mlp": 0.01056722, "balance_loss_clip": 1.05488348, "balance_loss_mlp": 1.03470409, "epoch": 0.16443709604689613, "flos": 34532239351680.0, "grad_norm": 2.100664117201308, "language_loss": 0.81810421, "learning_rate": 3.8133874921856914e-06, "loss": 0.840029, "num_input_tokens_seen": 59154095, "step": 2735, "time_per_iteration": 2.8074140548706055 }, { "auxiliary_loss_clip": 0.01070875, "auxiliary_loss_mlp": 0.01044313, "balance_loss_clip": 1.04323888, "balance_loss_mlp": 1.02508426, "epoch": 0.1644972192995641, "flos": 23258048895360.0, "grad_norm": 2.405088987017839, "language_loss": 0.78515649, "learning_rate": 3.813223186925296e-06, "loss": 0.80630839, "num_input_tokens_seen": 59173795, "step": 2736, "time_per_iteration": 2.839087963104248 }, { "auxiliary_loss_clip": 0.01147998, "auxiliary_loss_mlp": 0.01054659, "balance_loss_clip": 1.05859447, "balance_loss_mlp": 1.03513288, "epoch": 0.1645573425522321, "flos": 26979543342720.0, "grad_norm": 1.9462182296456145, "language_loss": 0.81052899, "learning_rate": 3.8130588129072964e-06, "loss": 0.83255553, "num_input_tokens_seen": 59191610, "step": 2737, "time_per_iteration": 2.7328996658325195 }, { "auxiliary_loss_clip": 0.01150424, "auxiliary_loss_mlp": 0.01052207, "balance_loss_clip": 1.0559026, "balance_loss_mlp": 1.03065443, "epoch": 0.16461746580490005, "flos": 28731768600960.0, "grad_norm": 1.8596348168124566, "language_loss": 0.87449318, "learning_rate": 3.8128943701379246e-06, "loss": 0.89651948, "num_input_tokens_seen": 59213000, "step": 2738, "time_per_iteration": 2.7345526218414307 }, { "auxiliary_loss_clip": 0.01139154, "auxiliary_loss_mlp": 0.0106055, "balance_loss_clip": 1.05534518, "balance_loss_mlp": 1.04079759, "epoch": 0.16467758905756802, "flos": 24930156867840.0, "grad_norm": 1.728421510231393, "language_loss": 0.71997833, "learning_rate": 3.8127298586234167e-06, "loss": 0.74197543, "num_input_tokens_seen": 59232340, "step": 2739, "time_per_iteration": 2.7091422080993652 }, { "auxiliary_loss_clip": 0.01154419, "auxiliary_loss_mlp": 0.0105106, "balance_loss_clip": 1.05673754, "balance_loss_mlp": 1.0312835, "epoch": 0.16473771231023598, "flos": 24826519152000.0, "grad_norm": 1.8559436932352185, "language_loss": 0.81645715, "learning_rate": 3.8125652783700104e-06, "loss": 0.83851194, "num_input_tokens_seen": 59253950, "step": 2740, "time_per_iteration": 2.712658166885376 }, { "auxiliary_loss_clip": 0.01114061, "auxiliary_loss_mlp": 0.01068725, "balance_loss_clip": 1.04991829, "balance_loss_mlp": 1.04307163, "epoch": 0.16479783556290395, "flos": 39896072375040.0, "grad_norm": 2.0528021789830837, "language_loss": 0.69467485, "learning_rate": 3.8124006293839475e-06, "loss": 0.71650267, "num_input_tokens_seen": 59275545, "step": 2741, "time_per_iteration": 2.8629493713378906 }, { "auxiliary_loss_clip": 0.01167543, "auxiliary_loss_mlp": 0.01048721, "balance_loss_clip": 1.05907226, "balance_loss_mlp": 1.02906334, "epoch": 0.16485795881557191, "flos": 19897061299200.0, "grad_norm": 1.7765193730452222, "language_loss": 0.79811072, "learning_rate": 3.812235911671472e-06, "loss": 0.8202734, "num_input_tokens_seen": 59293480, "step": 2742, "time_per_iteration": 2.626775026321411 }, { "auxiliary_loss_clip": 0.01141681, "auxiliary_loss_mlp": 0.01055663, "balance_loss_clip": 1.05664062, "balance_loss_mlp": 1.03477716, "epoch": 0.16491808206823988, "flos": 20556129997440.0, "grad_norm": 1.91797408289014, "language_loss": 0.8499459, "learning_rate": 3.8120711252388274e-06, "loss": 0.87191939, "num_input_tokens_seen": 59313435, "step": 2743, "time_per_iteration": 2.8218302726745605 }, { "auxiliary_loss_clip": 0.01162447, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.05743837, "balance_loss_mlp": 1.03196514, "epoch": 0.16497820532090787, "flos": 23800802376960.0, "grad_norm": 1.4425200129075006, "language_loss": 0.85558498, "learning_rate": 3.811906270092265e-06, "loss": 0.87772918, "num_input_tokens_seen": 59331535, "step": 2744, "time_per_iteration": 4.206263542175293 }, { "auxiliary_loss_clip": 0.01131671, "auxiliary_loss_mlp": 0.0104676, "balance_loss_clip": 1.05206287, "balance_loss_mlp": 1.02812767, "epoch": 0.16503832857357584, "flos": 25482642935040.0, "grad_norm": 1.6285200980820358, "language_loss": 0.82770813, "learning_rate": 3.811741346238036e-06, "loss": 0.84949243, "num_input_tokens_seen": 59350680, "step": 2745, "time_per_iteration": 4.331594467163086 }, { "auxiliary_loss_clip": 0.011344, "auxiliary_loss_mlp": 0.01057242, "balance_loss_clip": 1.05874014, "balance_loss_mlp": 1.03825223, "epoch": 0.1650984518262438, "flos": 17676058619520.0, "grad_norm": 6.766690288332402, "language_loss": 0.76811314, "learning_rate": 3.8115763536823923e-06, "loss": 0.79002959, "num_input_tokens_seen": 59367020, "step": 2746, "time_per_iteration": 4.225586414337158 }, { "auxiliary_loss_clip": 0.01164296, "auxiliary_loss_mlp": 0.01055636, "balance_loss_clip": 1.05781221, "balance_loss_mlp": 1.03533494, "epoch": 0.16515857507891177, "flos": 18698327688960.0, "grad_norm": 1.9760186874049024, "language_loss": 0.80818808, "learning_rate": 3.811411292431592e-06, "loss": 0.83038735, "num_input_tokens_seen": 59386075, "step": 2747, "time_per_iteration": 2.6862480640411377 }, { "auxiliary_loss_clip": 0.01157975, "auxiliary_loss_mlp": 0.0104673, "balance_loss_clip": 1.05990267, "balance_loss_mlp": 1.02664328, "epoch": 0.16521869833157973, "flos": 15010481306880.0, "grad_norm": 2.0608482379031337, "language_loss": 0.69433749, "learning_rate": 3.8112461624918945e-06, "loss": 0.71638453, "num_input_tokens_seen": 59402690, "step": 2748, "time_per_iteration": 2.6520986557006836 }, { "auxiliary_loss_clip": 0.01169692, "auxiliary_loss_mlp": 0.00778195, "balance_loss_clip": 1.06237423, "balance_loss_mlp": 1.00173104, "epoch": 0.1652788215842477, "flos": 22121152548480.0, "grad_norm": 2.259215537482641, "language_loss": 0.88012803, "learning_rate": 3.811080963869561e-06, "loss": 0.89960694, "num_input_tokens_seen": 59421130, "step": 2749, "time_per_iteration": 4.260679244995117 }, { "auxiliary_loss_clip": 0.01154179, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05586052, "balance_loss_mlp": 1.02542281, "epoch": 0.16533894483691566, "flos": 18333080242560.0, "grad_norm": 2.0880864906339864, "language_loss": 0.79240286, "learning_rate": 3.8109156965708557e-06, "loss": 0.81440079, "num_input_tokens_seen": 59438970, "step": 2750, "time_per_iteration": 2.6335251331329346 }, { "auxiliary_loss_clip": 0.01153343, "auxiliary_loss_mlp": 0.0104591, "balance_loss_clip": 1.0579437, "balance_loss_mlp": 1.02602625, "epoch": 0.16539906808958366, "flos": 22382115834240.0, "grad_norm": 1.6952801391084946, "language_loss": 0.94854712, "learning_rate": 3.8107503606020455e-06, "loss": 0.97053963, "num_input_tokens_seen": 59458510, "step": 2751, "time_per_iteration": 2.697174310684204 }, { "auxiliary_loss_clip": 0.0106803, "auxiliary_loss_mlp": 0.0105236, "balance_loss_clip": 1.04625726, "balance_loss_mlp": 1.03247619, "epoch": 0.16545919134225162, "flos": 22711093522560.0, "grad_norm": 2.614588592950962, "language_loss": 0.71231711, "learning_rate": 3.8105849559693997e-06, "loss": 0.73352098, "num_input_tokens_seen": 59477110, "step": 2752, "time_per_iteration": 2.7780745029449463 }, { "auxiliary_loss_clip": 0.01090521, "auxiliary_loss_mlp": 0.01022104, "balance_loss_clip": 1.05741131, "balance_loss_mlp": 1.01941013, "epoch": 0.1655193145949196, "flos": 67802974076160.0, "grad_norm": 0.7721529651221379, "language_loss": 0.54058975, "learning_rate": 3.810419482679192e-06, "loss": 0.56171602, "num_input_tokens_seen": 59541155, "step": 2753, "time_per_iteration": 3.3371469974517822 }, { "auxiliary_loss_clip": 0.01163808, "auxiliary_loss_mlp": 0.00778536, "balance_loss_clip": 1.05587018, "balance_loss_mlp": 1.00172091, "epoch": 0.16557943784758755, "flos": 24280389792000.0, "grad_norm": 1.6411537728312637, "language_loss": 0.75436741, "learning_rate": 3.8102539407376954e-06, "loss": 0.7737909, "num_input_tokens_seen": 59561155, "step": 2754, "time_per_iteration": 2.6382133960723877 }, { "auxiliary_loss_clip": 0.01139421, "auxiliary_loss_mlp": 0.01060584, "balance_loss_clip": 1.05406713, "balance_loss_mlp": 1.03768396, "epoch": 0.16563956110025552, "flos": 20083617561600.0, "grad_norm": 2.4067479946694137, "language_loss": 0.86654639, "learning_rate": 3.810088330151188e-06, "loss": 0.88854647, "num_input_tokens_seen": 59580460, "step": 2755, "time_per_iteration": 2.6590075492858887 }, { "auxiliary_loss_clip": 0.01122817, "auxiliary_loss_mlp": 0.01053169, "balance_loss_clip": 1.04948378, "balance_loss_mlp": 1.03293943, "epoch": 0.16569968435292348, "flos": 28034454896640.0, "grad_norm": 1.7268487777137649, "language_loss": 0.73350251, "learning_rate": 3.80992265092595e-06, "loss": 0.75526237, "num_input_tokens_seen": 59600025, "step": 2756, "time_per_iteration": 2.771820545196533 }, { "auxiliary_loss_clip": 0.01128662, "auxiliary_loss_mlp": 0.01049666, "balance_loss_clip": 1.05550277, "balance_loss_mlp": 1.02969813, "epoch": 0.16575980760559147, "flos": 26250233598720.0, "grad_norm": 1.5540667033085804, "language_loss": 0.75308084, "learning_rate": 3.8097569030682636e-06, "loss": 0.77486414, "num_input_tokens_seen": 59620600, "step": 2757, "time_per_iteration": 2.8106157779693604 }, { "auxiliary_loss_clip": 0.01143608, "auxiliary_loss_mlp": 0.01054064, "balance_loss_clip": 1.057634, "balance_loss_mlp": 1.03390563, "epoch": 0.16581993085825944, "flos": 26943955943040.0, "grad_norm": 1.8675154897424497, "language_loss": 0.84604371, "learning_rate": 3.8095910865844137e-06, "loss": 0.86802036, "num_input_tokens_seen": 59641385, "step": 2758, "time_per_iteration": 2.8663368225097656 }, { "auxiliary_loss_clip": 0.01168186, "auxiliary_loss_mlp": 0.01058337, "balance_loss_clip": 1.06166434, "balance_loss_mlp": 1.03952527, "epoch": 0.1658800541109274, "flos": 21653632103040.0, "grad_norm": 2.0824774555850243, "language_loss": 0.78848934, "learning_rate": 3.809425201480689e-06, "loss": 0.81075454, "num_input_tokens_seen": 59659865, "step": 2759, "time_per_iteration": 2.655371904373169 }, { "auxiliary_loss_clip": 0.01098973, "auxiliary_loss_mlp": 0.0104879, "balance_loss_clip": 1.0491066, "balance_loss_mlp": 1.02846527, "epoch": 0.16594017736359537, "flos": 16435488643200.0, "grad_norm": 2.4005603702739613, "language_loss": 0.75130272, "learning_rate": 3.8092592477633793e-06, "loss": 0.77278036, "num_input_tokens_seen": 59678780, "step": 2760, "time_per_iteration": 2.767866611480713 }, { "auxiliary_loss_clip": 0.01117278, "auxiliary_loss_mlp": 0.0104823, "balance_loss_clip": 1.05129814, "balance_loss_mlp": 1.02867997, "epoch": 0.16600030061626334, "flos": 22637297030400.0, "grad_norm": 1.5792623632565632, "language_loss": 0.73425764, "learning_rate": 3.8090932254387774e-06, "loss": 0.75591272, "num_input_tokens_seen": 59698795, "step": 2761, "time_per_iteration": 2.762836456298828 }, { "auxiliary_loss_clip": 0.0113507, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.05250192, "balance_loss_mlp": 1.03018475, "epoch": 0.1660604238689313, "flos": 26396569607040.0, "grad_norm": 2.9515424803015033, "language_loss": 0.88832974, "learning_rate": 3.8089271345131788e-06, "loss": 0.91018462, "num_input_tokens_seen": 59718795, "step": 2762, "time_per_iteration": 2.766324281692505 }, { "auxiliary_loss_clip": 0.01115163, "auxiliary_loss_mlp": 0.01050144, "balance_loss_clip": 1.05208707, "balance_loss_mlp": 1.03080845, "epoch": 0.16612054712159927, "flos": 23039999383680.0, "grad_norm": 1.84507980271118, "language_loss": 0.87992418, "learning_rate": 3.8087609749928822e-06, "loss": 0.90157735, "num_input_tokens_seen": 59737555, "step": 2763, "time_per_iteration": 2.7734055519104004 }, { "auxiliary_loss_clip": 0.01086152, "auxiliary_loss_mlp": 0.01013622, "balance_loss_clip": 1.0448606, "balance_loss_mlp": 1.01065338, "epoch": 0.16618067037426726, "flos": 59241225202560.0, "grad_norm": 0.7790832079967882, "language_loss": 0.59799927, "learning_rate": 3.8085947468841885e-06, "loss": 0.61899698, "num_input_tokens_seen": 59800915, "step": 2764, "time_per_iteration": 3.1728692054748535 }, { "auxiliary_loss_clip": 0.01152232, "auxiliary_loss_mlp": 0.01053607, "balance_loss_clip": 1.05467176, "balance_loss_mlp": 1.03254318, "epoch": 0.16624079362693522, "flos": 27198813916800.0, "grad_norm": 1.7436496772383425, "language_loss": 0.82260036, "learning_rate": 3.808428450193401e-06, "loss": 0.84465873, "num_input_tokens_seen": 59822910, "step": 2765, "time_per_iteration": 2.72440767288208 }, { "auxiliary_loss_clip": 0.01171844, "auxiliary_loss_mlp": 0.01049085, "balance_loss_clip": 1.05882454, "balance_loss_mlp": 1.02746069, "epoch": 0.1663009168796032, "flos": 10925068216320.0, "grad_norm": 2.128015994498251, "language_loss": 0.69980019, "learning_rate": 3.8082620849268244e-06, "loss": 0.72200948, "num_input_tokens_seen": 59838805, "step": 2766, "time_per_iteration": 2.5810647010803223 }, { "auxiliary_loss_clip": 0.0115036, "auxiliary_loss_mlp": 0.01047665, "balance_loss_clip": 1.05772817, "balance_loss_mlp": 1.02792454, "epoch": 0.16636104013227115, "flos": 17894431353600.0, "grad_norm": 2.107381123394178, "language_loss": 0.8845337, "learning_rate": 3.808095651090769e-06, "loss": 0.90651393, "num_input_tokens_seen": 59855345, "step": 2767, "time_per_iteration": 2.659240245819092 }, { "auxiliary_loss_clip": 0.01077283, "auxiliary_loss_mlp": 0.01002999, "balance_loss_clip": 1.046556, "balance_loss_mlp": 1.00020981, "epoch": 0.16642116338493912, "flos": 66726050463360.0, "grad_norm": 0.6403612433239105, "language_loss": 0.5289067, "learning_rate": 3.8079291486915447e-06, "loss": 0.54970956, "num_input_tokens_seen": 59917710, "step": 2768, "time_per_iteration": 3.28488826751709 }, { "auxiliary_loss_clip": 0.01137637, "auxiliary_loss_mlp": 0.01051692, "balance_loss_clip": 1.05451822, "balance_loss_mlp": 1.03034163, "epoch": 0.16648128663760708, "flos": 19026048401280.0, "grad_norm": 2.4342686570828267, "language_loss": 0.84962058, "learning_rate": 3.8077625777354667e-06, "loss": 0.87151396, "num_input_tokens_seen": 59935105, "step": 2769, "time_per_iteration": 2.753257989883423 }, { "auxiliary_loss_clip": 0.01068987, "auxiliary_loss_mlp": 0.0100573, "balance_loss_clip": 1.04678345, "balance_loss_mlp": 1.00316668, "epoch": 0.16654140989027508, "flos": 70134976759680.0, "grad_norm": 0.8107434108728753, "language_loss": 0.57455683, "learning_rate": 3.80759593822885e-06, "loss": 0.59530401, "num_input_tokens_seen": 59984085, "step": 2770, "time_per_iteration": 3.2202906608581543 }, { "auxiliary_loss_clip": 0.01054548, "auxiliary_loss_mlp": 0.01003676, "balance_loss_clip": 1.04637623, "balance_loss_mlp": 1.00086308, "epoch": 0.16660153314294304, "flos": 70272406195200.0, "grad_norm": 0.8940719168038874, "language_loss": 0.56241393, "learning_rate": 3.807429230178015e-06, "loss": 0.58299619, "num_input_tokens_seen": 60043470, "step": 2771, "time_per_iteration": 3.3302085399627686 }, { "auxiliary_loss_clip": 0.01110714, "auxiliary_loss_mlp": 0.01053994, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.03316772, "epoch": 0.166661656395611, "flos": 23075048079360.0, "grad_norm": 2.9137693497887778, "language_loss": 0.70419657, "learning_rate": 3.8072624535892817e-06, "loss": 0.72584367, "num_input_tokens_seen": 60063045, "step": 2772, "time_per_iteration": 2.845414161682129 }, { "auxiliary_loss_clip": 0.0114592, "auxiliary_loss_mlp": 0.01049708, "balance_loss_clip": 1.05082583, "balance_loss_mlp": 1.02923954, "epoch": 0.16672177964827897, "flos": 28366341586560.0, "grad_norm": 2.20945076195277, "language_loss": 0.86324167, "learning_rate": 3.807095608468975e-06, "loss": 0.88519788, "num_input_tokens_seen": 60081945, "step": 2773, "time_per_iteration": 2.669412851333618 }, { "auxiliary_loss_clip": 0.01095425, "auxiliary_loss_mlp": 0.01049097, "balance_loss_clip": 1.04436934, "balance_loss_mlp": 1.0300827, "epoch": 0.16678190290094694, "flos": 19091010147840.0, "grad_norm": 2.0211952616678937, "language_loss": 0.82141376, "learning_rate": 3.8069286948234224e-06, "loss": 0.84285897, "num_input_tokens_seen": 60096820, "step": 2774, "time_per_iteration": 2.7111308574676514 }, { "auxiliary_loss_clip": 0.01123493, "auxiliary_loss_mlp": 0.01045144, "balance_loss_clip": 1.05252421, "balance_loss_mlp": 1.02446127, "epoch": 0.1668420261536149, "flos": 21799106184960.0, "grad_norm": 3.3781068524499, "language_loss": 0.8298822, "learning_rate": 3.806761712658952e-06, "loss": 0.85156858, "num_input_tokens_seen": 60116140, "step": 2775, "time_per_iteration": 2.7367632389068604 }, { "auxiliary_loss_clip": 0.01150495, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.05761933, "balance_loss_mlp": 1.03264022, "epoch": 0.16690214940628287, "flos": 19062533640960.0, "grad_norm": 1.8115651629444076, "language_loss": 0.80919641, "learning_rate": 3.806594661981897e-06, "loss": 0.8312161, "num_input_tokens_seen": 60134235, "step": 2776, "time_per_iteration": 2.651723623275757 }, { "auxiliary_loss_clip": 0.0113775, "auxiliary_loss_mlp": 0.01054199, "balance_loss_clip": 1.05518723, "balance_loss_mlp": 1.0346483, "epoch": 0.16696227265895086, "flos": 18588548747520.0, "grad_norm": 2.7510345221850336, "language_loss": 0.80203485, "learning_rate": 3.8064275427985906e-06, "loss": 0.82395434, "num_input_tokens_seen": 60153275, "step": 2777, "time_per_iteration": 2.6380929946899414 }, { "auxiliary_loss_clip": 0.01147967, "auxiliary_loss_mlp": 0.01045166, "balance_loss_clip": 1.05270481, "balance_loss_mlp": 1.02640271, "epoch": 0.16702239591161883, "flos": 23294139085440.0, "grad_norm": 1.6179722336290305, "language_loss": 0.85384095, "learning_rate": 3.806260355115371e-06, "loss": 0.87577224, "num_input_tokens_seen": 60173215, "step": 2778, "time_per_iteration": 2.754652500152588 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.0531714, "balance_loss_mlp": 1.02148652, "epoch": 0.1670825191642868, "flos": 24425648392320.0, "grad_norm": 3.2091470007324414, "language_loss": 0.74180603, "learning_rate": 3.8060930989385778e-06, "loss": 0.76358056, "num_input_tokens_seen": 60190515, "step": 2779, "time_per_iteration": 2.777193784713745 }, { "auxiliary_loss_clip": 0.01112683, "auxiliary_loss_mlp": 0.00777451, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.0015173, "epoch": 0.16714264241695476, "flos": 26797512193920.0, "grad_norm": 2.127789274190337, "language_loss": 0.6557346, "learning_rate": 3.805925774274554e-06, "loss": 0.67463589, "num_input_tokens_seen": 60211655, "step": 2780, "time_per_iteration": 2.896976947784424 }, { "auxiliary_loss_clip": 0.01120921, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.04843462, "balance_loss_mlp": 1.02547836, "epoch": 0.16720276566962272, "flos": 21835304115840.0, "grad_norm": 2.46647860258999, "language_loss": 0.78422606, "learning_rate": 3.805758381129643e-06, "loss": 0.80589032, "num_input_tokens_seen": 60230860, "step": 2781, "time_per_iteration": 2.725782632827759 }, { "auxiliary_loss_clip": 0.01094692, "auxiliary_loss_mlp": 0.01050104, "balance_loss_clip": 1.04439843, "balance_loss_mlp": 1.03056526, "epoch": 0.1672628889222907, "flos": 21470415805440.0, "grad_norm": 26.23767952829368, "language_loss": 0.75119764, "learning_rate": 3.805590919510193e-06, "loss": 0.77264553, "num_input_tokens_seen": 60250535, "step": 2782, "time_per_iteration": 2.7064197063446045 }, { "auxiliary_loss_clip": 0.01129162, "auxiliary_loss_mlp": 0.01047612, "balance_loss_clip": 1.05152631, "balance_loss_mlp": 1.02764392, "epoch": 0.16732301217495865, "flos": 30774008269440.0, "grad_norm": 2.116531296279042, "language_loss": 0.67398441, "learning_rate": 3.8054233894225547e-06, "loss": 0.69575214, "num_input_tokens_seen": 60269530, "step": 2783, "time_per_iteration": 2.7901556491851807 }, { "auxiliary_loss_clip": 0.01158882, "auxiliary_loss_mlp": 0.0105166, "balance_loss_clip": 1.05460215, "balance_loss_mlp": 1.03271747, "epoch": 0.16738313542762664, "flos": 23474625949440.0, "grad_norm": 1.7768362036873409, "language_loss": 0.69919086, "learning_rate": 3.805255790873081e-06, "loss": 0.72129631, "num_input_tokens_seen": 60289900, "step": 2784, "time_per_iteration": 5.714844226837158 }, { "auxiliary_loss_clip": 0.01137618, "auxiliary_loss_mlp": 0.01056022, "balance_loss_clip": 1.05217624, "balance_loss_mlp": 1.03539932, "epoch": 0.1674432586802946, "flos": 29789086366080.0, "grad_norm": 4.741795209709136, "language_loss": 0.60970068, "learning_rate": 3.805088123868126e-06, "loss": 0.6316371, "num_input_tokens_seen": 60310025, "step": 2785, "time_per_iteration": 4.219547510147095 }, { "auxiliary_loss_clip": 0.01057886, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.03758883, "balance_loss_mlp": 1.00141752, "epoch": 0.16750338193296258, "flos": 66136073575680.0, "grad_norm": 0.773077721474628, "language_loss": 0.58780885, "learning_rate": 3.8049203884140492e-06, "loss": 0.60842752, "num_input_tokens_seen": 60377800, "step": 2786, "time_per_iteration": 3.2306320667266846 }, { "auxiliary_loss_clip": 0.0113927, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.0496738, "balance_loss_mlp": 1.02589226, "epoch": 0.16756350518563054, "flos": 25696777864320.0, "grad_norm": 1.7333132735183339, "language_loss": 0.76308596, "learning_rate": 3.80475258451721e-06, "loss": 0.78492826, "num_input_tokens_seen": 60398215, "step": 2787, "time_per_iteration": 2.6434125900268555 }, { "auxiliary_loss_clip": 0.01146924, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.0529089, "balance_loss_mlp": 1.02544546, "epoch": 0.1676236284382985, "flos": 23836102467840.0, "grad_norm": 1.7210472408736244, "language_loss": 0.7717936, "learning_rate": 3.804584712183972e-06, "loss": 0.79369676, "num_input_tokens_seen": 60416910, "step": 2788, "time_per_iteration": 4.359618425369263 }, { "auxiliary_loss_clip": 0.01054629, "auxiliary_loss_mlp": 0.00999991, "balance_loss_clip": 1.03482509, "balance_loss_mlp": 0.99746382, "epoch": 0.16768375169096647, "flos": 59874902985600.0, "grad_norm": 0.8596744797543817, "language_loss": 0.59331679, "learning_rate": 3.8044167714207013e-06, "loss": 0.61386299, "num_input_tokens_seen": 60468660, "step": 2789, "time_per_iteration": 3.0742650032043457 }, { "auxiliary_loss_clip": 0.01148272, "auxiliary_loss_mlp": 0.01053856, "balance_loss_clip": 1.05450928, "balance_loss_mlp": 1.03428209, "epoch": 0.16774387494363446, "flos": 38435657207040.0, "grad_norm": 1.689036486923415, "language_loss": 0.7012763, "learning_rate": 3.804248762233765e-06, "loss": 0.7232976, "num_input_tokens_seen": 60492370, "step": 2790, "time_per_iteration": 2.872232437133789 }, { "auxiliary_loss_clip": 0.0112492, "auxiliary_loss_mlp": 0.01051622, "balance_loss_clip": 1.0497216, "balance_loss_mlp": 1.0334661, "epoch": 0.16780399819630243, "flos": 22637620252800.0, "grad_norm": 1.864386369112868, "language_loss": 0.79464513, "learning_rate": 3.8040806846295356e-06, "loss": 0.81641054, "num_input_tokens_seen": 60512655, "step": 2791, "time_per_iteration": 2.7180140018463135 }, { "auxiliary_loss_clip": 0.01122456, "auxiliary_loss_mlp": 0.01050939, "balance_loss_clip": 1.04977369, "balance_loss_mlp": 1.03106701, "epoch": 0.1678641214489704, "flos": 32891516887680.0, "grad_norm": 1.705849915566178, "language_loss": 0.71547955, "learning_rate": 3.8039125386143853e-06, "loss": 0.73721349, "num_input_tokens_seen": 60533090, "step": 2792, "time_per_iteration": 2.9221818447113037 }, { "auxiliary_loss_clip": 0.01131469, "auxiliary_loss_mlp": 0.01044061, "balance_loss_clip": 1.05479562, "balance_loss_mlp": 1.02551246, "epoch": 0.16792424470163836, "flos": 19974916028160.0, "grad_norm": 1.9301593564774673, "language_loss": 0.71581644, "learning_rate": 3.803744324194691e-06, "loss": 0.73757172, "num_input_tokens_seen": 60553190, "step": 2793, "time_per_iteration": 2.75104022026062 }, { "auxiliary_loss_clip": 0.01143072, "auxiliary_loss_mlp": 0.01053231, "balance_loss_clip": 1.05276942, "balance_loss_mlp": 1.03452659, "epoch": 0.16798436795430632, "flos": 19719878486400.0, "grad_norm": 2.3859650274226833, "language_loss": 0.7717455, "learning_rate": 3.803576041376831e-06, "loss": 0.79370856, "num_input_tokens_seen": 60571995, "step": 2794, "time_per_iteration": 2.6007745265960693 }, { "auxiliary_loss_clip": 0.01137828, "auxiliary_loss_mlp": 0.0104987, "balance_loss_clip": 1.05250025, "balance_loss_mlp": 1.03010476, "epoch": 0.1680444912069743, "flos": 28104839596800.0, "grad_norm": 2.7692472240964747, "language_loss": 0.71609265, "learning_rate": 3.803407690167187e-06, "loss": 0.73796958, "num_input_tokens_seen": 60591275, "step": 2795, "time_per_iteration": 2.693826198577881 }, { "auxiliary_loss_clip": 0.01131865, "auxiliary_loss_mlp": 0.01041012, "balance_loss_clip": 1.04973865, "balance_loss_mlp": 1.02302384, "epoch": 0.16810461445964225, "flos": 18075205526400.0, "grad_norm": 1.990096863808903, "language_loss": 0.84230494, "learning_rate": 3.803239270572142e-06, "loss": 0.8640337, "num_input_tokens_seen": 60609235, "step": 2796, "time_per_iteration": 2.697253465652466 }, { "auxiliary_loss_clip": 0.01101634, "auxiliary_loss_mlp": 0.01045196, "balance_loss_clip": 1.04877055, "balance_loss_mlp": 1.0262773, "epoch": 0.16816473771231025, "flos": 23878657105920.0, "grad_norm": 1.9272276676322646, "language_loss": 0.81609607, "learning_rate": 3.8030707825980838e-06, "loss": 0.83756441, "num_input_tokens_seen": 60629880, "step": 2797, "time_per_iteration": 2.8784244060516357 }, { "auxiliary_loss_clip": 0.0114057, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.05136061, "balance_loss_mlp": 1.02448523, "epoch": 0.1682248609649782, "flos": 22783597125120.0, "grad_norm": 1.7015769336052518, "language_loss": 0.74811113, "learning_rate": 3.802902226251401e-06, "loss": 0.76992965, "num_input_tokens_seen": 60651175, "step": 2798, "time_per_iteration": 2.700727939605713 }, { "auxiliary_loss_clip": 0.01161342, "auxiliary_loss_mlp": 0.01048462, "balance_loss_clip": 1.05728281, "balance_loss_mlp": 1.03075945, "epoch": 0.16828498421764618, "flos": 20705123612160.0, "grad_norm": 1.5964091182578661, "language_loss": 0.79693568, "learning_rate": 3.8027336015384845e-06, "loss": 0.81903368, "num_input_tokens_seen": 60670210, "step": 2799, "time_per_iteration": 2.6582021713256836 }, { "auxiliary_loss_clip": 0.01077177, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.04514158, "balance_loss_mlp": 1.02374637, "epoch": 0.16834510747031414, "flos": 29420606695680.0, "grad_norm": 4.227726163531211, "language_loss": 0.70963746, "learning_rate": 3.8025649084657296e-06, "loss": 0.73086143, "num_input_tokens_seen": 60690895, "step": 2800, "time_per_iteration": 2.8856699466705322 }, { "auxiliary_loss_clip": 0.01108822, "auxiliary_loss_mlp": 0.00777078, "balance_loss_clip": 1.04776788, "balance_loss_mlp": 1.00161195, "epoch": 0.1684052307229821, "flos": 18145374744960.0, "grad_norm": 1.9902029671619985, "language_loss": 0.83663505, "learning_rate": 3.8023961470395326e-06, "loss": 0.85549408, "num_input_tokens_seen": 60708280, "step": 2801, "time_per_iteration": 2.6917035579681396 }, { "auxiliary_loss_clip": 0.01128148, "auxiliary_loss_mlp": 0.01049324, "balance_loss_clip": 1.05011535, "balance_loss_mlp": 1.03084683, "epoch": 0.16846535397565007, "flos": 16574929240320.0, "grad_norm": 2.4052305427948735, "language_loss": 0.82509923, "learning_rate": 3.8022273172662933e-06, "loss": 0.84687394, "num_input_tokens_seen": 60724150, "step": 2802, "time_per_iteration": 2.882611036300659 }, { "auxiliary_loss_clip": 0.01150156, "auxiliary_loss_mlp": 0.01048717, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.02885723, "epoch": 0.16852547722831807, "flos": 30408868563840.0, "grad_norm": 3.107584498439891, "language_loss": 0.80643189, "learning_rate": 3.802058419152413e-06, "loss": 0.8284207, "num_input_tokens_seen": 60746485, "step": 2803, "time_per_iteration": 2.7886922359466553 }, { "auxiliary_loss_clip": 0.01148107, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.0556829, "balance_loss_mlp": 1.02918339, "epoch": 0.16858560048098603, "flos": 33507420416640.0, "grad_norm": 2.2127389669880713, "language_loss": 0.76168799, "learning_rate": 3.801889452704297e-06, "loss": 0.7836476, "num_input_tokens_seen": 60762875, "step": 2804, "time_per_iteration": 2.7588601112365723 }, { "auxiliary_loss_clip": 0.01045171, "auxiliary_loss_mlp": 0.01013955, "balance_loss_clip": 1.03581083, "balance_loss_mlp": 1.01078367, "epoch": 0.168645723733654, "flos": 67370502326400.0, "grad_norm": 0.8536034833258724, "language_loss": 0.55464876, "learning_rate": 3.8017204179283526e-06, "loss": 0.57524002, "num_input_tokens_seen": 60825510, "step": 2805, "time_per_iteration": 3.2089412212371826 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05013156, "balance_loss_mlp": 1.02239537, "epoch": 0.16870584698632196, "flos": 21324618501120.0, "grad_norm": 2.2836767274778427, "language_loss": 0.73090243, "learning_rate": 3.8015513148309892e-06, "loss": 0.75268269, "num_input_tokens_seen": 60844440, "step": 2806, "time_per_iteration": 2.643596649169922 }, { "auxiliary_loss_clip": 0.01117063, "auxiliary_loss_mlp": 0.01045402, "balance_loss_clip": 1.05330753, "balance_loss_mlp": 1.02766335, "epoch": 0.16876597023898993, "flos": 20740746925440.0, "grad_norm": 1.8406859431587912, "language_loss": 0.69773197, "learning_rate": 3.80138214341862e-06, "loss": 0.71935666, "num_input_tokens_seen": 60863210, "step": 2807, "time_per_iteration": 2.6946568489074707 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01047199, "balance_loss_clip": 1.04842246, "balance_loss_mlp": 1.02794707, "epoch": 0.1688260934916579, "flos": 20303498666880.0, "grad_norm": 3.042021842274248, "language_loss": 0.70280695, "learning_rate": 3.8012129036976587e-06, "loss": 0.72458601, "num_input_tokens_seen": 60882510, "step": 2808, "time_per_iteration": 2.6656088829040527 }, { "auxiliary_loss_clip": 0.01119025, "auxiliary_loss_mlp": 0.01041739, "balance_loss_clip": 1.05019665, "balance_loss_mlp": 1.02164018, "epoch": 0.16888621674432586, "flos": 20340702178560.0, "grad_norm": 2.0835789337145965, "language_loss": 0.79903001, "learning_rate": 3.8010435956745236e-06, "loss": 0.8206377, "num_input_tokens_seen": 60901105, "step": 2809, "time_per_iteration": 2.7665679454803467 }, { "auxiliary_loss_clip": 0.01155146, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.0557605, "balance_loss_mlp": 1.02252758, "epoch": 0.16894633999699385, "flos": 16244802316800.0, "grad_norm": 2.0672093223845245, "language_loss": 0.88076419, "learning_rate": 3.8008742193556358e-06, "loss": 0.90273583, "num_input_tokens_seen": 60915340, "step": 2810, "time_per_iteration": 2.6186363697052 }, { "auxiliary_loss_clip": 0.01149997, "auxiliary_loss_mlp": 0.0104631, "balance_loss_clip": 1.05503082, "balance_loss_mlp": 1.02715337, "epoch": 0.16900646324966181, "flos": 19610171372160.0, "grad_norm": 1.8921026809528976, "language_loss": 0.92376304, "learning_rate": 3.800704774747416e-06, "loss": 0.9457261, "num_input_tokens_seen": 60933735, "step": 2811, "time_per_iteration": 2.6567442417144775 }, { "auxiliary_loss_clip": 0.01140053, "auxiliary_loss_mlp": 0.01049063, "balance_loss_clip": 1.05383325, "balance_loss_mlp": 1.03039432, "epoch": 0.16906658650232978, "flos": 22018089450240.0, "grad_norm": 2.116573413654177, "language_loss": 0.78582352, "learning_rate": 3.800535261856291e-06, "loss": 0.8077147, "num_input_tokens_seen": 60953105, "step": 2812, "time_per_iteration": 2.6796023845672607 }, { "auxiliary_loss_clip": 0.01147895, "auxiliary_loss_mlp": 0.01043917, "balance_loss_clip": 1.05772316, "balance_loss_mlp": 1.02653646, "epoch": 0.16912670975499774, "flos": 11763690024960.0, "grad_norm": 2.5483899062625093, "language_loss": 0.75195068, "learning_rate": 3.8003656806886887e-06, "loss": 0.7738688, "num_input_tokens_seen": 60969150, "step": 2813, "time_per_iteration": 2.621772050857544 }, { "auxiliary_loss_clip": 0.01136313, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.05311871, "balance_loss_mlp": 1.02599943, "epoch": 0.1691868330076657, "flos": 17161386595200.0, "grad_norm": 3.0041182480764554, "language_loss": 0.69118392, "learning_rate": 3.8001960312510396e-06, "loss": 0.7129975, "num_input_tokens_seen": 60982825, "step": 2814, "time_per_iteration": 2.837264060974121 }, { "auxiliary_loss_clip": 0.01163835, "auxiliary_loss_mlp": 0.01039837, "balance_loss_clip": 1.05900145, "balance_loss_mlp": 1.02134776, "epoch": 0.16924695626033368, "flos": 22416553998720.0, "grad_norm": 3.1079956206415833, "language_loss": 0.61439502, "learning_rate": 3.800026313549776e-06, "loss": 0.63643175, "num_input_tokens_seen": 61000875, "step": 2815, "time_per_iteration": 2.6967194080352783 }, { "auxiliary_loss_clip": 0.01129827, "auxiliary_loss_mlp": 0.01042692, "balance_loss_clip": 1.05139673, "balance_loss_mlp": 1.02382088, "epoch": 0.16930707951300164, "flos": 25739655724800.0, "grad_norm": 1.7930623183302479, "language_loss": 0.82490849, "learning_rate": 3.7998565275913342e-06, "loss": 0.84663367, "num_input_tokens_seen": 61021940, "step": 2816, "time_per_iteration": 2.7227163314819336 }, { "auxiliary_loss_clip": 0.01133129, "auxiliary_loss_mlp": 0.01047914, "balance_loss_clip": 1.05375743, "balance_loss_mlp": 1.02853012, "epoch": 0.16936720276566963, "flos": 22747040058240.0, "grad_norm": 3.083808689594852, "language_loss": 0.87322289, "learning_rate": 3.799686673382153e-06, "loss": 0.89503324, "num_input_tokens_seen": 61040285, "step": 2817, "time_per_iteration": 2.733180522918701 }, { "auxiliary_loss_clip": 0.01141455, "auxiliary_loss_mlp": 0.01052753, "balance_loss_clip": 1.05800366, "balance_loss_mlp": 1.03352427, "epoch": 0.1694273260183376, "flos": 19573973441280.0, "grad_norm": 1.8594303503608436, "language_loss": 0.81247765, "learning_rate": 3.799516750928672e-06, "loss": 0.83441973, "num_input_tokens_seen": 61059020, "step": 2818, "time_per_iteration": 2.7384097576141357 }, { "auxiliary_loss_clip": 0.01160132, "auxiliary_loss_mlp": 0.01044196, "balance_loss_clip": 1.05699944, "balance_loss_mlp": 1.02496791, "epoch": 0.16948744927100556, "flos": 12457843332480.0, "grad_norm": 2.739998367204505, "language_loss": 0.80788404, "learning_rate": 3.799346760237336e-06, "loss": 0.82992733, "num_input_tokens_seen": 61074245, "step": 2819, "time_per_iteration": 2.609870672225952 }, { "auxiliary_loss_clip": 0.01069019, "auxiliary_loss_mlp": 0.01015301, "balance_loss_clip": 1.0485003, "balance_loss_mlp": 1.0125947, "epoch": 0.16954757252367353, "flos": 71291694435840.0, "grad_norm": 0.9309223426502673, "language_loss": 0.61031163, "learning_rate": 3.7991767013145902e-06, "loss": 0.63115478, "num_input_tokens_seen": 61127080, "step": 2820, "time_per_iteration": 3.161051034927368 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.0105036, "balance_loss_clip": 1.05106986, "balance_loss_mlp": 1.03207326, "epoch": 0.1696076957763415, "flos": 29606516513280.0, "grad_norm": 1.8682266790688726, "language_loss": 0.78265435, "learning_rate": 3.7990065741668844e-06, "loss": 0.80441403, "num_input_tokens_seen": 61146955, "step": 2821, "time_per_iteration": 2.838730573654175 }, { "auxiliary_loss_clip": 0.0113863, "auxiliary_loss_mlp": 0.01055528, "balance_loss_clip": 1.05282724, "balance_loss_mlp": 1.03494084, "epoch": 0.16966781902900946, "flos": 24388588535040.0, "grad_norm": 2.1667405259997516, "language_loss": 0.78521514, "learning_rate": 3.7988363788006685e-06, "loss": 0.80715668, "num_input_tokens_seen": 61166605, "step": 2822, "time_per_iteration": 2.783385753631592 }, { "auxiliary_loss_clip": 0.01143597, "auxiliary_loss_mlp": 0.00777154, "balance_loss_clip": 1.05367076, "balance_loss_mlp": 1.00129986, "epoch": 0.16972794228167745, "flos": 23038814234880.0, "grad_norm": 1.8038457392731222, "language_loss": 0.74939907, "learning_rate": 3.7986661152223967e-06, "loss": 0.76860654, "num_input_tokens_seen": 61186535, "step": 2823, "time_per_iteration": 4.329328298568726 }, { "auxiliary_loss_clip": 0.01129469, "auxiliary_loss_mlp": 0.0105385, "balance_loss_clip": 1.05166912, "balance_loss_mlp": 1.03496754, "epoch": 0.16978806553434542, "flos": 35228691129600.0, "grad_norm": 3.336653609493179, "language_loss": 0.60266119, "learning_rate": 3.7984957834385257e-06, "loss": 0.62449437, "num_input_tokens_seen": 61208965, "step": 2824, "time_per_iteration": 5.892346620559692 }, { "auxiliary_loss_clip": 0.01138249, "auxiliary_loss_mlp": 0.01042322, "balance_loss_clip": 1.05565047, "balance_loss_mlp": 1.02287912, "epoch": 0.16984818878701338, "flos": 32014290936960.0, "grad_norm": 2.152838804074104, "language_loss": 0.73322558, "learning_rate": 3.7983253834555144e-06, "loss": 0.75503135, "num_input_tokens_seen": 61230670, "step": 2825, "time_per_iteration": 2.834482431411743 }, { "auxiliary_loss_clip": 0.01161467, "auxiliary_loss_mlp": 0.01047701, "balance_loss_clip": 1.05502653, "balance_loss_mlp": 1.02762675, "epoch": 0.16990831203968135, "flos": 22818609907200.0, "grad_norm": 2.05671259677731, "language_loss": 0.85638934, "learning_rate": 3.7981549152798245e-06, "loss": 0.87848103, "num_input_tokens_seen": 61249510, "step": 2826, "time_per_iteration": 2.6443135738372803 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01047749, "balance_loss_clip": 1.05266595, "balance_loss_mlp": 1.02856779, "epoch": 0.1699684352923493, "flos": 23039604334080.0, "grad_norm": 1.9562557148441426, "language_loss": 0.82465482, "learning_rate": 3.7979843789179196e-06, "loss": 0.84652597, "num_input_tokens_seen": 61269440, "step": 2827, "time_per_iteration": 2.7683157920837402 }, { "auxiliary_loss_clip": 0.01131885, "auxiliary_loss_mlp": 0.0104561, "balance_loss_clip": 1.05320346, "balance_loss_mlp": 1.02536786, "epoch": 0.17002855854501728, "flos": 21434110133760.0, "grad_norm": 1.7386401818136152, "language_loss": 0.73704529, "learning_rate": 3.797813774376267e-06, "loss": 0.75882024, "num_input_tokens_seen": 61288195, "step": 2828, "time_per_iteration": 4.465311288833618 }, { "auxiliary_loss_clip": 0.01061458, "auxiliary_loss_mlp": 0.01009538, "balance_loss_clip": 1.04764342, "balance_loss_mlp": 1.00620067, "epoch": 0.17008868179768524, "flos": 71453509205760.0, "grad_norm": 0.7670168832041738, "language_loss": 0.56426483, "learning_rate": 3.797643101661336e-06, "loss": 0.58497471, "num_input_tokens_seen": 61350850, "step": 2829, "time_per_iteration": 3.3114631175994873 }, { "auxiliary_loss_clip": 0.01111753, "auxiliary_loss_mlp": 0.01051557, "balance_loss_clip": 1.04527223, "balance_loss_mlp": 1.03088641, "epoch": 0.17014880505035324, "flos": 24900315644160.0, "grad_norm": 1.7961285206560338, "language_loss": 0.83465374, "learning_rate": 3.7974723607795983e-06, "loss": 0.85628688, "num_input_tokens_seen": 61370765, "step": 2830, "time_per_iteration": 2.795253038406372 }, { "auxiliary_loss_clip": 0.01121533, "auxiliary_loss_mlp": 0.0104408, "balance_loss_clip": 1.04901659, "balance_loss_mlp": 1.02442193, "epoch": 0.1702089283030212, "flos": 29862415981440.0, "grad_norm": 2.4873654173451727, "language_loss": 0.78360993, "learning_rate": 3.797301551737529e-06, "loss": 0.80526608, "num_input_tokens_seen": 61388935, "step": 2831, "time_per_iteration": 2.7864232063293457 }, { "auxiliary_loss_clip": 0.01123612, "auxiliary_loss_mlp": 0.01051154, "balance_loss_clip": 1.05275893, "balance_loss_mlp": 1.0311985, "epoch": 0.17026905155568917, "flos": 17744180762880.0, "grad_norm": 2.532473263441992, "language_loss": 0.79668158, "learning_rate": 3.7971306745416044e-06, "loss": 0.81842923, "num_input_tokens_seen": 61407350, "step": 2832, "time_per_iteration": 2.842217206954956 }, { "auxiliary_loss_clip": 0.01127135, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.05029321, "balance_loss_mlp": 1.02984488, "epoch": 0.17032917480835713, "flos": 23148665003520.0, "grad_norm": 1.8387196201649116, "language_loss": 0.88638175, "learning_rate": 3.7969597291983046e-06, "loss": 0.90814275, "num_input_tokens_seen": 61429010, "step": 2833, "time_per_iteration": 2.75942325592041 }, { "auxiliary_loss_clip": 0.01158799, "auxiliary_loss_mlp": 0.01046883, "balance_loss_clip": 1.05633831, "balance_loss_mlp": 1.02842951, "epoch": 0.1703892980610251, "flos": 39202565512320.0, "grad_norm": 2.49094605220443, "language_loss": 0.71924698, "learning_rate": 3.7967887157141115e-06, "loss": 0.74130386, "num_input_tokens_seen": 61450040, "step": 2834, "time_per_iteration": 2.9035184383392334 }, { "auxiliary_loss_clip": 0.01119873, "auxiliary_loss_mlp": 0.01052215, "balance_loss_clip": 1.05165124, "balance_loss_mlp": 1.03428626, "epoch": 0.17044942131369306, "flos": 23039101543680.0, "grad_norm": 1.9093816511111852, "language_loss": 0.86831236, "learning_rate": 3.7966176340955106e-06, "loss": 0.89003325, "num_input_tokens_seen": 61468585, "step": 2835, "time_per_iteration": 2.7627484798431396 }, { "auxiliary_loss_clip": 0.01149332, "auxiliary_loss_mlp": 0.01049844, "balance_loss_clip": 1.0536654, "balance_loss_mlp": 1.02887547, "epoch": 0.17050954456636103, "flos": 17054983532160.0, "grad_norm": 2.1227367002258153, "language_loss": 0.74483943, "learning_rate": 3.796446484348989e-06, "loss": 0.76683116, "num_input_tokens_seen": 61486330, "step": 2836, "time_per_iteration": 2.6748619079589844 }, { "auxiliary_loss_clip": 0.01102249, "auxiliary_loss_mlp": 0.01049533, "balance_loss_clip": 1.04775679, "balance_loss_mlp": 1.02790809, "epoch": 0.17056966781902902, "flos": 16836969934080.0, "grad_norm": 2.1718385109372824, "language_loss": 0.79959226, "learning_rate": 3.796275266481036e-06, "loss": 0.82111007, "num_input_tokens_seen": 61503950, "step": 2837, "time_per_iteration": 2.757340908050537 }, { "auxiliary_loss_clip": 0.01144378, "auxiliary_loss_mlp": 0.01044803, "balance_loss_clip": 1.05493581, "balance_loss_mlp": 1.02644491, "epoch": 0.17062979107169698, "flos": 17712543859200.0, "grad_norm": 1.6825251002952497, "language_loss": 0.83258498, "learning_rate": 3.7961039804981456e-06, "loss": 0.85447681, "num_input_tokens_seen": 61523550, "step": 2838, "time_per_iteration": 2.705357551574707 }, { "auxiliary_loss_clip": 0.0110604, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.05217135, "balance_loss_mlp": 1.02685261, "epoch": 0.17068991432436495, "flos": 22525040050560.0, "grad_norm": 1.7789799751303759, "language_loss": 0.93788463, "learning_rate": 3.795932626406812e-06, "loss": 0.95939398, "num_input_tokens_seen": 61542720, "step": 2839, "time_per_iteration": 2.7881791591644287 }, { "auxiliary_loss_clip": 0.01126465, "auxiliary_loss_mlp": 0.01045617, "balance_loss_clip": 1.05183244, "balance_loss_mlp": 1.0250175, "epoch": 0.17075003757703291, "flos": 25882939077120.0, "grad_norm": 2.3337760403585435, "language_loss": 0.83974946, "learning_rate": 3.7957612042135336e-06, "loss": 0.86147022, "num_input_tokens_seen": 61563040, "step": 2840, "time_per_iteration": 2.7564892768859863 }, { "auxiliary_loss_clip": 0.01151834, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.05555129, "balance_loss_mlp": 1.02449679, "epoch": 0.17081016082970088, "flos": 20120713332480.0, "grad_norm": 1.9037435592597944, "language_loss": 0.76307738, "learning_rate": 3.79558971392481e-06, "loss": 0.7850399, "num_input_tokens_seen": 61581890, "step": 2841, "time_per_iteration": 2.695525646209717 }, { "auxiliary_loss_clip": 0.01136217, "auxiliary_loss_mlp": 0.01045847, "balance_loss_clip": 1.0527097, "balance_loss_mlp": 1.02744126, "epoch": 0.17087028408236885, "flos": 24936477661440.0, "grad_norm": 1.7844240011089845, "language_loss": 0.77076876, "learning_rate": 3.7954181555471443e-06, "loss": 0.79258937, "num_input_tokens_seen": 61602095, "step": 2842, "time_per_iteration": 2.773792266845703 }, { "auxiliary_loss_clip": 0.01155915, "auxiliary_loss_mlp": 0.01043896, "balance_loss_clip": 1.05616069, "balance_loss_mlp": 1.02503705, "epoch": 0.17093040733503684, "flos": 19057864872960.0, "grad_norm": 1.8430349199993477, "language_loss": 0.85694385, "learning_rate": 3.795246529087043e-06, "loss": 0.87894201, "num_input_tokens_seen": 61620400, "step": 2843, "time_per_iteration": 2.5860671997070312 }, { "auxiliary_loss_clip": 0.01154742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05549574, "balance_loss_mlp": 1.02608204, "epoch": 0.1709905305877048, "flos": 13078954333440.0, "grad_norm": 2.0353470349004485, "language_loss": 0.68646181, "learning_rate": 3.7950748345510126e-06, "loss": 0.70844984, "num_input_tokens_seen": 61637680, "step": 2844, "time_per_iteration": 2.5961523056030273 }, { "auxiliary_loss_clip": 0.01133396, "auxiliary_loss_mlp": 0.00778162, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.00112617, "epoch": 0.17105065384037277, "flos": 19209336526080.0, "grad_norm": 2.027694794878894, "language_loss": 0.78771943, "learning_rate": 3.7949030719455646e-06, "loss": 0.806835, "num_input_tokens_seen": 61655630, "step": 2845, "time_per_iteration": 2.720193386077881 }, { "auxiliary_loss_clip": 0.01145033, "auxiliary_loss_mlp": 0.01047407, "balance_loss_clip": 1.05443549, "balance_loss_mlp": 1.02914453, "epoch": 0.17111077709304073, "flos": 18515183218560.0, "grad_norm": 2.2586144454646306, "language_loss": 0.7811147, "learning_rate": 3.7947312412772127e-06, "loss": 0.80303913, "num_input_tokens_seen": 61673475, "step": 2846, "time_per_iteration": 2.691033363342285 }, { "auxiliary_loss_clip": 0.01143809, "auxiliary_loss_mlp": 0.0104645, "balance_loss_clip": 1.05425262, "balance_loss_mlp": 1.02865243, "epoch": 0.1711709003457087, "flos": 25082670015360.0, "grad_norm": 2.2208975060456426, "language_loss": 0.79762948, "learning_rate": 3.794559342552472e-06, "loss": 0.8195321, "num_input_tokens_seen": 61693370, "step": 2847, "time_per_iteration": 2.7504522800445557 }, { "auxiliary_loss_clip": 0.01142651, "auxiliary_loss_mlp": 0.01045695, "balance_loss_clip": 1.05101562, "balance_loss_mlp": 1.02668071, "epoch": 0.17123102359837666, "flos": 17566387418880.0, "grad_norm": 2.4457083156230017, "language_loss": 0.8665086, "learning_rate": 3.7943873757778614e-06, "loss": 0.88839209, "num_input_tokens_seen": 61710820, "step": 2848, "time_per_iteration": 2.642946720123291 }, { "auxiliary_loss_clip": 0.0111167, "auxiliary_loss_mlp": 0.01044479, "balance_loss_clip": 1.04839015, "balance_loss_mlp": 1.02559662, "epoch": 0.17129114685104463, "flos": 26173635845760.0, "grad_norm": 3.6033710399461856, "language_loss": 0.75238276, "learning_rate": 3.794215340959902e-06, "loss": 0.77394426, "num_input_tokens_seen": 61729855, "step": 2849, "time_per_iteration": 2.7511017322540283 }, { "auxiliary_loss_clip": 0.0103263, "auxiliary_loss_mlp": 0.01006833, "balance_loss_clip": 1.02775574, "balance_loss_mlp": 1.00413883, "epoch": 0.17135127010371262, "flos": 69269710037760.0, "grad_norm": 0.7881928427119427, "language_loss": 0.57514679, "learning_rate": 3.7940432381051163e-06, "loss": 0.59554148, "num_input_tokens_seen": 61790290, "step": 2850, "time_per_iteration": 3.234609603881836 }, { "auxiliary_loss_clip": 0.01115021, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.05049884, "balance_loss_mlp": 1.02661848, "epoch": 0.1714113933563806, "flos": 23550110380800.0, "grad_norm": 2.962731712990184, "language_loss": 0.81328994, "learning_rate": 3.793871067220031e-06, "loss": 0.83488399, "num_input_tokens_seen": 61809265, "step": 2851, "time_per_iteration": 2.78957200050354 }, { "auxiliary_loss_clip": 0.01114419, "auxiliary_loss_mlp": 0.01043587, "balance_loss_clip": 1.05193233, "balance_loss_mlp": 1.02592039, "epoch": 0.17147151660904855, "flos": 21142443697920.0, "grad_norm": 2.049906502724323, "language_loss": 0.93085313, "learning_rate": 3.7936988283111764e-06, "loss": 0.95243311, "num_input_tokens_seen": 61828980, "step": 2852, "time_per_iteration": 2.8247029781341553 }, { "auxiliary_loss_clip": 0.01123258, "auxiliary_loss_mlp": 0.01048953, "balance_loss_clip": 1.04961288, "balance_loss_mlp": 1.03045225, "epoch": 0.17153163986171652, "flos": 18624890332800.0, "grad_norm": 1.8770741979814063, "language_loss": 0.69465554, "learning_rate": 3.7935265213850817e-06, "loss": 0.71637762, "num_input_tokens_seen": 61847915, "step": 2853, "time_per_iteration": 2.814162492752075 }, { "auxiliary_loss_clip": 0.01120856, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.05593121, "balance_loss_mlp": 1.02899122, "epoch": 0.17159176311438448, "flos": 18223265387520.0, "grad_norm": 2.5884803351111705, "language_loss": 0.66611075, "learning_rate": 3.7933541464482815e-06, "loss": 0.68778855, "num_input_tokens_seen": 61865570, "step": 2854, "time_per_iteration": 2.7968995571136475 }, { "auxiliary_loss_clip": 0.01120742, "auxiliary_loss_mlp": 0.01052217, "balance_loss_clip": 1.04853106, "balance_loss_mlp": 1.0349679, "epoch": 0.17165188636705245, "flos": 20738987159040.0, "grad_norm": 1.705510390491261, "language_loss": 0.8929621, "learning_rate": 3.7931817035073124e-06, "loss": 0.91469175, "num_input_tokens_seen": 61883340, "step": 2855, "time_per_iteration": 2.7045016288757324 }, { "auxiliary_loss_clip": 0.01157319, "auxiliary_loss_mlp": 0.01043813, "balance_loss_clip": 1.05505848, "balance_loss_mlp": 1.02662265, "epoch": 0.17171200961972044, "flos": 24899884680960.0, "grad_norm": 2.117219134143716, "language_loss": 0.83963835, "learning_rate": 3.7930091925687134e-06, "loss": 0.86164963, "num_input_tokens_seen": 61900610, "step": 2856, "time_per_iteration": 2.7349936962127686 }, { "auxiliary_loss_clip": 0.01150108, "auxiliary_loss_mlp": 0.0104615, "balance_loss_clip": 1.05812418, "balance_loss_mlp": 1.02783966, "epoch": 0.1717721328723884, "flos": 20157234485760.0, "grad_norm": 2.234025867710235, "language_loss": 0.86309886, "learning_rate": 3.792836613639026e-06, "loss": 0.88506144, "num_input_tokens_seen": 61916795, "step": 2857, "time_per_iteration": 2.749356746673584 }, { "auxiliary_loss_clip": 0.01144467, "auxiliary_loss_mlp": 0.0105057, "balance_loss_clip": 1.05469525, "balance_loss_mlp": 1.0324626, "epoch": 0.17183225612505637, "flos": 23361650697600.0, "grad_norm": 2.069122070501307, "language_loss": 0.78334701, "learning_rate": 3.7926639667247947e-06, "loss": 0.80529737, "num_input_tokens_seen": 61936665, "step": 2858, "time_per_iteration": 2.6673583984375 }, { "auxiliary_loss_clip": 0.01147374, "auxiliary_loss_mlp": 0.0105371, "balance_loss_clip": 1.05591416, "balance_loss_mlp": 1.03263378, "epoch": 0.17189237937772434, "flos": 18114240631680.0, "grad_norm": 2.1629422323642453, "language_loss": 0.77565676, "learning_rate": 3.7924912518325663e-06, "loss": 0.79766762, "num_input_tokens_seen": 61954415, "step": 2859, "time_per_iteration": 2.646648645401001 }, { "auxiliary_loss_clip": 0.0110879, "auxiliary_loss_mlp": 0.01047481, "balance_loss_clip": 1.05317724, "balance_loss_mlp": 1.02887201, "epoch": 0.1719525026303923, "flos": 23258408031360.0, "grad_norm": 2.088627069497316, "language_loss": 0.77088714, "learning_rate": 3.7923184689688902e-06, "loss": 0.79244983, "num_input_tokens_seen": 61973940, "step": 2860, "time_per_iteration": 2.7671573162078857 }, { "auxiliary_loss_clip": 0.01145562, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.05316472, "balance_loss_mlp": 1.02416611, "epoch": 0.17201262588306027, "flos": 20810413353600.0, "grad_norm": 2.1608688480628304, "language_loss": 0.81384242, "learning_rate": 3.792145618140317e-06, "loss": 0.83571851, "num_input_tokens_seen": 61991845, "step": 2861, "time_per_iteration": 2.6492061614990234 }, { "auxiliary_loss_clip": 0.011306, "auxiliary_loss_mlp": 0.01051558, "balance_loss_clip": 1.05280077, "balance_loss_mlp": 1.0335927, "epoch": 0.17207274913572823, "flos": 20375858615040.0, "grad_norm": 2.0128324416816192, "language_loss": 0.85691392, "learning_rate": 3.7919726993534038e-06, "loss": 0.87873554, "num_input_tokens_seen": 62009395, "step": 2862, "time_per_iteration": 4.290126323699951 }, { "auxiliary_loss_clip": 0.01116765, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.05126834, "balance_loss_mlp": 1.02655208, "epoch": 0.17213287238839622, "flos": 26797727675520.0, "grad_norm": 3.7047120479299993, "language_loss": 0.78047049, "learning_rate": 3.7917997126147054e-06, "loss": 0.80207253, "num_input_tokens_seen": 62029005, "step": 2863, "time_per_iteration": 4.275500774383545 }, { "auxiliary_loss_clip": 0.01122315, "auxiliary_loss_mlp": 0.00776596, "balance_loss_clip": 1.05132961, "balance_loss_mlp": 1.00090909, "epoch": 0.1721929956410642, "flos": 26030819370240.0, "grad_norm": 1.7350128683820358, "language_loss": 0.72135127, "learning_rate": 3.7916266579307823e-06, "loss": 0.74034035, "num_input_tokens_seen": 62048730, "step": 2864, "time_per_iteration": 4.414710998535156 }, { "auxiliary_loss_clip": 0.01121488, "auxiliary_loss_mlp": 0.01049611, "balance_loss_clip": 1.05114079, "balance_loss_mlp": 1.03099, "epoch": 0.17225311889373215, "flos": 22273091078400.0, "grad_norm": 1.9270646210248614, "language_loss": 0.73002023, "learning_rate": 3.7914535353081973e-06, "loss": 0.75173128, "num_input_tokens_seen": 62069000, "step": 2865, "time_per_iteration": 2.7463715076446533 }, { "auxiliary_loss_clip": 0.01145037, "auxiliary_loss_mlp": 0.0077644, "balance_loss_clip": 1.05669165, "balance_loss_mlp": 1.00120521, "epoch": 0.17231324214640012, "flos": 21287774125440.0, "grad_norm": 2.669585642962841, "language_loss": 0.78357804, "learning_rate": 3.7912803447535145e-06, "loss": 0.80279285, "num_input_tokens_seen": 62086750, "step": 2866, "time_per_iteration": 2.785146713256836 }, { "auxiliary_loss_clip": 0.01157272, "auxiliary_loss_mlp": 0.01044358, "balance_loss_clip": 1.05600274, "balance_loss_mlp": 1.02536821, "epoch": 0.17237336539906808, "flos": 19680735640320.0, "grad_norm": 2.551277931358127, "language_loss": 0.79755104, "learning_rate": 3.7911070862733016e-06, "loss": 0.81956732, "num_input_tokens_seen": 62106240, "step": 2867, "time_per_iteration": 4.3145318031311035 }, { "auxiliary_loss_clip": 0.01132297, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.0529356, "balance_loss_mlp": 1.02274013, "epoch": 0.17243348865173605, "flos": 17529650784000.0, "grad_norm": 1.8689780270661371, "language_loss": 0.79206991, "learning_rate": 3.7909337598741276e-06, "loss": 0.81380683, "num_input_tokens_seen": 62124895, "step": 2868, "time_per_iteration": 2.7683827877044678 }, { "auxiliary_loss_clip": 0.01111702, "auxiliary_loss_mlp": 0.01041717, "balance_loss_clip": 1.05331647, "balance_loss_mlp": 1.02427697, "epoch": 0.17249361190440402, "flos": 18259858368000.0, "grad_norm": 2.0344588273772923, "language_loss": 0.84221756, "learning_rate": 3.7907603655625674e-06, "loss": 0.86375177, "num_input_tokens_seen": 62143510, "step": 2869, "time_per_iteration": 2.729156970977783 }, { "auxiliary_loss_clip": 0.01132999, "auxiliary_loss_mlp": 0.01048405, "balance_loss_clip": 1.0535363, "balance_loss_mlp": 1.02955842, "epoch": 0.172553735157072, "flos": 21174367910400.0, "grad_norm": 1.8935704627114847, "language_loss": 0.77299273, "learning_rate": 3.7905869033451932e-06, "loss": 0.79480684, "num_input_tokens_seen": 62162285, "step": 2870, "time_per_iteration": 2.752739191055298 }, { "auxiliary_loss_clip": 0.0115398, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.05671024, "balance_loss_mlp": 1.02110744, "epoch": 0.17261385840973997, "flos": 22273270646400.0, "grad_norm": 2.0115587398764396, "language_loss": 0.77409238, "learning_rate": 3.7904133732285857e-06, "loss": 0.79601026, "num_input_tokens_seen": 62180970, "step": 2871, "time_per_iteration": 2.660627603530884 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.05313993, "balance_loss_mlp": 1.0222379, "epoch": 0.17267398166240794, "flos": 27922233830400.0, "grad_norm": 2.203011669690562, "language_loss": 0.74197829, "learning_rate": 3.7902397752193228e-06, "loss": 0.76371384, "num_input_tokens_seen": 62198965, "step": 2872, "time_per_iteration": 2.6959900856018066 }, { "auxiliary_loss_clip": 0.01150773, "auxiliary_loss_mlp": 0.01041508, "balance_loss_clip": 1.05359554, "balance_loss_mlp": 1.02362645, "epoch": 0.1727341049150759, "flos": 21945118970880.0, "grad_norm": 1.7914171074077658, "language_loss": 0.82336062, "learning_rate": 3.790066109323988e-06, "loss": 0.84528345, "num_input_tokens_seen": 62219890, "step": 2873, "time_per_iteration": 2.603564977645874 }, { "auxiliary_loss_clip": 0.01108819, "auxiliary_loss_mlp": 0.01044995, "balance_loss_clip": 1.04744792, "balance_loss_mlp": 1.02522969, "epoch": 0.17279422816774387, "flos": 18107883924480.0, "grad_norm": 3.7341652608759297, "language_loss": 0.75355422, "learning_rate": 3.7898923755491678e-06, "loss": 0.77509236, "num_input_tokens_seen": 62237140, "step": 2874, "time_per_iteration": 2.8438260555267334 }, { "auxiliary_loss_clip": 0.01159322, "auxiliary_loss_mlp": 0.01044415, "balance_loss_clip": 1.05658269, "balance_loss_mlp": 1.02404249, "epoch": 0.17285435142041183, "flos": 21835447770240.0, "grad_norm": 2.7053876793207037, "language_loss": 0.80239916, "learning_rate": 3.7897185739014487e-06, "loss": 0.82443655, "num_input_tokens_seen": 62255405, "step": 2875, "time_per_iteration": 2.625183343887329 }, { "auxiliary_loss_clip": 0.01135727, "auxiliary_loss_mlp": 0.0105273, "balance_loss_clip": 1.0535475, "balance_loss_mlp": 1.03297722, "epoch": 0.17291447467307983, "flos": 18368452160640.0, "grad_norm": 3.840653645811056, "language_loss": 0.87621164, "learning_rate": 3.7895447043874217e-06, "loss": 0.8980962, "num_input_tokens_seen": 62271280, "step": 2876, "time_per_iteration": 2.6782751083374023 }, { "auxiliary_loss_clip": 0.01136898, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 1.05730534, "balance_loss_mlp": 1.02559566, "epoch": 0.1729745979257478, "flos": 18624638937600.0, "grad_norm": 1.8931416121171032, "language_loss": 0.84386718, "learning_rate": 3.789370767013681e-06, "loss": 0.86567843, "num_input_tokens_seen": 62289140, "step": 2877, "time_per_iteration": 2.681131362915039 }, { "auxiliary_loss_clip": 0.01120759, "auxiliary_loss_mlp": 0.01043962, "balance_loss_clip": 1.05222571, "balance_loss_mlp": 1.02499604, "epoch": 0.17303472117841576, "flos": 22998234844800.0, "grad_norm": 2.106635210245156, "language_loss": 0.79660022, "learning_rate": 3.7891967617868204e-06, "loss": 0.81824744, "num_input_tokens_seen": 62307490, "step": 2878, "time_per_iteration": 2.8118834495544434 }, { "auxiliary_loss_clip": 0.01136112, "auxiliary_loss_mlp": 0.01047222, "balance_loss_clip": 1.05593777, "balance_loss_mlp": 1.02953172, "epoch": 0.17309484443108372, "flos": 25664386775040.0, "grad_norm": 1.9675557254753375, "language_loss": 0.70236337, "learning_rate": 3.78902268871344e-06, "loss": 0.72419673, "num_input_tokens_seen": 62328570, "step": 2879, "time_per_iteration": 2.7998502254486084 }, { "auxiliary_loss_clip": 0.01130517, "auxiliary_loss_mlp": 0.01051722, "balance_loss_clip": 1.05183411, "balance_loss_mlp": 1.03337598, "epoch": 0.1731549676837517, "flos": 13552903313280.0, "grad_norm": 2.0545155253910163, "language_loss": 0.82884222, "learning_rate": 3.78884854780014e-06, "loss": 0.85066462, "num_input_tokens_seen": 62345735, "step": 2880, "time_per_iteration": 2.6707684993743896 }, { "auxiliary_loss_clip": 0.01110706, "auxiliary_loss_mlp": 0.01054327, "balance_loss_clip": 1.05214918, "balance_loss_mlp": 1.03303647, "epoch": 0.17321509093641965, "flos": 22857070394880.0, "grad_norm": 1.9029231217608267, "language_loss": 0.80879176, "learning_rate": 3.7886743390535236e-06, "loss": 0.83044201, "num_input_tokens_seen": 62365525, "step": 2881, "time_per_iteration": 2.7851576805114746 }, { "auxiliary_loss_clip": 0.01135983, "auxiliary_loss_mlp": 0.01046895, "balance_loss_clip": 1.05544055, "balance_loss_mlp": 1.02921653, "epoch": 0.17327521418908762, "flos": 24352785653760.0, "grad_norm": 2.753231520615002, "language_loss": 0.77268815, "learning_rate": 3.788500062480197e-06, "loss": 0.79451692, "num_input_tokens_seen": 62385160, "step": 2882, "time_per_iteration": 2.7785212993621826 }, { "auxiliary_loss_clip": 0.01124099, "auxiliary_loss_mlp": 0.01047516, "balance_loss_clip": 1.0633558, "balance_loss_mlp": 1.02947998, "epoch": 0.1733353374417556, "flos": 33105651816960.0, "grad_norm": 2.096311926604511, "language_loss": 0.76714236, "learning_rate": 3.788325718086769e-06, "loss": 0.78885853, "num_input_tokens_seen": 62405280, "step": 2883, "time_per_iteration": 2.838848352432251 }, { "auxiliary_loss_clip": 0.01110924, "auxiliary_loss_mlp": 0.0104619, "balance_loss_clip": 1.04929209, "balance_loss_mlp": 1.02821302, "epoch": 0.17339546069442358, "flos": 24388947671040.0, "grad_norm": 2.1194201700326873, "language_loss": 0.8555252, "learning_rate": 3.7881513058798503e-06, "loss": 0.87709635, "num_input_tokens_seen": 62423665, "step": 2884, "time_per_iteration": 2.829376220703125 }, { "auxiliary_loss_clip": 0.01133962, "auxiliary_loss_mlp": 0.00775817, "balance_loss_clip": 1.05472779, "balance_loss_mlp": 1.00088096, "epoch": 0.17345558394709154, "flos": 27454174680960.0, "grad_norm": 1.7131036779262108, "language_loss": 0.74756771, "learning_rate": 3.787976825866055e-06, "loss": 0.76666546, "num_input_tokens_seen": 62445170, "step": 2885, "time_per_iteration": 2.8710989952087402 }, { "auxiliary_loss_clip": 0.01128977, "auxiliary_loss_mlp": 0.01044901, "balance_loss_clip": 1.05498922, "balance_loss_mlp": 1.0280925, "epoch": 0.1735157071997595, "flos": 24682158391680.0, "grad_norm": 2.374438581614022, "language_loss": 0.7107017, "learning_rate": 3.7878022780519998e-06, "loss": 0.73244053, "num_input_tokens_seen": 62466135, "step": 2886, "time_per_iteration": 2.726621150970459 }, { "auxiliary_loss_clip": 0.01142411, "auxiliary_loss_mlp": 0.01041857, "balance_loss_clip": 1.05233932, "balance_loss_mlp": 1.02408338, "epoch": 0.17357583045242747, "flos": 21688932193920.0, "grad_norm": 2.0566537172661747, "language_loss": 0.69906294, "learning_rate": 3.7876276624443024e-06, "loss": 0.72090566, "num_input_tokens_seen": 62483910, "step": 2887, "time_per_iteration": 2.7066688537597656 }, { "auxiliary_loss_clip": 0.01116425, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.05328536, "balance_loss_mlp": 1.02728677, "epoch": 0.17363595370509544, "flos": 15375728753280.0, "grad_norm": 2.038016964464323, "language_loss": 0.85257947, "learning_rate": 3.787452979049585e-06, "loss": 0.87419748, "num_input_tokens_seen": 62501530, "step": 2888, "time_per_iteration": 2.7514970302581787 }, { "auxiliary_loss_clip": 0.01095063, "auxiliary_loss_mlp": 0.01049413, "balance_loss_clip": 1.05020595, "balance_loss_mlp": 1.02822983, "epoch": 0.1736960769577634, "flos": 23440941970560.0, "grad_norm": 2.196318077733749, "language_loss": 0.78491282, "learning_rate": 3.7872782278744718e-06, "loss": 0.80635762, "num_input_tokens_seen": 62521295, "step": 2889, "time_per_iteration": 2.8221559524536133 }, { "auxiliary_loss_clip": 0.01112139, "auxiliary_loss_mlp": 0.0077601, "balance_loss_clip": 1.05236733, "balance_loss_mlp": 1.00114667, "epoch": 0.1737562002104314, "flos": 18587830475520.0, "grad_norm": 2.333227367674716, "language_loss": 0.84076989, "learning_rate": 3.7871034089255883e-06, "loss": 0.85965133, "num_input_tokens_seen": 62539615, "step": 2890, "time_per_iteration": 2.7213382720947266 }, { "auxiliary_loss_clip": 0.01142218, "auxiliary_loss_mlp": 0.01054918, "balance_loss_clip": 1.05530691, "balance_loss_mlp": 1.03752589, "epoch": 0.17381632346309936, "flos": 15998060816640.0, "grad_norm": 2.7278091568285596, "language_loss": 0.82205319, "learning_rate": 3.7869285222095653e-06, "loss": 0.84402454, "num_input_tokens_seen": 62556820, "step": 2891, "time_per_iteration": 2.625162363052368 }, { "auxiliary_loss_clip": 0.01097361, "auxiliary_loss_mlp": 0.01050012, "balance_loss_clip": 1.04281187, "balance_loss_mlp": 1.02876878, "epoch": 0.17387644671576732, "flos": 13369830670080.0, "grad_norm": 1.9017653264876209, "language_loss": 0.81200826, "learning_rate": 3.7867535677330334e-06, "loss": 0.83348203, "num_input_tokens_seen": 62572450, "step": 2892, "time_per_iteration": 2.7682459354400635 }, { "auxiliary_loss_clip": 0.01148834, "auxiliary_loss_mlp": 0.0105551, "balance_loss_clip": 1.05707812, "balance_loss_mlp": 1.03631687, "epoch": 0.1739365699684353, "flos": 26615516958720.0, "grad_norm": 2.0056711213447436, "language_loss": 0.73950225, "learning_rate": 3.786578545502627e-06, "loss": 0.76154572, "num_input_tokens_seen": 62592580, "step": 2893, "time_per_iteration": 2.8463022708892822 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01043509, "balance_loss_clip": 1.05198765, "balance_loss_mlp": 1.02443516, "epoch": 0.17399669322110325, "flos": 23367971491200.0, "grad_norm": 4.010773627073901, "language_loss": 0.82507658, "learning_rate": 3.7864034555249828e-06, "loss": 0.84683645, "num_input_tokens_seen": 62611220, "step": 2894, "time_per_iteration": 2.719564914703369 }, { "auxiliary_loss_clip": 0.01113951, "auxiliary_loss_mlp": 0.01046249, "balance_loss_clip": 1.0506922, "balance_loss_mlp": 1.02463603, "epoch": 0.17405681647377122, "flos": 22054107813120.0, "grad_norm": 2.3322053123967574, "language_loss": 0.73826683, "learning_rate": 3.786228297806741e-06, "loss": 0.7598688, "num_input_tokens_seen": 62629185, "step": 2895, "time_per_iteration": 2.743992805480957 }, { "auxiliary_loss_clip": 0.01037578, "auxiliary_loss_mlp": 0.01011099, "balance_loss_clip": 1.0404408, "balance_loss_mlp": 1.00788069, "epoch": 0.1741169397264392, "flos": 61457559114240.0, "grad_norm": 0.8765647158253519, "language_loss": 0.62754023, "learning_rate": 3.7860530723545435e-06, "loss": 0.64802706, "num_input_tokens_seen": 62691895, "step": 2896, "time_per_iteration": 3.345099687576294 }, { "auxiliary_loss_clip": 0.0113101, "auxiliary_loss_mlp": 0.00776588, "balance_loss_clip": 1.05246758, "balance_loss_mlp": 1.00102258, "epoch": 0.17417706297910718, "flos": 27017680608000.0, "grad_norm": 1.7338863964520728, "language_loss": 0.75822324, "learning_rate": 3.785877779175034e-06, "loss": 0.77729923, "num_input_tokens_seen": 62713790, "step": 2897, "time_per_iteration": 2.772292137145996 }, { "auxiliary_loss_clip": 0.01141357, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.0545547, "balance_loss_mlp": 1.02512598, "epoch": 0.17423718623177514, "flos": 33508856960640.0, "grad_norm": 1.944569306659421, "language_loss": 0.6883949, "learning_rate": 3.7857024182748606e-06, "loss": 0.71023834, "num_input_tokens_seen": 62736285, "step": 2898, "time_per_iteration": 2.7278554439544678 }, { "auxiliary_loss_clip": 0.01128715, "auxiliary_loss_mlp": 0.01044216, "balance_loss_clip": 1.05251193, "balance_loss_mlp": 1.02504694, "epoch": 0.1742973094844431, "flos": 27198634348800.0, "grad_norm": 2.99011081330885, "language_loss": 0.76445562, "learning_rate": 3.7855269896606717e-06, "loss": 0.78618491, "num_input_tokens_seen": 62756240, "step": 2899, "time_per_iteration": 2.8052010536193848 }, { "auxiliary_loss_clip": 0.01095069, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.04680347, "balance_loss_mlp": 1.02632213, "epoch": 0.17435743273711107, "flos": 22710734386560.0, "grad_norm": 3.2965812335226357, "language_loss": 0.72860038, "learning_rate": 3.785351493339121e-06, "loss": 0.75001007, "num_input_tokens_seen": 62775910, "step": 2900, "time_per_iteration": 2.868218421936035 }, { "auxiliary_loss_clip": 0.01110522, "auxiliary_loss_mlp": 0.00776698, "balance_loss_clip": 1.05202782, "balance_loss_mlp": 1.000983, "epoch": 0.17441755598977904, "flos": 41646466039680.0, "grad_norm": 1.5488662608930523, "language_loss": 0.69946706, "learning_rate": 3.785175929316863e-06, "loss": 0.71833932, "num_input_tokens_seen": 62799385, "step": 2901, "time_per_iteration": 4.407040596008301 }, { "auxiliary_loss_clip": 0.01129098, "auxiliary_loss_mlp": 0.01045525, "balance_loss_clip": 1.05246592, "balance_loss_mlp": 1.02764344, "epoch": 0.174477679242447, "flos": 26287077974400.0, "grad_norm": 2.1785959913748965, "language_loss": 0.76588804, "learning_rate": 3.7850002976005543e-06, "loss": 0.78763425, "num_input_tokens_seen": 62819380, "step": 2902, "time_per_iteration": 4.2244462966918945 }, { "auxiliary_loss_clip": 0.01145685, "auxiliary_loss_mlp": 0.0104382, "balance_loss_clip": 1.0531354, "balance_loss_mlp": 1.02567625, "epoch": 0.174537802495115, "flos": 17858412990720.0, "grad_norm": 2.2508699895191073, "language_loss": 0.81588745, "learning_rate": 3.7848245981968558e-06, "loss": 0.83778256, "num_input_tokens_seen": 62836205, "step": 2903, "time_per_iteration": 4.132925271987915 }, { "auxiliary_loss_clip": 0.01126443, "auxiliary_loss_mlp": 0.0103942, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02135992, "epoch": 0.17459792574778296, "flos": 16940715390720.0, "grad_norm": 2.4085694554154187, "language_loss": 0.73316491, "learning_rate": 3.784648831112429e-06, "loss": 0.75482351, "num_input_tokens_seen": 62854045, "step": 2904, "time_per_iteration": 2.7033374309539795 }, { "auxiliary_loss_clip": 0.01105192, "auxiliary_loss_mlp": 0.0104577, "balance_loss_clip": 1.05250716, "balance_loss_mlp": 1.02822256, "epoch": 0.17465804900045093, "flos": 25520026014720.0, "grad_norm": 1.8783326609306377, "language_loss": 0.64233291, "learning_rate": 3.7844729963539406e-06, "loss": 0.66384256, "num_input_tokens_seen": 62873075, "step": 2905, "time_per_iteration": 2.8325791358947754 }, { "auxiliary_loss_clip": 0.01135256, "auxiliary_loss_mlp": 0.01053006, "balance_loss_clip": 1.05869055, "balance_loss_mlp": 1.03370619, "epoch": 0.1747181722531189, "flos": 24129708238080.0, "grad_norm": 2.820817719352069, "language_loss": 0.79504299, "learning_rate": 3.7842970939280566e-06, "loss": 0.81692564, "num_input_tokens_seen": 62892675, "step": 2906, "time_per_iteration": 4.491498231887817 }, { "auxiliary_loss_clip": 0.01146195, "auxiliary_loss_mlp": 0.01050729, "balance_loss_clip": 1.05623174, "balance_loss_mlp": 1.03258538, "epoch": 0.17477829550578686, "flos": 17748813617280.0, "grad_norm": 2.262709441571415, "language_loss": 0.81318873, "learning_rate": 3.784121123841449e-06, "loss": 0.83515799, "num_input_tokens_seen": 62910675, "step": 2907, "time_per_iteration": 2.6855854988098145 }, { "auxiliary_loss_clip": 0.01143202, "auxiliary_loss_mlp": 0.01043315, "balance_loss_clip": 1.05374384, "balance_loss_mlp": 1.0253861, "epoch": 0.17483841875845482, "flos": 15377344865280.0, "grad_norm": 2.068635027461873, "language_loss": 0.81342787, "learning_rate": 3.7839450861007886e-06, "loss": 0.83529305, "num_input_tokens_seen": 62928130, "step": 2908, "time_per_iteration": 2.6449570655822754 }, { "auxiliary_loss_clip": 0.01127136, "auxiliary_loss_mlp": 0.01050925, "balance_loss_clip": 1.05178046, "balance_loss_mlp": 1.03163743, "epoch": 0.17489854201112282, "flos": 17163254102400.0, "grad_norm": 3.147433356867123, "language_loss": 0.80020624, "learning_rate": 3.7837689807127518e-06, "loss": 0.82198691, "num_input_tokens_seen": 62944290, "step": 2909, "time_per_iteration": 2.6820569038391113 }, { "auxiliary_loss_clip": 0.0109059, "auxiliary_loss_mlp": 0.01052625, "balance_loss_clip": 1.05020881, "balance_loss_mlp": 1.0310595, "epoch": 0.17495866526379078, "flos": 19755286318080.0, "grad_norm": 1.6978440546881337, "language_loss": 0.76742244, "learning_rate": 3.783592807684017e-06, "loss": 0.7888546, "num_input_tokens_seen": 62963505, "step": 2910, "time_per_iteration": 2.6980416774749756 }, { "auxiliary_loss_clip": 0.01158552, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.05618358, "balance_loss_mlp": 1.03059566, "epoch": 0.17501878851645875, "flos": 28511133310080.0, "grad_norm": 1.9812610358315632, "language_loss": 0.8698765, "learning_rate": 3.7834165670212645e-06, "loss": 0.89195609, "num_input_tokens_seen": 62985020, "step": 2911, "time_per_iteration": 2.692662477493286 }, { "auxiliary_loss_clip": 0.01154744, "auxiliary_loss_mlp": 0.00777232, "balance_loss_clip": 1.05323184, "balance_loss_mlp": 1.00110698, "epoch": 0.1750789117691267, "flos": 17931203902080.0, "grad_norm": 3.030740090796483, "language_loss": 0.89883876, "learning_rate": 3.7832402587311764e-06, "loss": 0.91815847, "num_input_tokens_seen": 63001745, "step": 2912, "time_per_iteration": 2.600738763809204 }, { "auxiliary_loss_clip": 0.01146165, "auxiliary_loss_mlp": 0.01045616, "balance_loss_clip": 1.0538094, "balance_loss_mlp": 1.02655411, "epoch": 0.17513903502179468, "flos": 18259427404800.0, "grad_norm": 2.03479884577424, "language_loss": 0.72818935, "learning_rate": 3.783063882820439e-06, "loss": 0.75010711, "num_input_tokens_seen": 63019750, "step": 2913, "time_per_iteration": 2.623342275619507 }, { "auxiliary_loss_clip": 0.01140074, "auxiliary_loss_mlp": 0.01043928, "balance_loss_clip": 1.05781865, "balance_loss_mlp": 1.02557003, "epoch": 0.17519915827446264, "flos": 20704728562560.0, "grad_norm": 2.137073079496124, "language_loss": 0.6891731, "learning_rate": 3.782887439295741e-06, "loss": 0.71101314, "num_input_tokens_seen": 63039500, "step": 2914, "time_per_iteration": 2.7065770626068115 }, { "auxiliary_loss_clip": 0.01142434, "auxiliary_loss_mlp": 0.01045043, "balance_loss_clip": 1.05532789, "balance_loss_mlp": 1.02649403, "epoch": 0.1752592815271306, "flos": 20523415685760.0, "grad_norm": 2.051329837479214, "language_loss": 0.93125081, "learning_rate": 3.782710928163772e-06, "loss": 0.9531256, "num_input_tokens_seen": 63059785, "step": 2915, "time_per_iteration": 2.659029245376587 }, { "auxiliary_loss_clip": 0.01114731, "auxiliary_loss_mlp": 0.01040999, "balance_loss_clip": 1.04957223, "balance_loss_mlp": 1.02243853, "epoch": 0.1753194047797986, "flos": 21799178012160.0, "grad_norm": 1.604344576738792, "language_loss": 0.81092978, "learning_rate": 3.782534349431226e-06, "loss": 0.83248705, "num_input_tokens_seen": 63079385, "step": 2916, "time_per_iteration": 2.7099549770355225 }, { "auxiliary_loss_clip": 0.0114211, "auxiliary_loss_mlp": 0.01046221, "balance_loss_clip": 1.05090034, "balance_loss_mlp": 1.02780342, "epoch": 0.17537952803246656, "flos": 20668351063680.0, "grad_norm": 3.7582760939418716, "language_loss": 0.73829222, "learning_rate": 3.782357703104799e-06, "loss": 0.76017547, "num_input_tokens_seen": 63098970, "step": 2917, "time_per_iteration": 2.666717767715454 }, { "auxiliary_loss_clip": 0.01133449, "auxiliary_loss_mlp": 0.01047353, "balance_loss_clip": 1.05319786, "balance_loss_mlp": 1.02821994, "epoch": 0.17543965128513453, "flos": 23295072839040.0, "grad_norm": 1.813699779869167, "language_loss": 0.76739681, "learning_rate": 3.7821809891911897e-06, "loss": 0.78920484, "num_input_tokens_seen": 63118750, "step": 2918, "time_per_iteration": 2.647634744644165 }, { "auxiliary_loss_clip": 0.01093958, "auxiliary_loss_mlp": 0.01045643, "balance_loss_clip": 1.0476644, "balance_loss_mlp": 1.02425694, "epoch": 0.1754997745378025, "flos": 29095615416960.0, "grad_norm": 2.436739755969174, "language_loss": 0.73624814, "learning_rate": 3.782004207697098e-06, "loss": 0.75764406, "num_input_tokens_seen": 63136865, "step": 2919, "time_per_iteration": 2.7904632091522217 }, { "auxiliary_loss_clip": 0.0112465, "auxiliary_loss_mlp": 0.01046524, "balance_loss_clip": 1.04938293, "balance_loss_mlp": 1.02805829, "epoch": 0.17555989779047046, "flos": 30371844620160.0, "grad_norm": 2.5113730227003814, "language_loss": 0.74840331, "learning_rate": 3.781827358629228e-06, "loss": 0.77011508, "num_input_tokens_seen": 63158325, "step": 2920, "time_per_iteration": 2.727890968322754 }, { "auxiliary_loss_clip": 0.01117257, "auxiliary_loss_mlp": 0.01042893, "balance_loss_clip": 1.0462867, "balance_loss_mlp": 1.02371216, "epoch": 0.17562002104313842, "flos": 23287746464640.0, "grad_norm": 3.6617213109535536, "language_loss": 0.79731411, "learning_rate": 3.7816504419942873e-06, "loss": 0.81891561, "num_input_tokens_seen": 63173115, "step": 2921, "time_per_iteration": 2.753817558288574 }, { "auxiliary_loss_clip": 0.01121718, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.05232286, "balance_loss_mlp": 1.02679133, "epoch": 0.1756801442958064, "flos": 24790500789120.0, "grad_norm": 2.6301689129577546, "language_loss": 0.87826073, "learning_rate": 3.7814734577989823e-06, "loss": 0.89994025, "num_input_tokens_seen": 63192880, "step": 2922, "time_per_iteration": 2.7411837577819824 }, { "auxiliary_loss_clip": 0.01144004, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.05196273, "balance_loss_mlp": 1.02778149, "epoch": 0.17574026754847438, "flos": 25771651764480.0, "grad_norm": 4.4893841411537085, "language_loss": 0.62347209, "learning_rate": 3.7812964060500253e-06, "loss": 0.64538622, "num_input_tokens_seen": 63214395, "step": 2923, "time_per_iteration": 2.7666683197021484 }, { "auxiliary_loss_clip": 0.01134872, "auxiliary_loss_mlp": 0.01048692, "balance_loss_clip": 1.05887377, "balance_loss_mlp": 1.02847457, "epoch": 0.17580039080114235, "flos": 17456608477440.0, "grad_norm": 2.8552131957437914, "language_loss": 0.80392253, "learning_rate": 3.78111928675413e-06, "loss": 0.82575822, "num_input_tokens_seen": 63231020, "step": 2924, "time_per_iteration": 2.729403257369995 }, { "auxiliary_loss_clip": 0.01132783, "auxiliary_loss_mlp": 0.01051456, "balance_loss_clip": 1.05193377, "balance_loss_mlp": 1.03082108, "epoch": 0.1758605140538103, "flos": 14864648088960.0, "grad_norm": 5.080042666316876, "language_loss": 0.71374178, "learning_rate": 3.7809420999180126e-06, "loss": 0.73558426, "num_input_tokens_seen": 63246245, "step": 2925, "time_per_iteration": 2.9538233280181885 }, { "auxiliary_loss_clip": 0.01117196, "auxiliary_loss_mlp": 0.01045706, "balance_loss_clip": 1.05052948, "balance_loss_mlp": 1.02744341, "epoch": 0.17592063730647828, "flos": 23004268329600.0, "grad_norm": 1.6620026542608322, "language_loss": 0.71931666, "learning_rate": 3.7807648455483934e-06, "loss": 0.74094564, "num_input_tokens_seen": 63267790, "step": 2926, "time_per_iteration": 2.7738964557647705 }, { "auxiliary_loss_clip": 0.01105944, "auxiliary_loss_mlp": 0.01045732, "balance_loss_clip": 1.04915071, "balance_loss_mlp": 1.02253425, "epoch": 0.17598076055914624, "flos": 20741501111040.0, "grad_norm": 2.6318732447225837, "language_loss": 0.84724289, "learning_rate": 3.7805875236519918e-06, "loss": 0.86875963, "num_input_tokens_seen": 63286830, "step": 2927, "time_per_iteration": 2.704437494277954 }, { "auxiliary_loss_clip": 0.01100437, "auxiliary_loss_mlp": 0.01046684, "balance_loss_clip": 1.05039644, "balance_loss_mlp": 1.02887452, "epoch": 0.1760408838118142, "flos": 34092441227520.0, "grad_norm": 1.9547597089289632, "language_loss": 0.72147644, "learning_rate": 3.7804101342355336e-06, "loss": 0.74294758, "num_input_tokens_seen": 63308870, "step": 2928, "time_per_iteration": 2.793802261352539 }, { "auxiliary_loss_clip": 0.01120251, "auxiliary_loss_mlp": 0.01045623, "balance_loss_clip": 1.0516876, "balance_loss_mlp": 1.02679992, "epoch": 0.1761010070644822, "flos": 24168384207360.0, "grad_norm": 1.8474008440192304, "language_loss": 0.83097279, "learning_rate": 3.780232677305744e-06, "loss": 0.85263157, "num_input_tokens_seen": 63329005, "step": 2929, "time_per_iteration": 2.733339786529541 }, { "auxiliary_loss_clip": 0.01124127, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.04853475, "balance_loss_mlp": 1.02479422, "epoch": 0.17616113031715017, "flos": 26576697335040.0, "grad_norm": 2.4427170552109163, "language_loss": 0.79211783, "learning_rate": 3.7800551528693535e-06, "loss": 0.81378424, "num_input_tokens_seen": 63349390, "step": 2930, "time_per_iteration": 2.748080015182495 }, { "auxiliary_loss_clip": 0.01160654, "auxiliary_loss_mlp": 0.01047281, "balance_loss_clip": 1.05925918, "balance_loss_mlp": 1.02758813, "epoch": 0.17622125356981813, "flos": 25666685245440.0, "grad_norm": 2.504124366499191, "language_loss": 0.76502466, "learning_rate": 3.7798775609330927e-06, "loss": 0.78710401, "num_input_tokens_seen": 63368835, "step": 2931, "time_per_iteration": 2.6691603660583496 }, { "auxiliary_loss_clip": 0.01076453, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.04577017, "balance_loss_mlp": 1.02478647, "epoch": 0.1762813768224861, "flos": 16508530949760.0, "grad_norm": 2.941321746162514, "language_loss": 0.76070881, "learning_rate": 3.779699901503696e-06, "loss": 0.78190923, "num_input_tokens_seen": 63385220, "step": 2932, "time_per_iteration": 2.809630870819092 }, { "auxiliary_loss_clip": 0.01148627, "auxiliary_loss_mlp": 0.01043149, "balance_loss_clip": 1.05284405, "balance_loss_mlp": 1.0229789, "epoch": 0.17634150007515406, "flos": 11211850402560.0, "grad_norm": 5.168612276821382, "language_loss": 0.90027422, "learning_rate": 3.7795221745879016e-06, "loss": 0.92219198, "num_input_tokens_seen": 63400865, "step": 2933, "time_per_iteration": 2.6665337085723877 }, { "auxiliary_loss_clip": 0.01154114, "auxiliary_loss_mlp": 0.01055985, "balance_loss_clip": 1.05539656, "balance_loss_mlp": 1.03766203, "epoch": 0.17640162332782203, "flos": 23659925235840.0, "grad_norm": 2.009210784374188, "language_loss": 0.88323247, "learning_rate": 3.779344380192448e-06, "loss": 0.90533352, "num_input_tokens_seen": 63421390, "step": 2934, "time_per_iteration": 2.6649580001831055 }, { "auxiliary_loss_clip": 0.01128495, "auxiliary_loss_mlp": 0.01048067, "balance_loss_clip": 1.05581188, "balance_loss_mlp": 1.03028131, "epoch": 0.17646174658049, "flos": 53796984606720.0, "grad_norm": 1.6302121247923247, "language_loss": 0.70403945, "learning_rate": 3.779166518324077e-06, "loss": 0.72580504, "num_input_tokens_seen": 63444715, "step": 2935, "time_per_iteration": 3.006019115447998 }, { "auxiliary_loss_clip": 0.01126189, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.05360174, "balance_loss_mlp": 1.02135396, "epoch": 0.17652186983315798, "flos": 24243868638720.0, "grad_norm": 2.5931578566124807, "language_loss": 0.69721985, "learning_rate": 3.7789885889895325e-06, "loss": 0.71888208, "num_input_tokens_seen": 63465525, "step": 2936, "time_per_iteration": 2.7517428398132324 }, { "auxiliary_loss_clip": 0.01105644, "auxiliary_loss_mlp": 0.01045896, "balance_loss_clip": 1.05023837, "balance_loss_mlp": 1.02737129, "epoch": 0.17658199308582595, "flos": 27454282421760.0, "grad_norm": 1.9170676229980566, "language_loss": 0.71288073, "learning_rate": 3.7788105921955634e-06, "loss": 0.73439616, "num_input_tokens_seen": 63485815, "step": 2937, "time_per_iteration": 2.837181329727173 }, { "auxiliary_loss_clip": 0.01141008, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05945122, "balance_loss_mlp": 1.02674472, "epoch": 0.17664211633849392, "flos": 22418672901120.0, "grad_norm": 2.267148270780071, "language_loss": 0.75439745, "learning_rate": 3.7786325279489184e-06, "loss": 0.77627593, "num_input_tokens_seen": 63503905, "step": 2938, "time_per_iteration": 2.883162021636963 }, { "auxiliary_loss_clip": 0.01147345, "auxiliary_loss_mlp": 0.01043976, "balance_loss_clip": 1.05576169, "balance_loss_mlp": 1.02553487, "epoch": 0.17670223959116188, "flos": 24715124098560.0, "grad_norm": 2.921726967662053, "language_loss": 0.71015209, "learning_rate": 3.7784543962563495e-06, "loss": 0.73206532, "num_input_tokens_seen": 63521985, "step": 2939, "time_per_iteration": 2.6938419342041016 }, { "auxiliary_loss_clip": 0.01160437, "auxiliary_loss_mlp": 0.01046921, "balance_loss_clip": 1.05818558, "balance_loss_mlp": 1.02794337, "epoch": 0.17676236284382985, "flos": 22527051212160.0, "grad_norm": 3.114901170192376, "language_loss": 0.73513985, "learning_rate": 3.7782761971246115e-06, "loss": 0.75721341, "num_input_tokens_seen": 63539830, "step": 2940, "time_per_iteration": 4.145469665527344 }, { "auxiliary_loss_clip": 0.0112582, "auxiliary_loss_mlp": 0.01046611, "balance_loss_clip": 1.05631542, "balance_loss_mlp": 1.02731109, "epoch": 0.1768224860964978, "flos": 12385160161920.0, "grad_norm": 3.071469776016301, "language_loss": 0.85375023, "learning_rate": 3.7780979305604616e-06, "loss": 0.87547457, "num_input_tokens_seen": 63555495, "step": 2941, "time_per_iteration": 4.279599666595459 }, { "auxiliary_loss_clip": 0.01161068, "auxiliary_loss_mlp": 0.01045254, "balance_loss_clip": 1.05717027, "balance_loss_mlp": 1.0257628, "epoch": 0.1768826093491658, "flos": 24353360271360.0, "grad_norm": 2.434766510066968, "language_loss": 0.76885259, "learning_rate": 3.7779195965706607e-06, "loss": 0.79091585, "num_input_tokens_seen": 63575290, "step": 2942, "time_per_iteration": 4.2280871868133545 }, { "auxiliary_loss_clip": 0.01106234, "auxiliary_loss_mlp": 0.00780676, "balance_loss_clip": 1.04992843, "balance_loss_mlp": 1.00087166, "epoch": 0.17694273260183377, "flos": 23587062497280.0, "grad_norm": 3.301743041114179, "language_loss": 0.8024286, "learning_rate": 3.77774119516197e-06, "loss": 0.82129776, "num_input_tokens_seen": 63594670, "step": 2943, "time_per_iteration": 2.8921029567718506 }, { "auxiliary_loss_clip": 0.01132848, "auxiliary_loss_mlp": 0.01052225, "balance_loss_clip": 1.05352235, "balance_loss_mlp": 1.03124392, "epoch": 0.17700285585450173, "flos": 26760991040640.0, "grad_norm": 5.7613375603973465, "language_loss": 0.80809408, "learning_rate": 3.777562726341155e-06, "loss": 0.82994485, "num_input_tokens_seen": 63614780, "step": 2944, "time_per_iteration": 2.692831039428711 }, { "auxiliary_loss_clip": 0.01161854, "auxiliary_loss_mlp": 0.01056825, "balance_loss_clip": 1.05807233, "balance_loss_mlp": 1.03796625, "epoch": 0.1770629791071697, "flos": 42776323320960.0, "grad_norm": 2.4257754996125227, "language_loss": 0.73812854, "learning_rate": 3.7773841901149835e-06, "loss": 0.7603153, "num_input_tokens_seen": 63637190, "step": 2945, "time_per_iteration": 2.782910108566284 }, { "auxiliary_loss_clip": 0.011481, "auxiliary_loss_mlp": 0.01047361, "balance_loss_clip": 1.05756998, "balance_loss_mlp": 1.02862108, "epoch": 0.17712310235983766, "flos": 17345572560000.0, "grad_norm": 2.8106797532110637, "language_loss": 0.7793628, "learning_rate": 3.7772055864902256e-06, "loss": 0.80131739, "num_input_tokens_seen": 63652140, "step": 2946, "time_per_iteration": 4.278741121292114 }, { "auxiliary_loss_clip": 0.01109059, "auxiliary_loss_mlp": 0.01052842, "balance_loss_clip": 1.04997015, "balance_loss_mlp": 1.03341079, "epoch": 0.17718322561250563, "flos": 23878477537920.0, "grad_norm": 2.172386857191393, "language_loss": 0.76068008, "learning_rate": 3.7770269154736535e-06, "loss": 0.7822991, "num_input_tokens_seen": 63671700, "step": 2947, "time_per_iteration": 2.7949914932250977 }, { "auxiliary_loss_clip": 0.0114934, "auxiliary_loss_mlp": 0.01044342, "balance_loss_clip": 1.05480659, "balance_loss_mlp": 1.025388, "epoch": 0.1772433488651736, "flos": 36466352104320.0, "grad_norm": 2.6793588646204745, "language_loss": 0.72557831, "learning_rate": 3.7768481770720424e-06, "loss": 0.74751514, "num_input_tokens_seen": 63691685, "step": 2948, "time_per_iteration": 2.901662826538086 }, { "auxiliary_loss_clip": 0.01151572, "auxiliary_loss_mlp": 0.01050692, "balance_loss_clip": 1.05921662, "balance_loss_mlp": 1.03236949, "epoch": 0.1773034721178416, "flos": 26684716510080.0, "grad_norm": 1.8296543316983853, "language_loss": 0.81782824, "learning_rate": 3.776669371292171e-06, "loss": 0.8398509, "num_input_tokens_seen": 63711720, "step": 2949, "time_per_iteration": 2.7284891605377197 }, { "auxiliary_loss_clip": 0.01080853, "auxiliary_loss_mlp": 0.0100651, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.00226629, "epoch": 0.17736359537050955, "flos": 57117467617920.0, "grad_norm": 0.768126622018234, "language_loss": 0.64989161, "learning_rate": 3.7764904981408186e-06, "loss": 0.67076528, "num_input_tokens_seen": 63776280, "step": 2950, "time_per_iteration": 3.2761552333831787 }, { "auxiliary_loss_clip": 0.01121454, "auxiliary_loss_mlp": 0.01045861, "balance_loss_clip": 1.05373287, "balance_loss_mlp": 1.02743077, "epoch": 0.17742371862317752, "flos": 27198203385600.0, "grad_norm": 2.9882590699755927, "language_loss": 0.83619881, "learning_rate": 3.7763115576247686e-06, "loss": 0.85787189, "num_input_tokens_seen": 63797535, "step": 2951, "time_per_iteration": 2.7637627124786377 }, { "auxiliary_loss_clip": 0.01125929, "auxiliary_loss_mlp": 0.01046039, "balance_loss_clip": 1.05109882, "balance_loss_mlp": 1.02682269, "epoch": 0.17748384187584548, "flos": 20959694277120.0, "grad_norm": 2.3133151959471796, "language_loss": 0.80395055, "learning_rate": 3.776132549750806e-06, "loss": 0.82567012, "num_input_tokens_seen": 63817045, "step": 2952, "time_per_iteration": 2.7605957984924316 }, { "auxiliary_loss_clip": 0.01162679, "auxiliary_loss_mlp": 0.01044862, "balance_loss_clip": 1.05858529, "balance_loss_mlp": 1.02513337, "epoch": 0.17754396512851345, "flos": 25009986844800.0, "grad_norm": 2.8185319653472116, "language_loss": 0.79273909, "learning_rate": 3.7759534745257194e-06, "loss": 0.81481451, "num_input_tokens_seen": 63837665, "step": 2953, "time_per_iteration": 2.798912525177002 }, { "auxiliary_loss_clip": 0.0112399, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.05482125, "balance_loss_mlp": 1.02470589, "epoch": 0.1776040883811814, "flos": 32051566275840.0, "grad_norm": 2.017710353628998, "language_loss": 0.87963271, "learning_rate": 3.7757743319562994e-06, "loss": 0.90130568, "num_input_tokens_seen": 63858455, "step": 2954, "time_per_iteration": 2.838931083679199 }, { "auxiliary_loss_clip": 0.01144028, "auxiliary_loss_mlp": 0.01052958, "balance_loss_clip": 1.06043494, "balance_loss_mlp": 1.03296697, "epoch": 0.17766421163384938, "flos": 21574125348480.0, "grad_norm": 1.9130853947826985, "language_loss": 0.85313326, "learning_rate": 3.7755951220493386e-06, "loss": 0.87510312, "num_input_tokens_seen": 63876935, "step": 2955, "time_per_iteration": 2.7965714931488037 }, { "auxiliary_loss_clip": 0.01127677, "auxiliary_loss_mlp": 0.01047004, "balance_loss_clip": 1.05093336, "balance_loss_mlp": 1.02660692, "epoch": 0.17772433488651737, "flos": 22419319345920.0, "grad_norm": 18.24238703278013, "language_loss": 0.71152055, "learning_rate": 3.7754158448116327e-06, "loss": 0.73326737, "num_input_tokens_seen": 63896815, "step": 2956, "time_per_iteration": 2.8358442783355713 }, { "auxiliary_loss_clip": 0.01150063, "auxiliary_loss_mlp": 0.010506, "balance_loss_clip": 1.05813813, "balance_loss_mlp": 1.03156281, "epoch": 0.17778445813918534, "flos": 25629445820160.0, "grad_norm": 2.981126112172262, "language_loss": 0.82881534, "learning_rate": 3.7752365002499795e-06, "loss": 0.85082197, "num_input_tokens_seen": 63916140, "step": 2957, "time_per_iteration": 2.7034976482391357 }, { "auxiliary_loss_clip": 0.01100452, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.04976833, "balance_loss_mlp": 1.02789164, "epoch": 0.1778445813918533, "flos": 25628871202560.0, "grad_norm": 2.7180995933425622, "language_loss": 0.75164193, "learning_rate": 3.7750570883711807e-06, "loss": 0.77311885, "num_input_tokens_seen": 63935220, "step": 2958, "time_per_iteration": 2.8312718868255615 }, { "auxiliary_loss_clip": 0.01146025, "auxiliary_loss_mlp": 0.01043359, "balance_loss_clip": 1.06117964, "balance_loss_mlp": 1.02502513, "epoch": 0.17790470464452127, "flos": 22345522853760.0, "grad_norm": 9.439636088267013, "language_loss": 0.80363399, "learning_rate": 3.7748776091820397e-06, "loss": 0.82552785, "num_input_tokens_seen": 63954550, "step": 2959, "time_per_iteration": 2.722102642059326 }, { "auxiliary_loss_clip": 0.01164621, "auxiliary_loss_mlp": 0.01049069, "balance_loss_clip": 1.05812871, "balance_loss_mlp": 1.02938771, "epoch": 0.17796482789718923, "flos": 18765875214720.0, "grad_norm": 2.62580469975692, "language_loss": 0.51511085, "learning_rate": 3.774698062689362e-06, "loss": 0.53724772, "num_input_tokens_seen": 63972425, "step": 2960, "time_per_iteration": 2.6222047805786133 }, { "auxiliary_loss_clip": 0.01111843, "auxiliary_loss_mlp": 0.01052801, "balance_loss_clip": 1.05275989, "balance_loss_mlp": 1.03228474, "epoch": 0.1780249511498572, "flos": 23440941970560.0, "grad_norm": 1.7626913000215665, "language_loss": 0.88908094, "learning_rate": 3.7745184488999548e-06, "loss": 0.91072738, "num_input_tokens_seen": 63992165, "step": 2961, "time_per_iteration": 2.8088786602020264 }, { "auxiliary_loss_clip": 0.01116231, "auxiliary_loss_mlp": 0.01054867, "balance_loss_clip": 1.05181062, "balance_loss_mlp": 1.03385067, "epoch": 0.1780850744025252, "flos": 23367468700800.0, "grad_norm": 1.716412227369414, "language_loss": 0.79170465, "learning_rate": 3.774338767820631e-06, "loss": 0.81341565, "num_input_tokens_seen": 64013470, "step": 2962, "time_per_iteration": 2.7546913623809814 }, { "auxiliary_loss_clip": 0.01145526, "auxiliary_loss_mlp": 0.01052794, "balance_loss_clip": 1.05649889, "balance_loss_mlp": 1.03104997, "epoch": 0.17814519765519315, "flos": 13771994319360.0, "grad_norm": 2.3241756501763446, "language_loss": 0.74910223, "learning_rate": 3.774159019458203e-06, "loss": 0.77108544, "num_input_tokens_seen": 64030975, "step": 2963, "time_per_iteration": 2.680356979370117 }, { "auxiliary_loss_clip": 0.01140656, "auxiliary_loss_mlp": 0.01043225, "balance_loss_clip": 1.05769885, "balance_loss_mlp": 1.02347231, "epoch": 0.17820532090786112, "flos": 21976396738560.0, "grad_norm": 1.747536927551571, "language_loss": 0.78837025, "learning_rate": 3.7739792038194877e-06, "loss": 0.81020904, "num_input_tokens_seen": 64050075, "step": 2964, "time_per_iteration": 2.748398780822754 }, { "auxiliary_loss_clip": 0.01151685, "auxiliary_loss_mlp": 0.00776982, "balance_loss_clip": 1.05950594, "balance_loss_mlp": 1.00098181, "epoch": 0.17826544416052909, "flos": 24790752184320.0, "grad_norm": 3.046027397796258, "language_loss": 0.81160808, "learning_rate": 3.7737993209113027e-06, "loss": 0.83089471, "num_input_tokens_seen": 64071920, "step": 2965, "time_per_iteration": 2.8090012073516846 }, { "auxiliary_loss_clip": 0.01151658, "auxiliary_loss_mlp": 0.01047086, "balance_loss_clip": 1.06002402, "balance_loss_mlp": 1.02916884, "epoch": 0.17832556741319705, "flos": 13879582531200.0, "grad_norm": 2.554359630612449, "language_loss": 0.95307338, "learning_rate": 3.7736193707404698e-06, "loss": 0.97506082, "num_input_tokens_seen": 64086835, "step": 2966, "time_per_iteration": 2.7159550189971924 }, { "auxiliary_loss_clip": 0.01112928, "auxiliary_loss_mlp": 0.00777395, "balance_loss_clip": 1.05336046, "balance_loss_mlp": 1.00083637, "epoch": 0.17838569066586502, "flos": 36641703323520.0, "grad_norm": 7.5683867487642065, "language_loss": 0.72833109, "learning_rate": 3.7734393533138127e-06, "loss": 0.74723434, "num_input_tokens_seen": 64107360, "step": 2967, "time_per_iteration": 2.9540669918060303 }, { "auxiliary_loss_clip": 0.01129124, "auxiliary_loss_mlp": 0.01046817, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02775562, "epoch": 0.17844581391853298, "flos": 18727271072640.0, "grad_norm": 2.1617023205672523, "language_loss": 0.76897681, "learning_rate": 3.773259268638157e-06, "loss": 0.7907362, "num_input_tokens_seen": 64124690, "step": 2968, "time_per_iteration": 2.752717971801758 }, { "auxiliary_loss_clip": 0.01085006, "auxiliary_loss_mlp": 0.01044958, "balance_loss_clip": 1.04640651, "balance_loss_mlp": 1.02559829, "epoch": 0.17850593717120097, "flos": 27378259286400.0, "grad_norm": 2.039560504387258, "language_loss": 0.75839806, "learning_rate": 3.7730791167203333e-06, "loss": 0.77969772, "num_input_tokens_seen": 64146315, "step": 2969, "time_per_iteration": 2.9161994457244873 }, { "auxiliary_loss_clip": 0.01075271, "auxiliary_loss_mlp": 0.01013071, "balance_loss_clip": 1.06177902, "balance_loss_mlp": 1.00932813, "epoch": 0.17856606042386894, "flos": 66996025084800.0, "grad_norm": 0.8520394227890811, "language_loss": 0.69012916, "learning_rate": 3.772898897567171e-06, "loss": 0.7110126, "num_input_tokens_seen": 64210875, "step": 2970, "time_per_iteration": 3.3269262313842773 }, { "auxiliary_loss_clip": 0.011313, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.05561864, "balance_loss_mlp": 1.02493763, "epoch": 0.1786261836765369, "flos": 36977001805440.0, "grad_norm": 1.9951166568015506, "language_loss": 0.67617297, "learning_rate": 3.772718611185505e-06, "loss": 0.69792765, "num_input_tokens_seen": 64230740, "step": 2971, "time_per_iteration": 2.8691961765289307 }, { "auxiliary_loss_clip": 0.01110831, "auxiliary_loss_mlp": 0.01052779, "balance_loss_clip": 1.05309939, "balance_loss_mlp": 1.03266823, "epoch": 0.17868630692920487, "flos": 24825441744000.0, "grad_norm": 1.5664358375440484, "language_loss": 0.8971802, "learning_rate": 3.7725382575821717e-06, "loss": 0.91881633, "num_input_tokens_seen": 64252300, "step": 2972, "time_per_iteration": 2.893923759460449 }, { "auxiliary_loss_clip": 0.01124705, "auxiliary_loss_mlp": 0.01055871, "balance_loss_clip": 1.05635929, "balance_loss_mlp": 1.03466403, "epoch": 0.17874643018187283, "flos": 16981977139200.0, "grad_norm": 2.4611679901229153, "language_loss": 0.88593906, "learning_rate": 3.77235783676401e-06, "loss": 0.90774482, "num_input_tokens_seen": 64270105, "step": 2973, "time_per_iteration": 2.7340333461761475 }, { "auxiliary_loss_clip": 0.01164127, "auxiliary_loss_mlp": 0.01047073, "balance_loss_clip": 1.06285155, "balance_loss_mlp": 1.0283215, "epoch": 0.1788065534345408, "flos": 21032233793280.0, "grad_norm": 3.4039298885336557, "language_loss": 0.7668556, "learning_rate": 3.7721773487378615e-06, "loss": 0.78896761, "num_input_tokens_seen": 64287250, "step": 2974, "time_per_iteration": 2.632495403289795 }, { "auxiliary_loss_clip": 0.0114187, "auxiliary_loss_mlp": 0.01053, "balance_loss_clip": 1.06101942, "balance_loss_mlp": 1.03390288, "epoch": 0.17886667668720876, "flos": 23987717775360.0, "grad_norm": 2.484949778027245, "language_loss": 0.74701655, "learning_rate": 3.7719967935105705e-06, "loss": 0.76896524, "num_input_tokens_seen": 64307140, "step": 2975, "time_per_iteration": 2.704012870788574 }, { "auxiliary_loss_clip": 0.01149026, "auxiliary_loss_mlp": 0.01048788, "balance_loss_clip": 1.05678535, "balance_loss_mlp": 1.03004813, "epoch": 0.17892679993987676, "flos": 25739476156800.0, "grad_norm": 1.518747487377626, "language_loss": 0.73032069, "learning_rate": 3.7718161710889833e-06, "loss": 0.75229883, "num_input_tokens_seen": 64328760, "step": 2976, "time_per_iteration": 2.7357017993927 }, { "auxiliary_loss_clip": 0.01150398, "auxiliary_loss_mlp": 0.01038685, "balance_loss_clip": 1.06239033, "balance_loss_mlp": 1.0229373, "epoch": 0.17898692319254472, "flos": 25699686865920.0, "grad_norm": 1.4579507247258654, "language_loss": 0.770594, "learning_rate": 3.7716354814799495e-06, "loss": 0.79248488, "num_input_tokens_seen": 64348800, "step": 2977, "time_per_iteration": 2.727318286895752 }, { "auxiliary_loss_clip": 0.01131521, "auxiliary_loss_mlp": 0.01045834, "balance_loss_clip": 1.06618452, "balance_loss_mlp": 1.02841735, "epoch": 0.1790470464452127, "flos": 19317786664320.0, "grad_norm": 2.7286854986191282, "language_loss": 0.80235189, "learning_rate": 3.7714547246903203e-06, "loss": 0.82412547, "num_input_tokens_seen": 64367955, "step": 2978, "time_per_iteration": 2.8178791999816895 }, { "auxiliary_loss_clip": 0.0114307, "auxiliary_loss_mlp": 0.01052978, "balance_loss_clip": 1.05818772, "balance_loss_mlp": 1.03330874, "epoch": 0.17910716969788065, "flos": 30044267562240.0, "grad_norm": 1.4967765935497133, "language_loss": 0.76192784, "learning_rate": 3.7712739007269508e-06, "loss": 0.7838884, "num_input_tokens_seen": 64389805, "step": 2979, "time_per_iteration": 4.241487741470337 }, { "auxiliary_loss_clip": 0.01122958, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.0590893, "balance_loss_mlp": 1.02660525, "epoch": 0.17916729295054862, "flos": 19427709260160.0, "grad_norm": 1.9491816848203256, "language_loss": 0.68945503, "learning_rate": 3.7710930095966976e-06, "loss": 0.71113026, "num_input_tokens_seen": 64408220, "step": 2980, "time_per_iteration": 2.6817352771759033 }, { "auxiliary_loss_clip": 0.01152986, "auxiliary_loss_mlp": 0.0104519, "balance_loss_clip": 1.0588038, "balance_loss_mlp": 1.02497244, "epoch": 0.17922741620321658, "flos": 14611549881600.0, "grad_norm": 1.9134992191513662, "language_loss": 0.70793843, "learning_rate": 3.7709120513064196e-06, "loss": 0.72992027, "num_input_tokens_seen": 64426380, "step": 2981, "time_per_iteration": 4.310532331466675 }, { "auxiliary_loss_clip": 0.01137747, "auxiliary_loss_mlp": 0.01056086, "balance_loss_clip": 1.06083858, "balance_loss_mlp": 1.03686976, "epoch": 0.17928753945588458, "flos": 17165301177600.0, "grad_norm": 2.529665562311581, "language_loss": 0.8190546, "learning_rate": 3.7707310258629796e-06, "loss": 0.84099293, "num_input_tokens_seen": 64444355, "step": 2982, "time_per_iteration": 2.710726261138916 }, { "auxiliary_loss_clip": 0.01162978, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.06181359, "balance_loss_mlp": 1.02306128, "epoch": 0.17934766270855254, "flos": 31395622060800.0, "grad_norm": 1.6440716861921114, "language_loss": 0.83123535, "learning_rate": 3.7705499332732413e-06, "loss": 0.85327524, "num_input_tokens_seen": 64467800, "step": 2983, "time_per_iteration": 2.700378656387329 }, { "auxiliary_loss_clip": 0.01153001, "auxiliary_loss_mlp": 0.01048341, "balance_loss_clip": 1.05694914, "balance_loss_mlp": 1.02932739, "epoch": 0.1794077859612205, "flos": 20814184281600.0, "grad_norm": 1.6703280507743268, "language_loss": 0.85149562, "learning_rate": 3.7703687735440718e-06, "loss": 0.87350899, "num_input_tokens_seen": 64487230, "step": 2984, "time_per_iteration": 2.6529407501220703 }, { "auxiliary_loss_clip": 0.01126981, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.05520201, "balance_loss_mlp": 1.02424896, "epoch": 0.17946790921388847, "flos": 28986447006720.0, "grad_norm": 2.4609160562432053, "language_loss": 0.8935222, "learning_rate": 3.7701875466823416e-06, "loss": 0.9152264, "num_input_tokens_seen": 64509165, "step": 2985, "time_per_iteration": 4.528426170349121 }, { "auxiliary_loss_clip": 0.01160091, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.06142831, "balance_loss_mlp": 1.02434587, "epoch": 0.17952803246655644, "flos": 20737406960640.0, "grad_norm": 2.095497349072142, "language_loss": 0.69538593, "learning_rate": 3.770006252694922e-06, "loss": 0.71739429, "num_input_tokens_seen": 64527940, "step": 2986, "time_per_iteration": 2.6890172958374023 }, { "auxiliary_loss_clip": 0.01158556, "auxiliary_loss_mlp": 0.00776, "balance_loss_clip": 1.05752599, "balance_loss_mlp": 1.00081134, "epoch": 0.1795881557192244, "flos": 28255988027520.0, "grad_norm": 2.4599229747435123, "language_loss": 0.77855188, "learning_rate": 3.769824891588688e-06, "loss": 0.79789746, "num_input_tokens_seen": 64545230, "step": 2987, "time_per_iteration": 2.650761842727661 }, { "auxiliary_loss_clip": 0.0116216, "auxiliary_loss_mlp": 0.01043775, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02441502, "epoch": 0.17964827897189237, "flos": 18552027594240.0, "grad_norm": 2.0190394876224467, "language_loss": 0.77958816, "learning_rate": 3.7696434633705164e-06, "loss": 0.80164748, "num_input_tokens_seen": 64563820, "step": 2988, "time_per_iteration": 2.6151437759399414 }, { "auxiliary_loss_clip": 0.01059513, "auxiliary_loss_mlp": 0.00756906, "balance_loss_clip": 1.07071137, "balance_loss_mlp": 1.00131369, "epoch": 0.17970840222456036, "flos": 58165088711040.0, "grad_norm": 0.7650122273387262, "language_loss": 0.62709254, "learning_rate": 3.7694619680472875e-06, "loss": 0.64525676, "num_input_tokens_seen": 64621315, "step": 2989, "time_per_iteration": 3.1990275382995605 }, { "auxiliary_loss_clip": 0.01137168, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.05553865, "balance_loss_mlp": 1.02128983, "epoch": 0.17976852547722832, "flos": 20300805146880.0, "grad_norm": 2.3566032567209483, "language_loss": 0.71070904, "learning_rate": 3.7692804056258837e-06, "loss": 0.73246896, "num_input_tokens_seen": 64639885, "step": 2990, "time_per_iteration": 2.7275335788726807 }, { "auxiliary_loss_clip": 0.01135847, "auxiliary_loss_mlp": 0.01044966, "balance_loss_clip": 1.05398035, "balance_loss_mlp": 1.02639365, "epoch": 0.1798286487298963, "flos": 39669367685760.0, "grad_norm": 1.8035266350414116, "language_loss": 0.68888462, "learning_rate": 3.7690987761131893e-06, "loss": 0.7106927, "num_input_tokens_seen": 64661220, "step": 2991, "time_per_iteration": 2.8237311840057373 }, { "auxiliary_loss_clip": 0.01104375, "auxiliary_loss_mlp": 0.01046061, "balance_loss_clip": 1.05156851, "balance_loss_mlp": 1.02663028, "epoch": 0.17988877198256426, "flos": 25520313323520.0, "grad_norm": 1.6063564491400402, "language_loss": 0.82933879, "learning_rate": 3.7689170795160924e-06, "loss": 0.85084313, "num_input_tokens_seen": 64682530, "step": 2992, "time_per_iteration": 2.8303778171539307 }, { "auxiliary_loss_clip": 0.01140805, "auxiliary_loss_mlp": 0.01035603, "balance_loss_clip": 1.05302262, "balance_loss_mlp": 1.0187583, "epoch": 0.17994889523523222, "flos": 18807496099200.0, "grad_norm": 2.076285453641059, "language_loss": 0.82228035, "learning_rate": 3.7687353158414822e-06, "loss": 0.84404445, "num_input_tokens_seen": 64701025, "step": 2993, "time_per_iteration": 2.710369110107422 }, { "auxiliary_loss_clip": 0.01135151, "auxiliary_loss_mlp": 0.01040493, "balance_loss_clip": 1.05135202, "balance_loss_mlp": 1.02236176, "epoch": 0.18000901848790019, "flos": 21104450087040.0, "grad_norm": 1.7027458997386926, "language_loss": 0.78129464, "learning_rate": 3.7685534850962517e-06, "loss": 0.80305111, "num_input_tokens_seen": 64719570, "step": 2994, "time_per_iteration": 2.6666738986968994 }, { "auxiliary_loss_clip": 0.01158877, "auxiliary_loss_mlp": 0.01045455, "balance_loss_clip": 1.05657315, "balance_loss_mlp": 1.02819359, "epoch": 0.18006914174056818, "flos": 19646441130240.0, "grad_norm": 2.4198973911698434, "language_loss": 0.81139499, "learning_rate": 3.768371587287296e-06, "loss": 0.83343828, "num_input_tokens_seen": 64738110, "step": 2995, "time_per_iteration": 2.699521541595459 }, { "auxiliary_loss_clip": 0.01142902, "auxiliary_loss_mlp": 0.01047606, "balance_loss_clip": 1.05350447, "balance_loss_mlp": 1.0310601, "epoch": 0.18012926499323614, "flos": 19499889640320.0, "grad_norm": 1.8607496799697536, "language_loss": 0.84162772, "learning_rate": 3.768189622421512e-06, "loss": 0.86353278, "num_input_tokens_seen": 64756345, "step": 2996, "time_per_iteration": 2.696723461151123 }, { "auxiliary_loss_clip": 0.01127214, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.06094205, "balance_loss_mlp": 1.02273917, "epoch": 0.1801893882459041, "flos": 19464553635840.0, "grad_norm": 2.1291201116421283, "language_loss": 0.88189137, "learning_rate": 3.7680075905058006e-06, "loss": 0.90356302, "num_input_tokens_seen": 64776375, "step": 2997, "time_per_iteration": 2.785522699356079 }, { "auxiliary_loss_clip": 0.01134376, "auxiliary_loss_mlp": 0.01045962, "balance_loss_clip": 1.04949927, "balance_loss_mlp": 1.02753246, "epoch": 0.18024951149857207, "flos": 26870590414080.0, "grad_norm": 1.7579499924576911, "language_loss": 0.85068727, "learning_rate": 3.7678254915470643e-06, "loss": 0.87249064, "num_input_tokens_seen": 64796210, "step": 2998, "time_per_iteration": 2.6912384033203125 }, { "auxiliary_loss_clip": 0.01159537, "auxiliary_loss_mlp": 0.01044427, "balance_loss_clip": 1.06019807, "balance_loss_mlp": 1.02641416, "epoch": 0.18030963475124004, "flos": 30226621933440.0, "grad_norm": 1.8075624565441775, "language_loss": 0.84176779, "learning_rate": 3.7676433255522084e-06, "loss": 0.86380744, "num_input_tokens_seen": 64818590, "step": 2999, "time_per_iteration": 2.722447395324707 }, { "auxiliary_loss_clip": 0.01143605, "auxiliary_loss_mlp": 0.01047321, "balance_loss_clip": 1.05324686, "balance_loss_mlp": 1.02870023, "epoch": 0.180369758003908, "flos": 22307493329280.0, "grad_norm": 1.8789697336390492, "language_loss": 0.75206578, "learning_rate": 3.76746109252814e-06, "loss": 0.77397501, "num_input_tokens_seen": 64838350, "step": 3000, "time_per_iteration": 2.669875144958496 }, { "auxiliary_loss_clip": 0.01130052, "auxiliary_loss_mlp": 0.00775745, "balance_loss_clip": 1.0526886, "balance_loss_mlp": 1.00060582, "epoch": 0.18042988125657597, "flos": 23732033788800.0, "grad_norm": 2.1714361871851704, "language_loss": 0.71088028, "learning_rate": 3.76727879248177e-06, "loss": 0.72993821, "num_input_tokens_seen": 64858065, "step": 3001, "time_per_iteration": 2.7207603454589844 }, { "auxiliary_loss_clip": 0.01150091, "auxiliary_loss_mlp": 0.01044695, "balance_loss_clip": 1.05701649, "balance_loss_mlp": 1.02605033, "epoch": 0.18049000450924396, "flos": 24093582134400.0, "grad_norm": 2.218812983953599, "language_loss": 0.8849982, "learning_rate": 3.767096425420011e-06, "loss": 0.90694606, "num_input_tokens_seen": 64877305, "step": 3002, "time_per_iteration": 2.6577625274658203 }, { "auxiliary_loss_clip": 0.01157827, "auxiliary_loss_mlp": 0.01048268, "balance_loss_clip": 1.05624068, "balance_loss_mlp": 1.03076851, "epoch": 0.18055012776191193, "flos": 22163168482560.0, "grad_norm": 1.6287780165264572, "language_loss": 0.80328667, "learning_rate": 3.7669139913497788e-06, "loss": 0.8253476, "num_input_tokens_seen": 64896955, "step": 3003, "time_per_iteration": 2.6274783611297607 }, { "auxiliary_loss_clip": 0.01158367, "auxiliary_loss_mlp": 0.01043654, "balance_loss_clip": 1.05622995, "balance_loss_mlp": 1.02596307, "epoch": 0.1806102510145799, "flos": 28913512440960.0, "grad_norm": 2.3308952017896956, "language_loss": 0.67250973, "learning_rate": 3.7667314902779907e-06, "loss": 0.69452989, "num_input_tokens_seen": 64917080, "step": 3004, "time_per_iteration": 2.6652631759643555 }, { "auxiliary_loss_clip": 0.01147517, "auxiliary_loss_mlp": 0.01054518, "balance_loss_clip": 1.05606318, "balance_loss_mlp": 1.03528929, "epoch": 0.18067037426724786, "flos": 19025689265280.0, "grad_norm": 2.592432277036083, "language_loss": 0.85111535, "learning_rate": 3.7665489222115677e-06, "loss": 0.87313569, "num_input_tokens_seen": 64935215, "step": 3005, "time_per_iteration": 2.654977560043335 }, { "auxiliary_loss_clip": 0.0114499, "auxiliary_loss_mlp": 0.01041993, "balance_loss_clip": 1.05690646, "balance_loss_mlp": 1.02489829, "epoch": 0.18073049751991582, "flos": 27453635976960.0, "grad_norm": 1.5217876402754629, "language_loss": 0.83215338, "learning_rate": 3.766366287157432e-06, "loss": 0.85402322, "num_input_tokens_seen": 64956275, "step": 3006, "time_per_iteration": 2.7118306159973145 }, { "auxiliary_loss_clip": 0.01127168, "auxiliary_loss_mlp": 0.01050084, "balance_loss_clip": 1.05063033, "balance_loss_mlp": 1.03105807, "epoch": 0.1807906207725838, "flos": 28729039167360.0, "grad_norm": 1.6327495611050657, "language_loss": 0.77377248, "learning_rate": 3.7661835851225103e-06, "loss": 0.79554498, "num_input_tokens_seen": 64979390, "step": 3007, "time_per_iteration": 2.7996537685394287 }, { "auxiliary_loss_clip": 0.01070026, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.04936945, "balance_loss_mlp": 1.02712655, "epoch": 0.18085074402525175, "flos": 64466515468800.0, "grad_norm": 0.801982400183398, "language_loss": 0.56987137, "learning_rate": 3.7660008161137294e-06, "loss": 0.5908761, "num_input_tokens_seen": 65043135, "step": 3008, "time_per_iteration": 3.4269092082977295 }, { "auxiliary_loss_clip": 0.01130838, "auxiliary_loss_mlp": 0.01047085, "balance_loss_clip": 1.05308366, "balance_loss_mlp": 1.02686691, "epoch": 0.18091086727791975, "flos": 23476960333440.0, "grad_norm": 1.8424126412451678, "language_loss": 0.67248082, "learning_rate": 3.765817980138021e-06, "loss": 0.69426012, "num_input_tokens_seen": 65062845, "step": 3009, "time_per_iteration": 2.7875866889953613 }, { "auxiliary_loss_clip": 0.01161719, "auxiliary_loss_mlp": 0.01044187, "balance_loss_clip": 1.0595516, "balance_loss_mlp": 1.02673507, "epoch": 0.1809709905305877, "flos": 24170467196160.0, "grad_norm": 2.4429360498363986, "language_loss": 0.75690198, "learning_rate": 3.7656350772023177e-06, "loss": 0.778961, "num_input_tokens_seen": 65082110, "step": 3010, "time_per_iteration": 2.6060268878936768 }, { "auxiliary_loss_clip": 0.01127916, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.05715132, "balance_loss_mlp": 1.02063942, "epoch": 0.18103111378325568, "flos": 21650902669440.0, "grad_norm": 1.6324915654296899, "language_loss": 0.67356348, "learning_rate": 3.7654521073135553e-06, "loss": 0.69522083, "num_input_tokens_seen": 65101985, "step": 3011, "time_per_iteration": 2.763596534729004 }, { "auxiliary_loss_clip": 0.01105034, "auxiliary_loss_mlp": 0.00777475, "balance_loss_clip": 1.04540467, "balance_loss_mlp": 1.00078559, "epoch": 0.18109123703592364, "flos": 53686918356480.0, "grad_norm": 1.551526807882757, "language_loss": 0.71288514, "learning_rate": 3.7652690704786723e-06, "loss": 0.73171026, "num_input_tokens_seen": 65129295, "step": 3012, "time_per_iteration": 3.037775993347168 }, { "auxiliary_loss_clip": 0.01132189, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.05564284, "balance_loss_mlp": 1.03348863, "epoch": 0.1811513602885916, "flos": 35845564325760.0, "grad_norm": 2.095737131475866, "language_loss": 0.62309992, "learning_rate": 3.765085966704609e-06, "loss": 0.64494264, "num_input_tokens_seen": 65150625, "step": 3013, "time_per_iteration": 2.7692227363586426 }, { "auxiliary_loss_clip": 0.01131323, "auxiliary_loss_mlp": 0.0105253, "balance_loss_clip": 1.05343401, "balance_loss_mlp": 1.03486276, "epoch": 0.18121148354125957, "flos": 23732572492800.0, "grad_norm": 1.6679267545988328, "language_loss": 0.76147234, "learning_rate": 3.764902795998309e-06, "loss": 0.78331089, "num_input_tokens_seen": 65170880, "step": 3014, "time_per_iteration": 2.7296786308288574 }, { "auxiliary_loss_clip": 0.01163543, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.05964816, "balance_loss_mlp": 1.02987087, "epoch": 0.18127160679392756, "flos": 28728320895360.0, "grad_norm": 2.1234423596691796, "language_loss": 0.66310829, "learning_rate": 3.7647195583667184e-06, "loss": 0.6852442, "num_input_tokens_seen": 65192530, "step": 3015, "time_per_iteration": 2.7575571537017822 }, { "auxiliary_loss_clip": 0.0113004, "auxiliary_loss_mlp": 0.00776613, "balance_loss_clip": 1.05429327, "balance_loss_mlp": 1.00067461, "epoch": 0.18133173004659553, "flos": 20485062938880.0, "grad_norm": 1.7837261279259933, "language_loss": 0.78152305, "learning_rate": 3.764536253816785e-06, "loss": 0.80058956, "num_input_tokens_seen": 65211675, "step": 3016, "time_per_iteration": 2.6718828678131104 }, { "auxiliary_loss_clip": 0.01145073, "auxiliary_loss_mlp": 0.01049504, "balance_loss_clip": 1.05684161, "balance_loss_mlp": 1.03068125, "epoch": 0.1813918532992635, "flos": 22852078404480.0, "grad_norm": 1.7248072345223011, "language_loss": 0.8351965, "learning_rate": 3.7643528823554602e-06, "loss": 0.85714233, "num_input_tokens_seen": 65231185, "step": 3017, "time_per_iteration": 2.6879045963287354 }, { "auxiliary_loss_clip": 0.0114091, "auxiliary_loss_mlp": 0.01042994, "balance_loss_clip": 1.05404854, "balance_loss_mlp": 1.02539897, "epoch": 0.18145197655193146, "flos": 36065122208640.0, "grad_norm": 2.2664795482488924, "language_loss": 0.6769017, "learning_rate": 3.764169443989697e-06, "loss": 0.69874066, "num_input_tokens_seen": 65251645, "step": 3018, "time_per_iteration": 4.31333327293396 }, { "auxiliary_loss_clip": 0.01147629, "auxiliary_loss_mlp": 0.00776661, "balance_loss_clip": 1.05706179, "balance_loss_mlp": 1.00074184, "epoch": 0.18151209980459942, "flos": 24023951619840.0, "grad_norm": 1.8935259017451227, "language_loss": 0.76396847, "learning_rate": 3.7639859387264518e-06, "loss": 0.78321135, "num_input_tokens_seen": 65271125, "step": 3019, "time_per_iteration": 2.7667160034179688 }, { "auxiliary_loss_clip": 0.01121465, "auxiliary_loss_mlp": 0.01046742, "balance_loss_clip": 1.05550635, "balance_loss_mlp": 1.02722728, "epoch": 0.1815722230572674, "flos": 23951627585280.0, "grad_norm": 2.042490471678265, "language_loss": 0.81550395, "learning_rate": 3.7638023665726834e-06, "loss": 0.83718598, "num_input_tokens_seen": 65290600, "step": 3020, "time_per_iteration": 4.3900346755981445 }, { "auxiliary_loss_clip": 0.01136424, "auxiliary_loss_mlp": 0.01046217, "balance_loss_clip": 1.05758023, "balance_loss_mlp": 1.02567708, "epoch": 0.18163234630993536, "flos": 24386469632640.0, "grad_norm": 1.9628186536024828, "language_loss": 0.7757082, "learning_rate": 3.763618727535352e-06, "loss": 0.79753458, "num_input_tokens_seen": 65311040, "step": 3021, "time_per_iteration": 4.3029396533966064 }, { "auxiliary_loss_clip": 0.01143245, "auxiliary_loss_mlp": 0.01047278, "balance_loss_clip": 1.05453348, "balance_loss_mlp": 1.02907431, "epoch": 0.18169246956260335, "flos": 24681332378880.0, "grad_norm": 1.725306643191844, "language_loss": 0.84863859, "learning_rate": 3.763435021621422e-06, "loss": 0.87054378, "num_input_tokens_seen": 65332115, "step": 3022, "time_per_iteration": 2.7353312969207764 }, { "auxiliary_loss_clip": 0.01132435, "auxiliary_loss_mlp": 0.01042747, "balance_loss_clip": 1.05769348, "balance_loss_mlp": 1.0235188, "epoch": 0.1817525928152713, "flos": 24243294021120.0, "grad_norm": 2.230341519134859, "language_loss": 0.69367266, "learning_rate": 3.763251248837859e-06, "loss": 0.71542448, "num_input_tokens_seen": 65352210, "step": 3023, "time_per_iteration": 2.775200605392456 }, { "auxiliary_loss_clip": 0.01127605, "auxiliary_loss_mlp": 0.01043947, "balance_loss_clip": 1.04900002, "balance_loss_mlp": 1.02556491, "epoch": 0.18181271606793928, "flos": 16472081623680.0, "grad_norm": 2.150764188548567, "language_loss": 0.74107385, "learning_rate": 3.7630674091916317e-06, "loss": 0.76278937, "num_input_tokens_seen": 65370600, "step": 3024, "time_per_iteration": 2.7364041805267334 }, { "auxiliary_loss_clip": 0.01145205, "auxiliary_loss_mlp": 0.01046837, "balance_loss_clip": 1.05719447, "balance_loss_mlp": 1.02900314, "epoch": 0.18187283932060724, "flos": 18581042805120.0, "grad_norm": 2.148591016046099, "language_loss": 0.8835662, "learning_rate": 3.7628835026897123e-06, "loss": 0.90548658, "num_input_tokens_seen": 65387270, "step": 3025, "time_per_iteration": 4.274658679962158 }, { "auxiliary_loss_clip": 0.01133667, "auxiliary_loss_mlp": 0.01050575, "balance_loss_clip": 1.05470932, "balance_loss_mlp": 1.03137028, "epoch": 0.1819329625732752, "flos": 20266833859200.0, "grad_norm": 3.6399614210311206, "language_loss": 0.79041791, "learning_rate": 3.7626995293390735e-06, "loss": 0.81226033, "num_input_tokens_seen": 65406550, "step": 3026, "time_per_iteration": 2.7589778900146484 }, { "auxiliary_loss_clip": 0.01132736, "auxiliary_loss_mlp": 0.01055367, "balance_loss_clip": 1.05774415, "balance_loss_mlp": 1.03679442, "epoch": 0.18199308582594317, "flos": 25915186512000.0, "grad_norm": 1.6980721374313217, "language_loss": 0.759978, "learning_rate": 3.762515489146692e-06, "loss": 0.78185904, "num_input_tokens_seen": 65425955, "step": 3027, "time_per_iteration": 2.7347826957702637 }, { "auxiliary_loss_clip": 0.01163558, "auxiliary_loss_mlp": 0.01053369, "balance_loss_clip": 1.05835891, "balance_loss_mlp": 1.03378284, "epoch": 0.18205320907861114, "flos": 15377524433280.0, "grad_norm": 2.2893837743041368, "language_loss": 0.85592651, "learning_rate": 3.762331382119546e-06, "loss": 0.87809575, "num_input_tokens_seen": 65442820, "step": 3028, "time_per_iteration": 2.598905563354492 }, { "auxiliary_loss_clip": 0.01156921, "auxiliary_loss_mlp": 0.0104449, "balance_loss_clip": 1.0578618, "balance_loss_mlp": 1.0260129, "epoch": 0.18211333233127913, "flos": 25624310175360.0, "grad_norm": 1.8897570500397638, "language_loss": 0.82807779, "learning_rate": 3.7621472082646183e-06, "loss": 0.85009193, "num_input_tokens_seen": 65461825, "step": 3029, "time_per_iteration": 2.677332639694214 }, { "auxiliary_loss_clip": 0.01114993, "auxiliary_loss_mlp": 0.01050232, "balance_loss_clip": 1.05223596, "balance_loss_mlp": 1.02931094, "epoch": 0.1821734555839471, "flos": 14976007228800.0, "grad_norm": 10.840079090220346, "language_loss": 0.78091359, "learning_rate": 3.761962967588891e-06, "loss": 0.80256593, "num_input_tokens_seen": 65479480, "step": 3030, "time_per_iteration": 2.6865499019622803 }, { "auxiliary_loss_clip": 0.01139676, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.05401075, "balance_loss_mlp": 1.0240562, "epoch": 0.18223357883661506, "flos": 20194007034240.0, "grad_norm": 2.05958060196279, "language_loss": 0.85162055, "learning_rate": 3.761778660099352e-06, "loss": 0.87345004, "num_input_tokens_seen": 65497775, "step": 3031, "time_per_iteration": 2.6336488723754883 }, { "auxiliary_loss_clip": 0.01116657, "auxiliary_loss_mlp": 0.00776186, "balance_loss_clip": 1.0497843, "balance_loss_mlp": 1.00052071, "epoch": 0.18229370208928303, "flos": 15231978524160.0, "grad_norm": 1.83501853384953, "language_loss": 0.79992211, "learning_rate": 3.76159428580299e-06, "loss": 0.81885058, "num_input_tokens_seen": 65516505, "step": 3032, "time_per_iteration": 2.6879780292510986 }, { "auxiliary_loss_clip": 0.01166412, "auxiliary_loss_mlp": 0.01048902, "balance_loss_clip": 1.06163025, "balance_loss_mlp": 1.03038836, "epoch": 0.182353825341951, "flos": 23840483927040.0, "grad_norm": 1.8132660189598853, "language_loss": 0.81316388, "learning_rate": 3.761409844706795e-06, "loss": 0.83531702, "num_input_tokens_seen": 65536160, "step": 3033, "time_per_iteration": 2.628100872039795 }, { "auxiliary_loss_clip": 0.01048591, "auxiliary_loss_mlp": 0.0100128, "balance_loss_clip": 1.05392861, "balance_loss_mlp": 0.99850291, "epoch": 0.18241394859461896, "flos": 61190957393280.0, "grad_norm": 0.8825814513625035, "language_loss": 0.63439631, "learning_rate": 3.7612253368177625e-06, "loss": 0.65489495, "num_input_tokens_seen": 65589375, "step": 3034, "time_per_iteration": 3.2329187393188477 }, { "auxiliary_loss_clip": 0.0112853, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.05698252, "balance_loss_mlp": 1.02384114, "epoch": 0.18247407184728695, "flos": 18471694826880.0, "grad_norm": 3.107937736318082, "language_loss": 0.79893476, "learning_rate": 3.7610407621428893e-06, "loss": 0.82063049, "num_input_tokens_seen": 65606720, "step": 3035, "time_per_iteration": 2.7644357681274414 }, { "auxiliary_loss_clip": 0.01134115, "auxiliary_loss_mlp": 0.01046396, "balance_loss_clip": 1.05675578, "balance_loss_mlp": 1.02906322, "epoch": 0.18253419509995492, "flos": 21795191602560.0, "grad_norm": 1.870086430131469, "language_loss": 0.85076666, "learning_rate": 3.7608561206891735e-06, "loss": 0.87257177, "num_input_tokens_seen": 65625495, "step": 3036, "time_per_iteration": 2.7102303504943848 }, { "auxiliary_loss_clip": 0.01140083, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.05572963, "balance_loss_mlp": 1.02192414, "epoch": 0.18259431835262288, "flos": 20149764456960.0, "grad_norm": 2.1821496235124727, "language_loss": 0.80254716, "learning_rate": 3.760671412463617e-06, "loss": 0.82433879, "num_input_tokens_seen": 65643515, "step": 3037, "time_per_iteration": 2.6703832149505615 }, { "auxiliary_loss_clip": 0.01139652, "auxiliary_loss_mlp": 0.00776941, "balance_loss_clip": 1.05986989, "balance_loss_mlp": 1.00062871, "epoch": 0.18265444160529085, "flos": 16981653916800.0, "grad_norm": 3.0764011293768023, "language_loss": 0.7950514, "learning_rate": 3.7604866374732246e-06, "loss": 0.81421733, "num_input_tokens_seen": 65658155, "step": 3038, "time_per_iteration": 2.7410895824432373 }, { "auxiliary_loss_clip": 0.01125628, "auxiliary_loss_mlp": 0.01044597, "balance_loss_clip": 1.05254972, "balance_loss_mlp": 1.02551126, "epoch": 0.1827145648579588, "flos": 34423250509440.0, "grad_norm": 1.9524772610579864, "language_loss": 0.67722493, "learning_rate": 3.7603017957250023e-06, "loss": 0.69892722, "num_input_tokens_seen": 65679310, "step": 3039, "time_per_iteration": 2.756833076477051 }, { "auxiliary_loss_clip": 0.0113051, "auxiliary_loss_mlp": 0.01051065, "balance_loss_clip": 1.053087, "balance_loss_mlp": 1.03304029, "epoch": 0.18277468811062678, "flos": 53287017264000.0, "grad_norm": 1.8757227718998248, "language_loss": 0.73394251, "learning_rate": 3.7601168872259593e-06, "loss": 0.75575823, "num_input_tokens_seen": 65705235, "step": 3040, "time_per_iteration": 3.026679039001465 }, { "auxiliary_loss_clip": 0.01143558, "auxiliary_loss_mlp": 0.01042261, "balance_loss_clip": 1.05585194, "balance_loss_mlp": 1.02373624, "epoch": 0.18283481136329474, "flos": 31650659602560.0, "grad_norm": 2.017308993436446, "language_loss": 0.60348576, "learning_rate": 3.7599319119831075e-06, "loss": 0.62534392, "num_input_tokens_seen": 65727575, "step": 3041, "time_per_iteration": 2.738554000854492 }, { "auxiliary_loss_clip": 0.01116972, "auxiliary_loss_mlp": 0.01053827, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.03544497, "epoch": 0.18289493461596273, "flos": 53137664513280.0, "grad_norm": 2.3558133433802104, "language_loss": 0.59825706, "learning_rate": 3.7597468700034616e-06, "loss": 0.61996508, "num_input_tokens_seen": 65751370, "step": 3042, "time_per_iteration": 3.0009193420410156 }, { "auxiliary_loss_clip": 0.0112422, "auxiliary_loss_mlp": 0.01046569, "balance_loss_clip": 1.05319464, "balance_loss_mlp": 1.02917695, "epoch": 0.1829550578686307, "flos": 25589369220480.0, "grad_norm": 1.5313119565207096, "language_loss": 0.8757726, "learning_rate": 3.7595617612940374e-06, "loss": 0.89748049, "num_input_tokens_seen": 65771040, "step": 3043, "time_per_iteration": 2.7406487464904785 }, { "auxiliary_loss_clip": 0.01056788, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.04592645, "balance_loss_mlp": 1.03712869, "epoch": 0.18301518112129866, "flos": 22601422321920.0, "grad_norm": 2.144378235575635, "language_loss": 0.70980251, "learning_rate": 3.7593765858618552e-06, "loss": 0.73093396, "num_input_tokens_seen": 65789345, "step": 3044, "time_per_iteration": 2.785931348800659 }, { "auxiliary_loss_clip": 0.01105073, "auxiliary_loss_mlp": 0.01059118, "balance_loss_clip": 1.05111921, "balance_loss_mlp": 1.0381608, "epoch": 0.18307530437396663, "flos": 34020799551360.0, "grad_norm": 3.097061979225562, "language_loss": 0.64460731, "learning_rate": 3.7591913437139365e-06, "loss": 0.66624922, "num_input_tokens_seen": 65810990, "step": 3045, "time_per_iteration": 2.8085720539093018 }, { "auxiliary_loss_clip": 0.01155246, "auxiliary_loss_mlp": 0.01044973, "balance_loss_clip": 1.05604315, "balance_loss_mlp": 1.02780676, "epoch": 0.1831354276266346, "flos": 21279765392640.0, "grad_norm": 11.455833434854163, "language_loss": 0.78461385, "learning_rate": 3.7590060348573066e-06, "loss": 0.80661607, "num_input_tokens_seen": 65827230, "step": 3046, "time_per_iteration": 2.603299140930176 }, { "auxiliary_loss_clip": 0.01118725, "auxiliary_loss_mlp": 0.01042864, "balance_loss_clip": 1.04837, "balance_loss_mlp": 1.0240643, "epoch": 0.18319555087930256, "flos": 21032952065280.0, "grad_norm": 1.9889932097770582, "language_loss": 0.78733194, "learning_rate": 3.7588206592989903e-06, "loss": 0.8089478, "num_input_tokens_seen": 65845900, "step": 3047, "time_per_iteration": 2.7109453678131104 }, { "auxiliary_loss_clip": 0.01144516, "auxiliary_loss_mlp": 0.01042422, "balance_loss_clip": 1.05723858, "balance_loss_mlp": 1.0254705, "epoch": 0.18325567413197055, "flos": 34382958428160.0, "grad_norm": 1.5191744259185578, "language_loss": 0.80704039, "learning_rate": 3.7586352170460194e-06, "loss": 0.82890975, "num_input_tokens_seen": 65868730, "step": 3048, "time_per_iteration": 2.7485053539276123 }, { "auxiliary_loss_clip": 0.01139433, "auxiliary_loss_mlp": 0.01046004, "balance_loss_clip": 1.05405188, "balance_loss_mlp": 1.02552414, "epoch": 0.18331579738463852, "flos": 20558464381440.0, "grad_norm": 2.1437824577601354, "language_loss": 0.86579728, "learning_rate": 3.758449708105424e-06, "loss": 0.88765168, "num_input_tokens_seen": 65888420, "step": 3049, "time_per_iteration": 2.6876962184906006 }, { "auxiliary_loss_clip": 0.01143881, "auxiliary_loss_mlp": 0.01045208, "balance_loss_clip": 1.05379057, "balance_loss_mlp": 1.02544308, "epoch": 0.18337592063730648, "flos": 19607872901760.0, "grad_norm": 2.616661567020713, "language_loss": 0.77827966, "learning_rate": 3.75826413248424e-06, "loss": 0.80017054, "num_input_tokens_seen": 65905840, "step": 3050, "time_per_iteration": 2.5814058780670166 }, { "auxiliary_loss_clip": 0.01126116, "auxiliary_loss_mlp": 0.01041302, "balance_loss_clip": 1.04954183, "balance_loss_mlp": 1.0238502, "epoch": 0.18343604388997445, "flos": 20850885002880.0, "grad_norm": 2.3686375880611656, "language_loss": 0.99064422, "learning_rate": 3.7580784901895035e-06, "loss": 1.01231837, "num_input_tokens_seen": 65922845, "step": 3051, "time_per_iteration": 2.701848268508911 }, { "auxiliary_loss_clip": 0.01125492, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.05189931, "balance_loss_mlp": 1.02078128, "epoch": 0.1834961671426424, "flos": 24394370624640.0, "grad_norm": 2.0338529701436237, "language_loss": 0.8607648, "learning_rate": 3.7578927812282542e-06, "loss": 0.88241673, "num_input_tokens_seen": 65945555, "step": 3052, "time_per_iteration": 2.7252042293548584 }, { "auxiliary_loss_clip": 0.01152967, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.05449986, "balance_loss_mlp": 1.02737474, "epoch": 0.18355629039531038, "flos": 21251612108160.0, "grad_norm": 1.8649432496703628, "language_loss": 0.73393309, "learning_rate": 3.7577070056075356e-06, "loss": 0.7559092, "num_input_tokens_seen": 65963965, "step": 3053, "time_per_iteration": 2.6331369876861572 }, { "auxiliary_loss_clip": 0.01158728, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.05783379, "balance_loss_mlp": 1.02565801, "epoch": 0.18361641364797834, "flos": 28656499651200.0, "grad_norm": 1.5358769917973574, "language_loss": 0.61891186, "learning_rate": 3.7575211633343902e-06, "loss": 0.64093965, "num_input_tokens_seen": 65985965, "step": 3054, "time_per_iteration": 2.6792421340942383 }, { "auxiliary_loss_clip": 0.01108826, "auxiliary_loss_mlp": 0.01042654, "balance_loss_clip": 1.05558836, "balance_loss_mlp": 1.02502322, "epoch": 0.18367653690064634, "flos": 20918827578240.0, "grad_norm": 2.2474279661883667, "language_loss": 0.78218341, "learning_rate": 3.7573352544158663e-06, "loss": 0.80369824, "num_input_tokens_seen": 66005645, "step": 3055, "time_per_iteration": 2.778691053390503 }, { "auxiliary_loss_clip": 0.01096638, "auxiliary_loss_mlp": 0.01050677, "balance_loss_clip": 1.05003095, "balance_loss_mlp": 1.03211594, "epoch": 0.1837366601533143, "flos": 28765596234240.0, "grad_norm": 1.8043720478204575, "language_loss": 0.7022509, "learning_rate": 3.757149278859014e-06, "loss": 0.72372401, "num_input_tokens_seen": 66025675, "step": 3056, "time_per_iteration": 2.794254779815674 }, { "auxiliary_loss_clip": 0.01140367, "auxiliary_loss_mlp": 0.01038358, "balance_loss_clip": 1.05211461, "balance_loss_mlp": 1.02181149, "epoch": 0.18379678340598227, "flos": 21251432540160.0, "grad_norm": 1.8709784760841586, "language_loss": 0.80357504, "learning_rate": 3.7569632366708842e-06, "loss": 0.82536227, "num_input_tokens_seen": 66046125, "step": 3057, "time_per_iteration": 2.644728899002075 }, { "auxiliary_loss_clip": 0.01150041, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.05482352, "balance_loss_mlp": 1.02332497, "epoch": 0.18385690665865023, "flos": 20449619193600.0, "grad_norm": 7.225766788646501, "language_loss": 0.82570755, "learning_rate": 3.756777127858533e-06, "loss": 0.84764576, "num_input_tokens_seen": 66064375, "step": 3058, "time_per_iteration": 4.136845588684082 }, { "auxiliary_loss_clip": 0.01119139, "auxiliary_loss_mlp": 0.00776668, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.00066566, "epoch": 0.1839170299113182, "flos": 26140562398080.0, "grad_norm": 2.277694088171661, "language_loss": 0.85071868, "learning_rate": 3.756590952429017e-06, "loss": 0.86967677, "num_input_tokens_seen": 66084590, "step": 3059, "time_per_iteration": 2.745020866394043 }, { "auxiliary_loss_clip": 0.01151831, "auxiliary_loss_mlp": 0.00775088, "balance_loss_clip": 1.05359423, "balance_loss_mlp": 1.00077426, "epoch": 0.18397715316398616, "flos": 31758032332800.0, "grad_norm": 2.3540516696336216, "language_loss": 0.72983348, "learning_rate": 3.756404710389396e-06, "loss": 0.74910271, "num_input_tokens_seen": 66107105, "step": 3060, "time_per_iteration": 5.792214393615723 }, { "auxiliary_loss_clip": 0.01149482, "auxiliary_loss_mlp": 0.01041417, "balance_loss_clip": 1.05812132, "balance_loss_mlp": 1.02266574, "epoch": 0.18403727641665413, "flos": 24611989173120.0, "grad_norm": 1.5810457302838978, "language_loss": 0.73126459, "learning_rate": 3.7562184017467323e-06, "loss": 0.75317359, "num_input_tokens_seen": 66129295, "step": 3061, "time_per_iteration": 2.754167318344116 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.05435956, "balance_loss_mlp": 1.02379823, "epoch": 0.18409739966932212, "flos": 23439900476160.0, "grad_norm": 1.8413104246803462, "language_loss": 0.81937188, "learning_rate": 3.7560320265080906e-06, "loss": 0.8411907, "num_input_tokens_seen": 66146910, "step": 3062, "time_per_iteration": 2.7545394897460938 }, { "auxiliary_loss_clip": 0.01144664, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05668104, "balance_loss_mlp": 1.02259111, "epoch": 0.18415752292199009, "flos": 21872112577920.0, "grad_norm": 2.011374259171591, "language_loss": 0.72994816, "learning_rate": 3.7558455846805383e-06, "loss": 0.75180125, "num_input_tokens_seen": 66165370, "step": 3063, "time_per_iteration": 2.738293170928955 }, { "auxiliary_loss_clip": 0.01133824, "auxiliary_loss_mlp": 0.01040987, "balance_loss_clip": 1.05164194, "balance_loss_mlp": 1.02490544, "epoch": 0.18421764617465805, "flos": 25410678036480.0, "grad_norm": 2.2975785147287953, "language_loss": 0.65614092, "learning_rate": 3.7556590762711463e-06, "loss": 0.67788899, "num_input_tokens_seen": 66186210, "step": 3064, "time_per_iteration": 4.404583930969238 }, { "auxiliary_loss_clip": 0.01141547, "auxiliary_loss_mlp": 0.01042996, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.02498376, "epoch": 0.18427776942732602, "flos": 27198131558400.0, "grad_norm": 2.1874829734431898, "language_loss": 0.68347883, "learning_rate": 3.7554725012869853e-06, "loss": 0.70532429, "num_input_tokens_seen": 66204800, "step": 3065, "time_per_iteration": 2.7149577140808105 }, { "auxiliary_loss_clip": 0.01136969, "auxiliary_loss_mlp": 0.01045319, "balance_loss_clip": 1.05518305, "balance_loss_mlp": 1.02674615, "epoch": 0.18433789267999398, "flos": 27852351920640.0, "grad_norm": 2.2758854533642925, "language_loss": 0.73142231, "learning_rate": 3.7552858597351318e-06, "loss": 0.75324523, "num_input_tokens_seen": 66222195, "step": 3066, "time_per_iteration": 2.672675609588623 }, { "auxiliary_loss_clip": 0.01125186, "auxiliary_loss_mlp": 0.01043389, "balance_loss_clip": 1.04947495, "balance_loss_mlp": 1.0256983, "epoch": 0.18439801593266195, "flos": 17856940533120.0, "grad_norm": 2.1067167513095444, "language_loss": 0.82191038, "learning_rate": 3.7550991516226622e-06, "loss": 0.8435961, "num_input_tokens_seen": 66239505, "step": 3067, "time_per_iteration": 2.697768211364746 }, { "auxiliary_loss_clip": 0.01082345, "auxiliary_loss_mlp": 0.00756782, "balance_loss_clip": 1.04466891, "balance_loss_mlp": 1.00113225, "epoch": 0.18445813918532994, "flos": 56389522590720.0, "grad_norm": 0.7960107429271657, "language_loss": 0.59750569, "learning_rate": 3.754912376956657e-06, "loss": 0.61589694, "num_input_tokens_seen": 66295695, "step": 3068, "time_per_iteration": 3.0305213928222656 }, { "auxiliary_loss_clip": 0.01127048, "auxiliary_loss_mlp": 0.01041294, "balance_loss_clip": 1.05452299, "balance_loss_mlp": 1.02356791, "epoch": 0.1845182624379979, "flos": 20957180325120.0, "grad_norm": 3.7299324256794244, "language_loss": 0.76434112, "learning_rate": 3.7547255357441987e-06, "loss": 0.78602457, "num_input_tokens_seen": 66315315, "step": 3069, "time_per_iteration": 2.6757962703704834 }, { "auxiliary_loss_clip": 0.01146412, "auxiliary_loss_mlp": 0.010456, "balance_loss_clip": 1.05468106, "balance_loss_mlp": 1.02798057, "epoch": 0.18457838569066587, "flos": 20485170679680.0, "grad_norm": 1.9225240149566294, "language_loss": 0.8491416, "learning_rate": 3.7545386279923718e-06, "loss": 0.87106168, "num_input_tokens_seen": 66333675, "step": 3070, "time_per_iteration": 2.617023229598999 }, { "auxiliary_loss_clip": 0.01127789, "auxiliary_loss_mlp": 0.01043452, "balance_loss_clip": 1.0553112, "balance_loss_mlp": 1.02510571, "epoch": 0.18463850894333383, "flos": 25010022758400.0, "grad_norm": 6.700503585098448, "language_loss": 0.77807182, "learning_rate": 3.754351653708265e-06, "loss": 0.79978424, "num_input_tokens_seen": 66354075, "step": 3071, "time_per_iteration": 2.847329616546631 }, { "auxiliary_loss_clip": 0.01109458, "auxiliary_loss_mlp": 0.01049978, "balance_loss_clip": 1.05054557, "balance_loss_mlp": 1.03154778, "epoch": 0.1846986321960018, "flos": 16800628348800.0, "grad_norm": 2.0836336776071565, "language_loss": 0.77414191, "learning_rate": 3.7541646128989674e-06, "loss": 0.79573631, "num_input_tokens_seen": 66372520, "step": 3072, "time_per_iteration": 2.780921220779419 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01043594, "balance_loss_clip": 1.05106127, "balance_loss_mlp": 1.02465141, "epoch": 0.18475875544866976, "flos": 20814327936000.0, "grad_norm": 4.959080593148226, "language_loss": 0.86546457, "learning_rate": 3.7539775055715715e-06, "loss": 0.88729048, "num_input_tokens_seen": 66390745, "step": 3073, "time_per_iteration": 2.631913661956787 }, { "auxiliary_loss_clip": 0.01158717, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.05862749, "balance_loss_mlp": 1.02366686, "epoch": 0.18481887870133773, "flos": 22601422321920.0, "grad_norm": 2.162700927804164, "language_loss": 0.91831195, "learning_rate": 3.7537903317331732e-06, "loss": 0.94030046, "num_input_tokens_seen": 66410525, "step": 3074, "time_per_iteration": 2.6152567863464355 }, { "auxiliary_loss_clip": 0.01104968, "auxiliary_loss_mlp": 0.01047718, "balance_loss_clip": 1.04757643, "balance_loss_mlp": 1.02763104, "epoch": 0.18487900195400572, "flos": 29458815788160.0, "grad_norm": 1.9967983521568784, "language_loss": 0.64783108, "learning_rate": 3.75360309139087e-06, "loss": 0.66935796, "num_input_tokens_seen": 66432535, "step": 3075, "time_per_iteration": 2.763559103012085 }, { "auxiliary_loss_clip": 0.01135247, "auxiliary_loss_mlp": 0.01046601, "balance_loss_clip": 1.05689573, "balance_loss_mlp": 1.02913702, "epoch": 0.1849391252066737, "flos": 20628777254400.0, "grad_norm": 1.8996898495981898, "language_loss": 0.72803432, "learning_rate": 3.753415784551761e-06, "loss": 0.74985278, "num_input_tokens_seen": 66450620, "step": 3076, "time_per_iteration": 2.76629376411438 }, { "auxiliary_loss_clip": 0.01124833, "auxiliary_loss_mlp": 0.01042344, "balance_loss_clip": 1.0584389, "balance_loss_mlp": 1.0249157, "epoch": 0.18499924845934165, "flos": 14428549065600.0, "grad_norm": 2.4862024108169556, "language_loss": 0.80772626, "learning_rate": 3.7532284112229507e-06, "loss": 0.82939804, "num_input_tokens_seen": 66467865, "step": 3077, "time_per_iteration": 2.7296142578125 }, { "auxiliary_loss_clip": 0.01128471, "auxiliary_loss_mlp": 0.01041495, "balance_loss_clip": 1.05401397, "balance_loss_mlp": 1.02428079, "epoch": 0.18505937171200962, "flos": 23727652329600.0, "grad_norm": 1.8214336253769514, "language_loss": 0.78693211, "learning_rate": 3.7530409714115424e-06, "loss": 0.80863178, "num_input_tokens_seen": 66486245, "step": 3078, "time_per_iteration": 2.715838670730591 }, { "auxiliary_loss_clip": 0.01154963, "auxiliary_loss_mlp": 0.01043373, "balance_loss_clip": 1.05546641, "balance_loss_mlp": 1.02655268, "epoch": 0.18511949496467758, "flos": 25957489754880.0, "grad_norm": 1.7455066055145632, "language_loss": 0.77326959, "learning_rate": 3.7528534651246453e-06, "loss": 0.79525292, "num_input_tokens_seen": 66506510, "step": 3079, "time_per_iteration": 2.674128770828247 }, { "auxiliary_loss_clip": 0.01119079, "auxiliary_loss_mlp": 0.01041512, "balance_loss_clip": 1.04717147, "balance_loss_mlp": 1.02328515, "epoch": 0.18517961821734555, "flos": 42413553912960.0, "grad_norm": 1.885086933557342, "language_loss": 0.82143807, "learning_rate": 3.752665892369369e-06, "loss": 0.84304404, "num_input_tokens_seen": 66530960, "step": 3080, "time_per_iteration": 2.906940460205078 }, { "auxiliary_loss_clip": 0.01123637, "auxiliary_loss_mlp": 0.01044031, "balance_loss_clip": 1.05894399, "balance_loss_mlp": 1.02563691, "epoch": 0.18523974147001354, "flos": 24097568544000.0, "grad_norm": 2.065822240576764, "language_loss": 0.73973286, "learning_rate": 3.7524782531528266e-06, "loss": 0.76140958, "num_input_tokens_seen": 66550275, "step": 3081, "time_per_iteration": 2.7960739135742188 }, { "auxiliary_loss_clip": 0.01126977, "auxiliary_loss_mlp": 0.01051674, "balance_loss_clip": 1.05360913, "balance_loss_mlp": 1.03286242, "epoch": 0.1852998647226815, "flos": 27375278457600.0, "grad_norm": 1.9854893879184425, "language_loss": 0.71991849, "learning_rate": 3.7522905474821334e-06, "loss": 0.74170506, "num_input_tokens_seen": 66569040, "step": 3082, "time_per_iteration": 2.6965079307556152 }, { "auxiliary_loss_clip": 0.01124933, "auxiliary_loss_mlp": 0.01046296, "balance_loss_clip": 1.05649543, "balance_loss_mlp": 1.02694798, "epoch": 0.18535998797534947, "flos": 18332757020160.0, "grad_norm": 2.0424653419479886, "language_loss": 0.69580144, "learning_rate": 3.752102775364407e-06, "loss": 0.71751374, "num_input_tokens_seen": 66587775, "step": 3083, "time_per_iteration": 2.727252721786499 }, { "auxiliary_loss_clip": 0.01122388, "auxiliary_loss_mlp": 0.01046999, "balance_loss_clip": 1.05204451, "balance_loss_mlp": 1.02964258, "epoch": 0.18542011122801744, "flos": 37845859887360.0, "grad_norm": 2.185713468975319, "language_loss": 0.68965334, "learning_rate": 3.751914936806767e-06, "loss": 0.71134722, "num_input_tokens_seen": 66610800, "step": 3084, "time_per_iteration": 2.95849871635437 }, { "auxiliary_loss_clip": 0.01155184, "auxiliary_loss_mlp": 0.01043029, "balance_loss_clip": 1.05578482, "balance_loss_mlp": 1.0257436, "epoch": 0.1854802344806854, "flos": 25186128163200.0, "grad_norm": 1.6859724806626923, "language_loss": 0.77390355, "learning_rate": 3.7517270318163377e-06, "loss": 0.79588568, "num_input_tokens_seen": 66630960, "step": 3085, "time_per_iteration": 2.68961501121521 }, { "auxiliary_loss_clip": 0.01152089, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.05316019, "balance_loss_mlp": 1.03142118, "epoch": 0.18554035773335337, "flos": 26684788337280.0, "grad_norm": 1.993169596996871, "language_loss": 0.73752379, "learning_rate": 3.751539060400244e-06, "loss": 0.75953472, "num_input_tokens_seen": 66650585, "step": 3086, "time_per_iteration": 2.652475595474243 }, { "auxiliary_loss_clip": 0.01142754, "auxiliary_loss_mlp": 0.01049865, "balance_loss_clip": 1.05530787, "balance_loss_mlp": 1.03134012, "epoch": 0.18560048098602133, "flos": 22346887570560.0, "grad_norm": 7.927127736744579, "language_loss": 0.69762361, "learning_rate": 3.7513510225656132e-06, "loss": 0.71954978, "num_input_tokens_seen": 66670045, "step": 3087, "time_per_iteration": 2.668849229812622 }, { "auxiliary_loss_clip": 0.01119022, "auxiliary_loss_mlp": 0.01055302, "balance_loss_clip": 1.05543649, "balance_loss_mlp": 1.03546548, "epoch": 0.18566060423868933, "flos": 17748526308480.0, "grad_norm": 2.1117122734340263, "language_loss": 0.72513628, "learning_rate": 3.7511629183195764e-06, "loss": 0.74687952, "num_input_tokens_seen": 66688790, "step": 3088, "time_per_iteration": 2.7150719165802 }, { "auxiliary_loss_clip": 0.0112638, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04933047, "balance_loss_mlp": 1.02616334, "epoch": 0.1857207274913573, "flos": 24677274142080.0, "grad_norm": 2.112009927874319, "language_loss": 0.91859758, "learning_rate": 3.7509747476692663e-06, "loss": 0.94030321, "num_input_tokens_seen": 66708090, "step": 3089, "time_per_iteration": 2.7239248752593994 }, { "auxiliary_loss_clip": 0.01104754, "auxiliary_loss_mlp": 0.01046981, "balance_loss_clip": 1.0494597, "balance_loss_mlp": 1.02919531, "epoch": 0.18578085074402526, "flos": 28147825198080.0, "grad_norm": 2.490831087537115, "language_loss": 0.57275403, "learning_rate": 3.7507865106218176e-06, "loss": 0.59427136, "num_input_tokens_seen": 66727320, "step": 3090, "time_per_iteration": 2.8263309001922607 }, { "auxiliary_loss_clip": 0.01125877, "auxiliary_loss_mlp": 0.0104478, "balance_loss_clip": 1.04981184, "balance_loss_mlp": 1.02636242, "epoch": 0.18584097399669322, "flos": 23951878980480.0, "grad_norm": 1.7797305478565062, "language_loss": 0.81704801, "learning_rate": 3.7505982071843695e-06, "loss": 0.83875453, "num_input_tokens_seen": 66747505, "step": 3091, "time_per_iteration": 2.697525978088379 }, { "auxiliary_loss_clip": 0.01101743, "auxiliary_loss_mlp": 0.01050837, "balance_loss_clip": 1.04999971, "balance_loss_mlp": 1.03277707, "epoch": 0.18590109724936119, "flos": 17201678676480.0, "grad_norm": 2.0826959244757832, "language_loss": 0.83704746, "learning_rate": 3.7504098373640617e-06, "loss": 0.8585732, "num_input_tokens_seen": 66766425, "step": 3092, "time_per_iteration": 2.8379435539245605 }, { "auxiliary_loss_clip": 0.01136846, "auxiliary_loss_mlp": 0.01048758, "balance_loss_clip": 1.05389428, "balance_loss_mlp": 1.03036356, "epoch": 0.18596122050202915, "flos": 17234644383360.0, "grad_norm": 5.439917179387958, "language_loss": 0.93443698, "learning_rate": 3.750221401168038e-06, "loss": 0.95629299, "num_input_tokens_seen": 66781130, "step": 3093, "time_per_iteration": 2.8053483963012695 }, { "auxiliary_loss_clip": 0.01130362, "auxiliary_loss_mlp": 0.01042367, "balance_loss_clip": 1.05440521, "balance_loss_mlp": 1.02464092, "epoch": 0.18602134375469712, "flos": 19020733188480.0, "grad_norm": 1.7318887555782294, "language_loss": 0.77516603, "learning_rate": 3.750032898603443e-06, "loss": 0.7968933, "num_input_tokens_seen": 66797535, "step": 3094, "time_per_iteration": 2.7402310371398926 }, { "auxiliary_loss_clip": 0.0109741, "auxiliary_loss_mlp": 0.01049219, "balance_loss_clip": 1.0519228, "balance_loss_mlp": 1.0323391, "epoch": 0.1860814670073651, "flos": 50950094417280.0, "grad_norm": 1.7033453736007413, "language_loss": 0.69854707, "learning_rate": 3.749844329677425e-06, "loss": 0.72001338, "num_input_tokens_seen": 66821720, "step": 3095, "time_per_iteration": 3.133192777633667 }, { "auxiliary_loss_clip": 0.01113224, "auxiliary_loss_mlp": 0.010546, "balance_loss_clip": 1.0511899, "balance_loss_mlp": 1.03415525, "epoch": 0.18614159026003307, "flos": 19390972625280.0, "grad_norm": 2.2828801406167307, "language_loss": 0.81214821, "learning_rate": 3.749655694397135e-06, "loss": 0.83382642, "num_input_tokens_seen": 66839060, "step": 3096, "time_per_iteration": 2.7599101066589355 }, { "auxiliary_loss_clip": 0.01147399, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.05678356, "balance_loss_mlp": 1.02810192, "epoch": 0.18620171351270104, "flos": 21798782962560.0, "grad_norm": 2.430947734084612, "language_loss": 0.75326216, "learning_rate": 3.7494669927697255e-06, "loss": 0.77520448, "num_input_tokens_seen": 66857760, "step": 3097, "time_per_iteration": 4.255983114242554 }, { "auxiliary_loss_clip": 0.01133757, "auxiliary_loss_mlp": 0.01050365, "balance_loss_clip": 1.05756521, "balance_loss_mlp": 1.03228104, "epoch": 0.186261836765369, "flos": 16362877299840.0, "grad_norm": 2.553895603581972, "language_loss": 0.66602015, "learning_rate": 3.749278224802352e-06, "loss": 0.68786132, "num_input_tokens_seen": 66876460, "step": 3098, "time_per_iteration": 2.723567247390747 }, { "auxiliary_loss_clip": 0.01163461, "auxiliary_loss_mlp": 0.01052357, "balance_loss_clip": 1.05991709, "balance_loss_mlp": 1.03212702, "epoch": 0.18632196001803697, "flos": 23370054480000.0, "grad_norm": 1.6168121451860142, "language_loss": 0.69838905, "learning_rate": 3.7490893905021733e-06, "loss": 0.7205472, "num_input_tokens_seen": 66897960, "step": 3099, "time_per_iteration": 5.687380075454712 }, { "auxiliary_loss_clip": 0.01148363, "auxiliary_loss_mlp": 0.01051556, "balance_loss_clip": 1.05713868, "balance_loss_mlp": 1.03243458, "epoch": 0.18638208327070493, "flos": 22492002516480.0, "grad_norm": 1.7060244708994476, "language_loss": 0.71840072, "learning_rate": 3.7489004898763494e-06, "loss": 0.74039996, "num_input_tokens_seen": 66917675, "step": 3100, "time_per_iteration": 2.6711015701293945 }, { "auxiliary_loss_clip": 0.01138377, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.05749035, "balance_loss_mlp": 1.03133154, "epoch": 0.18644220652337293, "flos": 29165245931520.0, "grad_norm": 1.9639279354826686, "language_loss": 0.80343997, "learning_rate": 3.7487115229320444e-06, "loss": 0.82533038, "num_input_tokens_seen": 66936000, "step": 3101, "time_per_iteration": 2.6996583938598633 }, { "auxiliary_loss_clip": 0.01112778, "auxiliary_loss_mlp": 0.01042097, "balance_loss_clip": 1.05307627, "balance_loss_mlp": 1.02478826, "epoch": 0.1865023297760409, "flos": 24243796811520.0, "grad_norm": 1.8804860702941575, "language_loss": 0.77053607, "learning_rate": 3.7485224896764222e-06, "loss": 0.79208481, "num_input_tokens_seen": 66955700, "step": 3102, "time_per_iteration": 2.726146936416626 }, { "auxiliary_loss_clip": 0.01150817, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.057688, "balance_loss_mlp": 1.0213027, "epoch": 0.18656245302870886, "flos": 19128716449920.0, "grad_norm": 2.314682178811096, "language_loss": 0.76689744, "learning_rate": 3.7483333901166525e-06, "loss": 0.78879869, "num_input_tokens_seen": 66972815, "step": 3103, "time_per_iteration": 4.374122619628906 }, { "auxiliary_loss_clip": 0.01132531, "auxiliary_loss_mlp": 0.0104481, "balance_loss_clip": 1.05477643, "balance_loss_mlp": 1.02671361, "epoch": 0.18662257628137682, "flos": 17786088956160.0, "grad_norm": 1.6956506235876265, "language_loss": 0.79252636, "learning_rate": 3.7481442242599054e-06, "loss": 0.8142997, "num_input_tokens_seen": 66992280, "step": 3104, "time_per_iteration": 2.695012092590332 }, { "auxiliary_loss_clip": 0.01106786, "auxiliary_loss_mlp": 0.01050273, "balance_loss_clip": 1.05117702, "balance_loss_mlp": 1.03096056, "epoch": 0.1866826995340448, "flos": 24024382583040.0, "grad_norm": 2.065624302338532, "language_loss": 0.8496474, "learning_rate": 3.747954992113354e-06, "loss": 0.87121809, "num_input_tokens_seen": 67012220, "step": 3105, "time_per_iteration": 2.761521816253662 }, { "auxiliary_loss_clip": 0.0112324, "auxiliary_loss_mlp": 0.01043689, "balance_loss_clip": 1.05166531, "balance_loss_mlp": 1.02407932, "epoch": 0.18674282278671275, "flos": 26141244756480.0, "grad_norm": 1.8352441384571676, "language_loss": 0.86880243, "learning_rate": 3.7477656936841742e-06, "loss": 0.8904717, "num_input_tokens_seen": 67032030, "step": 3106, "time_per_iteration": 2.785738706588745 }, { "auxiliary_loss_clip": 0.01150222, "auxiliary_loss_mlp": 0.01040973, "balance_loss_clip": 1.0566026, "balance_loss_mlp": 1.02281737, "epoch": 0.18680294603938072, "flos": 19201938324480.0, "grad_norm": 2.128833658771433, "language_loss": 0.78226906, "learning_rate": 3.7475763289795445e-06, "loss": 0.80418098, "num_input_tokens_seen": 67048920, "step": 3107, "time_per_iteration": 2.693995237350464 }, { "auxiliary_loss_clip": 0.01153763, "auxiliary_loss_mlp": 0.01053056, "balance_loss_clip": 1.05873394, "balance_loss_mlp": 1.03341043, "epoch": 0.1868630692920487, "flos": 28544889116160.0, "grad_norm": 3.0927798335187506, "language_loss": 0.74159014, "learning_rate": 3.7473868980066446e-06, "loss": 0.7636584, "num_input_tokens_seen": 67068645, "step": 3108, "time_per_iteration": 2.795715570449829 }, { "auxiliary_loss_clip": 0.01107582, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.05207491, "balance_loss_mlp": 1.02451098, "epoch": 0.18692319254471668, "flos": 17238020261760.0, "grad_norm": 1.6837485322309411, "language_loss": 0.74348569, "learning_rate": 3.747197400772658e-06, "loss": 0.76498872, "num_input_tokens_seen": 67087075, "step": 3109, "time_per_iteration": 2.7627830505371094 }, { "auxiliary_loss_clip": 0.01145572, "auxiliary_loss_mlp": 0.01044117, "balance_loss_clip": 1.05631042, "balance_loss_mlp": 1.02526462, "epoch": 0.18698331579738464, "flos": 23185186156800.0, "grad_norm": 1.499459601293056, "language_loss": 0.84250218, "learning_rate": 3.747007837284772e-06, "loss": 0.86439908, "num_input_tokens_seen": 67108040, "step": 3110, "time_per_iteration": 2.7665328979492188 }, { "auxiliary_loss_clip": 0.01147578, "auxiliary_loss_mlp": 0.01042389, "balance_loss_clip": 1.05929494, "balance_loss_mlp": 1.02381575, "epoch": 0.1870434390500526, "flos": 25516721963520.0, "grad_norm": 1.9108380391903876, "language_loss": 0.84738445, "learning_rate": 3.7468182075501737e-06, "loss": 0.86928415, "num_input_tokens_seen": 67127605, "step": 3111, "time_per_iteration": 2.729233741760254 }, { "auxiliary_loss_clip": 0.01128, "auxiliary_loss_mlp": 0.01044544, "balance_loss_clip": 1.05348754, "balance_loss_mlp": 1.02635229, "epoch": 0.18710356230272057, "flos": 19500823393920.0, "grad_norm": 1.8704338434966796, "language_loss": 0.76875687, "learning_rate": 3.7466285115760536e-06, "loss": 0.79048228, "num_input_tokens_seen": 67145785, "step": 3112, "time_per_iteration": 2.7392494678497314 }, { "auxiliary_loss_clip": 0.0114846, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.05636978, "balance_loss_mlp": 1.02913654, "epoch": 0.18716368555538854, "flos": 26760847386240.0, "grad_norm": 1.8996972204761096, "language_loss": 0.64466536, "learning_rate": 3.7464387493696046e-06, "loss": 0.66662085, "num_input_tokens_seen": 67165930, "step": 3113, "time_per_iteration": 2.7393765449523926 }, { "auxiliary_loss_clip": 0.01153807, "auxiliary_loss_mlp": 0.01048748, "balance_loss_clip": 1.05685568, "balance_loss_mlp": 1.02900672, "epoch": 0.1872238088080565, "flos": 25189827264000.0, "grad_norm": 6.483287708452815, "language_loss": 0.817972, "learning_rate": 3.746248920938024e-06, "loss": 0.83999759, "num_input_tokens_seen": 67185830, "step": 3114, "time_per_iteration": 2.740229368209839 }, { "auxiliary_loss_clip": 0.01104278, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.04921412, "balance_loss_mlp": 1.03024614, "epoch": 0.1872839320607245, "flos": 24134305178880.0, "grad_norm": 2.3064843449079175, "language_loss": 0.57413173, "learning_rate": 3.74605902628851e-06, "loss": 0.59568191, "num_input_tokens_seen": 67206930, "step": 3115, "time_per_iteration": 2.811549663543701 }, { "auxiliary_loss_clip": 0.01123025, "auxiliary_loss_mlp": 0.01052226, "balance_loss_clip": 1.05446446, "balance_loss_mlp": 1.03241396, "epoch": 0.18734405531339246, "flos": 21173793292800.0, "grad_norm": 2.577640519639585, "language_loss": 0.70842528, "learning_rate": 3.745869065428261e-06, "loss": 0.73017788, "num_input_tokens_seen": 67226290, "step": 3116, "time_per_iteration": 2.8053951263427734 }, { "auxiliary_loss_clip": 0.0115042, "auxiliary_loss_mlp": 0.01035569, "balance_loss_clip": 1.05196476, "balance_loss_mlp": 1.01787841, "epoch": 0.18740417856606043, "flos": 17237697039360.0, "grad_norm": 3.010261965906642, "language_loss": 0.78994375, "learning_rate": 3.7456790383644833e-06, "loss": 0.81180358, "num_input_tokens_seen": 67244410, "step": 3117, "time_per_iteration": 2.819415330886841 }, { "auxiliary_loss_clip": 0.01132901, "auxiliary_loss_mlp": 0.01049724, "balance_loss_clip": 1.05260777, "balance_loss_mlp": 1.03047204, "epoch": 0.1874643018187284, "flos": 32558049999360.0, "grad_norm": 2.2828109389679865, "language_loss": 0.83903432, "learning_rate": 3.745488945104381e-06, "loss": 0.86086059, "num_input_tokens_seen": 67264470, "step": 3118, "time_per_iteration": 2.783804416656494 }, { "auxiliary_loss_clip": 0.01144867, "auxiliary_loss_mlp": 0.0104452, "balance_loss_clip": 1.05412436, "balance_loss_mlp": 1.02688873, "epoch": 0.18752442507139636, "flos": 23258156636160.0, "grad_norm": 3.566737352043019, "language_loss": 0.76283264, "learning_rate": 3.7452987856551636e-06, "loss": 0.78472656, "num_input_tokens_seen": 67284315, "step": 3119, "time_per_iteration": 2.6872506141662598 }, { "auxiliary_loss_clip": 0.01156835, "auxiliary_loss_mlp": 0.01046653, "balance_loss_clip": 1.05519438, "balance_loss_mlp": 1.02899814, "epoch": 0.18758454832406432, "flos": 21760933006080.0, "grad_norm": 1.7224942549361077, "language_loss": 0.82017547, "learning_rate": 3.7451085600240406e-06, "loss": 0.84221041, "num_input_tokens_seen": 67302780, "step": 3120, "time_per_iteration": 2.637505292892456 }, { "auxiliary_loss_clip": 0.0113033, "auxiliary_loss_mlp": 0.01035538, "balance_loss_clip": 1.05060756, "balance_loss_mlp": 1.01828837, "epoch": 0.1876446715767323, "flos": 29570210841600.0, "grad_norm": 2.5027223446471982, "language_loss": 0.84992659, "learning_rate": 3.7449182682182263e-06, "loss": 0.87158525, "num_input_tokens_seen": 67323405, "step": 3121, "time_per_iteration": 2.788353681564331 }, { "auxiliary_loss_clip": 0.01096681, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.045645, "balance_loss_mlp": 1.02599168, "epoch": 0.18770479482940028, "flos": 30339992234880.0, "grad_norm": 2.1738591443482362, "language_loss": 0.70032287, "learning_rate": 3.744727910244937e-06, "loss": 0.72173256, "num_input_tokens_seen": 67345800, "step": 3122, "time_per_iteration": 3.0225250720977783 }, { "auxiliary_loss_clip": 0.01153439, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.05445123, "balance_loss_mlp": 1.02288795, "epoch": 0.18776491808206824, "flos": 14465357527680.0, "grad_norm": 4.839579375412361, "language_loss": 0.70661515, "learning_rate": 3.7445374861113905e-06, "loss": 0.72857308, "num_input_tokens_seen": 67363575, "step": 3123, "time_per_iteration": 2.779904365539551 }, { "auxiliary_loss_clip": 0.01142265, "auxiliary_loss_mlp": 0.01041425, "balance_loss_clip": 1.05286181, "balance_loss_mlp": 1.02454507, "epoch": 0.1878250413347362, "flos": 24498547044480.0, "grad_norm": 2.057520579072589, "language_loss": 0.74103826, "learning_rate": 3.7443469958248066e-06, "loss": 0.76287514, "num_input_tokens_seen": 67381765, "step": 3124, "time_per_iteration": 2.6336071491241455 }, { "auxiliary_loss_clip": 0.01157579, "auxiliary_loss_mlp": 0.01052509, "balance_loss_clip": 1.05653572, "balance_loss_mlp": 1.03333998, "epoch": 0.18788516458740417, "flos": 39786185692800.0, "grad_norm": 3.0670363966795096, "language_loss": 0.80654436, "learning_rate": 3.7441564393924106e-06, "loss": 0.82864523, "num_input_tokens_seen": 67405000, "step": 3125, "time_per_iteration": 2.7224199771881104 }, { "auxiliary_loss_clip": 0.01046615, "auxiliary_loss_mlp": 0.01006504, "balance_loss_clip": 1.04444218, "balance_loss_mlp": 1.00435853, "epoch": 0.18794528784007214, "flos": 64699250664960.0, "grad_norm": 0.9424570711133922, "language_loss": 0.63647306, "learning_rate": 3.7439658168214273e-06, "loss": 0.65700436, "num_input_tokens_seen": 67467140, "step": 3126, "time_per_iteration": 3.313321113586426 }, { "auxiliary_loss_clip": 0.01128308, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.05377257, "balance_loss_mlp": 1.02236164, "epoch": 0.1880054110927401, "flos": 28622061486720.0, "grad_norm": 1.8734163453478039, "language_loss": 0.81308508, "learning_rate": 3.7437751281190857e-06, "loss": 0.83477271, "num_input_tokens_seen": 67487980, "step": 3127, "time_per_iteration": 2.7137866020202637 }, { "auxiliary_loss_clip": 0.01088267, "auxiliary_loss_mlp": 0.0101138, "balance_loss_clip": 1.04814553, "balance_loss_mlp": 1.00912714, "epoch": 0.1880655343454081, "flos": 64488958490880.0, "grad_norm": 0.7699217277386954, "language_loss": 0.61922526, "learning_rate": 3.7435843732926164e-06, "loss": 0.64022171, "num_input_tokens_seen": 67552500, "step": 3128, "time_per_iteration": 3.264270782470703 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01049422, "balance_loss_clip": 1.04763842, "balance_loss_mlp": 1.02907288, "epoch": 0.18812565759807606, "flos": 32124464928000.0, "grad_norm": 2.4867495334212175, "language_loss": 0.70985162, "learning_rate": 3.7433935523492536e-06, "loss": 0.73142785, "num_input_tokens_seen": 67573295, "step": 3129, "time_per_iteration": 2.79929256439209 }, { "auxiliary_loss_clip": 0.01158485, "auxiliary_loss_mlp": 0.01050611, "balance_loss_clip": 1.05767536, "balance_loss_mlp": 1.03109634, "epoch": 0.18818578085074403, "flos": 20624539449600.0, "grad_norm": 2.4831518001798676, "language_loss": 0.85035253, "learning_rate": 3.7432026652962314e-06, "loss": 0.87244344, "num_input_tokens_seen": 67590010, "step": 3130, "time_per_iteration": 2.60624361038208 }, { "auxiliary_loss_clip": 0.01107202, "auxiliary_loss_mlp": 0.01049966, "balance_loss_clip": 1.04649067, "balance_loss_mlp": 1.03023696, "epoch": 0.188245904103412, "flos": 28840506048000.0, "grad_norm": 9.096753382647533, "language_loss": 0.7643525, "learning_rate": 3.7430117121407897e-06, "loss": 0.7859242, "num_input_tokens_seen": 67611110, "step": 3131, "time_per_iteration": 2.759230136871338 }, { "auxiliary_loss_clip": 0.0112329, "auxiliary_loss_mlp": 0.01049221, "balance_loss_clip": 1.05344164, "balance_loss_mlp": 1.03014708, "epoch": 0.18830602735607996, "flos": 29420319386880.0, "grad_norm": 2.109252219381847, "language_loss": 0.80713749, "learning_rate": 3.74282069289017e-06, "loss": 0.82886261, "num_input_tokens_seen": 67631990, "step": 3132, "time_per_iteration": 2.773817777633667 }, { "auxiliary_loss_clip": 0.01093588, "auxiliary_loss_mlp": 0.00779094, "balance_loss_clip": 1.04652429, "balance_loss_mlp": 1.00091529, "epoch": 0.18836615060874792, "flos": 28872933050880.0, "grad_norm": 2.092242478448591, "language_loss": 0.79653811, "learning_rate": 3.742629607551614e-06, "loss": 0.81526494, "num_input_tokens_seen": 67650490, "step": 3133, "time_per_iteration": 2.7873754501342773 }, { "auxiliary_loss_clip": 0.01119878, "auxiliary_loss_mlp": 0.01059381, "balance_loss_clip": 1.05341148, "balance_loss_mlp": 1.03921056, "epoch": 0.18842627386141592, "flos": 22601673717120.0, "grad_norm": 1.9069857551930867, "language_loss": 0.83001804, "learning_rate": 3.7424384561323698e-06, "loss": 0.85181063, "num_input_tokens_seen": 67668860, "step": 3134, "time_per_iteration": 2.9284298419952393 }, { "auxiliary_loss_clip": 0.01131578, "auxiliary_loss_mlp": 0.01046681, "balance_loss_clip": 1.05168402, "balance_loss_mlp": 1.02802503, "epoch": 0.18848639711408388, "flos": 24573600512640.0, "grad_norm": 2.0376543711114152, "language_loss": 0.82859468, "learning_rate": 3.742247238639684e-06, "loss": 0.85037726, "num_input_tokens_seen": 67690220, "step": 3135, "time_per_iteration": 2.8006811141967773 }, { "auxiliary_loss_clip": 0.01143148, "auxiliary_loss_mlp": 0.01050197, "balance_loss_clip": 1.05505157, "balance_loss_mlp": 1.03146911, "epoch": 0.18854652036675185, "flos": 34166920078080.0, "grad_norm": 1.9728388324049713, "language_loss": 0.78658557, "learning_rate": 3.7420559550808083e-06, "loss": 0.80851901, "num_input_tokens_seen": 67709820, "step": 3136, "time_per_iteration": 4.256143569946289 }, { "auxiliary_loss_clip": 0.01135545, "auxiliary_loss_mlp": 0.01048618, "balance_loss_clip": 1.05388892, "balance_loss_mlp": 1.03006911, "epoch": 0.1886066436194198, "flos": 24200236592640.0, "grad_norm": 1.7483697887361769, "language_loss": 0.80820233, "learning_rate": 3.741864605462996e-06, "loss": 0.83004391, "num_input_tokens_seen": 67729490, "step": 3137, "time_per_iteration": 2.7538130283355713 }, { "auxiliary_loss_clip": 0.01159054, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.05827475, "balance_loss_mlp": 1.03107548, "epoch": 0.18866676687208778, "flos": 21251109317760.0, "grad_norm": 1.9799764624272802, "language_loss": 0.81274408, "learning_rate": 3.741673189793504e-06, "loss": 0.83481836, "num_input_tokens_seen": 67749665, "step": 3138, "time_per_iteration": 4.143909931182861 }, { "auxiliary_loss_clip": 0.01150082, "auxiliary_loss_mlp": 0.01056444, "balance_loss_clip": 1.05626798, "balance_loss_mlp": 1.03713167, "epoch": 0.18872689012475574, "flos": 37308673013760.0, "grad_norm": 2.326218248348143, "language_loss": 0.63655496, "learning_rate": 3.7414817080795896e-06, "loss": 0.65862024, "num_input_tokens_seen": 67776230, "step": 3139, "time_per_iteration": 4.30991268157959 }, { "auxiliary_loss_clip": 0.0115289, "auxiliary_loss_mlp": 0.01043021, "balance_loss_clip": 1.05286491, "balance_loss_mlp": 1.02356625, "epoch": 0.1887870133774237, "flos": 21652303299840.0, "grad_norm": 2.1185902638296525, "language_loss": 0.7148211, "learning_rate": 3.741290160328514e-06, "loss": 0.73678017, "num_input_tokens_seen": 67795080, "step": 3140, "time_per_iteration": 2.6880578994750977 }, { "auxiliary_loss_clip": 0.01154738, "auxiliary_loss_mlp": 0.01043099, "balance_loss_clip": 1.05349982, "balance_loss_mlp": 1.02382278, "epoch": 0.1888471366300917, "flos": 15924659374080.0, "grad_norm": 2.6250212982316574, "language_loss": 0.87069929, "learning_rate": 3.7410985465475412e-06, "loss": 0.89267766, "num_input_tokens_seen": 67813110, "step": 3141, "time_per_iteration": 2.6677181720733643 }, { "auxiliary_loss_clip": 0.01130655, "auxiliary_loss_mlp": 0.01052882, "balance_loss_clip": 1.0507834, "balance_loss_mlp": 1.03243756, "epoch": 0.18890725988275966, "flos": 18551955767040.0, "grad_norm": 1.873404502116747, "language_loss": 0.7744689, "learning_rate": 3.7409068667439378e-06, "loss": 0.79630429, "num_input_tokens_seen": 67831070, "step": 3142, "time_per_iteration": 2.63077449798584 }, { "auxiliary_loss_clip": 0.01128192, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.05298221, "balance_loss_mlp": 1.02132463, "epoch": 0.18896738313542763, "flos": 28840865184000.0, "grad_norm": 1.6611052928231447, "language_loss": 0.78867507, "learning_rate": 3.740715120924971e-06, "loss": 0.81033778, "num_input_tokens_seen": 67852170, "step": 3143, "time_per_iteration": 4.417406797409058 }, { "auxiliary_loss_clip": 0.0111986, "auxiliary_loss_mlp": 0.01048019, "balance_loss_clip": 1.05024099, "balance_loss_mlp": 1.02821851, "epoch": 0.1890275063880956, "flos": 22412747157120.0, "grad_norm": 2.855732191409361, "language_loss": 0.71476078, "learning_rate": 3.740523309097912e-06, "loss": 0.73643959, "num_input_tokens_seen": 67869945, "step": 3144, "time_per_iteration": 2.8104894161224365 }, { "auxiliary_loss_clip": 0.01125398, "auxiliary_loss_mlp": 0.01044816, "balance_loss_clip": 1.05102479, "balance_loss_mlp": 1.02492023, "epoch": 0.18908762964076356, "flos": 24243904552320.0, "grad_norm": 2.5973078221757144, "language_loss": 0.73390597, "learning_rate": 3.7403314312700356e-06, "loss": 0.75560808, "num_input_tokens_seen": 67890240, "step": 3145, "time_per_iteration": 2.715609312057495 }, { "auxiliary_loss_clip": 0.01110308, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.04543984, "balance_loss_mlp": 1.02446938, "epoch": 0.18914775289343153, "flos": 16982910892800.0, "grad_norm": 2.915733862437625, "language_loss": 0.76263785, "learning_rate": 3.740139487448616e-06, "loss": 0.78416634, "num_input_tokens_seen": 67907825, "step": 3146, "time_per_iteration": 2.777221202850342 }, { "auxiliary_loss_clip": 0.01092807, "auxiliary_loss_mlp": 0.01049336, "balance_loss_clip": 1.04319823, "balance_loss_mlp": 1.02829611, "epoch": 0.1892078761460995, "flos": 21543781334400.0, "grad_norm": 1.988128972125699, "language_loss": 0.7837925, "learning_rate": 3.7399474776409326e-06, "loss": 0.80521393, "num_input_tokens_seen": 67926670, "step": 3147, "time_per_iteration": 2.8039205074310303 }, { "auxiliary_loss_clip": 0.01143577, "auxiliary_loss_mlp": 0.01042953, "balance_loss_clip": 1.0548687, "balance_loss_mlp": 1.02454758, "epoch": 0.18926799939876748, "flos": 23001538896000.0, "grad_norm": 3.932544798883504, "language_loss": 0.67477876, "learning_rate": 3.739755401854267e-06, "loss": 0.69664401, "num_input_tokens_seen": 67943645, "step": 3148, "time_per_iteration": 2.7273359298706055 }, { "auxiliary_loss_clip": 0.01112331, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04617155, "balance_loss_mlp": 1.02014899, "epoch": 0.18932812265143545, "flos": 22273019251200.0, "grad_norm": 2.9848849244070315, "language_loss": 0.76207471, "learning_rate": 3.739563260095902e-06, "loss": 0.78358936, "num_input_tokens_seen": 67962345, "step": 3149, "time_per_iteration": 2.8031978607177734 }, { "auxiliary_loss_clip": 0.01130375, "auxiliary_loss_mlp": 0.01045773, "balance_loss_clip": 1.05438852, "balance_loss_mlp": 1.02797484, "epoch": 0.1893882459041034, "flos": 18624423456000.0, "grad_norm": 2.3661599820320136, "language_loss": 0.80378366, "learning_rate": 3.7393710523731245e-06, "loss": 0.82554519, "num_input_tokens_seen": 67979760, "step": 3150, "time_per_iteration": 2.7836129665374756 }, { "auxiliary_loss_clip": 0.01137112, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.0528239, "balance_loss_mlp": 1.03019929, "epoch": 0.18944836915677138, "flos": 22892981016960.0, "grad_norm": 2.0711129864945956, "language_loss": 0.85251844, "learning_rate": 3.7391787786932215e-06, "loss": 0.87437713, "num_input_tokens_seen": 67996895, "step": 3151, "time_per_iteration": 2.7782201766967773 }, { "auxiliary_loss_clip": 0.01121267, "auxiliary_loss_mlp": 0.01046776, "balance_loss_clip": 1.05223882, "balance_loss_mlp": 1.02839363, "epoch": 0.18950849240943934, "flos": 26796542526720.0, "grad_norm": 2.1337439707996673, "language_loss": 0.74114192, "learning_rate": 3.7389864390634857e-06, "loss": 0.76282233, "num_input_tokens_seen": 68018365, "step": 3152, "time_per_iteration": 2.8767755031585693 }, { "auxiliary_loss_clip": 0.01120312, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.05119991, "balance_loss_mlp": 1.02463925, "epoch": 0.1895686156621073, "flos": 24971239048320.0, "grad_norm": 1.9471461777193173, "language_loss": 0.75520492, "learning_rate": 3.738794033491209e-06, "loss": 0.77685189, "num_input_tokens_seen": 68037985, "step": 3153, "time_per_iteration": 2.7722980976104736 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01049678, "balance_loss_clip": 1.0559293, "balance_loss_mlp": 1.03102183, "epoch": 0.1896287389147753, "flos": 21944544353280.0, "grad_norm": 2.099749434473157, "language_loss": 0.79984629, "learning_rate": 3.7386015619836887e-06, "loss": 0.82192594, "num_input_tokens_seen": 68057975, "step": 3154, "time_per_iteration": 2.6530587673187256 }, { "auxiliary_loss_clip": 0.01117992, "auxiliary_loss_mlp": 0.01056707, "balance_loss_clip": 1.04851115, "balance_loss_mlp": 1.03536844, "epoch": 0.18968886216744327, "flos": 18179058723840.0, "grad_norm": 3.210440214164498, "language_loss": 0.73046303, "learning_rate": 3.738409024548223e-06, "loss": 0.75220996, "num_input_tokens_seen": 68074175, "step": 3155, "time_per_iteration": 2.729832410812378 }, { "auxiliary_loss_clip": 0.01126019, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.05104291, "balance_loss_mlp": 1.02626419, "epoch": 0.18974898542011123, "flos": 20412487509120.0, "grad_norm": 1.8299076145086866, "language_loss": 0.73869717, "learning_rate": 3.7382164211921136e-06, "loss": 0.76041389, "num_input_tokens_seen": 68095230, "step": 3156, "time_per_iteration": 2.6747231483459473 }, { "auxiliary_loss_clip": 0.01156549, "auxiliary_loss_mlp": 0.0104418, "balance_loss_clip": 1.05489409, "balance_loss_mlp": 1.02645326, "epoch": 0.1898091086727792, "flos": 23985024255360.0, "grad_norm": 1.9629652277148564, "language_loss": 0.68053937, "learning_rate": 3.7380237519226623e-06, "loss": 0.70254672, "num_input_tokens_seen": 68113805, "step": 3157, "time_per_iteration": 2.7092478275299072 }, { "auxiliary_loss_clip": 0.01114914, "auxiliary_loss_mlp": 0.01044181, "balance_loss_clip": 1.04805827, "balance_loss_mlp": 1.02533436, "epoch": 0.18986923192544716, "flos": 27637067756160.0, "grad_norm": 1.7829025355963362, "language_loss": 0.79893303, "learning_rate": 3.737831016747176e-06, "loss": 0.82052404, "num_input_tokens_seen": 68133190, "step": 3158, "time_per_iteration": 2.7921364307403564 }, { "auxiliary_loss_clip": 0.01163231, "auxiliary_loss_mlp": 0.01049502, "balance_loss_clip": 1.05787683, "balance_loss_mlp": 1.02923679, "epoch": 0.18992935517811513, "flos": 25484151306240.0, "grad_norm": 1.856283461980025, "language_loss": 0.72348613, "learning_rate": 3.737638215672964e-06, "loss": 0.74561346, "num_input_tokens_seen": 68152330, "step": 3159, "time_per_iteration": 2.6111273765563965 }, { "auxiliary_loss_clip": 0.01149613, "auxiliary_loss_mlp": 0.01053808, "balance_loss_clip": 1.05840325, "balance_loss_mlp": 1.03386414, "epoch": 0.1899894784307831, "flos": 17420805596160.0, "grad_norm": 2.2573250756933647, "language_loss": 0.84977192, "learning_rate": 3.7374453487073366e-06, "loss": 0.87180614, "num_input_tokens_seen": 68170185, "step": 3160, "time_per_iteration": 2.659259796142578 }, { "auxiliary_loss_clip": 0.01129342, "auxiliary_loss_mlp": 0.01049909, "balance_loss_clip": 1.05297387, "balance_loss_mlp": 1.03289795, "epoch": 0.19004960168345109, "flos": 27492240119040.0, "grad_norm": 2.752358611011079, "language_loss": 0.73407793, "learning_rate": 3.7372524158576074e-06, "loss": 0.7558704, "num_input_tokens_seen": 68191665, "step": 3161, "time_per_iteration": 2.784040689468384 }, { "auxiliary_loss_clip": 0.01139858, "auxiliary_loss_mlp": 0.0105519, "balance_loss_clip": 1.05456805, "balance_loss_mlp": 1.03476942, "epoch": 0.19010972493611905, "flos": 38654676385920.0, "grad_norm": 1.6629026055958476, "language_loss": 0.8115741, "learning_rate": 3.7370594171310926e-06, "loss": 0.83352458, "num_input_tokens_seen": 68214635, "step": 3162, "time_per_iteration": 2.9375386238098145 }, { "auxiliary_loss_clip": 0.01157449, "auxiliary_loss_mlp": 0.01040035, "balance_loss_clip": 1.05625844, "balance_loss_mlp": 1.02062798, "epoch": 0.19016984818878702, "flos": 19244744357760.0, "grad_norm": 2.448016750033594, "language_loss": 0.75615001, "learning_rate": 3.73686635253511e-06, "loss": 0.77812481, "num_input_tokens_seen": 68232150, "step": 3163, "time_per_iteration": 2.7344541549682617 }, { "auxiliary_loss_clip": 0.0110099, "auxiliary_loss_mlp": 0.01050093, "balance_loss_clip": 1.050578, "balance_loss_mlp": 1.02880192, "epoch": 0.19022997144145498, "flos": 37596891744000.0, "grad_norm": 2.2644227245470514, "language_loss": 0.74093997, "learning_rate": 3.736673222076982e-06, "loss": 0.76245081, "num_input_tokens_seen": 68253370, "step": 3164, "time_per_iteration": 2.9165730476379395 }, { "auxiliary_loss_clip": 0.01141317, "auxiliary_loss_mlp": 0.01038043, "balance_loss_clip": 1.05518687, "balance_loss_mlp": 1.0195303, "epoch": 0.19029009469412295, "flos": 61530921665280.0, "grad_norm": 1.5484522746055986, "language_loss": 0.66844344, "learning_rate": 3.7364800257640313e-06, "loss": 0.69023699, "num_input_tokens_seen": 68278895, "step": 3165, "time_per_iteration": 3.006096124649048 }, { "auxiliary_loss_clip": 0.01146225, "auxiliary_loss_mlp": 0.0104856, "balance_loss_clip": 1.05512285, "balance_loss_mlp": 1.02848506, "epoch": 0.1903502179467909, "flos": 13954851480960.0, "grad_norm": 2.8598536292657144, "language_loss": 0.74239767, "learning_rate": 3.7362867636035835e-06, "loss": 0.76434553, "num_input_tokens_seen": 68294880, "step": 3166, "time_per_iteration": 2.678844928741455 }, { "auxiliary_loss_clip": 0.01050093, "auxiliary_loss_mlp": 0.01014959, "balance_loss_clip": 1.04342103, "balance_loss_mlp": 1.01201403, "epoch": 0.1904103411994589, "flos": 66899641916160.0, "grad_norm": 0.7754190343967906, "language_loss": 0.50311053, "learning_rate": 3.736093435602968e-06, "loss": 0.52376103, "num_input_tokens_seen": 68359665, "step": 3167, "time_per_iteration": 3.277529239654541 }, { "auxiliary_loss_clip": 0.01138483, "auxiliary_loss_mlp": 0.01051348, "balance_loss_clip": 1.05485487, "balance_loss_mlp": 1.03293037, "epoch": 0.19047046445212687, "flos": 21908741472000.0, "grad_norm": 2.3487387451986192, "language_loss": 0.74504036, "learning_rate": 3.7359000417695156e-06, "loss": 0.76693863, "num_input_tokens_seen": 68378950, "step": 3168, "time_per_iteration": 2.690995216369629 }, { "auxiliary_loss_clip": 0.01040165, "auxiliary_loss_mlp": 0.01023518, "balance_loss_clip": 1.03869283, "balance_loss_mlp": 1.02085996, "epoch": 0.19053058770479483, "flos": 59255156701440.0, "grad_norm": 0.8605055473788603, "language_loss": 0.60079956, "learning_rate": 3.73570658211056e-06, "loss": 0.62143636, "num_input_tokens_seen": 68434235, "step": 3169, "time_per_iteration": 3.2108101844787598 }, { "auxiliary_loss_clip": 0.01103792, "auxiliary_loss_mlp": 0.01056606, "balance_loss_clip": 1.05267787, "balance_loss_mlp": 1.03741288, "epoch": 0.1905907109574628, "flos": 23951304362880.0, "grad_norm": 1.5575975614891868, "language_loss": 0.78179795, "learning_rate": 3.735513056633436e-06, "loss": 0.80340189, "num_input_tokens_seen": 68453830, "step": 3170, "time_per_iteration": 2.832043409347534 }, { "auxiliary_loss_clip": 0.01142047, "auxiliary_loss_mlp": 0.01045041, "balance_loss_clip": 1.05325115, "balance_loss_mlp": 1.02605128, "epoch": 0.19065083421013077, "flos": 20812316774400.0, "grad_norm": 1.7671932984988854, "language_loss": 0.78177166, "learning_rate": 3.7353194653454834e-06, "loss": 0.80364257, "num_input_tokens_seen": 68473005, "step": 3171, "time_per_iteration": 2.7823612689971924 }, { "auxiliary_loss_clip": 0.01158227, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.05499291, "balance_loss_mlp": 1.0285697, "epoch": 0.19071095746279873, "flos": 31284981192960.0, "grad_norm": 2.1976685633770905, "language_loss": 0.77953529, "learning_rate": 3.7351258082540426e-06, "loss": 0.80159104, "num_input_tokens_seen": 68493470, "step": 3172, "time_per_iteration": 2.746279001235962 }, { "auxiliary_loss_clip": 0.01145112, "auxiliary_loss_mlp": 0.01055334, "balance_loss_clip": 1.05438328, "balance_loss_mlp": 1.03703523, "epoch": 0.1907710807154667, "flos": 14356117290240.0, "grad_norm": 1.5258786569967644, "language_loss": 0.80223799, "learning_rate": 3.7349320853664576e-06, "loss": 0.82424247, "num_input_tokens_seen": 68511290, "step": 3173, "time_per_iteration": 2.7396810054779053 }, { "auxiliary_loss_clip": 0.01113266, "auxiliary_loss_mlp": 0.00778142, "balance_loss_clip": 1.04967713, "balance_loss_mlp": 1.00094676, "epoch": 0.1908312039681347, "flos": 26907039740160.0, "grad_norm": 1.5341307852526682, "language_loss": 0.78495061, "learning_rate": 3.7347382966900735e-06, "loss": 0.80386466, "num_input_tokens_seen": 68532575, "step": 3174, "time_per_iteration": 2.8579304218292236 }, { "auxiliary_loss_clip": 0.01106714, "auxiliary_loss_mlp": 0.01047557, "balance_loss_clip": 1.04928994, "balance_loss_mlp": 1.02838778, "epoch": 0.19089132722080265, "flos": 14494695960960.0, "grad_norm": 1.8075853216546063, "language_loss": 0.81067109, "learning_rate": 3.7345444422322395e-06, "loss": 0.83221382, "num_input_tokens_seen": 68548760, "step": 3175, "time_per_iteration": 2.718254804611206 }, { "auxiliary_loss_clip": 0.01080497, "auxiliary_loss_mlp": 0.01053652, "balance_loss_clip": 1.04361629, "balance_loss_mlp": 1.0342685, "epoch": 0.19095145047347062, "flos": 13952876232960.0, "grad_norm": 2.2545261224105873, "language_loss": 0.85529047, "learning_rate": 3.7343505220003067e-06, "loss": 0.87663192, "num_input_tokens_seen": 68563100, "step": 3176, "time_per_iteration": 4.2962729930877686 }, { "auxiliary_loss_clip": 0.0113361, "auxiliary_loss_mlp": 0.01059849, "balance_loss_clip": 1.05418086, "balance_loss_mlp": 1.03928506, "epoch": 0.19101157372613858, "flos": 25301832848640.0, "grad_norm": 2.0896270593066832, "language_loss": 0.813025, "learning_rate": 3.7341565360016285e-06, "loss": 0.83495957, "num_input_tokens_seen": 68581650, "step": 3177, "time_per_iteration": 2.815127372741699 }, { "auxiliary_loss_clip": 0.01122377, "auxiliary_loss_mlp": 0.01044946, "balance_loss_clip": 1.0482533, "balance_loss_mlp": 1.0265398, "epoch": 0.19107169697880655, "flos": 20558212986240.0, "grad_norm": 2.67963335978105, "language_loss": 0.7530241, "learning_rate": 3.73396248424356e-06, "loss": 0.7746973, "num_input_tokens_seen": 68600360, "step": 3178, "time_per_iteration": 4.351228475570679 }, { "auxiliary_loss_clip": 0.01146729, "auxiliary_loss_mlp": 0.01042476, "balance_loss_clip": 1.05574143, "balance_loss_mlp": 1.02458286, "epoch": 0.19113182023147451, "flos": 22163204396160.0, "grad_norm": 4.753014277211421, "language_loss": 0.81381619, "learning_rate": 3.7337683667334606e-06, "loss": 0.83570826, "num_input_tokens_seen": 68617885, "step": 3179, "time_per_iteration": 4.259284019470215 }, { "auxiliary_loss_clip": 0.01147837, "auxiliary_loss_mlp": 0.01048144, "balance_loss_clip": 1.05645823, "balance_loss_mlp": 1.0291661, "epoch": 0.19119194348414248, "flos": 18581796990720.0, "grad_norm": 2.753081884541086, "language_loss": 0.79384613, "learning_rate": 3.733574183478691e-06, "loss": 0.81580591, "num_input_tokens_seen": 68634550, "step": 3180, "time_per_iteration": 2.6609203815460205 }, { "auxiliary_loss_clip": 0.01129361, "auxiliary_loss_mlp": 0.0105402, "balance_loss_clip": 1.05249727, "balance_loss_mlp": 1.03445804, "epoch": 0.19125206673681047, "flos": 19026623018880.0, "grad_norm": 2.660238694189741, "language_loss": 0.79517245, "learning_rate": 3.733379934486615e-06, "loss": 0.81700623, "num_input_tokens_seen": 68651895, "step": 3181, "time_per_iteration": 2.6877176761627197 }, { "auxiliary_loss_clip": 0.0114301, "auxiliary_loss_mlp": 0.01053621, "balance_loss_clip": 1.05339336, "balance_loss_mlp": 1.03527462, "epoch": 0.19131218998947844, "flos": 21690153256320.0, "grad_norm": 2.2179888965480243, "language_loss": 0.74570775, "learning_rate": 3.7331856197645973e-06, "loss": 0.76767409, "num_input_tokens_seen": 68671500, "step": 3182, "time_per_iteration": 4.2829508781433105 }, { "auxiliary_loss_clip": 0.01128679, "auxiliary_loss_mlp": 0.01044063, "balance_loss_clip": 1.05578041, "balance_loss_mlp": 1.02575254, "epoch": 0.1913723132421464, "flos": 18442500048000.0, "grad_norm": 1.7534728284311585, "language_loss": 0.64618582, "learning_rate": 3.7329912393200084e-06, "loss": 0.66791326, "num_input_tokens_seen": 68690570, "step": 3183, "time_per_iteration": 2.7652854919433594 }, { "auxiliary_loss_clip": 0.01132257, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.0512805, "balance_loss_mlp": 1.0311259, "epoch": 0.19143243649481437, "flos": 27160102033920.0, "grad_norm": 1.555926798692704, "language_loss": 0.73459226, "learning_rate": 3.7327967931602173e-06, "loss": 0.75642347, "num_input_tokens_seen": 68709735, "step": 3184, "time_per_iteration": 2.6929056644439697 }, { "auxiliary_loss_clip": 0.01122578, "auxiliary_loss_mlp": 0.01054123, "balance_loss_clip": 1.05015373, "balance_loss_mlp": 1.03347623, "epoch": 0.19149255974748233, "flos": 21718952985600.0, "grad_norm": 2.0989643169058514, "language_loss": 0.87983418, "learning_rate": 3.732602281292598e-06, "loss": 0.9016012, "num_input_tokens_seen": 68727565, "step": 3185, "time_per_iteration": 2.6859230995178223 }, { "auxiliary_loss_clip": 0.01153787, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.05334914, "balance_loss_mlp": 1.02505302, "epoch": 0.1915526830001503, "flos": 22963293889920.0, "grad_norm": 2.4520480945942587, "language_loss": 0.73240852, "learning_rate": 3.7324077037245267e-06, "loss": 0.75439072, "num_input_tokens_seen": 68748110, "step": 3186, "time_per_iteration": 2.6398978233337402 }, { "auxiliary_loss_clip": 0.01132874, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.05609488, "balance_loss_mlp": 1.02379346, "epoch": 0.1916128062528183, "flos": 26140741966080.0, "grad_norm": 2.739457234253781, "language_loss": 0.83550584, "learning_rate": 3.7322130604633825e-06, "loss": 0.85728443, "num_input_tokens_seen": 68769765, "step": 3187, "time_per_iteration": 2.7476372718811035 }, { "auxiliary_loss_clip": 0.01076264, "auxiliary_loss_mlp": 0.01021317, "balance_loss_clip": 1.04604995, "balance_loss_mlp": 1.01892138, "epoch": 0.19167292950548626, "flos": 54925767457920.0, "grad_norm": 0.8659386797819415, "language_loss": 0.55824959, "learning_rate": 3.732018351516544e-06, "loss": 0.57922542, "num_input_tokens_seen": 68826815, "step": 3188, "time_per_iteration": 3.2144031524658203 }, { "auxiliary_loss_clip": 0.01139007, "auxiliary_loss_mlp": 0.01054399, "balance_loss_clip": 1.054564, "balance_loss_mlp": 1.03537333, "epoch": 0.19173305275815422, "flos": 29935601942400.0, "grad_norm": 2.2897904709915573, "language_loss": 0.69839454, "learning_rate": 3.731823576891397e-06, "loss": 0.72032857, "num_input_tokens_seen": 68847585, "step": 3189, "time_per_iteration": 2.7998950481414795 }, { "auxiliary_loss_clip": 0.01118438, "auxiliary_loss_mlp": 0.01038566, "balance_loss_clip": 1.04930174, "balance_loss_mlp": 1.02116132, "epoch": 0.1917931760108222, "flos": 24752471264640.0, "grad_norm": 2.362312815249866, "language_loss": 0.74320328, "learning_rate": 3.7316287365953266e-06, "loss": 0.76477331, "num_input_tokens_seen": 68866620, "step": 3190, "time_per_iteration": 2.7386670112609863 }, { "auxiliary_loss_clip": 0.01111071, "auxiliary_loss_mlp": 0.0106718, "balance_loss_clip": 1.04946983, "balance_loss_mlp": 1.04702199, "epoch": 0.19185329926349015, "flos": 18843550375680.0, "grad_norm": 3.545467698458187, "language_loss": 0.8444041, "learning_rate": 3.73143383063572e-06, "loss": 0.8661865, "num_input_tokens_seen": 68885515, "step": 3191, "time_per_iteration": 2.7025794982910156 }, { "auxiliary_loss_clip": 0.01127894, "auxiliary_loss_mlp": 0.01039849, "balance_loss_clip": 1.05251908, "balance_loss_mlp": 1.02231336, "epoch": 0.19191342251615812, "flos": 22086858038400.0, "grad_norm": 2.0663841109071526, "language_loss": 0.89985192, "learning_rate": 3.73123885901997e-06, "loss": 0.92152941, "num_input_tokens_seen": 68903225, "step": 3192, "time_per_iteration": 2.802852153778076 }, { "auxiliary_loss_clip": 0.01130336, "auxiliary_loss_mlp": 0.01054766, "balance_loss_clip": 1.05716372, "balance_loss_mlp": 1.03509688, "epoch": 0.19197354576882608, "flos": 22199115018240.0, "grad_norm": 2.3467564445058775, "language_loss": 0.75159264, "learning_rate": 3.7310438217554687e-06, "loss": 0.77344358, "num_input_tokens_seen": 68922860, "step": 3193, "time_per_iteration": 2.7680914402008057 }, { "auxiliary_loss_clip": 0.01128303, "auxiliary_loss_mlp": 0.00777332, "balance_loss_clip": 1.05222785, "balance_loss_mlp": 1.00071752, "epoch": 0.19203366902149407, "flos": 24896185580160.0, "grad_norm": 2.078743387775855, "language_loss": 0.75189757, "learning_rate": 3.730848718849612e-06, "loss": 0.77095383, "num_input_tokens_seen": 68943000, "step": 3194, "time_per_iteration": 2.7537553310394287 }, { "auxiliary_loss_clip": 0.01068142, "auxiliary_loss_mlp": 0.01004387, "balance_loss_clip": 1.03910232, "balance_loss_mlp": 1.00182378, "epoch": 0.19209379227416204, "flos": 68416722789120.0, "grad_norm": 0.7955224937316553, "language_loss": 0.68507159, "learning_rate": 3.7306535503097985e-06, "loss": 0.70579696, "num_input_tokens_seen": 69000255, "step": 3195, "time_per_iteration": 3.117191791534424 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.01052392, "balance_loss_clip": 1.05205238, "balance_loss_mlp": 1.0320189, "epoch": 0.19215391552683, "flos": 22055185221120.0, "grad_norm": 2.6559439291645757, "language_loss": 0.73141015, "learning_rate": 3.730458316143429e-06, "loss": 0.75309479, "num_input_tokens_seen": 69019665, "step": 3196, "time_per_iteration": 2.7234303951263428 }, { "auxiliary_loss_clip": 0.01139018, "auxiliary_loss_mlp": 0.01044947, "balance_loss_clip": 1.06151462, "balance_loss_mlp": 1.02596927, "epoch": 0.19221403877949797, "flos": 20302959962880.0, "grad_norm": 3.0997718824135734, "language_loss": 0.83654135, "learning_rate": 3.7302630163579068e-06, "loss": 0.85838103, "num_input_tokens_seen": 69039055, "step": 3197, "time_per_iteration": 2.72575306892395 }, { "auxiliary_loss_clip": 0.01086216, "auxiliary_loss_mlp": 0.01055059, "balance_loss_clip": 1.04615641, "balance_loss_mlp": 1.03320754, "epoch": 0.19227416203216594, "flos": 23185329811200.0, "grad_norm": 2.2465298420006383, "language_loss": 0.80656433, "learning_rate": 3.7300676509606373e-06, "loss": 0.82797706, "num_input_tokens_seen": 69056370, "step": 3198, "time_per_iteration": 2.741678237915039 }, { "auxiliary_loss_clip": 0.01135487, "auxiliary_loss_mlp": 0.01056572, "balance_loss_clip": 1.05502987, "balance_loss_mlp": 1.03655636, "epoch": 0.1923342852848339, "flos": 25776607841280.0, "grad_norm": 1.9205907836873994, "language_loss": 0.78993976, "learning_rate": 3.729872219959029e-06, "loss": 0.81186032, "num_input_tokens_seen": 69075915, "step": 3199, "time_per_iteration": 2.7821297645568848 }, { "auxiliary_loss_clip": 0.01116808, "auxiliary_loss_mlp": 0.01056964, "balance_loss_clip": 1.05010581, "balance_loss_mlp": 1.036412, "epoch": 0.19239440853750187, "flos": 17128349061120.0, "grad_norm": 3.662083840248298, "language_loss": 0.83574522, "learning_rate": 3.7296767233604934e-06, "loss": 0.85748297, "num_input_tokens_seen": 69094145, "step": 3200, "time_per_iteration": 2.7095022201538086 }, { "auxiliary_loss_clip": 0.01159025, "auxiliary_loss_mlp": 0.01048823, "balance_loss_clip": 1.05997193, "balance_loss_mlp": 1.03060746, "epoch": 0.19245453179016986, "flos": 16435093593600.0, "grad_norm": 1.9278966392289572, "language_loss": 0.79092836, "learning_rate": 3.729481161172443e-06, "loss": 0.81300688, "num_input_tokens_seen": 69111110, "step": 3201, "time_per_iteration": 2.684979200363159 }, { "auxiliary_loss_clip": 0.01103349, "auxiliary_loss_mlp": 0.01053366, "balance_loss_clip": 1.04825675, "balance_loss_mlp": 1.03418541, "epoch": 0.19251465504283782, "flos": 20230276792320.0, "grad_norm": 2.4062417134527645, "language_loss": 0.69276404, "learning_rate": 3.7292855334022927e-06, "loss": 0.71433127, "num_input_tokens_seen": 69130280, "step": 3202, "time_per_iteration": 2.8284943103790283 }, { "auxiliary_loss_clip": 0.01132334, "auxiliary_loss_mlp": 0.01041011, "balance_loss_clip": 1.05389905, "balance_loss_mlp": 1.02256894, "epoch": 0.1925747782955058, "flos": 19464374067840.0, "grad_norm": 1.9491265782204168, "language_loss": 0.91396749, "learning_rate": 3.7290898400574627e-06, "loss": 0.93570089, "num_input_tokens_seen": 69149570, "step": 3203, "time_per_iteration": 2.802433729171753 }, { "auxiliary_loss_clip": 0.0114953, "auxiliary_loss_mlp": 0.01049732, "balance_loss_clip": 1.05674863, "balance_loss_mlp": 1.02959776, "epoch": 0.19263490154817375, "flos": 17785586165760.0, "grad_norm": 5.05881669068558, "language_loss": 0.81689429, "learning_rate": 3.7288940811453725e-06, "loss": 0.83888692, "num_input_tokens_seen": 69168190, "step": 3204, "time_per_iteration": 2.671285629272461 }, { "auxiliary_loss_clip": 0.01116988, "auxiliary_loss_mlp": 0.01048941, "balance_loss_clip": 1.04950142, "balance_loss_mlp": 1.0298202, "epoch": 0.19269502480084172, "flos": 17457075354240.0, "grad_norm": 2.296941025186916, "language_loss": 0.76167846, "learning_rate": 3.7286982566734454e-06, "loss": 0.78333771, "num_input_tokens_seen": 69186950, "step": 3205, "time_per_iteration": 2.8654470443725586 }, { "auxiliary_loss_clip": 0.01140852, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.05839586, "balance_loss_mlp": 1.02749407, "epoch": 0.19275514805350968, "flos": 21506901045120.0, "grad_norm": 3.761768843322395, "language_loss": 0.83394569, "learning_rate": 3.728502366649107e-06, "loss": 0.85582072, "num_input_tokens_seen": 69204850, "step": 3206, "time_per_iteration": 2.8610613346099854 }, { "auxiliary_loss_clip": 0.0105715, "auxiliary_loss_mlp": 0.01004055, "balance_loss_clip": 1.03779244, "balance_loss_mlp": 1.00174224, "epoch": 0.19281527130617768, "flos": 47695979738880.0, "grad_norm": 0.8644529519848262, "language_loss": 0.60561717, "learning_rate": 3.728306411079786e-06, "loss": 0.62622917, "num_input_tokens_seen": 69259200, "step": 3207, "time_per_iteration": 3.126537322998047 }, { "auxiliary_loss_clip": 0.01120285, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.05201781, "balance_loss_mlp": 1.02678764, "epoch": 0.19287539455884564, "flos": 11801252672640.0, "grad_norm": 2.296187182186814, "language_loss": 0.75463599, "learning_rate": 3.7281103899729125e-06, "loss": 0.77629405, "num_input_tokens_seen": 69275835, "step": 3208, "time_per_iteration": 2.6978750228881836 }, { "auxiliary_loss_clip": 0.01150534, "auxiliary_loss_mlp": 0.00777875, "balance_loss_clip": 1.05520236, "balance_loss_mlp": 1.00063884, "epoch": 0.1929355178115136, "flos": 20631434860800.0, "grad_norm": 1.9483983315924505, "language_loss": 0.60869855, "learning_rate": 3.7279143033359195e-06, "loss": 0.62798262, "num_input_tokens_seen": 69294810, "step": 3209, "time_per_iteration": 2.699798107147217 }, { "auxiliary_loss_clip": 0.01158758, "auxiliary_loss_mlp": 0.01053815, "balance_loss_clip": 1.05472994, "balance_loss_mlp": 1.03261995, "epoch": 0.19299564106418157, "flos": 40807916058240.0, "grad_norm": 1.9992177661428934, "language_loss": 0.80025005, "learning_rate": 3.727718151176243e-06, "loss": 0.82237577, "num_input_tokens_seen": 69316065, "step": 3210, "time_per_iteration": 2.832665205001831 }, { "auxiliary_loss_clip": 0.01118997, "auxiliary_loss_mlp": 0.01047494, "balance_loss_clip": 1.05044246, "balance_loss_mlp": 1.02920699, "epoch": 0.19305576431684954, "flos": 11361418634880.0, "grad_norm": 2.515510367397107, "language_loss": 0.82571948, "learning_rate": 3.7275219335013217e-06, "loss": 0.84738445, "num_input_tokens_seen": 69332900, "step": 3211, "time_per_iteration": 2.7664191722869873 }, { "auxiliary_loss_clip": 0.01073663, "auxiliary_loss_mlp": 0.01002544, "balance_loss_clip": 1.03501034, "balance_loss_mlp": 1.00021982, "epoch": 0.1931158875695175, "flos": 54511895975040.0, "grad_norm": 0.9633495631759209, "language_loss": 0.63641912, "learning_rate": 3.7273256503185953e-06, "loss": 0.6571812, "num_input_tokens_seen": 69382535, "step": 3212, "time_per_iteration": 2.974940299987793 }, { "auxiliary_loss_clip": 0.01131742, "auxiliary_loss_mlp": 0.01044059, "balance_loss_clip": 1.05586314, "balance_loss_mlp": 1.02565336, "epoch": 0.19317601082218547, "flos": 19828436365440.0, "grad_norm": 1.7209148950717332, "language_loss": 0.76375663, "learning_rate": 3.7271293016355074e-06, "loss": 0.78551459, "num_input_tokens_seen": 69400600, "step": 3213, "time_per_iteration": 2.7898454666137695 }, { "auxiliary_loss_clip": 0.01123196, "auxiliary_loss_mlp": 0.0105066, "balance_loss_clip": 1.05261111, "balance_loss_mlp": 1.03116894, "epoch": 0.19323613407485346, "flos": 13152068467200.0, "grad_norm": 2.349758973823363, "language_loss": 0.70871878, "learning_rate": 3.726932887459503e-06, "loss": 0.73045731, "num_input_tokens_seen": 69417350, "step": 3214, "time_per_iteration": 2.8155152797698975 }, { "auxiliary_loss_clip": 0.01155585, "auxiliary_loss_mlp": 0.01047831, "balance_loss_clip": 1.05412841, "balance_loss_mlp": 1.02807808, "epoch": 0.19329625732752143, "flos": 14027247342720.0, "grad_norm": 2.190607045917922, "language_loss": 0.75067955, "learning_rate": 3.72673640779803e-06, "loss": 0.77271378, "num_input_tokens_seen": 69431845, "step": 3215, "time_per_iteration": 4.111938238143921 }, { "auxiliary_loss_clip": 0.01112217, "auxiliary_loss_mlp": 0.01049964, "balance_loss_clip": 1.04928339, "balance_loss_mlp": 1.0323447, "epoch": 0.1933563805801894, "flos": 23441732069760.0, "grad_norm": 1.7842520268521305, "language_loss": 0.88426638, "learning_rate": 3.72653986265854e-06, "loss": 0.9058882, "num_input_tokens_seen": 69453275, "step": 3216, "time_per_iteration": 2.7699615955352783 }, { "auxiliary_loss_clip": 0.01153806, "auxiliary_loss_mlp": 0.01052131, "balance_loss_clip": 1.05435801, "balance_loss_mlp": 1.03442836, "epoch": 0.19341650383285736, "flos": 20485314334080.0, "grad_norm": 1.6996051239972392, "language_loss": 0.7974773, "learning_rate": 3.726343252048485e-06, "loss": 0.81953669, "num_input_tokens_seen": 69471830, "step": 3217, "time_per_iteration": 2.6788718700408936 }, { "auxiliary_loss_clip": 0.01143281, "auxiliary_loss_mlp": 0.0104914, "balance_loss_clip": 1.05695105, "balance_loss_mlp": 1.02864754, "epoch": 0.19347662708552532, "flos": 17858484817920.0, "grad_norm": 4.708784796317305, "language_loss": 0.6161437, "learning_rate": 3.7261465759753206e-06, "loss": 0.6380679, "num_input_tokens_seen": 69489320, "step": 3218, "time_per_iteration": 4.352849960327148 }, { "auxiliary_loss_clip": 0.01157355, "auxiliary_loss_mlp": 0.01047211, "balance_loss_clip": 1.05723107, "balance_loss_mlp": 1.02873373, "epoch": 0.1935367503381933, "flos": 18187247024640.0, "grad_norm": 1.9724785552136583, "language_loss": 0.80345452, "learning_rate": 3.7259498344465053e-06, "loss": 0.82550013, "num_input_tokens_seen": 69506665, "step": 3219, "time_per_iteration": 4.1739161014556885 }, { "auxiliary_loss_clip": 0.01104687, "auxiliary_loss_mlp": 0.01047672, "balance_loss_clip": 1.05145359, "balance_loss_mlp": 1.02819324, "epoch": 0.19359687359086128, "flos": 15957122290560.0, "grad_norm": 2.7508533279024077, "language_loss": 0.85693008, "learning_rate": 3.7257530274694993e-06, "loss": 0.87845367, "num_input_tokens_seen": 69523835, "step": 3220, "time_per_iteration": 2.777284622192383 }, { "auxiliary_loss_clip": 0.01149581, "auxiliary_loss_mlp": 0.01041747, "balance_loss_clip": 1.05441856, "balance_loss_mlp": 1.02511764, "epoch": 0.19365699684352924, "flos": 21215198695680.0, "grad_norm": 2.05545450883527, "language_loss": 0.84637755, "learning_rate": 3.725556155051766e-06, "loss": 0.86829084, "num_input_tokens_seen": 69542620, "step": 3221, "time_per_iteration": 4.224115371704102 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01044558, "balance_loss_clip": 1.05466259, "balance_loss_mlp": 1.02730846, "epoch": 0.1937171200961972, "flos": 17311098481920.0, "grad_norm": 2.658004231066563, "language_loss": 0.86087942, "learning_rate": 3.7253592172007702e-06, "loss": 0.8827616, "num_input_tokens_seen": 69561130, "step": 3222, "time_per_iteration": 2.6400530338287354 }, { "auxiliary_loss_clip": 0.01069453, "auxiliary_loss_mlp": 0.01045281, "balance_loss_clip": 1.04206085, "balance_loss_mlp": 1.02599275, "epoch": 0.19377724334886517, "flos": 22635968227200.0, "grad_norm": 1.8604116943694204, "language_loss": 0.78510809, "learning_rate": 3.72516221392398e-06, "loss": 0.8062554, "num_input_tokens_seen": 69580425, "step": 3223, "time_per_iteration": 2.9685652256011963 }, { "auxiliary_loss_clip": 0.01146062, "auxiliary_loss_mlp": 0.01046815, "balance_loss_clip": 1.05697751, "balance_loss_mlp": 1.02819431, "epoch": 0.19383736660153314, "flos": 15077813351040.0, "grad_norm": 1.8958208586464897, "language_loss": 0.75391948, "learning_rate": 3.7249651452288653e-06, "loss": 0.77584827, "num_input_tokens_seen": 69597085, "step": 3224, "time_per_iteration": 2.665294885635376 }, { "auxiliary_loss_clip": 0.01102293, "auxiliary_loss_mlp": 0.01050181, "balance_loss_clip": 1.04728186, "balance_loss_mlp": 1.02927208, "epoch": 0.1938974898542011, "flos": 47119934350080.0, "grad_norm": 3.358076005999295, "language_loss": 0.71180636, "learning_rate": 3.7247680111229e-06, "loss": 0.73333108, "num_input_tokens_seen": 69618885, "step": 3225, "time_per_iteration": 2.997511863708496 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01053167, "balance_loss_clip": 1.0519309, "balance_loss_mlp": 1.03480864, "epoch": 0.19395761310686907, "flos": 25812554376960.0, "grad_norm": 2.42331686427639, "language_loss": 0.69379079, "learning_rate": 3.7245708116135585e-06, "loss": 0.71552593, "num_input_tokens_seen": 69638200, "step": 3226, "time_per_iteration": 2.746338129043579 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01042276, "balance_loss_clip": 1.05692983, "balance_loss_mlp": 1.02264214, "epoch": 0.19401773635953706, "flos": 23039604334080.0, "grad_norm": 2.1006513764454864, "language_loss": 0.76236808, "learning_rate": 3.7243735467083193e-06, "loss": 0.78405869, "num_input_tokens_seen": 69657550, "step": 3227, "time_per_iteration": 2.760087728500366 }, { "auxiliary_loss_clip": 0.01117794, "auxiliary_loss_mlp": 0.010438, "balance_loss_clip": 1.05304587, "balance_loss_mlp": 1.0256561, "epoch": 0.19407785961220503, "flos": 15920780705280.0, "grad_norm": 2.8268368707906397, "language_loss": 0.69577461, "learning_rate": 3.724176216414662e-06, "loss": 0.71739054, "num_input_tokens_seen": 69675005, "step": 3228, "time_per_iteration": 2.6779348850250244 }, { "auxiliary_loss_clip": 0.01148199, "auxiliary_loss_mlp": 0.01042315, "balance_loss_clip": 1.05775642, "balance_loss_mlp": 1.02445757, "epoch": 0.194137982864873, "flos": 25921722787200.0, "grad_norm": 1.7694943420266864, "language_loss": 0.74160898, "learning_rate": 3.72397882074007e-06, "loss": 0.76351416, "num_input_tokens_seen": 69696455, "step": 3229, "time_per_iteration": 2.7229623794555664 }, { "auxiliary_loss_clip": 0.01119678, "auxiliary_loss_mlp": 0.01044155, "balance_loss_clip": 1.05435359, "balance_loss_mlp": 1.0262022, "epoch": 0.19419810611754096, "flos": 13261344618240.0, "grad_norm": 1.9766126324167548, "language_loss": 0.65722096, "learning_rate": 3.7237813596920285e-06, "loss": 0.67885935, "num_input_tokens_seen": 69714245, "step": 3230, "time_per_iteration": 2.740324020385742 }, { "auxiliary_loss_clip": 0.01124671, "auxiliary_loss_mlp": 0.00776003, "balance_loss_clip": 1.05223823, "balance_loss_mlp": 1.00081468, "epoch": 0.19425822937020892, "flos": 15705568368000.0, "grad_norm": 1.9307338208311895, "language_loss": 0.82042694, "learning_rate": 3.7235838332780254e-06, "loss": 0.83943367, "num_input_tokens_seen": 69731515, "step": 3231, "time_per_iteration": 2.7453513145446777 }, { "auxiliary_loss_clip": 0.0113141, "auxiliary_loss_mlp": 0.01042332, "balance_loss_clip": 1.05393946, "balance_loss_mlp": 1.02220988, "epoch": 0.1943183526228769, "flos": 23105392093440.0, "grad_norm": 10.866686758212083, "language_loss": 0.87038374, "learning_rate": 3.72338624150555e-06, "loss": 0.89212114, "num_input_tokens_seen": 69748885, "step": 3232, "time_per_iteration": 2.7575178146362305 }, { "auxiliary_loss_clip": 0.01100451, "auxiliary_loss_mlp": 0.01050878, "balance_loss_clip": 1.05029583, "balance_loss_mlp": 1.03102958, "epoch": 0.19437847587554485, "flos": 24712610146560.0, "grad_norm": 2.531838729905544, "language_loss": 0.85189134, "learning_rate": 3.723188584382096e-06, "loss": 0.87340462, "num_input_tokens_seen": 69767540, "step": 3233, "time_per_iteration": 2.8617444038391113 }, { "auxiliary_loss_clip": 0.01149478, "auxiliary_loss_mlp": 0.01054519, "balance_loss_clip": 1.0574832, "balance_loss_mlp": 1.0357672, "epoch": 0.19443859912821285, "flos": 23116130259840.0, "grad_norm": 1.7408859410354203, "language_loss": 0.89099532, "learning_rate": 3.722990861915158e-06, "loss": 0.91303527, "num_input_tokens_seen": 69789340, "step": 3234, "time_per_iteration": 2.7648239135742188 }, { "auxiliary_loss_clip": 0.01135157, "auxiliary_loss_mlp": 0.01044708, "balance_loss_clip": 1.05003643, "balance_loss_mlp": 1.02544403, "epoch": 0.1944987223808808, "flos": 15084385539840.0, "grad_norm": 2.4074482975555926, "language_loss": 0.78673434, "learning_rate": 3.722793074112234e-06, "loss": 0.80853301, "num_input_tokens_seen": 69806470, "step": 3235, "time_per_iteration": 2.76930832862854 }, { "auxiliary_loss_clip": 0.01136497, "auxiliary_loss_mlp": 0.01046749, "balance_loss_clip": 1.0580672, "balance_loss_mlp": 1.0293448, "epoch": 0.19455884563354878, "flos": 17126876603520.0, "grad_norm": 2.2511193258734354, "language_loss": 0.79391634, "learning_rate": 3.7225952209808233e-06, "loss": 0.81574875, "num_input_tokens_seen": 69822655, "step": 3236, "time_per_iteration": 2.7060179710388184 }, { "auxiliary_loss_clip": 0.01156991, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.05862045, "balance_loss_mlp": 1.02482522, "epoch": 0.19461896888621674, "flos": 20193396503040.0, "grad_norm": 2.1553329609131713, "language_loss": 0.76224017, "learning_rate": 3.72239730252843e-06, "loss": 0.78425646, "num_input_tokens_seen": 69841895, "step": 3237, "time_per_iteration": 2.642235040664673 }, { "auxiliary_loss_clip": 0.01158804, "auxiliary_loss_mlp": 0.01051059, "balance_loss_clip": 1.05648041, "balance_loss_mlp": 1.03289127, "epoch": 0.1946790921388847, "flos": 25301365971840.0, "grad_norm": 1.5204653275468003, "language_loss": 0.74828202, "learning_rate": 3.7221993187625583e-06, "loss": 0.77038062, "num_input_tokens_seen": 69862220, "step": 3238, "time_per_iteration": 2.6618688106536865 }, { "auxiliary_loss_clip": 0.01108331, "auxiliary_loss_mlp": 0.01046572, "balance_loss_clip": 1.04992437, "balance_loss_mlp": 1.02791595, "epoch": 0.19473921539155267, "flos": 20193396503040.0, "grad_norm": 3.1324225641798518, "language_loss": 0.734164, "learning_rate": 3.7220012696907155e-06, "loss": 0.75571299, "num_input_tokens_seen": 69881830, "step": 3239, "time_per_iteration": 2.7637152671813965 }, { "auxiliary_loss_clip": 0.01132567, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.05458641, "balance_loss_mlp": 1.02947509, "epoch": 0.19479933864422067, "flos": 20887549810560.0, "grad_norm": 2.155392951393246, "language_loss": 0.73291272, "learning_rate": 3.721803155320412e-06, "loss": 0.7547183, "num_input_tokens_seen": 69900515, "step": 3240, "time_per_iteration": 2.6980888843536377 }, { "auxiliary_loss_clip": 0.01131601, "auxiliary_loss_mlp": 0.0103943, "balance_loss_clip": 1.05846488, "balance_loss_mlp": 1.02208555, "epoch": 0.19485946189688863, "flos": 23295072839040.0, "grad_norm": 5.847648280625993, "language_loss": 0.65809447, "learning_rate": 3.7216049756591606e-06, "loss": 0.6798048, "num_input_tokens_seen": 69920060, "step": 3241, "time_per_iteration": 2.659707546234131 }, { "auxiliary_loss_clip": 0.01128971, "auxiliary_loss_mlp": 0.01048707, "balance_loss_clip": 1.05226684, "balance_loss_mlp": 1.03039646, "epoch": 0.1949195851495566, "flos": 23295036925440.0, "grad_norm": 1.4408225707306088, "language_loss": 0.82747853, "learning_rate": 3.7214067307144754e-06, "loss": 0.84925532, "num_input_tokens_seen": 69939820, "step": 3242, "time_per_iteration": 2.7137632369995117 }, { "auxiliary_loss_clip": 0.01077632, "auxiliary_loss_mlp": 0.01014225, "balance_loss_clip": 1.04083347, "balance_loss_mlp": 1.01131678, "epoch": 0.19497970840222456, "flos": 64962871557120.0, "grad_norm": 0.853263603243422, "language_loss": 0.57500821, "learning_rate": 3.721208420493875e-06, "loss": 0.59592682, "num_input_tokens_seen": 70002145, "step": 3243, "time_per_iteration": 3.1446309089660645 }, { "auxiliary_loss_clip": 0.01138548, "auxiliary_loss_mlp": 0.01050428, "balance_loss_clip": 1.05331421, "balance_loss_mlp": 1.02988815, "epoch": 0.19503983165489253, "flos": 19644717277440.0, "grad_norm": 7.2345723863132, "language_loss": 0.83789021, "learning_rate": 3.7210100450048784e-06, "loss": 0.85977995, "num_input_tokens_seen": 70020510, "step": 3244, "time_per_iteration": 2.6194229125976562 }, { "auxiliary_loss_clip": 0.01143261, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.05732584, "balance_loss_mlp": 1.02869976, "epoch": 0.1950999549075605, "flos": 21141976821120.0, "grad_norm": 2.0710390949438837, "language_loss": 0.7739507, "learning_rate": 3.7208116042550088e-06, "loss": 0.79584551, "num_input_tokens_seen": 70040760, "step": 3245, "time_per_iteration": 2.6684374809265137 }, { "auxiliary_loss_clip": 0.01142874, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.05566645, "balance_loss_mlp": 1.02431464, "epoch": 0.19516007816022846, "flos": 20884820376960.0, "grad_norm": 2.1010289547443133, "language_loss": 0.83988321, "learning_rate": 3.7206130982517906e-06, "loss": 0.86174309, "num_input_tokens_seen": 70058720, "step": 3246, "time_per_iteration": 2.6595354080200195 }, { "auxiliary_loss_clip": 0.0114599, "auxiliary_loss_mlp": 0.00776442, "balance_loss_clip": 1.05517101, "balance_loss_mlp": 1.00080454, "epoch": 0.19522020141289645, "flos": 16910515031040.0, "grad_norm": 3.3581015873305438, "language_loss": 0.76840878, "learning_rate": 3.7204145270027514e-06, "loss": 0.78763306, "num_input_tokens_seen": 70076470, "step": 3247, "time_per_iteration": 2.7777793407440186 }, { "auxiliary_loss_clip": 0.01121778, "auxiliary_loss_mlp": 0.01043977, "balance_loss_clip": 1.05689096, "balance_loss_mlp": 1.02651262, "epoch": 0.19528032466556441, "flos": 26724829023360.0, "grad_norm": 1.8981807103962522, "language_loss": 0.75459039, "learning_rate": 3.720215890515421e-06, "loss": 0.77624786, "num_input_tokens_seen": 70096220, "step": 3248, "time_per_iteration": 2.8088901042938232 }, { "auxiliary_loss_clip": 0.01156017, "auxiliary_loss_mlp": 0.01048303, "balance_loss_clip": 1.05548215, "balance_loss_mlp": 1.03008783, "epoch": 0.19534044791823238, "flos": 21032808410880.0, "grad_norm": 2.7209722336942135, "language_loss": 0.77774823, "learning_rate": 3.7200171887973316e-06, "loss": 0.79979146, "num_input_tokens_seen": 70114800, "step": 3249, "time_per_iteration": 2.610877752304077 }, { "auxiliary_loss_clip": 0.01148434, "auxiliary_loss_mlp": 0.01050332, "balance_loss_clip": 1.05689144, "balance_loss_mlp": 1.03299928, "epoch": 0.19540057117090034, "flos": 22344050396160.0, "grad_norm": 1.5551573885822045, "language_loss": 0.73118901, "learning_rate": 3.7198184218560176e-06, "loss": 0.75317669, "num_input_tokens_seen": 70134930, "step": 3250, "time_per_iteration": 2.5901567935943604 }, { "auxiliary_loss_clip": 0.01101628, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.05080378, "balance_loss_mlp": 1.02876413, "epoch": 0.1954606944235683, "flos": 20301631159680.0, "grad_norm": 2.030501302548557, "language_loss": 0.79203367, "learning_rate": 3.719619589699017e-06, "loss": 0.81351459, "num_input_tokens_seen": 70152045, "step": 3251, "time_per_iteration": 2.6619749069213867 }, { "auxiliary_loss_clip": 0.0115825, "auxiliary_loss_mlp": 0.01044132, "balance_loss_clip": 1.05741858, "balance_loss_mlp": 1.02606022, "epoch": 0.19552081767623627, "flos": 17346865449600.0, "grad_norm": 7.451515078679223, "language_loss": 0.83871722, "learning_rate": 3.7194206923338695e-06, "loss": 0.86074108, "num_input_tokens_seen": 70169240, "step": 3252, "time_per_iteration": 2.5029656887054443 }, { "auxiliary_loss_clip": 0.01142752, "auxiliary_loss_mlp": 0.01057294, "balance_loss_clip": 1.05278862, "balance_loss_mlp": 1.03518057, "epoch": 0.19558094092890424, "flos": 31977626129280.0, "grad_norm": 1.7140417843701068, "language_loss": 0.73995864, "learning_rate": 3.719221729768117e-06, "loss": 0.76195908, "num_input_tokens_seen": 70192690, "step": 3253, "time_per_iteration": 2.609117269515991 }, { "auxiliary_loss_clip": 0.01102675, "auxiliary_loss_mlp": 0.01046707, "balance_loss_clip": 1.04759037, "balance_loss_mlp": 1.02782381, "epoch": 0.19564106418157223, "flos": 22268889187200.0, "grad_norm": 2.1302159220485675, "language_loss": 0.76167047, "learning_rate": 3.7190227020093037e-06, "loss": 0.78316426, "num_input_tokens_seen": 70209685, "step": 3254, "time_per_iteration": 4.174965858459473 }, { "auxiliary_loss_clip": 0.01043127, "auxiliary_loss_mlp": 0.01006966, "balance_loss_clip": 1.04737842, "balance_loss_mlp": 1.0036757, "epoch": 0.1957011874342402, "flos": 54364554385920.0, "grad_norm": 0.84452007287803, "language_loss": 0.55275303, "learning_rate": 3.7188236090649774e-06, "loss": 0.57325399, "num_input_tokens_seen": 70265050, "step": 3255, "time_per_iteration": 3.2241716384887695 }, { "auxiliary_loss_clip": 0.01133721, "auxiliary_loss_mlp": 0.01041696, "balance_loss_clip": 1.0557251, "balance_loss_mlp": 1.02349281, "epoch": 0.19576131068690816, "flos": 16506699356160.0, "grad_norm": 2.6103802859468392, "language_loss": 0.70870697, "learning_rate": 3.718624450942688e-06, "loss": 0.73046112, "num_input_tokens_seen": 70281830, "step": 3256, "time_per_iteration": 2.641296148300171 }, { "auxiliary_loss_clip": 0.01152768, "auxiliary_loss_mlp": 0.01042867, "balance_loss_clip": 1.0544858, "balance_loss_mlp": 1.02523613, "epoch": 0.19582143393957613, "flos": 14719676797440.0, "grad_norm": 2.649319646209249, "language_loss": 0.80722409, "learning_rate": 3.718425227649987e-06, "loss": 0.82918048, "num_input_tokens_seen": 70297420, "step": 3257, "time_per_iteration": 4.258259057998657 }, { "auxiliary_loss_clip": 0.01106644, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.05470431, "balance_loss_mlp": 1.02601588, "epoch": 0.1958815571922441, "flos": 24425504737920.0, "grad_norm": 6.015808523610408, "language_loss": 0.75124931, "learning_rate": 3.7182259391944292e-06, "loss": 0.77274966, "num_input_tokens_seen": 70319210, "step": 3258, "time_per_iteration": 4.386433362960815 }, { "auxiliary_loss_clip": 0.01082287, "auxiliary_loss_mlp": 0.01044148, "balance_loss_clip": 1.04533339, "balance_loss_mlp": 1.0237875, "epoch": 0.19594168044491206, "flos": 24900279730560.0, "grad_norm": 1.8034996675319444, "language_loss": 0.73872411, "learning_rate": 3.7180265855835714e-06, "loss": 0.75998843, "num_input_tokens_seen": 70339045, "step": 3259, "time_per_iteration": 2.815469264984131 }, { "auxiliary_loss_clip": 0.01131793, "auxiliary_loss_mlp": 0.01043364, "balance_loss_clip": 1.05167735, "balance_loss_mlp": 1.02392125, "epoch": 0.19600180369758005, "flos": 12057008486400.0, "grad_norm": 2.2096667980592, "language_loss": 0.77053022, "learning_rate": 3.7178271668249735e-06, "loss": 0.79228187, "num_input_tokens_seen": 70356505, "step": 3260, "time_per_iteration": 4.2817702293396 }, { "auxiliary_loss_clip": 0.01148118, "auxiliary_loss_mlp": 0.01043761, "balance_loss_clip": 1.0551343, "balance_loss_mlp": 1.0248661, "epoch": 0.19606192695024802, "flos": 20850202644480.0, "grad_norm": 5.605178759176999, "language_loss": 0.82261205, "learning_rate": 3.7176276829261975e-06, "loss": 0.84453082, "num_input_tokens_seen": 70375410, "step": 3261, "time_per_iteration": 2.673092842102051 }, { "auxiliary_loss_clip": 0.01121379, "auxiliary_loss_mlp": 0.01044043, "balance_loss_clip": 1.0550617, "balance_loss_mlp": 1.02488637, "epoch": 0.19612205020291598, "flos": 28475509996800.0, "grad_norm": 1.8492209450679535, "language_loss": 0.76671481, "learning_rate": 3.717428133894807e-06, "loss": 0.78836906, "num_input_tokens_seen": 70396315, "step": 3262, "time_per_iteration": 2.803938150405884 }, { "auxiliary_loss_clip": 0.01148893, "auxiliary_loss_mlp": 0.01047259, "balance_loss_clip": 1.05960584, "balance_loss_mlp": 1.02950907, "epoch": 0.19618217345558395, "flos": 25556618995200.0, "grad_norm": 1.7278621785184562, "language_loss": 0.8668195, "learning_rate": 3.71722851973837e-06, "loss": 0.88878107, "num_input_tokens_seen": 70417945, "step": 3263, "time_per_iteration": 2.6677918434143066 }, { "auxiliary_loss_clip": 0.0113123, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.05328059, "balance_loss_mlp": 1.02505815, "epoch": 0.1962422967082519, "flos": 25264413855360.0, "grad_norm": 3.447639973868791, "language_loss": 0.73775035, "learning_rate": 3.717028840464455e-06, "loss": 0.75948811, "num_input_tokens_seen": 70438690, "step": 3264, "time_per_iteration": 2.6973094940185547 }, { "auxiliary_loss_clip": 0.01144053, "auxiliary_loss_mlp": 0.01049918, "balance_loss_clip": 1.05736756, "balance_loss_mlp": 1.03223944, "epoch": 0.19630241996091988, "flos": 18807352444800.0, "grad_norm": 2.4424358562200927, "language_loss": 0.78513813, "learning_rate": 3.7168290960806344e-06, "loss": 0.80707777, "num_input_tokens_seen": 70455385, "step": 3265, "time_per_iteration": 2.625739336013794 }, { "auxiliary_loss_clip": 0.01031434, "auxiliary_loss_mlp": 0.01002481, "balance_loss_clip": 1.03386986, "balance_loss_mlp": 0.99983466, "epoch": 0.19636254321358784, "flos": 62321137896960.0, "grad_norm": 0.7932330660809486, "language_loss": 0.53389955, "learning_rate": 3.716629286594483e-06, "loss": 0.55423868, "num_input_tokens_seen": 70514280, "step": 3266, "time_per_iteration": 3.2586586475372314 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.00776501, "balance_loss_clip": 1.04957044, "balance_loss_mlp": 1.00080895, "epoch": 0.19642266646625584, "flos": 21069329564160.0, "grad_norm": 2.0008611208986133, "language_loss": 0.80109024, "learning_rate": 3.7164294120135767e-06, "loss": 0.8201015, "num_input_tokens_seen": 70531800, "step": 3267, "time_per_iteration": 2.678537368774414 }, { "auxiliary_loss_clip": 0.01130982, "auxiliary_loss_mlp": 0.01043983, "balance_loss_clip": 1.05263019, "balance_loss_mlp": 1.02660179, "epoch": 0.1964827897189238, "flos": 14538651229440.0, "grad_norm": 1.9909459598185588, "language_loss": 0.86758262, "learning_rate": 3.7162294723454953e-06, "loss": 0.88933229, "num_input_tokens_seen": 70550615, "step": 3268, "time_per_iteration": 2.6949849128723145 }, { "auxiliary_loss_clip": 0.01099432, "auxiliary_loss_mlp": 0.01041621, "balance_loss_clip": 1.04954004, "balance_loss_mlp": 1.02408528, "epoch": 0.19654291297159177, "flos": 19244636616960.0, "grad_norm": 2.2632495429204127, "language_loss": 0.68785441, "learning_rate": 3.7160294675978197e-06, "loss": 0.70926493, "num_input_tokens_seen": 70568690, "step": 3269, "time_per_iteration": 2.770078182220459 }, { "auxiliary_loss_clip": 0.01116538, "auxiliary_loss_mlp": 0.01052319, "balance_loss_clip": 1.05113554, "balance_loss_mlp": 1.03330541, "epoch": 0.19660303622425973, "flos": 25775710001280.0, "grad_norm": 7.1863103423452355, "language_loss": 0.80241841, "learning_rate": 3.715829397778135e-06, "loss": 0.82410699, "num_input_tokens_seen": 70588665, "step": 3270, "time_per_iteration": 2.7294864654541016 }, { "auxiliary_loss_clip": 0.01139501, "auxiliary_loss_mlp": 0.01045694, "balance_loss_clip": 1.05189824, "balance_loss_mlp": 1.02833724, "epoch": 0.1966631594769277, "flos": 20595093275520.0, "grad_norm": 1.9668649321541274, "language_loss": 0.83912349, "learning_rate": 3.715629262894028e-06, "loss": 0.86097538, "num_input_tokens_seen": 70606900, "step": 3271, "time_per_iteration": 2.640235662460327 }, { "auxiliary_loss_clip": 0.01139368, "auxiliary_loss_mlp": 0.01051303, "balance_loss_clip": 1.05468225, "balance_loss_mlp": 1.0332067, "epoch": 0.19672328272959566, "flos": 23623188600960.0, "grad_norm": 1.9968416702279483, "language_loss": 0.79902714, "learning_rate": 3.715429062953087e-06, "loss": 0.82093388, "num_input_tokens_seen": 70625955, "step": 3272, "time_per_iteration": 2.636629343032837 }, { "auxiliary_loss_clip": 0.01124328, "auxiliary_loss_mlp": 0.01058493, "balance_loss_clip": 1.05192566, "balance_loss_mlp": 1.03715479, "epoch": 0.19678340598226365, "flos": 23110922787840.0, "grad_norm": 1.7302013075823783, "language_loss": 0.80942369, "learning_rate": 3.7152287979629043e-06, "loss": 0.83125186, "num_input_tokens_seen": 70646090, "step": 3273, "time_per_iteration": 2.6967809200286865 }, { "auxiliary_loss_clip": 0.01144024, "auxiliary_loss_mlp": 0.01054564, "balance_loss_clip": 1.05456042, "balance_loss_mlp": 1.03655195, "epoch": 0.19684352923493162, "flos": 24534852716160.0, "grad_norm": 2.225126358921887, "language_loss": 0.77984649, "learning_rate": 3.7150284679310735e-06, "loss": 0.80183232, "num_input_tokens_seen": 70666065, "step": 3274, "time_per_iteration": 2.6808643341064453 }, { "auxiliary_loss_clip": 0.01141267, "auxiliary_loss_mlp": 0.01046445, "balance_loss_clip": 1.05480242, "balance_loss_mlp": 1.02840877, "epoch": 0.19690365248759958, "flos": 21796448578560.0, "grad_norm": 2.318697297640889, "language_loss": 0.81433225, "learning_rate": 3.7148280728651914e-06, "loss": 0.8362093, "num_input_tokens_seen": 70681580, "step": 3275, "time_per_iteration": 2.672672986984253 }, { "auxiliary_loss_clip": 0.01115756, "auxiliary_loss_mlp": 0.01045314, "balance_loss_clip": 1.05148947, "balance_loss_mlp": 1.02686024, "epoch": 0.19696377574026755, "flos": 19056643810560.0, "grad_norm": 2.4665004531377166, "language_loss": 0.80909657, "learning_rate": 3.7146276127728563e-06, "loss": 0.83070731, "num_input_tokens_seen": 70697745, "step": 3276, "time_per_iteration": 2.726970672607422 }, { "auxiliary_loss_clip": 0.01142619, "auxiliary_loss_mlp": 0.01043042, "balance_loss_clip": 1.05443609, "balance_loss_mlp": 1.02491045, "epoch": 0.19702389899293551, "flos": 22820656982400.0, "grad_norm": 2.17541075016206, "language_loss": 0.89113599, "learning_rate": 3.7144270876616713e-06, "loss": 0.9129926, "num_input_tokens_seen": 70715110, "step": 3277, "time_per_iteration": 2.6738827228546143 }, { "auxiliary_loss_clip": 0.01103709, "auxiliary_loss_mlp": 0.01048433, "balance_loss_clip": 1.04638815, "balance_loss_mlp": 1.02864444, "epoch": 0.19708402224560348, "flos": 22894237992960.0, "grad_norm": 2.640727897616601, "language_loss": 0.62070847, "learning_rate": 3.714226497539239e-06, "loss": 0.64222991, "num_input_tokens_seen": 70734715, "step": 3278, "time_per_iteration": 2.7382938861846924 }, { "auxiliary_loss_clip": 0.01115303, "auxiliary_loss_mlp": 0.0105759, "balance_loss_clip": 1.05033016, "balance_loss_mlp": 1.03793263, "epoch": 0.19714414549827144, "flos": 25662519267840.0, "grad_norm": 1.930104581155035, "language_loss": 0.73606467, "learning_rate": 3.714025842413166e-06, "loss": 0.75779366, "num_input_tokens_seen": 70752650, "step": 3279, "time_per_iteration": 2.8123648166656494 }, { "auxiliary_loss_clip": 0.0114648, "auxiliary_loss_mlp": 0.01042853, "balance_loss_clip": 1.05422091, "balance_loss_mlp": 1.02567458, "epoch": 0.19720426875093944, "flos": 23915824704000.0, "grad_norm": 1.7034036878345749, "language_loss": 0.82685816, "learning_rate": 3.713825122291061e-06, "loss": 0.84875143, "num_input_tokens_seen": 70772365, "step": 3280, "time_per_iteration": 2.7000861167907715 }, { "auxiliary_loss_clip": 0.01106655, "auxiliary_loss_mlp": 0.01048884, "balance_loss_clip": 1.04887283, "balance_loss_mlp": 1.03071654, "epoch": 0.1972643920036074, "flos": 13881952828800.0, "grad_norm": 2.435959864664923, "language_loss": 0.78173983, "learning_rate": 3.713624337180536e-06, "loss": 0.80329525, "num_input_tokens_seen": 70790340, "step": 3281, "time_per_iteration": 2.7017247676849365 }, { "auxiliary_loss_clip": 0.01125353, "auxiliary_loss_mlp": 0.0104135, "balance_loss_clip": 1.05461836, "balance_loss_mlp": 1.02519727, "epoch": 0.19732451525627537, "flos": 19863592801920.0, "grad_norm": 1.7390973872526612, "language_loss": 0.79777479, "learning_rate": 3.7134234870892045e-06, "loss": 0.8194418, "num_input_tokens_seen": 70809295, "step": 3282, "time_per_iteration": 2.7064146995544434 }, { "auxiliary_loss_clip": 0.01112073, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.05485284, "balance_loss_mlp": 1.02538049, "epoch": 0.19738463850894333, "flos": 24973429777920.0, "grad_norm": 2.512566515566025, "language_loss": 0.7192747, "learning_rate": 3.7132225720246826e-06, "loss": 0.74082589, "num_input_tokens_seen": 70828765, "step": 3283, "time_per_iteration": 2.775297164916992 }, { "auxiliary_loss_clip": 0.01137498, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.05320621, "balance_loss_mlp": 1.02665281, "epoch": 0.1974447617616113, "flos": 18368883123840.0, "grad_norm": 1.8864815757917637, "language_loss": 0.78981179, "learning_rate": 3.7130215919945886e-06, "loss": 0.81162113, "num_input_tokens_seen": 70846805, "step": 3284, "time_per_iteration": 2.6344916820526123 }, { "auxiliary_loss_clip": 0.01126512, "auxiliary_loss_mlp": 0.00776821, "balance_loss_clip": 1.05065584, "balance_loss_mlp": 1.00114048, "epoch": 0.19750488501427926, "flos": 22892945103360.0, "grad_norm": 2.1903874509936982, "language_loss": 0.86317503, "learning_rate": 3.7128205470065445e-06, "loss": 0.88220835, "num_input_tokens_seen": 70863805, "step": 3285, "time_per_iteration": 2.725186586380005 }, { "auxiliary_loss_clip": 0.01115791, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.05167055, "balance_loss_mlp": 1.02658761, "epoch": 0.19756500826694723, "flos": 21871502046720.0, "grad_norm": 2.208260347555195, "language_loss": 0.88770825, "learning_rate": 3.712619437068174e-06, "loss": 0.90931326, "num_input_tokens_seen": 70882660, "step": 3286, "time_per_iteration": 2.6819698810577393 }, { "auxiliary_loss_clip": 0.01118742, "auxiliary_loss_mlp": 0.01052526, "balance_loss_clip": 1.05227792, "balance_loss_mlp": 1.03016233, "epoch": 0.19762513151961522, "flos": 15158972131200.0, "grad_norm": 2.0768117117784874, "language_loss": 0.77941382, "learning_rate": 3.712418262187102e-06, "loss": 0.80112648, "num_input_tokens_seen": 70898765, "step": 3287, "time_per_iteration": 2.641193389892578 }, { "auxiliary_loss_clip": 0.01127955, "auxiliary_loss_mlp": 0.01047337, "balance_loss_clip": 1.0526104, "balance_loss_mlp": 1.02849019, "epoch": 0.1976852547722832, "flos": 16979175878400.0, "grad_norm": 2.061421898899755, "language_loss": 0.80853081, "learning_rate": 3.7122170223709584e-06, "loss": 0.83028376, "num_input_tokens_seen": 70916370, "step": 3288, "time_per_iteration": 2.625068426132202 }, { "auxiliary_loss_clip": 0.01132408, "auxiliary_loss_mlp": 0.01048194, "balance_loss_clip": 1.05143857, "balance_loss_mlp": 1.03045535, "epoch": 0.19774537802495115, "flos": 20302924049280.0, "grad_norm": 2.345717890688315, "language_loss": 0.7317158, "learning_rate": 3.712015717627374e-06, "loss": 0.75352174, "num_input_tokens_seen": 70934870, "step": 3289, "time_per_iteration": 2.6319406032562256 }, { "auxiliary_loss_clip": 0.01133413, "auxiliary_loss_mlp": 0.01045224, "balance_loss_clip": 1.05575252, "balance_loss_mlp": 1.02678204, "epoch": 0.19780550127761912, "flos": 27235478724480.0, "grad_norm": 1.9087552003653308, "language_loss": 0.79608113, "learning_rate": 3.7118143479639813e-06, "loss": 0.81786746, "num_input_tokens_seen": 70955140, "step": 3290, "time_per_iteration": 2.706570863723755 }, { "auxiliary_loss_clip": 0.01049926, "auxiliary_loss_mlp": 0.0101105, "balance_loss_clip": 1.0327636, "balance_loss_mlp": 1.00853467, "epoch": 0.19786562453028708, "flos": 63550972684800.0, "grad_norm": 0.8952067644857119, "language_loss": 0.60318571, "learning_rate": 3.711612913388418e-06, "loss": 0.62379545, "num_input_tokens_seen": 71012005, "step": 3291, "time_per_iteration": 3.2849009037017822 }, { "auxiliary_loss_clip": 0.01158891, "auxiliary_loss_mlp": 0.01040785, "balance_loss_clip": 1.05417156, "balance_loss_mlp": 1.02088892, "epoch": 0.19792574778295505, "flos": 26286647011200.0, "grad_norm": 1.932789926440358, "language_loss": 0.81595641, "learning_rate": 3.7114114139083204e-06, "loss": 0.83795315, "num_input_tokens_seen": 71031140, "step": 3292, "time_per_iteration": 2.6751551628112793 }, { "auxiliary_loss_clip": 0.01119797, "auxiliary_loss_mlp": 0.00778082, "balance_loss_clip": 1.05296063, "balance_loss_mlp": 1.00086236, "epoch": 0.19798587103562304, "flos": 19938107566080.0, "grad_norm": 2.409042629875397, "language_loss": 0.81013, "learning_rate": 3.7112098495313313e-06, "loss": 0.82910883, "num_input_tokens_seen": 71050250, "step": 3293, "time_per_iteration": 4.3039703369140625 }, { "auxiliary_loss_clip": 0.01137316, "auxiliary_loss_mlp": 0.01052434, "balance_loss_clip": 1.05370128, "balance_loss_mlp": 1.03277683, "epoch": 0.198045994288291, "flos": 20120282369280.0, "grad_norm": 1.8764131105986912, "language_loss": 0.61480314, "learning_rate": 3.711008220265093e-06, "loss": 0.63670063, "num_input_tokens_seen": 71068665, "step": 3294, "time_per_iteration": 2.671241044998169 }, { "auxiliary_loss_clip": 0.01132208, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.05456376, "balance_loss_mlp": 1.02201271, "epoch": 0.19810611754095897, "flos": 17967653228160.0, "grad_norm": 2.0334748560156393, "language_loss": 0.87313825, "learning_rate": 3.710806526117251e-06, "loss": 0.89486015, "num_input_tokens_seen": 71085320, "step": 3295, "time_per_iteration": 2.659680128097534 }, { "auxiliary_loss_clip": 0.01113106, "auxiliary_loss_mlp": 0.01050184, "balance_loss_clip": 1.05079484, "balance_loss_mlp": 1.03256536, "epoch": 0.19816624079362694, "flos": 15084996071040.0, "grad_norm": 2.5215255479345067, "language_loss": 0.80839241, "learning_rate": 3.7106047670954544e-06, "loss": 0.83002532, "num_input_tokens_seen": 71102020, "step": 3296, "time_per_iteration": 4.299339294433594 }, { "auxiliary_loss_clip": 0.01123906, "auxiliary_loss_mlp": 0.01045438, "balance_loss_clip": 1.05233586, "balance_loss_mlp": 1.02522039, "epoch": 0.1982263640462949, "flos": 24900315644160.0, "grad_norm": 2.528943220563754, "language_loss": 0.68126047, "learning_rate": 3.710402943207354e-06, "loss": 0.70295388, "num_input_tokens_seen": 71123390, "step": 3297, "time_per_iteration": 4.258284091949463 }, { "auxiliary_loss_clip": 0.01153129, "auxiliary_loss_mlp": 0.01037574, "balance_loss_clip": 1.05660713, "balance_loss_mlp": 1.02031219, "epoch": 0.19828648729896287, "flos": 20376181837440.0, "grad_norm": 1.9083451106828888, "language_loss": 0.81310993, "learning_rate": 3.7102010544606016e-06, "loss": 0.83501697, "num_input_tokens_seen": 71141800, "step": 3298, "time_per_iteration": 2.6156656742095947 }, { "auxiliary_loss_clip": 0.01137409, "auxiliary_loss_mlp": 0.01042227, "balance_loss_clip": 1.0573976, "balance_loss_mlp": 1.02159238, "epoch": 0.19834661055163083, "flos": 18880035615360.0, "grad_norm": 1.8996943203321497, "language_loss": 0.85154539, "learning_rate": 3.7099991008628544e-06, "loss": 0.87334174, "num_input_tokens_seen": 71159505, "step": 3299, "time_per_iteration": 2.6749041080474854 }, { "auxiliary_loss_clip": 0.01036953, "auxiliary_loss_mlp": 0.01013935, "balance_loss_clip": 1.02875936, "balance_loss_mlp": 1.01106215, "epoch": 0.19840673380429882, "flos": 60259184640000.0, "grad_norm": 0.82907550606663, "language_loss": 0.53206414, "learning_rate": 3.7097970824217706e-06, "loss": 0.55257303, "num_input_tokens_seen": 71223265, "step": 3300, "time_per_iteration": 4.83857798576355 }, { "auxiliary_loss_clip": 0.01105122, "auxiliary_loss_mlp": 0.01064471, "balance_loss_clip": 1.04748702, "balance_loss_mlp": 1.0410459, "epoch": 0.1984668570569668, "flos": 19902017376000.0, "grad_norm": 316.1702389408657, "language_loss": 0.73014295, "learning_rate": 3.7095949991450093e-06, "loss": 0.75183886, "num_input_tokens_seen": 71242385, "step": 3301, "time_per_iteration": 2.700654983520508 }, { "auxiliary_loss_clip": 0.01118926, "auxiliary_loss_mlp": 0.01044315, "balance_loss_clip": 1.05295372, "balance_loss_mlp": 1.02619529, "epoch": 0.19852698030963475, "flos": 15630766295040.0, "grad_norm": 2.410718710355122, "language_loss": 0.88264418, "learning_rate": 3.709392851040235e-06, "loss": 0.90427655, "num_input_tokens_seen": 71258990, "step": 3302, "time_per_iteration": 2.7190146446228027 }, { "auxiliary_loss_clip": 0.01118067, "auxiliary_loss_mlp": 0.01045078, "balance_loss_clip": 1.05155802, "balance_loss_mlp": 1.02661204, "epoch": 0.19858710356230272, "flos": 43143007311360.0, "grad_norm": 2.210364764996701, "language_loss": 0.73592931, "learning_rate": 3.709190638115111e-06, "loss": 0.75756073, "num_input_tokens_seen": 71282770, "step": 3303, "time_per_iteration": 2.9379186630249023 }, { "auxiliary_loss_clip": 0.01143275, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.05491257, "balance_loss_mlp": 1.03002524, "epoch": 0.19864722681497068, "flos": 35144084643840.0, "grad_norm": 1.9482807590384623, "language_loss": 0.75103521, "learning_rate": 3.7089883603773084e-06, "loss": 0.77295315, "num_input_tokens_seen": 71301410, "step": 3304, "time_per_iteration": 2.743474245071411 }, { "auxiliary_loss_clip": 0.01133571, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.05309725, "balance_loss_mlp": 1.01710188, "epoch": 0.19870735006763865, "flos": 19426200888960.0, "grad_norm": 1.8722016114425952, "language_loss": 0.8628391, "learning_rate": 3.7087860178344955e-06, "loss": 0.8845247, "num_input_tokens_seen": 71319670, "step": 3305, "time_per_iteration": 2.7129390239715576 }, { "auxiliary_loss_clip": 0.01128329, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.04770195, "balance_loss_mlp": 1.02603281, "epoch": 0.19876747332030664, "flos": 23547380947200.0, "grad_norm": 2.9829227362861106, "language_loss": 0.68476367, "learning_rate": 3.7085836104943445e-06, "loss": 0.70648777, "num_input_tokens_seen": 71339850, "step": 3306, "time_per_iteration": 2.7083208560943604 }, { "auxiliary_loss_clip": 0.01119386, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.04822719, "balance_loss_mlp": 1.02168787, "epoch": 0.1988275965729746, "flos": 19829406032640.0, "grad_norm": 1.683647244561179, "language_loss": 0.76433122, "learning_rate": 3.7083811383645332e-06, "loss": 0.78591287, "num_input_tokens_seen": 71359795, "step": 3307, "time_per_iteration": 2.728661298751831 }, { "auxiliary_loss_clip": 0.01157548, "auxiliary_loss_mlp": 0.01044665, "balance_loss_clip": 1.05895782, "balance_loss_mlp": 1.02714145, "epoch": 0.19888771982564257, "flos": 23513625141120.0, "grad_norm": 2.438172575069382, "language_loss": 0.75991976, "learning_rate": 3.708178601452737e-06, "loss": 0.78194201, "num_input_tokens_seen": 71378885, "step": 3308, "time_per_iteration": 2.6580557823181152 }, { "auxiliary_loss_clip": 0.01107283, "auxiliary_loss_mlp": 0.01041656, "balance_loss_clip": 1.05453563, "balance_loss_mlp": 1.02307141, "epoch": 0.19894784307831054, "flos": 18150510389760.0, "grad_norm": 1.928689575161362, "language_loss": 0.76043576, "learning_rate": 3.7079759997666374e-06, "loss": 0.7819252, "num_input_tokens_seen": 71397285, "step": 3309, "time_per_iteration": 2.77226185798645 }, { "auxiliary_loss_clip": 0.0114115, "auxiliary_loss_mlp": 0.01045061, "balance_loss_clip": 1.05222607, "balance_loss_mlp": 1.02592754, "epoch": 0.1990079663309785, "flos": 24276044246400.0, "grad_norm": 75.17312936609292, "language_loss": 0.87855697, "learning_rate": 3.707773333313917e-06, "loss": 0.90041906, "num_input_tokens_seen": 71415775, "step": 3310, "time_per_iteration": 2.6789662837982178 }, { "auxiliary_loss_clip": 0.01153037, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.05415869, "balance_loss_mlp": 1.02139854, "epoch": 0.19906808958364647, "flos": 34897666366080.0, "grad_norm": 2.3155756588664342, "language_loss": 0.63650048, "learning_rate": 3.70757060210226e-06, "loss": 0.6584295, "num_input_tokens_seen": 71437315, "step": 3311, "time_per_iteration": 2.7604620456695557 }, { "auxiliary_loss_clip": 0.01115133, "auxiliary_loss_mlp": 0.01043871, "balance_loss_clip": 1.04763019, "balance_loss_mlp": 1.02501202, "epoch": 0.19912821283631443, "flos": 24024885373440.0, "grad_norm": 3.8064295514597717, "language_loss": 0.74542546, "learning_rate": 3.707367806139355e-06, "loss": 0.76701546, "num_input_tokens_seen": 71456320, "step": 3312, "time_per_iteration": 2.796475410461426 }, { "auxiliary_loss_clip": 0.01141587, "auxiliary_loss_mlp": 0.01037435, "balance_loss_clip": 1.05358124, "balance_loss_mlp": 1.02017355, "epoch": 0.19918833608898243, "flos": 19859031774720.0, "grad_norm": 2.2312990164825943, "language_loss": 0.84033173, "learning_rate": 3.7071649454328915e-06, "loss": 0.86212194, "num_input_tokens_seen": 71475360, "step": 3313, "time_per_iteration": 2.6044952869415283 }, { "auxiliary_loss_clip": 0.01146797, "auxiliary_loss_mlp": 0.01042166, "balance_loss_clip": 1.05695391, "balance_loss_mlp": 1.02422476, "epoch": 0.1992484593416504, "flos": 29095794984960.0, "grad_norm": 3.856678450124864, "language_loss": 0.810305, "learning_rate": 3.7069620199905625e-06, "loss": 0.83219463, "num_input_tokens_seen": 71496155, "step": 3314, "time_per_iteration": 2.68841814994812 }, { "auxiliary_loss_clip": 0.01112846, "auxiliary_loss_mlp": 0.01043677, "balance_loss_clip": 1.04617178, "balance_loss_mlp": 1.02643955, "epoch": 0.19930858259431836, "flos": 23295001011840.0, "grad_norm": 1.4822079401394097, "language_loss": 0.87391549, "learning_rate": 3.7067590298200627e-06, "loss": 0.89548075, "num_input_tokens_seen": 71517295, "step": 3315, "time_per_iteration": 2.720093011856079 }, { "auxiliary_loss_clip": 0.0111589, "auxiliary_loss_mlp": 0.00777002, "balance_loss_clip": 1.04992676, "balance_loss_mlp": 1.00093687, "epoch": 0.19936870584698632, "flos": 25378825651200.0, "grad_norm": 1.7805516248937883, "language_loss": 0.70957202, "learning_rate": 3.7065559749290892e-06, "loss": 0.72850096, "num_input_tokens_seen": 71540000, "step": 3316, "time_per_iteration": 2.850100517272949 }, { "auxiliary_loss_clip": 0.01019745, "auxiliary_loss_mlp": 0.01012504, "balance_loss_clip": 1.03032303, "balance_loss_mlp": 1.01003671, "epoch": 0.1994288290996543, "flos": 62168053109760.0, "grad_norm": 0.8326978726055106, "language_loss": 0.66287398, "learning_rate": 3.706352855325342e-06, "loss": 0.68319643, "num_input_tokens_seen": 71607880, "step": 3317, "time_per_iteration": 3.425114870071411 }, { "auxiliary_loss_clip": 0.01148059, "auxiliary_loss_mlp": 0.01048913, "balance_loss_clip": 1.05397809, "balance_loss_mlp": 1.02964854, "epoch": 0.19948895235232225, "flos": 19025832919680.0, "grad_norm": 2.282515690517884, "language_loss": 0.74494618, "learning_rate": 3.7061496710165233e-06, "loss": 0.76691592, "num_input_tokens_seen": 71625695, "step": 3318, "time_per_iteration": 2.6815896034240723 }, { "auxiliary_loss_clip": 0.01114942, "auxiliary_loss_mlp": 0.01044681, "balance_loss_clip": 1.04767084, "balance_loss_mlp": 1.02786088, "epoch": 0.19954907560499022, "flos": 37815803182080.0, "grad_norm": 1.8966456913695608, "language_loss": 0.78894758, "learning_rate": 3.7059464220103385e-06, "loss": 0.81054389, "num_input_tokens_seen": 71648520, "step": 3319, "time_per_iteration": 2.847911834716797 }, { "auxiliary_loss_clip": 0.01134557, "auxiliary_loss_mlp": 0.01042988, "balance_loss_clip": 1.05354095, "balance_loss_mlp": 1.02312756, "epoch": 0.1996091988576582, "flos": 49565199594240.0, "grad_norm": 2.1348540211051197, "language_loss": 0.76006937, "learning_rate": 3.7057431083144945e-06, "loss": 0.78184479, "num_input_tokens_seen": 71672185, "step": 3320, "time_per_iteration": 2.9324615001678467 }, { "auxiliary_loss_clip": 0.01120226, "auxiliary_loss_mlp": 0.01042998, "balance_loss_clip": 1.05083311, "balance_loss_mlp": 1.02496171, "epoch": 0.19966932211032618, "flos": 22635788659200.0, "grad_norm": 2.2436863685702546, "language_loss": 0.80077857, "learning_rate": 3.705539729936701e-06, "loss": 0.82241082, "num_input_tokens_seen": 71692890, "step": 3321, "time_per_iteration": 2.7534186840057373 }, { "auxiliary_loss_clip": 0.01033096, "auxiliary_loss_mlp": 0.01011167, "balance_loss_clip": 1.02391553, "balance_loss_mlp": 1.00828266, "epoch": 0.19972944536299414, "flos": 54082117745280.0, "grad_norm": 0.874673110280983, "language_loss": 0.65145189, "learning_rate": 3.7053362868846696e-06, "loss": 0.67189455, "num_input_tokens_seen": 71745815, "step": 3322, "time_per_iteration": 3.0398683547973633 }, { "auxiliary_loss_clip": 0.01039999, "auxiliary_loss_mlp": 0.01007775, "balance_loss_clip": 1.02971482, "balance_loss_mlp": 1.00479472, "epoch": 0.1997895686156621, "flos": 69355031817600.0, "grad_norm": 0.7915334307535052, "language_loss": 0.56919783, "learning_rate": 3.7051327791661153e-06, "loss": 0.58967561, "num_input_tokens_seen": 71806915, "step": 3323, "time_per_iteration": 3.2814581394195557 }, { "auxiliary_loss_clip": 0.01131487, "auxiliary_loss_mlp": 0.00776139, "balance_loss_clip": 1.05244064, "balance_loss_mlp": 1.00085235, "epoch": 0.19984969186833007, "flos": 18552063507840.0, "grad_norm": 1.8766856730809967, "language_loss": 0.80573648, "learning_rate": 3.7049292067887555e-06, "loss": 0.82481277, "num_input_tokens_seen": 71824645, "step": 3324, "time_per_iteration": 2.66456937789917 }, { "auxiliary_loss_clip": 0.01132572, "auxiliary_loss_mlp": 0.01050254, "balance_loss_clip": 1.04625165, "balance_loss_mlp": 1.03027487, "epoch": 0.19990981512099804, "flos": 26429678968320.0, "grad_norm": 2.4535669107623486, "language_loss": 0.53931105, "learning_rate": 3.7047255697603092e-06, "loss": 0.56113935, "num_input_tokens_seen": 71845125, "step": 3325, "time_per_iteration": 2.696556329727173 }, { "auxiliary_loss_clip": 0.01130165, "auxiliary_loss_mlp": 0.01050725, "balance_loss_clip": 1.05065942, "balance_loss_mlp": 1.03328443, "epoch": 0.19996993837366603, "flos": 16325997010560.0, "grad_norm": 2.1570763946475187, "language_loss": 0.86074936, "learning_rate": 3.7045218680884984e-06, "loss": 0.88255823, "num_input_tokens_seen": 71863500, "step": 3326, "time_per_iteration": 2.7167885303497314 }, { "auxiliary_loss_clip": 0.0115173, "auxiliary_loss_mlp": 0.01042065, "balance_loss_clip": 1.05427039, "balance_loss_mlp": 1.02511311, "epoch": 0.200030061626334, "flos": 20844169159680.0, "grad_norm": 2.0419576492150395, "language_loss": 0.71793801, "learning_rate": 3.7043181017810476e-06, "loss": 0.73987597, "num_input_tokens_seen": 71881845, "step": 3327, "time_per_iteration": 2.6097662448883057 }, { "auxiliary_loss_clip": 0.01131035, "auxiliary_loss_mlp": 0.01052756, "balance_loss_clip": 1.05146813, "balance_loss_mlp": 1.03290796, "epoch": 0.20009018487900196, "flos": 23762629198080.0, "grad_norm": 1.8948781463857982, "language_loss": 0.7668376, "learning_rate": 3.7041142708456833e-06, "loss": 0.78867549, "num_input_tokens_seen": 71900940, "step": 3328, "time_per_iteration": 2.6869349479675293 }, { "auxiliary_loss_clip": 0.01118681, "auxiliary_loss_mlp": 0.01044603, "balance_loss_clip": 1.04693103, "balance_loss_mlp": 1.02799726, "epoch": 0.20015030813166992, "flos": 28111555440000.0, "grad_norm": 2.0833377369651984, "language_loss": 0.69400644, "learning_rate": 3.7039103752901353e-06, "loss": 0.71563935, "num_input_tokens_seen": 71921925, "step": 3329, "time_per_iteration": 2.844280481338501 }, { "auxiliary_loss_clip": 0.01107384, "auxiliary_loss_mlp": 0.01069575, "balance_loss_clip": 1.04727411, "balance_loss_mlp": 1.04641271, "epoch": 0.2002104313843379, "flos": 26067160955520.0, "grad_norm": 3.099532194576676, "language_loss": 0.81395614, "learning_rate": 3.7037064151221353e-06, "loss": 0.83572567, "num_input_tokens_seen": 71941855, "step": 3330, "time_per_iteration": 2.841885566711426 }, { "auxiliary_loss_clip": 0.01137825, "auxiliary_loss_mlp": 0.01048123, "balance_loss_clip": 1.05147684, "balance_loss_mlp": 1.02977705, "epoch": 0.20027055463700585, "flos": 22966633854720.0, "grad_norm": 2.224132696455658, "language_loss": 0.76606882, "learning_rate": 3.703502390349417e-06, "loss": 0.78792834, "num_input_tokens_seen": 71960915, "step": 3331, "time_per_iteration": 2.7007360458374023 }, { "auxiliary_loss_clip": 0.01093521, "auxiliary_loss_mlp": 0.01069739, "balance_loss_clip": 1.04292202, "balance_loss_mlp": 1.04851985, "epoch": 0.20033067788967382, "flos": 17165660313600.0, "grad_norm": 2.044808670508971, "language_loss": 0.79330826, "learning_rate": 3.7032983009797176e-06, "loss": 0.81494087, "num_input_tokens_seen": 71979220, "step": 3332, "time_per_iteration": 4.518973112106323 }, { "auxiliary_loss_clip": 0.01046467, "auxiliary_loss_mlp": 0.010754, "balance_loss_clip": 1.02134657, "balance_loss_mlp": 1.07303989, "epoch": 0.2003908011423418, "flos": 60825566292480.0, "grad_norm": 0.9607431077817938, "language_loss": 0.61968678, "learning_rate": 3.703094147020776e-06, "loss": 0.64090544, "num_input_tokens_seen": 72033950, "step": 3333, "time_per_iteration": 3.074782371520996 }, { "auxiliary_loss_clip": 0.01112058, "auxiliary_loss_mlp": 0.00777645, "balance_loss_clip": 1.04686844, "balance_loss_mlp": 1.00099933, "epoch": 0.20045092439500978, "flos": 24206234163840.0, "grad_norm": 2.9954165903614447, "language_loss": 0.81385547, "learning_rate": 3.7028899284803334e-06, "loss": 0.83275253, "num_input_tokens_seen": 72051395, "step": 3334, "time_per_iteration": 4.270732641220093 }, { "auxiliary_loss_clip": 0.01096467, "auxiliary_loss_mlp": 0.01058699, "balance_loss_clip": 1.04709518, "balance_loss_mlp": 1.03889799, "epoch": 0.20051104764767774, "flos": 29387605075200.0, "grad_norm": 2.9016061168315703, "language_loss": 0.74238038, "learning_rate": 3.702685645366134e-06, "loss": 0.76393211, "num_input_tokens_seen": 72071305, "step": 3335, "time_per_iteration": 4.376626491546631 }, { "auxiliary_loss_clip": 0.01149242, "auxiliary_loss_mlp": 0.01059851, "balance_loss_clip": 1.05611062, "balance_loss_mlp": 1.04120684, "epoch": 0.2005711709003457, "flos": 23513804709120.0, "grad_norm": 1.700795836589561, "language_loss": 0.79981416, "learning_rate": 3.7024812976859243e-06, "loss": 0.82190514, "num_input_tokens_seen": 72090165, "step": 3336, "time_per_iteration": 2.7031586170196533 }, { "auxiliary_loss_clip": 0.01116655, "auxiliary_loss_mlp": 0.01048065, "balance_loss_clip": 1.04808092, "balance_loss_mlp": 1.0272038, "epoch": 0.20063129415301367, "flos": 22523388024960.0, "grad_norm": 2.0182523905302157, "language_loss": 0.7761423, "learning_rate": 3.7022768854474532e-06, "loss": 0.79778945, "num_input_tokens_seen": 72107210, "step": 3337, "time_per_iteration": 2.6990835666656494 }, { "auxiliary_loss_clip": 0.01158617, "auxiliary_loss_mlp": 0.01045618, "balance_loss_clip": 1.05752003, "balance_loss_mlp": 1.02631783, "epoch": 0.20069141740568164, "flos": 25958243940480.0, "grad_norm": 2.232061800350416, "language_loss": 0.69108742, "learning_rate": 3.7020724086584724e-06, "loss": 0.71312982, "num_input_tokens_seen": 72126315, "step": 3338, "time_per_iteration": 2.6827659606933594 }, { "auxiliary_loss_clip": 0.01117671, "auxiliary_loss_mlp": 0.01053755, "balance_loss_clip": 1.04930723, "balance_loss_mlp": 1.03543282, "epoch": 0.2007515406583496, "flos": 24790608529920.0, "grad_norm": 2.685005372503905, "language_loss": 0.68898237, "learning_rate": 3.701867867326735e-06, "loss": 0.71069658, "num_input_tokens_seen": 72146470, "step": 3339, "time_per_iteration": 4.430418014526367 }, { "auxiliary_loss_clip": 0.01123098, "auxiliary_loss_mlp": 0.01041763, "balance_loss_clip": 1.05656064, "balance_loss_mlp": 1.02408433, "epoch": 0.2008116639110176, "flos": 37925582123520.0, "grad_norm": 2.0597617887640607, "language_loss": 0.66606021, "learning_rate": 3.7016632614599974e-06, "loss": 0.6877088, "num_input_tokens_seen": 72166600, "step": 3340, "time_per_iteration": 3.0020461082458496 }, { "auxiliary_loss_clip": 0.01145166, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.05326021, "balance_loss_mlp": 1.01712155, "epoch": 0.20087178716368556, "flos": 20740531443840.0, "grad_norm": 6.669810478748975, "language_loss": 0.74554622, "learning_rate": 3.701458591066019e-06, "loss": 0.76736599, "num_input_tokens_seen": 72185160, "step": 3341, "time_per_iteration": 2.762573480606079 }, { "auxiliary_loss_clip": 0.01110242, "auxiliary_loss_mlp": 0.01044424, "balance_loss_clip": 1.04981375, "balance_loss_mlp": 1.02595794, "epoch": 0.20093191041635353, "flos": 23842279607040.0, "grad_norm": 7.177474445031109, "language_loss": 0.71779013, "learning_rate": 3.70125385615256e-06, "loss": 0.73933673, "num_input_tokens_seen": 72205160, "step": 3342, "time_per_iteration": 2.7128167152404785 }, { "auxiliary_loss_clip": 0.01114025, "auxiliary_loss_mlp": 0.01045057, "balance_loss_clip": 1.05036438, "balance_loss_mlp": 1.02749765, "epoch": 0.2009920336690215, "flos": 21792067119360.0, "grad_norm": 2.3652416151608873, "language_loss": 0.72892809, "learning_rate": 3.701049056727384e-06, "loss": 0.75051892, "num_input_tokens_seen": 72223555, "step": 3343, "time_per_iteration": 2.8155410289764404 }, { "auxiliary_loss_clip": 0.01113341, "auxiliary_loss_mlp": 0.01046556, "balance_loss_clip": 1.04568779, "balance_loss_mlp": 1.02762532, "epoch": 0.20105215692168946, "flos": 26359222440960.0, "grad_norm": 2.2972411099560195, "language_loss": 0.80645263, "learning_rate": 3.7008441927982574e-06, "loss": 0.82805163, "num_input_tokens_seen": 72242465, "step": 3344, "time_per_iteration": 2.780198335647583 }, { "auxiliary_loss_clip": 0.01155099, "auxiliary_loss_mlp": 0.01045938, "balance_loss_clip": 1.05386972, "balance_loss_mlp": 1.02773499, "epoch": 0.20111228017435742, "flos": 18807280617600.0, "grad_norm": 2.2640230255386125, "language_loss": 0.83114576, "learning_rate": 3.700639264372948e-06, "loss": 0.85315621, "num_input_tokens_seen": 72260655, "step": 3345, "time_per_iteration": 2.6209781169891357 }, { "auxiliary_loss_clip": 0.01093716, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.04619193, "balance_loss_mlp": 1.02492619, "epoch": 0.20117240342702541, "flos": 19975059682560.0, "grad_norm": 1.7610524328763844, "language_loss": 0.67947632, "learning_rate": 3.7004342714592283e-06, "loss": 0.70082676, "num_input_tokens_seen": 72279055, "step": 3346, "time_per_iteration": 2.692222833633423 }, { "auxiliary_loss_clip": 0.01114086, "auxiliary_loss_mlp": 0.01048128, "balance_loss_clip": 1.04710329, "balance_loss_mlp": 1.03028262, "epoch": 0.20123252667969338, "flos": 23142703345920.0, "grad_norm": 2.3067659385334958, "language_loss": 0.72993439, "learning_rate": 3.70022921406487e-06, "loss": 0.75155658, "num_input_tokens_seen": 72297895, "step": 3347, "time_per_iteration": 2.7501564025878906 }, { "auxiliary_loss_clip": 0.01142236, "auxiliary_loss_mlp": 0.01047715, "balance_loss_clip": 1.05465829, "balance_loss_mlp": 1.03122878, "epoch": 0.20129264993236134, "flos": 23221671396480.0, "grad_norm": 1.5798788242702444, "language_loss": 0.86869538, "learning_rate": 3.70002409219765e-06, "loss": 0.8905949, "num_input_tokens_seen": 72318385, "step": 3348, "time_per_iteration": 2.688606023788452 }, { "auxiliary_loss_clip": 0.01099793, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.04737949, "balance_loss_mlp": 1.02587092, "epoch": 0.2013527731850293, "flos": 21871466133120.0, "grad_norm": 1.8024729376762028, "language_loss": 0.71082795, "learning_rate": 3.699818905865346e-06, "loss": 0.73227775, "num_input_tokens_seen": 72338235, "step": 3349, "time_per_iteration": 2.8423163890838623 }, { "auxiliary_loss_clip": 0.01119982, "auxiliary_loss_mlp": 0.01044662, "balance_loss_clip": 1.0504061, "balance_loss_mlp": 1.02520752, "epoch": 0.20141289643769728, "flos": 18040803275520.0, "grad_norm": 1.7324672298731074, "language_loss": 0.71324664, "learning_rate": 3.6996136550757377e-06, "loss": 0.73489314, "num_input_tokens_seen": 72357825, "step": 3350, "time_per_iteration": 2.7691454887390137 }, { "auxiliary_loss_clip": 0.01126392, "auxiliary_loss_mlp": 0.01043835, "balance_loss_clip": 1.0497458, "balance_loss_mlp": 1.02312887, "epoch": 0.20147301969036524, "flos": 23951412103680.0, "grad_norm": 2.3965463087123107, "language_loss": 0.76391226, "learning_rate": 3.69940833983661e-06, "loss": 0.78561449, "num_input_tokens_seen": 72376335, "step": 3351, "time_per_iteration": 2.701244592666626 }, { "auxiliary_loss_clip": 0.01134085, "auxiliary_loss_mlp": 0.01047695, "balance_loss_clip": 1.05303741, "balance_loss_mlp": 1.02840734, "epoch": 0.2015331429430332, "flos": 25588471380480.0, "grad_norm": 1.5574195085232978, "language_loss": 0.80808926, "learning_rate": 3.699202960155748e-06, "loss": 0.82990712, "num_input_tokens_seen": 72395440, "step": 3352, "time_per_iteration": 2.707792043685913 }, { "auxiliary_loss_clip": 0.011457, "auxiliary_loss_mlp": 0.01042883, "balance_loss_clip": 1.05415952, "balance_loss_mlp": 1.0244298, "epoch": 0.2015932661957012, "flos": 26724972677760.0, "grad_norm": 1.9831574274346238, "language_loss": 0.80594563, "learning_rate": 3.6989975160409396e-06, "loss": 0.82783151, "num_input_tokens_seen": 72414670, "step": 3353, "time_per_iteration": 2.675960063934326 }, { "auxiliary_loss_clip": 0.01126272, "auxiliary_loss_mlp": 0.01045978, "balance_loss_clip": 1.05195928, "balance_loss_mlp": 1.02787042, "epoch": 0.20165338944836916, "flos": 15633136592640.0, "grad_norm": 2.0684163707657763, "language_loss": 0.90046668, "learning_rate": 3.6987920074999747e-06, "loss": 0.92218912, "num_input_tokens_seen": 72432210, "step": 3354, "time_per_iteration": 2.6648361682891846 }, { "auxiliary_loss_clip": 0.0104514, "auxiliary_loss_mlp": 0.0075774, "balance_loss_clip": 1.0285337, "balance_loss_mlp": 1.00170481, "epoch": 0.20171351270103713, "flos": 57912529207680.0, "grad_norm": 0.8264169258847935, "language_loss": 0.55863291, "learning_rate": 3.6985864345406465e-06, "loss": 0.57666171, "num_input_tokens_seen": 72489225, "step": 3355, "time_per_iteration": 3.155352830886841 }, { "auxiliary_loss_clip": 0.01127799, "auxiliary_loss_mlp": 0.00776255, "balance_loss_clip": 1.05133796, "balance_loss_mlp": 1.00109434, "epoch": 0.2017736359537051, "flos": 20814363849600.0, "grad_norm": 1.8367443502770229, "language_loss": 0.84333616, "learning_rate": 3.698380797170751e-06, "loss": 0.86237669, "num_input_tokens_seen": 72508715, "step": 3356, "time_per_iteration": 2.754645586013794 }, { "auxiliary_loss_clip": 0.01127514, "auxiliary_loss_mlp": 0.01052066, "balance_loss_clip": 1.04904747, "balance_loss_mlp": 1.02811635, "epoch": 0.20183375920637306, "flos": 17092043389440.0, "grad_norm": 3.2349249330618504, "language_loss": 0.70046175, "learning_rate": 3.698175095398085e-06, "loss": 0.72225749, "num_input_tokens_seen": 72525135, "step": 3357, "time_per_iteration": 2.6905863285064697 }, { "auxiliary_loss_clip": 0.0113535, "auxiliary_loss_mlp": 0.01044956, "balance_loss_clip": 1.05209541, "balance_loss_mlp": 1.02590632, "epoch": 0.20189388245904102, "flos": 18661339658880.0, "grad_norm": 2.41944886120848, "language_loss": 0.7169627, "learning_rate": 3.6979693292304493e-06, "loss": 0.73876572, "num_input_tokens_seen": 72543690, "step": 3358, "time_per_iteration": 2.696295738220215 }, { "auxiliary_loss_clip": 0.01139673, "auxiliary_loss_mlp": 0.01052145, "balance_loss_clip": 1.05050206, "balance_loss_mlp": 1.03496706, "epoch": 0.20195400571170902, "flos": 16797539779200.0, "grad_norm": 2.6870341127491675, "language_loss": 0.83242267, "learning_rate": 3.6977634986756463e-06, "loss": 0.85434085, "num_input_tokens_seen": 72560725, "step": 3359, "time_per_iteration": 2.6779677867889404 }, { "auxiliary_loss_clip": 0.01052166, "auxiliary_loss_mlp": 0.01026452, "balance_loss_clip": 1.02534354, "balance_loss_mlp": 1.02345943, "epoch": 0.20201412896437698, "flos": 67174716268800.0, "grad_norm": 0.8259567660078829, "language_loss": 0.58980465, "learning_rate": 3.697557603741482e-06, "loss": 0.61059082, "num_input_tokens_seen": 72621940, "step": 3360, "time_per_iteration": 3.1175289154052734 }, { "auxiliary_loss_clip": 0.01096543, "auxiliary_loss_mlp": 0.01051237, "balance_loss_clip": 1.05081403, "balance_loss_mlp": 1.03154337, "epoch": 0.20207425221704495, "flos": 21325013550720.0, "grad_norm": 2.668010943284884, "language_loss": 0.63219774, "learning_rate": 3.697351644435763e-06, "loss": 0.65367556, "num_input_tokens_seen": 72639135, "step": 3361, "time_per_iteration": 2.7732017040252686 }, { "auxiliary_loss_clip": 0.01119862, "auxiliary_loss_mlp": 0.01069748, "balance_loss_clip": 1.04988885, "balance_loss_mlp": 1.05035317, "epoch": 0.2021343754697129, "flos": 22527158952960.0, "grad_norm": 1.9150118782569074, "language_loss": 0.75946522, "learning_rate": 3.6971456207662993e-06, "loss": 0.78136134, "num_input_tokens_seen": 72658525, "step": 3362, "time_per_iteration": 2.755686044692993 }, { "auxiliary_loss_clip": 0.01139499, "auxiliary_loss_mlp": 0.00777827, "balance_loss_clip": 1.05068207, "balance_loss_mlp": 1.0011797, "epoch": 0.20219449872238088, "flos": 19062785036160.0, "grad_norm": 2.043450343479612, "language_loss": 0.76542944, "learning_rate": 3.6969395327409035e-06, "loss": 0.78460264, "num_input_tokens_seen": 72678085, "step": 3363, "time_per_iteration": 2.788773775100708 }, { "auxiliary_loss_clip": 0.01143235, "auxiliary_loss_mlp": 0.01068217, "balance_loss_clip": 1.05241406, "balance_loss_mlp": 1.0511229, "epoch": 0.20225462197504884, "flos": 24717027519360.0, "grad_norm": 1.8380065969237507, "language_loss": 0.75088942, "learning_rate": 3.696733380367391e-06, "loss": 0.773004, "num_input_tokens_seen": 72698695, "step": 3364, "time_per_iteration": 2.7484803199768066 }, { "auxiliary_loss_clip": 0.01111683, "auxiliary_loss_mlp": 0.01065374, "balance_loss_clip": 1.05202723, "balance_loss_mlp": 1.04583549, "epoch": 0.2023147452277168, "flos": 22018304931840.0, "grad_norm": 2.1478979049108395, "language_loss": 0.71917796, "learning_rate": 3.6965271636535783e-06, "loss": 0.7409485, "num_input_tokens_seen": 72717880, "step": 3365, "time_per_iteration": 2.770939350128174 }, { "auxiliary_loss_clip": 0.01110149, "auxiliary_loss_mlp": 0.01064133, "balance_loss_clip": 1.04989934, "balance_loss_mlp": 1.04559648, "epoch": 0.2023748684803848, "flos": 17745365911680.0, "grad_norm": 2.2136098995040228, "language_loss": 0.85318875, "learning_rate": 3.696320882607286e-06, "loss": 0.87493157, "num_input_tokens_seen": 72736410, "step": 3366, "time_per_iteration": 2.717759609222412 }, { "auxiliary_loss_clip": 0.01116913, "auxiliary_loss_mlp": 0.0106476, "balance_loss_clip": 1.050488, "balance_loss_mlp": 1.04605615, "epoch": 0.20243499173305277, "flos": 31138932493440.0, "grad_norm": 2.048733189447585, "language_loss": 0.69766563, "learning_rate": 3.696114537236335e-06, "loss": 0.71948242, "num_input_tokens_seen": 72758295, "step": 3367, "time_per_iteration": 2.788444995880127 }, { "auxiliary_loss_clip": 0.01144949, "auxiliary_loss_mlp": 0.01060722, "balance_loss_clip": 1.04997301, "balance_loss_mlp": 1.03857303, "epoch": 0.20249511498572073, "flos": 33839235279360.0, "grad_norm": 1.942153338299175, "language_loss": 0.68162113, "learning_rate": 3.6959081275485512e-06, "loss": 0.70367789, "num_input_tokens_seen": 72782495, "step": 3368, "time_per_iteration": 2.7339746952056885 }, { "auxiliary_loss_clip": 0.01123527, "auxiliary_loss_mlp": 0.01063426, "balance_loss_clip": 1.0543493, "balance_loss_mlp": 1.04405439, "epoch": 0.2025552382383887, "flos": 21215629658880.0, "grad_norm": 1.8860162071579365, "language_loss": 0.77298439, "learning_rate": 3.6957016535517615e-06, "loss": 0.79485393, "num_input_tokens_seen": 72801885, "step": 3369, "time_per_iteration": 2.739088535308838 }, { "auxiliary_loss_clip": 0.01136965, "auxiliary_loss_mlp": 0.01071822, "balance_loss_clip": 1.05140853, "balance_loss_mlp": 1.05315351, "epoch": 0.20261536149105666, "flos": 14647388676480.0, "grad_norm": 2.9806431283259354, "language_loss": 0.65055734, "learning_rate": 3.695495115253795e-06, "loss": 0.67264521, "num_input_tokens_seen": 72816990, "step": 3370, "time_per_iteration": 2.7082977294921875 }, { "auxiliary_loss_clip": 0.0105828, "auxiliary_loss_mlp": 0.01019528, "balance_loss_clip": 1.03235602, "balance_loss_mlp": 1.01690567, "epoch": 0.20267548474372463, "flos": 66783649921920.0, "grad_norm": 0.678414814309544, "language_loss": 0.58126765, "learning_rate": 3.6952885126624834e-06, "loss": 0.60204571, "num_input_tokens_seen": 72879240, "step": 3371, "time_per_iteration": 4.805691242218018 }, { "auxiliary_loss_clip": 0.01117624, "auxiliary_loss_mlp": 0.01050757, "balance_loss_clip": 1.04833245, "balance_loss_mlp": 1.0329231, "epoch": 0.2027356079963926, "flos": 24680793674880.0, "grad_norm": 2.167047343870177, "language_loss": 0.91830015, "learning_rate": 3.6950818457856617e-06, "loss": 0.9399839, "num_input_tokens_seen": 72899030, "step": 3372, "time_per_iteration": 4.306687831878662 }, { "auxiliary_loss_clip": 0.01137734, "auxiliary_loss_mlp": 0.01057192, "balance_loss_clip": 1.05065978, "balance_loss_mlp": 1.03598428, "epoch": 0.20279573124906058, "flos": 26392762765440.0, "grad_norm": 2.1240220719821195, "language_loss": 0.78505349, "learning_rate": 3.694875114631167e-06, "loss": 0.80700278, "num_input_tokens_seen": 72919190, "step": 3373, "time_per_iteration": 4.223219394683838 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01058555, "balance_loss_clip": 1.04464257, "balance_loss_mlp": 1.03719246, "epoch": 0.20285585450172855, "flos": 33799984692480.0, "grad_norm": 2.5403716567908745, "language_loss": 0.71275264, "learning_rate": 3.6946683192068377e-06, "loss": 0.7342521, "num_input_tokens_seen": 72939720, "step": 3374, "time_per_iteration": 2.853079319000244 }, { "auxiliary_loss_clip": 0.01042818, "auxiliary_loss_mlp": 0.01010518, "balance_loss_clip": 1.02580416, "balance_loss_mlp": 1.00797904, "epoch": 0.20291597775439651, "flos": 71164823598720.0, "grad_norm": 0.9711663240936556, "language_loss": 0.62466931, "learning_rate": 3.694461459520516e-06, "loss": 0.64520264, "num_input_tokens_seen": 73000015, "step": 3375, "time_per_iteration": 3.2016799449920654 }, { "auxiliary_loss_clip": 0.01153133, "auxiliary_loss_mlp": 0.01048539, "balance_loss_clip": 1.05278802, "balance_loss_mlp": 1.03021622, "epoch": 0.20297610100706448, "flos": 19494287118720.0, "grad_norm": 1.613636998778186, "language_loss": 0.82316196, "learning_rate": 3.6942545355800463e-06, "loss": 0.84517872, "num_input_tokens_seen": 73017675, "step": 3376, "time_per_iteration": 2.6073458194732666 }, { "auxiliary_loss_clip": 0.01142412, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.0506475, "balance_loss_mlp": 1.01912737, "epoch": 0.20303622425973245, "flos": 25044245441280.0, "grad_norm": 2.0454517065820026, "language_loss": 0.81243992, "learning_rate": 3.6940475473932743e-06, "loss": 0.83424926, "num_input_tokens_seen": 73036135, "step": 3377, "time_per_iteration": 2.6802914142608643 }, { "auxiliary_loss_clip": 0.01127133, "auxiliary_loss_mlp": 0.01049784, "balance_loss_clip": 1.05416846, "balance_loss_mlp": 1.03053212, "epoch": 0.2030963475124004, "flos": 21979988098560.0, "grad_norm": 1.9719049052811064, "language_loss": 0.76726258, "learning_rate": 3.69384049496805e-06, "loss": 0.78903174, "num_input_tokens_seen": 73054075, "step": 3378, "time_per_iteration": 2.7052531242370605 }, { "auxiliary_loss_clip": 0.01087342, "auxiliary_loss_mlp": 0.01049115, "balance_loss_clip": 1.04531622, "balance_loss_mlp": 1.02726364, "epoch": 0.2031564707650684, "flos": 19500392430720.0, "grad_norm": 2.0079998756584017, "language_loss": 0.7982831, "learning_rate": 3.6936333783122242e-06, "loss": 0.81964767, "num_input_tokens_seen": 73073530, "step": 3379, "time_per_iteration": 4.379331588745117 }, { "auxiliary_loss_clip": 0.01139431, "auxiliary_loss_mlp": 0.01039085, "balance_loss_clip": 1.05384874, "balance_loss_mlp": 1.02164412, "epoch": 0.20321659401773637, "flos": 22747075971840.0, "grad_norm": 1.5868581768713355, "language_loss": 0.86639273, "learning_rate": 3.6934261974336505e-06, "loss": 0.88817787, "num_input_tokens_seen": 73092820, "step": 3380, "time_per_iteration": 2.7405402660369873 }, { "auxiliary_loss_clip": 0.01156702, "auxiliary_loss_mlp": 0.01053775, "balance_loss_clip": 1.05730438, "balance_loss_mlp": 1.03507149, "epoch": 0.20327671727040433, "flos": 22455840499200.0, "grad_norm": 2.063467458189152, "language_loss": 0.74637043, "learning_rate": 3.693218952340186e-06, "loss": 0.76847517, "num_input_tokens_seen": 73113385, "step": 3381, "time_per_iteration": 2.6237549781799316 }, { "auxiliary_loss_clip": 0.01118794, "auxiliary_loss_mlp": 0.01042351, "balance_loss_clip": 1.04590273, "balance_loss_mlp": 1.02289653, "epoch": 0.2033368405230723, "flos": 19535010163200.0, "grad_norm": 1.6994666268173182, "language_loss": 0.79167414, "learning_rate": 3.6930116430396895e-06, "loss": 0.81328559, "num_input_tokens_seen": 73131195, "step": 3382, "time_per_iteration": 2.6707420349121094 }, { "auxiliary_loss_clip": 0.01113758, "auxiliary_loss_mlp": 0.00779415, "balance_loss_clip": 1.0459373, "balance_loss_mlp": 1.00091934, "epoch": 0.20339696377574026, "flos": 13809233744640.0, "grad_norm": 1.9483404178521286, "language_loss": 0.8042953, "learning_rate": 3.6928042695400214e-06, "loss": 0.82322699, "num_input_tokens_seen": 73148850, "step": 3383, "time_per_iteration": 2.7859487533569336 }, { "auxiliary_loss_clip": 0.01100731, "auxiliary_loss_mlp": 0.01046151, "balance_loss_clip": 1.04473877, "balance_loss_mlp": 1.02621913, "epoch": 0.20345708702840823, "flos": 20339409288960.0, "grad_norm": 3.0507793260875693, "language_loss": 0.74539214, "learning_rate": 3.6925968318490464e-06, "loss": 0.76686096, "num_input_tokens_seen": 73166775, "step": 3384, "time_per_iteration": 2.802645206451416 }, { "auxiliary_loss_clip": 0.0114772, "auxiliary_loss_mlp": 0.01042851, "balance_loss_clip": 1.05207324, "balance_loss_mlp": 1.02232289, "epoch": 0.2035172102810762, "flos": 20333950421760.0, "grad_norm": 7.661095363155204, "language_loss": 0.76801658, "learning_rate": 3.6923893299746293e-06, "loss": 0.7899223, "num_input_tokens_seen": 73183215, "step": 3385, "time_per_iteration": 2.823343515396118 }, { "auxiliary_loss_clip": 0.01107407, "auxiliary_loss_mlp": 0.01063941, "balance_loss_clip": 1.04730904, "balance_loss_mlp": 1.04331779, "epoch": 0.2035773335337442, "flos": 23330983461120.0, "grad_norm": 41.05937457193927, "language_loss": 0.68458641, "learning_rate": 3.692181763924639e-06, "loss": 0.70629984, "num_input_tokens_seen": 73203290, "step": 3386, "time_per_iteration": 2.830810546875 }, { "auxiliary_loss_clip": 0.01104248, "auxiliary_loss_mlp": 0.01064893, "balance_loss_clip": 1.04774165, "balance_loss_mlp": 1.04379284, "epoch": 0.20363745678641215, "flos": 28330287310080.0, "grad_norm": 3.4161658794101384, "language_loss": 0.80985248, "learning_rate": 3.691974133706947e-06, "loss": 0.83154386, "num_input_tokens_seen": 73226185, "step": 3387, "time_per_iteration": 2.8204662799835205 }, { "auxiliary_loss_clip": 0.0112504, "auxiliary_loss_mlp": 0.01049361, "balance_loss_clip": 1.05224109, "balance_loss_mlp": 1.03000104, "epoch": 0.20369758003908012, "flos": 18915658928640.0, "grad_norm": 2.703878094865874, "language_loss": 0.7988956, "learning_rate": 3.6917664393294262e-06, "loss": 0.82063961, "num_input_tokens_seen": 73243300, "step": 3388, "time_per_iteration": 2.687053918838501 }, { "auxiliary_loss_clip": 0.01157403, "auxiliary_loss_mlp": 0.01048089, "balance_loss_clip": 1.05471182, "balance_loss_mlp": 1.0281812, "epoch": 0.20375770329174808, "flos": 19206499351680.0, "grad_norm": 1.8133180655285324, "language_loss": 0.7184962, "learning_rate": 3.6915586807999527e-06, "loss": 0.74055111, "num_input_tokens_seen": 73261490, "step": 3389, "time_per_iteration": 2.614321708679199 }, { "auxiliary_loss_clip": 0.01141855, "auxiliary_loss_mlp": 0.01054311, "balance_loss_clip": 1.05387521, "balance_loss_mlp": 1.0351541, "epoch": 0.20381782654441605, "flos": 19391008538880.0, "grad_norm": 1.8982692343761227, "language_loss": 0.87280858, "learning_rate": 3.691350858126404e-06, "loss": 0.89477026, "num_input_tokens_seen": 73280180, "step": 3390, "time_per_iteration": 2.6770312786102295 }, { "auxiliary_loss_clip": 0.01125093, "auxiliary_loss_mlp": 0.01052498, "balance_loss_clip": 1.05142403, "balance_loss_mlp": 1.03129053, "epoch": 0.203877949797084, "flos": 24827704300800.0, "grad_norm": 2.3308941901233355, "language_loss": 0.71194077, "learning_rate": 3.691142971316662e-06, "loss": 0.73371667, "num_input_tokens_seen": 73300680, "step": 3391, "time_per_iteration": 2.7198221683502197 }, { "auxiliary_loss_clip": 0.01120121, "auxiliary_loss_mlp": 0.01051383, "balance_loss_clip": 1.05222178, "balance_loss_mlp": 1.0318923, "epoch": 0.20393807304975198, "flos": 18003707504640.0, "grad_norm": 2.4765720957839217, "language_loss": 0.86745828, "learning_rate": 3.6909350203786086e-06, "loss": 0.88917333, "num_input_tokens_seen": 73316760, "step": 3392, "time_per_iteration": 2.6961052417755127 }, { "auxiliary_loss_clip": 0.01145712, "auxiliary_loss_mlp": 0.01051212, "balance_loss_clip": 1.05204964, "balance_loss_mlp": 1.03236461, "epoch": 0.20399819630241997, "flos": 24206988349440.0, "grad_norm": 1.665333238668028, "language_loss": 0.80659354, "learning_rate": 3.69072700532013e-06, "loss": 0.82856286, "num_input_tokens_seen": 73339385, "step": 3393, "time_per_iteration": 2.6883490085601807 }, { "auxiliary_loss_clip": 0.01123025, "auxiliary_loss_mlp": 0.010424, "balance_loss_clip": 1.04751348, "balance_loss_mlp": 1.02385163, "epoch": 0.20405831955508794, "flos": 20777124424320.0, "grad_norm": 1.8745864895680615, "language_loss": 0.86126244, "learning_rate": 3.6905189261491137e-06, "loss": 0.88291663, "num_input_tokens_seen": 73357235, "step": 3394, "time_per_iteration": 2.758887767791748 }, { "auxiliary_loss_clip": 0.0114219, "auxiliary_loss_mlp": 0.01049288, "balance_loss_clip": 1.05699492, "balance_loss_mlp": 1.03088212, "epoch": 0.2041184428077559, "flos": 15486908325120.0, "grad_norm": 2.5133342949273416, "language_loss": 0.83761692, "learning_rate": 3.69031078287345e-06, "loss": 0.85953164, "num_input_tokens_seen": 73374435, "step": 3395, "time_per_iteration": 2.6468729972839355 }, { "auxiliary_loss_clip": 0.01145796, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.05311751, "balance_loss_mlp": 1.0200156, "epoch": 0.20417856606042387, "flos": 15588463052160.0, "grad_norm": 2.8477422591662376, "language_loss": 0.83736277, "learning_rate": 3.690102575501033e-06, "loss": 0.85921878, "num_input_tokens_seen": 73391025, "step": 3396, "time_per_iteration": 2.6296958923339844 }, { "auxiliary_loss_clip": 0.01112843, "auxiliary_loss_mlp": 0.01045334, "balance_loss_clip": 1.04787922, "balance_loss_mlp": 1.02616525, "epoch": 0.20423868931309183, "flos": 24279348297600.0, "grad_norm": 2.1192113228666303, "language_loss": 0.77199841, "learning_rate": 3.6898943040397556e-06, "loss": 0.79358017, "num_input_tokens_seen": 73409270, "step": 3397, "time_per_iteration": 2.776784896850586 }, { "auxiliary_loss_clip": 0.01128614, "auxiliary_loss_mlp": 0.01050131, "balance_loss_clip": 1.05143905, "balance_loss_mlp": 1.03264332, "epoch": 0.2042988125657598, "flos": 18614870438400.0, "grad_norm": 3.16091809956727, "language_loss": 0.8791461, "learning_rate": 3.689685968497518e-06, "loss": 0.9009335, "num_input_tokens_seen": 73425225, "step": 3398, "time_per_iteration": 2.6866374015808105 }, { "auxiliary_loss_clip": 0.01126796, "auxiliary_loss_mlp": 0.01052169, "balance_loss_clip": 1.05476117, "balance_loss_mlp": 1.03316689, "epoch": 0.2043589358184278, "flos": 17851230270720.0, "grad_norm": 2.139785862197821, "language_loss": 0.78045064, "learning_rate": 3.6894775688822186e-06, "loss": 0.80224031, "num_input_tokens_seen": 73440940, "step": 3399, "time_per_iteration": 2.6545825004577637 }, { "auxiliary_loss_clip": 0.01144155, "auxiliary_loss_mlp": 0.01042424, "balance_loss_clip": 1.05252838, "balance_loss_mlp": 1.02299261, "epoch": 0.20441905907109575, "flos": 21435223455360.0, "grad_norm": 3.6374157446104802, "language_loss": 0.76563728, "learning_rate": 3.6892691052017603e-06, "loss": 0.787503, "num_input_tokens_seen": 73458805, "step": 3400, "time_per_iteration": 2.7279481887817383 }, { "auxiliary_loss_clip": 0.01121071, "auxiliary_loss_mlp": 0.00776799, "balance_loss_clip": 1.05304742, "balance_loss_mlp": 1.00072634, "epoch": 0.20447918232376372, "flos": 27707703851520.0, "grad_norm": 1.8758513970592474, "language_loss": 0.79382575, "learning_rate": 3.6890605774640487e-06, "loss": 0.81280446, "num_input_tokens_seen": 73479380, "step": 3401, "time_per_iteration": 2.7918031215667725 }, { "auxiliary_loss_clip": 0.01131319, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.0484674, "balance_loss_mlp": 1.02540183, "epoch": 0.20453930557643168, "flos": 30524214113280.0, "grad_norm": 2.2159471948141034, "language_loss": 0.69798994, "learning_rate": 3.688851985676991e-06, "loss": 0.71975207, "num_input_tokens_seen": 73505105, "step": 3402, "time_per_iteration": 2.79670786857605 }, { "auxiliary_loss_clip": 0.01120554, "auxiliary_loss_mlp": 0.01043946, "balance_loss_clip": 1.05060196, "balance_loss_mlp": 1.02439535, "epoch": 0.20459942882909965, "flos": 18987767481600.0, "grad_norm": 1.7908768446457861, "language_loss": 0.81114817, "learning_rate": 3.688643329848496e-06, "loss": 0.83279312, "num_input_tokens_seen": 73523700, "step": 3403, "time_per_iteration": 2.70182728767395 }, { "auxiliary_loss_clip": 0.01144248, "auxiliary_loss_mlp": 0.01041199, "balance_loss_clip": 1.05348516, "balance_loss_mlp": 1.02295971, "epoch": 0.20465955208176762, "flos": 20339050152960.0, "grad_norm": 2.511955552730785, "language_loss": 0.83403814, "learning_rate": 3.6884346099864772e-06, "loss": 0.8558926, "num_input_tokens_seen": 73542625, "step": 3404, "time_per_iteration": 2.630807399749756 }, { "auxiliary_loss_clip": 0.01138937, "auxiliary_loss_mlp": 0.01048101, "balance_loss_clip": 1.04838705, "balance_loss_mlp": 1.0292058, "epoch": 0.20471967533443558, "flos": 21251288885760.0, "grad_norm": 1.7149716538767368, "language_loss": 0.86209136, "learning_rate": 3.6882258260988487e-06, "loss": 0.88396174, "num_input_tokens_seen": 73561450, "step": 3405, "time_per_iteration": 2.6076929569244385 }, { "auxiliary_loss_clip": 0.01116224, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.05039132, "balance_loss_mlp": 1.02621806, "epoch": 0.20477979858710357, "flos": 14501555458560.0, "grad_norm": 2.1633598971137435, "language_loss": 0.84356105, "learning_rate": 3.6880169781935276e-06, "loss": 0.86516619, "num_input_tokens_seen": 73577155, "step": 3406, "time_per_iteration": 2.768890142440796 }, { "auxiliary_loss_clip": 0.01152751, "auxiliary_loss_mlp": 0.01039548, "balance_loss_clip": 1.0542599, "balance_loss_mlp": 1.02191663, "epoch": 0.20483992183977154, "flos": 11400310085760.0, "grad_norm": 2.4892039461455675, "language_loss": 0.67453218, "learning_rate": 3.6878080662784336e-06, "loss": 0.69645512, "num_input_tokens_seen": 73594900, "step": 3407, "time_per_iteration": 2.5661377906799316 }, { "auxiliary_loss_clip": 0.0115175, "auxiliary_loss_mlp": 0.01050505, "balance_loss_clip": 1.05328465, "balance_loss_mlp": 1.03294516, "epoch": 0.2049000450924395, "flos": 19060271084160.0, "grad_norm": 2.4363182538361285, "language_loss": 0.84214294, "learning_rate": 3.6875990903614886e-06, "loss": 0.86416554, "num_input_tokens_seen": 73613810, "step": 3408, "time_per_iteration": 2.585186004638672 }, { "auxiliary_loss_clip": 0.01154901, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.0536257, "balance_loss_mlp": 1.02471161, "epoch": 0.20496016834510747, "flos": 14574561851520.0, "grad_norm": 2.317815935455145, "language_loss": 0.63898516, "learning_rate": 3.6873900504506166e-06, "loss": 0.6609571, "num_input_tokens_seen": 73631495, "step": 3409, "time_per_iteration": 2.5877959728240967 }, { "auxiliary_loss_clip": 0.0113795, "auxiliary_loss_mlp": 0.01042481, "balance_loss_clip": 1.04903567, "balance_loss_mlp": 1.02409852, "epoch": 0.20502029159777543, "flos": 22126647329280.0, "grad_norm": 1.3925959707869588, "language_loss": 0.80547982, "learning_rate": 3.687180946553745e-06, "loss": 0.8272841, "num_input_tokens_seen": 73652840, "step": 3410, "time_per_iteration": 4.1697752475738525 }, { "auxiliary_loss_clip": 0.01099823, "auxiliary_loss_mlp": 0.01046015, "balance_loss_clip": 1.05186486, "balance_loss_mlp": 1.02820492, "epoch": 0.2050804148504434, "flos": 25367907916800.0, "grad_norm": 2.407452066099965, "language_loss": 0.75804615, "learning_rate": 3.686971778678803e-06, "loss": 0.77950454, "num_input_tokens_seen": 73672150, "step": 3411, "time_per_iteration": 2.8072102069854736 }, { "auxiliary_loss_clip": 0.0113879, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.05501246, "balance_loss_mlp": 1.02887905, "epoch": 0.2051405381031114, "flos": 23620171858560.0, "grad_norm": 2.4936494073109445, "language_loss": 0.73356283, "learning_rate": 3.686762546833722e-06, "loss": 0.75541937, "num_input_tokens_seen": 73691940, "step": 3412, "time_per_iteration": 5.778446912765503 }, { "auxiliary_loss_clip": 0.01127692, "auxiliary_loss_mlp": 0.01057937, "balance_loss_clip": 1.04926813, "balance_loss_mlp": 1.03748107, "epoch": 0.20520066135577936, "flos": 19565533745280.0, "grad_norm": 2.3541654180764353, "language_loss": 0.77958596, "learning_rate": 3.6865532510264362e-06, "loss": 0.80144227, "num_input_tokens_seen": 73709080, "step": 3413, "time_per_iteration": 2.6457245349884033 }, { "auxiliary_loss_clip": 0.0110869, "auxiliary_loss_mlp": 0.01047866, "balance_loss_clip": 1.04991519, "balance_loss_mlp": 1.02862608, "epoch": 0.20526078460844732, "flos": 17676345928320.0, "grad_norm": 2.4834314093653673, "language_loss": 0.85112405, "learning_rate": 3.6863438912648823e-06, "loss": 0.8726896, "num_input_tokens_seen": 73727670, "step": 3414, "time_per_iteration": 2.7343668937683105 }, { "auxiliary_loss_clip": 0.01140219, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.05012155, "balance_loss_mlp": 1.02118468, "epoch": 0.2053209078611153, "flos": 21500328856320.0, "grad_norm": 2.0410772094937433, "language_loss": 0.80372798, "learning_rate": 3.6861344675569986e-06, "loss": 0.82552463, "num_input_tokens_seen": 73747170, "step": 3415, "time_per_iteration": 2.6669082641601562 }, { "auxiliary_loss_clip": 0.01087022, "auxiliary_loss_mlp": 0.01042771, "balance_loss_clip": 1.04786301, "balance_loss_mlp": 1.02643943, "epoch": 0.20538103111378325, "flos": 25663524848640.0, "grad_norm": 1.941742032659622, "language_loss": 0.72958827, "learning_rate": 3.6859249799107275e-06, "loss": 0.75088626, "num_input_tokens_seen": 73767690, "step": 3416, "time_per_iteration": 2.892782211303711 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01044328, "balance_loss_clip": 1.05453372, "balance_loss_mlp": 1.02577877, "epoch": 0.20544115436645122, "flos": 23148952312320.0, "grad_norm": 2.508583707985938, "language_loss": 0.78741407, "learning_rate": 3.6857154283340115e-06, "loss": 0.80930889, "num_input_tokens_seen": 73786900, "step": 3417, "time_per_iteration": 2.7298929691314697 }, { "auxiliary_loss_clip": 0.01145459, "auxiliary_loss_mlp": 0.0104683, "balance_loss_clip": 1.0536468, "balance_loss_mlp": 1.02819777, "epoch": 0.20550127761911918, "flos": 19390433921280.0, "grad_norm": 2.4305498920504043, "language_loss": 0.8729043, "learning_rate": 3.685505812834798e-06, "loss": 0.89482725, "num_input_tokens_seen": 73804515, "step": 3418, "time_per_iteration": 4.382033109664917 }, { "auxiliary_loss_clip": 0.01140182, "auxiliary_loss_mlp": 0.01046543, "balance_loss_clip": 1.05682349, "balance_loss_mlp": 1.02776778, "epoch": 0.20556140087178718, "flos": 22893124671360.0, "grad_norm": 14.690715253896212, "language_loss": 0.62538671, "learning_rate": 3.685296133421035e-06, "loss": 0.64725399, "num_input_tokens_seen": 73822910, "step": 3419, "time_per_iteration": 2.7318668365478516 }, { "auxiliary_loss_clip": 0.01139691, "auxiliary_loss_mlp": 0.01046928, "balance_loss_clip": 1.05550981, "balance_loss_mlp": 1.02651954, "epoch": 0.20562152412445514, "flos": 19789652655360.0, "grad_norm": 1.8153871521224594, "language_loss": 0.86339438, "learning_rate": 3.685086390100674e-06, "loss": 0.88526058, "num_input_tokens_seen": 73841160, "step": 3420, "time_per_iteration": 2.723606824874878 }, { "auxiliary_loss_clip": 0.01104401, "auxiliary_loss_mlp": 0.00780617, "balance_loss_clip": 1.04621911, "balance_loss_mlp": 1.00071514, "epoch": 0.2056816473771231, "flos": 31501989210240.0, "grad_norm": 2.3982854973621954, "language_loss": 0.7127136, "learning_rate": 3.684876582881668e-06, "loss": 0.73156381, "num_input_tokens_seen": 73862795, "step": 3421, "time_per_iteration": 2.8138315677642822 }, { "auxiliary_loss_clip": 0.01153254, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.05382609, "balance_loss_mlp": 1.02160168, "epoch": 0.20574177062979107, "flos": 23258372117760.0, "grad_norm": 6.231519820465981, "language_loss": 0.70559299, "learning_rate": 3.6846667117719732e-06, "loss": 0.72752541, "num_input_tokens_seen": 73881525, "step": 3422, "time_per_iteration": 2.6411848068237305 }, { "auxiliary_loss_clip": 0.01062123, "auxiliary_loss_mlp": 0.01005097, "balance_loss_clip": 1.03459418, "balance_loss_mlp": 1.00220013, "epoch": 0.20580189388245904, "flos": 70312518708480.0, "grad_norm": 0.740118932422812, "language_loss": 0.55461621, "learning_rate": 3.684456776779548e-06, "loss": 0.57528841, "num_input_tokens_seen": 73937775, "step": 3423, "time_per_iteration": 3.259685516357422 }, { "auxiliary_loss_clip": 0.01104389, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.04975653, "balance_loss_mlp": 1.02089024, "epoch": 0.205862017135127, "flos": 30737846252160.0, "grad_norm": 1.9242047681435088, "language_loss": 0.71910381, "learning_rate": 3.684246777912353e-06, "loss": 0.74054068, "num_input_tokens_seen": 73958250, "step": 3424, "time_per_iteration": 2.800283432006836 }, { "auxiliary_loss_clip": 0.01125916, "auxiliary_loss_mlp": 0.00777945, "balance_loss_clip": 1.05704927, "balance_loss_mlp": 1.00086677, "epoch": 0.20592214038779497, "flos": 21324546673920.0, "grad_norm": 1.6235965502825092, "language_loss": 0.74980927, "learning_rate": 3.684036715178351e-06, "loss": 0.76884782, "num_input_tokens_seen": 73977775, "step": 3425, "time_per_iteration": 2.751030206680298 }, { "auxiliary_loss_clip": 0.01104665, "auxiliary_loss_mlp": 0.01058685, "balance_loss_clip": 1.05047321, "balance_loss_mlp": 1.03983784, "epoch": 0.20598226364046296, "flos": 22891652213760.0, "grad_norm": 1.7765616723027935, "language_loss": 0.87936616, "learning_rate": 3.683826588585508e-06, "loss": 0.90099961, "num_input_tokens_seen": 73996590, "step": 3426, "time_per_iteration": 2.8539180755615234 }, { "auxiliary_loss_clip": 0.01144422, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.05773449, "balance_loss_mlp": 1.0281601, "epoch": 0.20604238689313092, "flos": 23878549365120.0, "grad_norm": 1.836530467647624, "language_loss": 0.76435733, "learning_rate": 3.6836163981417926e-06, "loss": 0.78625643, "num_input_tokens_seen": 74015935, "step": 3427, "time_per_iteration": 2.7024967670440674 }, { "auxiliary_loss_clip": 0.01159387, "auxiliary_loss_mlp": 0.01050023, "balance_loss_clip": 1.0577209, "balance_loss_mlp": 1.03185558, "epoch": 0.2061025101457989, "flos": 22491535639680.0, "grad_norm": 2.7350574840199964, "language_loss": 0.74176943, "learning_rate": 3.683406143855174e-06, "loss": 0.76386356, "num_input_tokens_seen": 74036575, "step": 3428, "time_per_iteration": 2.593151569366455 }, { "auxiliary_loss_clip": 0.01132797, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.05232322, "balance_loss_mlp": 1.0274843, "epoch": 0.20616263339846685, "flos": 22778928357120.0, "grad_norm": 3.829070534376961, "language_loss": 0.73316109, "learning_rate": 3.6831958257336256e-06, "loss": 0.75495446, "num_input_tokens_seen": 74055365, "step": 3429, "time_per_iteration": 2.7357261180877686 }, { "auxiliary_loss_clip": 0.01144108, "auxiliary_loss_mlp": 0.01049081, "balance_loss_clip": 1.05838966, "balance_loss_mlp": 1.03030515, "epoch": 0.20622275665113482, "flos": 20882198684160.0, "grad_norm": 2.201354934958512, "language_loss": 0.85586745, "learning_rate": 3.6829854437851237e-06, "loss": 0.87779927, "num_input_tokens_seen": 74074875, "step": 3430, "time_per_iteration": 2.658486843109131 }, { "auxiliary_loss_clip": 0.01088509, "auxiliary_loss_mlp": 0.01053254, "balance_loss_clip": 1.04814601, "balance_loss_mlp": 1.03387105, "epoch": 0.20628287990380278, "flos": 19354415558400.0, "grad_norm": 1.8292569880077065, "language_loss": 0.68859613, "learning_rate": 3.6827749980176444e-06, "loss": 0.71001375, "num_input_tokens_seen": 74094505, "step": 3431, "time_per_iteration": 2.811061143875122 }, { "auxiliary_loss_clip": 0.01027012, "auxiliary_loss_mlp": 0.01012446, "balance_loss_clip": 1.03099978, "balance_loss_mlp": 1.00976419, "epoch": 0.20634300315647078, "flos": 71517932248320.0, "grad_norm": 0.8066063325789609, "language_loss": 0.60172188, "learning_rate": 3.6825644884391693e-06, "loss": 0.62211645, "num_input_tokens_seen": 74158500, "step": 3432, "time_per_iteration": 3.415828227996826 }, { "auxiliary_loss_clip": 0.01146488, "auxiliary_loss_mlp": 0.01044703, "balance_loss_clip": 1.0583806, "balance_loss_mlp": 1.02669072, "epoch": 0.20640312640913874, "flos": 21723944976000.0, "grad_norm": 2.5535613418278116, "language_loss": 0.72622889, "learning_rate": 3.682353915057679e-06, "loss": 0.74814081, "num_input_tokens_seen": 74176685, "step": 3433, "time_per_iteration": 2.715195655822754 }, { "auxiliary_loss_clip": 0.0109694, "auxiliary_loss_mlp": 0.01050867, "balance_loss_clip": 1.04781306, "balance_loss_mlp": 1.03019655, "epoch": 0.2064632496618067, "flos": 20554621626240.0, "grad_norm": 2.096486283687917, "language_loss": 0.87233114, "learning_rate": 3.6821432778811604e-06, "loss": 0.8938092, "num_input_tokens_seen": 74194935, "step": 3434, "time_per_iteration": 2.7781460285186768 }, { "auxiliary_loss_clip": 0.01151381, "auxiliary_loss_mlp": 0.01045497, "balance_loss_clip": 1.05561388, "balance_loss_mlp": 1.02719867, "epoch": 0.20652337291447467, "flos": 29823273135360.0, "grad_norm": 1.7621185839090663, "language_loss": 0.69533503, "learning_rate": 3.6819325769176004e-06, "loss": 0.71730381, "num_input_tokens_seen": 74215400, "step": 3435, "time_per_iteration": 2.7425992488861084 }, { "auxiliary_loss_clip": 0.01127853, "auxiliary_loss_mlp": 0.01045604, "balance_loss_clip": 1.05583, "balance_loss_mlp": 1.02672172, "epoch": 0.20658349616714264, "flos": 26213640618240.0, "grad_norm": 30.077934868422773, "language_loss": 0.89116997, "learning_rate": 3.681721812174988e-06, "loss": 0.91290456, "num_input_tokens_seen": 74234090, "step": 3436, "time_per_iteration": 2.7460577487945557 }, { "auxiliary_loss_clip": 0.01118033, "auxiliary_loss_mlp": 0.01041557, "balance_loss_clip": 1.05178559, "balance_loss_mlp": 1.02168477, "epoch": 0.2066436194198106, "flos": 25994370044160.0, "grad_norm": 1.7370712778981523, "language_loss": 0.77330887, "learning_rate": 3.6815109836613163e-06, "loss": 0.79490477, "num_input_tokens_seen": 74253345, "step": 3437, "time_per_iteration": 2.7507588863372803 }, { "auxiliary_loss_clip": 0.01144607, "auxiliary_loss_mlp": 0.01040376, "balance_loss_clip": 1.05298507, "balance_loss_mlp": 1.02323389, "epoch": 0.20670374267247857, "flos": 21361067827200.0, "grad_norm": 1.8326742989814773, "language_loss": 0.77813125, "learning_rate": 3.6813000913845795e-06, "loss": 0.799981, "num_input_tokens_seen": 74271615, "step": 3438, "time_per_iteration": 2.7624385356903076 }, { "auxiliary_loss_clip": 0.01063811, "auxiliary_loss_mlp": 0.01002308, "balance_loss_clip": 1.03603387, "balance_loss_mlp": 0.9995541, "epoch": 0.20676386592514656, "flos": 66383281952640.0, "grad_norm": 0.8298524953876073, "language_loss": 0.67093015, "learning_rate": 3.6810891353527747e-06, "loss": 0.69159138, "num_input_tokens_seen": 74331390, "step": 3439, "time_per_iteration": 3.2026216983795166 }, { "auxiliary_loss_clip": 0.01148913, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.05590546, "balance_loss_mlp": 1.02299786, "epoch": 0.20682398917781453, "flos": 17274577328640.0, "grad_norm": 1.9537104709510729, "language_loss": 0.83907467, "learning_rate": 3.6808781155739014e-06, "loss": 0.86097592, "num_input_tokens_seen": 74347335, "step": 3440, "time_per_iteration": 2.6949758529663086 }, { "auxiliary_loss_clip": 0.01147739, "auxiliary_loss_mlp": 0.01041939, "balance_loss_clip": 1.05509627, "balance_loss_mlp": 1.02458239, "epoch": 0.2068841124304825, "flos": 18077288515200.0, "grad_norm": 1.8008884636634683, "language_loss": 0.84828413, "learning_rate": 3.6806670320559614e-06, "loss": 0.8701809, "num_input_tokens_seen": 74366310, "step": 3441, "time_per_iteration": 2.6440463066101074 }, { "auxiliary_loss_clip": 0.01110175, "auxiliary_loss_mlp": 0.01048552, "balance_loss_clip": 1.05599904, "balance_loss_mlp": 1.03050399, "epoch": 0.20694423568315046, "flos": 27347017432320.0, "grad_norm": 1.7415147413468661, "language_loss": 0.85854685, "learning_rate": 3.680455884806959e-06, "loss": 0.88013411, "num_input_tokens_seen": 74387100, "step": 3442, "time_per_iteration": 2.8222689628601074 }, { "auxiliary_loss_clip": 0.01078025, "auxiliary_loss_mlp": 0.01050799, "balance_loss_clip": 1.05186844, "balance_loss_mlp": 1.03095019, "epoch": 0.20700435893581842, "flos": 20229845829120.0, "grad_norm": 1.9775081815037283, "language_loss": 0.73038852, "learning_rate": 3.6802446738349014e-06, "loss": 0.75167674, "num_input_tokens_seen": 74404460, "step": 3443, "time_per_iteration": 2.8044140338897705 }, { "auxiliary_loss_clip": 0.01127625, "auxiliary_loss_mlp": 0.00776303, "balance_loss_clip": 1.05408895, "balance_loss_mlp": 1.00079513, "epoch": 0.2070644821884864, "flos": 20631111638400.0, "grad_norm": 1.84636320729986, "language_loss": 0.85586846, "learning_rate": 3.680033399147797e-06, "loss": 0.87490773, "num_input_tokens_seen": 74423790, "step": 3444, "time_per_iteration": 2.7582647800445557 }, { "auxiliary_loss_clip": 0.01036759, "auxiliary_loss_mlp": 0.01007145, "balance_loss_clip": 1.03905272, "balance_loss_mlp": 1.0042963, "epoch": 0.20712460544115438, "flos": 65941077617280.0, "grad_norm": 0.6999396122177431, "language_loss": 0.57092249, "learning_rate": 3.6798220607536585e-06, "loss": 0.59136152, "num_input_tokens_seen": 74488130, "step": 3445, "time_per_iteration": 3.249602794647217 }, { "auxiliary_loss_clip": 0.01152738, "auxiliary_loss_mlp": 0.00776634, "balance_loss_clip": 1.0538106, "balance_loss_mlp": 1.00088191, "epoch": 0.20718472869382235, "flos": 19425734012160.0, "grad_norm": 1.6453630130444594, "language_loss": 0.78469276, "learning_rate": 3.6796106586604987e-06, "loss": 0.80398649, "num_input_tokens_seen": 74506720, "step": 3446, "time_per_iteration": 2.6341898441314697 }, { "auxiliary_loss_clip": 0.01151445, "auxiliary_loss_mlp": 0.01043774, "balance_loss_clip": 1.05439711, "balance_loss_mlp": 1.02297151, "epoch": 0.2072448519464903, "flos": 24499049834880.0, "grad_norm": 2.013256457797304, "language_loss": 0.63031304, "learning_rate": 3.679399192876334e-06, "loss": 0.65226525, "num_input_tokens_seen": 74525330, "step": 3447, "time_per_iteration": 2.6912922859191895 }, { "auxiliary_loss_clip": 0.01103828, "auxiliary_loss_mlp": 0.01058453, "balance_loss_clip": 1.04668319, "balance_loss_mlp": 1.03828287, "epoch": 0.20730497519915828, "flos": 23075694524160.0, "grad_norm": 1.7423220349735584, "language_loss": 0.86291325, "learning_rate": 3.679187663409184e-06, "loss": 0.88453603, "num_input_tokens_seen": 74544535, "step": 3448, "time_per_iteration": 2.787576675415039 }, { "auxiliary_loss_clip": 0.01128629, "auxiliary_loss_mlp": 0.01045151, "balance_loss_clip": 1.049932, "balance_loss_mlp": 1.02556467, "epoch": 0.20736509845182624, "flos": 21069042255360.0, "grad_norm": 3.8253504349982044, "language_loss": 0.75264204, "learning_rate": 3.6789760702670696e-06, "loss": 0.77437979, "num_input_tokens_seen": 74562300, "step": 3449, "time_per_iteration": 4.354467391967773 }, { "auxiliary_loss_clip": 0.01141162, "auxiliary_loss_mlp": 0.01050212, "balance_loss_clip": 1.0534308, "balance_loss_mlp": 1.03073323, "epoch": 0.2074252217044942, "flos": 17633288499840.0, "grad_norm": 2.156163289660715, "language_loss": 0.76558924, "learning_rate": 3.6787644134580134e-06, "loss": 0.787503, "num_input_tokens_seen": 74580080, "step": 3450, "time_per_iteration": 2.7020533084869385 }, { "auxiliary_loss_clip": 0.01128554, "auxiliary_loss_mlp": 0.01044182, "balance_loss_clip": 1.05234683, "balance_loss_mlp": 1.02522802, "epoch": 0.20748534495716217, "flos": 23546985897600.0, "grad_norm": 1.6446708221415856, "language_loss": 0.82074821, "learning_rate": 3.6785526929900436e-06, "loss": 0.84247565, "num_input_tokens_seen": 74598980, "step": 3451, "time_per_iteration": 2.7753186225891113 }, { "auxiliary_loss_clip": 0.01064426, "auxiliary_loss_mlp": 0.01003577, "balance_loss_clip": 1.02722275, "balance_loss_mlp": 1.00099015, "epoch": 0.20754546820983016, "flos": 52252935598080.0, "grad_norm": 0.793594031040259, "language_loss": 0.56562752, "learning_rate": 3.6783409088711875e-06, "loss": 0.58630753, "num_input_tokens_seen": 74655275, "step": 3452, "time_per_iteration": 6.257205963134766 }, { "auxiliary_loss_clip": 0.01124123, "auxiliary_loss_mlp": 0.00776806, "balance_loss_clip": 1.05206704, "balance_loss_mlp": 1.0008918, "epoch": 0.20760559146249813, "flos": 20412379768320.0, "grad_norm": 2.245823129763223, "language_loss": 0.88341558, "learning_rate": 3.6781290611094755e-06, "loss": 0.90242493, "num_input_tokens_seen": 74674560, "step": 3453, "time_per_iteration": 2.7009050846099854 }, { "auxiliary_loss_clip": 0.01146287, "auxiliary_loss_mlp": 0.01044217, "balance_loss_clip": 1.05471313, "balance_loss_mlp": 1.02521539, "epoch": 0.2076657147151661, "flos": 23186012169600.0, "grad_norm": 2.2325669459725574, "language_loss": 0.79920429, "learning_rate": 3.6779171497129407e-06, "loss": 0.82110935, "num_input_tokens_seen": 74694500, "step": 3454, "time_per_iteration": 2.7080893516540527 }, { "auxiliary_loss_clip": 0.01104984, "auxiliary_loss_mlp": 0.00777717, "balance_loss_clip": 1.04356718, "balance_loss_mlp": 1.0007751, "epoch": 0.20772583796783406, "flos": 18293219124480.0, "grad_norm": 3.601668384502942, "language_loss": 0.76601356, "learning_rate": 3.6777051746896202e-06, "loss": 0.78484058, "num_input_tokens_seen": 74710485, "step": 3455, "time_per_iteration": 2.6733248233795166 }, { "auxiliary_loss_clip": 0.01115407, "auxiliary_loss_mlp": 0.01050321, "balance_loss_clip": 1.04759336, "balance_loss_mlp": 1.0326066, "epoch": 0.20778596122050202, "flos": 17602800831360.0, "grad_norm": 1.908671081537558, "language_loss": 0.80200219, "learning_rate": 3.6774931360475516e-06, "loss": 0.82365942, "num_input_tokens_seen": 74727450, "step": 3456, "time_per_iteration": 2.6950278282165527 }, { "auxiliary_loss_clip": 0.01112832, "auxiliary_loss_mlp": 0.00777675, "balance_loss_clip": 1.05166578, "balance_loss_mlp": 1.00099969, "epoch": 0.20784608447317, "flos": 23805578885760.0, "grad_norm": 2.135320694722552, "language_loss": 0.78070557, "learning_rate": 3.6772810337947745e-06, "loss": 0.79961067, "num_input_tokens_seen": 74746725, "step": 3457, "time_per_iteration": 4.381137132644653 }, { "auxiliary_loss_clip": 0.01082177, "auxiliary_loss_mlp": 0.01058291, "balance_loss_clip": 1.04310393, "balance_loss_mlp": 1.03651094, "epoch": 0.20790620772583795, "flos": 17639286071040.0, "grad_norm": 1.7652855773158553, "language_loss": 0.8360287, "learning_rate": 3.677068867939333e-06, "loss": 0.85743344, "num_input_tokens_seen": 74765255, "step": 3458, "time_per_iteration": 2.7332653999328613 }, { "auxiliary_loss_clip": 0.01140275, "auxiliary_loss_mlp": 0.0077698, "balance_loss_clip": 1.05156302, "balance_loss_mlp": 1.00095606, "epoch": 0.20796633097850595, "flos": 27673481168640.0, "grad_norm": 11.883071119862361, "language_loss": 0.75769317, "learning_rate": 3.676856638489272e-06, "loss": 0.77686572, "num_input_tokens_seen": 74785710, "step": 3459, "time_per_iteration": 2.705026626586914 }, { "auxiliary_loss_clip": 0.01089168, "auxiliary_loss_mlp": 0.01038825, "balance_loss_clip": 1.04769015, "balance_loss_mlp": 1.02081251, "epoch": 0.2080264542311739, "flos": 19245606284160.0, "grad_norm": 2.1071303009051428, "language_loss": 0.77105331, "learning_rate": 3.6766443454526382e-06, "loss": 0.79233319, "num_input_tokens_seen": 74804490, "step": 3460, "time_per_iteration": 2.749965190887451 }, { "auxiliary_loss_clip": 0.0109477, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.04938984, "balance_loss_mlp": 1.02838707, "epoch": 0.20808657748384188, "flos": 27525924097920.0, "grad_norm": 9.5480036120023, "language_loss": 0.75802225, "learning_rate": 3.6764319888374836e-06, "loss": 0.77944589, "num_input_tokens_seen": 74826340, "step": 3461, "time_per_iteration": 2.7929086685180664 }, { "auxiliary_loss_clip": 0.01124748, "auxiliary_loss_mlp": 0.01041543, "balance_loss_clip": 1.04610133, "balance_loss_mlp": 1.02203989, "epoch": 0.20814670073650984, "flos": 26906931999360.0, "grad_norm": 2.001927586001653, "language_loss": 0.8848443, "learning_rate": 3.6762195686518604e-06, "loss": 0.90650725, "num_input_tokens_seen": 74844960, "step": 3462, "time_per_iteration": 2.7031619548797607 }, { "auxiliary_loss_clip": 0.01023861, "auxiliary_loss_mlp": 0.00757905, "balance_loss_clip": 1.02540636, "balance_loss_mlp": 1.00168896, "epoch": 0.2082068239891778, "flos": 70175735717760.0, "grad_norm": 0.7622558664505636, "language_loss": 0.59010452, "learning_rate": 3.6760070849038226e-06, "loss": 0.6079222, "num_input_tokens_seen": 74909075, "step": 3463, "time_per_iteration": 3.4111485481262207 }, { "auxiliary_loss_clip": 0.01132553, "auxiliary_loss_mlp": 0.01047591, "balance_loss_clip": 1.04893148, "balance_loss_mlp": 1.02866018, "epoch": 0.20826694724184577, "flos": 24608074590720.0, "grad_norm": 2.6002828602708283, "language_loss": 0.66744608, "learning_rate": 3.675794537601429e-06, "loss": 0.68924749, "num_input_tokens_seen": 74928125, "step": 3464, "time_per_iteration": 2.718229293823242 }, { "auxiliary_loss_clip": 0.0112374, "auxiliary_loss_mlp": 0.0104712, "balance_loss_clip": 1.05101657, "balance_loss_mlp": 1.02755797, "epoch": 0.20832707049451377, "flos": 12892829034240.0, "grad_norm": 2.9384916482598205, "language_loss": 0.84044278, "learning_rate": 3.6755819267527373e-06, "loss": 0.86215138, "num_input_tokens_seen": 74945090, "step": 3465, "time_per_iteration": 2.732109546661377 }, { "auxiliary_loss_clip": 0.01096712, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.04373813, "balance_loss_mlp": 1.02221096, "epoch": 0.20838719374718173, "flos": 22198827709440.0, "grad_norm": 2.576139197384499, "language_loss": 0.81923312, "learning_rate": 3.6753692523658113e-06, "loss": 0.84060633, "num_input_tokens_seen": 74963630, "step": 3466, "time_per_iteration": 2.7758567333221436 }, { "auxiliary_loss_clip": 0.01140158, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.05322194, "balance_loss_mlp": 1.02787983, "epoch": 0.2084473169998497, "flos": 15158648908800.0, "grad_norm": 4.780862188541671, "language_loss": 0.82008922, "learning_rate": 3.675156514448716e-06, "loss": 0.84193271, "num_input_tokens_seen": 74981875, "step": 3467, "time_per_iteration": 2.5788159370422363 }, { "auxiliary_loss_clip": 0.01149826, "auxiliary_loss_mlp": 0.01040027, "balance_loss_clip": 1.05362797, "balance_loss_mlp": 1.02265835, "epoch": 0.20850744025251766, "flos": 17456788045440.0, "grad_norm": 2.009157691583003, "language_loss": 0.82178962, "learning_rate": 3.674943713009518e-06, "loss": 0.84368813, "num_input_tokens_seen": 74999155, "step": 3468, "time_per_iteration": 2.5874218940734863 }, { "auxiliary_loss_clip": 0.01143942, "auxiliary_loss_mlp": 0.01048537, "balance_loss_clip": 1.05300629, "balance_loss_mlp": 1.02774715, "epoch": 0.20856756350518563, "flos": 25698968593920.0, "grad_norm": 2.0793964386868584, "language_loss": 0.90328556, "learning_rate": 3.6747308480562856e-06, "loss": 0.92521036, "num_input_tokens_seen": 75017850, "step": 3469, "time_per_iteration": 2.6595447063446045 }, { "auxiliary_loss_clip": 0.01125181, "auxiliary_loss_mlp": 0.0104984, "balance_loss_clip": 1.05548537, "balance_loss_mlp": 1.03175592, "epoch": 0.2086276867578536, "flos": 37889060970240.0, "grad_norm": 1.9058635967771913, "language_loss": 0.76809812, "learning_rate": 3.674517919597092e-06, "loss": 0.78984833, "num_input_tokens_seen": 75039270, "step": 3470, "time_per_iteration": 2.908046245574951 }, { "auxiliary_loss_clip": 0.01133446, "auxiliary_loss_mlp": 0.01047618, "balance_loss_clip": 1.0551517, "balance_loss_mlp": 1.02942634, "epoch": 0.20868781001052156, "flos": 25557049958400.0, "grad_norm": 2.301093296435647, "language_loss": 0.75801277, "learning_rate": 3.674304927640011e-06, "loss": 0.77982342, "num_input_tokens_seen": 75059350, "step": 3471, "time_per_iteration": 2.713533401489258 }, { "auxiliary_loss_clip": 0.01123818, "auxiliary_loss_mlp": 0.01053513, "balance_loss_clip": 1.04961812, "balance_loss_mlp": 1.03384328, "epoch": 0.20874793326318955, "flos": 27529192235520.0, "grad_norm": 2.366290140730035, "language_loss": 0.75703716, "learning_rate": 3.67409187219312e-06, "loss": 0.77881044, "num_input_tokens_seen": 75080150, "step": 3472, "time_per_iteration": 2.785034656524658 }, { "auxiliary_loss_clip": 0.01140589, "auxiliary_loss_mlp": 0.01046494, "balance_loss_clip": 1.05084538, "balance_loss_mlp": 1.02854145, "epoch": 0.20880805651585752, "flos": 18548795370240.0, "grad_norm": 7.277377921302429, "language_loss": 0.84276807, "learning_rate": 3.6738787532644966e-06, "loss": 0.86463886, "num_input_tokens_seen": 75097920, "step": 3473, "time_per_iteration": 2.6236281394958496 }, { "auxiliary_loss_clip": 0.01057043, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.05363917, "balance_loss_mlp": 1.03434241, "epoch": 0.20886817976852548, "flos": 65946644225280.0, "grad_norm": 0.9045809123115837, "language_loss": 0.63652557, "learning_rate": 3.6736655708622235e-06, "loss": 0.65747303, "num_input_tokens_seen": 75152410, "step": 3474, "time_per_iteration": 3.1946537494659424 }, { "auxiliary_loss_clip": 0.0113535, "auxiliary_loss_mlp": 0.01045984, "balance_loss_clip": 1.05276895, "balance_loss_mlp": 1.02782845, "epoch": 0.20892830302119345, "flos": 36539178929280.0, "grad_norm": 3.2311626254468795, "language_loss": 0.69970965, "learning_rate": 3.6734523249943844e-06, "loss": 0.72152305, "num_input_tokens_seen": 75173265, "step": 3475, "time_per_iteration": 2.7967529296875 }, { "auxiliary_loss_clip": 0.01158022, "auxiliary_loss_mlp": 0.01046944, "balance_loss_clip": 1.05606794, "balance_loss_mlp": 1.02862167, "epoch": 0.2089884262738614, "flos": 20956749361920.0, "grad_norm": 1.9789108228051473, "language_loss": 0.70372891, "learning_rate": 3.673239015669065e-06, "loss": 0.72577858, "num_input_tokens_seen": 75193640, "step": 3476, "time_per_iteration": 2.629687786102295 }, { "auxiliary_loss_clip": 0.01131765, "auxiliary_loss_mlp": 0.01045236, "balance_loss_clip": 1.05439556, "balance_loss_mlp": 1.02722347, "epoch": 0.20904854952652938, "flos": 22784028088320.0, "grad_norm": 2.3868812434184603, "language_loss": 0.89227062, "learning_rate": 3.6730256428943544e-06, "loss": 0.91404068, "num_input_tokens_seen": 75212545, "step": 3477, "time_per_iteration": 2.7574357986450195 }, { "auxiliary_loss_clip": 0.01092922, "auxiliary_loss_mlp": 0.01046119, "balance_loss_clip": 1.045825, "balance_loss_mlp": 1.02737951, "epoch": 0.20910867277919734, "flos": 27303277645440.0, "grad_norm": 2.6092415644893814, "language_loss": 0.67816859, "learning_rate": 3.672812206678344e-06, "loss": 0.69955903, "num_input_tokens_seen": 75230865, "step": 3478, "time_per_iteration": 2.7929017543792725 }, { "auxiliary_loss_clip": 0.01094689, "auxiliary_loss_mlp": 0.01042766, "balance_loss_clip": 1.04024661, "balance_loss_mlp": 1.02308464, "epoch": 0.20916879603186533, "flos": 14319237000960.0, "grad_norm": 4.056245481336458, "language_loss": 0.84239435, "learning_rate": 3.672598707029127e-06, "loss": 0.86376888, "num_input_tokens_seen": 75248285, "step": 3479, "time_per_iteration": 2.743544816970825 }, { "auxiliary_loss_clip": 0.01111533, "auxiliary_loss_mlp": 0.01050991, "balance_loss_clip": 1.04863191, "balance_loss_mlp": 1.03028417, "epoch": 0.2092289192845333, "flos": 22273019251200.0, "grad_norm": 9.599906344578406, "language_loss": 0.74294043, "learning_rate": 3.6723851439548003e-06, "loss": 0.76456571, "num_input_tokens_seen": 75266310, "step": 3480, "time_per_iteration": 2.7278034687042236 }, { "auxiliary_loss_clip": 0.01107791, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.04748154, "balance_loss_mlp": 1.02226901, "epoch": 0.20928904253720126, "flos": 14830712714880.0, "grad_norm": 2.178942595840573, "language_loss": 0.75664043, "learning_rate": 3.67217151746346e-06, "loss": 0.77810597, "num_input_tokens_seen": 75284175, "step": 3481, "time_per_iteration": 2.71073842048645 }, { "auxiliary_loss_clip": 0.01090021, "auxiliary_loss_mlp": 0.01046234, "balance_loss_clip": 1.04561555, "balance_loss_mlp": 1.02727938, "epoch": 0.20934916578986923, "flos": 23259162216960.0, "grad_norm": 1.816378391984801, "language_loss": 0.8517971, "learning_rate": 3.671957827563209e-06, "loss": 0.87315965, "num_input_tokens_seen": 75303465, "step": 3482, "time_per_iteration": 2.8777174949645996 }, { "auxiliary_loss_clip": 0.01099298, "auxiliary_loss_mlp": 0.01046228, "balance_loss_clip": 1.05039477, "balance_loss_mlp": 1.02817941, "epoch": 0.2094092890425372, "flos": 32014398677760.0, "grad_norm": 1.802490425012806, "language_loss": 0.70550174, "learning_rate": 3.6717440742621494e-06, "loss": 0.72695696, "num_input_tokens_seen": 75325290, "step": 3483, "time_per_iteration": 2.8599836826324463 }, { "auxiliary_loss_clip": 0.01127333, "auxiliary_loss_mlp": 0.01048954, "balance_loss_clip": 1.05204535, "balance_loss_mlp": 1.03082263, "epoch": 0.20946941229520516, "flos": 20010647082240.0, "grad_norm": 1.9649551735344426, "language_loss": 0.74867833, "learning_rate": 3.6715302575683865e-06, "loss": 0.77044123, "num_input_tokens_seen": 75343895, "step": 3484, "time_per_iteration": 2.655538320541382 }, { "auxiliary_loss_clip": 0.01117623, "auxiliary_loss_mlp": 0.01046902, "balance_loss_clip": 1.0514648, "balance_loss_mlp": 1.0274353, "epoch": 0.20952953554787315, "flos": 30740072895360.0, "grad_norm": 1.6308141537991403, "language_loss": 0.70815694, "learning_rate": 3.6713163774900292e-06, "loss": 0.72980225, "num_input_tokens_seen": 75367100, "step": 3485, "time_per_iteration": 2.744417667388916 }, { "auxiliary_loss_clip": 0.01083098, "auxiliary_loss_mlp": 0.00777163, "balance_loss_clip": 1.0433619, "balance_loss_mlp": 1.00097859, "epoch": 0.20958965880054112, "flos": 27049209770880.0, "grad_norm": 2.030771632516388, "language_loss": 0.83274543, "learning_rate": 3.6711024340351875e-06, "loss": 0.85134804, "num_input_tokens_seen": 75389925, "step": 3486, "time_per_iteration": 2.742042303085327 }, { "auxiliary_loss_clip": 0.01140212, "auxiliary_loss_mlp": 0.01048337, "balance_loss_clip": 1.05242062, "balance_loss_mlp": 1.03115916, "epoch": 0.20964978205320908, "flos": 34204123589760.0, "grad_norm": 1.6926372989653347, "language_loss": 0.87134725, "learning_rate": 3.6708884272119737e-06, "loss": 0.89323276, "num_input_tokens_seen": 75408575, "step": 3487, "time_per_iteration": 2.708331346511841 }, { "auxiliary_loss_clip": 0.01112214, "auxiliary_loss_mlp": 0.01041678, "balance_loss_clip": 1.04791641, "balance_loss_mlp": 1.0228194, "epoch": 0.20970990530587705, "flos": 23477391296640.0, "grad_norm": 4.471143750410675, "language_loss": 0.72291327, "learning_rate": 3.670674357028504e-06, "loss": 0.74445224, "num_input_tokens_seen": 75427155, "step": 3488, "time_per_iteration": 4.250715970993042 }, { "auxiliary_loss_clip": 0.01121403, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.05096245, "balance_loss_mlp": 1.02014148, "epoch": 0.209770028558545, "flos": 18551452976640.0, "grad_norm": 2.6694226497987437, "language_loss": 0.79665899, "learning_rate": 3.6704602234928945e-06, "loss": 0.81824595, "num_input_tokens_seen": 75444450, "step": 3489, "time_per_iteration": 2.6926958560943604 }, { "auxiliary_loss_clip": 0.01152639, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.05325401, "balance_loss_mlp": 1.02875018, "epoch": 0.20983015181121298, "flos": 21617003208960.0, "grad_norm": 2.022409198347131, "language_loss": 0.72505707, "learning_rate": 3.670246026613266e-06, "loss": 0.74704129, "num_input_tokens_seen": 75462625, "step": 3490, "time_per_iteration": 4.133761644363403 }, { "auxiliary_loss_clip": 0.01122247, "auxiliary_loss_mlp": 0.01050283, "balance_loss_clip": 1.0509479, "balance_loss_mlp": 1.03402328, "epoch": 0.20989027506388094, "flos": 16614718531200.0, "grad_norm": 1.8035978449536252, "language_loss": 0.70332754, "learning_rate": 3.6700317663977415e-06, "loss": 0.72505283, "num_input_tokens_seen": 75480640, "step": 3491, "time_per_iteration": 2.667243003845215 }, { "auxiliary_loss_clip": 0.0113848, "auxiliary_loss_mlp": 0.0077627, "balance_loss_clip": 1.05017376, "balance_loss_mlp": 1.00098944, "epoch": 0.20995039831654894, "flos": 23216823060480.0, "grad_norm": 2.379943808529104, "language_loss": 0.79751909, "learning_rate": 3.669817442854444e-06, "loss": 0.81666666, "num_input_tokens_seen": 75494900, "step": 3492, "time_per_iteration": 4.270704984664917 }, { "auxiliary_loss_clip": 0.01138825, "auxiliary_loss_mlp": 0.00776339, "balance_loss_clip": 1.05182219, "balance_loss_mlp": 1.00108409, "epoch": 0.2100105215692169, "flos": 18147493647360.0, "grad_norm": 2.2783194747149906, "language_loss": 0.86987948, "learning_rate": 3.669603055991502e-06, "loss": 0.88903111, "num_input_tokens_seen": 75513370, "step": 3493, "time_per_iteration": 2.7830448150634766 }, { "auxiliary_loss_clip": 0.01110786, "auxiliary_loss_mlp": 0.01037681, "balance_loss_clip": 1.04520118, "balance_loss_mlp": 1.02105093, "epoch": 0.21007064482188487, "flos": 15961611490560.0, "grad_norm": 6.813030650079402, "language_loss": 0.68622243, "learning_rate": 3.6693886058170455e-06, "loss": 0.70770705, "num_input_tokens_seen": 75532480, "step": 3494, "time_per_iteration": 2.8479061126708984 }, { "auxiliary_loss_clip": 0.01145467, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.05302739, "balance_loss_mlp": 1.01998639, "epoch": 0.21013076807455283, "flos": 32234315696640.0, "grad_norm": 1.7516454579581615, "language_loss": 0.78848761, "learning_rate": 3.6691740923392053e-06, "loss": 0.81031501, "num_input_tokens_seen": 75552745, "step": 3495, "time_per_iteration": 2.9313197135925293 }, { "auxiliary_loss_clip": 0.01119614, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.04760814, "balance_loss_mlp": 1.02708316, "epoch": 0.2101908913272208, "flos": 23696625957120.0, "grad_norm": 2.1492916784611844, "language_loss": 0.77302933, "learning_rate": 3.668959515566116e-06, "loss": 0.79467654, "num_input_tokens_seen": 75574355, "step": 3496, "time_per_iteration": 4.467881441116333 }, { "auxiliary_loss_clip": 0.01135202, "auxiliary_loss_mlp": 0.01046618, "balance_loss_clip": 1.05169654, "balance_loss_mlp": 1.02839065, "epoch": 0.21025101457988876, "flos": 20375786787840.0, "grad_norm": 2.146148958862047, "language_loss": 0.82076812, "learning_rate": 3.668744875505915e-06, "loss": 0.8425864, "num_input_tokens_seen": 75592215, "step": 3497, "time_per_iteration": 2.683037281036377 }, { "auxiliary_loss_clip": 0.01144559, "auxiliary_loss_mlp": 0.01047188, "balance_loss_clip": 1.05445957, "balance_loss_mlp": 1.02967596, "epoch": 0.21031113783255675, "flos": 25775638174080.0, "grad_norm": 1.732381679276629, "language_loss": 0.67239833, "learning_rate": 3.668530172166741e-06, "loss": 0.69431579, "num_input_tokens_seen": 75610740, "step": 3498, "time_per_iteration": 2.685481548309326 }, { "auxiliary_loss_clip": 0.01121255, "auxiliary_loss_mlp": 0.01044553, "balance_loss_clip": 1.04974794, "balance_loss_mlp": 1.02611172, "epoch": 0.21037126108522472, "flos": 22018197191040.0, "grad_norm": 1.7892967196850054, "language_loss": 0.80832362, "learning_rate": 3.6683154055567352e-06, "loss": 0.82998168, "num_input_tokens_seen": 75631005, "step": 3499, "time_per_iteration": 2.744995355606079 }, { "auxiliary_loss_clip": 0.01139753, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.05226696, "balance_loss_mlp": 1.02312946, "epoch": 0.21043138433789269, "flos": 25334403505920.0, "grad_norm": 1.6464696881852638, "language_loss": 0.77983701, "learning_rate": 3.668100575684043e-06, "loss": 0.80163181, "num_input_tokens_seen": 75650655, "step": 3500, "time_per_iteration": 2.7704038619995117 }, { "auxiliary_loss_clip": 0.01129369, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.05095315, "balance_loss_mlp": 1.02390063, "epoch": 0.21049150759056065, "flos": 25556654908800.0, "grad_norm": 1.5981262394728393, "language_loss": 0.74450207, "learning_rate": 3.6678856825568094e-06, "loss": 0.76621759, "num_input_tokens_seen": 75669895, "step": 3501, "time_per_iteration": 2.7066893577575684 }, { "auxiliary_loss_clip": 0.01134924, "auxiliary_loss_mlp": 0.01039556, "balance_loss_clip": 1.04989994, "balance_loss_mlp": 1.02227044, "epoch": 0.21055163084322862, "flos": 24495602129280.0, "grad_norm": 1.6188770382514572, "language_loss": 0.75278366, "learning_rate": 3.667670726183183e-06, "loss": 0.77452844, "num_input_tokens_seen": 75689535, "step": 3502, "time_per_iteration": 2.724635124206543 }, { "auxiliary_loss_clip": 0.01098479, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 1.04576206, "balance_loss_mlp": 1.02248216, "epoch": 0.21061175409589658, "flos": 25739045193600.0, "grad_norm": 1.9441266701933382, "language_loss": 0.77188909, "learning_rate": 3.667455706571316e-06, "loss": 0.7932831, "num_input_tokens_seen": 75709265, "step": 3503, "time_per_iteration": 2.7545289993286133 }, { "auxiliary_loss_clip": 0.010957, "auxiliary_loss_mlp": 0.01045911, "balance_loss_clip": 1.04817343, "balance_loss_mlp": 1.02478695, "epoch": 0.21067187734856455, "flos": 18989168112000.0, "grad_norm": 2.256374081289255, "language_loss": 0.78297234, "learning_rate": 3.6672406237293617e-06, "loss": 0.8043884, "num_input_tokens_seen": 75727050, "step": 3504, "time_per_iteration": 2.7454304695129395 }, { "auxiliary_loss_clip": 0.01117408, "auxiliary_loss_mlp": 0.01049815, "balance_loss_clip": 1.0488404, "balance_loss_mlp": 1.03152788, "epoch": 0.21073200060123254, "flos": 24681368292480.0, "grad_norm": 1.5753219052286964, "language_loss": 0.76731002, "learning_rate": 3.6670254776654754e-06, "loss": 0.78898227, "num_input_tokens_seen": 75747175, "step": 3505, "time_per_iteration": 2.7509703636169434 }, { "auxiliary_loss_clip": 0.01120291, "auxiliary_loss_mlp": 0.01052026, "balance_loss_clip": 1.04882348, "balance_loss_mlp": 1.03383446, "epoch": 0.2107921238539005, "flos": 28549342402560.0, "grad_norm": 1.9938386598136906, "language_loss": 0.63933277, "learning_rate": 3.6668102683878163e-06, "loss": 0.66105598, "num_input_tokens_seen": 75767690, "step": 3506, "time_per_iteration": 2.773611545562744 }, { "auxiliary_loss_clip": 0.01138444, "auxiliary_loss_mlp": 0.01050655, "balance_loss_clip": 1.05078697, "balance_loss_mlp": 1.03257108, "epoch": 0.21085224710656847, "flos": 25885848078720.0, "grad_norm": 2.170999698474249, "language_loss": 0.82010436, "learning_rate": 3.6665949959045443e-06, "loss": 0.84199536, "num_input_tokens_seen": 75787255, "step": 3507, "time_per_iteration": 2.6604206562042236 }, { "auxiliary_loss_clip": 0.01136754, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.04972744, "balance_loss_mlp": 1.02472949, "epoch": 0.21091237035923643, "flos": 14976294537600.0, "grad_norm": 2.0519706535557414, "language_loss": 0.75213134, "learning_rate": 3.666379660223824e-06, "loss": 0.77393204, "num_input_tokens_seen": 75805890, "step": 3508, "time_per_iteration": 2.7164604663848877 }, { "auxiliary_loss_clip": 0.01154655, "auxiliary_loss_mlp": 0.01036811, "balance_loss_clip": 1.05263913, "balance_loss_mlp": 1.01894128, "epoch": 0.2109724936119044, "flos": 16362518163840.0, "grad_norm": 3.4182125548434112, "language_loss": 0.84984946, "learning_rate": 3.6661642613538192e-06, "loss": 0.87176406, "num_input_tokens_seen": 75821620, "step": 3509, "time_per_iteration": 2.661743402481079 }, { "auxiliary_loss_clip": 0.01120944, "auxiliary_loss_mlp": 0.01044014, "balance_loss_clip": 1.05299115, "balance_loss_mlp": 1.02443957, "epoch": 0.21103261686457236, "flos": 31502492000640.0, "grad_norm": 2.210880078691599, "language_loss": 0.68125075, "learning_rate": 3.6659487993026987e-06, "loss": 0.70290035, "num_input_tokens_seen": 75842490, "step": 3510, "time_per_iteration": 2.7881460189819336 }, { "auxiliary_loss_clip": 0.01152569, "auxiliary_loss_mlp": 0.01046993, "balance_loss_clip": 1.05026078, "balance_loss_mlp": 1.02892137, "epoch": 0.21109274011724033, "flos": 27344072517120.0, "grad_norm": 1.958863999940011, "language_loss": 0.72639364, "learning_rate": 3.6657332740786327e-06, "loss": 0.74838924, "num_input_tokens_seen": 75865985, "step": 3511, "time_per_iteration": 2.6942689418792725 }, { "auxiliary_loss_clip": 0.01066393, "auxiliary_loss_mlp": 0.01041278, "balance_loss_clip": 1.04279399, "balance_loss_mlp": 1.0208931, "epoch": 0.21115286336990832, "flos": 17820383466240.0, "grad_norm": 3.2801391377369686, "language_loss": 0.69354337, "learning_rate": 3.665517685689794e-06, "loss": 0.71462011, "num_input_tokens_seen": 75882745, "step": 3512, "time_per_iteration": 2.8260998725891113 }, { "auxiliary_loss_clip": 0.01140043, "auxiliary_loss_mlp": 0.01050555, "balance_loss_clip": 1.04943943, "balance_loss_mlp": 1.03082585, "epoch": 0.2112129866225763, "flos": 27197987904000.0, "grad_norm": 2.072678482519775, "language_loss": 0.73145646, "learning_rate": 3.6653020341443584e-06, "loss": 0.75336242, "num_input_tokens_seen": 75904305, "step": 3513, "time_per_iteration": 2.9639391899108887 }, { "auxiliary_loss_clip": 0.01121964, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.04785061, "balance_loss_mlp": 1.02089679, "epoch": 0.21127310987524425, "flos": 23731279603200.0, "grad_norm": 2.0322171916220086, "language_loss": 0.74422491, "learning_rate": 3.665086319450502e-06, "loss": 0.76582778, "num_input_tokens_seen": 75923710, "step": 3514, "time_per_iteration": 2.7379143238067627 }, { "auxiliary_loss_clip": 0.01136944, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.05334568, "balance_loss_mlp": 1.01941383, "epoch": 0.21133323312791222, "flos": 18332505624960.0, "grad_norm": 2.431934297389972, "language_loss": 0.76738697, "learning_rate": 3.6648705416164062e-06, "loss": 0.78913867, "num_input_tokens_seen": 75942625, "step": 3515, "time_per_iteration": 2.6339287757873535 }, { "auxiliary_loss_clip": 0.011289, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.05247736, "balance_loss_mlp": 1.0288614, "epoch": 0.21139335638058018, "flos": 17931203902080.0, "grad_norm": 2.7460645413082756, "language_loss": 0.68756706, "learning_rate": 3.6646547006502518e-06, "loss": 0.70933092, "num_input_tokens_seen": 75959930, "step": 3516, "time_per_iteration": 2.6489672660827637 }, { "auxiliary_loss_clip": 0.01118182, "auxiliary_loss_mlp": 0.01049447, "balance_loss_clip": 1.05634522, "balance_loss_mlp": 1.03045666, "epoch": 0.21145347963324815, "flos": 24572092141440.0, "grad_norm": 1.8368744753927078, "language_loss": 0.85010064, "learning_rate": 3.664438796560225e-06, "loss": 0.87177694, "num_input_tokens_seen": 75980335, "step": 3517, "time_per_iteration": 2.745887279510498 }, { "auxiliary_loss_clip": 0.01125904, "auxiliary_loss_mlp": 0.01042813, "balance_loss_clip": 1.04719234, "balance_loss_mlp": 1.02506244, "epoch": 0.21151360288591614, "flos": 35845959375360.0, "grad_norm": 2.246330970109572, "language_loss": 0.63672101, "learning_rate": 3.664222829354512e-06, "loss": 0.65840822, "num_input_tokens_seen": 76002095, "step": 3518, "time_per_iteration": 2.7990219593048096 }, { "auxiliary_loss_clip": 0.01089367, "auxiliary_loss_mlp": 0.01057733, "balance_loss_clip": 1.05040181, "balance_loss_mlp": 1.04001832, "epoch": 0.2115737261385841, "flos": 24641579001600.0, "grad_norm": 2.1349107177710875, "language_loss": 0.89256221, "learning_rate": 3.664006799041303e-06, "loss": 0.91403317, "num_input_tokens_seen": 76020425, "step": 3519, "time_per_iteration": 2.8022944927215576 }, { "auxiliary_loss_clip": 0.01135146, "auxiliary_loss_mlp": 0.01049587, "balance_loss_clip": 1.05320001, "balance_loss_mlp": 1.03140712, "epoch": 0.21163384939125207, "flos": 25226887121280.0, "grad_norm": 1.8050755180524396, "language_loss": 0.81235015, "learning_rate": 3.6637907056287886e-06, "loss": 0.8341974, "num_input_tokens_seen": 76041210, "step": 3520, "time_per_iteration": 2.750988245010376 }, { "auxiliary_loss_clip": 0.01124406, "auxiliary_loss_mlp": 0.01048631, "balance_loss_clip": 1.05111551, "balance_loss_mlp": 1.03095269, "epoch": 0.21169397264392004, "flos": 26067520091520.0, "grad_norm": 1.92815865975435, "language_loss": 0.76254267, "learning_rate": 3.6635745491251642e-06, "loss": 0.78427303, "num_input_tokens_seen": 76062685, "step": 3521, "time_per_iteration": 2.7965810298919678 }, { "auxiliary_loss_clip": 0.0109789, "auxiliary_loss_mlp": 0.01044794, "balance_loss_clip": 1.04872918, "balance_loss_mlp": 1.02841413, "epoch": 0.211754095896588, "flos": 23108265181440.0, "grad_norm": 2.0270933567011302, "language_loss": 0.75752926, "learning_rate": 3.663358329538626e-06, "loss": 0.77895606, "num_input_tokens_seen": 76082300, "step": 3522, "time_per_iteration": 2.8280131816864014 }, { "auxiliary_loss_clip": 0.01153324, "auxiliary_loss_mlp": 0.01053431, "balance_loss_clip": 1.05353725, "balance_loss_mlp": 1.03541851, "epoch": 0.21181421914925597, "flos": 27922341571200.0, "grad_norm": 1.8399634756194385, "language_loss": 0.70481133, "learning_rate": 3.663142046877374e-06, "loss": 0.72687888, "num_input_tokens_seen": 76101135, "step": 3523, "time_per_iteration": 2.6909022331237793 }, { "auxiliary_loss_clip": 0.01139749, "auxiliary_loss_mlp": 0.01054127, "balance_loss_clip": 1.05166054, "balance_loss_mlp": 1.03619766, "epoch": 0.21187434240192393, "flos": 17128636369920.0, "grad_norm": 2.455264594190525, "language_loss": 0.77290082, "learning_rate": 3.6629257011496085e-06, "loss": 0.7948395, "num_input_tokens_seen": 76119320, "step": 3524, "time_per_iteration": 2.6844334602355957 }, { "auxiliary_loss_clip": 0.01132697, "auxiliary_loss_mlp": 0.0104457, "balance_loss_clip": 1.05066419, "balance_loss_mlp": 1.02621162, "epoch": 0.21193446565459192, "flos": 22347318533760.0, "grad_norm": 1.841652047976503, "language_loss": 0.81680572, "learning_rate": 3.6627092923635338e-06, "loss": 0.83857846, "num_input_tokens_seen": 76137445, "step": 3525, "time_per_iteration": 2.71073842048645 }, { "auxiliary_loss_clip": 0.01088536, "auxiliary_loss_mlp": 0.01041509, "balance_loss_clip": 1.04158318, "balance_loss_mlp": 1.02353263, "epoch": 0.2119945889072599, "flos": 27199316707200.0, "grad_norm": 1.867957043941215, "language_loss": 0.75627208, "learning_rate": 3.662492820527356e-06, "loss": 0.77757257, "num_input_tokens_seen": 76159500, "step": 3526, "time_per_iteration": 2.973966598510742 }, { "auxiliary_loss_clip": 0.0115455, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.05324817, "balance_loss_mlp": 1.023229, "epoch": 0.21205471215992786, "flos": 20991869884800.0, "grad_norm": 1.8230643924086412, "language_loss": 0.77070421, "learning_rate": 3.662276285649284e-06, "loss": 0.79265994, "num_input_tokens_seen": 76177990, "step": 3527, "time_per_iteration": 2.648961067199707 }, { "auxiliary_loss_clip": 0.01151081, "auxiliary_loss_mlp": 0.0104874, "balance_loss_clip": 1.05143785, "balance_loss_mlp": 1.02977419, "epoch": 0.21211483541259582, "flos": 20777663128320.0, "grad_norm": 2.807984733302778, "language_loss": 0.7815178, "learning_rate": 3.662059687737528e-06, "loss": 0.80351603, "num_input_tokens_seen": 76197125, "step": 3528, "time_per_iteration": 4.401185989379883 }, { "auxiliary_loss_clip": 0.01135768, "auxiliary_loss_mlp": 0.01045736, "balance_loss_clip": 1.04889631, "balance_loss_mlp": 1.02817655, "epoch": 0.21217495866526379, "flos": 18989994124800.0, "grad_norm": 2.1271435469609257, "language_loss": 0.8128866, "learning_rate": 3.6618430268003024e-06, "loss": 0.8347016, "num_input_tokens_seen": 76216215, "step": 3529, "time_per_iteration": 4.309772968292236 }, { "auxiliary_loss_clip": 0.0113319, "auxiliary_loss_mlp": 0.00777373, "balance_loss_clip": 1.04967499, "balance_loss_mlp": 1.00112891, "epoch": 0.21223508191793175, "flos": 20667309569280.0, "grad_norm": 1.9704727824538568, "language_loss": 0.76427567, "learning_rate": 3.6616263028458235e-06, "loss": 0.78338128, "num_input_tokens_seen": 76237010, "step": 3530, "time_per_iteration": 2.7592365741729736 }, { "auxiliary_loss_clip": 0.0115078, "auxiliary_loss_mlp": 0.01047067, "balance_loss_clip": 1.0522244, "balance_loss_mlp": 1.02990103, "epoch": 0.21229520517059972, "flos": 21616464504960.0, "grad_norm": 2.1154933827202274, "language_loss": 0.82973897, "learning_rate": 3.661409515882308e-06, "loss": 0.85171747, "num_input_tokens_seen": 76255965, "step": 3531, "time_per_iteration": 4.168981313705444 }, { "auxiliary_loss_clip": 0.01120152, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.04767489, "balance_loss_mlp": 1.02313459, "epoch": 0.2123553284232677, "flos": 13991049411840.0, "grad_norm": 2.335526210972433, "language_loss": 0.73087364, "learning_rate": 3.661192665917977e-06, "loss": 0.75250214, "num_input_tokens_seen": 76272150, "step": 3532, "time_per_iteration": 2.6797189712524414 }, { "auxiliary_loss_clip": 0.01126693, "auxiliary_loss_mlp": 0.01041409, "balance_loss_clip": 1.0539782, "balance_loss_mlp": 1.02269292, "epoch": 0.21241545167593567, "flos": 18296774570880.0, "grad_norm": 6.22254473074881, "language_loss": 0.74268675, "learning_rate": 3.660975752961054e-06, "loss": 0.76436776, "num_input_tokens_seen": 76291425, "step": 3533, "time_per_iteration": 2.741152048110962 }, { "auxiliary_loss_clip": 0.01146682, "auxiliary_loss_mlp": 0.0104438, "balance_loss_clip": 1.05342829, "balance_loss_mlp": 1.0265224, "epoch": 0.21247557492860364, "flos": 34713121265280.0, "grad_norm": 2.0406923816018714, "language_loss": 0.70889592, "learning_rate": 3.6607587770197634e-06, "loss": 0.73080653, "num_input_tokens_seen": 76313975, "step": 3534, "time_per_iteration": 2.8210513591766357 }, { "auxiliary_loss_clip": 0.01133157, "auxiliary_loss_mlp": 0.01043651, "balance_loss_clip": 1.05234385, "balance_loss_mlp": 1.02463722, "epoch": 0.2125356981812716, "flos": 22053820504320.0, "grad_norm": 2.102271516852891, "language_loss": 0.71675557, "learning_rate": 3.6605417381023346e-06, "loss": 0.73852366, "num_input_tokens_seen": 76330955, "step": 3535, "time_per_iteration": 2.804506540298462 }, { "auxiliary_loss_clip": 0.01137461, "auxiliary_loss_mlp": 0.01053804, "balance_loss_clip": 1.05108476, "balance_loss_mlp": 1.03607774, "epoch": 0.21259582143393957, "flos": 28548336821760.0, "grad_norm": 24.01704513629389, "language_loss": 0.70639503, "learning_rate": 3.660324636216996e-06, "loss": 0.72830772, "num_input_tokens_seen": 76352680, "step": 3536, "time_per_iteration": 4.442729473114014 }, { "auxiliary_loss_clip": 0.011554, "auxiliary_loss_mlp": 0.01049939, "balance_loss_clip": 1.05231214, "balance_loss_mlp": 1.03082991, "epoch": 0.21265594468660753, "flos": 20120892900480.0, "grad_norm": 2.2527167001205806, "language_loss": 0.8784188, "learning_rate": 3.660107471371981e-06, "loss": 0.90047216, "num_input_tokens_seen": 76370750, "step": 3537, "time_per_iteration": 2.6365723609924316 }, { "auxiliary_loss_clip": 0.01137536, "auxiliary_loss_mlp": 0.00776226, "balance_loss_clip": 1.04911351, "balance_loss_mlp": 1.00101614, "epoch": 0.21271606793927553, "flos": 23076161400960.0, "grad_norm": 1.8080285651248438, "language_loss": 0.80480909, "learning_rate": 3.659890243575524e-06, "loss": 0.82394671, "num_input_tokens_seen": 76390610, "step": 3538, "time_per_iteration": 2.7403554916381836 }, { "auxiliary_loss_clip": 0.01080631, "auxiliary_loss_mlp": 0.0105169, "balance_loss_clip": 1.04171312, "balance_loss_mlp": 1.03219926, "epoch": 0.2127761911919435, "flos": 26388201738240.0, "grad_norm": 2.705287390300715, "language_loss": 0.86691839, "learning_rate": 3.659672952835863e-06, "loss": 0.88824159, "num_input_tokens_seen": 76408860, "step": 3539, "time_per_iteration": 2.8177876472473145 }, { "auxiliary_loss_clip": 0.01120184, "auxiliary_loss_mlp": 0.01047424, "balance_loss_clip": 1.04577422, "balance_loss_mlp": 1.0295074, "epoch": 0.21283631444461146, "flos": 20228265630720.0, "grad_norm": 5.212413836862573, "language_loss": 0.57756186, "learning_rate": 3.659455599161237e-06, "loss": 0.59923792, "num_input_tokens_seen": 76424980, "step": 3540, "time_per_iteration": 2.786552667617798 }, { "auxiliary_loss_clip": 0.01154193, "auxiliary_loss_mlp": 0.010403, "balance_loss_clip": 1.05276537, "balance_loss_mlp": 1.02131045, "epoch": 0.21289643769727942, "flos": 13516992691200.0, "grad_norm": 2.318388810062464, "language_loss": 0.76114893, "learning_rate": 3.659238182559888e-06, "loss": 0.78309381, "num_input_tokens_seen": 76443135, "step": 3541, "time_per_iteration": 2.646207332611084 }, { "auxiliary_loss_clip": 0.01108241, "auxiliary_loss_mlp": 0.01044876, "balance_loss_clip": 1.0464325, "balance_loss_mlp": 1.02676797, "epoch": 0.2129565609499474, "flos": 24827021942400.0, "grad_norm": 3.508596736579257, "language_loss": 0.69749588, "learning_rate": 3.6590207030400615e-06, "loss": 0.71902704, "num_input_tokens_seen": 76462470, "step": 3542, "time_per_iteration": 2.746612787246704 }, { "auxiliary_loss_clip": 0.01149445, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.05146265, "balance_loss_mlp": 1.02160525, "epoch": 0.21301668420261535, "flos": 23659242877440.0, "grad_norm": 2.3488794859192397, "language_loss": 0.75651306, "learning_rate": 3.658803160610004e-06, "loss": 0.77839369, "num_input_tokens_seen": 76481995, "step": 3543, "time_per_iteration": 2.665900230407715 }, { "auxiliary_loss_clip": 0.0112855, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.05257249, "balance_loss_mlp": 1.02409506, "epoch": 0.21307680745528332, "flos": 16362805472640.0, "grad_norm": 1.8076409354305347, "language_loss": 0.66981912, "learning_rate": 3.6585855552779634e-06, "loss": 0.6915251, "num_input_tokens_seen": 76500245, "step": 3544, "time_per_iteration": 2.6692638397216797 }, { "auxiliary_loss_clip": 0.01121216, "auxiliary_loss_mlp": 0.01046395, "balance_loss_clip": 1.0480237, "balance_loss_mlp": 1.02897835, "epoch": 0.2131369307079513, "flos": 19099054794240.0, "grad_norm": 1.8644107460894377, "language_loss": 0.70977402, "learning_rate": 3.6583678870521934e-06, "loss": 0.73145014, "num_input_tokens_seen": 76519535, "step": 3545, "time_per_iteration": 2.686939001083374 }, { "auxiliary_loss_clip": 0.01128605, "auxiliary_loss_mlp": 0.01048325, "balance_loss_clip": 1.05368018, "balance_loss_mlp": 1.0300498, "epoch": 0.21319705396061928, "flos": 30372275583360.0, "grad_norm": 1.8809403827144264, "language_loss": 0.72329843, "learning_rate": 3.658150155940946e-06, "loss": 0.74506772, "num_input_tokens_seen": 76542065, "step": 3546, "time_per_iteration": 2.8044040203094482 }, { "auxiliary_loss_clip": 0.01115103, "auxiliary_loss_mlp": 0.01050245, "balance_loss_clip": 1.0539, "balance_loss_mlp": 1.03250647, "epoch": 0.21325717721328724, "flos": 21756192410880.0, "grad_norm": 3.48585993087404, "language_loss": 0.80431038, "learning_rate": 3.657932361952479e-06, "loss": 0.82596385, "num_input_tokens_seen": 76560540, "step": 3547, "time_per_iteration": 2.7981739044189453 }, { "auxiliary_loss_clip": 0.01154388, "auxiliary_loss_mlp": 0.01045355, "balance_loss_clip": 1.05115056, "balance_loss_mlp": 1.02685428, "epoch": 0.2133173004659552, "flos": 28730870760960.0, "grad_norm": 2.460294966859189, "language_loss": 0.7449761, "learning_rate": 3.6577145050950504e-06, "loss": 0.7669735, "num_input_tokens_seen": 76581760, "step": 3548, "time_per_iteration": 2.709476947784424 }, { "auxiliary_loss_clip": 0.01117193, "auxiliary_loss_mlp": 0.01059153, "balance_loss_clip": 1.05099797, "balance_loss_mlp": 1.03938842, "epoch": 0.21337742371862317, "flos": 16837077674880.0, "grad_norm": 2.783715227630402, "language_loss": 0.74218595, "learning_rate": 3.657496585376922e-06, "loss": 0.76394939, "num_input_tokens_seen": 76599940, "step": 3549, "time_per_iteration": 2.751401662826538 }, { "auxiliary_loss_clip": 0.01121431, "auxiliary_loss_mlp": 0.01050546, "balance_loss_clip": 1.05331278, "balance_loss_mlp": 1.03283179, "epoch": 0.21343754697129114, "flos": 24424930120320.0, "grad_norm": 1.8583266555890872, "language_loss": 0.80719978, "learning_rate": 3.657278602806357e-06, "loss": 0.82891953, "num_input_tokens_seen": 76619580, "step": 3550, "time_per_iteration": 2.74678373336792 }, { "auxiliary_loss_clip": 0.01151996, "auxiliary_loss_mlp": 0.01048347, "balance_loss_clip": 1.05428052, "balance_loss_mlp": 1.03147876, "epoch": 0.21349767022395913, "flos": 19277817805440.0, "grad_norm": 1.7548210279469212, "language_loss": 0.88234103, "learning_rate": 3.657060557391621e-06, "loss": 0.90434444, "num_input_tokens_seen": 76638195, "step": 3551, "time_per_iteration": 2.746938705444336 }, { "auxiliary_loss_clip": 0.01151269, "auxiliary_loss_mlp": 0.01048306, "balance_loss_clip": 1.05139017, "balance_loss_mlp": 1.03111625, "epoch": 0.2135577934766271, "flos": 17347547808000.0, "grad_norm": 1.8976063035050816, "language_loss": 0.83877259, "learning_rate": 3.656842449140983e-06, "loss": 0.86076838, "num_input_tokens_seen": 76656695, "step": 3552, "time_per_iteration": 2.616567373275757 }, { "auxiliary_loss_clip": 0.0113626, "auxiliary_loss_mlp": 0.01050705, "balance_loss_clip": 1.04937124, "balance_loss_mlp": 1.0325495, "epoch": 0.21361791672929506, "flos": 24057204635520.0, "grad_norm": 2.604872460919843, "language_loss": 0.76370007, "learning_rate": 3.656624278062713e-06, "loss": 0.78556973, "num_input_tokens_seen": 76677430, "step": 3553, "time_per_iteration": 2.730829954147339 }, { "auxiliary_loss_clip": 0.01142267, "auxiliary_loss_mlp": 0.01046102, "balance_loss_clip": 1.05434144, "balance_loss_mlp": 1.02915072, "epoch": 0.21367803998196302, "flos": 22162306556160.0, "grad_norm": 1.5008078028945642, "language_loss": 0.72580731, "learning_rate": 3.6564060441650843e-06, "loss": 0.74769098, "num_input_tokens_seen": 76697615, "step": 3554, "time_per_iteration": 2.701207399368286 }, { "auxiliary_loss_clip": 0.01097601, "auxiliary_loss_mlp": 0.00776401, "balance_loss_clip": 1.04785013, "balance_loss_mlp": 1.00128174, "epoch": 0.213738163234631, "flos": 20886867452160.0, "grad_norm": 2.0681583889949957, "language_loss": 0.67728174, "learning_rate": 3.6561877474563724e-06, "loss": 0.69602168, "num_input_tokens_seen": 76715685, "step": 3555, "time_per_iteration": 2.76454758644104 }, { "auxiliary_loss_clip": 0.01124456, "auxiliary_loss_mlp": 0.01045031, "balance_loss_clip": 1.06086278, "balance_loss_mlp": 1.02689981, "epoch": 0.21379828648729896, "flos": 28403114135040.0, "grad_norm": 2.155752981705525, "language_loss": 0.64553648, "learning_rate": 3.6559693879448553e-06, "loss": 0.66723132, "num_input_tokens_seen": 76735405, "step": 3556, "time_per_iteration": 2.839993953704834 }, { "auxiliary_loss_clip": 0.01139371, "auxiliary_loss_mlp": 0.01051642, "balance_loss_clip": 1.05236566, "balance_loss_mlp": 1.0331769, "epoch": 0.21385840973996692, "flos": 25479662106240.0, "grad_norm": 1.7378281716746964, "language_loss": 0.72588408, "learning_rate": 3.6557509656388125e-06, "loss": 0.74779421, "num_input_tokens_seen": 76754395, "step": 3557, "time_per_iteration": 2.7678587436676025 }, { "auxiliary_loss_clip": 0.01151319, "auxiliary_loss_mlp": 0.00776703, "balance_loss_clip": 1.0647192, "balance_loss_mlp": 1.00117195, "epoch": 0.2139185329926349, "flos": 28074280101120.0, "grad_norm": 1.8333462571334693, "language_loss": 0.6714859, "learning_rate": 3.655532480546528e-06, "loss": 0.6907661, "num_input_tokens_seen": 76777210, "step": 3558, "time_per_iteration": 2.7584826946258545 }, { "auxiliary_loss_clip": 0.01159331, "auxiliary_loss_mlp": 0.01041115, "balance_loss_clip": 1.0541842, "balance_loss_mlp": 1.02297139, "epoch": 0.21397865624530288, "flos": 19608698914560.0, "grad_norm": 1.8974456617751176, "language_loss": 0.79882181, "learning_rate": 3.655313932676286e-06, "loss": 0.82082617, "num_input_tokens_seen": 76795830, "step": 3559, "time_per_iteration": 2.6918041706085205 }, { "auxiliary_loss_clip": 0.01155068, "auxiliary_loss_mlp": 0.01046018, "balance_loss_clip": 1.05566323, "balance_loss_mlp": 1.0295198, "epoch": 0.21403877949797084, "flos": 24681476033280.0, "grad_norm": 1.8730564704536732, "language_loss": 0.68085694, "learning_rate": 3.655095322036373e-06, "loss": 0.70286781, "num_input_tokens_seen": 76814700, "step": 3560, "time_per_iteration": 2.6445770263671875 }, { "auxiliary_loss_clip": 0.01145074, "auxiliary_loss_mlp": 0.01043706, "balance_loss_clip": 1.0535686, "balance_loss_mlp": 1.02537155, "epoch": 0.2140989027506388, "flos": 19861150677120.0, "grad_norm": 1.8952415763477797, "language_loss": 0.73272544, "learning_rate": 3.65487664863508e-06, "loss": 0.75461322, "num_input_tokens_seen": 76833400, "step": 3561, "time_per_iteration": 2.6568899154663086 }, { "auxiliary_loss_clip": 0.01133795, "auxiliary_loss_mlp": 0.01044555, "balance_loss_clip": 1.05333674, "balance_loss_mlp": 1.02700794, "epoch": 0.21415902600330677, "flos": 19135324552320.0, "grad_norm": 2.1953085541278203, "language_loss": 0.78028738, "learning_rate": 3.654657912480698e-06, "loss": 0.80207092, "num_input_tokens_seen": 76850645, "step": 3562, "time_per_iteration": 2.73655104637146 }, { "auxiliary_loss_clip": 0.01155634, "auxiliary_loss_mlp": 0.01042255, "balance_loss_clip": 1.05661631, "balance_loss_mlp": 1.02457595, "epoch": 0.21421914925597474, "flos": 22272624201600.0, "grad_norm": 3.5245068195694937, "language_loss": 0.84338713, "learning_rate": 3.6544391135815237e-06, "loss": 0.86536604, "num_input_tokens_seen": 76870135, "step": 3563, "time_per_iteration": 2.676630973815918 }, { "auxiliary_loss_clip": 0.01157426, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.05830729, "balance_loss_mlp": 1.01957488, "epoch": 0.2142792725086427, "flos": 33875109987840.0, "grad_norm": 1.5172669047015535, "language_loss": 0.76581991, "learning_rate": 3.6542202519458507e-06, "loss": 0.78775525, "num_input_tokens_seen": 76893905, "step": 3564, "time_per_iteration": 2.7504193782806396 }, { "auxiliary_loss_clip": 0.01134427, "auxiliary_loss_mlp": 0.01044002, "balance_loss_clip": 1.06131172, "balance_loss_mlp": 1.02674031, "epoch": 0.2143393957613107, "flos": 19860216923520.0, "grad_norm": 1.7115347614953564, "language_loss": 0.88466394, "learning_rate": 3.654001327581981e-06, "loss": 0.90644825, "num_input_tokens_seen": 76914205, "step": 3565, "time_per_iteration": 2.7911624908447266 }, { "auxiliary_loss_clip": 0.01071735, "auxiliary_loss_mlp": 0.01008336, "balance_loss_clip": 1.05462575, "balance_loss_mlp": 1.0057019, "epoch": 0.21439951901397866, "flos": 68530093090560.0, "grad_norm": 0.8339683756542131, "language_loss": 0.52192736, "learning_rate": 3.653782340498215e-06, "loss": 0.54272807, "num_input_tokens_seen": 76975650, "step": 3566, "time_per_iteration": 3.1801936626434326 }, { "auxiliary_loss_clip": 0.01141614, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.05527854, "balance_loss_mlp": 1.02505386, "epoch": 0.21445964226664663, "flos": 19682998197120.0, "grad_norm": 1.8485820369681922, "language_loss": 0.67324477, "learning_rate": 3.6535632907028566e-06, "loss": 0.6950742, "num_input_tokens_seen": 76992615, "step": 3567, "time_per_iteration": 2.6948626041412354 }, { "auxiliary_loss_clip": 0.01123629, "auxiliary_loss_mlp": 0.01045447, "balance_loss_clip": 1.05142832, "balance_loss_mlp": 1.02749455, "epoch": 0.2145197655193146, "flos": 31107259676160.0, "grad_norm": 3.2542445550844317, "language_loss": 0.74213678, "learning_rate": 3.6533441782042126e-06, "loss": 0.76382756, "num_input_tokens_seen": 77017005, "step": 3568, "time_per_iteration": 4.396210670471191 }, { "auxiliary_loss_clip": 0.01140095, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.05480075, "balance_loss_mlp": 1.03333998, "epoch": 0.21457988877198256, "flos": 20120785159680.0, "grad_norm": 1.7132363384404574, "language_loss": 0.77343202, "learning_rate": 3.6531250030105917e-06, "loss": 0.79533333, "num_input_tokens_seen": 77034990, "step": 3569, "time_per_iteration": 4.224002122879028 }, { "auxiliary_loss_clip": 0.011511, "auxiliary_loss_mlp": 0.0104435, "balance_loss_clip": 1.05651093, "balance_loss_mlp": 1.02521753, "epoch": 0.21464001202465052, "flos": 18588045957120.0, "grad_norm": 2.6050136504577583, "language_loss": 0.70278227, "learning_rate": 3.6529057651303053e-06, "loss": 0.72473681, "num_input_tokens_seen": 77052610, "step": 3570, "time_per_iteration": 2.668304681777954 }, { "auxiliary_loss_clip": 0.01158856, "auxiliary_loss_mlp": 0.01046783, "balance_loss_clip": 1.05765057, "balance_loss_mlp": 1.02955759, "epoch": 0.21470013527731852, "flos": 21835160461440.0, "grad_norm": 2.5503136440013647, "language_loss": 0.79031628, "learning_rate": 3.6526864645716666e-06, "loss": 0.81237268, "num_input_tokens_seen": 77072475, "step": 3571, "time_per_iteration": 4.066440105438232 }, { "auxiliary_loss_clip": 0.0113831, "auxiliary_loss_mlp": 0.01047146, "balance_loss_clip": 1.05283594, "balance_loss_mlp": 1.02703547, "epoch": 0.21476025852998648, "flos": 17603195880960.0, "grad_norm": 1.9606975528380188, "language_loss": 0.82601345, "learning_rate": 3.652467101342991e-06, "loss": 0.84786803, "num_input_tokens_seen": 77089930, "step": 3572, "time_per_iteration": 2.6096267700195312 }, { "auxiliary_loss_clip": 0.01134964, "auxiliary_loss_mlp": 0.01041355, "balance_loss_clip": 1.05588293, "balance_loss_mlp": 1.02358127, "epoch": 0.21482038178265445, "flos": 24828135264000.0, "grad_norm": 4.1014522432452285, "language_loss": 0.65240026, "learning_rate": 3.652247675452598e-06, "loss": 0.67416352, "num_input_tokens_seen": 77108970, "step": 3573, "time_per_iteration": 2.690986394882202 }, { "auxiliary_loss_clip": 0.01147698, "auxiliary_loss_mlp": 0.01048414, "balance_loss_clip": 1.05253768, "balance_loss_mlp": 1.03140295, "epoch": 0.2148805050353224, "flos": 23258228463360.0, "grad_norm": 2.3397683674355565, "language_loss": 0.75229824, "learning_rate": 3.652028186908807e-06, "loss": 0.77425939, "num_input_tokens_seen": 77126045, "step": 3574, "time_per_iteration": 2.621736526489258 }, { "auxiliary_loss_clip": 0.01138272, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.0526228, "balance_loss_mlp": 1.02414417, "epoch": 0.21494062828799038, "flos": 21321098968320.0, "grad_norm": 1.8157113535402463, "language_loss": 0.72179317, "learning_rate": 3.6518086357199416e-06, "loss": 0.74359143, "num_input_tokens_seen": 77144600, "step": 3575, "time_per_iteration": 4.362869501113892 }, { "auxiliary_loss_clip": 0.01126687, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.05261374, "balance_loss_mlp": 1.02422237, "epoch": 0.21500075154065834, "flos": 18843334894080.0, "grad_norm": 3.8402092268612216, "language_loss": 0.68255925, "learning_rate": 3.6515890218943277e-06, "loss": 0.70423794, "num_input_tokens_seen": 77162965, "step": 3576, "time_per_iteration": 2.665370225906372 }, { "auxiliary_loss_clip": 0.01138295, "auxiliary_loss_mlp": 0.01049053, "balance_loss_clip": 1.05064976, "balance_loss_mlp": 1.02859676, "epoch": 0.2150608747933263, "flos": 18441997257600.0, "grad_norm": 2.2101409055401566, "language_loss": 0.88707685, "learning_rate": 3.651369345440292e-06, "loss": 0.90895033, "num_input_tokens_seen": 77179960, "step": 3577, "time_per_iteration": 2.655118465423584 }, { "auxiliary_loss_clip": 0.01070337, "auxiliary_loss_mlp": 0.01022454, "balance_loss_clip": 1.0487709, "balance_loss_mlp": 1.01998615, "epoch": 0.2151209980459943, "flos": 66598242894720.0, "grad_norm": 0.8146982557647512, "language_loss": 0.56184745, "learning_rate": 3.6511496063661654e-06, "loss": 0.58277535, "num_input_tokens_seen": 77239500, "step": 3578, "time_per_iteration": 3.2133536338806152 }, { "auxiliary_loss_clip": 0.01144391, "auxiliary_loss_mlp": 0.00775114, "balance_loss_clip": 1.05492067, "balance_loss_mlp": 1.00130272, "epoch": 0.21518112129866226, "flos": 21575885114880.0, "grad_norm": 2.988933296047806, "language_loss": 0.88686001, "learning_rate": 3.6509298046802807e-06, "loss": 0.90605509, "num_input_tokens_seen": 77254680, "step": 3579, "time_per_iteration": 2.6801605224609375 }, { "auxiliary_loss_clip": 0.01143273, "auxiliary_loss_mlp": 0.0104707, "balance_loss_clip": 1.05253708, "balance_loss_mlp": 1.02945101, "epoch": 0.21524124455133023, "flos": 20047635112320.0, "grad_norm": 1.8556029181899094, "language_loss": 0.77953792, "learning_rate": 3.650709940390972e-06, "loss": 0.80144137, "num_input_tokens_seen": 77274060, "step": 3580, "time_per_iteration": 2.6932644844055176 }, { "auxiliary_loss_clip": 0.01145284, "auxiliary_loss_mlp": 0.01043211, "balance_loss_clip": 1.05702484, "balance_loss_mlp": 1.02543712, "epoch": 0.2153013678039982, "flos": 23951807153280.0, "grad_norm": 1.9843281400180077, "language_loss": 0.72948015, "learning_rate": 3.6504900135065775e-06, "loss": 0.75136507, "num_input_tokens_seen": 77293255, "step": 3581, "time_per_iteration": 2.712376117706299 }, { "auxiliary_loss_clip": 0.01138503, "auxiliary_loss_mlp": 0.0104555, "balance_loss_clip": 1.05348194, "balance_loss_mlp": 1.0269891, "epoch": 0.21536149105666616, "flos": 20594841880320.0, "grad_norm": 2.4257233983700113, "language_loss": 0.70726413, "learning_rate": 3.6502700240354357e-06, "loss": 0.72910464, "num_input_tokens_seen": 77312390, "step": 3582, "time_per_iteration": 2.67122220993042 }, { "auxiliary_loss_clip": 0.01154755, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.05591798, "balance_loss_mlp": 1.0227195, "epoch": 0.21542161430933413, "flos": 12860042895360.0, "grad_norm": 2.4025311229753363, "language_loss": 0.84906816, "learning_rate": 3.650049971985889e-06, "loss": 0.87101901, "num_input_tokens_seen": 77330985, "step": 3583, "time_per_iteration": 2.6395328044891357 }, { "auxiliary_loss_clip": 0.01133287, "auxiliary_loss_mlp": 0.01047024, "balance_loss_clip": 1.05368245, "balance_loss_mlp": 1.02971518, "epoch": 0.21548173756200212, "flos": 26103933504000.0, "grad_norm": 2.7569743809923533, "language_loss": 0.83223897, "learning_rate": 3.6498298573662824e-06, "loss": 0.85404205, "num_input_tokens_seen": 77350770, "step": 3584, "time_per_iteration": 2.730823040008545 }, { "auxiliary_loss_clip": 0.01118851, "auxiliary_loss_mlp": 0.00774813, "balance_loss_clip": 1.0520674, "balance_loss_mlp": 1.00120699, "epoch": 0.21554186081467008, "flos": 22163779013760.0, "grad_norm": 1.9634031706782962, "language_loss": 0.90054697, "learning_rate": 3.6496096801849625e-06, "loss": 0.9194836, "num_input_tokens_seen": 77370510, "step": 3585, "time_per_iteration": 2.722216844558716 }, { "auxiliary_loss_clip": 0.01145179, "auxiliary_loss_mlp": 0.01045359, "balance_loss_clip": 1.05783939, "balance_loss_mlp": 1.02793026, "epoch": 0.21560198406733805, "flos": 22966741595520.0, "grad_norm": 1.9859337557251673, "language_loss": 0.74663597, "learning_rate": 3.649389440450277e-06, "loss": 0.76854134, "num_input_tokens_seen": 77390645, "step": 3586, "time_per_iteration": 2.7681503295898438 }, { "auxiliary_loss_clip": 0.01120328, "auxiliary_loss_mlp": 0.01046334, "balance_loss_clip": 1.05628061, "balance_loss_mlp": 1.03011, "epoch": 0.215662107320006, "flos": 22784064001920.0, "grad_norm": 2.903090853788092, "language_loss": 0.83029532, "learning_rate": 3.6491691381705804e-06, "loss": 0.85196197, "num_input_tokens_seen": 77409655, "step": 3587, "time_per_iteration": 2.788416624069214 }, { "auxiliary_loss_clip": 0.01109364, "auxiliary_loss_mlp": 0.00776304, "balance_loss_clip": 1.05255485, "balance_loss_mlp": 1.00129569, "epoch": 0.21572223057267398, "flos": 30883859038080.0, "grad_norm": 1.7067147212291012, "language_loss": 0.75593436, "learning_rate": 3.648948773354224e-06, "loss": 0.774791, "num_input_tokens_seen": 77430560, "step": 3588, "time_per_iteration": 2.866584062576294 }, { "auxiliary_loss_clip": 0.01136336, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.0224762, "epoch": 0.21578235382534194, "flos": 26910487445760.0, "grad_norm": 1.721393113594195, "language_loss": 0.80745661, "learning_rate": 3.6487283460095643e-06, "loss": 0.82921582, "num_input_tokens_seen": 77455000, "step": 3589, "time_per_iteration": 2.8839404582977295 }, { "auxiliary_loss_clip": 0.01157121, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.05677748, "balance_loss_mlp": 1.01992083, "epoch": 0.2158424770780099, "flos": 24425720219520.0, "grad_norm": 2.201221744880259, "language_loss": 0.72849286, "learning_rate": 3.648507856144961e-06, "loss": 0.75042707, "num_input_tokens_seen": 77475075, "step": 3590, "time_per_iteration": 2.6692256927490234 }, { "auxiliary_loss_clip": 0.01134591, "auxiliary_loss_mlp": 0.01044904, "balance_loss_clip": 1.05195427, "balance_loss_mlp": 1.02623618, "epoch": 0.2159026003306779, "flos": 23949975559680.0, "grad_norm": 2.25677544320114, "language_loss": 0.8402462, "learning_rate": 3.648287303768775e-06, "loss": 0.86204112, "num_input_tokens_seen": 77495945, "step": 3591, "time_per_iteration": 2.7531416416168213 }, { "auxiliary_loss_clip": 0.01123784, "auxiliary_loss_mlp": 0.01049552, "balance_loss_clip": 1.05391979, "balance_loss_mlp": 1.02972734, "epoch": 0.21596272358334587, "flos": 30040963511040.0, "grad_norm": 2.2410681113576585, "language_loss": 0.69175243, "learning_rate": 3.6480666888893686e-06, "loss": 0.71348578, "num_input_tokens_seen": 77517140, "step": 3592, "time_per_iteration": 2.8716177940368652 }, { "auxiliary_loss_clip": 0.01117322, "auxiliary_loss_mlp": 0.01050667, "balance_loss_clip": 1.04998767, "balance_loss_mlp": 1.03179634, "epoch": 0.21602284683601383, "flos": 20376217751040.0, "grad_norm": 2.3652325886308123, "language_loss": 0.84022737, "learning_rate": 3.647846011515108e-06, "loss": 0.86190724, "num_input_tokens_seen": 77536085, "step": 3593, "time_per_iteration": 2.7185158729553223 }, { "auxiliary_loss_clip": 0.01123006, "auxiliary_loss_mlp": 0.01048394, "balance_loss_clip": 1.05243289, "balance_loss_mlp": 1.029809, "epoch": 0.2160829700886818, "flos": 20777339905920.0, "grad_norm": 4.017970268493579, "language_loss": 0.75192308, "learning_rate": 3.6476252716543625e-06, "loss": 0.77363706, "num_input_tokens_seen": 77553675, "step": 3594, "time_per_iteration": 2.726027011871338 }, { "auxiliary_loss_clip": 0.01140408, "auxiliary_loss_mlp": 0.01044406, "balance_loss_clip": 1.05318236, "balance_loss_mlp": 1.02650058, "epoch": 0.21614309334134976, "flos": 22309755886080.0, "grad_norm": 1.541030891618627, "language_loss": 0.80459857, "learning_rate": 3.6474044693155007e-06, "loss": 0.82644665, "num_input_tokens_seen": 77573360, "step": 3595, "time_per_iteration": 2.66504168510437 }, { "auxiliary_loss_clip": 0.01119754, "auxiliary_loss_mlp": 0.01039521, "balance_loss_clip": 1.05060601, "balance_loss_mlp": 1.02125788, "epoch": 0.21620321659401773, "flos": 19609524927360.0, "grad_norm": 2.1030283577585007, "language_loss": 0.78930759, "learning_rate": 3.647183604506897e-06, "loss": 0.81090033, "num_input_tokens_seen": 77591865, "step": 3596, "time_per_iteration": 2.7159698009490967 }, { "auxiliary_loss_clip": 0.01080261, "auxiliary_loss_mlp": 0.01047978, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.03106225, "epoch": 0.2162633398466857, "flos": 18844555956480.0, "grad_norm": 1.6709210997095376, "language_loss": 0.83061242, "learning_rate": 3.6469626772369253e-06, "loss": 0.85189474, "num_input_tokens_seen": 77611600, "step": 3597, "time_per_iteration": 2.79276704788208 }, { "auxiliary_loss_clip": 0.01133147, "auxiliary_loss_mlp": 0.00775626, "balance_loss_clip": 1.05385637, "balance_loss_mlp": 1.00146937, "epoch": 0.21632346309935369, "flos": 18768820129920.0, "grad_norm": 1.6388312470031852, "language_loss": 0.80549502, "learning_rate": 3.6467416875139642e-06, "loss": 0.8245827, "num_input_tokens_seen": 77630665, "step": 3598, "time_per_iteration": 2.6823580265045166 }, { "auxiliary_loss_clip": 0.01123845, "auxiliary_loss_mlp": 0.01051638, "balance_loss_clip": 1.05069876, "balance_loss_mlp": 1.03218365, "epoch": 0.21638358635202165, "flos": 26324173745280.0, "grad_norm": 1.9066675721358164, "language_loss": 0.82023275, "learning_rate": 3.6465206353463934e-06, "loss": 0.84198749, "num_input_tokens_seen": 77650835, "step": 3599, "time_per_iteration": 2.73583722114563 }, { "auxiliary_loss_clip": 0.0110774, "auxiliary_loss_mlp": 0.00775854, "balance_loss_clip": 1.04651821, "balance_loss_mlp": 1.00131536, "epoch": 0.21644370960468962, "flos": 20740854666240.0, "grad_norm": 2.996184273033617, "language_loss": 0.76724887, "learning_rate": 3.6462995207425947e-06, "loss": 0.78608489, "num_input_tokens_seen": 77669000, "step": 3600, "time_per_iteration": 2.695081949234009 }, { "auxiliary_loss_clip": 0.01112458, "auxiliary_loss_mlp": 0.01044855, "balance_loss_clip": 1.04869664, "balance_loss_mlp": 1.02886891, "epoch": 0.21650383285735758, "flos": 23952238116480.0, "grad_norm": 2.259096111885494, "language_loss": 0.80784452, "learning_rate": 3.6460783437109533e-06, "loss": 0.82941765, "num_input_tokens_seen": 77688745, "step": 3601, "time_per_iteration": 2.8094849586486816 }, { "auxiliary_loss_clip": 0.01155408, "auxiliary_loss_mlp": 0.01046912, "balance_loss_clip": 1.0550983, "balance_loss_mlp": 1.02973413, "epoch": 0.21656395611002555, "flos": 23696087253120.0, "grad_norm": 2.558776342313561, "language_loss": 0.83192647, "learning_rate": 3.6458571042598565e-06, "loss": 0.85394967, "num_input_tokens_seen": 77708445, "step": 3602, "time_per_iteration": 2.652876377105713 }, { "auxiliary_loss_clip": 0.0115161, "auxiliary_loss_mlp": 0.0105032, "balance_loss_clip": 1.0525223, "balance_loss_mlp": 1.03286743, "epoch": 0.2166240793626935, "flos": 20666052593280.0, "grad_norm": 1.768938326380195, "language_loss": 0.7449019, "learning_rate": 3.645635802397693e-06, "loss": 0.76692116, "num_input_tokens_seen": 77728465, "step": 3603, "time_per_iteration": 2.619614601135254 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.01047384, "balance_loss_clip": 1.04873598, "balance_loss_mlp": 1.02883554, "epoch": 0.2166842026153615, "flos": 21580410228480.0, "grad_norm": 1.6710689829239502, "language_loss": 0.74178421, "learning_rate": 3.645414438132855e-06, "loss": 0.76342291, "num_input_tokens_seen": 77746735, "step": 3604, "time_per_iteration": 2.730182647705078 }, { "auxiliary_loss_clip": 0.01138214, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.05246544, "balance_loss_mlp": 1.02124691, "epoch": 0.21674432586802947, "flos": 25629948610560.0, "grad_norm": 1.7167946204354523, "language_loss": 0.7990489, "learning_rate": 3.6451930114737366e-06, "loss": 0.82081187, "num_input_tokens_seen": 77768105, "step": 3605, "time_per_iteration": 2.67668080329895 }, { "auxiliary_loss_clip": 0.01079717, "auxiliary_loss_mlp": 0.01002026, "balance_loss_clip": 1.0400598, "balance_loss_mlp": 0.99942732, "epoch": 0.21680444912069743, "flos": 56417783616000.0, "grad_norm": 0.7112415560884942, "language_loss": 0.5834192, "learning_rate": 3.6449715224287347e-06, "loss": 0.6042366, "num_input_tokens_seen": 77833750, "step": 3606, "time_per_iteration": 3.2736570835113525 }, { "auxiliary_loss_clip": 0.01155294, "auxiliary_loss_mlp": 0.01043491, "balance_loss_clip": 1.05404341, "balance_loss_mlp": 1.02498984, "epoch": 0.2168645723733654, "flos": 23878944414720.0, "grad_norm": 2.2731951350022275, "language_loss": 0.73142302, "learning_rate": 3.644749971006248e-06, "loss": 0.75341088, "num_input_tokens_seen": 77853780, "step": 3607, "time_per_iteration": 4.267899990081787 }, { "auxiliary_loss_clip": 0.01133762, "auxiliary_loss_mlp": 0.01046639, "balance_loss_clip": 1.05282903, "balance_loss_mlp": 1.02789962, "epoch": 0.21692469562603336, "flos": 16946174257920.0, "grad_norm": 2.181379073292718, "language_loss": 0.76540339, "learning_rate": 3.6445283572146765e-06, "loss": 0.78720737, "num_input_tokens_seen": 77872575, "step": 3608, "time_per_iteration": 4.285630464553833 }, { "auxiliary_loss_clip": 0.01080204, "auxiliary_loss_mlp": 0.01047623, "balance_loss_clip": 1.04536235, "balance_loss_mlp": 1.0309217, "epoch": 0.21698481887870133, "flos": 25119047514240.0, "grad_norm": 2.042587105390135, "language_loss": 0.74584132, "learning_rate": 3.6443066810624255e-06, "loss": 0.76711953, "num_input_tokens_seen": 77892700, "step": 3609, "time_per_iteration": 2.802569627761841 }, { "auxiliary_loss_clip": 0.01131798, "auxiliary_loss_mlp": 0.01049353, "balance_loss_clip": 1.05227149, "balance_loss_mlp": 1.03159094, "epoch": 0.2170449421313693, "flos": 17894682748800.0, "grad_norm": 1.9074832440543417, "language_loss": 0.89132321, "learning_rate": 3.6440849425579e-06, "loss": 0.91313475, "num_input_tokens_seen": 77911060, "step": 3610, "time_per_iteration": 4.189727306365967 }, { "auxiliary_loss_clip": 0.01155294, "auxiliary_loss_mlp": 0.01044238, "balance_loss_clip": 1.05534768, "balance_loss_mlp": 1.02649963, "epoch": 0.2171050653840373, "flos": 22638446265600.0, "grad_norm": 2.058717355808165, "language_loss": 0.77779067, "learning_rate": 3.6438631417095095e-06, "loss": 0.79978603, "num_input_tokens_seen": 77929930, "step": 3611, "time_per_iteration": 2.6317896842956543 }, { "auxiliary_loss_clip": 0.01088447, "auxiliary_loss_mlp": 0.01047447, "balance_loss_clip": 1.04764366, "balance_loss_mlp": 1.03026867, "epoch": 0.21716518863670525, "flos": 19499997381120.0, "grad_norm": 2.3883055198257184, "language_loss": 0.63578451, "learning_rate": 3.6436412785256637e-06, "loss": 0.65714347, "num_input_tokens_seen": 77949060, "step": 3612, "time_per_iteration": 2.8771228790283203 }, { "auxiliary_loss_clip": 0.01091118, "auxiliary_loss_mlp": 0.01053996, "balance_loss_clip": 1.04585218, "balance_loss_mlp": 1.03454065, "epoch": 0.21722531188937322, "flos": 19792022952960.0, "grad_norm": 1.801964584441428, "language_loss": 0.75912857, "learning_rate": 3.643419353014776e-06, "loss": 0.78057969, "num_input_tokens_seen": 77967920, "step": 3613, "time_per_iteration": 2.710601568222046 }, { "auxiliary_loss_clip": 0.0110572, "auxiliary_loss_mlp": 0.01051253, "balance_loss_clip": 1.05008733, "balance_loss_mlp": 1.03121352, "epoch": 0.21728543514204118, "flos": 13334386924800.0, "grad_norm": 1.9293696862218277, "language_loss": 0.71047795, "learning_rate": 3.643197365185261e-06, "loss": 0.73204768, "num_input_tokens_seen": 77985330, "step": 3614, "time_per_iteration": 4.407632112503052 }, { "auxiliary_loss_clip": 0.0114355, "auxiliary_loss_mlp": 0.01048776, "balance_loss_clip": 1.05521107, "balance_loss_mlp": 1.0306083, "epoch": 0.21734555839470915, "flos": 15231870783360.0, "grad_norm": 1.7289280951335333, "language_loss": 0.73030001, "learning_rate": 3.6429753150455378e-06, "loss": 0.75222325, "num_input_tokens_seen": 78003105, "step": 3615, "time_per_iteration": 2.6358401775360107 }, { "auxiliary_loss_clip": 0.01145731, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.05206716, "balance_loss_mlp": 1.02703404, "epoch": 0.2174056816473771, "flos": 19973982274560.0, "grad_norm": 2.3648922858816976, "language_loss": 0.90127194, "learning_rate": 3.6427532026040263e-06, "loss": 0.92319548, "num_input_tokens_seen": 78019655, "step": 3616, "time_per_iteration": 2.659787178039551 }, { "auxiliary_loss_clip": 0.01103597, "auxiliary_loss_mlp": 0.01040899, "balance_loss_clip": 1.048136, "balance_loss_mlp": 1.02244496, "epoch": 0.21746580490004508, "flos": 16687293960960.0, "grad_norm": 2.928463545610362, "language_loss": 0.81107831, "learning_rate": 3.642531027869148e-06, "loss": 0.83252329, "num_input_tokens_seen": 78036025, "step": 3617, "time_per_iteration": 2.7723491191864014 }, { "auxiliary_loss_clip": 0.01132531, "auxiliary_loss_mlp": 0.01041286, "balance_loss_clip": 1.05330408, "balance_loss_mlp": 1.02382231, "epoch": 0.21752592815271307, "flos": 25772298209280.0, "grad_norm": 1.9251992817215786, "language_loss": 0.75688154, "learning_rate": 3.642308790849329e-06, "loss": 0.77861977, "num_input_tokens_seen": 78055645, "step": 3618, "time_per_iteration": 2.7608227729797363 }, { "auxiliary_loss_clip": 0.01147874, "auxiliary_loss_mlp": 0.01048647, "balance_loss_clip": 1.05600834, "balance_loss_mlp": 1.03045571, "epoch": 0.21758605140538104, "flos": 11254692349440.0, "grad_norm": 2.18435089101569, "language_loss": 0.69099152, "learning_rate": 3.642086491552996e-06, "loss": 0.71295673, "num_input_tokens_seen": 78071660, "step": 3619, "time_per_iteration": 2.671637773513794 }, { "auxiliary_loss_clip": 0.01144421, "auxiliary_loss_mlp": 0.01042659, "balance_loss_clip": 1.05394137, "balance_loss_mlp": 1.02482569, "epoch": 0.217646174658049, "flos": 19242625455360.0, "grad_norm": 4.829425462001391, "language_loss": 0.78716505, "learning_rate": 3.641864129988579e-06, "loss": 0.8090359, "num_input_tokens_seen": 78091265, "step": 3620, "time_per_iteration": 2.7232043743133545 }, { "auxiliary_loss_clip": 0.01148457, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.05161178, "balance_loss_mlp": 1.02507412, "epoch": 0.21770629791071697, "flos": 21945083057280.0, "grad_norm": 1.4663479636678602, "language_loss": 0.79966211, "learning_rate": 3.641641706164509e-06, "loss": 0.82156777, "num_input_tokens_seen": 78110095, "step": 3621, "time_per_iteration": 2.6326823234558105 }, { "auxiliary_loss_clip": 0.01143183, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.05334592, "balance_loss_mlp": 1.01955688, "epoch": 0.21776642116338493, "flos": 24936764970240.0, "grad_norm": 1.609721344037994, "language_loss": 0.87796915, "learning_rate": 3.641419220089221e-06, "loss": 0.89975888, "num_input_tokens_seen": 78129475, "step": 3622, "time_per_iteration": 2.6864428520202637 }, { "auxiliary_loss_clip": 0.01146899, "auxiliary_loss_mlp": 0.01037591, "balance_loss_clip": 1.05495822, "balance_loss_mlp": 1.01801729, "epoch": 0.2178265444160529, "flos": 17821317219840.0, "grad_norm": 1.856609178217172, "language_loss": 0.77077621, "learning_rate": 3.641196671771152e-06, "loss": 0.79262108, "num_input_tokens_seen": 78146880, "step": 3623, "time_per_iteration": 2.743601083755493 }, { "auxiliary_loss_clip": 0.01121788, "auxiliary_loss_mlp": 0.01052122, "balance_loss_clip": 1.05279899, "balance_loss_mlp": 1.03226197, "epoch": 0.2178866676687209, "flos": 17712902995200.0, "grad_norm": 2.4362835431673036, "language_loss": 0.84600008, "learning_rate": 3.640974061218741e-06, "loss": 0.86773914, "num_input_tokens_seen": 78165065, "step": 3624, "time_per_iteration": 2.7499353885650635 }, { "auxiliary_loss_clip": 0.01139543, "auxiliary_loss_mlp": 0.01057514, "balance_loss_clip": 1.05353129, "balance_loss_mlp": 1.03804684, "epoch": 0.21794679092138886, "flos": 16945851035520.0, "grad_norm": 2.4333310175924905, "language_loss": 0.78037703, "learning_rate": 3.640751388440429e-06, "loss": 0.80234766, "num_input_tokens_seen": 78180005, "step": 3625, "time_per_iteration": 2.6314821243286133 }, { "auxiliary_loss_clip": 0.01061536, "auxiliary_loss_mlp": 0.01003869, "balance_loss_clip": 1.03318405, "balance_loss_mlp": 1.00130582, "epoch": 0.21800691417405682, "flos": 63718566566400.0, "grad_norm": 0.8242097668179436, "language_loss": 0.60701489, "learning_rate": 3.64052865344466e-06, "loss": 0.62766898, "num_input_tokens_seen": 78245350, "step": 3626, "time_per_iteration": 3.257289409637451 }, { "auxiliary_loss_clip": 0.0112643, "auxiliary_loss_mlp": 0.00776719, "balance_loss_clip": 1.05120194, "balance_loss_mlp": 1.00134754, "epoch": 0.21806703742672479, "flos": 21616392677760.0, "grad_norm": 2.2464694521793094, "language_loss": 0.9077245, "learning_rate": 3.6403058562398795e-06, "loss": 0.92675602, "num_input_tokens_seen": 78264165, "step": 3627, "time_per_iteration": 2.6639885902404785 }, { "auxiliary_loss_clip": 0.0109778, "auxiliary_loss_mlp": 0.01043665, "balance_loss_clip": 1.04912198, "balance_loss_mlp": 1.02471113, "epoch": 0.21812716067939275, "flos": 19354882435200.0, "grad_norm": 1.8437472480823303, "language_loss": 0.73480809, "learning_rate": 3.6400829968345365e-06, "loss": 0.75622261, "num_input_tokens_seen": 78283745, "step": 3628, "time_per_iteration": 2.7430238723754883 }, { "auxiliary_loss_clip": 0.01151444, "auxiliary_loss_mlp": 0.01042108, "balance_loss_clip": 1.05143893, "balance_loss_mlp": 1.02391696, "epoch": 0.21818728393206072, "flos": 23548063305600.0, "grad_norm": 2.8127332529660296, "language_loss": 0.77337319, "learning_rate": 3.6398600752370826e-06, "loss": 0.79530871, "num_input_tokens_seen": 78302900, "step": 3629, "time_per_iteration": 2.6468687057495117 }, { "auxiliary_loss_clip": 0.01142447, "auxiliary_loss_mlp": 0.01044137, "balance_loss_clip": 1.0532223, "balance_loss_mlp": 1.02709055, "epoch": 0.21824740718472868, "flos": 30225652266240.0, "grad_norm": 1.7154004506833416, "language_loss": 0.71373391, "learning_rate": 3.63963709145597e-06, "loss": 0.73559982, "num_input_tokens_seen": 78326470, "step": 3630, "time_per_iteration": 2.7334208488464355 }, { "auxiliary_loss_clip": 0.01089422, "auxiliary_loss_mlp": 0.01040838, "balance_loss_clip": 1.04771948, "balance_loss_mlp": 1.02488792, "epoch": 0.21830753043739667, "flos": 26134672567680.0, "grad_norm": 2.4394061962398625, "language_loss": 0.76502508, "learning_rate": 3.6394140454996544e-06, "loss": 0.78632766, "num_input_tokens_seen": 78345810, "step": 3631, "time_per_iteration": 2.9277098178863525 }, { "auxiliary_loss_clip": 0.01153805, "auxiliary_loss_mlp": 0.01036973, "balance_loss_clip": 1.05322635, "balance_loss_mlp": 1.01950908, "epoch": 0.21836765369006464, "flos": 21720712752000.0, "grad_norm": 3.3333075141454556, "language_loss": 0.75291955, "learning_rate": 3.639190937376594e-06, "loss": 0.77482736, "num_input_tokens_seen": 78364085, "step": 3632, "time_per_iteration": 2.666961908340454 }, { "auxiliary_loss_clip": 0.01149425, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 1.05168736, "balance_loss_mlp": 1.02262831, "epoch": 0.2184277769427326, "flos": 19937604775680.0, "grad_norm": 2.135610011090477, "language_loss": 0.83723396, "learning_rate": 3.638967767095249e-06, "loss": 0.85911822, "num_input_tokens_seen": 78381385, "step": 3633, "time_per_iteration": 2.6193437576293945 }, { "auxiliary_loss_clip": 0.0112373, "auxiliary_loss_mlp": 0.01049933, "balance_loss_clip": 1.05514872, "balance_loss_mlp": 1.03280258, "epoch": 0.21848790019540057, "flos": 20340235301760.0, "grad_norm": 1.713148643324746, "language_loss": 0.81381643, "learning_rate": 3.6387445346640823e-06, "loss": 0.83555305, "num_input_tokens_seen": 78400500, "step": 3634, "time_per_iteration": 2.7383267879486084 }, { "auxiliary_loss_clip": 0.01144832, "auxiliary_loss_mlp": 0.01040423, "balance_loss_clip": 1.0548327, "balance_loss_mlp": 1.02263677, "epoch": 0.21854802344806853, "flos": 15450818135040.0, "grad_norm": 1.8988648345390304, "language_loss": 0.74810624, "learning_rate": 3.638521240091558e-06, "loss": 0.76995879, "num_input_tokens_seen": 78418340, "step": 3635, "time_per_iteration": 2.7461390495300293 }, { "auxiliary_loss_clip": 0.01124703, "auxiliary_loss_mlp": 0.01052922, "balance_loss_clip": 1.05011106, "balance_loss_mlp": 1.03524303, "epoch": 0.2186081467007365, "flos": 16320717711360.0, "grad_norm": 2.2147010555825295, "language_loss": 0.88340998, "learning_rate": 3.6382978833861445e-06, "loss": 0.90518618, "num_input_tokens_seen": 78434375, "step": 3636, "time_per_iteration": 2.631352186203003 }, { "auxiliary_loss_clip": 0.01121776, "auxiliary_loss_mlp": 0.00776363, "balance_loss_clip": 1.05596519, "balance_loss_mlp": 1.00133038, "epoch": 0.2186682699534045, "flos": 21689255416320.0, "grad_norm": 2.464516707854487, "language_loss": 0.76037598, "learning_rate": 3.638074464556311e-06, "loss": 0.77935731, "num_input_tokens_seen": 78451735, "step": 3637, "time_per_iteration": 2.823063373565674 }, { "auxiliary_loss_clip": 0.01137371, "auxiliary_loss_mlp": 0.0104323, "balance_loss_clip": 1.05512452, "balance_loss_mlp": 1.02393031, "epoch": 0.21872839320607246, "flos": 17739260599680.0, "grad_norm": 2.6753688852020328, "language_loss": 0.89996254, "learning_rate": 3.63785098361053e-06, "loss": 0.92176855, "num_input_tokens_seen": 78462730, "step": 3638, "time_per_iteration": 2.6404030323028564 }, { "auxiliary_loss_clip": 0.01142035, "auxiliary_loss_mlp": 0.01051888, "balance_loss_clip": 1.0538702, "balance_loss_mlp": 1.03351748, "epoch": 0.21878851645874042, "flos": 18652289431680.0, "grad_norm": 2.4375531856602692, "language_loss": 0.89243078, "learning_rate": 3.637627440557275e-06, "loss": 0.91436994, "num_input_tokens_seen": 78476300, "step": 3639, "time_per_iteration": 2.6214118003845215 }, { "auxiliary_loss_clip": 0.01134092, "auxiliary_loss_mlp": 0.00776277, "balance_loss_clip": 1.05406988, "balance_loss_mlp": 1.00129211, "epoch": 0.2188486397114084, "flos": 25557301353600.0, "grad_norm": 1.9800691484462982, "language_loss": 0.79167712, "learning_rate": 3.637403835405024e-06, "loss": 0.81078082, "num_input_tokens_seen": 78496135, "step": 3640, "time_per_iteration": 2.7559502124786377 }, { "auxiliary_loss_clip": 0.01149345, "auxiliary_loss_mlp": 0.01055855, "balance_loss_clip": 1.05816483, "balance_loss_mlp": 1.03617346, "epoch": 0.21890876296407635, "flos": 17892061056000.0, "grad_norm": 2.2045237000129942, "language_loss": 0.71708757, "learning_rate": 3.637180168162255e-06, "loss": 0.73913956, "num_input_tokens_seen": 78513855, "step": 3641, "time_per_iteration": 2.6673953533172607 }, { "auxiliary_loss_clip": 0.01130115, "auxiliary_loss_mlp": 0.0104373, "balance_loss_clip": 1.05217481, "balance_loss_mlp": 1.02593243, "epoch": 0.21896888621674432, "flos": 17749100926080.0, "grad_norm": 1.9358190088314053, "language_loss": 0.81427026, "learning_rate": 3.63695643883745e-06, "loss": 0.83600873, "num_input_tokens_seen": 78531740, "step": 3642, "time_per_iteration": 2.6722965240478516 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01044184, "balance_loss_clip": 1.05707705, "balance_loss_mlp": 1.02520561, "epoch": 0.21902900946941228, "flos": 23076161400960.0, "grad_norm": 2.2890480980316865, "language_loss": 0.7124145, "learning_rate": 3.6367326474390928e-06, "loss": 0.73435903, "num_input_tokens_seen": 78549600, "step": 3643, "time_per_iteration": 2.6586625576019287 }, { "auxiliary_loss_clip": 0.01156283, "auxiliary_loss_mlp": 0.01046488, "balance_loss_clip": 1.05430686, "balance_loss_mlp": 1.02728367, "epoch": 0.21908913272208028, "flos": 48178545004800.0, "grad_norm": 2.705040309825256, "language_loss": 0.68497038, "learning_rate": 3.6365087939756696e-06, "loss": 0.70699811, "num_input_tokens_seen": 78573350, "step": 3644, "time_per_iteration": 2.835944414138794 }, { "auxiliary_loss_clip": 0.01157461, "auxiliary_loss_mlp": 0.01049851, "balance_loss_clip": 1.05381823, "balance_loss_mlp": 1.03175521, "epoch": 0.21914925597474824, "flos": 22236749493120.0, "grad_norm": 2.498314523319793, "language_loss": 0.77761143, "learning_rate": 3.636284878455669e-06, "loss": 0.79968452, "num_input_tokens_seen": 78591005, "step": 3645, "time_per_iteration": 2.6053528785705566 }, { "auxiliary_loss_clip": 0.01142456, "auxiliary_loss_mlp": 0.01054431, "balance_loss_clip": 1.05606842, "balance_loss_mlp": 1.03732491, "epoch": 0.2192093792274162, "flos": 22125605834880.0, "grad_norm": 3.1951942186566766, "language_loss": 0.82604313, "learning_rate": 3.636060900887582e-06, "loss": 0.84801197, "num_input_tokens_seen": 78610645, "step": 3646, "time_per_iteration": 4.198619842529297 }, { "auxiliary_loss_clip": 0.01141068, "auxiliary_loss_mlp": 0.01040772, "balance_loss_clip": 1.05287766, "balance_loss_mlp": 1.02365351, "epoch": 0.21926950248008417, "flos": 15669442264320.0, "grad_norm": 1.720246481727725, "language_loss": 0.82877636, "learning_rate": 3.635836861279901e-06, "loss": 0.85059476, "num_input_tokens_seen": 78628340, "step": 3647, "time_per_iteration": 4.229920387268066 }, { "auxiliary_loss_clip": 0.0115057, "auxiliary_loss_mlp": 0.01054202, "balance_loss_clip": 1.05145597, "balance_loss_mlp": 1.03685677, "epoch": 0.21932962573275214, "flos": 30262496641920.0, "grad_norm": 1.6932394069108108, "language_loss": 0.72652817, "learning_rate": 3.635612759641123e-06, "loss": 0.74857587, "num_input_tokens_seen": 78649355, "step": 3648, "time_per_iteration": 2.7226104736328125 }, { "auxiliary_loss_clip": 0.01110484, "auxiliary_loss_mlp": 0.01057841, "balance_loss_clip": 1.04757857, "balance_loss_mlp": 1.03643107, "epoch": 0.2193897489854201, "flos": 10780132838400.0, "grad_norm": 3.9115777702699175, "language_loss": 0.74917972, "learning_rate": 3.635388595979745e-06, "loss": 0.77086294, "num_input_tokens_seen": 78664915, "step": 3649, "time_per_iteration": 4.201031446456909 }, { "auxiliary_loss_clip": 0.01138726, "auxiliary_loss_mlp": 0.0105421, "balance_loss_clip": 1.0536499, "balance_loss_mlp": 1.03718746, "epoch": 0.21944987223808807, "flos": 19133313390720.0, "grad_norm": 1.8914434058388716, "language_loss": 0.86353791, "learning_rate": 3.635164370304267e-06, "loss": 0.88546729, "num_input_tokens_seen": 78681475, "step": 3650, "time_per_iteration": 2.6061322689056396 }, { "auxiliary_loss_clip": 0.01130852, "auxiliary_loss_mlp": 0.01052398, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.03439701, "epoch": 0.21950999549075606, "flos": 22711093522560.0, "grad_norm": 2.798139483493165, "language_loss": 0.83541161, "learning_rate": 3.6349400826231927e-06, "loss": 0.85724407, "num_input_tokens_seen": 78702300, "step": 3651, "time_per_iteration": 2.7605133056640625 }, { "auxiliary_loss_clip": 0.01143643, "auxiliary_loss_mlp": 0.0105251, "balance_loss_clip": 1.05282581, "balance_loss_mlp": 1.03511763, "epoch": 0.21957011874342403, "flos": 10561329141120.0, "grad_norm": 1.9065881796375543, "language_loss": 0.74475014, "learning_rate": 3.634715732945027e-06, "loss": 0.76671165, "num_input_tokens_seen": 78720230, "step": 3652, "time_per_iteration": 2.597443103790283 }, { "auxiliary_loss_clip": 0.01038431, "auxiliary_loss_mlp": 0.01009267, "balance_loss_clip": 1.0361495, "balance_loss_mlp": 1.0068711, "epoch": 0.219630241996092, "flos": 65747913252480.0, "grad_norm": 0.7482502800744824, "language_loss": 0.51550615, "learning_rate": 3.6344913212782764e-06, "loss": 0.5359832, "num_input_tokens_seen": 78780200, "step": 3653, "time_per_iteration": 3.324497699737549 }, { "auxiliary_loss_clip": 0.01125533, "auxiliary_loss_mlp": 0.01062527, "balance_loss_clip": 1.05436754, "balance_loss_mlp": 1.04470527, "epoch": 0.21969036524875996, "flos": 23696518216320.0, "grad_norm": 1.9578946934595152, "language_loss": 0.75356162, "learning_rate": 3.6342668476314514e-06, "loss": 0.77544224, "num_input_tokens_seen": 78800575, "step": 3654, "time_per_iteration": 4.296064615249634 }, { "auxiliary_loss_clip": 0.01152337, "auxiliary_loss_mlp": 0.01051249, "balance_loss_clip": 1.05944824, "balance_loss_mlp": 1.03376114, "epoch": 0.21975048850142792, "flos": 19640910435840.0, "grad_norm": 1.8387519277823352, "language_loss": 0.72646022, "learning_rate": 3.634042312013064e-06, "loss": 0.74849606, "num_input_tokens_seen": 78819585, "step": 3655, "time_per_iteration": 2.6634860038757324 }, { "auxiliary_loss_clip": 0.01130021, "auxiliary_loss_mlp": 0.01048784, "balance_loss_clip": 1.05423379, "balance_loss_mlp": 1.03071189, "epoch": 0.21981061175409589, "flos": 22448550038400.0, "grad_norm": 1.722985511504472, "language_loss": 0.80795759, "learning_rate": 3.6338177144316276e-06, "loss": 0.82974565, "num_input_tokens_seen": 78837330, "step": 3656, "time_per_iteration": 2.730391502380371 }, { "auxiliary_loss_clip": 0.01124773, "auxiliary_loss_mlp": 0.00776202, "balance_loss_clip": 1.06113994, "balance_loss_mlp": 1.00139225, "epoch": 0.21987073500676388, "flos": 18151049093760.0, "grad_norm": 2.646453773467974, "language_loss": 0.84885842, "learning_rate": 3.63359305489566e-06, "loss": 0.86786819, "num_input_tokens_seen": 78854955, "step": 3657, "time_per_iteration": 2.657607078552246 }, { "auxiliary_loss_clip": 0.01142645, "auxiliary_loss_mlp": 0.01040533, "balance_loss_clip": 1.05631852, "balance_loss_mlp": 1.02260423, "epoch": 0.21993085825943184, "flos": 25626177682560.0, "grad_norm": 2.6990832263195585, "language_loss": 0.80355585, "learning_rate": 3.6333683334136803e-06, "loss": 0.82538766, "num_input_tokens_seen": 78874965, "step": 3658, "time_per_iteration": 2.6584107875823975 }, { "auxiliary_loss_clip": 0.01048937, "auxiliary_loss_mlp": 0.0100499, "balance_loss_clip": 1.03857517, "balance_loss_mlp": 1.00202215, "epoch": 0.2199909815120998, "flos": 70923217743360.0, "grad_norm": 0.7788612160796681, "language_loss": 0.58191586, "learning_rate": 3.6331435499942095e-06, "loss": 0.60245514, "num_input_tokens_seen": 78937740, "step": 3659, "time_per_iteration": 3.3395371437072754 }, { "auxiliary_loss_clip": 0.01111007, "auxiliary_loss_mlp": 0.0105329, "balance_loss_clip": 1.05029392, "balance_loss_mlp": 1.03471744, "epoch": 0.22005110476476777, "flos": 21543529939200.0, "grad_norm": 4.382741616753977, "language_loss": 0.7477597, "learning_rate": 3.632918704645772e-06, "loss": 0.76940262, "num_input_tokens_seen": 78955055, "step": 3660, "time_per_iteration": 2.782975435256958 }, { "auxiliary_loss_clip": 0.01147277, "auxiliary_loss_mlp": 0.01044652, "balance_loss_clip": 1.05691171, "balance_loss_mlp": 1.02653265, "epoch": 0.22011122801743574, "flos": 22054502862720.0, "grad_norm": 1.8856077512582532, "language_loss": 0.81484449, "learning_rate": 3.632693797376893e-06, "loss": 0.83676374, "num_input_tokens_seen": 78974895, "step": 3661, "time_per_iteration": 2.7780110836029053 }, { "auxiliary_loss_clip": 0.01126694, "auxiliary_loss_mlp": 0.01056397, "balance_loss_clip": 1.05167532, "balance_loss_mlp": 1.03800273, "epoch": 0.2201713512701037, "flos": 26687589598080.0, "grad_norm": 1.9746283079458686, "language_loss": 0.73154199, "learning_rate": 3.632468828196102e-06, "loss": 0.75337297, "num_input_tokens_seen": 78994990, "step": 3662, "time_per_iteration": 2.7189040184020996 }, { "auxiliary_loss_clip": 0.0113519, "auxiliary_loss_mlp": 0.01051686, "balance_loss_clip": 1.05718994, "balance_loss_mlp": 1.03555691, "epoch": 0.22023147452277167, "flos": 22162198815360.0, "grad_norm": 2.0576168655035714, "language_loss": 0.78066969, "learning_rate": 3.632243797111929e-06, "loss": 0.80253839, "num_input_tokens_seen": 79014405, "step": 3663, "time_per_iteration": 2.731412410736084 }, { "auxiliary_loss_clip": 0.01142837, "auxiliary_loss_mlp": 0.01063521, "balance_loss_clip": 1.05659413, "balance_loss_mlp": 1.04352939, "epoch": 0.22029159777543966, "flos": 22523280284160.0, "grad_norm": 1.752119258875799, "language_loss": 0.80294079, "learning_rate": 3.632018704132908e-06, "loss": 0.82500434, "num_input_tokens_seen": 79032375, "step": 3664, "time_per_iteration": 2.7043297290802 }, { "auxiliary_loss_clip": 0.01134207, "auxiliary_loss_mlp": 0.01044352, "balance_loss_clip": 1.05424213, "balance_loss_mlp": 1.02474177, "epoch": 0.22035172102810763, "flos": 13042469093760.0, "grad_norm": 3.138103913885462, "language_loss": 0.76388288, "learning_rate": 3.6317935492675742e-06, "loss": 0.78566849, "num_input_tokens_seen": 79049635, "step": 3665, "time_per_iteration": 2.68300199508667 }, { "auxiliary_loss_clip": 0.01128405, "auxiliary_loss_mlp": 0.01053304, "balance_loss_clip": 1.05599689, "balance_loss_mlp": 1.03589976, "epoch": 0.2204118442807756, "flos": 12165817760640.0, "grad_norm": 2.9738702224471583, "language_loss": 0.9800086, "learning_rate": 3.631568332524466e-06, "loss": 1.00182581, "num_input_tokens_seen": 79062890, "step": 3666, "time_per_iteration": 2.702584981918335 }, { "auxiliary_loss_clip": 0.01141573, "auxiliary_loss_mlp": 0.00776689, "balance_loss_clip": 1.05254698, "balance_loss_mlp": 1.00133562, "epoch": 0.22047196753344356, "flos": 40108806673920.0, "grad_norm": 1.894759892223008, "language_loss": 0.80946934, "learning_rate": 3.631343053912122e-06, "loss": 0.82865196, "num_input_tokens_seen": 79085495, "step": 3667, "time_per_iteration": 2.8920814990997314 }, { "auxiliary_loss_clip": 0.01149896, "auxiliary_loss_mlp": 0.01051036, "balance_loss_clip": 1.06145239, "balance_loss_mlp": 1.03161693, "epoch": 0.22053209078611152, "flos": 20701137202560.0, "grad_norm": 1.8771463594277091, "language_loss": 0.7736783, "learning_rate": 3.631117713439087e-06, "loss": 0.79568756, "num_input_tokens_seen": 79101820, "step": 3668, "time_per_iteration": 2.6733500957489014 }, { "auxiliary_loss_clip": 0.01143618, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.05955744, "balance_loss_mlp": 1.02972412, "epoch": 0.2205922140387795, "flos": 24716309247360.0, "grad_norm": 1.7809066581326154, "language_loss": 0.71624571, "learning_rate": 3.630892311113904e-06, "loss": 0.7381565, "num_input_tokens_seen": 79123320, "step": 3669, "time_per_iteration": 2.7298974990844727 }, { "auxiliary_loss_clip": 0.01155448, "auxiliary_loss_mlp": 0.01039044, "balance_loss_clip": 1.0544126, "balance_loss_mlp": 1.0217346, "epoch": 0.22065233729144745, "flos": 23477247642240.0, "grad_norm": 2.1257290130035082, "language_loss": 0.85160267, "learning_rate": 3.6306668469451215e-06, "loss": 0.87354761, "num_input_tokens_seen": 79141615, "step": 3670, "time_per_iteration": 2.6624948978424072 }, { "auxiliary_loss_clip": 0.01137906, "auxiliary_loss_mlp": 0.01042298, "balance_loss_clip": 1.05475712, "balance_loss_mlp": 1.02376091, "epoch": 0.22071246054411545, "flos": 35225566646400.0, "grad_norm": 1.8008957470192373, "language_loss": 0.76928926, "learning_rate": 3.6304413209412886e-06, "loss": 0.79109132, "num_input_tokens_seen": 79164910, "step": 3671, "time_per_iteration": 2.7914648056030273 }, { "auxiliary_loss_clip": 0.01126159, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.05423856, "balance_loss_mlp": 1.02281129, "epoch": 0.2207725837967834, "flos": 18150294908160.0, "grad_norm": 2.015071454696955, "language_loss": 0.80643147, "learning_rate": 3.6302157331109573e-06, "loss": 0.82810068, "num_input_tokens_seen": 79179685, "step": 3672, "time_per_iteration": 2.674381732940674 }, { "auxiliary_loss_clip": 0.01149005, "auxiliary_loss_mlp": 0.01047239, "balance_loss_clip": 1.05706501, "balance_loss_mlp": 1.02992952, "epoch": 0.22083270704945138, "flos": 20479675898880.0, "grad_norm": 2.222038104071356, "language_loss": 0.73278964, "learning_rate": 3.629990083462682e-06, "loss": 0.75475204, "num_input_tokens_seen": 79196285, "step": 3673, "time_per_iteration": 2.6856846809387207 }, { "auxiliary_loss_clip": 0.01121745, "auxiliary_loss_mlp": 0.01044908, "balance_loss_clip": 1.05473876, "balance_loss_mlp": 1.02608538, "epoch": 0.22089283030211934, "flos": 34125801984000.0, "grad_norm": 1.9530426336903413, "language_loss": 0.76384282, "learning_rate": 3.6297643720050203e-06, "loss": 0.78550935, "num_input_tokens_seen": 79216060, "step": 3674, "time_per_iteration": 2.816190242767334 }, { "auxiliary_loss_clip": 0.01156134, "auxiliary_loss_mlp": 0.01047969, "balance_loss_clip": 1.05650616, "balance_loss_mlp": 1.02850175, "epoch": 0.2209529535547873, "flos": 18077216688000.0, "grad_norm": 2.045565300481816, "language_loss": 0.74367136, "learning_rate": 3.6295385987465293e-06, "loss": 0.76571238, "num_input_tokens_seen": 79235145, "step": 3675, "time_per_iteration": 2.69748592376709 }, { "auxiliary_loss_clip": 0.01155113, "auxiliary_loss_mlp": 0.01045626, "balance_loss_clip": 1.05442023, "balance_loss_mlp": 1.02800727, "epoch": 0.22101307680745527, "flos": 27235335070080.0, "grad_norm": 1.898816078558846, "language_loss": 0.79801333, "learning_rate": 3.629312763695772e-06, "loss": 0.82002068, "num_input_tokens_seen": 79256960, "step": 3676, "time_per_iteration": 2.6792948246002197 }, { "auxiliary_loss_clip": 0.01133095, "auxiliary_loss_mlp": 0.01049823, "balance_loss_clip": 1.05366707, "balance_loss_mlp": 1.03257358, "epoch": 0.22107320006012326, "flos": 16543256423040.0, "grad_norm": 2.1537198076644954, "language_loss": 0.75327688, "learning_rate": 3.6290868668613107e-06, "loss": 0.77510607, "num_input_tokens_seen": 79274860, "step": 3677, "time_per_iteration": 2.781393527984619 }, { "auxiliary_loss_clip": 0.0111612, "auxiliary_loss_mlp": 0.01050059, "balance_loss_clip": 1.04986429, "balance_loss_mlp": 1.03212988, "epoch": 0.22113332331279123, "flos": 22054466949120.0, "grad_norm": 1.7875463894855461, "language_loss": 0.83287871, "learning_rate": 3.628860908251712e-06, "loss": 0.85454059, "num_input_tokens_seen": 79294005, "step": 3678, "time_per_iteration": 2.752838611602783 }, { "auxiliary_loss_clip": 0.01094052, "auxiliary_loss_mlp": 0.01058605, "balance_loss_clip": 1.04951406, "balance_loss_mlp": 1.03992522, "epoch": 0.2211934465654592, "flos": 26612787525120.0, "grad_norm": 1.6742153249136704, "language_loss": 0.89135075, "learning_rate": 3.6286348878755452e-06, "loss": 0.91287732, "num_input_tokens_seen": 79314005, "step": 3679, "time_per_iteration": 2.8282527923583984 }, { "auxiliary_loss_clip": 0.01147641, "auxiliary_loss_mlp": 0.01054276, "balance_loss_clip": 1.05507338, "balance_loss_mlp": 1.03615618, "epoch": 0.22125356981812716, "flos": 16360363347840.0, "grad_norm": 3.092644946410345, "language_loss": 0.8649044, "learning_rate": 3.6284088057413803e-06, "loss": 0.88692355, "num_input_tokens_seen": 79331030, "step": 3680, "time_per_iteration": 2.630829095840454 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01052062, "balance_loss_clip": 1.05374503, "balance_loss_mlp": 1.03395414, "epoch": 0.22131369307079513, "flos": 21651118151040.0, "grad_norm": 1.9427224492838853, "language_loss": 0.81773758, "learning_rate": 3.6281826618577894e-06, "loss": 0.83938313, "num_input_tokens_seen": 79348560, "step": 3681, "time_per_iteration": 2.805880069732666 }, { "auxiliary_loss_clip": 0.01148508, "auxiliary_loss_mlp": 0.00775652, "balance_loss_clip": 1.0530386, "balance_loss_mlp": 1.00146043, "epoch": 0.2213738163234631, "flos": 19609524927360.0, "grad_norm": 2.296553230959153, "language_loss": 0.80099678, "learning_rate": 3.62795645623335e-06, "loss": 0.82023835, "num_input_tokens_seen": 79367175, "step": 3682, "time_per_iteration": 2.624234199523926 }, { "auxiliary_loss_clip": 0.0112405, "auxiliary_loss_mlp": 0.0105126, "balance_loss_clip": 1.0500052, "balance_loss_mlp": 1.03198409, "epoch": 0.22143393957613106, "flos": 23623404082560.0, "grad_norm": 1.6781760642146926, "language_loss": 0.77394038, "learning_rate": 3.627730188876638e-06, "loss": 0.7956934, "num_input_tokens_seen": 79388435, "step": 3683, "time_per_iteration": 2.6746323108673096 }, { "auxiliary_loss_clip": 0.01129753, "auxiliary_loss_mlp": 0.01051291, "balance_loss_clip": 1.05048668, "balance_loss_mlp": 1.03411245, "epoch": 0.22149406282879905, "flos": 26177801823360.0, "grad_norm": 2.1201256685163323, "language_loss": 0.72406399, "learning_rate": 3.627503859796234e-06, "loss": 0.7458744, "num_input_tokens_seen": 79407910, "step": 3684, "time_per_iteration": 2.695958375930786 }, { "auxiliary_loss_clip": 0.01084051, "auxiliary_loss_mlp": 0.01045612, "balance_loss_clip": 1.04670835, "balance_loss_mlp": 1.02571654, "epoch": 0.221554186081467, "flos": 14538758970240.0, "grad_norm": 2.1308896442870893, "language_loss": 0.79817796, "learning_rate": 3.6272774690007207e-06, "loss": 0.81947458, "num_input_tokens_seen": 79424020, "step": 3685, "time_per_iteration": 2.7443795204162598 }, { "auxiliary_loss_clip": 0.01147394, "auxiliary_loss_mlp": 0.01045457, "balance_loss_clip": 1.05201805, "balance_loss_mlp": 1.02867222, "epoch": 0.22161430933413498, "flos": 22238257864320.0, "grad_norm": 1.6870532517893482, "language_loss": 0.87305272, "learning_rate": 3.6270510164986823e-06, "loss": 0.89498115, "num_input_tokens_seen": 79445605, "step": 3686, "time_per_iteration": 4.388494968414307 }, { "auxiliary_loss_clip": 0.01137917, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.052562, "balance_loss_mlp": 1.02620554, "epoch": 0.22167443258680294, "flos": 23476529370240.0, "grad_norm": 1.8821221420403713, "language_loss": 0.78069639, "learning_rate": 3.626824502298707e-06, "loss": 0.80251229, "num_input_tokens_seen": 79463850, "step": 3687, "time_per_iteration": 4.123531103134155 }, { "auxiliary_loss_clip": 0.0112545, "auxiliary_loss_mlp": 0.01052599, "balance_loss_clip": 1.0494144, "balance_loss_mlp": 1.0331558, "epoch": 0.2217345558394709, "flos": 23221132692480.0, "grad_norm": 1.8251811803295879, "language_loss": 0.84860861, "learning_rate": 3.626597926409383e-06, "loss": 0.8703891, "num_input_tokens_seen": 79482845, "step": 3688, "time_per_iteration": 4.287938594818115 }, { "auxiliary_loss_clip": 0.01110764, "auxiliary_loss_mlp": 0.01051634, "balance_loss_clip": 1.04967332, "balance_loss_mlp": 1.03254843, "epoch": 0.22179467909213887, "flos": 20011078045440.0, "grad_norm": 1.7785994747216247, "language_loss": 0.81150943, "learning_rate": 3.6263712888393027e-06, "loss": 0.83313334, "num_input_tokens_seen": 79501550, "step": 3689, "time_per_iteration": 2.7521302700042725 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01048971, "balance_loss_clip": 1.05078936, "balance_loss_mlp": 1.03131568, "epoch": 0.22185480234480687, "flos": 19683034110720.0, "grad_norm": 1.7481542974535997, "language_loss": 0.70018351, "learning_rate": 3.626144589597061e-06, "loss": 0.72191954, "num_input_tokens_seen": 79519680, "step": 3690, "time_per_iteration": 2.6664223670959473 }, { "auxiliary_loss_clip": 0.01147193, "auxiliary_loss_mlp": 0.00777365, "balance_loss_clip": 1.0537169, "balance_loss_mlp": 1.00153625, "epoch": 0.22191492559747483, "flos": 21981316901760.0, "grad_norm": 1.8112729447523994, "language_loss": 0.72609359, "learning_rate": 3.6259178286912528e-06, "loss": 0.74533916, "num_input_tokens_seen": 79539000, "step": 3691, "time_per_iteration": 2.6724495887756348 }, { "auxiliary_loss_clip": 0.01144688, "auxiliary_loss_mlp": 0.01046427, "balance_loss_clip": 1.05663919, "balance_loss_mlp": 1.0275923, "epoch": 0.2219750488501428, "flos": 23222066446080.0, "grad_norm": 1.8134603978799304, "language_loss": 0.71503472, "learning_rate": 3.625691006130477e-06, "loss": 0.73694593, "num_input_tokens_seen": 79559695, "step": 3692, "time_per_iteration": 2.6743686199188232 }, { "auxiliary_loss_clip": 0.01147828, "auxiliary_loss_mlp": 0.01048973, "balance_loss_clip": 1.05410266, "balance_loss_mlp": 1.03098464, "epoch": 0.22203517210281076, "flos": 22453685683200.0, "grad_norm": 2.1147705582229577, "language_loss": 0.87551594, "learning_rate": 3.6254641219233362e-06, "loss": 0.89748394, "num_input_tokens_seen": 79579095, "step": 3693, "time_per_iteration": 4.2962939739227295 }, { "auxiliary_loss_clip": 0.01141134, "auxiliary_loss_mlp": 0.01041066, "balance_loss_clip": 1.0537045, "balance_loss_mlp": 1.02479386, "epoch": 0.22209529535547873, "flos": 17564555825280.0, "grad_norm": 1.9865017520636683, "language_loss": 0.85553116, "learning_rate": 3.6252371760784325e-06, "loss": 0.87735319, "num_input_tokens_seen": 79596430, "step": 3694, "time_per_iteration": 2.585657835006714 }, { "auxiliary_loss_clip": 0.01107468, "auxiliary_loss_mlp": 0.01045482, "balance_loss_clip": 1.04370403, "balance_loss_mlp": 1.02640843, "epoch": 0.2221554186081467, "flos": 21469015175040.0, "grad_norm": 2.1752375595399136, "language_loss": 0.68740189, "learning_rate": 3.6250101686043725e-06, "loss": 0.70893133, "num_input_tokens_seen": 79615825, "step": 3695, "time_per_iteration": 2.744264841079712 }, { "auxiliary_loss_clip": 0.01118075, "auxiliary_loss_mlp": 0.01047291, "balance_loss_clip": 1.051736, "balance_loss_mlp": 1.0310905, "epoch": 0.22221554186081466, "flos": 27673445255040.0, "grad_norm": 1.6851408018575031, "language_loss": 0.71540272, "learning_rate": 3.6247830995097637e-06, "loss": 0.73705637, "num_input_tokens_seen": 79637875, "step": 3696, "time_per_iteration": 2.7320780754089355 }, { "auxiliary_loss_clip": 0.01140935, "auxiliary_loss_mlp": 0.0104304, "balance_loss_clip": 1.05123305, "balance_loss_mlp": 1.02455115, "epoch": 0.22227566511348265, "flos": 25958926298880.0, "grad_norm": 1.7186386141421306, "language_loss": 0.87905443, "learning_rate": 3.624555968803217e-06, "loss": 0.90089417, "num_input_tokens_seen": 79656970, "step": 3697, "time_per_iteration": 2.65919828414917 }, { "auxiliary_loss_clip": 0.01118987, "auxiliary_loss_mlp": 0.0104214, "balance_loss_clip": 1.04718316, "balance_loss_mlp": 1.0255338, "epoch": 0.22233578836615062, "flos": 39203678833920.0, "grad_norm": 1.6515031384229777, "language_loss": 0.65900242, "learning_rate": 3.624328776493346e-06, "loss": 0.6806137, "num_input_tokens_seen": 79680275, "step": 3698, "time_per_iteration": 2.7708024978637695 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01049333, "balance_loss_clip": 1.05630088, "balance_loss_mlp": 1.03102303, "epoch": 0.22239591161881858, "flos": 36283782251520.0, "grad_norm": 1.9634592665257078, "language_loss": 0.82520199, "learning_rate": 3.6241015225887637e-06, "loss": 0.84712231, "num_input_tokens_seen": 79701255, "step": 3699, "time_per_iteration": 2.7743008136749268 }, { "auxiliary_loss_clip": 0.01129692, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.05154991, "balance_loss_mlp": 1.02939105, "epoch": 0.22245603487148655, "flos": 19719591177600.0, "grad_norm": 1.6711069078421557, "language_loss": 0.79384553, "learning_rate": 3.62387420709809e-06, "loss": 0.8156184, "num_input_tokens_seen": 79721315, "step": 3700, "time_per_iteration": 2.652172327041626 }, { "auxiliary_loss_clip": 0.01111144, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.04893112, "balance_loss_mlp": 1.02608061, "epoch": 0.2225161581241545, "flos": 46280450615040.0, "grad_norm": 2.123831341506728, "language_loss": 0.72503817, "learning_rate": 3.623646830029943e-06, "loss": 0.74660432, "num_input_tokens_seen": 79742705, "step": 3701, "time_per_iteration": 2.943124294281006 }, { "auxiliary_loss_clip": 0.01139412, "auxiliary_loss_mlp": 0.0104206, "balance_loss_clip": 1.05053067, "balance_loss_mlp": 1.0246197, "epoch": 0.22257628137682248, "flos": 23696194993920.0, "grad_norm": 1.9127522113256972, "language_loss": 0.79901838, "learning_rate": 3.6234193913929454e-06, "loss": 0.82083315, "num_input_tokens_seen": 79763000, "step": 3702, "time_per_iteration": 2.6978282928466797 }, { "auxiliary_loss_clip": 0.01129024, "auxiliary_loss_mlp": 0.01044082, "balance_loss_clip": 1.04707038, "balance_loss_mlp": 1.02655816, "epoch": 0.22263640462949044, "flos": 19353984595200.0, "grad_norm": 1.8258996761992496, "language_loss": 0.78237271, "learning_rate": 3.623191891195723e-06, "loss": 0.80410373, "num_input_tokens_seen": 79781335, "step": 3703, "time_per_iteration": 2.6528990268707275 }, { "auxiliary_loss_clip": 0.01140219, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.0503273, "balance_loss_mlp": 1.0171181, "epoch": 0.22269652788215843, "flos": 20776047016320.0, "grad_norm": 2.1693263198920563, "language_loss": 0.74490714, "learning_rate": 3.6229643294469005e-06, "loss": 0.76667851, "num_input_tokens_seen": 79800150, "step": 3704, "time_per_iteration": 2.679184913635254 }, { "auxiliary_loss_clip": 0.0110341, "auxiliary_loss_mlp": 0.01043861, "balance_loss_clip": 1.046996, "balance_loss_mlp": 1.02684951, "epoch": 0.2227566511348264, "flos": 47958843467520.0, "grad_norm": 1.8279463297536431, "language_loss": 0.644319, "learning_rate": 3.6227367061551074e-06, "loss": 0.66579175, "num_input_tokens_seen": 79822390, "step": 3705, "time_per_iteration": 2.972221612930298 }, { "auxiliary_loss_clip": 0.01037239, "auxiliary_loss_mlp": 0.01023153, "balance_loss_clip": 1.03748369, "balance_loss_mlp": 1.02111423, "epoch": 0.22281677438749437, "flos": 66218953230720.0, "grad_norm": 1.2472387125776994, "language_loss": 0.65169704, "learning_rate": 3.6225090213289766e-06, "loss": 0.67230093, "num_input_tokens_seen": 79873350, "step": 3706, "time_per_iteration": 3.118619203567505 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01040401, "balance_loss_clip": 1.04938805, "balance_loss_mlp": 1.02290082, "epoch": 0.22287689764016233, "flos": 21871609787520.0, "grad_norm": 1.912279921070755, "language_loss": 0.80597419, "learning_rate": 3.622281274977141e-06, "loss": 0.8275401, "num_input_tokens_seen": 79891715, "step": 3707, "time_per_iteration": 2.6555368900299072 }, { "auxiliary_loss_clip": 0.01149897, "auxiliary_loss_mlp": 0.01039316, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.02203059, "epoch": 0.2229370208928303, "flos": 27672475587840.0, "grad_norm": 1.9339558574691282, "language_loss": 0.78542316, "learning_rate": 3.6220534671082367e-06, "loss": 0.80731529, "num_input_tokens_seen": 79911175, "step": 3708, "time_per_iteration": 2.7179131507873535 }, { "auxiliary_loss_clip": 0.01128276, "auxiliary_loss_mlp": 0.01042525, "balance_loss_clip": 1.05055118, "balance_loss_mlp": 1.02363038, "epoch": 0.22299714414549826, "flos": 30154657034880.0, "grad_norm": 1.8085596793383067, "language_loss": 0.80606776, "learning_rate": 3.6218255977309024e-06, "loss": 0.82777578, "num_input_tokens_seen": 79931875, "step": 3709, "time_per_iteration": 2.810605764389038 }, { "auxiliary_loss_clip": 0.01135044, "auxiliary_loss_mlp": 0.00777248, "balance_loss_clip": 1.0480969, "balance_loss_mlp": 1.0014261, "epoch": 0.22305726739816625, "flos": 23143134309120.0, "grad_norm": 2.100780376064183, "language_loss": 0.69068789, "learning_rate": 3.6215976668537787e-06, "loss": 0.70981085, "num_input_tokens_seen": 79952445, "step": 3710, "time_per_iteration": 2.7197980880737305 }, { "auxiliary_loss_clip": 0.01111671, "auxiliary_loss_mlp": 0.01050475, "balance_loss_clip": 1.04630041, "balance_loss_mlp": 1.03220057, "epoch": 0.22311739065083422, "flos": 19172061187200.0, "grad_norm": 2.1025491711486763, "language_loss": 0.90782154, "learning_rate": 3.6213696744855096e-06, "loss": 0.92944294, "num_input_tokens_seen": 79971030, "step": 3711, "time_per_iteration": 2.808014154434204 }, { "auxiliary_loss_clip": 0.01117969, "auxiliary_loss_mlp": 0.01059175, "balance_loss_clip": 1.04696095, "balance_loss_mlp": 1.03921938, "epoch": 0.22317751390350218, "flos": 13617757319040.0, "grad_norm": 6.2447945102939615, "language_loss": 0.89070308, "learning_rate": 3.6211416206347395e-06, "loss": 0.91247451, "num_input_tokens_seen": 79982085, "step": 3712, "time_per_iteration": 2.6701955795288086 }, { "auxiliary_loss_clip": 0.01150852, "auxiliary_loss_mlp": 0.01044271, "balance_loss_clip": 1.05445373, "balance_loss_mlp": 1.02627039, "epoch": 0.22323763715617015, "flos": 11029065068160.0, "grad_norm": 5.249819485386642, "language_loss": 0.75858659, "learning_rate": 3.620913505310117e-06, "loss": 0.78053784, "num_input_tokens_seen": 79997460, "step": 3713, "time_per_iteration": 2.5961148738861084 }, { "auxiliary_loss_clip": 0.01106588, "auxiliary_loss_mlp": 0.01043158, "balance_loss_clip": 1.05345535, "balance_loss_mlp": 1.0252645, "epoch": 0.22329776040883811, "flos": 41351531466240.0, "grad_norm": 1.7774284049242903, "language_loss": 0.62422931, "learning_rate": 3.6206853285202917e-06, "loss": 0.6457268, "num_input_tokens_seen": 80022450, "step": 3714, "time_per_iteration": 2.9655838012695312 }, { "auxiliary_loss_clip": 0.0112071, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.05258489, "balance_loss_mlp": 1.0163759, "epoch": 0.22335788366150608, "flos": 25119478477440.0, "grad_norm": 5.465931600334143, "language_loss": 0.79076529, "learning_rate": 3.6204570902739164e-06, "loss": 0.81230301, "num_input_tokens_seen": 80042100, "step": 3715, "time_per_iteration": 2.8040106296539307 }, { "auxiliary_loss_clip": 0.01113318, "auxiliary_loss_mlp": 0.01049585, "balance_loss_clip": 1.05601192, "balance_loss_mlp": 1.03176367, "epoch": 0.22341800691417404, "flos": 16983377769600.0, "grad_norm": 2.696607190089822, "language_loss": 0.77416688, "learning_rate": 3.620228790579645e-06, "loss": 0.79579592, "num_input_tokens_seen": 80059690, "step": 3716, "time_per_iteration": 2.721008777618408 }, { "auxiliary_loss_clip": 0.01123787, "auxiliary_loss_mlp": 0.01043954, "balance_loss_clip": 1.04860306, "balance_loss_mlp": 1.02644157, "epoch": 0.22347813016684204, "flos": 14136738975360.0, "grad_norm": 3.4762745813408884, "language_loss": 0.79258984, "learning_rate": 3.6200004294461367e-06, "loss": 0.81426722, "num_input_tokens_seen": 80076060, "step": 3717, "time_per_iteration": 2.724637746810913 }, { "auxiliary_loss_clip": 0.0107853, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.04485083, "balance_loss_mlp": 1.02390504, "epoch": 0.22353825341951, "flos": 23583147914880.0, "grad_norm": 1.9798483733973138, "language_loss": 0.67890245, "learning_rate": 3.6197720068820497e-06, "loss": 0.70011252, "num_input_tokens_seen": 80094760, "step": 3718, "time_per_iteration": 2.8178799152374268 }, { "auxiliary_loss_clip": 0.01128946, "auxiliary_loss_mlp": 0.01043035, "balance_loss_clip": 1.04887676, "balance_loss_mlp": 1.02374721, "epoch": 0.22359837667217797, "flos": 29824206888960.0, "grad_norm": 1.6261924310986715, "language_loss": 0.81046188, "learning_rate": 3.619543522896045e-06, "loss": 0.83218175, "num_input_tokens_seen": 80114475, "step": 3719, "time_per_iteration": 2.8068079948425293 }, { "auxiliary_loss_clip": 0.0112823, "auxiliary_loss_mlp": 0.0105526, "balance_loss_clip": 1.05054009, "balance_loss_mlp": 1.03555441, "epoch": 0.22365849992484593, "flos": 17603088140160.0, "grad_norm": 2.128611791985372, "language_loss": 0.86535168, "learning_rate": 3.6193149774967885e-06, "loss": 0.88718653, "num_input_tokens_seen": 80132920, "step": 3720, "time_per_iteration": 2.726252794265747 }, { "auxiliary_loss_clip": 0.01123833, "auxiliary_loss_mlp": 0.01039252, "balance_loss_clip": 1.05347347, "balance_loss_mlp": 1.0207628, "epoch": 0.2237186231775139, "flos": 22710949868160.0, "grad_norm": 1.725668609175168, "language_loss": 0.7471531, "learning_rate": 3.619086370692945e-06, "loss": 0.76878393, "num_input_tokens_seen": 80152845, "step": 3721, "time_per_iteration": 2.77329158782959 }, { "auxiliary_loss_clip": 0.01158005, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.05607998, "balance_loss_mlp": 1.02497673, "epoch": 0.22377874643018186, "flos": 13371518609280.0, "grad_norm": 3.166607303525693, "language_loss": 0.7957024, "learning_rate": 3.6188577024931844e-06, "loss": 0.8177169, "num_input_tokens_seen": 80170680, "step": 3722, "time_per_iteration": 2.7204909324645996 }, { "auxiliary_loss_clip": 0.01113056, "auxiliary_loss_mlp": 0.01041868, "balance_loss_clip": 1.0520618, "balance_loss_mlp": 1.02571511, "epoch": 0.22383886968284986, "flos": 17894970057600.0, "grad_norm": 2.0043774256219997, "language_loss": 0.82129884, "learning_rate": 3.618628972906178e-06, "loss": 0.84284806, "num_input_tokens_seen": 80189030, "step": 3723, "time_per_iteration": 2.7908549308776855 }, { "auxiliary_loss_clip": 0.01155309, "auxiliary_loss_mlp": 0.01046826, "balance_loss_clip": 1.05468059, "balance_loss_mlp": 1.02857494, "epoch": 0.22389899293551782, "flos": 23879123982720.0, "grad_norm": 2.0838579777085022, "language_loss": 0.84742224, "learning_rate": 3.6184001819405984e-06, "loss": 0.86944354, "num_input_tokens_seen": 80208365, "step": 3724, "time_per_iteration": 2.691678047180176 }, { "auxiliary_loss_clip": 0.01123425, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.0494504, "balance_loss_mlp": 1.02516866, "epoch": 0.2239591161881858, "flos": 27272430840960.0, "grad_norm": 1.76453761267329, "language_loss": 0.79456621, "learning_rate": 3.618171329605121e-06, "loss": 0.81622583, "num_input_tokens_seen": 80228685, "step": 3725, "time_per_iteration": 4.339299917221069 }, { "auxiliary_loss_clip": 0.01091555, "auxiliary_loss_mlp": 0.01043361, "balance_loss_clip": 1.05116296, "balance_loss_mlp": 1.02538443, "epoch": 0.22401923944085375, "flos": 22236857233920.0, "grad_norm": 1.776149940187026, "language_loss": 0.77333415, "learning_rate": 3.6179424159084254e-06, "loss": 0.79468334, "num_input_tokens_seen": 80247635, "step": 3726, "time_per_iteration": 4.320322275161743 }, { "auxiliary_loss_clip": 0.0115151, "auxiliary_loss_mlp": 0.01047267, "balance_loss_clip": 1.05424356, "balance_loss_mlp": 1.02664328, "epoch": 0.22407936269352172, "flos": 12053668521600.0, "grad_norm": 2.83844669603944, "language_loss": 0.72643399, "learning_rate": 3.6177134408591914e-06, "loss": 0.74842173, "num_input_tokens_seen": 80260045, "step": 3727, "time_per_iteration": 4.218656539916992 }, { "auxiliary_loss_clip": 0.01157504, "auxiliary_loss_mlp": 0.01043436, "balance_loss_clip": 1.0541296, "balance_loss_mlp": 1.02321815, "epoch": 0.22413948594618968, "flos": 19353553632000.0, "grad_norm": 2.250671737688348, "language_loss": 0.86600292, "learning_rate": 3.6174844044661013e-06, "loss": 0.88801229, "num_input_tokens_seen": 80277680, "step": 3728, "time_per_iteration": 2.650423765182495 }, { "auxiliary_loss_clip": 0.01122602, "auxiliary_loss_mlp": 0.01053562, "balance_loss_clip": 1.050982, "balance_loss_mlp": 1.03134131, "epoch": 0.22419960919885765, "flos": 24170000319360.0, "grad_norm": 2.1953419048873877, "language_loss": 0.80038953, "learning_rate": 3.6172553067378406e-06, "loss": 0.82215106, "num_input_tokens_seen": 80294795, "step": 3729, "time_per_iteration": 2.7553794384002686 }, { "auxiliary_loss_clip": 0.01126228, "auxiliary_loss_mlp": 0.01046911, "balance_loss_clip": 1.05183935, "balance_loss_mlp": 1.02992368, "epoch": 0.22425973245152564, "flos": 27378977558400.0, "grad_norm": 1.8211738544282683, "language_loss": 0.86968076, "learning_rate": 3.6170261476830964e-06, "loss": 0.89141214, "num_input_tokens_seen": 80315425, "step": 3730, "time_per_iteration": 2.8044395446777344 }, { "auxiliary_loss_clip": 0.01121982, "auxiliary_loss_mlp": 0.00775761, "balance_loss_clip": 1.04924226, "balance_loss_mlp": 1.00148201, "epoch": 0.2243198557041936, "flos": 13735652734080.0, "grad_norm": 2.1817469574553017, "language_loss": 0.73091185, "learning_rate": 3.616796927310559e-06, "loss": 0.74988931, "num_input_tokens_seen": 80333905, "step": 3731, "time_per_iteration": 2.764198064804077 }, { "auxiliary_loss_clip": 0.01127044, "auxiliary_loss_mlp": 0.0104235, "balance_loss_clip": 1.05654919, "balance_loss_mlp": 1.02467108, "epoch": 0.22437997895686157, "flos": 19530700531200.0, "grad_norm": 2.1924274894904787, "language_loss": 0.75427651, "learning_rate": 3.6165676456289195e-06, "loss": 0.77597046, "num_input_tokens_seen": 80352165, "step": 3732, "time_per_iteration": 4.544835090637207 }, { "auxiliary_loss_clip": 0.01155285, "auxiliary_loss_mlp": 0.01053522, "balance_loss_clip": 1.05655456, "balance_loss_mlp": 1.03560436, "epoch": 0.22444010220952954, "flos": 23696230907520.0, "grad_norm": 1.745203479087184, "language_loss": 0.88139856, "learning_rate": 3.616338302646873e-06, "loss": 0.90348667, "num_input_tokens_seen": 80371305, "step": 3733, "time_per_iteration": 2.7097933292388916 }, { "auxiliary_loss_clip": 0.0110922, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.05094051, "balance_loss_mlp": 1.02264953, "epoch": 0.2245002254621975, "flos": 22382905933440.0, "grad_norm": 1.6873732683679492, "language_loss": 0.84643197, "learning_rate": 3.6161088983731166e-06, "loss": 0.86793089, "num_input_tokens_seen": 80391020, "step": 3734, "time_per_iteration": 2.7647547721862793 }, { "auxiliary_loss_clip": 0.0113181, "auxiliary_loss_mlp": 0.01049327, "balance_loss_clip": 1.05362856, "balance_loss_mlp": 1.03149319, "epoch": 0.22456034871486547, "flos": 26942303917440.0, "grad_norm": 1.774553175519815, "language_loss": 0.7679311, "learning_rate": 3.6158794328163482e-06, "loss": 0.78974247, "num_input_tokens_seen": 80411365, "step": 3735, "time_per_iteration": 2.7682430744171143 }, { "auxiliary_loss_clip": 0.01138858, "auxiliary_loss_mlp": 0.01045746, "balance_loss_clip": 1.06029248, "balance_loss_mlp": 1.02927136, "epoch": 0.22462047196753343, "flos": 28983538005120.0, "grad_norm": 1.671324371931155, "language_loss": 0.842767, "learning_rate": 3.6156499059852702e-06, "loss": 0.86461306, "num_input_tokens_seen": 80431075, "step": 3736, "time_per_iteration": 3.009368419647217 }, { "auxiliary_loss_clip": 0.0111279, "auxiliary_loss_mlp": 0.01044111, "balance_loss_clip": 1.05240226, "balance_loss_mlp": 1.02677774, "epoch": 0.22468059522020142, "flos": 20011329440640.0, "grad_norm": 1.8971112354532307, "language_loss": 0.86643183, "learning_rate": 3.615420317888586e-06, "loss": 0.88800085, "num_input_tokens_seen": 80449240, "step": 3737, "time_per_iteration": 2.792965888977051 }, { "auxiliary_loss_clip": 0.0115891, "auxiliary_loss_mlp": 0.0104972, "balance_loss_clip": 1.05792093, "balance_loss_mlp": 1.03051496, "epoch": 0.2247407184728694, "flos": 29314239546240.0, "grad_norm": 6.664079021041442, "language_loss": 0.79027152, "learning_rate": 3.6151906685350006e-06, "loss": 0.81235784, "num_input_tokens_seen": 80467900, "step": 3738, "time_per_iteration": 2.716878652572632 }, { "auxiliary_loss_clip": 0.01122737, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.0520165, "balance_loss_mlp": 1.0315063, "epoch": 0.22480084172553735, "flos": 22310366417280.0, "grad_norm": 1.837059456311059, "language_loss": 0.76693523, "learning_rate": 3.614960957933224e-06, "loss": 0.78864253, "num_input_tokens_seen": 80487100, "step": 3739, "time_per_iteration": 2.743222713470459 }, { "auxiliary_loss_clip": 0.01116493, "auxiliary_loss_mlp": 0.01049772, "balance_loss_clip": 1.05008686, "balance_loss_mlp": 1.03011417, "epoch": 0.22486096497820532, "flos": 25591272641280.0, "grad_norm": 2.2924613412630133, "language_loss": 0.74577379, "learning_rate": 3.6147311860919655e-06, "loss": 0.7674365, "num_input_tokens_seen": 80508625, "step": 3740, "time_per_iteration": 2.7339253425598145 }, { "auxiliary_loss_clip": 0.01152276, "auxiliary_loss_mlp": 0.01045147, "balance_loss_clip": 1.05556941, "balance_loss_mlp": 1.02728927, "epoch": 0.22492108823087328, "flos": 17639824775040.0, "grad_norm": 1.9086069443180373, "language_loss": 0.75610423, "learning_rate": 3.614501353019939e-06, "loss": 0.77807844, "num_input_tokens_seen": 80527345, "step": 3741, "time_per_iteration": 2.7347571849823 }, { "auxiliary_loss_clip": 0.01133279, "auxiliary_loss_mlp": 0.01039745, "balance_loss_clip": 1.05599904, "balance_loss_mlp": 1.02316284, "epoch": 0.22498121148354125, "flos": 16034653797120.0, "grad_norm": 1.7754272123040742, "language_loss": 0.87332213, "learning_rate": 3.6142714587258592e-06, "loss": 0.89505225, "num_input_tokens_seen": 80545545, "step": 3742, "time_per_iteration": 2.702103614807129 }, { "auxiliary_loss_clip": 0.01095068, "auxiliary_loss_mlp": 0.01053093, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.03398395, "epoch": 0.22504133473620924, "flos": 24023772051840.0, "grad_norm": 2.1035678371185256, "language_loss": 0.812823, "learning_rate": 3.614041503218444e-06, "loss": 0.83430457, "num_input_tokens_seen": 80565040, "step": 3743, "time_per_iteration": 2.777566909790039 }, { "auxiliary_loss_clip": 0.01142483, "auxiliary_loss_mlp": 0.01040692, "balance_loss_clip": 1.05282855, "balance_loss_mlp": 1.02319252, "epoch": 0.2251014579888772, "flos": 16763963541120.0, "grad_norm": 2.836562973763206, "language_loss": 0.63821399, "learning_rate": 3.6138114865064134e-06, "loss": 0.66004574, "num_input_tokens_seen": 80582815, "step": 3744, "time_per_iteration": 2.6738698482513428 }, { "auxiliary_loss_clip": 0.01139201, "auxiliary_loss_mlp": 0.01043137, "balance_loss_clip": 1.05523586, "balance_loss_mlp": 1.0255779, "epoch": 0.22516158124154517, "flos": 13991013498240.0, "grad_norm": 4.405698565190268, "language_loss": 0.76340199, "learning_rate": 3.613581408598489e-06, "loss": 0.78522527, "num_input_tokens_seen": 80600865, "step": 3745, "time_per_iteration": 2.8423044681549072 }, { "auxiliary_loss_clip": 0.01116037, "auxiliary_loss_mlp": 0.0104407, "balance_loss_clip": 1.04906797, "balance_loss_mlp": 1.0267489, "epoch": 0.22522170449421314, "flos": 14390016750720.0, "grad_norm": 7.51155110796741, "language_loss": 0.8056733, "learning_rate": 3.6133512695033965e-06, "loss": 0.82727438, "num_input_tokens_seen": 80617455, "step": 3746, "time_per_iteration": 2.743417739868164 }, { "auxiliary_loss_clip": 0.01142091, "auxiliary_loss_mlp": 0.01050597, "balance_loss_clip": 1.05323768, "balance_loss_mlp": 1.0328114, "epoch": 0.2252818277468811, "flos": 23805542972160.0, "grad_norm": 2.6189948571262116, "language_loss": 0.86153656, "learning_rate": 3.613121069229862e-06, "loss": 0.88346344, "num_input_tokens_seen": 80635125, "step": 3747, "time_per_iteration": 2.7622148990631104 }, { "auxiliary_loss_clip": 0.01138021, "auxiliary_loss_mlp": 0.0077598, "balance_loss_clip": 1.05126321, "balance_loss_mlp": 1.00154519, "epoch": 0.22534195099954907, "flos": 24718033100160.0, "grad_norm": 2.3477587169419483, "language_loss": 0.76400602, "learning_rate": 3.6128908077866145e-06, "loss": 0.78314602, "num_input_tokens_seen": 80656370, "step": 3748, "time_per_iteration": 2.7347261905670166 }, { "auxiliary_loss_clip": 0.01156837, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.05704546, "balance_loss_mlp": 1.02525926, "epoch": 0.22540207425221703, "flos": 21032341534080.0, "grad_norm": 1.5503962030073002, "language_loss": 0.7984724, "learning_rate": 3.6126604851823864e-06, "loss": 0.82047117, "num_input_tokens_seen": 80676495, "step": 3749, "time_per_iteration": 2.6900558471679688 }, { "auxiliary_loss_clip": 0.01123701, "auxiliary_loss_mlp": 0.01041028, "balance_loss_clip": 1.05050755, "balance_loss_mlp": 1.02436304, "epoch": 0.22546219750488503, "flos": 19390362094080.0, "grad_norm": 3.015206251853355, "language_loss": 0.79585081, "learning_rate": 3.6124301014259108e-06, "loss": 0.81749809, "num_input_tokens_seen": 80694755, "step": 3750, "time_per_iteration": 2.727651596069336 }, { "auxiliary_loss_clip": 0.01097337, "auxiliary_loss_mlp": 0.01055462, "balance_loss_clip": 1.05065274, "balance_loss_mlp": 1.03756917, "epoch": 0.225522320757553, "flos": 25192628524800.0, "grad_norm": 2.662961533862713, "language_loss": 0.82433236, "learning_rate": 3.6121996565259244e-06, "loss": 0.84586036, "num_input_tokens_seen": 80713670, "step": 3751, "time_per_iteration": 2.827995538711548 }, { "auxiliary_loss_clip": 0.01121046, "auxiliary_loss_mlp": 0.01046103, "balance_loss_clip": 1.05429292, "balance_loss_mlp": 1.02828133, "epoch": 0.22558244401022096, "flos": 17163110448000.0, "grad_norm": 2.0142745824369315, "language_loss": 0.83813727, "learning_rate": 3.611969150491165e-06, "loss": 0.8598088, "num_input_tokens_seen": 80731450, "step": 3752, "time_per_iteration": 2.78725266456604 }, { "auxiliary_loss_clip": 0.01152116, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.05584741, "balance_loss_mlp": 1.02123034, "epoch": 0.22564256726288892, "flos": 15231008856960.0, "grad_norm": 1.9292267305553392, "language_loss": 0.78254855, "learning_rate": 3.611738583330375e-06, "loss": 0.80444479, "num_input_tokens_seen": 80748415, "step": 3753, "time_per_iteration": 2.7116169929504395 }, { "auxiliary_loss_clip": 0.01126321, "auxiliary_loss_mlp": 0.0104341, "balance_loss_clip": 1.05120027, "balance_loss_mlp": 1.02546871, "epoch": 0.2257026905155569, "flos": 34568652764160.0, "grad_norm": 1.8777790089425805, "language_loss": 0.78391469, "learning_rate": 3.611507955052295e-06, "loss": 0.80561191, "num_input_tokens_seen": 80770835, "step": 3754, "time_per_iteration": 2.91738224029541 }, { "auxiliary_loss_clip": 0.01128102, "auxiliary_loss_mlp": 0.01048192, "balance_loss_clip": 1.05648673, "balance_loss_mlp": 1.03040624, "epoch": 0.22576281376822485, "flos": 19938430788480.0, "grad_norm": 1.9337610105869587, "language_loss": 0.70648986, "learning_rate": 3.6112772656656727e-06, "loss": 0.72825277, "num_input_tokens_seen": 80787840, "step": 3755, "time_per_iteration": 2.7427992820739746 }, { "auxiliary_loss_clip": 0.01126515, "auxiliary_loss_mlp": 0.01053366, "balance_loss_clip": 1.05531752, "balance_loss_mlp": 1.03559232, "epoch": 0.22582293702089282, "flos": 24602005192320.0, "grad_norm": 3.9817469401483216, "language_loss": 0.77865845, "learning_rate": 3.6110465151792547e-06, "loss": 0.80045724, "num_input_tokens_seen": 80806335, "step": 3756, "time_per_iteration": 2.7879996299743652 }, { "auxiliary_loss_clip": 0.01132066, "auxiliary_loss_mlp": 0.01044227, "balance_loss_clip": 1.0559032, "balance_loss_mlp": 1.0261426, "epoch": 0.2258830602735608, "flos": 23035438356480.0, "grad_norm": 1.801741818571408, "language_loss": 0.82615864, "learning_rate": 3.6108157036017916e-06, "loss": 0.84792161, "num_input_tokens_seen": 80825355, "step": 3757, "time_per_iteration": 2.685218095779419 }, { "auxiliary_loss_clip": 0.01140048, "auxiliary_loss_mlp": 0.01047555, "balance_loss_clip": 1.05321026, "balance_loss_mlp": 1.02917302, "epoch": 0.22594318352622877, "flos": 22158427887360.0, "grad_norm": 2.3786564016745495, "language_loss": 0.73007452, "learning_rate": 3.6105848309420358e-06, "loss": 0.7519505, "num_input_tokens_seen": 80842570, "step": 3758, "time_per_iteration": 2.6716878414154053 }, { "auxiliary_loss_clip": 0.01137739, "auxiliary_loss_mlp": 0.01048984, "balance_loss_clip": 1.0577718, "balance_loss_mlp": 1.03019619, "epoch": 0.22600330677889674, "flos": 20594303176320.0, "grad_norm": 2.226232476294752, "language_loss": 0.77150333, "learning_rate": 3.6103538972087412e-06, "loss": 0.79337054, "num_input_tokens_seen": 80858745, "step": 3759, "time_per_iteration": 2.787487030029297 }, { "auxiliary_loss_clip": 0.01104852, "auxiliary_loss_mlp": 0.01043473, "balance_loss_clip": 1.04747176, "balance_loss_mlp": 1.02507949, "epoch": 0.2260634300315647, "flos": 35659798162560.0, "grad_norm": 1.6253921855068183, "language_loss": 0.78189945, "learning_rate": 3.6101229024106655e-06, "loss": 0.80338269, "num_input_tokens_seen": 80880085, "step": 3760, "time_per_iteration": 2.8760766983032227 }, { "auxiliary_loss_clip": 0.01042849, "auxiliary_loss_mlp": 0.01009599, "balance_loss_clip": 1.03235281, "balance_loss_mlp": 1.00633264, "epoch": 0.22612355328423267, "flos": 72090455126400.0, "grad_norm": 0.9481639821873915, "language_loss": 0.60083473, "learning_rate": 3.609891846556569e-06, "loss": 0.62135923, "num_input_tokens_seen": 80937660, "step": 3761, "time_per_iteration": 3.2168753147125244 }, { "auxiliary_loss_clip": 0.01114836, "auxiliary_loss_mlp": 0.01051216, "balance_loss_clip": 1.0493567, "balance_loss_mlp": 1.03295338, "epoch": 0.22618367653690064, "flos": 22783776693120.0, "grad_norm": 2.3328987294287047, "language_loss": 0.76767397, "learning_rate": 3.609660729655211e-06, "loss": 0.78933448, "num_input_tokens_seen": 80956265, "step": 3762, "time_per_iteration": 2.8012428283691406 }, { "auxiliary_loss_clip": 0.01128732, "auxiliary_loss_mlp": 0.01042327, "balance_loss_clip": 1.05266595, "balance_loss_mlp": 1.02190685, "epoch": 0.22624379978956863, "flos": 20448254476800.0, "grad_norm": 2.7297545785195907, "language_loss": 0.79000401, "learning_rate": 3.6094295517153573e-06, "loss": 0.81171465, "num_input_tokens_seen": 80975185, "step": 3763, "time_per_iteration": 2.7217857837677 }, { "auxiliary_loss_clip": 0.01142679, "auxiliary_loss_mlp": 0.01057425, "balance_loss_clip": 1.0557214, "balance_loss_mlp": 1.03835177, "epoch": 0.2263039230422366, "flos": 17494314779520.0, "grad_norm": 31.68022075556768, "language_loss": 0.91241246, "learning_rate": 3.6091983127457743e-06, "loss": 0.93441343, "num_input_tokens_seen": 80992830, "step": 3764, "time_per_iteration": 4.232046842575073 }, { "auxiliary_loss_clip": 0.01131876, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.05196834, "balance_loss_mlp": 1.0367409, "epoch": 0.22636404629490456, "flos": 28329748606080.0, "grad_norm": 1.9816130101247444, "language_loss": 0.75202596, "learning_rate": 3.6089670127552293e-06, "loss": 0.77389991, "num_input_tokens_seen": 81013675, "step": 3765, "time_per_iteration": 4.291628122329712 }, { "auxiliary_loss_clip": 0.01140284, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.05632913, "balance_loss_mlp": 1.02942574, "epoch": 0.22642416954757252, "flos": 17489143221120.0, "grad_norm": 2.1881182413466176, "language_loss": 0.8966549, "learning_rate": 3.608735651752494e-06, "loss": 0.91852784, "num_input_tokens_seen": 81030345, "step": 3766, "time_per_iteration": 2.6462960243225098 }, { "auxiliary_loss_clip": 0.01126107, "auxiliary_loss_mlp": 0.01047462, "balance_loss_clip": 1.05579042, "balance_loss_mlp": 1.02950931, "epoch": 0.2264842928002405, "flos": 24384530298240.0, "grad_norm": 1.6297384952566736, "language_loss": 0.74816859, "learning_rate": 3.6085042297463417e-06, "loss": 0.76990426, "num_input_tokens_seen": 81051000, "step": 3767, "time_per_iteration": 4.181917667388916 }, { "auxiliary_loss_clip": 0.01139766, "auxiliary_loss_mlp": 0.01048037, "balance_loss_clip": 1.05206823, "balance_loss_mlp": 1.02981031, "epoch": 0.22654441605290845, "flos": 19830519354240.0, "grad_norm": 1.6389844555489992, "language_loss": 0.71764815, "learning_rate": 3.6082727467455477e-06, "loss": 0.73952615, "num_input_tokens_seen": 81071205, "step": 3768, "time_per_iteration": 2.6622893810272217 }, { "auxiliary_loss_clip": 0.01143239, "auxiliary_loss_mlp": 0.01057198, "balance_loss_clip": 1.05766034, "balance_loss_mlp": 1.03895879, "epoch": 0.22660453930557642, "flos": 27454569730560.0, "grad_norm": 1.5883345705718652, "language_loss": 0.78320074, "learning_rate": 3.6080412027588905e-06, "loss": 0.80520505, "num_input_tokens_seen": 81091880, "step": 3769, "time_per_iteration": 2.692366123199463 }, { "auxiliary_loss_clip": 0.01121985, "auxiliary_loss_mlp": 0.01045951, "balance_loss_clip": 1.0452522, "balance_loss_mlp": 1.02712774, "epoch": 0.2266646625582444, "flos": 23988148738560.0, "grad_norm": 1.8427419299971495, "language_loss": 0.6877771, "learning_rate": 3.6078095977951488e-06, "loss": 0.70945644, "num_input_tokens_seen": 81113290, "step": 3770, "time_per_iteration": 2.7605137825012207 }, { "auxiliary_loss_clip": 0.01155061, "auxiliary_loss_mlp": 0.01053072, "balance_loss_clip": 1.0551908, "balance_loss_mlp": 1.03454649, "epoch": 0.22672478581091238, "flos": 26028054023040.0, "grad_norm": 1.6594447480271795, "language_loss": 0.80540276, "learning_rate": 3.6075779318631067e-06, "loss": 0.82748413, "num_input_tokens_seen": 81133535, "step": 3771, "time_per_iteration": 4.265140771865845 }, { "auxiliary_loss_clip": 0.0110854, "auxiliary_loss_mlp": 0.01058177, "balance_loss_clip": 1.04661536, "balance_loss_mlp": 1.04091501, "epoch": 0.22678490906358034, "flos": 23841812730240.0, "grad_norm": 1.6696234119475444, "language_loss": 0.78947794, "learning_rate": 3.6073462049715486e-06, "loss": 0.81114507, "num_input_tokens_seen": 81154650, "step": 3772, "time_per_iteration": 2.7325806617736816 }, { "auxiliary_loss_clip": 0.01036659, "auxiliary_loss_mlp": 0.0100656, "balance_loss_clip": 1.0461247, "balance_loss_mlp": 1.00336492, "epoch": 0.2268450323162483, "flos": 65048088574080.0, "grad_norm": 0.653194629863103, "language_loss": 0.54380804, "learning_rate": 3.607114417129261e-06, "loss": 0.56424022, "num_input_tokens_seen": 81221240, "step": 3773, "time_per_iteration": 3.3729567527770996 }, { "auxiliary_loss_clip": 0.0111914, "auxiliary_loss_mlp": 0.01046238, "balance_loss_clip": 1.05257821, "balance_loss_mlp": 1.02851129, "epoch": 0.22690515556891627, "flos": 22526081544960.0, "grad_norm": 1.81548541557593, "language_loss": 0.70406783, "learning_rate": 3.6068825683450334e-06, "loss": 0.7257216, "num_input_tokens_seen": 81241520, "step": 3774, "time_per_iteration": 2.7159364223480225 }, { "auxiliary_loss_clip": 0.01125586, "auxiliary_loss_mlp": 0.01046805, "balance_loss_clip": 1.05404115, "balance_loss_mlp": 1.02929282, "epoch": 0.22696527882158424, "flos": 18223444955520.0, "grad_norm": 2.2603412716687523, "language_loss": 0.74377871, "learning_rate": 3.606650658627658e-06, "loss": 0.76550257, "num_input_tokens_seen": 81256825, "step": 3775, "time_per_iteration": 2.7857720851898193 }, { "auxiliary_loss_clip": 0.01152024, "auxiliary_loss_mlp": 0.01045868, "balance_loss_clip": 1.05331159, "balance_loss_mlp": 1.02915478, "epoch": 0.22702540207425223, "flos": 17019252478080.0, "grad_norm": 1.8428958927362264, "language_loss": 0.81582248, "learning_rate": 3.606418687985928e-06, "loss": 0.83780146, "num_input_tokens_seen": 81275695, "step": 3776, "time_per_iteration": 2.6054935455322266 }, { "auxiliary_loss_clip": 0.01135081, "auxiliary_loss_mlp": 0.01043769, "balance_loss_clip": 1.05466735, "balance_loss_mlp": 1.02654314, "epoch": 0.2270855253269202, "flos": 21325731822720.0, "grad_norm": 1.7711090356153572, "language_loss": 0.82893199, "learning_rate": 3.606186656428641e-06, "loss": 0.85072052, "num_input_tokens_seen": 81294920, "step": 3777, "time_per_iteration": 2.722621202468872 }, { "auxiliary_loss_clip": 0.01127657, "auxiliary_loss_mlp": 0.01042436, "balance_loss_clip": 1.05438471, "balance_loss_mlp": 1.02435195, "epoch": 0.22714564857958816, "flos": 23550469516800.0, "grad_norm": 2.3905711679994295, "language_loss": 0.72538829, "learning_rate": 3.6059545639645955e-06, "loss": 0.74708927, "num_input_tokens_seen": 81314275, "step": 3778, "time_per_iteration": 2.730919599533081 }, { "auxiliary_loss_clip": 0.01112853, "auxiliary_loss_mlp": 0.01040216, "balance_loss_clip": 1.05304575, "balance_loss_mlp": 1.02241838, "epoch": 0.22720577183225613, "flos": 25989880844160.0, "grad_norm": 2.4150679449588535, "language_loss": 0.64176035, "learning_rate": 3.605722410602591e-06, "loss": 0.66329098, "num_input_tokens_seen": 81333890, "step": 3779, "time_per_iteration": 2.7663822174072266 }, { "auxiliary_loss_clip": 0.01132359, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.05292106, "balance_loss_mlp": 1.02928495, "epoch": 0.2272658950849241, "flos": 20814076540800.0, "grad_norm": 1.6627524387617407, "language_loss": 0.70659381, "learning_rate": 3.6054901963514323e-06, "loss": 0.72839016, "num_input_tokens_seen": 81353640, "step": 3780, "time_per_iteration": 2.666081666946411 }, { "auxiliary_loss_clip": 0.0114157, "auxiliary_loss_mlp": 0.01046965, "balance_loss_clip": 1.05450416, "balance_loss_mlp": 1.02880907, "epoch": 0.22732601833759206, "flos": 23909324342400.0, "grad_norm": 1.783300050979337, "language_loss": 0.89418924, "learning_rate": 3.6052579212199246e-06, "loss": 0.91607457, "num_input_tokens_seen": 81371595, "step": 3781, "time_per_iteration": 2.686478614807129 }, { "auxiliary_loss_clip": 0.01152428, "auxiliary_loss_mlp": 0.01041162, "balance_loss_clip": 1.05349672, "balance_loss_mlp": 1.02354264, "epoch": 0.22738614159026002, "flos": 15924407978880.0, "grad_norm": 19.977426185094338, "language_loss": 0.74404943, "learning_rate": 3.6050255852168753e-06, "loss": 0.76598531, "num_input_tokens_seen": 81388435, "step": 3782, "time_per_iteration": 2.5633177757263184 }, { "auxiliary_loss_clip": 0.01129007, "auxiliary_loss_mlp": 0.01045443, "balance_loss_clip": 1.05195391, "balance_loss_mlp": 1.02926588, "epoch": 0.22744626484292801, "flos": 24205515891840.0, "grad_norm": 2.051662638457334, "language_loss": 0.82665169, "learning_rate": 3.604793188351095e-06, "loss": 0.84839618, "num_input_tokens_seen": 81410195, "step": 3783, "time_per_iteration": 2.742572069168091 }, { "auxiliary_loss_clip": 0.01129724, "auxiliary_loss_mlp": 0.01043254, "balance_loss_clip": 1.055516, "balance_loss_mlp": 1.02495527, "epoch": 0.22750638809559598, "flos": 24791614110720.0, "grad_norm": 2.0126417567412256, "language_loss": 0.75996566, "learning_rate": 3.6045607306313964e-06, "loss": 0.78169543, "num_input_tokens_seen": 81430060, "step": 3784, "time_per_iteration": 2.7283668518066406 }, { "auxiliary_loss_clip": 0.01148666, "auxiliary_loss_mlp": 0.01041397, "balance_loss_clip": 1.05224681, "balance_loss_mlp": 1.02382576, "epoch": 0.22756651134826394, "flos": 22236498097920.0, "grad_norm": 1.784429661746796, "language_loss": 0.7105484, "learning_rate": 3.604328212066594e-06, "loss": 0.73244894, "num_input_tokens_seen": 81447375, "step": 3785, "time_per_iteration": 2.627401351928711 }, { "auxiliary_loss_clip": 0.01042691, "auxiliary_loss_mlp": 0.0101642, "balance_loss_clip": 1.03303862, "balance_loss_mlp": 1.01427412, "epoch": 0.2276266346009319, "flos": 62707466626560.0, "grad_norm": 0.8323137639565091, "language_loss": 0.6189881, "learning_rate": 3.6040956326655047e-06, "loss": 0.63957924, "num_input_tokens_seen": 81505235, "step": 3786, "time_per_iteration": 3.321380376815796 }, { "auxiliary_loss_clip": 0.01135149, "auxiliary_loss_mlp": 0.01044526, "balance_loss_clip": 1.0540669, "balance_loss_mlp": 1.02645397, "epoch": 0.22768675785359987, "flos": 18613936684800.0, "grad_norm": 2.677223616893363, "language_loss": 0.86047274, "learning_rate": 3.6038629924369486e-06, "loss": 0.8822695, "num_input_tokens_seen": 81518685, "step": 3787, "time_per_iteration": 2.72554349899292 }, { "auxiliary_loss_clip": 0.01129718, "auxiliary_loss_mlp": 0.01039908, "balance_loss_clip": 1.05296564, "balance_loss_mlp": 1.02323031, "epoch": 0.22774688110626784, "flos": 26870195364480.0, "grad_norm": 1.361320938410825, "language_loss": 0.72755021, "learning_rate": 3.6036302913897474e-06, "loss": 0.74924648, "num_input_tokens_seen": 81538940, "step": 3788, "time_per_iteration": 2.7717456817626953 }, { "auxiliary_loss_clip": 0.01125411, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.05099773, "balance_loss_mlp": 1.01800895, "epoch": 0.2278070043589358, "flos": 15553593924480.0, "grad_norm": 2.510042380876752, "language_loss": 0.67785919, "learning_rate": 3.6033975295327243e-06, "loss": 0.69946766, "num_input_tokens_seen": 81555525, "step": 3789, "time_per_iteration": 2.6492021083831787 }, { "auxiliary_loss_clip": 0.01114067, "auxiliary_loss_mlp": 0.01042939, "balance_loss_clip": 1.04577208, "balance_loss_mlp": 1.0244137, "epoch": 0.2278671276116038, "flos": 22416805393920.0, "grad_norm": 2.807016388048184, "language_loss": 0.76026487, "learning_rate": 3.6031647068747065e-06, "loss": 0.7818349, "num_input_tokens_seen": 81576305, "step": 3790, "time_per_iteration": 2.789419412612915 }, { "auxiliary_loss_clip": 0.01094774, "auxiliary_loss_mlp": 0.01043575, "balance_loss_clip": 1.04942632, "balance_loss_mlp": 1.02388144, "epoch": 0.22792725086427176, "flos": 20631363033600.0, "grad_norm": 2.1998519418279843, "language_loss": 0.9070015, "learning_rate": 3.602931823424522e-06, "loss": 0.92838502, "num_input_tokens_seen": 81594115, "step": 3791, "time_per_iteration": 2.74957275390625 }, { "auxiliary_loss_clip": 0.01143903, "auxiliary_loss_mlp": 0.01039768, "balance_loss_clip": 1.05332911, "balance_loss_mlp": 1.02229166, "epoch": 0.22798737411693973, "flos": 31428946903680.0, "grad_norm": 1.6288404079645773, "language_loss": 0.82029706, "learning_rate": 3.6026988791910026e-06, "loss": 0.84213376, "num_input_tokens_seen": 81615355, "step": 3792, "time_per_iteration": 2.7578563690185547 }, { "auxiliary_loss_clip": 0.01074793, "auxiliary_loss_mlp": 0.01002047, "balance_loss_clip": 1.03528738, "balance_loss_mlp": 0.99944824, "epoch": 0.2280474973696077, "flos": 52396685827200.0, "grad_norm": 1.1490057531785423, "language_loss": 0.65688264, "learning_rate": 3.602465874182981e-06, "loss": 0.67765105, "num_input_tokens_seen": 81662075, "step": 3793, "time_per_iteration": 2.892385959625244 }, { "auxiliary_loss_clip": 0.01156846, "auxiliary_loss_mlp": 0.01048751, "balance_loss_clip": 1.05509233, "balance_loss_mlp": 1.03063166, "epoch": 0.22810762062227566, "flos": 26396066816640.0, "grad_norm": 2.315054268007893, "language_loss": 0.77095032, "learning_rate": 3.602232808409293e-06, "loss": 0.79300624, "num_input_tokens_seen": 81681625, "step": 3794, "time_per_iteration": 2.6432933807373047 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.0104554, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.02560771, "epoch": 0.22816774387494362, "flos": 25630271832960.0, "grad_norm": 2.8263872836139194, "language_loss": 0.80649161, "learning_rate": 3.6019996818787755e-06, "loss": 0.82801497, "num_input_tokens_seen": 81701170, "step": 3795, "time_per_iteration": 2.748461961746216 }, { "auxiliary_loss_clip": 0.01136851, "auxiliary_loss_mlp": 0.01049098, "balance_loss_clip": 1.0527277, "balance_loss_mlp": 1.03194404, "epoch": 0.22822786712761162, "flos": 22451602694400.0, "grad_norm": 1.970346796529307, "language_loss": 0.77348727, "learning_rate": 3.6017664946002704e-06, "loss": 0.79534674, "num_input_tokens_seen": 81721265, "step": 3796, "time_per_iteration": 2.6720409393310547 }, { "auxiliary_loss_clip": 0.01111647, "auxiliary_loss_mlp": 0.0077572, "balance_loss_clip": 1.04920197, "balance_loss_mlp": 1.00161827, "epoch": 0.22828799038027958, "flos": 12202554395520.0, "grad_norm": 3.9384070064251793, "language_loss": 0.95837742, "learning_rate": 3.6015332465826188e-06, "loss": 0.97725105, "num_input_tokens_seen": 81736565, "step": 3797, "time_per_iteration": 2.730684995651245 }, { "auxiliary_loss_clip": 0.01140956, "auxiliary_loss_mlp": 0.00774906, "balance_loss_clip": 1.05310869, "balance_loss_mlp": 1.00178146, "epoch": 0.22834811363294755, "flos": 22085708803200.0, "grad_norm": 2.215225796779507, "language_loss": 0.81875294, "learning_rate": 3.601299937834666e-06, "loss": 0.83791155, "num_input_tokens_seen": 81756240, "step": 3798, "time_per_iteration": 2.7082717418670654 }, { "auxiliary_loss_clip": 0.01113838, "auxiliary_loss_mlp": 0.01041342, "balance_loss_clip": 1.04808974, "balance_loss_mlp": 1.02263761, "epoch": 0.2284082368856155, "flos": 24860634094080.0, "grad_norm": 2.1089113145856344, "language_loss": 0.78796971, "learning_rate": 3.6010665683652596e-06, "loss": 0.8095215, "num_input_tokens_seen": 81775720, "step": 3799, "time_per_iteration": 2.7810587882995605 }, { "auxiliary_loss_clip": 0.01121546, "auxiliary_loss_mlp": 0.01055329, "balance_loss_clip": 1.04926765, "balance_loss_mlp": 1.03627968, "epoch": 0.22846836013828348, "flos": 23292882109440.0, "grad_norm": 1.7973625036918341, "language_loss": 0.75191152, "learning_rate": 3.6008331381832484e-06, "loss": 0.77368033, "num_input_tokens_seen": 81795830, "step": 3800, "time_per_iteration": 2.7185163497924805 }, { "auxiliary_loss_clip": 0.01121477, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.04833913, "balance_loss_mlp": 1.02235246, "epoch": 0.22852848339095144, "flos": 27416288810880.0, "grad_norm": 1.7410667809724167, "language_loss": 0.64073247, "learning_rate": 3.600599647297484e-06, "loss": 0.66232693, "num_input_tokens_seen": 81815745, "step": 3801, "time_per_iteration": 2.7509078979492188 }, { "auxiliary_loss_clip": 0.01129432, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.05498147, "balance_loss_mlp": 1.02301216, "epoch": 0.2285886066436194, "flos": 26321157002880.0, "grad_norm": 1.6732672610702524, "language_loss": 0.81560862, "learning_rate": 3.60036609571682e-06, "loss": 0.83729029, "num_input_tokens_seen": 81835155, "step": 3802, "time_per_iteration": 2.7188339233398438 }, { "auxiliary_loss_clip": 0.01126952, "auxiliary_loss_mlp": 0.0105215, "balance_loss_clip": 1.05203629, "balance_loss_mlp": 1.0342809, "epoch": 0.2286487298962874, "flos": 29716475022720.0, "grad_norm": 2.0652844737971625, "language_loss": 0.78909743, "learning_rate": 3.600132483450114e-06, "loss": 0.81088841, "num_input_tokens_seen": 81855655, "step": 3803, "time_per_iteration": 2.7760777473449707 }, { "auxiliary_loss_clip": 0.01109356, "auxiliary_loss_mlp": 0.01043096, "balance_loss_clip": 1.04399478, "balance_loss_mlp": 1.02511966, "epoch": 0.22870885314895537, "flos": 21287199507840.0, "grad_norm": 1.7519930287683254, "language_loss": 0.84902716, "learning_rate": 3.5998988105062235e-06, "loss": 0.87055165, "num_input_tokens_seen": 81876385, "step": 3804, "time_per_iteration": 5.891911745071411 }, { "auxiliary_loss_clip": 0.01141965, "auxiliary_loss_mlp": 0.01040951, "balance_loss_clip": 1.05229163, "balance_loss_mlp": 1.02440476, "epoch": 0.22876897640162333, "flos": 14939450161920.0, "grad_norm": 2.045415026345325, "language_loss": 0.76673448, "learning_rate": 3.59966507689401e-06, "loss": 0.78856367, "num_input_tokens_seen": 81893225, "step": 3805, "time_per_iteration": 2.643104076385498 }, { "auxiliary_loss_clip": 0.0112853, "auxiliary_loss_mlp": 0.00775286, "balance_loss_clip": 1.05192351, "balance_loss_mlp": 1.00156116, "epoch": 0.2288290996542913, "flos": 18113917409280.0, "grad_norm": 2.368547935700865, "language_loss": 0.78250653, "learning_rate": 3.5994312826223363e-06, "loss": 0.80154467, "num_input_tokens_seen": 81911350, "step": 3806, "time_per_iteration": 4.312817335128784 }, { "auxiliary_loss_clip": 0.01123441, "auxiliary_loss_mlp": 0.01052484, "balance_loss_clip": 1.05244482, "balance_loss_mlp": 1.03282619, "epoch": 0.22888922290695926, "flos": 39855457071360.0, "grad_norm": 2.0706298183861, "language_loss": 0.700813, "learning_rate": 3.5991974277000684e-06, "loss": 0.72257227, "num_input_tokens_seen": 81935420, "step": 3807, "time_per_iteration": 2.8060836791992188 }, { "auxiliary_loss_clip": 0.01143724, "auxiliary_loss_mlp": 0.01057417, "balance_loss_clip": 1.0545013, "balance_loss_mlp": 1.03891551, "epoch": 0.22894934615962723, "flos": 23403774372480.0, "grad_norm": 4.007429648995762, "language_loss": 0.6543591, "learning_rate": 3.5989635121360733e-06, "loss": 0.6763705, "num_input_tokens_seen": 81953845, "step": 3808, "time_per_iteration": 2.703885078430176 }, { "auxiliary_loss_clip": 0.0109921, "auxiliary_loss_mlp": 0.01061828, "balance_loss_clip": 1.04773676, "balance_loss_mlp": 1.04295671, "epoch": 0.22900946941229522, "flos": 18843011671680.0, "grad_norm": 2.028069656557901, "language_loss": 0.74749511, "learning_rate": 3.598729535939222e-06, "loss": 0.76910543, "num_input_tokens_seen": 81972100, "step": 3809, "time_per_iteration": 2.726862907409668 }, { "auxiliary_loss_clip": 0.01128097, "auxiliary_loss_mlp": 0.01053112, "balance_loss_clip": 1.0527637, "balance_loss_mlp": 1.03666139, "epoch": 0.22906959266496318, "flos": 22929394429440.0, "grad_norm": 1.6287389468918274, "language_loss": 0.81654954, "learning_rate": 3.5984954991183862e-06, "loss": 0.83836162, "num_input_tokens_seen": 81992760, "step": 3810, "time_per_iteration": 2.6750009059906006 }, { "auxiliary_loss_clip": 0.01132496, "auxiliary_loss_mlp": 0.01040979, "balance_loss_clip": 1.05216146, "balance_loss_mlp": 1.0247184, "epoch": 0.22912971591763115, "flos": 19354523299200.0, "grad_norm": 2.375204791625097, "language_loss": 0.78126299, "learning_rate": 3.598261401682441e-06, "loss": 0.80299771, "num_input_tokens_seen": 82009080, "step": 3811, "time_per_iteration": 4.302153587341309 }, { "auxiliary_loss_clip": 0.01130856, "auxiliary_loss_mlp": 0.00775213, "balance_loss_clip": 1.05357778, "balance_loss_mlp": 1.00159776, "epoch": 0.22918983917029911, "flos": 19933546538880.0, "grad_norm": 1.797699433224321, "language_loss": 0.82817954, "learning_rate": 3.5980272436402632e-06, "loss": 0.84724021, "num_input_tokens_seen": 82026705, "step": 3812, "time_per_iteration": 2.635796308517456 }, { "auxiliary_loss_clip": 0.01089198, "auxiliary_loss_mlp": 0.01067747, "balance_loss_clip": 1.04705882, "balance_loss_mlp": 1.0480535, "epoch": 0.22924996242296708, "flos": 16690885320960.0, "grad_norm": 3.3357789636694952, "language_loss": 0.82689399, "learning_rate": 3.5977930250007324e-06, "loss": 0.84846342, "num_input_tokens_seen": 82043245, "step": 3813, "time_per_iteration": 2.7896463871002197 }, { "auxiliary_loss_clip": 0.01135441, "auxiliary_loss_mlp": 0.01044219, "balance_loss_clip": 1.05230987, "balance_loss_mlp": 1.02743411, "epoch": 0.22931008567563504, "flos": 33036164956800.0, "grad_norm": 1.5779710642832598, "language_loss": 0.70018709, "learning_rate": 3.5975587457727298e-06, "loss": 0.72198373, "num_input_tokens_seen": 82066870, "step": 3814, "time_per_iteration": 2.759460687637329 }, { "auxiliary_loss_clip": 0.01141204, "auxiliary_loss_mlp": 0.01046745, "balance_loss_clip": 1.05307984, "balance_loss_mlp": 1.02947164, "epoch": 0.229370208928303, "flos": 23330696152320.0, "grad_norm": 2.3195881009003174, "language_loss": 0.66811371, "learning_rate": 3.597324405965139e-06, "loss": 0.6899932, "num_input_tokens_seen": 82083180, "step": 3815, "time_per_iteration": 2.6878743171691895 }, { "auxiliary_loss_clip": 0.01142177, "auxiliary_loss_mlp": 0.01045942, "balance_loss_clip": 1.05412412, "balance_loss_mlp": 1.02921689, "epoch": 0.229430332180971, "flos": 28617213150720.0, "grad_norm": 2.436037188170917, "language_loss": 0.83555114, "learning_rate": 3.597090005586848e-06, "loss": 0.85743231, "num_input_tokens_seen": 82102950, "step": 3816, "time_per_iteration": 2.702638626098633 }, { "auxiliary_loss_clip": 0.01142001, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.05649173, "balance_loss_mlp": 1.01952624, "epoch": 0.22949045543363897, "flos": 17238199829760.0, "grad_norm": 2.261586370580253, "language_loss": 0.8657164, "learning_rate": 3.596855544646742e-06, "loss": 0.88750786, "num_input_tokens_seen": 82119510, "step": 3817, "time_per_iteration": 2.6439061164855957 }, { "auxiliary_loss_clip": 0.01125222, "auxiliary_loss_mlp": 0.01048919, "balance_loss_clip": 1.0493896, "balance_loss_mlp": 1.03166902, "epoch": 0.22955057868630693, "flos": 27489438858240.0, "grad_norm": 3.8274774650765706, "language_loss": 0.74976468, "learning_rate": 3.5966210231537154e-06, "loss": 0.77150607, "num_input_tokens_seen": 82140095, "step": 3818, "time_per_iteration": 2.7610766887664795 }, { "auxiliary_loss_clip": 0.01146421, "auxiliary_loss_mlp": 0.01043004, "balance_loss_clip": 1.05866313, "balance_loss_mlp": 1.02550387, "epoch": 0.2296107019389749, "flos": 23476421629440.0, "grad_norm": 1.7490504114150227, "language_loss": 0.74682397, "learning_rate": 3.596386441116659e-06, "loss": 0.76871818, "num_input_tokens_seen": 82159510, "step": 3819, "time_per_iteration": 2.7125203609466553 }, { "auxiliary_loss_clip": 0.0114108, "auxiliary_loss_mlp": 0.0104377, "balance_loss_clip": 1.05479693, "balance_loss_mlp": 1.02630615, "epoch": 0.22967082519164286, "flos": 31285160760960.0, "grad_norm": 2.0230347194773732, "language_loss": 0.81103987, "learning_rate": 3.5961517985444684e-06, "loss": 0.83288836, "num_input_tokens_seen": 82179580, "step": 3820, "time_per_iteration": 2.7268714904785156 }, { "auxiliary_loss_clip": 0.01129285, "auxiliary_loss_mlp": 0.01044606, "balance_loss_clip": 1.05326903, "balance_loss_mlp": 1.02627158, "epoch": 0.22973094844431083, "flos": 14642935390080.0, "grad_norm": 2.2801321869619153, "language_loss": 0.69099033, "learning_rate": 3.595917095446042e-06, "loss": 0.71272922, "num_input_tokens_seen": 82195585, "step": 3821, "time_per_iteration": 2.659498691558838 }, { "auxiliary_loss_clip": 0.01098739, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.05118072, "balance_loss_mlp": 1.01888967, "epoch": 0.2297910716969788, "flos": 22823853292800.0, "grad_norm": 1.473505926288008, "language_loss": 0.82876307, "learning_rate": 3.5956823318302796e-06, "loss": 0.85012007, "num_input_tokens_seen": 82217530, "step": 3822, "time_per_iteration": 2.898287057876587 }, { "auxiliary_loss_clip": 0.01149833, "auxiliary_loss_mlp": 0.01044764, "balance_loss_clip": 1.05239797, "balance_loss_mlp": 1.02617884, "epoch": 0.2298511949496468, "flos": 23039029716480.0, "grad_norm": 2.077495396622281, "language_loss": 0.66552204, "learning_rate": 3.5954475077060833e-06, "loss": 0.68746805, "num_input_tokens_seen": 82237980, "step": 3823, "time_per_iteration": 2.6397016048431396 }, { "auxiliary_loss_clip": 0.01064018, "auxiliary_loss_mlp": 0.01005373, "balance_loss_clip": 1.04052305, "balance_loss_mlp": 1.00196409, "epoch": 0.22991131820231475, "flos": 66890914911360.0, "grad_norm": 0.8015900374762405, "language_loss": 0.56731141, "learning_rate": 3.595212623082357e-06, "loss": 0.5880053, "num_input_tokens_seen": 82301785, "step": 3824, "time_per_iteration": 3.2301526069641113 }, { "auxiliary_loss_clip": 0.01123513, "auxiliary_loss_mlp": 0.01037782, "balance_loss_clip": 1.0506382, "balance_loss_mlp": 1.02098525, "epoch": 0.22997144145498272, "flos": 17887248633600.0, "grad_norm": 2.0770938093466995, "language_loss": 0.7301755, "learning_rate": 3.594977677968009e-06, "loss": 0.7517885, "num_input_tokens_seen": 82317355, "step": 3825, "time_per_iteration": 2.6161818504333496 }, { "auxiliary_loss_clip": 0.01147516, "auxiliary_loss_mlp": 0.01049665, "balance_loss_clip": 1.05828226, "balance_loss_mlp": 1.03119957, "epoch": 0.23003156470765068, "flos": 24676843178880.0, "grad_norm": 1.8689845885894332, "language_loss": 0.87652314, "learning_rate": 3.5947426723719473e-06, "loss": 0.89849496, "num_input_tokens_seen": 82336645, "step": 3826, "time_per_iteration": 2.668858766555786 }, { "auxiliary_loss_clip": 0.01134406, "auxiliary_loss_mlp": 0.01045536, "balance_loss_clip": 1.05722022, "balance_loss_mlp": 1.02697468, "epoch": 0.23009168796031865, "flos": 15814126247040.0, "grad_norm": 2.4660324215504312, "language_loss": 0.81861693, "learning_rate": 3.594507606303083e-06, "loss": 0.84041631, "num_input_tokens_seen": 82354225, "step": 3827, "time_per_iteration": 2.67173171043396 }, { "auxiliary_loss_clip": 0.01083629, "auxiliary_loss_mlp": 0.01046658, "balance_loss_clip": 1.04976189, "balance_loss_mlp": 1.02728689, "epoch": 0.2301518112129866, "flos": 16212842190720.0, "grad_norm": 1.9417227311694012, "language_loss": 0.86676306, "learning_rate": 3.5942724797703314e-06, "loss": 0.88806593, "num_input_tokens_seen": 82370240, "step": 3828, "time_per_iteration": 2.7641990184783936 }, { "auxiliary_loss_clip": 0.01126786, "auxiliary_loss_mlp": 0.01048261, "balance_loss_clip": 1.05381465, "balance_loss_mlp": 1.02981901, "epoch": 0.2302119344656546, "flos": 20595452411520.0, "grad_norm": 2.6386744924703223, "language_loss": 0.7044189, "learning_rate": 3.594037292782607e-06, "loss": 0.72616941, "num_input_tokens_seen": 82389145, "step": 3829, "time_per_iteration": 2.6674952507019043 }, { "auxiliary_loss_clip": 0.01085573, "auxiliary_loss_mlp": 0.01045126, "balance_loss_clip": 1.04650855, "balance_loss_mlp": 1.02835345, "epoch": 0.23027205771832257, "flos": 26796901662720.0, "grad_norm": 1.6431866637768902, "language_loss": 0.84075069, "learning_rate": 3.5938020453488293e-06, "loss": 0.86205769, "num_input_tokens_seen": 82409185, "step": 3830, "time_per_iteration": 2.8631880283355713 }, { "auxiliary_loss_clip": 0.01132962, "auxiliary_loss_mlp": 0.01052116, "balance_loss_clip": 1.0506047, "balance_loss_mlp": 1.03415167, "epoch": 0.23033218097099054, "flos": 43873143068160.0, "grad_norm": 2.3429509345019213, "language_loss": 0.67036134, "learning_rate": 3.5935667374779177e-06, "loss": 0.6922121, "num_input_tokens_seen": 82432070, "step": 3831, "time_per_iteration": 2.91282320022583 }, { "auxiliary_loss_clip": 0.0111204, "auxiliary_loss_mlp": 0.01053367, "balance_loss_clip": 1.05277622, "balance_loss_mlp": 1.03496158, "epoch": 0.2303923042236585, "flos": 26067663745920.0, "grad_norm": 2.3469890931023194, "language_loss": 0.75711727, "learning_rate": 3.5933313691787957e-06, "loss": 0.7787714, "num_input_tokens_seen": 82450625, "step": 3832, "time_per_iteration": 2.759467601776123 }, { "auxiliary_loss_clip": 0.0110298, "auxiliary_loss_mlp": 0.01044565, "balance_loss_clip": 1.05044174, "balance_loss_mlp": 1.02596867, "epoch": 0.23045242747632647, "flos": 18296379521280.0, "grad_norm": 1.7769817461106177, "language_loss": 0.87558299, "learning_rate": 3.593095940460389e-06, "loss": 0.89705843, "num_input_tokens_seen": 82468575, "step": 3833, "time_per_iteration": 2.8548035621643066 }, { "auxiliary_loss_clip": 0.01116173, "auxiliary_loss_mlp": 0.01046082, "balance_loss_clip": 1.05032015, "balance_loss_mlp": 1.02814126, "epoch": 0.23051255072899443, "flos": 25520528805120.0, "grad_norm": 2.030934473686878, "language_loss": 0.74736786, "learning_rate": 3.592860451331624e-06, "loss": 0.7689904, "num_input_tokens_seen": 82488655, "step": 3834, "time_per_iteration": 2.719237804412842 }, { "auxiliary_loss_clip": 0.01104525, "auxiliary_loss_mlp": 0.01064338, "balance_loss_clip": 1.04610491, "balance_loss_mlp": 1.043679, "epoch": 0.2305726739816624, "flos": 21215198695680.0, "grad_norm": 1.9050082770497696, "language_loss": 0.86071098, "learning_rate": 3.592624901801432e-06, "loss": 0.88239956, "num_input_tokens_seen": 82507220, "step": 3835, "time_per_iteration": 2.627782106399536 }, { "auxiliary_loss_clip": 0.01115977, "auxiliary_loss_mlp": 0.01060727, "balance_loss_clip": 1.04934275, "balance_loss_mlp": 1.03979373, "epoch": 0.2306327972343304, "flos": 23331127115520.0, "grad_norm": 2.798777841757382, "language_loss": 0.82434011, "learning_rate": 3.5923892918787432e-06, "loss": 0.84610713, "num_input_tokens_seen": 82527920, "step": 3836, "time_per_iteration": 2.6091606616973877 }, { "auxiliary_loss_clip": 0.01144536, "auxiliary_loss_mlp": 0.0105466, "balance_loss_clip": 1.06090033, "balance_loss_mlp": 1.03683817, "epoch": 0.23069292048699835, "flos": 20666734951680.0, "grad_norm": 1.7189193248017045, "language_loss": 0.79633009, "learning_rate": 3.5921536215724934e-06, "loss": 0.81832206, "num_input_tokens_seen": 82549040, "step": 3837, "time_per_iteration": 2.535435914993286 }, { "auxiliary_loss_clip": 0.01057695, "auxiliary_loss_mlp": 0.01033541, "balance_loss_clip": 1.04840386, "balance_loss_mlp": 1.03003633, "epoch": 0.23075304373966632, "flos": 70454832393600.0, "grad_norm": 0.9031703200773207, "language_loss": 0.65381849, "learning_rate": 3.5919178908916184e-06, "loss": 0.67473078, "num_input_tokens_seen": 82604070, "step": 3838, "time_per_iteration": 3.0868518352508545 }, { "auxiliary_loss_clip": 0.01138177, "auxiliary_loss_mlp": 0.01056497, "balance_loss_clip": 1.05361629, "balance_loss_mlp": 1.0395453, "epoch": 0.23081316699233428, "flos": 16617986668800.0, "grad_norm": 2.5143705705619097, "language_loss": 0.75403488, "learning_rate": 3.591682099845058e-06, "loss": 0.77598161, "num_input_tokens_seen": 82619665, "step": 3839, "time_per_iteration": 2.6391067504882812 }, { "auxiliary_loss_clip": 0.01125705, "auxiliary_loss_mlp": 0.01046933, "balance_loss_clip": 1.05447173, "balance_loss_mlp": 1.02882481, "epoch": 0.23087329024500225, "flos": 13298081253120.0, "grad_norm": 1.8684605740856612, "language_loss": 0.68962026, "learning_rate": 3.591446248441752e-06, "loss": 0.71134663, "num_input_tokens_seen": 82637530, "step": 3840, "time_per_iteration": 2.6295006275177 }, { "auxiliary_loss_clip": 0.01158019, "auxiliary_loss_mlp": 0.01046048, "balance_loss_clip": 1.05840647, "balance_loss_mlp": 1.026057, "epoch": 0.23093341349767021, "flos": 17785729820160.0, "grad_norm": 2.5615469809997697, "language_loss": 0.80033958, "learning_rate": 3.591210336690645e-06, "loss": 0.8223803, "num_input_tokens_seen": 82656130, "step": 3841, "time_per_iteration": 2.6512410640716553 }, { "auxiliary_loss_clip": 0.01145317, "auxiliary_loss_mlp": 0.01047066, "balance_loss_clip": 1.05756617, "balance_loss_mlp": 1.0301621, "epoch": 0.23099353675033818, "flos": 23988076911360.0, "grad_norm": 1.7953422744525294, "language_loss": 0.83389241, "learning_rate": 3.590974364600683e-06, "loss": 0.85581625, "num_input_tokens_seen": 82675295, "step": 3842, "time_per_iteration": 2.7676117420196533 }, { "auxiliary_loss_clip": 0.01144752, "auxiliary_loss_mlp": 0.01044783, "balance_loss_clip": 1.05491304, "balance_loss_mlp": 1.02650845, "epoch": 0.23105366000300617, "flos": 35995168471680.0, "grad_norm": 1.8421697704365976, "language_loss": 0.66661239, "learning_rate": 3.5907383321808135e-06, "loss": 0.68850774, "num_input_tokens_seen": 82703260, "step": 3843, "time_per_iteration": 5.82958722114563 }, { "auxiliary_loss_clip": 0.01142299, "auxiliary_loss_mlp": 0.01047166, "balance_loss_clip": 1.05609, "balance_loss_mlp": 1.02914143, "epoch": 0.23111378325567414, "flos": 31245335556480.0, "grad_norm": 1.8996188882256444, "language_loss": 0.77221334, "learning_rate": 3.590502239439987e-06, "loss": 0.79410803, "num_input_tokens_seen": 82725060, "step": 3844, "time_per_iteration": 2.771226406097412 }, { "auxiliary_loss_clip": 0.01141796, "auxiliary_loss_mlp": 0.01045598, "balance_loss_clip": 1.05503309, "balance_loss_mlp": 1.02607179, "epoch": 0.2311739065083421, "flos": 19208223204480.0, "grad_norm": 1.9651801579729304, "language_loss": 0.78155982, "learning_rate": 3.590266086387156e-06, "loss": 0.80343372, "num_input_tokens_seen": 82742960, "step": 3845, "time_per_iteration": 4.247429370880127 }, { "auxiliary_loss_clip": 0.01117167, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.05274439, "balance_loss_mlp": 1.02292788, "epoch": 0.23123402976101007, "flos": 23360178240000.0, "grad_norm": 2.083958857623256, "language_loss": 0.76397669, "learning_rate": 3.590029873031276e-06, "loss": 0.78554261, "num_input_tokens_seen": 82760205, "step": 3846, "time_per_iteration": 2.7805917263031006 }, { "auxiliary_loss_clip": 0.01131462, "auxiliary_loss_mlp": 0.01049247, "balance_loss_clip": 1.05376291, "balance_loss_mlp": 1.03193808, "epoch": 0.23129415301367803, "flos": 13735365425280.0, "grad_norm": 1.8827740097117207, "language_loss": 0.70281041, "learning_rate": 3.589793599381304e-06, "loss": 0.72461748, "num_input_tokens_seen": 82778590, "step": 3847, "time_per_iteration": 2.6848642826080322 }, { "auxiliary_loss_clip": 0.01065475, "auxiliary_loss_mlp": 0.01006045, "balance_loss_clip": 1.04309821, "balance_loss_mlp": 1.00356507, "epoch": 0.231354276266346, "flos": 69737015001600.0, "grad_norm": 0.7955227467680892, "language_loss": 0.61006129, "learning_rate": 3.589557265446198e-06, "loss": 0.63077646, "num_input_tokens_seen": 82833925, "step": 3848, "time_per_iteration": 3.08832049369812 }, { "auxiliary_loss_clip": 0.01142916, "auxiliary_loss_mlp": 0.01044943, "balance_loss_clip": 1.05631924, "balance_loss_mlp": 1.02640557, "epoch": 0.231414399519014, "flos": 18835900778880.0, "grad_norm": 1.9602331138800266, "language_loss": 0.78082883, "learning_rate": 3.589320871234923e-06, "loss": 0.80270743, "num_input_tokens_seen": 82850625, "step": 3849, "time_per_iteration": 2.6830787658691406 }, { "auxiliary_loss_clip": 0.01137959, "auxiliary_loss_mlp": 0.01044864, "balance_loss_clip": 1.05184579, "balance_loss_mlp": 1.02630353, "epoch": 0.23147452277168196, "flos": 36135470995200.0, "grad_norm": 2.354271482082729, "language_loss": 0.71243513, "learning_rate": 3.5890844167564405e-06, "loss": 0.7342633, "num_input_tokens_seen": 82872105, "step": 3850, "time_per_iteration": 4.467762231826782 }, { "auxiliary_loss_clip": 0.01121609, "auxiliary_loss_mlp": 0.00776401, "balance_loss_clip": 1.05099773, "balance_loss_mlp": 1.00153255, "epoch": 0.23153464602434992, "flos": 20812927305600.0, "grad_norm": 4.184777043510671, "language_loss": 0.76577097, "learning_rate": 3.588847902019718e-06, "loss": 0.78475106, "num_input_tokens_seen": 82890595, "step": 3851, "time_per_iteration": 2.7452898025512695 }, { "auxiliary_loss_clip": 0.01152703, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.05650854, "balance_loss_mlp": 1.0206244, "epoch": 0.2315947692770179, "flos": 19939256801280.0, "grad_norm": 2.0528428588063914, "language_loss": 0.69642782, "learning_rate": 3.588611327033723e-06, "loss": 0.71834141, "num_input_tokens_seen": 82908910, "step": 3852, "time_per_iteration": 2.613687038421631 }, { "auxiliary_loss_clip": 0.0110964, "auxiliary_loss_mlp": 0.01050002, "balance_loss_clip": 1.05097961, "balance_loss_mlp": 1.0303328, "epoch": 0.23165489252968585, "flos": 12855553695360.0, "grad_norm": 2.8596642791724993, "language_loss": 0.67063856, "learning_rate": 3.588374691807428e-06, "loss": 0.69223493, "num_input_tokens_seen": 82925405, "step": 3853, "time_per_iteration": 2.6974282264709473 }, { "auxiliary_loss_clip": 0.01146149, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.05749798, "balance_loss_mlp": 1.02340484, "epoch": 0.23171501578235382, "flos": 30628282792320.0, "grad_norm": 1.7603397459637538, "language_loss": 0.80139267, "learning_rate": 3.5881379963498053e-06, "loss": 0.82326943, "num_input_tokens_seen": 82945615, "step": 3854, "time_per_iteration": 2.712125062942505 }, { "auxiliary_loss_clip": 0.01115767, "auxiliary_loss_mlp": 0.01052387, "balance_loss_clip": 1.04737794, "balance_loss_mlp": 1.03070331, "epoch": 0.23177513903502178, "flos": 23842782397440.0, "grad_norm": 1.9709775740629982, "language_loss": 0.65103847, "learning_rate": 3.587901240669831e-06, "loss": 0.67272007, "num_input_tokens_seen": 82967570, "step": 3855, "time_per_iteration": 2.718756675720215 }, { "auxiliary_loss_clip": 0.01153506, "auxiliary_loss_mlp": 0.01048508, "balance_loss_clip": 1.05417824, "balance_loss_mlp": 1.03050709, "epoch": 0.23183526228768978, "flos": 29570282668800.0, "grad_norm": 1.7803112411977504, "language_loss": 0.70386064, "learning_rate": 3.5876644247764815e-06, "loss": 0.7258808, "num_input_tokens_seen": 82987435, "step": 3856, "time_per_iteration": 2.798675060272217 }, { "auxiliary_loss_clip": 0.01103018, "auxiliary_loss_mlp": 0.01035927, "balance_loss_clip": 1.05080032, "balance_loss_mlp": 1.0200007, "epoch": 0.23189538554035774, "flos": 34458694254720.0, "grad_norm": 1.7837780829213195, "language_loss": 0.77101243, "learning_rate": 3.5874275486787387e-06, "loss": 0.79240191, "num_input_tokens_seen": 83010505, "step": 3857, "time_per_iteration": 2.8545501232147217 }, { "auxiliary_loss_clip": 0.01136868, "auxiliary_loss_mlp": 0.00777317, "balance_loss_clip": 1.0528996, "balance_loss_mlp": 1.00133562, "epoch": 0.2319555087930257, "flos": 18003815245440.0, "grad_norm": 2.445609387195472, "language_loss": 0.91629225, "learning_rate": 3.587190612385584e-06, "loss": 0.9354341, "num_input_tokens_seen": 83026705, "step": 3858, "time_per_iteration": 2.7018845081329346 }, { "auxiliary_loss_clip": 0.01095626, "auxiliary_loss_mlp": 0.01043975, "balance_loss_clip": 1.04882586, "balance_loss_mlp": 1.0263319, "epoch": 0.23201563204569367, "flos": 23143852581120.0, "grad_norm": 1.987074492721614, "language_loss": 0.76833785, "learning_rate": 3.5869536159060026e-06, "loss": 0.78973383, "num_input_tokens_seen": 83046500, "step": 3859, "time_per_iteration": 2.7465155124664307 }, { "auxiliary_loss_clip": 0.01136816, "auxiliary_loss_mlp": 0.01041128, "balance_loss_clip": 1.05060959, "balance_loss_mlp": 1.02316284, "epoch": 0.23207575529836164, "flos": 20667991927680.0, "grad_norm": 1.7166447387893018, "language_loss": 0.84341264, "learning_rate": 3.58671655924898e-06, "loss": 0.86519206, "num_input_tokens_seen": 83065280, "step": 3860, "time_per_iteration": 2.6602063179016113 }, { "auxiliary_loss_clip": 0.01091436, "auxiliary_loss_mlp": 0.01044571, "balance_loss_clip": 1.04641938, "balance_loss_mlp": 1.02640343, "epoch": 0.2321358785510296, "flos": 16472189364480.0, "grad_norm": 2.014536853896284, "language_loss": 0.83431923, "learning_rate": 3.586479442423508e-06, "loss": 0.85567933, "num_input_tokens_seen": 83082310, "step": 3861, "time_per_iteration": 2.728750228881836 }, { "auxiliary_loss_clip": 0.01130655, "auxiliary_loss_mlp": 0.00776368, "balance_loss_clip": 1.05122983, "balance_loss_mlp": 1.00149858, "epoch": 0.2321960018036976, "flos": 21616320850560.0, "grad_norm": 1.8874922149770945, "language_loss": 0.85921204, "learning_rate": 3.586242265438576e-06, "loss": 0.87828225, "num_input_tokens_seen": 83102065, "step": 3862, "time_per_iteration": 2.7289161682128906 }, { "auxiliary_loss_clip": 0.01112788, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04956031, "balance_loss_mlp": 1.02645802, "epoch": 0.23225612505636556, "flos": 22271474966400.0, "grad_norm": 1.4078274786009342, "language_loss": 0.75131166, "learning_rate": 3.5860050283031773e-06, "loss": 0.77286315, "num_input_tokens_seen": 83121445, "step": 3863, "time_per_iteration": 2.7308037281036377 }, { "auxiliary_loss_clip": 0.01109911, "auxiliary_loss_mlp": 0.0104503, "balance_loss_clip": 1.05320251, "balance_loss_mlp": 1.02840066, "epoch": 0.23231624830903352, "flos": 17052325925760.0, "grad_norm": 1.8195520841096788, "language_loss": 0.74952984, "learning_rate": 3.58576773102631e-06, "loss": 0.77107918, "num_input_tokens_seen": 83138175, "step": 3864, "time_per_iteration": 2.669403314590454 }, { "auxiliary_loss_clip": 0.01148697, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.05258274, "balance_loss_mlp": 1.02182317, "epoch": 0.2323763715617015, "flos": 34640043045120.0, "grad_norm": 1.757817857347048, "language_loss": 0.70438093, "learning_rate": 3.5855303736169714e-06, "loss": 0.72625393, "num_input_tokens_seen": 83161975, "step": 3865, "time_per_iteration": 2.766399621963501 }, { "auxiliary_loss_clip": 0.01156124, "auxiliary_loss_mlp": 0.01048904, "balance_loss_clip": 1.05352104, "balance_loss_mlp": 1.02978325, "epoch": 0.23243649481436945, "flos": 25551698832000.0, "grad_norm": 1.8965816841290546, "language_loss": 0.94702542, "learning_rate": 3.5852929560841617e-06, "loss": 0.96907574, "num_input_tokens_seen": 83180905, "step": 3866, "time_per_iteration": 2.659867525100708 }, { "auxiliary_loss_clip": 0.01131283, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.04904807, "balance_loss_mlp": 1.02683008, "epoch": 0.23249661806703742, "flos": 20483482740480.0, "grad_norm": 4.181849364953483, "language_loss": 0.73026884, "learning_rate": 3.5850554784368846e-06, "loss": 0.75202191, "num_input_tokens_seen": 83196390, "step": 3867, "time_per_iteration": 2.645481586456299 }, { "auxiliary_loss_clip": 0.0112954, "auxiliary_loss_mlp": 0.01046355, "balance_loss_clip": 1.05079126, "balance_loss_mlp": 1.02855754, "epoch": 0.23255674131970538, "flos": 20376612800640.0, "grad_norm": 1.9671041323983256, "language_loss": 0.82770872, "learning_rate": 3.584817940684145e-06, "loss": 0.84946775, "num_input_tokens_seen": 83216165, "step": 3868, "time_per_iteration": 2.7670326232910156 }, { "auxiliary_loss_clip": 0.01125563, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.04875207, "balance_loss_mlp": 1.02648687, "epoch": 0.23261686457237338, "flos": 17056096853760.0, "grad_norm": 2.1100994183362967, "language_loss": 0.72952414, "learning_rate": 3.58458034283495e-06, "loss": 0.75121534, "num_input_tokens_seen": 83233845, "step": 3869, "time_per_iteration": 2.6661763191223145 }, { "auxiliary_loss_clip": 0.01132223, "auxiliary_loss_mlp": 0.0105087, "balance_loss_clip": 1.05129242, "balance_loss_mlp": 1.03382349, "epoch": 0.23267698782504134, "flos": 29169878785920.0, "grad_norm": 2.500604422715561, "language_loss": 0.79142725, "learning_rate": 3.5843426848983097e-06, "loss": 0.81325811, "num_input_tokens_seen": 83254930, "step": 3870, "time_per_iteration": 2.707321882247925 }, { "auxiliary_loss_clip": 0.01152434, "auxiliary_loss_mlp": 0.01046711, "balance_loss_clip": 1.05334866, "balance_loss_mlp": 1.02924728, "epoch": 0.2327371110777093, "flos": 21174655219200.0, "grad_norm": 2.176894576680098, "language_loss": 0.70915782, "learning_rate": 3.5841049668832357e-06, "loss": 0.73114932, "num_input_tokens_seen": 83272095, "step": 3871, "time_per_iteration": 2.6389646530151367 }, { "auxiliary_loss_clip": 0.01139847, "auxiliary_loss_mlp": 0.01051541, "balance_loss_clip": 1.05543458, "balance_loss_mlp": 1.03244328, "epoch": 0.23279723433037727, "flos": 24863112132480.0, "grad_norm": 1.8306984701748774, "language_loss": 0.68877381, "learning_rate": 3.5838671887987433e-06, "loss": 0.71068764, "num_input_tokens_seen": 83290980, "step": 3872, "time_per_iteration": 2.662309408187866 }, { "auxiliary_loss_clip": 0.0114472, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.05313611, "balance_loss_mlp": 1.02388597, "epoch": 0.23285735758304524, "flos": 38800617344640.0, "grad_norm": 1.5710106481349988, "language_loss": 0.779724, "learning_rate": 3.5836293506538474e-06, "loss": 0.80159569, "num_input_tokens_seen": 83315175, "step": 3873, "time_per_iteration": 2.884542942047119 }, { "auxiliary_loss_clip": 0.01053683, "auxiliary_loss_mlp": 0.01022765, "balance_loss_clip": 1.03691578, "balance_loss_mlp": 1.02038097, "epoch": 0.2329174808357132, "flos": 53944113692160.0, "grad_norm": 0.8561383552409444, "language_loss": 0.6051712, "learning_rate": 3.5833914524575687e-06, "loss": 0.62593567, "num_input_tokens_seen": 83372060, "step": 3874, "time_per_iteration": 3.165809392929077 }, { "auxiliary_loss_clip": 0.0112779, "auxiliary_loss_mlp": 0.01040869, "balance_loss_clip": 1.05157447, "balance_loss_mlp": 1.02328515, "epoch": 0.23297760408838117, "flos": 21216024708480.0, "grad_norm": 2.5039775977564522, "language_loss": 0.80842507, "learning_rate": 3.583153494218927e-06, "loss": 0.83011162, "num_input_tokens_seen": 83389795, "step": 3875, "time_per_iteration": 2.673657178878784 }, { "auxiliary_loss_clip": 0.01147803, "auxiliary_loss_mlp": 0.00774568, "balance_loss_clip": 1.05367982, "balance_loss_mlp": 1.00145388, "epoch": 0.23303772734104916, "flos": 28403006394240.0, "grad_norm": 4.3174446976030465, "language_loss": 0.6123395, "learning_rate": 3.5829154759469464e-06, "loss": 0.63156319, "num_input_tokens_seen": 83410005, "step": 3876, "time_per_iteration": 2.6973021030426025 }, { "auxiliary_loss_clip": 0.01116571, "auxiliary_loss_mlp": 0.01051971, "balance_loss_clip": 1.05002618, "balance_loss_mlp": 1.03345811, "epoch": 0.23309785059371713, "flos": 24314720215680.0, "grad_norm": 2.4263361529850447, "language_loss": 0.70649457, "learning_rate": 3.5826773976506523e-06, "loss": 0.72817999, "num_input_tokens_seen": 83430250, "step": 3877, "time_per_iteration": 2.7506351470947266 }, { "auxiliary_loss_clip": 0.01143537, "auxiliary_loss_mlp": 0.01051311, "balance_loss_clip": 1.05495286, "balance_loss_mlp": 1.03245187, "epoch": 0.2331579738463851, "flos": 15992925171840.0, "grad_norm": 2.202899784913125, "language_loss": 0.80724835, "learning_rate": 3.582439259339073e-06, "loss": 0.82919687, "num_input_tokens_seen": 83447950, "step": 3878, "time_per_iteration": 2.6945395469665527 }, { "auxiliary_loss_clip": 0.0109123, "auxiliary_loss_mlp": 0.01049547, "balance_loss_clip": 1.04632592, "balance_loss_mlp": 1.0298301, "epoch": 0.23321809709905306, "flos": 36426957863040.0, "grad_norm": 1.857420507716431, "language_loss": 0.7521472, "learning_rate": 3.5822010610212374e-06, "loss": 0.77355498, "num_input_tokens_seen": 83467785, "step": 3879, "time_per_iteration": 2.8909342288970947 }, { "auxiliary_loss_clip": 0.01095967, "auxiliary_loss_mlp": 0.01051433, "balance_loss_clip": 1.04621899, "balance_loss_mlp": 1.03238297, "epoch": 0.23327822035172102, "flos": 21324762155520.0, "grad_norm": 2.179587653719585, "language_loss": 0.89532614, "learning_rate": 3.5819628027061795e-06, "loss": 0.91680014, "num_input_tokens_seen": 83485390, "step": 3880, "time_per_iteration": 2.7358896732330322 }, { "auxiliary_loss_clip": 0.01127816, "auxiliary_loss_mlp": 0.01049697, "balance_loss_clip": 1.05119944, "balance_loss_mlp": 1.0319109, "epoch": 0.233338343604389, "flos": 19171881619200.0, "grad_norm": 1.6825190155617658, "language_loss": 0.71915156, "learning_rate": 3.5817244844029334e-06, "loss": 0.74092674, "num_input_tokens_seen": 83504890, "step": 3881, "time_per_iteration": 2.702533721923828 }, { "auxiliary_loss_clip": 0.01148084, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 1.05186546, "balance_loss_mlp": 1.02497458, "epoch": 0.23339846685705698, "flos": 26908368543360.0, "grad_norm": 1.5464986217430505, "language_loss": 0.68210357, "learning_rate": 3.581486106120537e-06, "loss": 0.70401263, "num_input_tokens_seen": 83526475, "step": 3882, "time_per_iteration": 2.6449384689331055 }, { "auxiliary_loss_clip": 0.01106984, "auxiliary_loss_mlp": 0.01053219, "balance_loss_clip": 1.04567862, "balance_loss_mlp": 1.03457499, "epoch": 0.23345859010972494, "flos": 32343160884480.0, "grad_norm": 2.180831821464153, "language_loss": 0.77379489, "learning_rate": 3.5812476678680287e-06, "loss": 0.79539698, "num_input_tokens_seen": 83546620, "step": 3883, "time_per_iteration": 5.806958913803101 }, { "auxiliary_loss_clip": 0.01053192, "auxiliary_loss_mlp": 0.01007679, "balance_loss_clip": 1.03368068, "balance_loss_mlp": 1.0053544, "epoch": 0.2335187133623929, "flos": 58484229050880.0, "grad_norm": 0.7945750769740417, "language_loss": 0.59117424, "learning_rate": 3.58100916965445e-06, "loss": 0.61178291, "num_input_tokens_seen": 83616160, "step": 3884, "time_per_iteration": 3.3524324893951416 }, { "auxiliary_loss_clip": 0.01117007, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.04925692, "balance_loss_mlp": 1.01704168, "epoch": 0.23357883661506088, "flos": 24502317972480.0, "grad_norm": 1.6775563031527567, "language_loss": 0.80286831, "learning_rate": 3.5807706114888455e-06, "loss": 0.82437843, "num_input_tokens_seen": 83636795, "step": 3885, "time_per_iteration": 4.295818328857422 }, { "auxiliary_loss_clip": 0.01136024, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05494285, "balance_loss_mlp": 1.02274597, "epoch": 0.23363895986772884, "flos": 18948516894720.0, "grad_norm": 2.2066793657203116, "language_loss": 0.88230193, "learning_rate": 3.580531993380261e-06, "loss": 0.90406859, "num_input_tokens_seen": 83654050, "step": 3886, "time_per_iteration": 2.6672091484069824 }, { "auxiliary_loss_clip": 0.01150675, "auxiliary_loss_mlp": 0.01042457, "balance_loss_clip": 1.05293703, "balance_loss_mlp": 1.02512443, "epoch": 0.2336990831203968, "flos": 31686821619840.0, "grad_norm": 4.0082984179074055, "language_loss": 0.73170543, "learning_rate": 3.5802933153377445e-06, "loss": 0.75363672, "num_input_tokens_seen": 83673720, "step": 3887, "time_per_iteration": 2.7338294982910156 }, { "auxiliary_loss_clip": 0.01140271, "auxiliary_loss_mlp": 0.0104923, "balance_loss_clip": 1.05201173, "balance_loss_mlp": 1.03183722, "epoch": 0.23375920637306477, "flos": 27709750926720.0, "grad_norm": 2.677865426107907, "language_loss": 0.84125429, "learning_rate": 3.5800545773703475e-06, "loss": 0.86314929, "num_input_tokens_seen": 83693470, "step": 3888, "time_per_iteration": 2.7020208835601807 }, { "auxiliary_loss_clip": 0.01121847, "auxiliary_loss_mlp": 0.010605, "balance_loss_clip": 1.04974008, "balance_loss_mlp": 1.04121208, "epoch": 0.23381932962573276, "flos": 17675627656320.0, "grad_norm": 3.2074942430893976, "language_loss": 0.87298381, "learning_rate": 3.5798157794871225e-06, "loss": 0.89480728, "num_input_tokens_seen": 83711620, "step": 3889, "time_per_iteration": 4.319674491882324 }, { "auxiliary_loss_clip": 0.01141703, "auxiliary_loss_mlp": 0.01046248, "balance_loss_clip": 1.05330396, "balance_loss_mlp": 1.02877164, "epoch": 0.23387945287840073, "flos": 14390842763520.0, "grad_norm": 3.8719217250511164, "language_loss": 0.76830876, "learning_rate": 3.579576921697125e-06, "loss": 0.79018819, "num_input_tokens_seen": 83727890, "step": 3890, "time_per_iteration": 2.6133198738098145 }, { "auxiliary_loss_clip": 0.01107139, "auxiliary_loss_mlp": 0.00775386, "balance_loss_clip": 1.04837406, "balance_loss_mlp": 1.00124502, "epoch": 0.2339395761310687, "flos": 46097988503040.0, "grad_norm": 1.8304579433009527, "language_loss": 0.73385048, "learning_rate": 3.579338004009412e-06, "loss": 0.75267571, "num_input_tokens_seen": 83749370, "step": 3891, "time_per_iteration": 3.008927583694458 }, { "auxiliary_loss_clip": 0.01145053, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.05121398, "balance_loss_mlp": 1.03035665, "epoch": 0.23399969938373666, "flos": 22382044007040.0, "grad_norm": 1.8316289897122906, "language_loss": 0.82725632, "learning_rate": 3.5790990264330433e-06, "loss": 0.84918392, "num_input_tokens_seen": 83769560, "step": 3892, "time_per_iteration": 2.6455893516540527 }, { "auxiliary_loss_clip": 0.01100914, "auxiliary_loss_mlp": 0.01055558, "balance_loss_clip": 1.04450488, "balance_loss_mlp": 1.03491104, "epoch": 0.23405982263640462, "flos": 43508542066560.0, "grad_norm": 2.707564715226966, "language_loss": 0.64982933, "learning_rate": 3.578859988977082e-06, "loss": 0.67139405, "num_input_tokens_seen": 83795635, "step": 3893, "time_per_iteration": 2.9392964839935303 }, { "auxiliary_loss_clip": 0.01106007, "auxiliary_loss_mlp": 0.01045218, "balance_loss_clip": 1.04782617, "balance_loss_mlp": 1.02701449, "epoch": 0.2341199458890726, "flos": 22564685687040.0, "grad_norm": 2.5782091790717105, "language_loss": 0.79415286, "learning_rate": 3.5786208916505916e-06, "loss": 0.81566513, "num_input_tokens_seen": 83814090, "step": 3894, "time_per_iteration": 2.839935541152954 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01049748, "balance_loss_clip": 1.04747164, "balance_loss_mlp": 1.03253388, "epoch": 0.23418006914174055, "flos": 25633970933760.0, "grad_norm": 1.551347830991082, "language_loss": 0.81978422, "learning_rate": 3.5783817344626383e-06, "loss": 0.84162462, "num_input_tokens_seen": 83836870, "step": 3895, "time_per_iteration": 2.739955425262451 }, { "auxiliary_loss_clip": 0.01134592, "auxiliary_loss_mlp": 0.01052429, "balance_loss_clip": 1.04999852, "balance_loss_mlp": 1.03514385, "epoch": 0.23424019239440855, "flos": 13545936074880.0, "grad_norm": 1.8690411936118732, "language_loss": 0.80239451, "learning_rate": 3.578142517422292e-06, "loss": 0.82426476, "num_input_tokens_seen": 83853275, "step": 3896, "time_per_iteration": 2.681114435195923 }, { "auxiliary_loss_clip": 0.01125586, "auxiliary_loss_mlp": 0.01045792, "balance_loss_clip": 1.04685259, "balance_loss_mlp": 1.02779162, "epoch": 0.2343003156470765, "flos": 22419498913920.0, "grad_norm": 2.2492510100498087, "language_loss": 0.83249009, "learning_rate": 3.577903240538623e-06, "loss": 0.85420382, "num_input_tokens_seen": 83872340, "step": 3897, "time_per_iteration": 2.728916645050049 }, { "auxiliary_loss_clip": 0.01134669, "auxiliary_loss_mlp": 0.01058403, "balance_loss_clip": 1.04949594, "balance_loss_mlp": 1.04016376, "epoch": 0.23436043889974448, "flos": 14790815683200.0, "grad_norm": 1.5875861860902294, "language_loss": 0.78903484, "learning_rate": 3.577663903820705e-06, "loss": 0.81096554, "num_input_tokens_seen": 83888795, "step": 3898, "time_per_iteration": 2.6597952842712402 }, { "auxiliary_loss_clip": 0.01109182, "auxiliary_loss_mlp": 0.01055226, "balance_loss_clip": 1.04657888, "balance_loss_mlp": 1.03785777, "epoch": 0.23442056215241244, "flos": 22965700101120.0, "grad_norm": 1.9975380770167093, "language_loss": 0.73769581, "learning_rate": 3.577424507277614e-06, "loss": 0.75933987, "num_input_tokens_seen": 83906820, "step": 3899, "time_per_iteration": 2.7511518001556396 }, { "auxiliary_loss_clip": 0.01110646, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.04662895, "balance_loss_mlp": 1.03530502, "epoch": 0.2344806854050804, "flos": 23071887682560.0, "grad_norm": 2.822835219305806, "language_loss": 0.75323856, "learning_rate": 3.5771850509184277e-06, "loss": 0.77488053, "num_input_tokens_seen": 83926370, "step": 3900, "time_per_iteration": 2.7366316318511963 }, { "auxiliary_loss_clip": 0.01097598, "auxiliary_loss_mlp": 0.01047935, "balance_loss_clip": 1.04771769, "balance_loss_mlp": 1.03019702, "epoch": 0.23454080865774837, "flos": 16327074418560.0, "grad_norm": 1.7042292639984586, "language_loss": 0.67123592, "learning_rate": 3.5769455347522256e-06, "loss": 0.69269133, "num_input_tokens_seen": 83944600, "step": 3901, "time_per_iteration": 2.857386589050293 }, { "auxiliary_loss_clip": 0.01029196, "auxiliary_loss_mlp": 0.01060621, "balance_loss_clip": 1.02959871, "balance_loss_mlp": 1.0584631, "epoch": 0.23460093191041637, "flos": 67760958142080.0, "grad_norm": 0.7708596717968548, "language_loss": 0.58189189, "learning_rate": 3.576705958788091e-06, "loss": 0.60279006, "num_input_tokens_seen": 84005100, "step": 3902, "time_per_iteration": 3.2769579887390137 }, { "auxiliary_loss_clip": 0.01126982, "auxiliary_loss_mlp": 0.01045748, "balance_loss_clip": 1.05044544, "balance_loss_mlp": 1.02691305, "epoch": 0.23466105516308433, "flos": 20077619990400.0, "grad_norm": 2.0309755154884708, "language_loss": 0.80396789, "learning_rate": 3.576466323035108e-06, "loss": 0.82569516, "num_input_tokens_seen": 84023775, "step": 3903, "time_per_iteration": 2.683908462524414 }, { "auxiliary_loss_clip": 0.01092072, "auxiliary_loss_mlp": 0.01044121, "balance_loss_clip": 1.04248238, "balance_loss_mlp": 1.02614391, "epoch": 0.2347211784157523, "flos": 24535714642560.0, "grad_norm": 1.970422818337997, "language_loss": 0.82400727, "learning_rate": 3.5762266275023645e-06, "loss": 0.84536922, "num_input_tokens_seen": 84042605, "step": 3904, "time_per_iteration": 2.8023037910461426 }, { "auxiliary_loss_clip": 0.01147463, "auxiliary_loss_mlp": 0.01043559, "balance_loss_clip": 1.05247784, "balance_loss_mlp": 1.02620173, "epoch": 0.23478130166842026, "flos": 23805040181760.0, "grad_norm": 1.9105311329606578, "language_loss": 0.71330345, "learning_rate": 3.57598687219895e-06, "loss": 0.73521364, "num_input_tokens_seen": 84061520, "step": 3905, "time_per_iteration": 2.650956869125366 }, { "auxiliary_loss_clip": 0.01143661, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.05086017, "balance_loss_mlp": 1.01877677, "epoch": 0.23484142492108823, "flos": 24093618048000.0, "grad_norm": 2.334164983860831, "language_loss": 0.71415532, "learning_rate": 3.5757470571339543e-06, "loss": 0.73594707, "num_input_tokens_seen": 84081800, "step": 3906, "time_per_iteration": 2.6635055541992188 }, { "auxiliary_loss_clip": 0.01138147, "auxiliary_loss_mlp": 0.01042098, "balance_loss_clip": 1.04703832, "balance_loss_mlp": 1.02246392, "epoch": 0.2349015481737562, "flos": 29095830898560.0, "grad_norm": 2.5527171953873693, "language_loss": 0.74024308, "learning_rate": 3.575507182316473e-06, "loss": 0.7620455, "num_input_tokens_seen": 84102340, "step": 3907, "time_per_iteration": 2.751154661178589 }, { "auxiliary_loss_clip": 0.01135101, "auxiliary_loss_mlp": 0.01047433, "balance_loss_clip": 1.04911268, "balance_loss_mlp": 1.02950394, "epoch": 0.23496167142642416, "flos": 18916305373440.0, "grad_norm": 1.9847054585906883, "language_loss": 0.72428519, "learning_rate": 3.575267247755601e-06, "loss": 0.74611056, "num_input_tokens_seen": 84120370, "step": 3908, "time_per_iteration": 2.631162166595459 }, { "auxiliary_loss_clip": 0.01053013, "auxiliary_loss_mlp": 0.01020478, "balance_loss_clip": 1.03362584, "balance_loss_mlp": 1.01765239, "epoch": 0.23502179467909215, "flos": 55868062896000.0, "grad_norm": 1.0307072678924762, "language_loss": 0.73359185, "learning_rate": 3.5750272534604367e-06, "loss": 0.75432676, "num_input_tokens_seen": 84165515, "step": 3909, "time_per_iteration": 2.974531650543213 }, { "auxiliary_loss_clip": 0.01136436, "auxiliary_loss_mlp": 0.01046445, "balance_loss_clip": 1.05006361, "balance_loss_mlp": 1.02797985, "epoch": 0.23508191793176011, "flos": 23401763210880.0, "grad_norm": 1.6771333047394956, "language_loss": 0.88288009, "learning_rate": 3.5747871994400822e-06, "loss": 0.90470886, "num_input_tokens_seen": 84184540, "step": 3910, "time_per_iteration": 2.6615123748779297 }, { "auxiliary_loss_clip": 0.01134757, "auxiliary_loss_mlp": 0.01038734, "balance_loss_clip": 1.04980493, "balance_loss_mlp": 1.02188933, "epoch": 0.23514204118442808, "flos": 20047671025920.0, "grad_norm": 1.9388895528834493, "language_loss": 0.76067305, "learning_rate": 3.5745470857036386e-06, "loss": 0.78240794, "num_input_tokens_seen": 84202025, "step": 3911, "time_per_iteration": 2.6846752166748047 }, { "auxiliary_loss_clip": 0.01130294, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.04968345, "balance_loss_mlp": 1.02546179, "epoch": 0.23520216443709605, "flos": 21580589796480.0, "grad_norm": 1.5851255377793763, "language_loss": 0.81651384, "learning_rate": 3.5743069122602122e-06, "loss": 0.83823043, "num_input_tokens_seen": 84221895, "step": 3912, "time_per_iteration": 2.6340627670288086 }, { "auxiliary_loss_clip": 0.01123815, "auxiliary_loss_mlp": 0.01046223, "balance_loss_clip": 1.05082059, "balance_loss_mlp": 1.02836537, "epoch": 0.235262287689764, "flos": 23185796688000.0, "grad_norm": 3.1390338867327165, "language_loss": 0.71748006, "learning_rate": 3.574066679118909e-06, "loss": 0.73918045, "num_input_tokens_seen": 84240455, "step": 3913, "time_per_iteration": 2.6716067790985107 }, { "auxiliary_loss_clip": 0.01141007, "auxiliary_loss_mlp": 0.00776535, "balance_loss_clip": 1.05018401, "balance_loss_mlp": 1.00136077, "epoch": 0.23532241094243198, "flos": 23185222070400.0, "grad_norm": 1.7080087282408476, "language_loss": 0.76152158, "learning_rate": 3.57382638628884e-06, "loss": 0.78069693, "num_input_tokens_seen": 84261605, "step": 3914, "time_per_iteration": 2.706982135772705 }, { "auxiliary_loss_clip": 0.01088532, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.0485754, "balance_loss_mlp": 1.02153206, "epoch": 0.23538253419509997, "flos": 17019324305280.0, "grad_norm": 2.2148128973951877, "language_loss": 0.89692557, "learning_rate": 3.5735860337791174e-06, "loss": 0.91820902, "num_input_tokens_seen": 84278675, "step": 3915, "time_per_iteration": 2.8005998134613037 }, { "auxiliary_loss_clip": 0.01045613, "auxiliary_loss_mlp": 0.0100868, "balance_loss_clip": 1.02860212, "balance_loss_mlp": 1.00596201, "epoch": 0.23544265744776793, "flos": 63448588967040.0, "grad_norm": 0.8066012642326402, "language_loss": 0.59382623, "learning_rate": 3.573345621598854e-06, "loss": 0.61436915, "num_input_tokens_seen": 84329765, "step": 3916, "time_per_iteration": 3.168708086013794 }, { "auxiliary_loss_clip": 0.01027738, "auxiliary_loss_mlp": 0.01005192, "balance_loss_clip": 1.03619492, "balance_loss_mlp": 1.00231957, "epoch": 0.2355027807004359, "flos": 70515343831680.0, "grad_norm": 0.7680467252570666, "language_loss": 0.49518228, "learning_rate": 3.5731051497571675e-06, "loss": 0.51551157, "num_input_tokens_seen": 84393680, "step": 3917, "time_per_iteration": 3.3240060806274414 }, { "auxiliary_loss_clip": 0.01112941, "auxiliary_loss_mlp": 0.01048231, "balance_loss_clip": 1.04929173, "balance_loss_mlp": 1.03133857, "epoch": 0.23556290395310386, "flos": 21434289701760.0, "grad_norm": 1.9721662885337694, "language_loss": 0.76349282, "learning_rate": 3.5728646182631756e-06, "loss": 0.78510457, "num_input_tokens_seen": 84412640, "step": 3918, "time_per_iteration": 2.739431619644165 }, { "auxiliary_loss_clip": 0.0109904, "auxiliary_loss_mlp": 0.01052049, "balance_loss_clip": 1.04440236, "balance_loss_mlp": 1.03514528, "epoch": 0.23562302720577183, "flos": 18186421011840.0, "grad_norm": 2.001330675769641, "language_loss": 0.69002521, "learning_rate": 3.5726240271259995e-06, "loss": 0.71153617, "num_input_tokens_seen": 84431605, "step": 3919, "time_per_iteration": 2.8809926509857178 }, { "auxiliary_loss_clip": 0.01106851, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.04772878, "balance_loss_mlp": 1.02221501, "epoch": 0.2356831504584398, "flos": 33730497832320.0, "grad_norm": 1.6908780146896767, "language_loss": 0.70500779, "learning_rate": 3.5723833763547634e-06, "loss": 0.72646987, "num_input_tokens_seen": 84454210, "step": 3920, "time_per_iteration": 2.7984554767608643 }, { "auxiliary_loss_clip": 0.01124832, "auxiliary_loss_mlp": 0.01054073, "balance_loss_clip": 1.05141807, "balance_loss_mlp": 1.03756285, "epoch": 0.23574327371110776, "flos": 24932778560640.0, "grad_norm": 1.7460619151295316, "language_loss": 0.77363533, "learning_rate": 3.5721426659585916e-06, "loss": 0.7954244, "num_input_tokens_seen": 84475540, "step": 3921, "time_per_iteration": 2.8038690090179443 }, { "auxiliary_loss_clip": 0.01113499, "auxiliary_loss_mlp": 0.01043793, "balance_loss_clip": 1.05042887, "balance_loss_mlp": 1.02692485, "epoch": 0.23580339696377575, "flos": 17822107319040.0, "grad_norm": 2.2761735813493775, "language_loss": 0.74768102, "learning_rate": 3.571901895946612e-06, "loss": 0.76925397, "num_input_tokens_seen": 84494580, "step": 3922, "time_per_iteration": 5.741380929946899 }, { "auxiliary_loss_clip": 0.01116057, "auxiliary_loss_mlp": 0.01041318, "balance_loss_clip": 1.04831624, "balance_loss_mlp": 1.02577269, "epoch": 0.23586352021644372, "flos": 26286611097600.0, "grad_norm": 3.3386441952016868, "language_loss": 0.79846609, "learning_rate": 3.571661066327956e-06, "loss": 0.82003981, "num_input_tokens_seen": 84513850, "step": 3923, "time_per_iteration": 2.7889180183410645 }, { "auxiliary_loss_clip": 0.01089456, "auxiliary_loss_mlp": 0.0105728, "balance_loss_clip": 1.04471469, "balance_loss_mlp": 1.03935063, "epoch": 0.23592364346911168, "flos": 14246697484800.0, "grad_norm": 4.698975622885271, "language_loss": 0.74874711, "learning_rate": 3.571420177111754e-06, "loss": 0.77021456, "num_input_tokens_seen": 84532315, "step": 3924, "time_per_iteration": 4.272740125656128 }, { "auxiliary_loss_clip": 0.01145554, "auxiliary_loss_mlp": 0.01046876, "balance_loss_clip": 1.05115998, "balance_loss_mlp": 1.030568, "epoch": 0.23598376672177965, "flos": 18587938216320.0, "grad_norm": 2.8676741031402977, "language_loss": 0.82357788, "learning_rate": 3.5711792283071416e-06, "loss": 0.8455022, "num_input_tokens_seen": 84550970, "step": 3925, "time_per_iteration": 2.6825013160705566 }, { "auxiliary_loss_clip": 0.0112035, "auxiliary_loss_mlp": 0.01048071, "balance_loss_clip": 1.04567564, "balance_loss_mlp": 1.0315721, "epoch": 0.2360438899744476, "flos": 22675542036480.0, "grad_norm": 1.5755651433289561, "language_loss": 0.59533024, "learning_rate": 3.5709382199232564e-06, "loss": 0.61701441, "num_input_tokens_seen": 84571655, "step": 3926, "time_per_iteration": 2.6960842609405518 }, { "auxiliary_loss_clip": 0.01125496, "auxiliary_loss_mlp": 0.01046163, "balance_loss_clip": 1.04914129, "balance_loss_mlp": 1.0302484, "epoch": 0.23610401322711558, "flos": 29570139014400.0, "grad_norm": 2.4179456581838212, "language_loss": 0.7155292, "learning_rate": 3.570697151969235e-06, "loss": 0.7372458, "num_input_tokens_seen": 84593130, "step": 3927, "time_per_iteration": 2.786576986312866 }, { "auxiliary_loss_clip": 0.01120941, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.04764938, "balance_loss_mlp": 1.03125572, "epoch": 0.23616413647978354, "flos": 17858520731520.0, "grad_norm": 1.9380358164668718, "language_loss": 0.74792278, "learning_rate": 3.570456024454221e-06, "loss": 0.76960224, "num_input_tokens_seen": 84612410, "step": 3928, "time_per_iteration": 4.450765609741211 }, { "auxiliary_loss_clip": 0.01118656, "auxiliary_loss_mlp": 0.01047112, "balance_loss_clip": 1.04935324, "balance_loss_mlp": 1.02949333, "epoch": 0.23622425973245154, "flos": 11034847157760.0, "grad_norm": 4.3448767989564745, "language_loss": 0.81905198, "learning_rate": 3.5702148373873576e-06, "loss": 0.84070963, "num_input_tokens_seen": 84627610, "step": 3929, "time_per_iteration": 2.654085874557495 }, { "auxiliary_loss_clip": 0.01151721, "auxiliary_loss_mlp": 0.0105167, "balance_loss_clip": 1.05143714, "balance_loss_mlp": 1.03314447, "epoch": 0.2362843829851195, "flos": 23404061681280.0, "grad_norm": 3.048788180104446, "language_loss": 0.72323942, "learning_rate": 3.569973590777789e-06, "loss": 0.74527335, "num_input_tokens_seen": 84648415, "step": 3930, "time_per_iteration": 2.67429780960083 }, { "auxiliary_loss_clip": 0.01143652, "auxiliary_loss_mlp": 0.01036151, "balance_loss_clip": 1.04880345, "balance_loss_mlp": 1.01985574, "epoch": 0.23634450623778747, "flos": 39529855261440.0, "grad_norm": 2.7450987997323333, "language_loss": 0.74105632, "learning_rate": 3.569732284634665e-06, "loss": 0.76285434, "num_input_tokens_seen": 84670080, "step": 3931, "time_per_iteration": 2.8017847537994385 }, { "auxiliary_loss_clip": 0.01137617, "auxiliary_loss_mlp": 0.01046002, "balance_loss_clip": 1.05250037, "balance_loss_mlp": 1.02853799, "epoch": 0.23640462949045543, "flos": 24207167917440.0, "grad_norm": 2.2419024865888852, "language_loss": 0.8018778, "learning_rate": 3.569490918967136e-06, "loss": 0.82371396, "num_input_tokens_seen": 84686465, "step": 3932, "time_per_iteration": 2.6295793056488037 }, { "auxiliary_loss_clip": 0.01108498, "auxiliary_loss_mlp": 0.0104053, "balance_loss_clip": 1.04981244, "balance_loss_mlp": 1.02614117, "epoch": 0.2364647527431234, "flos": 26177622255360.0, "grad_norm": 2.247824561482015, "language_loss": 0.85683465, "learning_rate": 3.5692494937843537e-06, "loss": 0.87832487, "num_input_tokens_seen": 84708825, "step": 3933, "time_per_iteration": 2.7401201725006104 }, { "auxiliary_loss_clip": 0.01101933, "auxiliary_loss_mlp": 0.010512, "balance_loss_clip": 1.04680276, "balance_loss_mlp": 1.03112483, "epoch": 0.23652487599579136, "flos": 22637009721600.0, "grad_norm": 2.0287283132247547, "language_loss": 0.83179402, "learning_rate": 3.5690080090954727e-06, "loss": 0.85332537, "num_input_tokens_seen": 84726165, "step": 3934, "time_per_iteration": 2.8152921199798584 }, { "auxiliary_loss_clip": 0.01148508, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.05208373, "balance_loss_mlp": 1.02556968, "epoch": 0.23658499924845935, "flos": 21762261809280.0, "grad_norm": 1.8368151879100059, "language_loss": 0.78513408, "learning_rate": 3.5687664649096515e-06, "loss": 0.80704081, "num_input_tokens_seen": 84745815, "step": 3935, "time_per_iteration": 2.6769750118255615 }, { "auxiliary_loss_clip": 0.01134595, "auxiliary_loss_mlp": 0.01034926, "balance_loss_clip": 1.05270088, "balance_loss_mlp": 1.01891589, "epoch": 0.23664512250112732, "flos": 21798998444160.0, "grad_norm": 1.5615220666884744, "language_loss": 0.79614085, "learning_rate": 3.5685248612360487e-06, "loss": 0.81783605, "num_input_tokens_seen": 84765415, "step": 3936, "time_per_iteration": 2.7037193775177 }, { "auxiliary_loss_clip": 0.01126163, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.04967618, "balance_loss_mlp": 1.01779902, "epoch": 0.23670524575379528, "flos": 22637871648000.0, "grad_norm": 1.671201383656535, "language_loss": 0.7915628, "learning_rate": 3.568283198083826e-06, "loss": 0.81317174, "num_input_tokens_seen": 84787080, "step": 3937, "time_per_iteration": 2.7639834880828857 }, { "auxiliary_loss_clip": 0.01134519, "auxiliary_loss_mlp": 0.01038533, "balance_loss_clip": 1.05320358, "balance_loss_mlp": 1.02313685, "epoch": 0.23676536900646325, "flos": 16725000263040.0, "grad_norm": 1.8758026172480324, "language_loss": 0.85389286, "learning_rate": 3.568041475462147e-06, "loss": 0.8756234, "num_input_tokens_seen": 84805395, "step": 3938, "time_per_iteration": 2.6919057369232178 }, { "auxiliary_loss_clip": 0.01145522, "auxiliary_loss_mlp": 0.01047488, "balance_loss_clip": 1.05159402, "balance_loss_mlp": 1.03076303, "epoch": 0.23682549225913122, "flos": 11135611785600.0, "grad_norm": 4.660879571039018, "language_loss": 0.9365679, "learning_rate": 3.5677996933801785e-06, "loss": 0.958498, "num_input_tokens_seen": 84818090, "step": 3939, "time_per_iteration": 2.7249948978424072 }, { "auxiliary_loss_clip": 0.01149288, "auxiliary_loss_mlp": 0.01041833, "balance_loss_clip": 1.0512023, "balance_loss_mlp": 1.02463138, "epoch": 0.23688561551179918, "flos": 22559226819840.0, "grad_norm": 1.884439522765895, "language_loss": 0.82347792, "learning_rate": 3.567557851847088e-06, "loss": 0.84538913, "num_input_tokens_seen": 84837695, "step": 3940, "time_per_iteration": 2.666647434234619 }, { "auxiliary_loss_clip": 0.01128412, "auxiliary_loss_mlp": 0.00775407, "balance_loss_clip": 1.05063081, "balance_loss_mlp": 1.00109661, "epoch": 0.23694573876446715, "flos": 18514895909760.0, "grad_norm": 2.7155330970608214, "language_loss": 0.88959104, "learning_rate": 3.5673159508720464e-06, "loss": 0.90862918, "num_input_tokens_seen": 84854630, "step": 3941, "time_per_iteration": 2.6898627281188965 }, { "auxiliary_loss_clip": 0.01147095, "auxiliary_loss_mlp": 0.01040548, "balance_loss_clip": 1.04976177, "balance_loss_mlp": 1.0227741, "epoch": 0.23700586201713514, "flos": 15335723980800.0, "grad_norm": 2.436898535695529, "language_loss": 0.8484506, "learning_rate": 3.5670739904642274e-06, "loss": 0.870327, "num_input_tokens_seen": 84871805, "step": 3942, "time_per_iteration": 2.560166835784912 }, { "auxiliary_loss_clip": 0.01109105, "auxiliary_loss_mlp": 0.01042997, "balance_loss_clip": 1.04736543, "balance_loss_mlp": 1.02447248, "epoch": 0.2370659852698031, "flos": 23947605262080.0, "grad_norm": 1.9848651824816348, "language_loss": 0.81126499, "learning_rate": 3.5668319706328065e-06, "loss": 0.83278596, "num_input_tokens_seen": 84889815, "step": 3943, "time_per_iteration": 2.7389075756073 }, { "auxiliary_loss_clip": 0.01114013, "auxiliary_loss_mlp": 0.01044642, "balance_loss_clip": 1.0464983, "balance_loss_mlp": 1.02618814, "epoch": 0.23712610852247107, "flos": 15332527670400.0, "grad_norm": 2.1611381488400143, "language_loss": 0.67060351, "learning_rate": 3.566589891386959e-06, "loss": 0.69219005, "num_input_tokens_seen": 84904380, "step": 3944, "time_per_iteration": 2.6382999420166016 }, { "auxiliary_loss_clip": 0.01117531, "auxiliary_loss_mlp": 0.01038157, "balance_loss_clip": 1.04629564, "balance_loss_mlp": 1.02003753, "epoch": 0.23718623177513903, "flos": 19682567233920.0, "grad_norm": 1.9578725621632602, "language_loss": 0.75573617, "learning_rate": 3.566347752735866e-06, "loss": 0.77729309, "num_input_tokens_seen": 84922935, "step": 3945, "time_per_iteration": 2.678377628326416 }, { "auxiliary_loss_clip": 0.01128604, "auxiliary_loss_mlp": 0.01039043, "balance_loss_clip": 1.0493716, "balance_loss_mlp": 1.02255654, "epoch": 0.237246355027807, "flos": 24973322037120.0, "grad_norm": 1.4378865328543082, "language_loss": 0.63750178, "learning_rate": 3.5661055546887094e-06, "loss": 0.65917826, "num_input_tokens_seen": 84943685, "step": 3946, "time_per_iteration": 2.77178955078125 }, { "auxiliary_loss_clip": 0.01130702, "auxiliary_loss_mlp": 0.01036796, "balance_loss_clip": 1.0460459, "balance_loss_mlp": 1.0186162, "epoch": 0.23730647828047496, "flos": 15377416692480.0, "grad_norm": 2.53957699605931, "language_loss": 0.77666485, "learning_rate": 3.5658632972546734e-06, "loss": 0.79833984, "num_input_tokens_seen": 84959505, "step": 3947, "time_per_iteration": 2.65461802482605 }, { "auxiliary_loss_clip": 0.01145835, "auxiliary_loss_mlp": 0.01040502, "balance_loss_clip": 1.0566994, "balance_loss_mlp": 1.02299047, "epoch": 0.23736660153314296, "flos": 28150662372480.0, "grad_norm": 2.0053805098120123, "language_loss": 0.80706096, "learning_rate": 3.565620980442944e-06, "loss": 0.82892442, "num_input_tokens_seen": 84982130, "step": 3948, "time_per_iteration": 2.756716012954712 }, { "auxiliary_loss_clip": 0.01129664, "auxiliary_loss_mlp": 0.01044051, "balance_loss_clip": 1.05104828, "balance_loss_mlp": 1.02643192, "epoch": 0.23742672478581092, "flos": 22086570729600.0, "grad_norm": 2.5980612684471374, "language_loss": 0.80257607, "learning_rate": 3.5653786042627107e-06, "loss": 0.82431316, "num_input_tokens_seen": 85000640, "step": 3949, "time_per_iteration": 2.74457049369812 }, { "auxiliary_loss_clip": 0.0112363, "auxiliary_loss_mlp": 0.01038665, "balance_loss_clip": 1.04977036, "balance_loss_mlp": 1.02109337, "epoch": 0.2374868480384789, "flos": 19537093152000.0, "grad_norm": 2.0592081961125093, "language_loss": 0.73239946, "learning_rate": 3.565136168723163e-06, "loss": 0.75402236, "num_input_tokens_seen": 85018970, "step": 3950, "time_per_iteration": 2.650508165359497 }, { "auxiliary_loss_clip": 0.01145426, "auxiliary_loss_mlp": 0.01037947, "balance_loss_clip": 1.05055118, "balance_loss_mlp": 1.02204442, "epoch": 0.23754697129114685, "flos": 19422501788160.0, "grad_norm": 1.9969465766046124, "language_loss": 0.72794384, "learning_rate": 3.564893673833495e-06, "loss": 0.74977756, "num_input_tokens_seen": 85035905, "step": 3951, "time_per_iteration": 2.652399778366089 }, { "auxiliary_loss_clip": 0.01122477, "auxiliary_loss_mlp": 0.01039445, "balance_loss_clip": 1.05080223, "balance_loss_mlp": 1.0216229, "epoch": 0.23760709454381482, "flos": 19501002961920.0, "grad_norm": 3.398248459712791, "language_loss": 0.73703241, "learning_rate": 3.564651119602903e-06, "loss": 0.75865161, "num_input_tokens_seen": 85054560, "step": 3952, "time_per_iteration": 2.7522144317626953 }, { "auxiliary_loss_clip": 0.01100804, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.04366636, "balance_loss_mlp": 1.02566266, "epoch": 0.23766721779648278, "flos": 27636600879360.0, "grad_norm": 1.7524267936836437, "language_loss": 0.71314329, "learning_rate": 3.564408506040583e-06, "loss": 0.73457694, "num_input_tokens_seen": 85074425, "step": 3953, "time_per_iteration": 2.7846672534942627 }, { "auxiliary_loss_clip": 0.01151909, "auxiliary_loss_mlp": 0.01047443, "balance_loss_clip": 1.05282676, "balance_loss_mlp": 1.02854872, "epoch": 0.23772734104915075, "flos": 23404348990080.0, "grad_norm": 1.9722222736847754, "language_loss": 0.81792426, "learning_rate": 3.5641658331557356e-06, "loss": 0.83991784, "num_input_tokens_seen": 85092865, "step": 3954, "time_per_iteration": 2.6262643337249756 }, { "auxiliary_loss_clip": 0.01127802, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05239391, "balance_loss_mlp": 1.02616453, "epoch": 0.23778746430181874, "flos": 15705496540800.0, "grad_norm": 2.2607510345904824, "language_loss": 0.66270143, "learning_rate": 3.5639231009575634e-06, "loss": 0.68442386, "num_input_tokens_seen": 85110175, "step": 3955, "time_per_iteration": 2.672151803970337 }, { "auxiliary_loss_clip": 0.01149182, "auxiliary_loss_mlp": 0.0104812, "balance_loss_clip": 1.05219805, "balance_loss_mlp": 1.03104961, "epoch": 0.2378475875544867, "flos": 19426452284160.0, "grad_norm": 1.4117933502593074, "language_loss": 0.83963013, "learning_rate": 3.5636803094552704e-06, "loss": 0.86160314, "num_input_tokens_seen": 85129925, "step": 3956, "time_per_iteration": 2.6483681201934814 }, { "auxiliary_loss_clip": 0.01103304, "auxiliary_loss_mlp": 0.01042938, "balance_loss_clip": 1.04726648, "balance_loss_mlp": 1.02556944, "epoch": 0.23790771080715467, "flos": 22268565964800.0, "grad_norm": 2.308539718278817, "language_loss": 0.8482393, "learning_rate": 3.5634374586580635e-06, "loss": 0.86970174, "num_input_tokens_seen": 85147755, "step": 3957, "time_per_iteration": 2.718961715698242 }, { "auxiliary_loss_clip": 0.01087747, "auxiliary_loss_mlp": 0.01039974, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.02428651, "epoch": 0.23796783405982264, "flos": 20047311889920.0, "grad_norm": 2.068360920278316, "language_loss": 0.70373344, "learning_rate": 3.563194548575151e-06, "loss": 0.72501063, "num_input_tokens_seen": 85165270, "step": 3958, "time_per_iteration": 2.818115472793579 }, { "auxiliary_loss_clip": 0.01102632, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.04540312, "balance_loss_mlp": 1.02276158, "epoch": 0.2380279573124906, "flos": 14245943299200.0, "grad_norm": 2.474231994209954, "language_loss": 0.66273189, "learning_rate": 3.562951579215745e-06, "loss": 0.68417823, "num_input_tokens_seen": 85181555, "step": 3959, "time_per_iteration": 2.71085786819458 }, { "auxiliary_loss_clip": 0.01103257, "auxiliary_loss_mlp": 0.01044748, "balance_loss_clip": 1.04910731, "balance_loss_mlp": 1.02760553, "epoch": 0.23808808056515857, "flos": 21179180332800.0, "grad_norm": 1.922923950627842, "language_loss": 0.72140026, "learning_rate": 3.5627085505890586e-06, "loss": 0.74288028, "num_input_tokens_seen": 85199455, "step": 3960, "time_per_iteration": 2.724398612976074 }, { "auxiliary_loss_clip": 0.01065725, "auxiliary_loss_mlp": 0.01041352, "balance_loss_clip": 1.04778433, "balance_loss_mlp": 1.02385175, "epoch": 0.23814820381782653, "flos": 22528308188160.0, "grad_norm": 1.836282299199184, "language_loss": 0.74303818, "learning_rate": 3.562465462704307e-06, "loss": 0.76410902, "num_input_tokens_seen": 85219170, "step": 3961, "time_per_iteration": 4.592544794082642 }, { "auxiliary_loss_clip": 0.01149701, "auxiliary_loss_mlp": 0.010511, "balance_loss_clip": 1.05083704, "balance_loss_mlp": 1.0321815, "epoch": 0.23820832707049452, "flos": 22304332932480.0, "grad_norm": 1.6798300631958207, "language_loss": 0.6562922, "learning_rate": 3.5622223155707085e-06, "loss": 0.67830026, "num_input_tokens_seen": 85238480, "step": 3962, "time_per_iteration": 4.40812087059021 }, { "auxiliary_loss_clip": 0.01121684, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.04743505, "balance_loss_mlp": 1.02511263, "epoch": 0.2382684503231625, "flos": 24864225454080.0, "grad_norm": 1.838705722688445, "language_loss": 0.74284148, "learning_rate": 3.561979109197483e-06, "loss": 0.76448429, "num_input_tokens_seen": 85259180, "step": 3963, "time_per_iteration": 2.7173969745635986 }, { "auxiliary_loss_clip": 0.01120014, "auxiliary_loss_mlp": 0.01045721, "balance_loss_clip": 1.0530858, "balance_loss_mlp": 1.02756512, "epoch": 0.23832857357583045, "flos": 21871609787520.0, "grad_norm": 2.045875790034744, "language_loss": 0.77264321, "learning_rate": 3.5617358435938538e-06, "loss": 0.79430056, "num_input_tokens_seen": 85278550, "step": 3964, "time_per_iteration": 4.25124716758728 }, { "auxiliary_loss_clip": 0.01108604, "auxiliary_loss_mlp": 0.01048343, "balance_loss_clip": 1.04783297, "balance_loss_mlp": 1.03124809, "epoch": 0.23838869682849842, "flos": 21288061434240.0, "grad_norm": 2.3097885565999894, "language_loss": 0.71521109, "learning_rate": 3.561492518769045e-06, "loss": 0.73678052, "num_input_tokens_seen": 85297345, "step": 3965, "time_per_iteration": 2.757647752761841 }, { "auxiliary_loss_clip": 0.01115176, "auxiliary_loss_mlp": 0.01043319, "balance_loss_clip": 1.04632521, "balance_loss_mlp": 1.02647483, "epoch": 0.23844882008116638, "flos": 16180594755840.0, "grad_norm": 2.673966650516871, "language_loss": 0.78003007, "learning_rate": 3.561249134732282e-06, "loss": 0.801615, "num_input_tokens_seen": 85315105, "step": 3966, "time_per_iteration": 2.71159291267395 }, { "auxiliary_loss_clip": 0.01124693, "auxiliary_loss_mlp": 0.01045448, "balance_loss_clip": 1.05071902, "balance_loss_mlp": 1.02899134, "epoch": 0.23850894333383435, "flos": 21069724613760.0, "grad_norm": 2.116401462724705, "language_loss": 0.68767631, "learning_rate": 3.561005691492797e-06, "loss": 0.70937771, "num_input_tokens_seen": 85334735, "step": 3967, "time_per_iteration": 2.7072744369506836 }, { "auxiliary_loss_clip": 0.01116174, "auxiliary_loss_mlp": 0.01055757, "balance_loss_clip": 1.04883289, "balance_loss_mlp": 1.03803015, "epoch": 0.23856906658650234, "flos": 17201606849280.0, "grad_norm": 3.581336577718575, "language_loss": 0.68005061, "learning_rate": 3.5607621890598185e-06, "loss": 0.70176995, "num_input_tokens_seen": 85352875, "step": 3968, "time_per_iteration": 4.378219842910767 }, { "auxiliary_loss_clip": 0.01097883, "auxiliary_loss_mlp": 0.01044394, "balance_loss_clip": 1.05052614, "balance_loss_mlp": 1.0274837, "epoch": 0.2386291898391703, "flos": 29494223619840.0, "grad_norm": 2.210255088762028, "language_loss": 0.77106255, "learning_rate": 3.5605186274425823e-06, "loss": 0.79248536, "num_input_tokens_seen": 85372205, "step": 3969, "time_per_iteration": 2.847663164138794 }, { "auxiliary_loss_clip": 0.01121681, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.0498476, "balance_loss_mlp": 1.02334595, "epoch": 0.23868931309183827, "flos": 21142443697920.0, "grad_norm": 2.1326335149840583, "language_loss": 0.7617563, "learning_rate": 3.5602750066503225e-06, "loss": 0.78337121, "num_input_tokens_seen": 85389705, "step": 3970, "time_per_iteration": 2.766862392425537 }, { "auxiliary_loss_clip": 0.01106309, "auxiliary_loss_mlp": 0.01049131, "balance_loss_clip": 1.04287159, "balance_loss_mlp": 1.03111875, "epoch": 0.23874943634450624, "flos": 25659394784640.0, "grad_norm": 2.3319107764636415, "language_loss": 0.85474384, "learning_rate": 3.5600313266922793e-06, "loss": 0.87629819, "num_input_tokens_seen": 85407855, "step": 3971, "time_per_iteration": 2.7597670555114746 }, { "auxiliary_loss_clip": 0.01062507, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.03465796, "balance_loss_mlp": 1.03661716, "epoch": 0.2388095595971742, "flos": 58986618624000.0, "grad_norm": 0.7451796217314707, "language_loss": 0.62797832, "learning_rate": 3.5597875875776915e-06, "loss": 0.6489948, "num_input_tokens_seen": 85470885, "step": 3972, "time_per_iteration": 3.2572779655456543 }, { "auxiliary_loss_clip": 0.0112174, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.0492239, "balance_loss_mlp": 1.02109838, "epoch": 0.23886968284984217, "flos": 16800341040000.0, "grad_norm": 1.9449657433446057, "language_loss": 0.82093811, "learning_rate": 3.5595437893158013e-06, "loss": 0.84253484, "num_input_tokens_seen": 85488460, "step": 3973, "time_per_iteration": 2.6394145488739014 }, { "auxiliary_loss_clip": 0.01115852, "auxiliary_loss_mlp": 0.01050239, "balance_loss_clip": 1.04884124, "balance_loss_mlp": 1.03272736, "epoch": 0.23892980610251013, "flos": 22382654538240.0, "grad_norm": 1.5639820592628684, "language_loss": 0.79418832, "learning_rate": 3.5592999319158546e-06, "loss": 0.81584924, "num_input_tokens_seen": 85508590, "step": 3974, "time_per_iteration": 2.6926944255828857 }, { "auxiliary_loss_clip": 0.01134012, "auxiliary_loss_mlp": 0.01042703, "balance_loss_clip": 1.05169725, "balance_loss_mlp": 1.02475047, "epoch": 0.23898992935517813, "flos": 12823198519680.0, "grad_norm": 1.8382350241534648, "language_loss": 0.8420803, "learning_rate": 3.5590560153870984e-06, "loss": 0.86384743, "num_input_tokens_seen": 85525970, "step": 3975, "time_per_iteration": 2.6402463912963867 }, { "auxiliary_loss_clip": 0.01126962, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.04938245, "balance_loss_mlp": 1.02545786, "epoch": 0.2390500526078461, "flos": 22345666508160.0, "grad_norm": 2.129124681208868, "language_loss": 0.84249294, "learning_rate": 3.5588120397387816e-06, "loss": 0.864187, "num_input_tokens_seen": 85543700, "step": 3976, "time_per_iteration": 2.624758720397949 }, { "auxiliary_loss_clip": 0.01075224, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.0434798, "balance_loss_mlp": 1.02103186, "epoch": 0.23911017586051406, "flos": 22635142214400.0, "grad_norm": 1.8888081312271703, "language_loss": 0.74451673, "learning_rate": 3.5585680049801566e-06, "loss": 0.76563722, "num_input_tokens_seen": 85562765, "step": 3977, "time_per_iteration": 2.848529815673828 }, { "auxiliary_loss_clip": 0.01151335, "auxiliary_loss_mlp": 0.01045957, "balance_loss_clip": 1.05476987, "balance_loss_mlp": 1.02829063, "epoch": 0.23917029911318202, "flos": 23653281219840.0, "grad_norm": 1.6816446874821869, "language_loss": 0.72515011, "learning_rate": 3.5583239111204764e-06, "loss": 0.74712306, "num_input_tokens_seen": 85581755, "step": 3978, "time_per_iteration": 2.6967527866363525 }, { "auxiliary_loss_clip": 0.01123321, "auxiliary_loss_mlp": 0.01045192, "balance_loss_clip": 1.04713726, "balance_loss_mlp": 1.02802634, "epoch": 0.23923042236585, "flos": 22783597125120.0, "grad_norm": 2.5130493367739413, "language_loss": 0.78474021, "learning_rate": 3.558079758168997e-06, "loss": 0.80642533, "num_input_tokens_seen": 85599455, "step": 3979, "time_per_iteration": 2.6679623126983643 }, { "auxiliary_loss_clip": 0.01123187, "auxiliary_loss_mlp": 0.01052255, "balance_loss_clip": 1.04774463, "balance_loss_mlp": 1.03390861, "epoch": 0.23929054561851795, "flos": 28147717457280.0, "grad_norm": 1.8353092232149775, "language_loss": 0.81943917, "learning_rate": 3.557835546134977e-06, "loss": 0.84119362, "num_input_tokens_seen": 85619970, "step": 3980, "time_per_iteration": 2.7941136360168457 }, { "auxiliary_loss_clip": 0.01094849, "auxiliary_loss_mlp": 0.01037854, "balance_loss_clip": 1.04719615, "balance_loss_mlp": 1.02036595, "epoch": 0.23935066887118592, "flos": 21686525982720.0, "grad_norm": 1.7388406045293963, "language_loss": 0.83562148, "learning_rate": 3.5575912750276775e-06, "loss": 0.85694849, "num_input_tokens_seen": 85638850, "step": 3981, "time_per_iteration": 2.773372173309326 }, { "auxiliary_loss_clip": 0.01126579, "auxiliary_loss_mlp": 0.01045152, "balance_loss_clip": 1.05084574, "balance_loss_mlp": 1.0267818, "epoch": 0.2394107921238539, "flos": 32122274198400.0, "grad_norm": 2.0270942419393676, "language_loss": 0.76690662, "learning_rate": 3.5573469448563607e-06, "loss": 0.78862393, "num_input_tokens_seen": 85656285, "step": 3982, "time_per_iteration": 2.770089864730835 }, { "auxiliary_loss_clip": 0.01107786, "auxiliary_loss_mlp": 0.01043737, "balance_loss_clip": 1.04928303, "balance_loss_mlp": 1.02757215, "epoch": 0.23947091537652188, "flos": 17019180650880.0, "grad_norm": 2.333665248317953, "language_loss": 0.78243405, "learning_rate": 3.5571025556302915e-06, "loss": 0.80394924, "num_input_tokens_seen": 85673020, "step": 3983, "time_per_iteration": 2.8361902236938477 }, { "auxiliary_loss_clip": 0.01136012, "auxiliary_loss_mlp": 0.00775416, "balance_loss_clip": 1.0530262, "balance_loss_mlp": 1.00106907, "epoch": 0.23953103862918984, "flos": 20593584904320.0, "grad_norm": 1.8468424363822287, "language_loss": 0.73274761, "learning_rate": 3.556858107358737e-06, "loss": 0.75186193, "num_input_tokens_seen": 85692565, "step": 3984, "time_per_iteration": 2.720289468765259 }, { "auxiliary_loss_clip": 0.01102619, "auxiliary_loss_mlp": 0.01051209, "balance_loss_clip": 1.04748976, "balance_loss_mlp": 1.0330658, "epoch": 0.2395911618818578, "flos": 20704405340160.0, "grad_norm": 1.906378165207968, "language_loss": 0.79090226, "learning_rate": 3.5566136000509674e-06, "loss": 0.81244051, "num_input_tokens_seen": 85709730, "step": 3985, "time_per_iteration": 2.8464138507843018 }, { "auxiliary_loss_clip": 0.01102898, "auxiliary_loss_mlp": 0.01047238, "balance_loss_clip": 1.04676175, "balance_loss_mlp": 1.02930927, "epoch": 0.23965128513452577, "flos": 27053519402880.0, "grad_norm": 1.780185130038595, "language_loss": 0.73194253, "learning_rate": 3.556369033716254e-06, "loss": 0.7534439, "num_input_tokens_seen": 85730045, "step": 3986, "time_per_iteration": 2.873837471008301 }, { "auxiliary_loss_clip": 0.01143561, "auxiliary_loss_mlp": 0.01052533, "balance_loss_clip": 1.05392861, "balance_loss_mlp": 1.03523529, "epoch": 0.23971140838719374, "flos": 23144319457920.0, "grad_norm": 1.9275946084378768, "language_loss": 0.88014174, "learning_rate": 3.556124408363871e-06, "loss": 0.90210271, "num_input_tokens_seen": 85747590, "step": 3987, "time_per_iteration": 2.778970718383789 }, { "auxiliary_loss_clip": 0.01131181, "auxiliary_loss_mlp": 0.01037226, "balance_loss_clip": 1.05180991, "balance_loss_mlp": 1.02253985, "epoch": 0.23977153163986173, "flos": 18034554309120.0, "grad_norm": 8.94948058332038, "language_loss": 0.82985806, "learning_rate": 3.5558797240030945e-06, "loss": 0.85154212, "num_input_tokens_seen": 85763460, "step": 3988, "time_per_iteration": 2.6707162857055664 }, { "auxiliary_loss_clip": 0.01132219, "auxiliary_loss_mlp": 0.01039377, "balance_loss_clip": 1.04952908, "balance_loss_mlp": 1.02213907, "epoch": 0.2398316548925297, "flos": 18113378705280.0, "grad_norm": 1.6085860818119202, "language_loss": 0.85336304, "learning_rate": 3.5556349806432035e-06, "loss": 0.87507904, "num_input_tokens_seen": 85782050, "step": 3989, "time_per_iteration": 2.644075632095337 }, { "auxiliary_loss_clip": 0.01144734, "auxiliary_loss_mlp": 0.01039049, "balance_loss_clip": 1.05094743, "balance_loss_mlp": 1.02263403, "epoch": 0.23989177814519766, "flos": 12567730014720.0, "grad_norm": 1.981474679784042, "language_loss": 0.84109843, "learning_rate": 3.555390178293477e-06, "loss": 0.86293626, "num_input_tokens_seen": 85797400, "step": 3990, "time_per_iteration": 2.5778160095214844 }, { "auxiliary_loss_clip": 0.01131361, "auxiliary_loss_mlp": 0.01042102, "balance_loss_clip": 1.04863191, "balance_loss_mlp": 1.02565074, "epoch": 0.23995190139786562, "flos": 25264593423360.0, "grad_norm": 1.5352138463261382, "language_loss": 0.75853264, "learning_rate": 3.5551453169631994e-06, "loss": 0.78026724, "num_input_tokens_seen": 85818995, "step": 3991, "time_per_iteration": 2.7569639682769775 }, { "auxiliary_loss_clip": 0.01040828, "auxiliary_loss_mlp": 0.0100398, "balance_loss_clip": 1.02825403, "balance_loss_mlp": 1.00114298, "epoch": 0.2400120246505336, "flos": 61960379650560.0, "grad_norm": 0.8795356934357302, "language_loss": 0.63683558, "learning_rate": 3.554900396661656e-06, "loss": 0.65728366, "num_input_tokens_seen": 85876695, "step": 3992, "time_per_iteration": 3.2559213638305664 }, { "auxiliary_loss_clip": 0.01055123, "auxiliary_loss_mlp": 0.01005737, "balance_loss_clip": 1.02834392, "balance_loss_mlp": 1.00292385, "epoch": 0.24007214790320155, "flos": 66708560540160.0, "grad_norm": 0.7639831296699208, "language_loss": 0.6297875, "learning_rate": 3.5546554173981334e-06, "loss": 0.65039611, "num_input_tokens_seen": 85940990, "step": 3993, "time_per_iteration": 3.2946221828460693 }, { "auxiliary_loss_clip": 0.0110983, "auxiliary_loss_mlp": 0.01048609, "balance_loss_clip": 1.05077267, "balance_loss_mlp": 1.03078759, "epoch": 0.24013227115586952, "flos": 25809070757760.0, "grad_norm": 1.7227387633537015, "language_loss": 0.7656548, "learning_rate": 3.5544103791819218e-06, "loss": 0.78723919, "num_input_tokens_seen": 85961165, "step": 3994, "time_per_iteration": 2.7735466957092285 }, { "auxiliary_loss_clip": 0.01120115, "auxiliary_loss_mlp": 0.01051235, "balance_loss_clip": 1.04648936, "balance_loss_mlp": 1.0323168, "epoch": 0.2401923944085375, "flos": 25557480921600.0, "grad_norm": 1.7819538389347498, "language_loss": 0.78550023, "learning_rate": 3.5541652820223124e-06, "loss": 0.80721372, "num_input_tokens_seen": 85982710, "step": 3995, "time_per_iteration": 2.8184118270874023 }, { "auxiliary_loss_clip": 0.01034, "auxiliary_loss_mlp": 0.01026353, "balance_loss_clip": 1.02876425, "balance_loss_mlp": 1.0237658, "epoch": 0.24025251766120548, "flos": 54941138478720.0, "grad_norm": 0.9088717203971356, "language_loss": 0.6345036, "learning_rate": 3.5539201259286006e-06, "loss": 0.65510708, "num_input_tokens_seen": 86046935, "step": 3996, "time_per_iteration": 3.304704189300537 }, { "auxiliary_loss_clip": 0.01122635, "auxiliary_loss_mlp": 0.01046678, "balance_loss_clip": 1.04812241, "balance_loss_mlp": 1.02960706, "epoch": 0.24031264091387344, "flos": 20631075724800.0, "grad_norm": 2.5673853359086403, "language_loss": 0.69455099, "learning_rate": 3.5536749109100808e-06, "loss": 0.7162441, "num_input_tokens_seen": 86064355, "step": 3997, "time_per_iteration": 2.6638269424438477 }, { "auxiliary_loss_clip": 0.01136246, "auxiliary_loss_mlp": 0.01041204, "balance_loss_clip": 1.0500989, "balance_loss_mlp": 1.02390659, "epoch": 0.2403727641665414, "flos": 20886256920960.0, "grad_norm": 1.9944619018673675, "language_loss": 0.87352818, "learning_rate": 3.5534296369760535e-06, "loss": 0.89530265, "num_input_tokens_seen": 86081340, "step": 3998, "time_per_iteration": 2.6837756633758545 }, { "auxiliary_loss_clip": 0.01126262, "auxiliary_loss_mlp": 0.01038814, "balance_loss_clip": 1.04337883, "balance_loss_mlp": 1.02173114, "epoch": 0.24043288741920937, "flos": 22820046451200.0, "grad_norm": 1.5798261831400109, "language_loss": 0.75723118, "learning_rate": 3.5531843041358183e-06, "loss": 0.77888191, "num_input_tokens_seen": 86102260, "step": 3999, "time_per_iteration": 2.659717321395874 }, { "auxiliary_loss_clip": 0.01116532, "auxiliary_loss_mlp": 0.01049627, "balance_loss_clip": 1.04679537, "balance_loss_mlp": 1.03259242, "epoch": 0.24049301067187734, "flos": 27959652823680.0, "grad_norm": 2.380373207595884, "language_loss": 0.72602308, "learning_rate": 3.552938912398679e-06, "loss": 0.74768472, "num_input_tokens_seen": 86123400, "step": 4000, "time_per_iteration": 4.285717487335205 }, { "auxiliary_loss_clip": 0.01138397, "auxiliary_loss_mlp": 0.01040819, "balance_loss_clip": 1.05207551, "balance_loss_mlp": 1.02389169, "epoch": 0.24055313392454533, "flos": 27451409333760.0, "grad_norm": 2.3105318706157862, "language_loss": 0.67128104, "learning_rate": 3.5526934617739397e-06, "loss": 0.69307321, "num_input_tokens_seen": 86144060, "step": 4001, "time_per_iteration": 4.2180609703063965 }, { "auxiliary_loss_clip": 0.01144863, "auxiliary_loss_mlp": 0.01043304, "balance_loss_clip": 1.04859209, "balance_loss_mlp": 1.02525568, "epoch": 0.2406132571772133, "flos": 25556618995200.0, "grad_norm": 2.360624564793828, "language_loss": 0.82895994, "learning_rate": 3.5524479522709095e-06, "loss": 0.85084158, "num_input_tokens_seen": 86163005, "step": 4002, "time_per_iteration": 2.6369640827178955 }, { "auxiliary_loss_clip": 0.01106477, "auxiliary_loss_mlp": 0.01045072, "balance_loss_clip": 1.0493201, "balance_loss_mlp": 1.0283823, "epoch": 0.24067338042988126, "flos": 24791398629120.0, "grad_norm": 2.016027139567785, "language_loss": 0.83058953, "learning_rate": 3.552202383898897e-06, "loss": 0.85210502, "num_input_tokens_seen": 86182580, "step": 4003, "time_per_iteration": 4.312098979949951 }, { "auxiliary_loss_clip": 0.01114745, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.0474503, "balance_loss_mlp": 1.02458131, "epoch": 0.24073350368254923, "flos": 21177923356800.0, "grad_norm": 1.971328156333658, "language_loss": 0.8672772, "learning_rate": 3.551956756667215e-06, "loss": 0.8888458, "num_input_tokens_seen": 86200665, "step": 4004, "time_per_iteration": 2.646578311920166 }, { "auxiliary_loss_clip": 0.01115631, "auxiliary_loss_mlp": 0.01054344, "balance_loss_clip": 1.04529011, "balance_loss_mlp": 1.03736866, "epoch": 0.2407936269352172, "flos": 22494300986880.0, "grad_norm": 1.9965130860947515, "language_loss": 0.78239757, "learning_rate": 3.551711070585177e-06, "loss": 0.80409735, "num_input_tokens_seen": 86221640, "step": 4005, "time_per_iteration": 2.7220566272735596 }, { "auxiliary_loss_clip": 0.01090518, "auxiliary_loss_mlp": 0.01039515, "balance_loss_clip": 1.04414058, "balance_loss_mlp": 1.02164578, "epoch": 0.24085375018788516, "flos": 18551129754240.0, "grad_norm": 1.6390993289809686, "language_loss": 0.79391652, "learning_rate": 3.5514653256620995e-06, "loss": 0.8152169, "num_input_tokens_seen": 86240795, "step": 4006, "time_per_iteration": 2.7188642024993896 }, { "auxiliary_loss_clip": 0.01130191, "auxiliary_loss_mlp": 0.00777161, "balance_loss_clip": 1.0482645, "balance_loss_mlp": 1.00115335, "epoch": 0.24091387344055312, "flos": 24170539023360.0, "grad_norm": 1.6765272633695874, "language_loss": 0.71939242, "learning_rate": 3.551219521907302e-06, "loss": 0.73846585, "num_input_tokens_seen": 86262000, "step": 4007, "time_per_iteration": 4.3504638671875 }, { "auxiliary_loss_clip": 0.01101925, "auxiliary_loss_mlp": 0.01047677, "balance_loss_clip": 1.04589975, "balance_loss_mlp": 1.03132153, "epoch": 0.24097399669322112, "flos": 11036319615360.0, "grad_norm": 1.6891966370612705, "language_loss": 0.76460171, "learning_rate": 3.5509736593301042e-06, "loss": 0.78609765, "num_input_tokens_seen": 86279680, "step": 4008, "time_per_iteration": 2.700744152069092 }, { "auxiliary_loss_clip": 0.01136495, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.05069256, "balance_loss_mlp": 1.02192402, "epoch": 0.24103411994588908, "flos": 17165085696000.0, "grad_norm": 2.427830882471808, "language_loss": 0.74601823, "learning_rate": 3.5507277379398295e-06, "loss": 0.76777172, "num_input_tokens_seen": 86297180, "step": 4009, "time_per_iteration": 2.6175808906555176 }, { "auxiliary_loss_clip": 0.01134079, "auxiliary_loss_mlp": 0.01041957, "balance_loss_clip": 1.05032861, "balance_loss_mlp": 1.02532756, "epoch": 0.24109424319855705, "flos": 20667956014080.0, "grad_norm": 1.6643292794637636, "language_loss": 0.80064976, "learning_rate": 3.550481757745804e-06, "loss": 0.82241005, "num_input_tokens_seen": 86317660, "step": 4010, "time_per_iteration": 2.680511236190796 }, { "auxiliary_loss_clip": 0.01118599, "auxiliary_loss_mlp": 0.01047241, "balance_loss_clip": 1.04658401, "balance_loss_mlp": 1.02779818, "epoch": 0.241154366451225, "flos": 28181796485760.0, "grad_norm": 3.8737422865874245, "language_loss": 0.70889425, "learning_rate": 3.5502357187573555e-06, "loss": 0.73055267, "num_input_tokens_seen": 86338325, "step": 4011, "time_per_iteration": 2.716404676437378 }, { "auxiliary_loss_clip": 0.01065208, "auxiliary_loss_mlp": 0.01047099, "balance_loss_clip": 1.0414176, "balance_loss_mlp": 1.02802527, "epoch": 0.24121448970389298, "flos": 21689722293120.0, "grad_norm": 1.675052333388822, "language_loss": 0.69279736, "learning_rate": 3.5499896209838118e-06, "loss": 0.71392041, "num_input_tokens_seen": 86357615, "step": 4012, "time_per_iteration": 2.804694890975952 }, { "auxiliary_loss_clip": 0.01138123, "auxiliary_loss_mlp": 0.0104149, "balance_loss_clip": 1.05126536, "balance_loss_mlp": 1.02213097, "epoch": 0.24127461295656094, "flos": 39676191269760.0, "grad_norm": 1.5084253296098848, "language_loss": 0.732813, "learning_rate": 3.5497434644345073e-06, "loss": 0.75460911, "num_input_tokens_seen": 86380355, "step": 4013, "time_per_iteration": 2.8192849159240723 }, { "auxiliary_loss_clip": 0.01148497, "auxiliary_loss_mlp": 0.01037798, "balance_loss_clip": 1.05201018, "balance_loss_mlp": 1.02044141, "epoch": 0.2413347362092289, "flos": 19135863256320.0, "grad_norm": 1.8372553923739565, "language_loss": 0.88272971, "learning_rate": 3.5494972491187753e-06, "loss": 0.90459263, "num_input_tokens_seen": 86399125, "step": 4014, "time_per_iteration": 2.6029160022735596 }, { "auxiliary_loss_clip": 0.0111397, "auxiliary_loss_mlp": 0.01046282, "balance_loss_clip": 1.04315281, "balance_loss_mlp": 1.0278163, "epoch": 0.2413948594618969, "flos": 26939430829440.0, "grad_norm": 1.9589493379590102, "language_loss": 0.94862974, "learning_rate": 3.549250975045952e-06, "loss": 0.97023225, "num_input_tokens_seen": 86418625, "step": 4015, "time_per_iteration": 2.6958773136138916 }, { "auxiliary_loss_clip": 0.01120117, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.04570341, "balance_loss_mlp": 1.02331638, "epoch": 0.24145498271456486, "flos": 25228108183680.0, "grad_norm": 1.5486712647521637, "language_loss": 0.8271699, "learning_rate": 3.5490046422253768e-06, "loss": 0.84878188, "num_input_tokens_seen": 86438375, "step": 4016, "time_per_iteration": 2.7045071125030518 }, { "auxiliary_loss_clip": 0.01098573, "auxiliary_loss_mlp": 0.01045564, "balance_loss_clip": 1.04334974, "balance_loss_mlp": 1.02838039, "epoch": 0.24151510596723283, "flos": 40661759617920.0, "grad_norm": 1.8022012115417119, "language_loss": 0.69207114, "learning_rate": 3.54875825066639e-06, "loss": 0.71351254, "num_input_tokens_seen": 86463230, "step": 4017, "time_per_iteration": 2.8596649169921875 }, { "auxiliary_loss_clip": 0.01141299, "auxiliary_loss_mlp": 0.01051243, "balance_loss_clip": 1.05106175, "balance_loss_mlp": 1.03278995, "epoch": 0.2415752292199008, "flos": 18146667634560.0, "grad_norm": 1.6419835865444041, "language_loss": 0.84953403, "learning_rate": 3.5485118003783353e-06, "loss": 0.87145936, "num_input_tokens_seen": 86481230, "step": 4018, "time_per_iteration": 2.627629518508911 }, { "auxiliary_loss_clip": 0.01046489, "auxiliary_loss_mlp": 0.01014362, "balance_loss_clip": 1.02139664, "balance_loss_mlp": 1.01140559, "epoch": 0.24163535247256876, "flos": 67288409792640.0, "grad_norm": 0.8221446343976555, "language_loss": 0.60642469, "learning_rate": 3.548265291370558e-06, "loss": 0.62703323, "num_input_tokens_seen": 86541260, "step": 4019, "time_per_iteration": 3.269498586654663 }, { "auxiliary_loss_clip": 0.01114983, "auxiliary_loss_mlp": 0.01049089, "balance_loss_clip": 1.04582107, "balance_loss_mlp": 1.0312674, "epoch": 0.24169547572523672, "flos": 24929941386240.0, "grad_norm": 1.8826005215725077, "language_loss": 0.73324752, "learning_rate": 3.5480187236524055e-06, "loss": 0.75488818, "num_input_tokens_seen": 86559580, "step": 4020, "time_per_iteration": 2.7341055870056152 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01040515, "balance_loss_clip": 1.04833841, "balance_loss_mlp": 1.02315772, "epoch": 0.24175559897790472, "flos": 18728312567040.0, "grad_norm": 1.7964731743776612, "language_loss": 0.81617332, "learning_rate": 3.5477720972332285e-06, "loss": 0.83768916, "num_input_tokens_seen": 86577560, "step": 4021, "time_per_iteration": 2.7154345512390137 }, { "auxiliary_loss_clip": 0.01149117, "auxiliary_loss_mlp": 0.01050015, "balance_loss_clip": 1.04972911, "balance_loss_mlp": 1.03070307, "epoch": 0.24181572223057268, "flos": 23039281111680.0, "grad_norm": 2.078765142897874, "language_loss": 0.76601863, "learning_rate": 3.547525412122378e-06, "loss": 0.78800994, "num_input_tokens_seen": 86595350, "step": 4022, "time_per_iteration": 2.622262716293335 }, { "auxiliary_loss_clip": 0.01102927, "auxiliary_loss_mlp": 0.01053151, "balance_loss_clip": 1.042714, "balance_loss_mlp": 1.03271914, "epoch": 0.24187584548324065, "flos": 20376145923840.0, "grad_norm": 1.7360501926549048, "language_loss": 0.75283015, "learning_rate": 3.5472786683292083e-06, "loss": 0.774391, "num_input_tokens_seen": 86614805, "step": 4023, "time_per_iteration": 2.7339353561401367 }, { "auxiliary_loss_clip": 0.01121416, "auxiliary_loss_mlp": 0.01047921, "balance_loss_clip": 1.04916334, "balance_loss_mlp": 1.0309217, "epoch": 0.2419359687359086, "flos": 21397517153280.0, "grad_norm": 2.4319797200103466, "language_loss": 0.82542646, "learning_rate": 3.5470318658630766e-06, "loss": 0.84711981, "num_input_tokens_seen": 86633700, "step": 4024, "time_per_iteration": 2.6887242794036865 }, { "auxiliary_loss_clip": 0.01133297, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.05029452, "balance_loss_mlp": 1.03038907, "epoch": 0.24199609198857658, "flos": 18369385914240.0, "grad_norm": 1.7776330743080708, "language_loss": 0.85974258, "learning_rate": 3.5467850047333424e-06, "loss": 0.88155425, "num_input_tokens_seen": 86650905, "step": 4025, "time_per_iteration": 2.7049782276153564 }, { "auxiliary_loss_clip": 0.01092706, "auxiliary_loss_mlp": 0.01064486, "balance_loss_clip": 1.04161918, "balance_loss_mlp": 1.04456651, "epoch": 0.24205621524124454, "flos": 19463871277440.0, "grad_norm": 1.8800874250001207, "language_loss": 0.71681315, "learning_rate": 3.546538084949365e-06, "loss": 0.73838508, "num_input_tokens_seen": 86669185, "step": 4026, "time_per_iteration": 2.7773284912109375 }, { "auxiliary_loss_clip": 0.01135992, "auxiliary_loss_mlp": 0.01046992, "balance_loss_clip": 1.05109096, "balance_loss_mlp": 1.03088713, "epoch": 0.2421163384939125, "flos": 14976330451200.0, "grad_norm": 1.967847260356932, "language_loss": 0.64436764, "learning_rate": 3.546291106520509e-06, "loss": 0.66619748, "num_input_tokens_seen": 86686805, "step": 4027, "time_per_iteration": 2.6143524646759033 }, { "auxiliary_loss_clip": 0.01136637, "auxiliary_loss_mlp": 0.00775283, "balance_loss_clip": 1.05106425, "balance_loss_mlp": 1.00103092, "epoch": 0.2421764617465805, "flos": 18662057930880.0, "grad_norm": 3.6118562291520813, "language_loss": 0.70909715, "learning_rate": 3.5460440694561388e-06, "loss": 0.72821641, "num_input_tokens_seen": 86705520, "step": 4028, "time_per_iteration": 2.656334400177002 }, { "auxiliary_loss_clip": 0.01053475, "auxiliary_loss_mlp": 0.01050053, "balance_loss_clip": 1.02715707, "balance_loss_mlp": 1.04756165, "epoch": 0.24223658499924847, "flos": 64347327164160.0, "grad_norm": 0.865443083354021, "language_loss": 0.55302447, "learning_rate": 3.545796973765623e-06, "loss": 0.57405978, "num_input_tokens_seen": 86767320, "step": 4029, "time_per_iteration": 3.1736607551574707 }, { "auxiliary_loss_clip": 0.0113268, "auxiliary_loss_mlp": 0.01051074, "balance_loss_clip": 1.04679179, "balance_loss_mlp": 1.03252554, "epoch": 0.24229670825191643, "flos": 25775243124480.0, "grad_norm": 1.6290009052774777, "language_loss": 0.74065894, "learning_rate": 3.54554981945833e-06, "loss": 0.76249647, "num_input_tokens_seen": 86788110, "step": 4030, "time_per_iteration": 2.644153118133545 }, { "auxiliary_loss_clip": 0.01146282, "auxiliary_loss_mlp": 0.01053008, "balance_loss_clip": 1.04945433, "balance_loss_mlp": 1.03495932, "epoch": 0.2423568315045844, "flos": 20667094087680.0, "grad_norm": 2.044571760348203, "language_loss": 0.76492965, "learning_rate": 3.5453026065436343e-06, "loss": 0.78692257, "num_input_tokens_seen": 86807640, "step": 4031, "time_per_iteration": 2.608718156814575 }, { "auxiliary_loss_clip": 0.01130345, "auxiliary_loss_mlp": 0.00776083, "balance_loss_clip": 1.04857934, "balance_loss_mlp": 1.00130129, "epoch": 0.24241695475725236, "flos": 22416805393920.0, "grad_norm": 2.367928778009572, "language_loss": 0.65578043, "learning_rate": 3.5450553350309083e-06, "loss": 0.67484468, "num_input_tokens_seen": 86826795, "step": 4032, "time_per_iteration": 2.713796377182007 }, { "auxiliary_loss_clip": 0.01128183, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.04551542, "balance_loss_mlp": 1.02591443, "epoch": 0.24247707800992033, "flos": 17128995505920.0, "grad_norm": 2.055558599382263, "language_loss": 0.81589901, "learning_rate": 3.5448080049295286e-06, "loss": 0.83761466, "num_input_tokens_seen": 86843175, "step": 4033, "time_per_iteration": 2.6381332874298096 }, { "auxiliary_loss_clip": 0.01101134, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.04264998, "balance_loss_mlp": 1.02450657, "epoch": 0.2425372012625883, "flos": 31613743399680.0, "grad_norm": 2.655330103252085, "language_loss": 0.68830204, "learning_rate": 3.5445606162488754e-06, "loss": 0.70973849, "num_input_tokens_seen": 86863185, "step": 4034, "time_per_iteration": 2.8269567489624023 }, { "auxiliary_loss_clip": 0.01129717, "auxiliary_loss_mlp": 0.01036472, "balance_loss_clip": 1.05142426, "balance_loss_mlp": 1.01839972, "epoch": 0.24259732451525629, "flos": 16326032924160.0, "grad_norm": 2.305872962411053, "language_loss": 0.96432853, "learning_rate": 3.5443131689983283e-06, "loss": 0.98599035, "num_input_tokens_seen": 86880040, "step": 4035, "time_per_iteration": 2.687131643295288 }, { "auxiliary_loss_clip": 0.01116249, "auxiliary_loss_mlp": 0.01051012, "balance_loss_clip": 1.0467937, "balance_loss_mlp": 1.03419125, "epoch": 0.24265744776792425, "flos": 22856639431680.0, "grad_norm": 1.5931877581057647, "language_loss": 0.7820307, "learning_rate": 3.5440656631872715e-06, "loss": 0.80370331, "num_input_tokens_seen": 86900610, "step": 4036, "time_per_iteration": 2.7576112747192383 }, { "auxiliary_loss_clip": 0.01137826, "auxiliary_loss_mlp": 0.01049747, "balance_loss_clip": 1.05010104, "balance_loss_mlp": 1.03141224, "epoch": 0.24271757102059222, "flos": 21871573873920.0, "grad_norm": 1.6332934168141529, "language_loss": 0.74266672, "learning_rate": 3.5438180988250898e-06, "loss": 0.76454246, "num_input_tokens_seen": 86919385, "step": 4037, "time_per_iteration": 2.7860629558563232 }, { "auxiliary_loss_clip": 0.01100993, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.04173183, "balance_loss_mlp": 1.02453303, "epoch": 0.24277769427326018, "flos": 19208582340480.0, "grad_norm": 8.14050816007968, "language_loss": 0.76632005, "learning_rate": 3.543570475921171e-06, "loss": 0.78775871, "num_input_tokens_seen": 86938885, "step": 4038, "time_per_iteration": 2.691695213317871 }, { "auxiliary_loss_clip": 0.01129874, "auxiliary_loss_mlp": 0.01043604, "balance_loss_clip": 1.04768467, "balance_loss_mlp": 1.0249598, "epoch": 0.24283781752592815, "flos": 19499889640320.0, "grad_norm": 3.2334161052349817, "language_loss": 0.71992457, "learning_rate": 3.543322794484905e-06, "loss": 0.7416594, "num_input_tokens_seen": 86957705, "step": 4039, "time_per_iteration": 4.128135442733765 }, { "auxiliary_loss_clip": 0.0112766, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.04597354, "balance_loss_mlp": 1.02921474, "epoch": 0.2428979407785961, "flos": 19902196944000.0, "grad_norm": 1.6158763194283545, "language_loss": 0.78655136, "learning_rate": 3.5430750545256843e-06, "loss": 0.80830908, "num_input_tokens_seen": 86975845, "step": 4040, "time_per_iteration": 4.174723863601685 }, { "auxiliary_loss_clip": 0.01090567, "auxiliary_loss_mlp": 0.01038965, "balance_loss_clip": 1.04526615, "balance_loss_mlp": 1.02268124, "epoch": 0.2429580640312641, "flos": 24715878284160.0, "grad_norm": 2.432557236688664, "language_loss": 0.80599713, "learning_rate": 3.5428272560529027e-06, "loss": 0.8272925, "num_input_tokens_seen": 86994800, "step": 4041, "time_per_iteration": 2.7933273315429688 }, { "auxiliary_loss_clip": 0.01108653, "auxiliary_loss_mlp": 0.01044101, "balance_loss_clip": 1.04587245, "balance_loss_mlp": 1.02733982, "epoch": 0.24301818728393207, "flos": 25630343660160.0, "grad_norm": 1.9967913274059828, "language_loss": 0.76708287, "learning_rate": 3.542579399075957e-06, "loss": 0.78861034, "num_input_tokens_seen": 87016845, "step": 4042, "time_per_iteration": 4.336673021316528 }, { "auxiliary_loss_clip": 0.01056541, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.04354727, "balance_loss_mlp": 1.01928389, "epoch": 0.24307831053660003, "flos": 26141388410880.0, "grad_norm": 1.8431659047813937, "language_loss": 0.81232125, "learning_rate": 3.542331483604246e-06, "loss": 0.83324039, "num_input_tokens_seen": 87036270, "step": 4043, "time_per_iteration": 2.9156856536865234 }, { "auxiliary_loss_clip": 0.01126576, "auxiliary_loss_mlp": 0.01038857, "balance_loss_clip": 1.04610896, "balance_loss_mlp": 1.02012897, "epoch": 0.243138433789268, "flos": 14972415868800.0, "grad_norm": 2.052349433785912, "language_loss": 0.73095596, "learning_rate": 3.5420835096471706e-06, "loss": 0.75261033, "num_input_tokens_seen": 87049920, "step": 4044, "time_per_iteration": 2.6324286460876465 }, { "auxiliary_loss_clip": 0.0113453, "auxiliary_loss_mlp": 0.01042417, "balance_loss_clip": 1.04967666, "balance_loss_mlp": 1.02445269, "epoch": 0.24319855704193596, "flos": 25191694771200.0, "grad_norm": 1.8848950918191658, "language_loss": 0.83676481, "learning_rate": 3.5418354772141337e-06, "loss": 0.85853434, "num_input_tokens_seen": 87068230, "step": 4045, "time_per_iteration": 2.68994402885437 }, { "auxiliary_loss_clip": 0.010753, "auxiliary_loss_mlp": 0.01047988, "balance_loss_clip": 1.04608011, "balance_loss_mlp": 1.03117943, "epoch": 0.24325868029460393, "flos": 22127221946880.0, "grad_norm": 1.9701839557075844, "language_loss": 0.86895847, "learning_rate": 3.541587386314541e-06, "loss": 0.89019132, "num_input_tokens_seen": 87086435, "step": 4046, "time_per_iteration": 2.908737897872925 }, { "auxiliary_loss_clip": 0.01120714, "auxiliary_loss_mlp": 0.01038682, "balance_loss_clip": 1.04705977, "balance_loss_mlp": 1.02070522, "epoch": 0.2433188035472719, "flos": 23582106420480.0, "grad_norm": 1.8855160425980928, "language_loss": 0.72759771, "learning_rate": 3.5413392369578e-06, "loss": 0.74919164, "num_input_tokens_seen": 87105340, "step": 4047, "time_per_iteration": 4.310218095779419 }, { "auxiliary_loss_clip": 0.01124014, "auxiliary_loss_mlp": 0.01045256, "balance_loss_clip": 1.04447186, "balance_loss_mlp": 1.02637279, "epoch": 0.2433789267999399, "flos": 24462815990400.0, "grad_norm": 2.592486480291502, "language_loss": 0.73029542, "learning_rate": 3.5410910291533213e-06, "loss": 0.75198811, "num_input_tokens_seen": 87125780, "step": 4048, "time_per_iteration": 2.699544668197632 }, { "auxiliary_loss_clip": 0.01112707, "auxiliary_loss_mlp": 0.01045312, "balance_loss_clip": 1.04923105, "balance_loss_mlp": 1.02869391, "epoch": 0.24343905005260785, "flos": 16727909264640.0, "grad_norm": 1.921127999919884, "language_loss": 0.73616529, "learning_rate": 3.5408427629105155e-06, "loss": 0.7577455, "num_input_tokens_seen": 87144470, "step": 4049, "time_per_iteration": 2.6988370418548584 }, { "auxiliary_loss_clip": 0.01093349, "auxiliary_loss_mlp": 0.01041657, "balance_loss_clip": 1.04289758, "balance_loss_mlp": 1.02583802, "epoch": 0.24349917330527582, "flos": 20043756443520.0, "grad_norm": 2.073976648883723, "language_loss": 0.7377705, "learning_rate": 3.5405944382387985e-06, "loss": 0.75912058, "num_input_tokens_seen": 87162830, "step": 4050, "time_per_iteration": 2.718212604522705 }, { "auxiliary_loss_clip": 0.01116995, "auxiliary_loss_mlp": 0.01043968, "balance_loss_clip": 1.04518783, "balance_loss_mlp": 1.02800608, "epoch": 0.24355929655794378, "flos": 17420554200960.0, "grad_norm": 2.361179977901575, "language_loss": 0.75518602, "learning_rate": 3.5403460551475854e-06, "loss": 0.77679563, "num_input_tokens_seen": 87180905, "step": 4051, "time_per_iteration": 2.6522655487060547 }, { "auxiliary_loss_clip": 0.01092567, "auxiliary_loss_mlp": 0.01042511, "balance_loss_clip": 1.04197812, "balance_loss_mlp": 1.02507067, "epoch": 0.24361941981061175, "flos": 25410929431680.0, "grad_norm": 2.2644912923037985, "language_loss": 0.70717591, "learning_rate": 3.540097613646296e-06, "loss": 0.72852671, "num_input_tokens_seen": 87202290, "step": 4052, "time_per_iteration": 2.794059991836548 }, { "auxiliary_loss_clip": 0.0111622, "auxiliary_loss_mlp": 0.01045494, "balance_loss_clip": 1.04823005, "balance_loss_mlp": 1.02833986, "epoch": 0.2436795430632797, "flos": 22820800636800.0, "grad_norm": 1.7022998331113812, "language_loss": 0.80989587, "learning_rate": 3.539849113744351e-06, "loss": 0.83151299, "num_input_tokens_seen": 87221650, "step": 4053, "time_per_iteration": 2.682805299758911 }, { "auxiliary_loss_clip": 0.01148244, "auxiliary_loss_mlp": 0.01038109, "balance_loss_clip": 1.05124915, "balance_loss_mlp": 1.0210743, "epoch": 0.2437396663159477, "flos": 15157786982400.0, "grad_norm": 1.5338885161808513, "language_loss": 0.77628779, "learning_rate": 3.539600555451172e-06, "loss": 0.79815125, "num_input_tokens_seen": 87238515, "step": 4054, "time_per_iteration": 2.635181427001953 }, { "auxiliary_loss_clip": 0.01095192, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.04067969, "balance_loss_mlp": 1.03783989, "epoch": 0.24379978956861567, "flos": 22091131756800.0, "grad_norm": 1.8808929031646056, "language_loss": 0.84398115, "learning_rate": 3.5393519387761866e-06, "loss": 0.86548549, "num_input_tokens_seen": 87256290, "step": 4055, "time_per_iteration": 2.757601261138916 }, { "auxiliary_loss_clip": 0.01110063, "auxiliary_loss_mlp": 0.01045315, "balance_loss_clip": 1.04298997, "balance_loss_mlp": 1.02767169, "epoch": 0.24385991282128364, "flos": 31467766527360.0, "grad_norm": 2.5636936013515776, "language_loss": 0.55038011, "learning_rate": 3.5391032637288217e-06, "loss": 0.57193393, "num_input_tokens_seen": 87277085, "step": 4056, "time_per_iteration": 2.7788894176483154 }, { "auxiliary_loss_clip": 0.0113756, "auxiliary_loss_mlp": 0.01046233, "balance_loss_clip": 1.04897046, "balance_loss_mlp": 1.02876842, "epoch": 0.2439200360739516, "flos": 23838795987840.0, "grad_norm": 2.64902132986976, "language_loss": 0.80583262, "learning_rate": 3.538854530318506e-06, "loss": 0.82767057, "num_input_tokens_seen": 87293020, "step": 4057, "time_per_iteration": 2.78110671043396 }, { "auxiliary_loss_clip": 0.01132987, "auxiliary_loss_mlp": 0.01048497, "balance_loss_clip": 1.04877245, "balance_loss_mlp": 1.03145027, "epoch": 0.24398015932661957, "flos": 19169978198400.0, "grad_norm": 1.8133503864036424, "language_loss": 0.79202968, "learning_rate": 3.538605738554673e-06, "loss": 0.81384456, "num_input_tokens_seen": 87311445, "step": 4058, "time_per_iteration": 2.6609115600585938 }, { "auxiliary_loss_clip": 0.01147749, "auxiliary_loss_mlp": 0.01045059, "balance_loss_clip": 1.04827118, "balance_loss_mlp": 1.02920449, "epoch": 0.24404028257928753, "flos": 25262474520960.0, "grad_norm": 3.3482411666646086, "language_loss": 0.85503888, "learning_rate": 3.538356888446756e-06, "loss": 0.87696695, "num_input_tokens_seen": 87332055, "step": 4059, "time_per_iteration": 2.724241256713867 }, { "auxiliary_loss_clip": 0.01126127, "auxiliary_loss_mlp": 0.01038967, "balance_loss_clip": 1.04837418, "balance_loss_mlp": 1.02296889, "epoch": 0.2441004058319555, "flos": 26467600752000.0, "grad_norm": 2.2060888459440617, "language_loss": 0.7483452, "learning_rate": 3.5381079800041913e-06, "loss": 0.76999605, "num_input_tokens_seen": 87351295, "step": 4060, "time_per_iteration": 2.6769304275512695 }, { "auxiliary_loss_clip": 0.01111679, "auxiliary_loss_mlp": 0.01051445, "balance_loss_clip": 1.04629493, "balance_loss_mlp": 1.03247917, "epoch": 0.2441605290846235, "flos": 26760524163840.0, "grad_norm": 2.624850134940939, "language_loss": 0.73482168, "learning_rate": 3.5378590132364182e-06, "loss": 0.75645292, "num_input_tokens_seen": 87370650, "step": 4061, "time_per_iteration": 2.7570559978485107 }, { "auxiliary_loss_clip": 0.01144554, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.05180097, "balance_loss_mlp": 1.02394772, "epoch": 0.24422065233729146, "flos": 21105850717440.0, "grad_norm": 4.11905418985837, "language_loss": 0.76135921, "learning_rate": 3.5376099881528768e-06, "loss": 0.78320187, "num_input_tokens_seen": 87389020, "step": 4062, "time_per_iteration": 2.6387689113616943 }, { "auxiliary_loss_clip": 0.01104974, "auxiliary_loss_mlp": 0.01041222, "balance_loss_clip": 1.04618907, "balance_loss_mlp": 1.02458024, "epoch": 0.24428077558995942, "flos": 25263156879360.0, "grad_norm": 2.5628995075758954, "language_loss": 0.85376853, "learning_rate": 3.537360904763011e-06, "loss": 0.87523055, "num_input_tokens_seen": 87409695, "step": 4063, "time_per_iteration": 2.7785301208496094 }, { "auxiliary_loss_clip": 0.01119987, "auxiliary_loss_mlp": 0.01047158, "balance_loss_clip": 1.04776239, "balance_loss_mlp": 1.02789354, "epoch": 0.24434089884262739, "flos": 20485278420480.0, "grad_norm": 2.760332484942286, "language_loss": 0.6845879, "learning_rate": 3.5371117630762656e-06, "loss": 0.70625937, "num_input_tokens_seen": 87428250, "step": 4064, "time_per_iteration": 2.6691763401031494 }, { "auxiliary_loss_clip": 0.01138225, "auxiliary_loss_mlp": 0.01046639, "balance_loss_clip": 1.04773867, "balance_loss_mlp": 1.02892423, "epoch": 0.24440102209529535, "flos": 23621895711360.0, "grad_norm": 1.603702751214229, "language_loss": 0.70247531, "learning_rate": 3.536862563102088e-06, "loss": 0.72432399, "num_input_tokens_seen": 87449380, "step": 4065, "time_per_iteration": 2.6677680015563965 }, { "auxiliary_loss_clip": 0.01150465, "auxiliary_loss_mlp": 0.0104697, "balance_loss_clip": 1.05127215, "balance_loss_mlp": 1.02803993, "epoch": 0.24446114534796332, "flos": 20554729367040.0, "grad_norm": 1.788543447431289, "language_loss": 0.84282506, "learning_rate": 3.5366133048499282e-06, "loss": 0.86479944, "num_input_tokens_seen": 87465365, "step": 4066, "time_per_iteration": 2.5993456840515137 }, { "auxiliary_loss_clip": 0.01067736, "auxiliary_loss_mlp": 0.01002523, "balance_loss_clip": 1.03198457, "balance_loss_mlp": 1.00028193, "epoch": 0.24452126860063128, "flos": 60389575009920.0, "grad_norm": 0.7359455307187547, "language_loss": 0.52283657, "learning_rate": 3.5363639883292374e-06, "loss": 0.54353911, "num_input_tokens_seen": 87522525, "step": 4067, "time_per_iteration": 3.056666374206543 }, { "auxiliary_loss_clip": 0.01123042, "auxiliary_loss_mlp": 0.01045731, "balance_loss_clip": 1.04955244, "balance_loss_mlp": 1.0279212, "epoch": 0.24458139185329927, "flos": 15121660878720.0, "grad_norm": 2.6392300526537493, "language_loss": 0.7185899, "learning_rate": 3.5361146135494706e-06, "loss": 0.74027765, "num_input_tokens_seen": 87539170, "step": 4068, "time_per_iteration": 2.700847864151001 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01047493, "balance_loss_clip": 1.04378593, "balance_loss_mlp": 1.02920675, "epoch": 0.24464151510596724, "flos": 27998723842560.0, "grad_norm": 2.4202919064349744, "language_loss": 0.78083313, "learning_rate": 3.5358651805200835e-06, "loss": 0.80216813, "num_input_tokens_seen": 87558875, "step": 4069, "time_per_iteration": 2.9363162517547607 }, { "auxiliary_loss_clip": 0.01119666, "auxiliary_loss_mlp": 0.0105204, "balance_loss_clip": 1.05164659, "balance_loss_mlp": 1.03445613, "epoch": 0.2447016383586352, "flos": 19792884879360.0, "grad_norm": 4.167143793475273, "language_loss": 0.80607939, "learning_rate": 3.5356156892505347e-06, "loss": 0.82779646, "num_input_tokens_seen": 87576485, "step": 4070, "time_per_iteration": 2.658191204071045 }, { "auxiliary_loss_clip": 0.01127014, "auxiliary_loss_mlp": 0.01049283, "balance_loss_clip": 1.04832387, "balance_loss_mlp": 1.03218853, "epoch": 0.24476176161130317, "flos": 26067340523520.0, "grad_norm": 1.5316441932107319, "language_loss": 0.84351504, "learning_rate": 3.5353661397502854e-06, "loss": 0.86527801, "num_input_tokens_seen": 87598620, "step": 4071, "time_per_iteration": 2.7118849754333496 }, { "auxiliary_loss_clip": 0.01120333, "auxiliary_loss_mlp": 0.01057334, "balance_loss_clip": 1.04778695, "balance_loss_mlp": 1.03601933, "epoch": 0.24482188486397113, "flos": 18843550375680.0, "grad_norm": 1.8860726044388547, "language_loss": 0.80115497, "learning_rate": 3.535116532028798e-06, "loss": 0.82293165, "num_input_tokens_seen": 87616595, "step": 4072, "time_per_iteration": 2.6662774085998535 }, { "auxiliary_loss_clip": 0.01134806, "auxiliary_loss_mlp": 0.0104215, "balance_loss_clip": 1.05156791, "balance_loss_mlp": 1.02614021, "epoch": 0.2448820081166391, "flos": 21251791676160.0, "grad_norm": 3.990887653020168, "language_loss": 0.70466423, "learning_rate": 3.5348668660955382e-06, "loss": 0.72643375, "num_input_tokens_seen": 87635755, "step": 4073, "time_per_iteration": 2.7366209030151367 }, { "auxiliary_loss_clip": 0.01110472, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.04666865, "balance_loss_mlp": 1.03090906, "epoch": 0.2449421313693071, "flos": 23950586090880.0, "grad_norm": 2.943884117668681, "language_loss": 0.67292917, "learning_rate": 3.5346171419599728e-06, "loss": 0.69450659, "num_input_tokens_seen": 87652885, "step": 4074, "time_per_iteration": 2.7158730030059814 }, { "auxiliary_loss_clip": 0.01062567, "auxiliary_loss_mlp": 0.01002121, "balance_loss_clip": 1.02741885, "balance_loss_mlp": 0.99986744, "epoch": 0.24500225462197506, "flos": 60687669980160.0, "grad_norm": 0.8927046346070237, "language_loss": 0.68608266, "learning_rate": 3.5343673596315718e-06, "loss": 0.70672953, "num_input_tokens_seen": 87713220, "step": 4075, "time_per_iteration": 3.2283740043640137 }, { "auxiliary_loss_clip": 0.01146172, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.05287361, "balance_loss_mlp": 1.02612722, "epoch": 0.24506237787464302, "flos": 26284204886400.0, "grad_norm": 2.3370219869490563, "language_loss": 0.79263043, "learning_rate": 3.5341175191198063e-06, "loss": 0.81451714, "num_input_tokens_seen": 87732680, "step": 4076, "time_per_iteration": 2.6744346618652344 }, { "auxiliary_loss_clip": 0.01128421, "auxiliary_loss_mlp": 0.00775989, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.001266, "epoch": 0.245122501127311, "flos": 20552287242240.0, "grad_norm": 1.816414447330212, "language_loss": 0.81986046, "learning_rate": 3.533867620434151e-06, "loss": 0.83890456, "num_input_tokens_seen": 87751880, "step": 4077, "time_per_iteration": 2.729391098022461 }, { "auxiliary_loss_clip": 0.01148302, "auxiliary_loss_mlp": 0.01047154, "balance_loss_clip": 1.05185413, "balance_loss_mlp": 1.0288794, "epoch": 0.24518262437997895, "flos": 29132603447040.0, "grad_norm": 2.0328430965985045, "language_loss": 0.62790757, "learning_rate": 3.533617663584082e-06, "loss": 0.64986217, "num_input_tokens_seen": 87771795, "step": 4078, "time_per_iteration": 2.694767713546753 }, { "auxiliary_loss_clip": 0.01114498, "auxiliary_loss_mlp": 0.01039203, "balance_loss_clip": 1.04953861, "balance_loss_mlp": 1.02270436, "epoch": 0.24524274763264692, "flos": 23476924419840.0, "grad_norm": 1.5687748074794818, "language_loss": 0.75811553, "learning_rate": 3.5333676485790765e-06, "loss": 0.7796526, "num_input_tokens_seen": 87793640, "step": 4079, "time_per_iteration": 4.288895130157471 }, { "auxiliary_loss_clip": 0.01142871, "auxiliary_loss_mlp": 0.01047138, "balance_loss_clip": 1.04899406, "balance_loss_mlp": 1.02955461, "epoch": 0.24530287088531488, "flos": 17201175886080.0, "grad_norm": 1.8811380892336844, "language_loss": 0.74537313, "learning_rate": 3.5331175754286173e-06, "loss": 0.76727325, "num_input_tokens_seen": 87812390, "step": 4080, "time_per_iteration": 2.683969736099243 }, { "auxiliary_loss_clip": 0.01115604, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.04717278, "balance_loss_mlp": 1.02558291, "epoch": 0.24536299413798288, "flos": 14867449349760.0, "grad_norm": 2.2859558621761997, "language_loss": 0.83389306, "learning_rate": 3.532867444142186e-06, "loss": 0.85546505, "num_input_tokens_seen": 87830640, "step": 4081, "time_per_iteration": 2.772573947906494 }, { "auxiliary_loss_clip": 0.01114607, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.04734826, "balance_loss_mlp": 1.02473605, "epoch": 0.24542311739065084, "flos": 35262051886080.0, "grad_norm": 1.8658741711896472, "language_loss": 0.73223484, "learning_rate": 3.532617254729267e-06, "loss": 0.7537877, "num_input_tokens_seen": 87850450, "step": 4082, "time_per_iteration": 4.3304970264434814 }, { "auxiliary_loss_clip": 0.01104397, "auxiliary_loss_mlp": 0.01047151, "balance_loss_clip": 1.04542649, "balance_loss_mlp": 1.03163004, "epoch": 0.2454832406433188, "flos": 21503130117120.0, "grad_norm": 1.7143564189307843, "language_loss": 0.72032338, "learning_rate": 3.5323670071993485e-06, "loss": 0.74183893, "num_input_tokens_seen": 87868810, "step": 4083, "time_per_iteration": 2.7463390827178955 }, { "auxiliary_loss_clip": 0.01115479, "auxiliary_loss_mlp": 0.01048832, "balance_loss_clip": 1.04441845, "balance_loss_mlp": 1.02979386, "epoch": 0.24554336389598677, "flos": 14756664827520.0, "grad_norm": 2.556114612666859, "language_loss": 0.74363655, "learning_rate": 3.532116701561919e-06, "loss": 0.76527965, "num_input_tokens_seen": 87885685, "step": 4084, "time_per_iteration": 2.6828086376190186 }, { "auxiliary_loss_clip": 0.01126215, "auxiliary_loss_mlp": 0.01040078, "balance_loss_clip": 1.04541206, "balance_loss_mlp": 1.02269721, "epoch": 0.24560348714865474, "flos": 14976402278400.0, "grad_norm": 2.030442784512354, "language_loss": 0.85540497, "learning_rate": 3.531866337826471e-06, "loss": 0.87706792, "num_input_tokens_seen": 87903715, "step": 4085, "time_per_iteration": 4.236302852630615 }, { "auxiliary_loss_clip": 0.01110493, "auxiliary_loss_mlp": 0.01046501, "balance_loss_clip": 1.04634261, "balance_loss_mlp": 1.02932286, "epoch": 0.2456636104013227, "flos": 22675326554880.0, "grad_norm": 2.028282258660301, "language_loss": 0.78985649, "learning_rate": 3.5316159160024982e-06, "loss": 0.8114264, "num_input_tokens_seen": 87923375, "step": 4086, "time_per_iteration": 2.6638717651367188 }, { "auxiliary_loss_clip": 0.01087456, "auxiliary_loss_mlp": 0.0104508, "balance_loss_clip": 1.04792905, "balance_loss_mlp": 1.02847362, "epoch": 0.2457237336539907, "flos": 27417869009280.0, "grad_norm": 5.7080500305845865, "language_loss": 0.75053227, "learning_rate": 3.531365436099496e-06, "loss": 0.77185762, "num_input_tokens_seen": 87943115, "step": 4087, "time_per_iteration": 2.8027901649475098 }, { "auxiliary_loss_clip": 0.01090549, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.04807436, "balance_loss_mlp": 1.02680135, "epoch": 0.24578385690665866, "flos": 20412379768320.0, "grad_norm": 2.066557704160291, "language_loss": 0.79291761, "learning_rate": 3.5311148981269635e-06, "loss": 0.81427807, "num_input_tokens_seen": 87959505, "step": 4088, "time_per_iteration": 2.78812575340271 }, { "auxiliary_loss_clip": 0.0110062, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.04435658, "balance_loss_mlp": 1.01949525, "epoch": 0.24584398015932662, "flos": 23915393740800.0, "grad_norm": 1.4918864539426413, "language_loss": 0.77053773, "learning_rate": 3.5308643020944e-06, "loss": 0.79189926, "num_input_tokens_seen": 87979725, "step": 4089, "time_per_iteration": 2.75034761428833 }, { "auxiliary_loss_clip": 0.01125156, "auxiliary_loss_mlp": 0.0104201, "balance_loss_clip": 1.04609382, "balance_loss_mlp": 1.02470064, "epoch": 0.2459041034119946, "flos": 41496359103360.0, "grad_norm": 2.3383647352821737, "language_loss": 0.81814516, "learning_rate": 3.530613648011309e-06, "loss": 0.83981681, "num_input_tokens_seen": 87998270, "step": 4090, "time_per_iteration": 2.891878604888916 }, { "auxiliary_loss_clip": 0.01121872, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.04687834, "balance_loss_mlp": 1.03163147, "epoch": 0.24596422666466256, "flos": 19936814676480.0, "grad_norm": 1.8221600402702927, "language_loss": 0.73833978, "learning_rate": 3.5303629358871946e-06, "loss": 0.76005995, "num_input_tokens_seen": 88016760, "step": 4091, "time_per_iteration": 2.6410961151123047 }, { "auxiliary_loss_clip": 0.01114038, "auxiliary_loss_mlp": 0.01045509, "balance_loss_clip": 1.05517268, "balance_loss_mlp": 1.0279969, "epoch": 0.24602434991733052, "flos": 21544391865600.0, "grad_norm": 1.8983812190731213, "language_loss": 0.7706998, "learning_rate": 3.5301121657315653e-06, "loss": 0.79229522, "num_input_tokens_seen": 88036465, "step": 4092, "time_per_iteration": 2.7038323879241943 }, { "auxiliary_loss_clip": 0.01115501, "auxiliary_loss_mlp": 0.01040797, "balance_loss_clip": 1.04371238, "balance_loss_mlp": 1.02255797, "epoch": 0.24608447316999849, "flos": 23185078416000.0, "grad_norm": 3.1365051823944627, "language_loss": 0.81200075, "learning_rate": 3.5298613375539287e-06, "loss": 0.83356375, "num_input_tokens_seen": 88053270, "step": 4093, "time_per_iteration": 2.680634021759033 }, { "auxiliary_loss_clip": 0.01135527, "auxiliary_loss_mlp": 0.01043826, "balance_loss_clip": 1.04879606, "balance_loss_mlp": 1.02613521, "epoch": 0.24614459642266648, "flos": 19641951930240.0, "grad_norm": 1.9167765067224862, "language_loss": 0.86932534, "learning_rate": 3.529610451363797e-06, "loss": 0.89111882, "num_input_tokens_seen": 88072305, "step": 4094, "time_per_iteration": 2.6558003425598145 }, { "auxiliary_loss_clip": 0.01007267, "auxiliary_loss_mlp": 0.01019789, "balance_loss_clip": 1.03124738, "balance_loss_mlp": 1.01697576, "epoch": 0.24620471967533444, "flos": 61739816186880.0, "grad_norm": 0.7554163750993251, "language_loss": 0.57503664, "learning_rate": 3.5293595071706833e-06, "loss": 0.59530711, "num_input_tokens_seen": 88137995, "step": 4095, "time_per_iteration": 3.3576478958129883 }, { "auxiliary_loss_clip": 0.01051219, "auxiliary_loss_mlp": 0.0102022, "balance_loss_clip": 1.03409493, "balance_loss_mlp": 1.01790738, "epoch": 0.2462648429280024, "flos": 69154436315520.0, "grad_norm": 0.655284075812517, "language_loss": 0.56260574, "learning_rate": 3.5291085049841042e-06, "loss": 0.58332014, "num_input_tokens_seen": 88208490, "step": 4096, "time_per_iteration": 3.376516580581665 }, { "auxiliary_loss_clip": 0.0112712, "auxiliary_loss_mlp": 0.01040362, "balance_loss_clip": 1.05330801, "balance_loss_mlp": 1.0236733, "epoch": 0.24632496618067037, "flos": 29459605887360.0, "grad_norm": 1.7306008966026363, "language_loss": 0.77629399, "learning_rate": 3.5288574448135773e-06, "loss": 0.79796875, "num_input_tokens_seen": 88228050, "step": 4097, "time_per_iteration": 2.6973912715911865 }, { "auxiliary_loss_clip": 0.01114293, "auxiliary_loss_mlp": 0.01047339, "balance_loss_clip": 1.04898906, "balance_loss_mlp": 1.02842093, "epoch": 0.24638508943333834, "flos": 24316444068480.0, "grad_norm": 2.4079595240953613, "language_loss": 0.75890571, "learning_rate": 3.5286063266686235e-06, "loss": 0.78052205, "num_input_tokens_seen": 88248090, "step": 4098, "time_per_iteration": 2.739947557449341 }, { "auxiliary_loss_clip": 0.0112794, "auxiliary_loss_mlp": 0.01046194, "balance_loss_clip": 1.05179596, "balance_loss_mlp": 1.03002954, "epoch": 0.2464452126860063, "flos": 26613254401920.0, "grad_norm": 2.5671853201902737, "language_loss": 0.68179071, "learning_rate": 3.528355150558764e-06, "loss": 0.7035321, "num_input_tokens_seen": 88267545, "step": 4099, "time_per_iteration": 2.7144618034362793 }, { "auxiliary_loss_clip": 0.01133513, "auxiliary_loss_mlp": 0.01045673, "balance_loss_clip": 1.05187321, "balance_loss_mlp": 1.02897191, "epoch": 0.24650533593867427, "flos": 31212405763200.0, "grad_norm": 2.0343787496625656, "language_loss": 0.65915, "learning_rate": 3.5281039164935237e-06, "loss": 0.68094188, "num_input_tokens_seen": 88289785, "step": 4100, "time_per_iteration": 2.724008560180664 }, { "auxiliary_loss_clip": 0.01054067, "auxiliary_loss_mlp": 0.01041004, "balance_loss_clip": 1.03763318, "balance_loss_mlp": 1.03830957, "epoch": 0.24656545919134226, "flos": 68494002900480.0, "grad_norm": 0.7229502883874133, "language_loss": 0.61514676, "learning_rate": 3.5278526244824304e-06, "loss": 0.63609749, "num_input_tokens_seen": 88357320, "step": 4101, "time_per_iteration": 3.3748011589050293 }, { "auxiliary_loss_clip": 0.01144305, "auxiliary_loss_mlp": 0.01041937, "balance_loss_clip": 1.05133915, "balance_loss_mlp": 1.02455676, "epoch": 0.24662558244401023, "flos": 20084192179200.0, "grad_norm": 2.2333045722985028, "language_loss": 0.73272061, "learning_rate": 3.527601274535012e-06, "loss": 0.754583, "num_input_tokens_seen": 88377040, "step": 4102, "time_per_iteration": 2.7457518577575684 }, { "auxiliary_loss_clip": 0.01124231, "auxiliary_loss_mlp": 0.01043636, "balance_loss_clip": 1.04909408, "balance_loss_mlp": 1.02699423, "epoch": 0.2466857056966782, "flos": 30701361012480.0, "grad_norm": 2.9311552217427774, "language_loss": 0.76528364, "learning_rate": 3.5273498666608004e-06, "loss": 0.78696227, "num_input_tokens_seen": 88395085, "step": 4103, "time_per_iteration": 2.732285499572754 }, { "auxiliary_loss_clip": 0.01128751, "auxiliary_loss_mlp": 0.01051695, "balance_loss_clip": 1.04730439, "balance_loss_mlp": 1.03313375, "epoch": 0.24674582894934616, "flos": 22528523669760.0, "grad_norm": 2.3173933836652902, "language_loss": 0.78658336, "learning_rate": 3.5270984008693288e-06, "loss": 0.80838788, "num_input_tokens_seen": 88413205, "step": 4104, "time_per_iteration": 2.7234179973602295 }, { "auxiliary_loss_clip": 0.01134641, "auxiliary_loss_mlp": 0.01045411, "balance_loss_clip": 1.05110276, "balance_loss_mlp": 1.02601588, "epoch": 0.24680595220201412, "flos": 20704297599360.0, "grad_norm": 1.883953093480743, "language_loss": 0.8375451, "learning_rate": 3.526846877170133e-06, "loss": 0.85934561, "num_input_tokens_seen": 88431525, "step": 4105, "time_per_iteration": 2.7051403522491455 }, { "auxiliary_loss_clip": 0.01149885, "auxiliary_loss_mlp": 0.01051204, "balance_loss_clip": 1.05490828, "balance_loss_mlp": 1.03340602, "epoch": 0.2468660754546821, "flos": 21831174051840.0, "grad_norm": 1.9903096770852142, "language_loss": 0.76503521, "learning_rate": 3.52659529557275e-06, "loss": 0.78704607, "num_input_tokens_seen": 88451210, "step": 4106, "time_per_iteration": 2.6324243545532227 }, { "auxiliary_loss_clip": 0.01107346, "auxiliary_loss_mlp": 0.01058334, "balance_loss_clip": 1.0438261, "balance_loss_mlp": 1.03743649, "epoch": 0.24692619870735008, "flos": 15267709578240.0, "grad_norm": 2.3469304270549487, "language_loss": 0.72399199, "learning_rate": 3.5263436560867205e-06, "loss": 0.74564874, "num_input_tokens_seen": 88467790, "step": 4107, "time_per_iteration": 2.6767516136169434 }, { "auxiliary_loss_clip": 0.01149014, "auxiliary_loss_mlp": 0.01055902, "balance_loss_clip": 1.05365527, "balance_loss_mlp": 1.03840184, "epoch": 0.24698632196001805, "flos": 29680097523840.0, "grad_norm": 2.655550859638868, "language_loss": 0.65495557, "learning_rate": 3.526091958721587e-06, "loss": 0.67700469, "num_input_tokens_seen": 88490330, "step": 4108, "time_per_iteration": 2.666501760482788 }, { "auxiliary_loss_clip": 0.01095567, "auxiliary_loss_mlp": 0.01053352, "balance_loss_clip": 1.04577923, "balance_loss_mlp": 1.0351851, "epoch": 0.247046445212686, "flos": 39165469741440.0, "grad_norm": 1.631565192024798, "language_loss": 0.72685403, "learning_rate": 3.5258402034868936e-06, "loss": 0.74834323, "num_input_tokens_seen": 88512435, "step": 4109, "time_per_iteration": 2.8588712215423584 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01048877, "balance_loss_clip": 1.04754984, "balance_loss_mlp": 1.03132939, "epoch": 0.24710656846535398, "flos": 22998845376000.0, "grad_norm": 1.9000447272053396, "language_loss": 0.79328829, "learning_rate": 3.5255883903921866e-06, "loss": 0.81488264, "num_input_tokens_seen": 88529780, "step": 4110, "time_per_iteration": 2.7403078079223633 }, { "auxiliary_loss_clip": 0.01114435, "auxiliary_loss_mlp": 0.0104359, "balance_loss_clip": 1.04750848, "balance_loss_mlp": 1.02536333, "epoch": 0.24716669171802194, "flos": 26432803451520.0, "grad_norm": 1.9757162932013852, "language_loss": 0.80630267, "learning_rate": 3.5253365194470144e-06, "loss": 0.82788301, "num_input_tokens_seen": 88547200, "step": 4111, "time_per_iteration": 2.6893255710601807 }, { "auxiliary_loss_clip": 0.01143907, "auxiliary_loss_mlp": 0.0104799, "balance_loss_clip": 1.0493356, "balance_loss_mlp": 1.03203976, "epoch": 0.2472268149706899, "flos": 23329870139520.0, "grad_norm": 1.928179444788623, "language_loss": 0.75401616, "learning_rate": 3.5250845906609294e-06, "loss": 0.77593511, "num_input_tokens_seen": 88566415, "step": 4112, "time_per_iteration": 2.641103506088257 }, { "auxiliary_loss_clip": 0.01112249, "auxiliary_loss_mlp": 0.00775958, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 1.00114262, "epoch": 0.24728693822335787, "flos": 23768734510080.0, "grad_norm": 2.1227710866712908, "language_loss": 0.8244158, "learning_rate": 3.5248326040434835e-06, "loss": 0.84329784, "num_input_tokens_seen": 88585225, "step": 4113, "time_per_iteration": 2.831209182739258 }, { "auxiliary_loss_clip": 0.01143893, "auxiliary_loss_mlp": 0.01043423, "balance_loss_clip": 1.04927897, "balance_loss_mlp": 1.02574396, "epoch": 0.24734706147602586, "flos": 19317499355520.0, "grad_norm": 2.5263325514304813, "language_loss": 0.8704375, "learning_rate": 3.5245805596042322e-06, "loss": 0.89231074, "num_input_tokens_seen": 88603280, "step": 4114, "time_per_iteration": 2.7264626026153564 }, { "auxiliary_loss_clip": 0.01096969, "auxiliary_loss_mlp": 0.01047533, "balance_loss_clip": 1.04748011, "balance_loss_mlp": 1.03005731, "epoch": 0.24740718472869383, "flos": 28036932935040.0, "grad_norm": 1.6498261942323098, "language_loss": 0.75283766, "learning_rate": 3.524328457352734e-06, "loss": 0.77428269, "num_input_tokens_seen": 88624925, "step": 4115, "time_per_iteration": 2.755342483520508 }, { "auxiliary_loss_clip": 0.01018711, "auxiliary_loss_mlp": 0.01070163, "balance_loss_clip": 1.03186083, "balance_loss_mlp": 1.06756425, "epoch": 0.2474673079813618, "flos": 68107569408000.0, "grad_norm": 0.6879904854197085, "language_loss": 0.58123159, "learning_rate": 3.5240762972985475e-06, "loss": 0.60212028, "num_input_tokens_seen": 88691475, "step": 4116, "time_per_iteration": 3.4015462398529053 }, { "auxiliary_loss_clip": 0.01122111, "auxiliary_loss_mlp": 0.01038886, "balance_loss_clip": 1.04813063, "balance_loss_mlp": 1.02213693, "epoch": 0.24752743123402976, "flos": 29462119839360.0, "grad_norm": 19.427234883564427, "language_loss": 0.83599627, "learning_rate": 3.523824079451235e-06, "loss": 0.85760617, "num_input_tokens_seen": 88713425, "step": 4117, "time_per_iteration": 2.7881336212158203 }, { "auxiliary_loss_clip": 0.01041379, "auxiliary_loss_mlp": 0.00755386, "balance_loss_clip": 1.02616835, "balance_loss_mlp": 1.0023396, "epoch": 0.24758755448669773, "flos": 58350459824640.0, "grad_norm": 0.909523411860611, "language_loss": 0.63518536, "learning_rate": 3.5235718038203602e-06, "loss": 0.65315294, "num_input_tokens_seen": 88769995, "step": 4118, "time_per_iteration": 3.1125216484069824 }, { "auxiliary_loss_clip": 0.01126335, "auxiliary_loss_mlp": 0.01048787, "balance_loss_clip": 1.04487431, "balance_loss_mlp": 1.03127515, "epoch": 0.2476476777393657, "flos": 20484416494080.0, "grad_norm": 2.1708029437062546, "language_loss": 0.79272264, "learning_rate": 3.523319470415491e-06, "loss": 0.81447387, "num_input_tokens_seen": 88789970, "step": 4119, "time_per_iteration": 6.294121503829956 }, { "auxiliary_loss_clip": 0.01133521, "auxiliary_loss_mlp": 0.01044138, "balance_loss_clip": 1.05223441, "balance_loss_mlp": 1.02707899, "epoch": 0.24770780099203366, "flos": 20485853038080.0, "grad_norm": 1.7395275513138477, "language_loss": 0.74590164, "learning_rate": 3.5230670792461943e-06, "loss": 0.76767826, "num_input_tokens_seen": 88810000, "step": 4120, "time_per_iteration": 2.6947290897369385 }, { "auxiliary_loss_clip": 0.01135162, "auxiliary_loss_mlp": 0.01051636, "balance_loss_clip": 1.04963648, "balance_loss_mlp": 1.03435111, "epoch": 0.24776792424470165, "flos": 15153405523200.0, "grad_norm": 3.32651820696464, "language_loss": 0.88006538, "learning_rate": 3.522814630322041e-06, "loss": 0.90193337, "num_input_tokens_seen": 88827515, "step": 4121, "time_per_iteration": 4.181556224822998 }, { "auxiliary_loss_clip": 0.01147178, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.05039763, "balance_loss_mlp": 1.02431381, "epoch": 0.2478280474973696, "flos": 21725453347200.0, "grad_norm": 2.0457274986343204, "language_loss": 0.69676709, "learning_rate": 3.5225621236526045e-06, "loss": 0.71866482, "num_input_tokens_seen": 88845025, "step": 4122, "time_per_iteration": 2.7041239738464355 }, { "auxiliary_loss_clip": 0.01147132, "auxiliary_loss_mlp": 0.01045532, "balance_loss_clip": 1.05045271, "balance_loss_mlp": 1.02655339, "epoch": 0.24788817075003758, "flos": 20412200200320.0, "grad_norm": 2.4058135017179976, "language_loss": 0.8026911, "learning_rate": 3.5223095592474596e-06, "loss": 0.82461774, "num_input_tokens_seen": 88861740, "step": 4123, "time_per_iteration": 2.6154532432556152 }, { "auxiliary_loss_clip": 0.01085408, "auxiliary_loss_mlp": 0.0105298, "balance_loss_clip": 1.04720712, "balance_loss_mlp": 1.0354923, "epoch": 0.24794829400270554, "flos": 22594455083520.0, "grad_norm": 2.2195758993023578, "language_loss": 0.74967635, "learning_rate": 3.5220569371161846e-06, "loss": 0.77106017, "num_input_tokens_seen": 88879740, "step": 4124, "time_per_iteration": 2.787986993789673 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01044392, "balance_loss_clip": 1.04892588, "balance_loss_mlp": 1.02809608, "epoch": 0.2480084172553735, "flos": 39676047615360.0, "grad_norm": 1.4128536066198873, "language_loss": 0.73432529, "learning_rate": 3.521804257268357e-06, "loss": 0.75608873, "num_input_tokens_seen": 88904095, "step": 4125, "time_per_iteration": 4.472416162490845 }, { "auxiliary_loss_clip": 0.01109646, "auxiliary_loss_mlp": 0.00776697, "balance_loss_clip": 1.04420686, "balance_loss_mlp": 1.00122678, "epoch": 0.24806854050804147, "flos": 22053712763520.0, "grad_norm": 1.9607758383710057, "language_loss": 0.69630861, "learning_rate": 3.5215515197135595e-06, "loss": 0.71517205, "num_input_tokens_seen": 88920740, "step": 4126, "time_per_iteration": 2.7412056922912598 }, { "auxiliary_loss_clip": 0.01133758, "auxiliary_loss_mlp": 0.01051914, "balance_loss_clip": 1.047984, "balance_loss_mlp": 1.03331721, "epoch": 0.24812866376070947, "flos": 15486764670720.0, "grad_norm": 2.275786464609162, "language_loss": 0.81219494, "learning_rate": 3.5212987244613764e-06, "loss": 0.83405173, "num_input_tokens_seen": 88938510, "step": 4127, "time_per_iteration": 2.620143413543701 }, { "auxiliary_loss_clip": 0.01136685, "auxiliary_loss_mlp": 0.00775421, "balance_loss_clip": 1.04974318, "balance_loss_mlp": 1.00120401, "epoch": 0.24818878701337743, "flos": 14757419013120.0, "grad_norm": 6.503475382998669, "language_loss": 0.8435086, "learning_rate": 3.5210458715213927e-06, "loss": 0.86262965, "num_input_tokens_seen": 88955235, "step": 4128, "time_per_iteration": 2.6764745712280273 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01057179, "balance_loss_clip": 1.04831362, "balance_loss_mlp": 1.03814149, "epoch": 0.2482489102660454, "flos": 27089501852160.0, "grad_norm": 7.318299516736359, "language_loss": 0.6572547, "learning_rate": 3.5207929609031973e-06, "loss": 0.67900276, "num_input_tokens_seen": 88975210, "step": 4129, "time_per_iteration": 2.7178256511688232 }, { "auxiliary_loss_clip": 0.01098796, "auxiliary_loss_mlp": 0.01044421, "balance_loss_clip": 1.04595077, "balance_loss_mlp": 1.02570498, "epoch": 0.24830903351871336, "flos": 26467528924800.0, "grad_norm": 1.8507928533331595, "language_loss": 0.7496134, "learning_rate": 3.5205399926163806e-06, "loss": 0.77104557, "num_input_tokens_seen": 88996120, "step": 4130, "time_per_iteration": 2.82098126411438 }, { "auxiliary_loss_clip": 0.01078173, "auxiliary_loss_mlp": 0.01050295, "balance_loss_clip": 1.04238284, "balance_loss_mlp": 1.03163934, "epoch": 0.24836915677138133, "flos": 10228436870400.0, "grad_norm": 2.098795320061471, "language_loss": 0.7680133, "learning_rate": 3.520286966670535e-06, "loss": 0.78929794, "num_input_tokens_seen": 89008685, "step": 4131, "time_per_iteration": 2.7543740272521973 }, { "auxiliary_loss_clip": 0.0113176, "auxiliary_loss_mlp": 0.0104424, "balance_loss_clip": 1.04992545, "balance_loss_mlp": 1.02781272, "epoch": 0.2484292800240493, "flos": 30080429579520.0, "grad_norm": 2.181098565661814, "language_loss": 0.83579504, "learning_rate": 3.520033883075255e-06, "loss": 0.85755503, "num_input_tokens_seen": 89031160, "step": 4132, "time_per_iteration": 2.681339979171753 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01043901, "balance_loss_clip": 1.04574823, "balance_loss_mlp": 1.02506626, "epoch": 0.24848940327671726, "flos": 13442944803840.0, "grad_norm": 1.8557605687682572, "language_loss": 0.71320271, "learning_rate": 3.5197807418401386e-06, "loss": 0.73484504, "num_input_tokens_seen": 89047235, "step": 4133, "time_per_iteration": 2.6573541164398193 }, { "auxiliary_loss_clip": 0.01150987, "auxiliary_loss_mlp": 0.0104789, "balance_loss_clip": 1.05105197, "balance_loss_mlp": 1.02624202, "epoch": 0.24854952652938525, "flos": 19970247260160.0, "grad_norm": 3.222598228665933, "language_loss": 0.61894202, "learning_rate": 3.5195275429747834e-06, "loss": 0.64093071, "num_input_tokens_seen": 89064790, "step": 4134, "time_per_iteration": 2.5639493465423584 }, { "auxiliary_loss_clip": 0.01135356, "auxiliary_loss_mlp": 0.01045434, "balance_loss_clip": 1.04877877, "balance_loss_mlp": 1.02764797, "epoch": 0.24860964978205322, "flos": 18150187167360.0, "grad_norm": 1.882175713893398, "language_loss": 0.78382719, "learning_rate": 3.5192742864887914e-06, "loss": 0.80563509, "num_input_tokens_seen": 89083250, "step": 4135, "time_per_iteration": 2.6075639724731445 }, { "auxiliary_loss_clip": 0.01123928, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.05297661, "balance_loss_mlp": 1.01917946, "epoch": 0.24866977303472118, "flos": 11728641329280.0, "grad_norm": 2.4269193192884186, "language_loss": 0.83582413, "learning_rate": 3.5190209723917662e-06, "loss": 0.85742044, "num_input_tokens_seen": 89100905, "step": 4136, "time_per_iteration": 2.623377799987793 }, { "auxiliary_loss_clip": 0.01119838, "auxiliary_loss_mlp": 0.01045223, "balance_loss_clip": 1.05071807, "balance_loss_mlp": 1.02713883, "epoch": 0.24872989628738915, "flos": 34823582565120.0, "grad_norm": 2.1322549527950665, "language_loss": 0.7057327, "learning_rate": 3.518767600693314e-06, "loss": 0.72738326, "num_input_tokens_seen": 89122630, "step": 4137, "time_per_iteration": 2.814115524291992 }, { "auxiliary_loss_clip": 0.01133507, "auxiliary_loss_mlp": 0.00775347, "balance_loss_clip": 1.0449059, "balance_loss_mlp": 1.00107706, "epoch": 0.2487900195400571, "flos": 13699347062400.0, "grad_norm": 2.085766315480858, "language_loss": 0.66914427, "learning_rate": 3.518514171403042e-06, "loss": 0.68823284, "num_input_tokens_seen": 89141050, "step": 4138, "time_per_iteration": 2.646043539047241 }, { "auxiliary_loss_clip": 0.01103579, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.04612446, "balance_loss_mlp": 1.02000237, "epoch": 0.24885014279272508, "flos": 25337815297920.0, "grad_norm": 1.983116672544965, "language_loss": 0.83913636, "learning_rate": 3.51826068453056e-06, "loss": 0.86053687, "num_input_tokens_seen": 89160810, "step": 4139, "time_per_iteration": 2.741090774536133 }, { "auxiliary_loss_clip": 0.01111549, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04586422, "balance_loss_mlp": 1.02192068, "epoch": 0.24891026604539307, "flos": 20631434860800.0, "grad_norm": 1.4951428686450043, "language_loss": 0.78923917, "learning_rate": 3.518007140085481e-06, "loss": 0.81075907, "num_input_tokens_seen": 89180610, "step": 4140, "time_per_iteration": 2.712780237197876 }, { "auxiliary_loss_clip": 0.01048621, "auxiliary_loss_mlp": 0.01096526, "balance_loss_clip": 1.02931261, "balance_loss_mlp": 1.09464228, "epoch": 0.24897038929806103, "flos": 66960294030720.0, "grad_norm": 0.8293539951671052, "language_loss": 0.61007011, "learning_rate": 3.51775353807742e-06, "loss": 0.63152146, "num_input_tokens_seen": 89241880, "step": 4141, "time_per_iteration": 3.240020513534546 }, { "auxiliary_loss_clip": 0.01147379, "auxiliary_loss_mlp": 0.01049841, "balance_loss_clip": 1.05116534, "balance_loss_mlp": 1.03240097, "epoch": 0.249030512550729, "flos": 36392555612160.0, "grad_norm": 2.1246942361961025, "language_loss": 0.72794569, "learning_rate": 3.5174998785159913e-06, "loss": 0.74991786, "num_input_tokens_seen": 89263340, "step": 4142, "time_per_iteration": 2.7316160202026367 }, { "auxiliary_loss_clip": 0.01133287, "auxiliary_loss_mlp": 0.01044374, "balance_loss_clip": 1.04780602, "balance_loss_mlp": 1.02705276, "epoch": 0.24909063580339696, "flos": 20154576879360.0, "grad_norm": 1.7635050074541005, "language_loss": 0.80630821, "learning_rate": 3.5172461614108157e-06, "loss": 0.82808483, "num_input_tokens_seen": 89282870, "step": 4143, "time_per_iteration": 2.6763389110565186 }, { "auxiliary_loss_clip": 0.01117552, "auxiliary_loss_mlp": 0.01036613, "balance_loss_clip": 1.04615402, "balance_loss_mlp": 1.02026916, "epoch": 0.24915075905606493, "flos": 26396569607040.0, "grad_norm": 2.7235452599944145, "language_loss": 0.59766376, "learning_rate": 3.5169923867715137e-06, "loss": 0.61920542, "num_input_tokens_seen": 89303830, "step": 4144, "time_per_iteration": 2.789417266845703 }, { "auxiliary_loss_clip": 0.01128344, "auxiliary_loss_mlp": 0.01045393, "balance_loss_clip": 1.04464769, "balance_loss_mlp": 1.02850127, "epoch": 0.2492108823087329, "flos": 27527216987520.0, "grad_norm": 2.1754585056135047, "language_loss": 0.78476733, "learning_rate": 3.516738554607708e-06, "loss": 0.80650467, "num_input_tokens_seen": 89324350, "step": 4145, "time_per_iteration": 2.8416056632995605 }, { "auxiliary_loss_clip": 0.01140077, "auxiliary_loss_mlp": 0.00778414, "balance_loss_clip": 1.04980016, "balance_loss_mlp": 1.00122261, "epoch": 0.24927100556140086, "flos": 16691388111360.0, "grad_norm": 2.035933799021365, "language_loss": 0.64925039, "learning_rate": 3.5164846649290253e-06, "loss": 0.66843534, "num_input_tokens_seen": 89342875, "step": 4146, "time_per_iteration": 2.818240165710449 }, { "auxiliary_loss_clip": 0.01036642, "auxiliary_loss_mlp": 0.0100618, "balance_loss_clip": 1.02582741, "balance_loss_mlp": 1.00403452, "epoch": 0.24933112881406885, "flos": 62772464286720.0, "grad_norm": 0.9560925601012792, "language_loss": 0.67304933, "learning_rate": 3.5162307177450915e-06, "loss": 0.69347757, "num_input_tokens_seen": 89404925, "step": 4147, "time_per_iteration": 3.339989185333252 }, { "auxiliary_loss_clip": 0.01123141, "auxiliary_loss_mlp": 0.0104863, "balance_loss_clip": 1.04991198, "balance_loss_mlp": 1.03078485, "epoch": 0.24939125206673682, "flos": 26651894457600.0, "grad_norm": 2.4221411280533554, "language_loss": 0.89285177, "learning_rate": 3.5159767130655366e-06, "loss": 0.9145695, "num_input_tokens_seen": 89425090, "step": 4148, "time_per_iteration": 2.7497105598449707 }, { "auxiliary_loss_clip": 0.01098234, "auxiliary_loss_mlp": 0.01049718, "balance_loss_clip": 1.04725289, "balance_loss_mlp": 1.02874899, "epoch": 0.24945137531940478, "flos": 20704333512960.0, "grad_norm": 1.90098046882646, "language_loss": 0.68272161, "learning_rate": 3.5157226508999935e-06, "loss": 0.70420116, "num_input_tokens_seen": 89442615, "step": 4149, "time_per_iteration": 2.7739884853363037 }, { "auxiliary_loss_clip": 0.01134907, "auxiliary_loss_mlp": 0.01044357, "balance_loss_clip": 1.0508213, "balance_loss_mlp": 1.02747655, "epoch": 0.24951149857207275, "flos": 23768662682880.0, "grad_norm": 1.67166255010053, "language_loss": 0.71424097, "learning_rate": 3.515468531258095e-06, "loss": 0.73603356, "num_input_tokens_seen": 89463025, "step": 4150, "time_per_iteration": 2.6801233291625977 }, { "auxiliary_loss_clip": 0.01098898, "auxiliary_loss_mlp": 0.0104939, "balance_loss_clip": 1.04628861, "balance_loss_mlp": 1.03149676, "epoch": 0.2495716218247407, "flos": 15664881237120.0, "grad_norm": 4.371450104119659, "language_loss": 0.72732216, "learning_rate": 3.515214354149478e-06, "loss": 0.74880505, "num_input_tokens_seen": 89480225, "step": 4151, "time_per_iteration": 2.7118351459503174 }, { "auxiliary_loss_clip": 0.01142805, "auxiliary_loss_mlp": 0.01054095, "balance_loss_clip": 1.05117846, "balance_loss_mlp": 1.0357486, "epoch": 0.24963174507740868, "flos": 24052499953920.0, "grad_norm": 3.4200711789217397, "language_loss": 0.63707078, "learning_rate": 3.514960119583781e-06, "loss": 0.65903974, "num_input_tokens_seen": 89496985, "step": 4152, "time_per_iteration": 2.6352219581604004 }, { "auxiliary_loss_clip": 0.01128057, "auxiliary_loss_mlp": 0.01043812, "balance_loss_clip": 1.05110407, "balance_loss_mlp": 1.02628791, "epoch": 0.24969186833007664, "flos": 21799501234560.0, "grad_norm": 3.664579624689737, "language_loss": 0.77259195, "learning_rate": 3.514705827570645e-06, "loss": 0.79431069, "num_input_tokens_seen": 89514420, "step": 4153, "time_per_iteration": 2.6120872497558594 }, { "auxiliary_loss_clip": 0.01135035, "auxiliary_loss_mlp": 0.01042909, "balance_loss_clip": 1.05221617, "balance_loss_mlp": 1.02620757, "epoch": 0.24975199158274464, "flos": 19938143479680.0, "grad_norm": 2.5781435797973833, "language_loss": 0.7677725, "learning_rate": 3.514451478119711e-06, "loss": 0.78955191, "num_input_tokens_seen": 89532925, "step": 4154, "time_per_iteration": 2.7488853931427 }, { "auxiliary_loss_clip": 0.0113655, "auxiliary_loss_mlp": 0.01051969, "balance_loss_clip": 1.05146766, "balance_loss_mlp": 1.03251421, "epoch": 0.2498121148354126, "flos": 25338389915520.0, "grad_norm": 1.9052782276095375, "language_loss": 0.70335877, "learning_rate": 3.5141970712406258e-06, "loss": 0.72524405, "num_input_tokens_seen": 89552855, "step": 4155, "time_per_iteration": 2.6622395515441895 }, { "auxiliary_loss_clip": 0.01127695, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.05243564, "balance_loss_mlp": 1.03074658, "epoch": 0.24987223808808057, "flos": 20558787603840.0, "grad_norm": 1.6974192026095432, "language_loss": 0.74953228, "learning_rate": 3.513942606943036e-06, "loss": 0.77128726, "num_input_tokens_seen": 89572830, "step": 4156, "time_per_iteration": 2.7599329948425293 }, { "auxiliary_loss_clip": 0.01127061, "auxiliary_loss_mlp": 0.01040498, "balance_loss_clip": 1.04922485, "balance_loss_mlp": 1.02404737, "epoch": 0.24993236134074853, "flos": 19749037351680.0, "grad_norm": 2.6541448192787858, "language_loss": 0.76703429, "learning_rate": 3.513688085236591e-06, "loss": 0.78870988, "num_input_tokens_seen": 89590345, "step": 4157, "time_per_iteration": 4.172720432281494 }, { "auxiliary_loss_clip": 0.01087279, "auxiliary_loss_mlp": 0.01050682, "balance_loss_clip": 1.04686046, "balance_loss_mlp": 1.03302717, "epoch": 0.2499924845934165, "flos": 18770292587520.0, "grad_norm": 6.508490360255271, "language_loss": 0.81656492, "learning_rate": 3.513433506130942e-06, "loss": 0.83794451, "num_input_tokens_seen": 89610295, "step": 4158, "time_per_iteration": 4.373260736465454 }, { "auxiliary_loss_clip": 0.01115824, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.04740119, "balance_loss_mlp": 1.02166879, "epoch": 0.25005260784608446, "flos": 16872198197760.0, "grad_norm": 2.799032697181286, "language_loss": 0.76568067, "learning_rate": 3.5131788696357427e-06, "loss": 0.78723395, "num_input_tokens_seen": 89627795, "step": 4159, "time_per_iteration": 2.6529338359832764 }, { "auxiliary_loss_clip": 0.01139337, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.05149508, "balance_loss_mlp": 1.02013946, "epoch": 0.2501127310987524, "flos": 22124923476480.0, "grad_norm": 2.4918403268433122, "language_loss": 0.71557873, "learning_rate": 3.512924175760649e-06, "loss": 0.73735791, "num_input_tokens_seen": 89648090, "step": 4160, "time_per_iteration": 4.178418874740601 }, { "auxiliary_loss_clip": 0.01062459, "auxiliary_loss_mlp": 0.01001923, "balance_loss_clip": 1.02823949, "balance_loss_mlp": 0.99992067, "epoch": 0.2501728543514204, "flos": 69458061980160.0, "grad_norm": 0.7611682305123987, "language_loss": 0.56783372, "learning_rate": 3.5126694245153186e-06, "loss": 0.58847755, "num_input_tokens_seen": 89710345, "step": 4161, "time_per_iteration": 3.1690969467163086 }, { "auxiliary_loss_clip": 0.0114076, "auxiliary_loss_mlp": 0.01048659, "balance_loss_clip": 1.05206347, "balance_loss_mlp": 1.0308131, "epoch": 0.25023297760408836, "flos": 16289978647680.0, "grad_norm": 4.523737291621751, "language_loss": 0.80654883, "learning_rate": 3.5124146159094125e-06, "loss": 0.82844305, "num_input_tokens_seen": 89729390, "step": 4162, "time_per_iteration": 2.630491018295288 }, { "auxiliary_loss_clip": 0.01127145, "auxiliary_loss_mlp": 0.00776859, "balance_loss_clip": 1.04807281, "balance_loss_mlp": 1.00124371, "epoch": 0.2502931008567563, "flos": 12237998140800.0, "grad_norm": 3.0029202967601107, "language_loss": 0.87312925, "learning_rate": 3.5121597499525927e-06, "loss": 0.89216936, "num_input_tokens_seen": 89742805, "step": 4163, "time_per_iteration": 2.660985231399536 }, { "auxiliary_loss_clip": 0.01133331, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.0538981, "balance_loss_mlp": 1.02234972, "epoch": 0.25035322410942434, "flos": 23181882105600.0, "grad_norm": 1.700690076898522, "language_loss": 0.83170879, "learning_rate": 3.5119048266545232e-06, "loss": 0.85343885, "num_input_tokens_seen": 89761145, "step": 4164, "time_per_iteration": 4.217406988143921 }, { "auxiliary_loss_clip": 0.01131608, "auxiliary_loss_mlp": 0.01047174, "balance_loss_clip": 1.05681539, "balance_loss_mlp": 1.0309732, "epoch": 0.2504133473620923, "flos": 20917534688640.0, "grad_norm": 1.61687510361108, "language_loss": 0.73889691, "learning_rate": 3.5116498460248716e-06, "loss": 0.76068473, "num_input_tokens_seen": 89780905, "step": 4165, "time_per_iteration": 2.7395150661468506 }, { "auxiliary_loss_clip": 0.01112927, "auxiliary_loss_mlp": 0.01043589, "balance_loss_clip": 1.04912043, "balance_loss_mlp": 1.02611279, "epoch": 0.2504734706147603, "flos": 20776549806720.0, "grad_norm": 1.856982928728685, "language_loss": 0.74739552, "learning_rate": 3.5113948080733062e-06, "loss": 0.7689606, "num_input_tokens_seen": 89799230, "step": 4166, "time_per_iteration": 2.7567081451416016 }, { "auxiliary_loss_clip": 0.01110594, "auxiliary_loss_mlp": 0.01042647, "balance_loss_clip": 1.04968488, "balance_loss_mlp": 1.02651834, "epoch": 0.25053359386742824, "flos": 24349373861760.0, "grad_norm": 2.0013578528528724, "language_loss": 0.82254446, "learning_rate": 3.5111397128094973e-06, "loss": 0.84407687, "num_input_tokens_seen": 89818240, "step": 4167, "time_per_iteration": 2.692664384841919 }, { "auxiliary_loss_clip": 0.01130059, "auxiliary_loss_mlp": 0.01043694, "balance_loss_clip": 1.05185139, "balance_loss_mlp": 1.02695727, "epoch": 0.2505937171200962, "flos": 21214336769280.0, "grad_norm": 2.4392619558537407, "language_loss": 0.79381847, "learning_rate": 3.51088456024312e-06, "loss": 0.81555605, "num_input_tokens_seen": 89834485, "step": 4168, "time_per_iteration": 2.6286962032318115 }, { "auxiliary_loss_clip": 0.01138966, "auxiliary_loss_mlp": 0.01046302, "balance_loss_clip": 1.05118442, "balance_loss_mlp": 1.02704966, "epoch": 0.25065384037276417, "flos": 41427231379200.0, "grad_norm": 2.2753262043393243, "language_loss": 0.69603884, "learning_rate": 3.510629350383849e-06, "loss": 0.71789157, "num_input_tokens_seen": 89855645, "step": 4169, "time_per_iteration": 2.7935590744018555 }, { "auxiliary_loss_clip": 0.01110761, "auxiliary_loss_mlp": 0.01049625, "balance_loss_clip": 1.04870963, "balance_loss_mlp": 1.03274524, "epoch": 0.25071396362543213, "flos": 26102389219200.0, "grad_norm": 1.8250030020409629, "language_loss": 0.78045398, "learning_rate": 3.510374083241361e-06, "loss": 0.80205786, "num_input_tokens_seen": 89874895, "step": 4170, "time_per_iteration": 2.7728679180145264 }, { "auxiliary_loss_clip": 0.01128286, "auxiliary_loss_mlp": 0.01043437, "balance_loss_clip": 1.05320668, "balance_loss_mlp": 1.02662849, "epoch": 0.2507740868781001, "flos": 19098982967040.0, "grad_norm": 2.5073993684848004, "language_loss": 0.76440209, "learning_rate": 3.5101187588253368e-06, "loss": 0.78611928, "num_input_tokens_seen": 89891700, "step": 4171, "time_per_iteration": 2.7825160026550293 }, { "auxiliary_loss_clip": 0.01061117, "auxiliary_loss_mlp": 0.01002396, "balance_loss_clip": 1.027282, "balance_loss_mlp": 1.00034571, "epoch": 0.25083421013076806, "flos": 64341868296960.0, "grad_norm": 0.8424544393272001, "language_loss": 0.6006161, "learning_rate": 3.509863377145458e-06, "loss": 0.62125123, "num_input_tokens_seen": 89955775, "step": 4172, "time_per_iteration": 3.1981940269470215 }, { "auxiliary_loss_clip": 0.01125517, "auxiliary_loss_mlp": 0.01046213, "balance_loss_clip": 1.05005789, "balance_loss_mlp": 1.02821243, "epoch": 0.25089433338343603, "flos": 24279599692800.0, "grad_norm": 1.4368714421460043, "language_loss": 0.79106563, "learning_rate": 3.509607938211409e-06, "loss": 0.81278288, "num_input_tokens_seen": 89977150, "step": 4173, "time_per_iteration": 2.8311028480529785 }, { "auxiliary_loss_clip": 0.01152553, "auxiliary_loss_mlp": 0.0104675, "balance_loss_clip": 1.05725241, "balance_loss_mlp": 1.02986968, "epoch": 0.250954456636104, "flos": 14721472477440.0, "grad_norm": 2.103663042812158, "language_loss": 0.83371937, "learning_rate": 3.509352442032875e-06, "loss": 0.85571229, "num_input_tokens_seen": 89994925, "step": 4174, "time_per_iteration": 2.696199893951416 }, { "auxiliary_loss_clip": 0.01095749, "auxiliary_loss_mlp": 0.01049206, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.03095484, "epoch": 0.25101457988877196, "flos": 22273593868800.0, "grad_norm": 43.022796554959484, "language_loss": 0.71023381, "learning_rate": 3.509096888619545e-06, "loss": 0.73168337, "num_input_tokens_seen": 90013235, "step": 4175, "time_per_iteration": 2.8337926864624023 }, { "auxiliary_loss_clip": 0.01119154, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.05135846, "balance_loss_mlp": 1.02145982, "epoch": 0.2510747031414399, "flos": 25188929424000.0, "grad_norm": 2.017414900854033, "language_loss": 0.80957019, "learning_rate": 3.50884127798111e-06, "loss": 0.83115101, "num_input_tokens_seen": 90032150, "step": 4176, "time_per_iteration": 2.936908483505249 }, { "auxiliary_loss_clip": 0.01127542, "auxiliary_loss_mlp": 0.0104611, "balance_loss_clip": 1.0535233, "balance_loss_mlp": 1.02753711, "epoch": 0.25113482639410795, "flos": 20704189858560.0, "grad_norm": 2.475574978330162, "language_loss": 0.82294285, "learning_rate": 3.5085856101272623e-06, "loss": 0.84467936, "num_input_tokens_seen": 90049085, "step": 4177, "time_per_iteration": 2.7630460262298584 }, { "auxiliary_loss_clip": 0.01110202, "auxiliary_loss_mlp": 0.01051495, "balance_loss_clip": 1.05168724, "balance_loss_mlp": 1.03386414, "epoch": 0.2511949496467759, "flos": 21506936958720.0, "grad_norm": 2.1761277698635593, "language_loss": 0.82517993, "learning_rate": 3.508329885067698e-06, "loss": 0.84679693, "num_input_tokens_seen": 90067695, "step": 4178, "time_per_iteration": 2.7356274127960205 }, { "auxiliary_loss_clip": 0.01145101, "auxiliary_loss_mlp": 0.00775573, "balance_loss_clip": 1.05324888, "balance_loss_mlp": 1.00148535, "epoch": 0.2512550728994439, "flos": 20701999128960.0, "grad_norm": 2.1475299559000947, "language_loss": 0.75229692, "learning_rate": 3.508074102812112e-06, "loss": 0.77150369, "num_input_tokens_seen": 90083890, "step": 4179, "time_per_iteration": 2.631096363067627 }, { "auxiliary_loss_clip": 0.01109293, "auxiliary_loss_mlp": 0.01056583, "balance_loss_clip": 1.04920673, "balance_loss_mlp": 1.03833175, "epoch": 0.25131519615211184, "flos": 18478626151680.0, "grad_norm": 1.9599833122138943, "language_loss": 0.69976825, "learning_rate": 3.507818263370206e-06, "loss": 0.72142696, "num_input_tokens_seen": 90100995, "step": 4180, "time_per_iteration": 2.708122730255127 }, { "auxiliary_loss_clip": 0.01147992, "auxiliary_loss_mlp": 0.01045783, "balance_loss_clip": 1.05343485, "balance_loss_mlp": 1.02909422, "epoch": 0.2513753194047798, "flos": 20484955198080.0, "grad_norm": 1.8622914556591927, "language_loss": 0.85940182, "learning_rate": 3.5075623667516796e-06, "loss": 0.88133955, "num_input_tokens_seen": 90120365, "step": 4181, "time_per_iteration": 2.633091449737549 }, { "auxiliary_loss_clip": 0.01148017, "auxiliary_loss_mlp": 0.01049707, "balance_loss_clip": 1.05351245, "balance_loss_mlp": 1.03270781, "epoch": 0.25143544265744777, "flos": 37670077704960.0, "grad_norm": 2.0695978407502467, "language_loss": 0.6856631, "learning_rate": 3.507306412966238e-06, "loss": 0.70764029, "num_input_tokens_seen": 90142610, "step": 4182, "time_per_iteration": 2.8169894218444824 }, { "auxiliary_loss_clip": 0.01041202, "auxiliary_loss_mlp": 0.010083, "balance_loss_clip": 1.02456141, "balance_loss_mlp": 1.00577307, "epoch": 0.25149556591011574, "flos": 69367457923200.0, "grad_norm": 0.8403189096432666, "language_loss": 0.70032597, "learning_rate": 3.5070504020235853e-06, "loss": 0.72082102, "num_input_tokens_seen": 90200555, "step": 4183, "time_per_iteration": 3.2070610523223877 }, { "auxiliary_loss_clip": 0.01130203, "auxiliary_loss_mlp": 0.01042834, "balance_loss_clip": 1.05145216, "balance_loss_mlp": 1.02441609, "epoch": 0.2515556891627837, "flos": 13990402967040.0, "grad_norm": 1.8802113118438855, "language_loss": 0.73834902, "learning_rate": 3.506794333933431e-06, "loss": 0.76007938, "num_input_tokens_seen": 90218120, "step": 4184, "time_per_iteration": 2.691950559616089 }, { "auxiliary_loss_clip": 0.01136971, "auxiliary_loss_mlp": 0.01047362, "balance_loss_clip": 1.05233765, "balance_loss_mlp": 1.0297792, "epoch": 0.25161581241545167, "flos": 22163527618560.0, "grad_norm": 1.8676646084141537, "language_loss": 0.8334859, "learning_rate": 3.506538208705484e-06, "loss": 0.85532916, "num_input_tokens_seen": 90236790, "step": 4185, "time_per_iteration": 2.6931228637695312 }, { "auxiliary_loss_clip": 0.01022217, "auxiliary_loss_mlp": 0.01010846, "balance_loss_clip": 1.03471541, "balance_loss_mlp": 1.00902176, "epoch": 0.25167593566811963, "flos": 69358407696000.0, "grad_norm": 0.7883550117667959, "language_loss": 0.61448294, "learning_rate": 3.5062820263494574e-06, "loss": 0.63481361, "num_input_tokens_seen": 90297070, "step": 4186, "time_per_iteration": 3.175295829772949 }, { "auxiliary_loss_clip": 0.01107804, "auxiliary_loss_mlp": 0.01041844, "balance_loss_clip": 1.04873872, "balance_loss_mlp": 1.02405787, "epoch": 0.2517360589207876, "flos": 13261452359040.0, "grad_norm": 1.8553357788385085, "language_loss": 0.79070914, "learning_rate": 3.5060257868750656e-06, "loss": 0.81220555, "num_input_tokens_seen": 90315255, "step": 4187, "time_per_iteration": 2.887378215789795 }, { "auxiliary_loss_clip": 0.01091434, "auxiliary_loss_mlp": 0.01049489, "balance_loss_clip": 1.0482558, "balance_loss_mlp": 1.03138089, "epoch": 0.25179618217345556, "flos": 20376828282240.0, "grad_norm": 3.7749228259968586, "language_loss": 0.79629189, "learning_rate": 3.5057694902920244e-06, "loss": 0.8177011, "num_input_tokens_seen": 90334990, "step": 4188, "time_per_iteration": 2.8985629081726074 }, { "auxiliary_loss_clip": 0.01133381, "auxiliary_loss_mlp": 0.01046993, "balance_loss_clip": 1.05168021, "balance_loss_mlp": 1.03012538, "epoch": 0.25185630542612353, "flos": 27664718250240.0, "grad_norm": 1.7363151422402578, "language_loss": 0.74419165, "learning_rate": 3.5055131366100534e-06, "loss": 0.76599538, "num_input_tokens_seen": 90351825, "step": 4189, "time_per_iteration": 2.697097063064575 }, { "auxiliary_loss_clip": 0.01118534, "auxiliary_loss_mlp": 0.01044827, "balance_loss_clip": 1.04871011, "balance_loss_mlp": 1.02862656, "epoch": 0.25191642867879155, "flos": 20996430912000.0, "grad_norm": 2.0536634388060078, "language_loss": 0.84721291, "learning_rate": 3.5052567258388745e-06, "loss": 0.86884648, "num_input_tokens_seen": 90369860, "step": 4190, "time_per_iteration": 2.731227397918701 }, { "auxiliary_loss_clip": 0.01118209, "auxiliary_loss_mlp": 0.01044895, "balance_loss_clip": 1.04597688, "balance_loss_mlp": 1.02633369, "epoch": 0.2519765519314595, "flos": 21105671149440.0, "grad_norm": 2.0130913170662783, "language_loss": 0.75695485, "learning_rate": 3.5050002579882082e-06, "loss": 0.77858591, "num_input_tokens_seen": 90389245, "step": 4191, "time_per_iteration": 2.7403173446655273 }, { "auxiliary_loss_clip": 0.01048031, "auxiliary_loss_mlp": 0.01014765, "balance_loss_clip": 1.02375531, "balance_loss_mlp": 1.0122261, "epoch": 0.2520366751841275, "flos": 62744993360640.0, "grad_norm": 0.7280864395517058, "language_loss": 0.57129633, "learning_rate": 3.5047437330677823e-06, "loss": 0.59192419, "num_input_tokens_seen": 90456735, "step": 4192, "time_per_iteration": 3.237478017807007 }, { "auxiliary_loss_clip": 0.01121978, "auxiliary_loss_mlp": 0.01041578, "balance_loss_clip": 1.05535698, "balance_loss_mlp": 1.02374434, "epoch": 0.25209679843679544, "flos": 22230716008320.0, "grad_norm": 1.8423117439969312, "language_loss": 0.76066267, "learning_rate": 3.504487151087323e-06, "loss": 0.78229821, "num_input_tokens_seen": 90474165, "step": 4193, "time_per_iteration": 2.699486255645752 }, { "auxiliary_loss_clip": 0.01137884, "auxiliary_loss_mlp": 0.01046125, "balance_loss_clip": 1.05232048, "balance_loss_mlp": 1.02869618, "epoch": 0.2521569216894634, "flos": 12166643773440.0, "grad_norm": 3.5003037089711437, "language_loss": 0.84335077, "learning_rate": 3.5042305120565598e-06, "loss": 0.86519086, "num_input_tokens_seen": 90491660, "step": 4194, "time_per_iteration": 2.6561896800994873 }, { "auxiliary_loss_clip": 0.01149932, "auxiliary_loss_mlp": 0.01050793, "balance_loss_clip": 1.05253458, "balance_loss_mlp": 1.03461599, "epoch": 0.2522170449421314, "flos": 23699786353920.0, "grad_norm": 1.3753304678825264, "language_loss": 0.88249695, "learning_rate": 3.5039738159852253e-06, "loss": 0.90450418, "num_input_tokens_seen": 90514025, "step": 4195, "time_per_iteration": 2.67887806892395 }, { "auxiliary_loss_clip": 0.01150202, "auxiliary_loss_mlp": 0.01041959, "balance_loss_clip": 1.05412734, "balance_loss_mlp": 1.02199149, "epoch": 0.25227716819479934, "flos": 20955456472320.0, "grad_norm": 2.4146072325129087, "language_loss": 0.85488242, "learning_rate": 3.503717062883053e-06, "loss": 0.87680399, "num_input_tokens_seen": 90533530, "step": 4196, "time_per_iteration": 2.6358916759490967 }, { "auxiliary_loss_clip": 0.01137804, "auxiliary_loss_mlp": 0.01049246, "balance_loss_clip": 1.05213511, "balance_loss_mlp": 1.03193665, "epoch": 0.2523372914474673, "flos": 23331342597120.0, "grad_norm": 1.9329643035636839, "language_loss": 0.8319478, "learning_rate": 3.5034602527597786e-06, "loss": 0.8538183, "num_input_tokens_seen": 90554025, "step": 4197, "time_per_iteration": 5.738839387893677 }, { "auxiliary_loss_clip": 0.01140063, "auxiliary_loss_mlp": 0.01051416, "balance_loss_clip": 1.05392218, "balance_loss_mlp": 1.03224671, "epoch": 0.25239741470013527, "flos": 36970321875840.0, "grad_norm": 2.1358917159416104, "language_loss": 0.72820318, "learning_rate": 3.5032033856251405e-06, "loss": 0.75011802, "num_input_tokens_seen": 90576930, "step": 4198, "time_per_iteration": 2.8819963932037354 }, { "auxiliary_loss_clip": 0.01152924, "auxiliary_loss_mlp": 0.01048555, "balance_loss_clip": 1.05455935, "balance_loss_mlp": 1.03045893, "epoch": 0.25245753795280323, "flos": 18515757836160.0, "grad_norm": 6.722547943004915, "language_loss": 0.76560014, "learning_rate": 3.50294646148888e-06, "loss": 0.78761488, "num_input_tokens_seen": 90595710, "step": 4199, "time_per_iteration": 2.636993169784546 }, { "auxiliary_loss_clip": 0.01125413, "auxiliary_loss_mlp": 0.00776026, "balance_loss_clip": 1.05274642, "balance_loss_mlp": 1.00117147, "epoch": 0.2525176612054712, "flos": 32344884737280.0, "grad_norm": 1.814097809936595, "language_loss": 0.73571241, "learning_rate": 3.502689480360739e-06, "loss": 0.75472683, "num_input_tokens_seen": 90617945, "step": 4200, "time_per_iteration": 4.297755002975464 }, { "auxiliary_loss_clip": 0.01137136, "auxiliary_loss_mlp": 0.01047957, "balance_loss_clip": 1.05050063, "balance_loss_mlp": 1.03187585, "epoch": 0.25257778445813917, "flos": 45258217459200.0, "grad_norm": 1.6490086858694837, "language_loss": 0.8223114, "learning_rate": 3.5024324422504616e-06, "loss": 0.84416234, "num_input_tokens_seen": 90640855, "step": 4201, "time_per_iteration": 2.859703302383423 }, { "auxiliary_loss_clip": 0.01098423, "auxiliary_loss_mlp": 0.01048, "balance_loss_clip": 1.05422068, "balance_loss_mlp": 1.03126347, "epoch": 0.25263790771080713, "flos": 23367791923200.0, "grad_norm": 1.9307441853812024, "language_loss": 0.74854887, "learning_rate": 3.5021753471677965e-06, "loss": 0.77001321, "num_input_tokens_seen": 90661350, "step": 4202, "time_per_iteration": 2.7475366592407227 }, { "auxiliary_loss_clip": 0.01134371, "auxiliary_loss_mlp": 0.01040637, "balance_loss_clip": 1.05362439, "balance_loss_mlp": 1.02392364, "epoch": 0.25269803096347515, "flos": 18515039564160.0, "grad_norm": 1.882597455778369, "language_loss": 0.7323755, "learning_rate": 3.501918195122491e-06, "loss": 0.75412554, "num_input_tokens_seen": 90680540, "step": 4203, "time_per_iteration": 2.6547653675079346 }, { "auxiliary_loss_clip": 0.01128208, "auxiliary_loss_mlp": 0.01039636, "balance_loss_clip": 1.05176711, "balance_loss_mlp": 1.02239835, "epoch": 0.2527581542161431, "flos": 24610552629120.0, "grad_norm": 1.4386036639708744, "language_loss": 0.77731073, "learning_rate": 3.501660986124297e-06, "loss": 0.79898918, "num_input_tokens_seen": 90703460, "step": 4204, "time_per_iteration": 4.4116432666778564 }, { "auxiliary_loss_clip": 0.01115267, "auxiliary_loss_mlp": 0.01052396, "balance_loss_clip": 1.05262613, "balance_loss_mlp": 1.03453815, "epoch": 0.2528182774688111, "flos": 12641275111680.0, "grad_norm": 1.9357035590368088, "language_loss": 0.72175288, "learning_rate": 3.5014037201829684e-06, "loss": 0.74342954, "num_input_tokens_seen": 90718815, "step": 4205, "time_per_iteration": 2.6750712394714355 }, { "auxiliary_loss_clip": 0.01124756, "auxiliary_loss_mlp": 0.01044172, "balance_loss_clip": 1.05032194, "balance_loss_mlp": 1.02801895, "epoch": 0.25287840072147905, "flos": 46936789879680.0, "grad_norm": 1.4680577763339375, "language_loss": 0.75594378, "learning_rate": 3.50114639730826e-06, "loss": 0.77763301, "num_input_tokens_seen": 90742125, "step": 4206, "time_per_iteration": 2.876408815383911 }, { "auxiliary_loss_clip": 0.01107683, "auxiliary_loss_mlp": 0.01044618, "balance_loss_clip": 1.04771221, "balance_loss_mlp": 1.02780974, "epoch": 0.252938523974147, "flos": 18879712392960.0, "grad_norm": 1.5378963492414741, "language_loss": 0.78807724, "learning_rate": 3.5008890175099296e-06, "loss": 0.80960023, "num_input_tokens_seen": 90760785, "step": 4207, "time_per_iteration": 2.7176475524902344 }, { "auxiliary_loss_clip": 0.01133715, "auxiliary_loss_mlp": 0.01055631, "balance_loss_clip": 1.0547328, "balance_loss_mlp": 1.03984797, "epoch": 0.252998647226815, "flos": 21434720664960.0, "grad_norm": 1.5723877129370716, "language_loss": 0.76399815, "learning_rate": 3.5006315807977375e-06, "loss": 0.78589159, "num_input_tokens_seen": 90780045, "step": 4208, "time_per_iteration": 2.797658920288086 }, { "auxiliary_loss_clip": 0.01131059, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.05162513, "balance_loss_mlp": 1.02465391, "epoch": 0.25305877047948294, "flos": 25442171285760.0, "grad_norm": 3.9595354320915166, "language_loss": 0.69848049, "learning_rate": 3.5003740871814456e-06, "loss": 0.72019976, "num_input_tokens_seen": 90797980, "step": 4209, "time_per_iteration": 2.738159418106079 }, { "auxiliary_loss_clip": 0.01046521, "auxiliary_loss_mlp": 0.0100386, "balance_loss_clip": 1.02250004, "balance_loss_mlp": 1.0015471, "epoch": 0.2531188937321509, "flos": 60185603629440.0, "grad_norm": 0.7787603502724176, "language_loss": 0.55091059, "learning_rate": 3.5001165366708175e-06, "loss": 0.57141441, "num_input_tokens_seen": 90864865, "step": 4210, "time_per_iteration": 3.196953535079956 }, { "auxiliary_loss_clip": 0.01113643, "auxiliary_loss_mlp": 0.01038759, "balance_loss_clip": 1.05103207, "balance_loss_mlp": 1.02215338, "epoch": 0.25317901698481887, "flos": 19682387665920.0, "grad_norm": 1.8504444580052586, "language_loss": 0.8006835, "learning_rate": 3.4998589292756204e-06, "loss": 0.82220757, "num_input_tokens_seen": 90882885, "step": 4211, "time_per_iteration": 2.7241647243499756 }, { "auxiliary_loss_clip": 0.01095085, "auxiliary_loss_mlp": 0.01044368, "balance_loss_clip": 1.04594803, "balance_loss_mlp": 1.02844775, "epoch": 0.25323914023748684, "flos": 24424355502720.0, "grad_norm": 1.531596575729193, "language_loss": 0.78362429, "learning_rate": 3.499601265005622e-06, "loss": 0.80501878, "num_input_tokens_seen": 90902985, "step": 4212, "time_per_iteration": 2.788607358932495 }, { "auxiliary_loss_clip": 0.01133893, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.04857254, "balance_loss_mlp": 1.02401471, "epoch": 0.2532992634901548, "flos": 25447450584960.0, "grad_norm": 2.123277134845907, "language_loss": 0.53516036, "learning_rate": 3.4993435438705938e-06, "loss": 0.55690968, "num_input_tokens_seen": 90923550, "step": 4213, "time_per_iteration": 2.6675784587860107 }, { "auxiliary_loss_clip": 0.01120924, "auxiliary_loss_mlp": 0.01044765, "balance_loss_clip": 1.05005503, "balance_loss_mlp": 1.0273726, "epoch": 0.25335938674282277, "flos": 18880538405760.0, "grad_norm": 2.4965805840577002, "language_loss": 0.65416414, "learning_rate": 3.499085765880308e-06, "loss": 0.67582107, "num_input_tokens_seen": 90943260, "step": 4214, "time_per_iteration": 2.691359281539917 }, { "auxiliary_loss_clip": 0.01046401, "auxiliary_loss_mlp": 0.01002761, "balance_loss_clip": 1.02238619, "balance_loss_mlp": 1.00056791, "epoch": 0.25341950999549073, "flos": 53062649936640.0, "grad_norm": 0.8515065776804692, "language_loss": 0.58004916, "learning_rate": 3.4988279310445396e-06, "loss": 0.60054076, "num_input_tokens_seen": 90996295, "step": 4215, "time_per_iteration": 2.981840133666992 }, { "auxiliary_loss_clip": 0.01124794, "auxiliary_loss_mlp": 0.01043531, "balance_loss_clip": 1.05316496, "balance_loss_mlp": 1.02655554, "epoch": 0.2534796332481587, "flos": 39020247054720.0, "grad_norm": 1.7497766885830588, "language_loss": 0.83251095, "learning_rate": 3.498570039373066e-06, "loss": 0.85419416, "num_input_tokens_seen": 91017545, "step": 4216, "time_per_iteration": 2.912137508392334 }, { "auxiliary_loss_clip": 0.0112972, "auxiliary_loss_mlp": 0.01040052, "balance_loss_clip": 1.05088937, "balance_loss_mlp": 1.02338624, "epoch": 0.2535397565008267, "flos": 23586990670080.0, "grad_norm": 3.3733415491927996, "language_loss": 0.80008072, "learning_rate": 3.498312090875666e-06, "loss": 0.82177842, "num_input_tokens_seen": 91037715, "step": 4217, "time_per_iteration": 2.6532363891601562 }, { "auxiliary_loss_clip": 0.01116019, "auxiliary_loss_mlp": 0.01038346, "balance_loss_clip": 1.04436612, "balance_loss_mlp": 1.02234793, "epoch": 0.2535998797534947, "flos": 19281373251840.0, "grad_norm": 2.333881972650505, "language_loss": 0.75585902, "learning_rate": 3.4980540855621218e-06, "loss": 0.77740264, "num_input_tokens_seen": 91055295, "step": 4218, "time_per_iteration": 2.650867223739624 }, { "auxiliary_loss_clip": 0.0113544, "auxiliary_loss_mlp": 0.0104021, "balance_loss_clip": 1.04940748, "balance_loss_mlp": 1.0229727, "epoch": 0.25366000300616265, "flos": 24024382583040.0, "grad_norm": 2.040148074486094, "language_loss": 0.74188256, "learning_rate": 3.4977960234422167e-06, "loss": 0.76363909, "num_input_tokens_seen": 91075485, "step": 4219, "time_per_iteration": 2.727161169052124 }, { "auxiliary_loss_clip": 0.01138406, "auxiliary_loss_mlp": 0.01048455, "balance_loss_clip": 1.05222011, "balance_loss_mlp": 1.03138447, "epoch": 0.2537201262588306, "flos": 16289368116480.0, "grad_norm": 4.990704095966988, "language_loss": 0.81355274, "learning_rate": 3.497537904525736e-06, "loss": 0.83542132, "num_input_tokens_seen": 91093620, "step": 4220, "time_per_iteration": 2.6146652698516846 }, { "auxiliary_loss_clip": 0.01100698, "auxiliary_loss_mlp": 0.01049127, "balance_loss_clip": 1.04988587, "balance_loss_mlp": 1.03041148, "epoch": 0.2537802495114986, "flos": 23294677789440.0, "grad_norm": 2.3092995740689197, "language_loss": 0.70819569, "learning_rate": 3.497279728822468e-06, "loss": 0.72969389, "num_input_tokens_seen": 91114110, "step": 4221, "time_per_iteration": 2.851747751235962 }, { "auxiliary_loss_clip": 0.0114682, "auxiliary_loss_mlp": 0.01039444, "balance_loss_clip": 1.05224657, "balance_loss_mlp": 1.02257586, "epoch": 0.25384037276416654, "flos": 17639142416640.0, "grad_norm": 2.4229893622177188, "language_loss": 0.61689377, "learning_rate": 3.497021496342202e-06, "loss": 0.63875645, "num_input_tokens_seen": 91133135, "step": 4222, "time_per_iteration": 2.6394412517547607 }, { "auxiliary_loss_clip": 0.01138378, "auxiliary_loss_mlp": 0.01051871, "balance_loss_clip": 1.05371165, "balance_loss_mlp": 1.03528929, "epoch": 0.2539004960168345, "flos": 21507044699520.0, "grad_norm": 1.6839261376783914, "language_loss": 0.74744058, "learning_rate": 3.496763207094731e-06, "loss": 0.76934308, "num_input_tokens_seen": 91151805, "step": 4223, "time_per_iteration": 2.648322105407715 }, { "auxiliary_loss_clip": 0.01092255, "auxiliary_loss_mlp": 0.01039082, "balance_loss_clip": 1.04767203, "balance_loss_mlp": 1.02325082, "epoch": 0.2539606192695025, "flos": 23950909313280.0, "grad_norm": 1.7092524284111348, "language_loss": 0.80226004, "learning_rate": 3.49650486108985e-06, "loss": 0.82357341, "num_input_tokens_seen": 91172270, "step": 4224, "time_per_iteration": 2.7572662830352783 }, { "auxiliary_loss_clip": 0.01130506, "auxiliary_loss_mlp": 0.00774076, "balance_loss_clip": 1.05102324, "balance_loss_mlp": 1.00112057, "epoch": 0.25402074252217044, "flos": 24169784837760.0, "grad_norm": 1.4497407173280796, "language_loss": 0.77330017, "learning_rate": 3.496246458337354e-06, "loss": 0.792346, "num_input_tokens_seen": 91192080, "step": 4225, "time_per_iteration": 2.7661190032958984 }, { "auxiliary_loss_clip": 0.01130647, "auxiliary_loss_mlp": 0.01049954, "balance_loss_clip": 1.04919255, "balance_loss_mlp": 1.03271639, "epoch": 0.2540808657748384, "flos": 22303758314880.0, "grad_norm": 2.0615353379683055, "language_loss": 0.84638137, "learning_rate": 3.4959879988470426e-06, "loss": 0.86818743, "num_input_tokens_seen": 91211450, "step": 4226, "time_per_iteration": 2.690683126449585 }, { "auxiliary_loss_clip": 0.01143268, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.05067408, "balance_loss_mlp": 1.02613425, "epoch": 0.25414098902750637, "flos": 27599541022080.0, "grad_norm": 1.5600656222031943, "language_loss": 0.70886129, "learning_rate": 3.4957294826287164e-06, "loss": 0.73072731, "num_input_tokens_seen": 91231835, "step": 4227, "time_per_iteration": 2.6647307872772217 }, { "auxiliary_loss_clip": 0.01055229, "auxiliary_loss_mlp": 0.01001956, "balance_loss_clip": 1.02168798, "balance_loss_mlp": 0.9995476, "epoch": 0.25420111228017434, "flos": 58170834887040.0, "grad_norm": 0.9869295588353136, "language_loss": 0.61927998, "learning_rate": 3.4954709096921785e-06, "loss": 0.63985181, "num_input_tokens_seen": 91288755, "step": 4228, "time_per_iteration": 2.986067533493042 }, { "auxiliary_loss_clip": 0.01124878, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.0464859, "balance_loss_mlp": 1.02212501, "epoch": 0.2542612355328423, "flos": 11464409905920.0, "grad_norm": 2.314170874410929, "language_loss": 0.86946094, "learning_rate": 3.4952122800472336e-06, "loss": 0.8911112, "num_input_tokens_seen": 91302485, "step": 4229, "time_per_iteration": 2.629518985748291 }, { "auxiliary_loss_clip": 0.01102882, "auxiliary_loss_mlp": 0.01042519, "balance_loss_clip": 1.04811144, "balance_loss_mlp": 1.0241369, "epoch": 0.2543213587855103, "flos": 22965879669120.0, "grad_norm": 1.7811216632522446, "language_loss": 0.77265114, "learning_rate": 3.4949535937036892e-06, "loss": 0.79410517, "num_input_tokens_seen": 91321120, "step": 4230, "time_per_iteration": 2.715655565261841 }, { "auxiliary_loss_clip": 0.01133364, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.0504818, "balance_loss_mlp": 1.03074503, "epoch": 0.2543814820381783, "flos": 18253178438400.0, "grad_norm": 1.8956341732473607, "language_loss": 0.7550717, "learning_rate": 3.4946948506713544e-06, "loss": 0.77688015, "num_input_tokens_seen": 91338575, "step": 4231, "time_per_iteration": 2.6945316791534424 }, { "auxiliary_loss_clip": 0.0113214, "auxiliary_loss_mlp": 0.01038979, "balance_loss_clip": 1.04939127, "balance_loss_mlp": 1.0230999, "epoch": 0.25444160529084625, "flos": 15632705629440.0, "grad_norm": 1.6179274617095247, "language_loss": 0.73618764, "learning_rate": 3.4944360509600416e-06, "loss": 0.75789881, "num_input_tokens_seen": 91357355, "step": 4232, "time_per_iteration": 2.6219112873077393 }, { "auxiliary_loss_clip": 0.01149145, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.05579972, "balance_loss_mlp": 1.02589035, "epoch": 0.2545017285435142, "flos": 24601610142720.0, "grad_norm": 2.2856831174377388, "language_loss": 0.86333203, "learning_rate": 3.4941771945795637e-06, "loss": 0.88525456, "num_input_tokens_seen": 91376515, "step": 4233, "time_per_iteration": 2.675877809524536 }, { "auxiliary_loss_clip": 0.01080108, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04641938, "balance_loss_mlp": 1.02457917, "epoch": 0.2545618517961822, "flos": 24679069822080.0, "grad_norm": 1.5382450997432586, "language_loss": 0.75319451, "learning_rate": 3.493918281539737e-06, "loss": 0.77439684, "num_input_tokens_seen": 91397595, "step": 4234, "time_per_iteration": 2.9050087928771973 }, { "auxiliary_loss_clip": 0.01117427, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05171227, "balance_loss_mlp": 1.02897787, "epoch": 0.25462197504885015, "flos": 23915106432000.0, "grad_norm": 2.6382014960101765, "language_loss": 0.74923635, "learning_rate": 3.493659311850379e-06, "loss": 0.77085495, "num_input_tokens_seen": 91417775, "step": 4235, "time_per_iteration": 2.788041353225708 }, { "auxiliary_loss_clip": 0.01124445, "auxiliary_loss_mlp": 0.00776537, "balance_loss_clip": 1.05315781, "balance_loss_mlp": 1.00115323, "epoch": 0.2546820983015181, "flos": 24789387467520.0, "grad_norm": 1.9882672691222136, "language_loss": 0.64451182, "learning_rate": 3.4934002855213106e-06, "loss": 0.66352159, "num_input_tokens_seen": 91437665, "step": 4236, "time_per_iteration": 2.8649141788482666 }, { "auxiliary_loss_clip": 0.01144465, "auxiliary_loss_mlp": 0.01036249, "balance_loss_clip": 1.05185175, "balance_loss_mlp": 1.02122915, "epoch": 0.2547422215541861, "flos": 18734130570240.0, "grad_norm": 1.6410229940010734, "language_loss": 0.6714325, "learning_rate": 3.493141202562354e-06, "loss": 0.69323969, "num_input_tokens_seen": 91456705, "step": 4237, "time_per_iteration": 4.262012958526611 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01049012, "balance_loss_clip": 1.05240059, "balance_loss_mlp": 1.03203678, "epoch": 0.25480234480685404, "flos": 21032449274880.0, "grad_norm": 2.0013967295828237, "language_loss": 0.75415373, "learning_rate": 3.492882062983333e-06, "loss": 0.77611452, "num_input_tokens_seen": 91475535, "step": 4238, "time_per_iteration": 2.6378636360168457 }, { "auxiliary_loss_clip": 0.01137265, "auxiliary_loss_mlp": 0.01046047, "balance_loss_clip": 1.05366278, "balance_loss_mlp": 1.02843964, "epoch": 0.254862468059522, "flos": 25082167224960.0, "grad_norm": 3.4417299363308613, "language_loss": 0.80712521, "learning_rate": 3.492622866794074e-06, "loss": 0.82895833, "num_input_tokens_seen": 91499140, "step": 4239, "time_per_iteration": 4.348390579223633 }, { "auxiliary_loss_clip": 0.01128023, "auxiliary_loss_mlp": 0.01045872, "balance_loss_clip": 1.0522213, "balance_loss_mlp": 1.02870631, "epoch": 0.25492259131219, "flos": 20558392554240.0, "grad_norm": 1.7312526359597522, "language_loss": 0.77521586, "learning_rate": 3.492363614004407e-06, "loss": 0.79695487, "num_input_tokens_seen": 91518335, "step": 4240, "time_per_iteration": 2.7501273155212402 }, { "auxiliary_loss_clip": 0.01151347, "auxiliary_loss_mlp": 0.01040734, "balance_loss_clip": 1.05296493, "balance_loss_mlp": 1.0226146, "epoch": 0.25498271456485794, "flos": 25042485674880.0, "grad_norm": 3.3593092651087595, "language_loss": 0.83430749, "learning_rate": 3.492104304624162e-06, "loss": 0.85622829, "num_input_tokens_seen": 91537655, "step": 4241, "time_per_iteration": 2.7480928897857666 }, { "auxiliary_loss_clip": 0.01137407, "auxiliary_loss_mlp": 0.01045384, "balance_loss_clip": 1.05306387, "balance_loss_mlp": 1.02887392, "epoch": 0.2550428378175259, "flos": 26178412354560.0, "grad_norm": 1.6379574895871623, "language_loss": 0.73322648, "learning_rate": 3.4918449386631725e-06, "loss": 0.75505441, "num_input_tokens_seen": 91557545, "step": 4242, "time_per_iteration": 2.713635206222534 }, { "auxiliary_loss_clip": 0.0114709, "auxiliary_loss_mlp": 0.00774169, "balance_loss_clip": 1.05182981, "balance_loss_mlp": 1.00115824, "epoch": 0.2551029610701939, "flos": 15267170874240.0, "grad_norm": 3.2486673035230993, "language_loss": 0.72336024, "learning_rate": 3.491585516131273e-06, "loss": 0.7425729, "num_input_tokens_seen": 91574405, "step": 4243, "time_per_iteration": 4.298815727233887 }, { "auxiliary_loss_clip": 0.0113532, "auxiliary_loss_mlp": 0.01045095, "balance_loss_clip": 1.05183125, "balance_loss_mlp": 1.02797616, "epoch": 0.2551630843228619, "flos": 18112193556480.0, "grad_norm": 1.8323151946393021, "language_loss": 0.82076979, "learning_rate": 3.491326037038301e-06, "loss": 0.842574, "num_input_tokens_seen": 91593755, "step": 4244, "time_per_iteration": 2.6497015953063965 }, { "auxiliary_loss_clip": 0.01054616, "auxiliary_loss_mlp": 0.01017916, "balance_loss_clip": 1.03294289, "balance_loss_mlp": 1.01572227, "epoch": 0.25522320757552985, "flos": 70520192167680.0, "grad_norm": 0.6914168393706984, "language_loss": 0.57701397, "learning_rate": 3.4910665013940967e-06, "loss": 0.59773928, "num_input_tokens_seen": 91660335, "step": 4245, "time_per_iteration": 3.2938833236694336 }, { "auxiliary_loss_clip": 0.01146552, "auxiliary_loss_mlp": 0.01052395, "balance_loss_clip": 1.0508852, "balance_loss_mlp": 1.03577745, "epoch": 0.2552833308281978, "flos": 22893088757760.0, "grad_norm": 2.1326330958670567, "language_loss": 0.65120399, "learning_rate": 3.4908069092085015e-06, "loss": 0.6731934, "num_input_tokens_seen": 91678500, "step": 4246, "time_per_iteration": 2.5949065685272217 }, { "auxiliary_loss_clip": 0.01127579, "auxiliary_loss_mlp": 0.01044633, "balance_loss_clip": 1.04806828, "balance_loss_mlp": 1.02944601, "epoch": 0.2553434540808658, "flos": 22053605022720.0, "grad_norm": 1.7151532201527704, "language_loss": 0.81580049, "learning_rate": 3.4905472604913585e-06, "loss": 0.83752257, "num_input_tokens_seen": 91696430, "step": 4247, "time_per_iteration": 2.673624277114868 }, { "auxiliary_loss_clip": 0.01140059, "auxiliary_loss_mlp": 0.01044068, "balance_loss_clip": 1.05152941, "balance_loss_mlp": 1.02543616, "epoch": 0.25540357733353375, "flos": 16544190176640.0, "grad_norm": 2.241724474505105, "language_loss": 0.83335149, "learning_rate": 3.490287555252514e-06, "loss": 0.85519278, "num_input_tokens_seen": 91713270, "step": 4248, "time_per_iteration": 2.617570400238037 }, { "auxiliary_loss_clip": 0.01112618, "auxiliary_loss_mlp": 0.01042154, "balance_loss_clip": 1.04433584, "balance_loss_mlp": 1.02458215, "epoch": 0.2554637005862017, "flos": 17565022702080.0, "grad_norm": 2.084670538042193, "language_loss": 0.84011936, "learning_rate": 3.4900277935018166e-06, "loss": 0.8616671, "num_input_tokens_seen": 91728865, "step": 4249, "time_per_iteration": 2.6617467403411865 }, { "auxiliary_loss_clip": 0.01001275, "auxiliary_loss_mlp": 0.01002657, "balance_loss_clip": 1.0228157, "balance_loss_mlp": 0.9996174, "epoch": 0.2555238238388697, "flos": 72244763953920.0, "grad_norm": 0.765792812565725, "language_loss": 0.56274796, "learning_rate": 3.489767975249115e-06, "loss": 0.58278728, "num_input_tokens_seen": 91787470, "step": 4250, "time_per_iteration": 3.24300479888916 }, { "auxiliary_loss_clip": 0.01117816, "auxiliary_loss_mlp": 0.01036136, "balance_loss_clip": 1.04929769, "balance_loss_mlp": 1.01839769, "epoch": 0.25558394709153764, "flos": 24389414547840.0, "grad_norm": 2.294460262471245, "language_loss": 0.80566651, "learning_rate": 3.4895081005042632e-06, "loss": 0.82720602, "num_input_tokens_seen": 91805640, "step": 4251, "time_per_iteration": 2.732752561569214 }, { "auxiliary_loss_clip": 0.01030367, "auxiliary_loss_mlp": 0.01001193, "balance_loss_clip": 1.02468216, "balance_loss_mlp": 0.99888068, "epoch": 0.2556440703442056, "flos": 69231213636480.0, "grad_norm": 0.7932625116211053, "language_loss": 0.6608988, "learning_rate": 3.4892481692771146e-06, "loss": 0.68121445, "num_input_tokens_seen": 91869695, "step": 4252, "time_per_iteration": 3.304985523223877 }, { "auxiliary_loss_clip": 0.01130428, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.0499115, "balance_loss_mlp": 1.02097619, "epoch": 0.2557041935968736, "flos": 24863902231680.0, "grad_norm": 2.60951363435401, "language_loss": 0.73882902, "learning_rate": 3.4889881815775267e-06, "loss": 0.76049387, "num_input_tokens_seen": 91889920, "step": 4253, "time_per_iteration": 2.706052303314209 }, { "auxiliary_loss_clip": 0.01097964, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.04340124, "balance_loss_mlp": 1.02782309, "epoch": 0.25576431684954154, "flos": 22492110257280.0, "grad_norm": 2.978807414856607, "language_loss": 0.72565317, "learning_rate": 3.488728137415357e-06, "loss": 0.7470758, "num_input_tokens_seen": 91908665, "step": 4254, "time_per_iteration": 2.7579715251922607 }, { "auxiliary_loss_clip": 0.01098791, "auxiliary_loss_mlp": 0.00774228, "balance_loss_clip": 1.04665136, "balance_loss_mlp": 1.001104, "epoch": 0.2558244401022095, "flos": 19826748426240.0, "grad_norm": 1.7240740787107458, "language_loss": 0.80729312, "learning_rate": 3.4884680368004675e-06, "loss": 0.82602334, "num_input_tokens_seen": 91927855, "step": 4255, "time_per_iteration": 2.788978099822998 }, { "auxiliary_loss_clip": 0.01124525, "auxiliary_loss_mlp": 0.01040748, "balance_loss_clip": 1.05111384, "balance_loss_mlp": 1.02414227, "epoch": 0.2558845633548775, "flos": 23220486247680.0, "grad_norm": 1.5275751549355678, "language_loss": 0.85734111, "learning_rate": 3.488207879742721e-06, "loss": 0.87899381, "num_input_tokens_seen": 91948500, "step": 4256, "time_per_iteration": 2.7916831970214844 }, { "auxiliary_loss_clip": 0.01102599, "auxiliary_loss_mlp": 0.01049743, "balance_loss_clip": 1.04525566, "balance_loss_mlp": 1.03164732, "epoch": 0.2559446866075455, "flos": 16837867774080.0, "grad_norm": 1.8301502951270987, "language_loss": 0.74872649, "learning_rate": 3.4879476662519826e-06, "loss": 0.77024996, "num_input_tokens_seen": 91968375, "step": 4257, "time_per_iteration": 2.7754952907562256 }, { "auxiliary_loss_clip": 0.0102418, "auxiliary_loss_mlp": 0.01011535, "balance_loss_clip": 1.03534186, "balance_loss_mlp": 1.00959146, "epoch": 0.25600480986021346, "flos": 57593786895360.0, "grad_norm": 0.8003890262370261, "language_loss": 0.65255105, "learning_rate": 3.4876873963381196e-06, "loss": 0.67290819, "num_input_tokens_seen": 92028490, "step": 4258, "time_per_iteration": 3.269063949584961 }, { "auxiliary_loss_clip": 0.01091736, "auxiliary_loss_mlp": 0.00773347, "balance_loss_clip": 1.04549718, "balance_loss_mlp": 1.00111449, "epoch": 0.2560649331128814, "flos": 27819529868160.0, "grad_norm": 1.5266978755669562, "language_loss": 0.76443565, "learning_rate": 3.4874270700110013e-06, "loss": 0.78308654, "num_input_tokens_seen": 92048060, "step": 4259, "time_per_iteration": 2.805574893951416 }, { "auxiliary_loss_clip": 0.01026212, "auxiliary_loss_mlp": 0.01016368, "balance_loss_clip": 1.02208054, "balance_loss_mlp": 1.01372147, "epoch": 0.2561250563655494, "flos": 70950509101440.0, "grad_norm": 0.7927643603688844, "language_loss": 0.58455491, "learning_rate": 3.4871666872804994e-06, "loss": 0.60498071, "num_input_tokens_seen": 92118180, "step": 4260, "time_per_iteration": 3.3904550075531006 }, { "auxiliary_loss_clip": 0.01133193, "auxiliary_loss_mlp": 0.01048996, "balance_loss_clip": 1.04874313, "balance_loss_mlp": 1.03204465, "epoch": 0.25618517961821735, "flos": 27012329481600.0, "grad_norm": 3.3188145253338543, "language_loss": 0.77064955, "learning_rate": 3.4869062481564875e-06, "loss": 0.79247141, "num_input_tokens_seen": 92137570, "step": 4261, "time_per_iteration": 2.769864082336426 }, { "auxiliary_loss_clip": 0.01144035, "auxiliary_loss_mlp": 0.01040091, "balance_loss_clip": 1.05178332, "balance_loss_mlp": 1.02465355, "epoch": 0.2562453028708853, "flos": 23068296322560.0, "grad_norm": 1.5699122250769224, "language_loss": 0.83367205, "learning_rate": 3.486645752648842e-06, "loss": 0.85551333, "num_input_tokens_seen": 92157625, "step": 4262, "time_per_iteration": 2.682828426361084 }, { "auxiliary_loss_clip": 0.01134556, "auxiliary_loss_mlp": 0.01041299, "balance_loss_clip": 1.05219626, "balance_loss_mlp": 1.02344143, "epoch": 0.2563054261235533, "flos": 15120942606720.0, "grad_norm": 2.340862226505914, "language_loss": 0.73892939, "learning_rate": 3.4863852007674405e-06, "loss": 0.76068795, "num_input_tokens_seen": 92175350, "step": 4263, "time_per_iteration": 2.70947003364563 }, { "auxiliary_loss_clip": 0.0111297, "auxiliary_loss_mlp": 0.00773371, "balance_loss_clip": 1.05221081, "balance_loss_mlp": 1.00093555, "epoch": 0.25636554937622125, "flos": 27854865872640.0, "grad_norm": 1.8143922917988324, "language_loss": 0.82766259, "learning_rate": 3.486124592522163e-06, "loss": 0.84652603, "num_input_tokens_seen": 92196070, "step": 4264, "time_per_iteration": 2.7249553203582764 }, { "auxiliary_loss_clip": 0.01133012, "auxiliary_loss_mlp": 0.01041877, "balance_loss_clip": 1.05265546, "balance_loss_mlp": 1.02468669, "epoch": 0.2564256726288892, "flos": 28906509288960.0, "grad_norm": 2.8986425954305206, "language_loss": 0.74346334, "learning_rate": 3.4858639279228924e-06, "loss": 0.76521224, "num_input_tokens_seen": 92216310, "step": 4265, "time_per_iteration": 2.7149150371551514 }, { "auxiliary_loss_clip": 0.01110152, "auxiliary_loss_mlp": 0.01036531, "balance_loss_clip": 1.04754925, "balance_loss_mlp": 1.02034247, "epoch": 0.2564857958815572, "flos": 18514931823360.0, "grad_norm": 15.50909821859273, "language_loss": 0.81623137, "learning_rate": 3.485603206979513e-06, "loss": 0.83769822, "num_input_tokens_seen": 92234510, "step": 4266, "time_per_iteration": 2.6890153884887695 }, { "auxiliary_loss_clip": 0.01083702, "auxiliary_loss_mlp": 0.01050109, "balance_loss_clip": 1.0468955, "balance_loss_mlp": 1.0318346, "epoch": 0.25654591913422514, "flos": 25808280658560.0, "grad_norm": 2.4522850064786037, "language_loss": 0.79120672, "learning_rate": 3.4853424297019103e-06, "loss": 0.81254482, "num_input_tokens_seen": 92254070, "step": 4267, "time_per_iteration": 2.8390700817108154 }, { "auxiliary_loss_clip": 0.01094597, "auxiliary_loss_mlp": 0.01044608, "balance_loss_clip": 1.04643822, "balance_loss_mlp": 1.0276804, "epoch": 0.2566060423868931, "flos": 19099665325440.0, "grad_norm": 1.6765306902124857, "language_loss": 0.79241312, "learning_rate": 3.4850815960999736e-06, "loss": 0.81380516, "num_input_tokens_seen": 92275060, "step": 4268, "time_per_iteration": 2.7324178218841553 }, { "auxiliary_loss_clip": 0.01106667, "auxiliary_loss_mlp": 0.00778662, "balance_loss_clip": 1.04940808, "balance_loss_mlp": 1.00098729, "epoch": 0.25666616563956113, "flos": 23842674656640.0, "grad_norm": 1.8248642507450341, "language_loss": 0.67737979, "learning_rate": 3.484820706183595e-06, "loss": 0.69623303, "num_input_tokens_seen": 92293610, "step": 4269, "time_per_iteration": 2.7897677421569824 }, { "auxiliary_loss_clip": 0.01123993, "auxiliary_loss_mlp": 0.01043408, "balance_loss_clip": 1.05155373, "balance_loss_mlp": 1.02596736, "epoch": 0.2567262888922291, "flos": 14604259420800.0, "grad_norm": 3.069203267679029, "language_loss": 0.79117787, "learning_rate": 3.484559759962666e-06, "loss": 0.81285185, "num_input_tokens_seen": 92308305, "step": 4270, "time_per_iteration": 2.8076114654541016 }, { "auxiliary_loss_clip": 0.01094814, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.04357839, "balance_loss_mlp": 1.02393079, "epoch": 0.25678641214489706, "flos": 32923117877760.0, "grad_norm": 2.413207422396751, "language_loss": 0.68088073, "learning_rate": 3.4842987574470816e-06, "loss": 0.7022649, "num_input_tokens_seen": 92329875, "step": 4271, "time_per_iteration": 2.8195667266845703 }, { "auxiliary_loss_clip": 0.01136281, "auxiliary_loss_mlp": 0.00774788, "balance_loss_clip": 1.05146289, "balance_loss_mlp": 1.00110972, "epoch": 0.256846535397565, "flos": 24098933260800.0, "grad_norm": 3.3671515903121216, "language_loss": 0.87362605, "learning_rate": 3.4840376986467403e-06, "loss": 0.89273679, "num_input_tokens_seen": 92348780, "step": 4272, "time_per_iteration": 2.6910364627838135 }, { "auxiliary_loss_clip": 0.01122968, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.05348301, "balance_loss_mlp": 1.02854192, "epoch": 0.256906658650233, "flos": 19718441942400.0, "grad_norm": 1.6813472119330561, "language_loss": 0.81420678, "learning_rate": 3.483776583571541e-06, "loss": 0.83589977, "num_input_tokens_seen": 92368175, "step": 4273, "time_per_iteration": 2.6883673667907715 }, { "auxiliary_loss_clip": 0.01097944, "auxiliary_loss_mlp": 0.01041741, "balance_loss_clip": 1.043715, "balance_loss_mlp": 1.02459884, "epoch": 0.25696678190290095, "flos": 22926018551040.0, "grad_norm": 3.3251008044769947, "language_loss": 0.76944637, "learning_rate": 3.4835154122313846e-06, "loss": 0.79084325, "num_input_tokens_seen": 92387755, "step": 4274, "time_per_iteration": 2.7613401412963867 }, { "auxiliary_loss_clip": 0.01112797, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04380774, "balance_loss_mlp": 1.02220166, "epoch": 0.2570269051555689, "flos": 27307838672640.0, "grad_norm": 2.1172072427968933, "language_loss": 0.83780324, "learning_rate": 3.4832541846361743e-06, "loss": 0.85932392, "num_input_tokens_seen": 92409850, "step": 4275, "time_per_iteration": 2.7835779190063477 }, { "auxiliary_loss_clip": 0.01120289, "auxiliary_loss_mlp": 0.01039714, "balance_loss_clip": 1.05141211, "balance_loss_mlp": 1.02223814, "epoch": 0.2570870284082369, "flos": 27563414918400.0, "grad_norm": 2.725989678545036, "language_loss": 0.7874397, "learning_rate": 3.4829929007958175e-06, "loss": 0.80903983, "num_input_tokens_seen": 92431250, "step": 4276, "time_per_iteration": 5.679298400878906 }, { "auxiliary_loss_clip": 0.01136261, "auxiliary_loss_mlp": 0.01046327, "balance_loss_clip": 1.05269814, "balance_loss_mlp": 1.02982879, "epoch": 0.25714715166090485, "flos": 28730834847360.0, "grad_norm": 4.7083902318823885, "language_loss": 0.79273927, "learning_rate": 3.4827315607202214e-06, "loss": 0.81456512, "num_input_tokens_seen": 92452065, "step": 4277, "time_per_iteration": 2.691035270690918 }, { "auxiliary_loss_clip": 0.01146238, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.05214763, "balance_loss_mlp": 1.02367437, "epoch": 0.2572072749135728, "flos": 20116152305280.0, "grad_norm": 2.017980063834791, "language_loss": 0.78986102, "learning_rate": 3.482470164419295e-06, "loss": 0.81172454, "num_input_tokens_seen": 92470025, "step": 4278, "time_per_iteration": 4.2404680252075195 }, { "auxiliary_loss_clip": 0.01126121, "auxiliary_loss_mlp": 0.01037901, "balance_loss_clip": 1.05176449, "balance_loss_mlp": 1.02102113, "epoch": 0.2572673981662408, "flos": 26030855283840.0, "grad_norm": 2.8070462448385904, "language_loss": 0.74898899, "learning_rate": 3.482208711902952e-06, "loss": 0.77062923, "num_input_tokens_seen": 92489825, "step": 4279, "time_per_iteration": 2.65977144241333 }, { "auxiliary_loss_clip": 0.01134686, "auxiliary_loss_mlp": 0.01051687, "balance_loss_clip": 1.04973292, "balance_loss_mlp": 1.03423464, "epoch": 0.25732752141890874, "flos": 16106618695680.0, "grad_norm": 2.4256697448035687, "language_loss": 0.85603923, "learning_rate": 3.4819472031811065e-06, "loss": 0.87790298, "num_input_tokens_seen": 92507270, "step": 4280, "time_per_iteration": 2.6072864532470703 }, { "auxiliary_loss_clip": 0.01136623, "auxiliary_loss_mlp": 0.01039056, "balance_loss_clip": 1.05183434, "balance_loss_mlp": 1.02147269, "epoch": 0.2573876446715767, "flos": 22524429519360.0, "grad_norm": 3.9579835716917695, "language_loss": 0.79381943, "learning_rate": 3.4816856382636744e-06, "loss": 0.8155762, "num_input_tokens_seen": 92526300, "step": 4281, "time_per_iteration": 2.613163471221924 }, { "auxiliary_loss_clip": 0.01110196, "auxiliary_loss_mlp": 0.01038018, "balance_loss_clip": 1.04847932, "balance_loss_mlp": 1.02099478, "epoch": 0.2574477679242447, "flos": 23950837486080.0, "grad_norm": 2.240063499401578, "language_loss": 0.87314785, "learning_rate": 3.4814240171605737e-06, "loss": 0.89462996, "num_input_tokens_seen": 92546465, "step": 4282, "time_per_iteration": 4.489396333694458 }, { "auxiliary_loss_clip": 0.01148783, "auxiliary_loss_mlp": 0.01046594, "balance_loss_clip": 1.0526619, "balance_loss_mlp": 1.02959502, "epoch": 0.2575078911769127, "flos": 21981711951360.0, "grad_norm": 1.5167715532309152, "language_loss": 0.70110047, "learning_rate": 3.4811623398817267e-06, "loss": 0.72305429, "num_input_tokens_seen": 92567260, "step": 4283, "time_per_iteration": 2.619131565093994 }, { "auxiliary_loss_clip": 0.01144466, "auxiliary_loss_mlp": 0.00774605, "balance_loss_clip": 1.05443883, "balance_loss_mlp": 1.0010494, "epoch": 0.25756801442958066, "flos": 21945406279680.0, "grad_norm": 1.950947388276708, "language_loss": 0.80411774, "learning_rate": 3.4809006064370553e-06, "loss": 0.82330847, "num_input_tokens_seen": 92585425, "step": 4284, "time_per_iteration": 2.656998634338379 }, { "auxiliary_loss_clip": 0.01105473, "auxiliary_loss_mlp": 0.01039993, "balance_loss_clip": 1.05797076, "balance_loss_mlp": 1.02488899, "epoch": 0.2576281376822486, "flos": 35261980058880.0, "grad_norm": 2.2559612506718434, "language_loss": 0.70473522, "learning_rate": 3.4806388168364835e-06, "loss": 0.72618985, "num_input_tokens_seen": 92604770, "step": 4285, "time_per_iteration": 2.880835771560669 }, { "auxiliary_loss_clip": 0.01127807, "auxiliary_loss_mlp": 0.0104515, "balance_loss_clip": 1.05229783, "balance_loss_mlp": 1.02971268, "epoch": 0.2576882609349166, "flos": 14132285688960.0, "grad_norm": 1.8739093647405893, "language_loss": 0.58494061, "learning_rate": 3.4803769710899402e-06, "loss": 0.6066702, "num_input_tokens_seen": 92622635, "step": 4286, "time_per_iteration": 2.63923978805542 }, { "auxiliary_loss_clip": 0.01138174, "auxiliary_loss_mlp": 0.01046794, "balance_loss_clip": 1.05271184, "balance_loss_mlp": 1.03020048, "epoch": 0.25774838418758456, "flos": 23258336204160.0, "grad_norm": 1.4732857929087761, "language_loss": 0.63687879, "learning_rate": 3.480115069207354e-06, "loss": 0.65872842, "num_input_tokens_seen": 92642960, "step": 4287, "time_per_iteration": 2.67764949798584 }, { "auxiliary_loss_clip": 0.01127889, "auxiliary_loss_mlp": 0.01045385, "balance_loss_clip": 1.05252934, "balance_loss_mlp": 1.02769411, "epoch": 0.2578085074402525, "flos": 22601745544320.0, "grad_norm": 2.134546441867425, "language_loss": 0.71780413, "learning_rate": 3.4798531111986557e-06, "loss": 0.73953688, "num_input_tokens_seen": 92662455, "step": 4288, "time_per_iteration": 2.7174036502838135 }, { "auxiliary_loss_clip": 0.0110996, "auxiliary_loss_mlp": 0.01042748, "balance_loss_clip": 1.04934072, "balance_loss_mlp": 1.02691674, "epoch": 0.2578686306929205, "flos": 24571840746240.0, "grad_norm": 1.4449800602700236, "language_loss": 0.77059102, "learning_rate": 3.4795910970737786e-06, "loss": 0.79211813, "num_input_tokens_seen": 92683520, "step": 4289, "time_per_iteration": 2.748249053955078 }, { "auxiliary_loss_clip": 0.01146276, "auxiliary_loss_mlp": 0.00775089, "balance_loss_clip": 1.05252326, "balance_loss_mlp": 1.001122, "epoch": 0.25792875394558845, "flos": 18113953322880.0, "grad_norm": 2.0235699584636295, "language_loss": 0.85416883, "learning_rate": 3.4793290268426592e-06, "loss": 0.87338245, "num_input_tokens_seen": 92701450, "step": 4290, "time_per_iteration": 2.593461751937866 }, { "auxiliary_loss_clip": 0.01114221, "auxiliary_loss_mlp": 0.01056837, "balance_loss_clip": 1.05081999, "balance_loss_mlp": 1.03660691, "epoch": 0.2579888771982564, "flos": 17712902995200.0, "grad_norm": 2.4272093439618847, "language_loss": 0.72360331, "learning_rate": 3.4790669005152354e-06, "loss": 0.74531388, "num_input_tokens_seen": 92720355, "step": 4291, "time_per_iteration": 2.6838138103485107 }, { "auxiliary_loss_clip": 0.01150945, "auxiliary_loss_mlp": 0.0104494, "balance_loss_clip": 1.05378067, "balance_loss_mlp": 1.02758288, "epoch": 0.2580490004509244, "flos": 16434878112000.0, "grad_norm": 2.78045823134535, "language_loss": 0.80846477, "learning_rate": 3.4788047181014458e-06, "loss": 0.83042365, "num_input_tokens_seen": 92736755, "step": 4292, "time_per_iteration": 2.595710277557373 }, { "auxiliary_loss_clip": 0.0115367, "auxiliary_loss_mlp": 0.01044878, "balance_loss_clip": 1.05773902, "balance_loss_mlp": 1.02702022, "epoch": 0.25810912370359235, "flos": 33835141128960.0, "grad_norm": 2.057533015633898, "language_loss": 0.67592025, "learning_rate": 3.4785424796112337e-06, "loss": 0.69790578, "num_input_tokens_seen": 92757655, "step": 4293, "time_per_iteration": 2.699570894241333 }, { "auxiliary_loss_clip": 0.0110485, "auxiliary_loss_mlp": 0.01048043, "balance_loss_clip": 1.04971898, "balance_loss_mlp": 1.03190207, "epoch": 0.2581692469562603, "flos": 25192197561600.0, "grad_norm": 2.0097854631835217, "language_loss": 0.75671911, "learning_rate": 3.478280185054542e-06, "loss": 0.77824801, "num_input_tokens_seen": 92776100, "step": 4294, "time_per_iteration": 2.7217960357666016 }, { "auxiliary_loss_clip": 0.01098332, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.0444684, "balance_loss_mlp": 1.03404188, "epoch": 0.2582293702089283, "flos": 34932212271360.0, "grad_norm": 1.7798433628760433, "language_loss": 0.8047998, "learning_rate": 3.478017834441318e-06, "loss": 0.82631868, "num_input_tokens_seen": 92798880, "step": 4295, "time_per_iteration": 2.871460437774658 }, { "auxiliary_loss_clip": 0.01055358, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.04843688, "balance_loss_mlp": 1.0256989, "epoch": 0.2582894934615963, "flos": 26833746038400.0, "grad_norm": 2.1012913939780753, "language_loss": 0.72843397, "learning_rate": 3.4777554277815096e-06, "loss": 0.74942946, "num_input_tokens_seen": 92817750, "step": 4296, "time_per_iteration": 3.173367738723755 }, { "auxiliary_loss_clip": 0.01091622, "auxiliary_loss_mlp": 0.01038465, "balance_loss_clip": 1.05392241, "balance_loss_mlp": 1.02106011, "epoch": 0.25834961671426426, "flos": 23515241253120.0, "grad_norm": 1.5772062283828172, "language_loss": 0.86928564, "learning_rate": 3.477492965085067e-06, "loss": 0.8905865, "num_input_tokens_seen": 92837995, "step": 4297, "time_per_iteration": 3.1598868370056152 }, { "auxiliary_loss_clip": 0.01149748, "auxiliary_loss_mlp": 0.01047412, "balance_loss_clip": 1.05517435, "balance_loss_mlp": 1.03090191, "epoch": 0.25840973996693223, "flos": 22451028076800.0, "grad_norm": 1.8030727150796175, "language_loss": 0.84720427, "learning_rate": 3.477230446361943e-06, "loss": 0.86917591, "num_input_tokens_seen": 92857245, "step": 4298, "time_per_iteration": 2.632448196411133 }, { "auxiliary_loss_clip": 0.01135108, "auxiliary_loss_mlp": 0.00775458, "balance_loss_clip": 1.05262494, "balance_loss_mlp": 1.00111055, "epoch": 0.2584698632196002, "flos": 11290854366720.0, "grad_norm": 2.0124667048247686, "language_loss": 0.83514953, "learning_rate": 3.4769678716220927e-06, "loss": 0.8542552, "num_input_tokens_seen": 92873265, "step": 4299, "time_per_iteration": 2.631248950958252 }, { "auxiliary_loss_clip": 0.01117485, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.05216849, "balance_loss_mlp": 1.01868308, "epoch": 0.25852998647226816, "flos": 17929982839680.0, "grad_norm": 2.419754138344463, "language_loss": 0.82422709, "learning_rate": 3.4767052408754726e-06, "loss": 0.84575242, "num_input_tokens_seen": 92890880, "step": 4300, "time_per_iteration": 2.650834083557129 }, { "auxiliary_loss_clip": 0.0113846, "auxiliary_loss_mlp": 0.01041208, "balance_loss_clip": 1.0535903, "balance_loss_mlp": 1.02343392, "epoch": 0.2585901097249361, "flos": 33256117889280.0, "grad_norm": 2.971673559214411, "language_loss": 0.66949177, "learning_rate": 3.4764425541320417e-06, "loss": 0.69128841, "num_input_tokens_seen": 92910770, "step": 4301, "time_per_iteration": 2.729519844055176 }, { "auxiliary_loss_clip": 0.01139778, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.05335701, "balance_loss_mlp": 1.02245533, "epoch": 0.2586502329776041, "flos": 18441278985600.0, "grad_norm": 2.29820997177689, "language_loss": 0.81177735, "learning_rate": 3.4761798114017617e-06, "loss": 0.83357668, "num_input_tokens_seen": 92929520, "step": 4302, "time_per_iteration": 2.5496692657470703 }, { "auxiliary_loss_clip": 0.01105433, "auxiliary_loss_mlp": 0.01042423, "balance_loss_clip": 1.05242491, "balance_loss_mlp": 1.02542388, "epoch": 0.25871035623027205, "flos": 17968120104960.0, "grad_norm": 1.8036447001063776, "language_loss": 0.92147923, "learning_rate": 3.475917012694595e-06, "loss": 0.94295776, "num_input_tokens_seen": 92947890, "step": 4303, "time_per_iteration": 2.686222791671753 }, { "auxiliary_loss_clip": 0.01141887, "auxiliary_loss_mlp": 0.01040139, "balance_loss_clip": 1.05643094, "balance_loss_mlp": 1.02322304, "epoch": 0.25877047948294, "flos": 27777729415680.0, "grad_norm": 2.7085759571044368, "language_loss": 0.67138135, "learning_rate": 3.475654158020507e-06, "loss": 0.69320166, "num_input_tokens_seen": 92967690, "step": 4304, "time_per_iteration": 2.665797472000122 }, { "auxiliary_loss_clip": 0.01113882, "auxiliary_loss_mlp": 0.01041979, "balance_loss_clip": 1.0509342, "balance_loss_mlp": 1.02498007, "epoch": 0.258830602735608, "flos": 27125843437440.0, "grad_norm": 2.126938769919949, "language_loss": 0.72085559, "learning_rate": 3.4753912473894657e-06, "loss": 0.74241412, "num_input_tokens_seen": 92986830, "step": 4305, "time_per_iteration": 2.7514076232910156 }, { "auxiliary_loss_clip": 0.01103045, "auxiliary_loss_mlp": 0.00775987, "balance_loss_clip": 1.04804707, "balance_loss_mlp": 1.00122118, "epoch": 0.25889072598827595, "flos": 17891486438400.0, "grad_norm": 6.414506312387852, "language_loss": 0.76175749, "learning_rate": 3.4751282808114403e-06, "loss": 0.78054774, "num_input_tokens_seen": 93002740, "step": 4306, "time_per_iteration": 2.7326161861419678 }, { "auxiliary_loss_clip": 0.01049461, "auxiliary_loss_mlp": 0.0102188, "balance_loss_clip": 1.03476799, "balance_loss_mlp": 1.01943636, "epoch": 0.2589508492409439, "flos": 53934955724160.0, "grad_norm": 0.8427062291747792, "language_loss": 0.57128024, "learning_rate": 3.474865258296403e-06, "loss": 0.59199357, "num_input_tokens_seen": 93058645, "step": 4307, "time_per_iteration": 3.1499595642089844 }, { "auxiliary_loss_clip": 0.01123356, "auxiliary_loss_mlp": 0.01045032, "balance_loss_clip": 1.0514828, "balance_loss_mlp": 1.02858078, "epoch": 0.2590109724936119, "flos": 22125785402880.0, "grad_norm": 1.5299746109283647, "language_loss": 0.71727359, "learning_rate": 3.474602179854327e-06, "loss": 0.73895752, "num_input_tokens_seen": 93077140, "step": 4308, "time_per_iteration": 2.6824283599853516 }, { "auxiliary_loss_clip": 0.01152705, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05659723, "balance_loss_mlp": 1.02976048, "epoch": 0.2590710957462799, "flos": 13474294398720.0, "grad_norm": 1.8339599204524273, "language_loss": 0.83940542, "learning_rate": 3.4743390454951886e-06, "loss": 0.86140084, "num_input_tokens_seen": 93093580, "step": 4309, "time_per_iteration": 2.560194253921509 }, { "auxiliary_loss_clip": 0.01137306, "auxiliary_loss_mlp": 0.01044025, "balance_loss_clip": 1.05587196, "balance_loss_mlp": 1.02815771, "epoch": 0.25913121899894787, "flos": 22307098279680.0, "grad_norm": 1.5397823214091813, "language_loss": 0.84657532, "learning_rate": 3.474075855228966e-06, "loss": 0.86838865, "num_input_tokens_seen": 93112345, "step": 4310, "time_per_iteration": 2.627716064453125 }, { "auxiliary_loss_clip": 0.01143598, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.05802059, "balance_loss_mlp": 1.03141904, "epoch": 0.25919134225161583, "flos": 25811728364160.0, "grad_norm": 2.0190220849922094, "language_loss": 0.77145267, "learning_rate": 3.473812609065639e-06, "loss": 0.79336536, "num_input_tokens_seen": 93131545, "step": 4311, "time_per_iteration": 2.694856643676758 }, { "auxiliary_loss_clip": 0.01110239, "auxiliary_loss_mlp": 0.01052381, "balance_loss_clip": 1.04629123, "balance_loss_mlp": 1.03498793, "epoch": 0.2592514655042838, "flos": 31212262108800.0, "grad_norm": 1.9233367952735905, "language_loss": 0.72848439, "learning_rate": 3.4735493070151904e-06, "loss": 0.75011057, "num_input_tokens_seen": 93150730, "step": 4312, "time_per_iteration": 2.7577714920043945 }, { "auxiliary_loss_clip": 0.01150768, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.05618715, "balance_loss_mlp": 1.02845287, "epoch": 0.25931158875695176, "flos": 18474998878080.0, "grad_norm": 1.8485738044524733, "language_loss": 0.70193493, "learning_rate": 3.4732859490876044e-06, "loss": 0.72388697, "num_input_tokens_seen": 93167895, "step": 4313, "time_per_iteration": 2.6447813510894775 }, { "auxiliary_loss_clip": 0.01150117, "auxiliary_loss_mlp": 0.01054192, "balance_loss_clip": 1.05624926, "balance_loss_mlp": 1.03845656, "epoch": 0.2593717120096197, "flos": 19207935895680.0, "grad_norm": 1.8538125013537565, "language_loss": 0.80462205, "learning_rate": 3.473022535292867e-06, "loss": 0.82666522, "num_input_tokens_seen": 93187650, "step": 4314, "time_per_iteration": 2.6073296070098877 }, { "auxiliary_loss_clip": 0.01110006, "auxiliary_loss_mlp": 0.01049511, "balance_loss_clip": 1.04867387, "balance_loss_mlp": 1.03253555, "epoch": 0.2594318352622877, "flos": 31248100903680.0, "grad_norm": 2.061113629574459, "language_loss": 0.670748, "learning_rate": 3.472759065640968e-06, "loss": 0.69234318, "num_input_tokens_seen": 93207370, "step": 4315, "time_per_iteration": 6.427948236465454 }, { "auxiliary_loss_clip": 0.01096074, "auxiliary_loss_mlp": 0.01056601, "balance_loss_clip": 1.04853845, "balance_loss_mlp": 1.0407939, "epoch": 0.25949195851495566, "flos": 22237144542720.0, "grad_norm": 2.0096953575355125, "language_loss": 0.79649067, "learning_rate": 3.4724955401418976e-06, "loss": 0.81801736, "num_input_tokens_seen": 93227925, "step": 4316, "time_per_iteration": 2.7463796138763428 }, { "auxiliary_loss_clip": 0.01096584, "auxiliary_loss_mlp": 0.01048328, "balance_loss_clip": 1.0487628, "balance_loss_mlp": 1.03112638, "epoch": 0.2595520817676236, "flos": 28075716645120.0, "grad_norm": 3.2727308584132584, "language_loss": 0.77498394, "learning_rate": 3.4722319588056487e-06, "loss": 0.79643309, "num_input_tokens_seen": 93250020, "step": 4317, "time_per_iteration": 4.658867359161377 }, { "auxiliary_loss_clip": 0.01155612, "auxiliary_loss_mlp": 0.01054128, "balance_loss_clip": 1.05959845, "balance_loss_mlp": 1.03734958, "epoch": 0.2596122050202916, "flos": 20190954378240.0, "grad_norm": 2.117435309152476, "language_loss": 0.77656054, "learning_rate": 3.4719683216422163e-06, "loss": 0.79865795, "num_input_tokens_seen": 93269070, "step": 4318, "time_per_iteration": 2.5934906005859375 }, { "auxiliary_loss_clip": 0.01146449, "auxiliary_loss_mlp": 0.01045441, "balance_loss_clip": 1.0530901, "balance_loss_mlp": 1.02733302, "epoch": 0.25967232827295955, "flos": 22527949052160.0, "grad_norm": 1.6144223240331488, "language_loss": 0.76362926, "learning_rate": 3.471704628661598e-06, "loss": 0.78554815, "num_input_tokens_seen": 93290250, "step": 4319, "time_per_iteration": 2.607649564743042 }, { "auxiliary_loss_clip": 0.01125042, "auxiliary_loss_mlp": 0.01041624, "balance_loss_clip": 1.05419481, "balance_loss_mlp": 1.02587628, "epoch": 0.2597324515256275, "flos": 21068252156160.0, "grad_norm": 1.6090277746740278, "language_loss": 0.76549125, "learning_rate": 3.4714408798737925e-06, "loss": 0.78715789, "num_input_tokens_seen": 93310090, "step": 4320, "time_per_iteration": 2.722574472427368 }, { "auxiliary_loss_clip": 0.01116281, "auxiliary_loss_mlp": 0.01042709, "balance_loss_clip": 1.05157554, "balance_loss_mlp": 1.02546, "epoch": 0.2597925747782955, "flos": 22050013662720.0, "grad_norm": 1.6564648175426406, "language_loss": 0.71067965, "learning_rate": 3.471177075288801e-06, "loss": 0.73226953, "num_input_tokens_seen": 93329570, "step": 4321, "time_per_iteration": 4.276093244552612 }, { "auxiliary_loss_clip": 0.01125031, "auxiliary_loss_mlp": 0.01055033, "balance_loss_clip": 1.05191207, "balance_loss_mlp": 1.03549457, "epoch": 0.2598526980309635, "flos": 19536949497600.0, "grad_norm": 1.9031382952841078, "language_loss": 0.74805915, "learning_rate": 3.4709132149166277e-06, "loss": 0.76985979, "num_input_tokens_seen": 93347920, "step": 4322, "time_per_iteration": 2.6573097705841064 }, { "auxiliary_loss_clip": 0.0111558, "auxiliary_loss_mlp": 0.0104757, "balance_loss_clip": 1.05213332, "balance_loss_mlp": 1.03004622, "epoch": 0.25991282128363147, "flos": 24495207079680.0, "grad_norm": 1.8978708709823064, "language_loss": 0.73837054, "learning_rate": 3.470649298767278e-06, "loss": 0.76000202, "num_input_tokens_seen": 93367145, "step": 4323, "time_per_iteration": 2.75765061378479 }, { "auxiliary_loss_clip": 0.01139686, "auxiliary_loss_mlp": 0.00775622, "balance_loss_clip": 1.0509938, "balance_loss_mlp": 1.00099182, "epoch": 0.25997294453629943, "flos": 24201457655040.0, "grad_norm": 2.107506603705316, "language_loss": 0.67186093, "learning_rate": 3.4703853268507597e-06, "loss": 0.69101399, "num_input_tokens_seen": 93386555, "step": 4324, "time_per_iteration": 2.752307891845703 }, { "auxiliary_loss_clip": 0.0109649, "auxiliary_loss_mlp": 0.01045367, "balance_loss_clip": 1.05030632, "balance_loss_mlp": 1.03026319, "epoch": 0.2600330677889674, "flos": 31431460855680.0, "grad_norm": 2.121769328280442, "language_loss": 0.71064055, "learning_rate": 3.470121299177082e-06, "loss": 0.732059, "num_input_tokens_seen": 93405590, "step": 4325, "time_per_iteration": 2.824281692504883 }, { "auxiliary_loss_clip": 0.01134613, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.04941416, "balance_loss_mlp": 1.01839304, "epoch": 0.26009319104163536, "flos": 32266527217920.0, "grad_norm": 1.8496839878379767, "language_loss": 0.73106551, "learning_rate": 3.469857215756257e-06, "loss": 0.75276732, "num_input_tokens_seen": 93424750, "step": 4326, "time_per_iteration": 2.7235658168792725 }, { "auxiliary_loss_clip": 0.01118123, "auxiliary_loss_mlp": 0.00776184, "balance_loss_clip": 1.05001175, "balance_loss_mlp": 1.00100303, "epoch": 0.26015331429430333, "flos": 26286754752000.0, "grad_norm": 1.7229255626307804, "language_loss": 0.86908734, "learning_rate": 3.4695930765982997e-06, "loss": 0.88803041, "num_input_tokens_seen": 93443465, "step": 4327, "time_per_iteration": 2.7072155475616455 }, { "auxiliary_loss_clip": 0.01153995, "auxiliary_loss_mlp": 0.00775932, "balance_loss_clip": 1.05640841, "balance_loss_mlp": 1.0008533, "epoch": 0.2602134375469713, "flos": 21142335957120.0, "grad_norm": 1.4664721830580452, "language_loss": 0.80265766, "learning_rate": 3.4693288817132255e-06, "loss": 0.82195687, "num_input_tokens_seen": 93462580, "step": 4328, "time_per_iteration": 2.6463024616241455 }, { "auxiliary_loss_clip": 0.0111992, "auxiliary_loss_mlp": 0.00774533, "balance_loss_clip": 1.04837036, "balance_loss_mlp": 1.00092077, "epoch": 0.26027356079963926, "flos": 25921327737600.0, "grad_norm": 1.6317826670237516, "language_loss": 0.88094193, "learning_rate": 3.4690646311110525e-06, "loss": 0.89988649, "num_input_tokens_seen": 93482790, "step": 4329, "time_per_iteration": 2.7130861282348633 }, { "auxiliary_loss_clip": 0.011478, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.05545115, "balance_loss_mlp": 1.02431321, "epoch": 0.2603336840523072, "flos": 26359222440960.0, "grad_norm": 1.8335620949826397, "language_loss": 0.77834195, "learning_rate": 3.468800324801802e-06, "loss": 0.80022621, "num_input_tokens_seen": 93498795, "step": 4330, "time_per_iteration": 2.6223180294036865 }, { "auxiliary_loss_clip": 0.01148961, "auxiliary_loss_mlp": 0.01047898, "balance_loss_clip": 1.0536809, "balance_loss_mlp": 1.03081572, "epoch": 0.2603938073049752, "flos": 23513661054720.0, "grad_norm": 1.5875829464999673, "language_loss": 0.75683081, "learning_rate": 3.4685359627954958e-06, "loss": 0.77879941, "num_input_tokens_seen": 93518335, "step": 4331, "time_per_iteration": 2.6383559703826904 }, { "auxiliary_loss_clip": 0.01130325, "auxiliary_loss_mlp": 0.01042577, "balance_loss_clip": 1.05964541, "balance_loss_mlp": 1.0261023, "epoch": 0.26045393055764315, "flos": 25374300537600.0, "grad_norm": 1.3798785286413686, "language_loss": 0.69174874, "learning_rate": 3.4682715451021584e-06, "loss": 0.71347773, "num_input_tokens_seen": 93539170, "step": 4332, "time_per_iteration": 2.675203800201416 }, { "auxiliary_loss_clip": 0.01117119, "auxiliary_loss_mlp": 0.01048864, "balance_loss_clip": 1.04849494, "balance_loss_mlp": 1.03203201, "epoch": 0.2605140538103111, "flos": 27635272076160.0, "grad_norm": 6.1371153370044915, "language_loss": 0.79897749, "learning_rate": 3.4680070717318174e-06, "loss": 0.82063735, "num_input_tokens_seen": 93558480, "step": 4333, "time_per_iteration": 2.7595479488372803 }, { "auxiliary_loss_clip": 0.01144159, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.05260658, "balance_loss_mlp": 1.02317452, "epoch": 0.2605741770629791, "flos": 13769839503360.0, "grad_norm": 1.9478362516602954, "language_loss": 0.80919975, "learning_rate": 3.467742542694501e-06, "loss": 0.83103544, "num_input_tokens_seen": 93575220, "step": 4334, "time_per_iteration": 2.585676670074463 }, { "auxiliary_loss_clip": 0.01121127, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.04868293, "balance_loss_mlp": 1.02051783, "epoch": 0.26063430031564705, "flos": 26031681296640.0, "grad_norm": 1.8490049893982383, "language_loss": 0.8027274, "learning_rate": 3.46747795800024e-06, "loss": 0.82431591, "num_input_tokens_seen": 93597015, "step": 4335, "time_per_iteration": 2.730853796005249 }, { "auxiliary_loss_clip": 0.01060862, "auxiliary_loss_mlp": 0.01054521, "balance_loss_clip": 1.03598261, "balance_loss_mlp": 1.05267298, "epoch": 0.26069442356831507, "flos": 62443809820800.0, "grad_norm": 1.1166557113782816, "language_loss": 0.60850358, "learning_rate": 3.467213317659068e-06, "loss": 0.62965739, "num_input_tokens_seen": 93657775, "step": 4336, "time_per_iteration": 3.1322128772735596 }, { "auxiliary_loss_clip": 0.01111016, "auxiliary_loss_mlp": 0.01046835, "balance_loss_clip": 1.05039525, "balance_loss_mlp": 1.02976441, "epoch": 0.26075454682098304, "flos": 13626376583040.0, "grad_norm": 2.784557437613843, "language_loss": 0.7679469, "learning_rate": 3.46694862168102e-06, "loss": 0.78952539, "num_input_tokens_seen": 93676145, "step": 4337, "time_per_iteration": 2.704305410385132 }, { "auxiliary_loss_clip": 0.0112146, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.04997659, "balance_loss_mlp": 1.02728987, "epoch": 0.260814670073651, "flos": 12126531260160.0, "grad_norm": 2.7677016823816976, "language_loss": 0.74653983, "learning_rate": 3.4666838700761334e-06, "loss": 0.76820505, "num_input_tokens_seen": 93692480, "step": 4338, "time_per_iteration": 2.652679204940796 }, { "auxiliary_loss_clip": 0.01140171, "auxiliary_loss_mlp": 0.01040507, "balance_loss_clip": 1.05246329, "balance_loss_mlp": 1.02314997, "epoch": 0.26087479332631897, "flos": 15122522805120.0, "grad_norm": 2.378816803290104, "language_loss": 0.81061137, "learning_rate": 3.466419062854447e-06, "loss": 0.8324182, "num_input_tokens_seen": 93710165, "step": 4339, "time_per_iteration": 2.7237682342529297 }, { "auxiliary_loss_clip": 0.01090328, "auxiliary_loss_mlp": 0.01040213, "balance_loss_clip": 1.04649866, "balance_loss_mlp": 1.02436984, "epoch": 0.26093491657898693, "flos": 24680937329280.0, "grad_norm": 1.6860698424881835, "language_loss": 0.76643449, "learning_rate": 3.4661542000260033e-06, "loss": 0.78773987, "num_input_tokens_seen": 93730185, "step": 4340, "time_per_iteration": 2.817647695541382 }, { "auxiliary_loss_clip": 0.01082903, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.04781985, "balance_loss_mlp": 1.02381396, "epoch": 0.2609950398316549, "flos": 25116138512640.0, "grad_norm": 1.954971477972507, "language_loss": 0.82689369, "learning_rate": 3.465889281600845e-06, "loss": 0.84813106, "num_input_tokens_seen": 93747690, "step": 4341, "time_per_iteration": 2.822387218475342 }, { "auxiliary_loss_clip": 0.01148407, "auxiliary_loss_mlp": 0.0104134, "balance_loss_clip": 1.0550344, "balance_loss_mlp": 1.02387536, "epoch": 0.26105516308432286, "flos": 28548588216960.0, "grad_norm": 2.3225619433460083, "language_loss": 0.76828772, "learning_rate": 3.4656243075890183e-06, "loss": 0.79018521, "num_input_tokens_seen": 93767405, "step": 4342, "time_per_iteration": 2.7091987133026123 }, { "auxiliary_loss_clip": 0.01137117, "auxiliary_loss_mlp": 0.01036127, "balance_loss_clip": 1.05262113, "balance_loss_mlp": 1.01837635, "epoch": 0.2611152863369908, "flos": 39530609447040.0, "grad_norm": 1.8380809165191976, "language_loss": 0.66072762, "learning_rate": 3.4653592780005707e-06, "loss": 0.68246007, "num_input_tokens_seen": 93789950, "step": 4343, "time_per_iteration": 2.7885191440582275 }, { "auxiliary_loss_clip": 0.01076135, "auxiliary_loss_mlp": 0.01045298, "balance_loss_clip": 1.04419374, "balance_loss_mlp": 1.02715397, "epoch": 0.2611754095896588, "flos": 13735329511680.0, "grad_norm": 1.9033089414913282, "language_loss": 0.73626471, "learning_rate": 3.465094192845553e-06, "loss": 0.75747907, "num_input_tokens_seen": 93807835, "step": 4344, "time_per_iteration": 2.7622575759887695 }, { "auxiliary_loss_clip": 0.01150726, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.05625904, "balance_loss_mlp": 1.02560019, "epoch": 0.26123553284232676, "flos": 21506649649920.0, "grad_norm": 2.7815673216786045, "language_loss": 0.86820161, "learning_rate": 3.4648290521340165e-06, "loss": 0.89013231, "num_input_tokens_seen": 93825670, "step": 4345, "time_per_iteration": 2.615021228790283 }, { "auxiliary_loss_clip": 0.01121997, "auxiliary_loss_mlp": 0.01036853, "balance_loss_clip": 1.05178094, "balance_loss_mlp": 1.02056956, "epoch": 0.2612956560949947, "flos": 21139786091520.0, "grad_norm": 1.9109970692142244, "language_loss": 0.76235008, "learning_rate": 3.464563855876015e-06, "loss": 0.78393853, "num_input_tokens_seen": 93844045, "step": 4346, "time_per_iteration": 2.660766363143921 }, { "auxiliary_loss_clip": 0.01140284, "auxiliary_loss_mlp": 0.01045855, "balance_loss_clip": 1.05571795, "balance_loss_mlp": 1.02870095, "epoch": 0.2613557793476627, "flos": 25119011600640.0, "grad_norm": 1.6628741865434964, "language_loss": 0.75995654, "learning_rate": 3.464298604081606e-06, "loss": 0.78181791, "num_input_tokens_seen": 93864380, "step": 4347, "time_per_iteration": 2.6985979080200195 }, { "auxiliary_loss_clip": 0.0110699, "auxiliary_loss_mlp": 0.01041742, "balance_loss_clip": 1.05063343, "balance_loss_mlp": 1.02501726, "epoch": 0.26141590260033065, "flos": 26067699659520.0, "grad_norm": 1.7474860409603998, "language_loss": 0.73196864, "learning_rate": 3.4640332967608476e-06, "loss": 0.75345594, "num_input_tokens_seen": 93885475, "step": 4348, "time_per_iteration": 2.7511887550354004 }, { "auxiliary_loss_clip": 0.01110529, "auxiliary_loss_mlp": 0.01045849, "balance_loss_clip": 1.05199265, "balance_loss_mlp": 1.0290519, "epoch": 0.2614760258529987, "flos": 25701518459520.0, "grad_norm": 2.6377025292028944, "language_loss": 0.91262084, "learning_rate": 3.463767933923799e-06, "loss": 0.93418467, "num_input_tokens_seen": 93905545, "step": 4349, "time_per_iteration": 2.720240354537964 }, { "auxiliary_loss_clip": 0.0113714, "auxiliary_loss_mlp": 0.01048228, "balance_loss_clip": 1.05569661, "balance_loss_mlp": 1.03184831, "epoch": 0.26153614910566664, "flos": 17457147181440.0, "grad_norm": 1.7232851278977876, "language_loss": 0.80046499, "learning_rate": 3.463502515580524e-06, "loss": 0.82231867, "num_input_tokens_seen": 93924185, "step": 4350, "time_per_iteration": 2.652054786682129 }, { "auxiliary_loss_clip": 0.0113538, "auxiliary_loss_mlp": 0.01049567, "balance_loss_clip": 1.05652642, "balance_loss_mlp": 1.03299654, "epoch": 0.2615962723583346, "flos": 17712831168000.0, "grad_norm": 10.816271600027287, "language_loss": 0.62736505, "learning_rate": 3.4632370417410866e-06, "loss": 0.64921451, "num_input_tokens_seen": 93942825, "step": 4351, "time_per_iteration": 2.6674954891204834 }, { "auxiliary_loss_clip": 0.01138265, "auxiliary_loss_mlp": 0.01048518, "balance_loss_clip": 1.05201697, "balance_loss_mlp": 1.03168559, "epoch": 0.26165639561100257, "flos": 23257725672960.0, "grad_norm": 1.9014393183165526, "language_loss": 0.84131002, "learning_rate": 3.462971512415555e-06, "loss": 0.86317784, "num_input_tokens_seen": 93962045, "step": 4352, "time_per_iteration": 2.8033063411712646 }, { "auxiliary_loss_clip": 0.01065372, "auxiliary_loss_mlp": 0.0102292, "balance_loss_clip": 1.04145527, "balance_loss_mlp": 1.02078664, "epoch": 0.26171651886367053, "flos": 66737970800640.0, "grad_norm": 0.8050815788583346, "language_loss": 0.70591724, "learning_rate": 3.462705927613996e-06, "loss": 0.7268002, "num_input_tokens_seen": 94021175, "step": 4353, "time_per_iteration": 3.101954936981201 }, { "auxiliary_loss_clip": 0.01115948, "auxiliary_loss_mlp": 0.01069336, "balance_loss_clip": 1.04858005, "balance_loss_mlp": 1.05013168, "epoch": 0.2617766421163385, "flos": 22349581090560.0, "grad_norm": 1.6494861832481549, "language_loss": 0.77562749, "learning_rate": 3.4624402873464816e-06, "loss": 0.79748034, "num_input_tokens_seen": 94043370, "step": 4354, "time_per_iteration": 2.772723436355591 }, { "auxiliary_loss_clip": 0.01089887, "auxiliary_loss_mlp": 0.01058882, "balance_loss_clip": 1.04805279, "balance_loss_mlp": 1.04082203, "epoch": 0.26183676536900646, "flos": 26067125041920.0, "grad_norm": 1.8339738923409379, "language_loss": 0.68351537, "learning_rate": 3.462174591623085e-06, "loss": 0.70500308, "num_input_tokens_seen": 94063510, "step": 4355, "time_per_iteration": 5.908639430999756 }, { "auxiliary_loss_clip": 0.01094509, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.0486095, "balance_loss_mlp": 1.02164054, "epoch": 0.26189688862167443, "flos": 20996466825600.0, "grad_norm": 1.9440617828376934, "language_loss": 0.67573452, "learning_rate": 3.4619088404538815e-06, "loss": 0.69707847, "num_input_tokens_seen": 94083865, "step": 4356, "time_per_iteration": 4.351539611816406 }, { "auxiliary_loss_clip": 0.01057297, "auxiliary_loss_mlp": 0.0100707, "balance_loss_clip": 1.03335488, "balance_loss_mlp": 1.00484037, "epoch": 0.2619570118743424, "flos": 65798261141760.0, "grad_norm": 0.6809064288126679, "language_loss": 0.53124392, "learning_rate": 3.4616430338489487e-06, "loss": 0.55188763, "num_input_tokens_seen": 94144095, "step": 4357, "time_per_iteration": 3.0896964073181152 }, { "auxiliary_loss_clip": 0.01139918, "auxiliary_loss_mlp": 0.0104768, "balance_loss_clip": 1.05365348, "balance_loss_mlp": 1.03106248, "epoch": 0.26201713512701036, "flos": 28766817296640.0, "grad_norm": 1.8814759411194193, "language_loss": 0.84233022, "learning_rate": 3.4613771718183654e-06, "loss": 0.86420614, "num_input_tokens_seen": 94163035, "step": 4358, "time_per_iteration": 2.723057746887207 }, { "auxiliary_loss_clip": 0.01127273, "auxiliary_loss_mlp": 0.01043309, "balance_loss_clip": 1.04886353, "balance_loss_mlp": 1.02411628, "epoch": 0.2620772583796783, "flos": 26432516142720.0, "grad_norm": 2.354545555797757, "language_loss": 0.67324048, "learning_rate": 3.4611112543722127e-06, "loss": 0.69494629, "num_input_tokens_seen": 94182520, "step": 4359, "time_per_iteration": 2.7128403186798096 }, { "auxiliary_loss_clip": 0.01118602, "auxiliary_loss_mlp": 0.01045018, "balance_loss_clip": 1.04637527, "balance_loss_mlp": 1.02880526, "epoch": 0.2621373816323463, "flos": 20156552127360.0, "grad_norm": 1.8862311303010293, "language_loss": 0.78726596, "learning_rate": 3.4608452815205757e-06, "loss": 0.80890214, "num_input_tokens_seen": 94201795, "step": 4360, "time_per_iteration": 4.41027569770813 }, { "auxiliary_loss_clip": 0.01119481, "auxiliary_loss_mlp": 0.01042435, "balance_loss_clip": 1.04831719, "balance_loss_mlp": 1.02640164, "epoch": 0.26219750488501425, "flos": 28621235473920.0, "grad_norm": 1.8399079957082187, "language_loss": 0.67980468, "learning_rate": 3.4605792532735387e-06, "loss": 0.70142382, "num_input_tokens_seen": 94222390, "step": 4361, "time_per_iteration": 2.7642054557800293 }, { "auxiliary_loss_clip": 0.01139509, "auxiliary_loss_mlp": 0.01055985, "balance_loss_clip": 1.05313993, "balance_loss_mlp": 1.03842545, "epoch": 0.2622576281376823, "flos": 15042549173760.0, "grad_norm": 2.1489496912575166, "language_loss": 0.84068632, "learning_rate": 3.46031316964119e-06, "loss": 0.86264122, "num_input_tokens_seen": 94239980, "step": 4362, "time_per_iteration": 2.6152050495147705 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01046107, "balance_loss_clip": 1.04752779, "balance_loss_mlp": 1.02867842, "epoch": 0.26231775139035024, "flos": 26396174557440.0, "grad_norm": 2.0545933935481835, "language_loss": 0.65068752, "learning_rate": 3.4600470306336197e-06, "loss": 0.67220271, "num_input_tokens_seen": 94260715, "step": 4363, "time_per_iteration": 2.7297046184539795 }, { "auxiliary_loss_clip": 0.01040739, "auxiliary_loss_mlp": 0.01017272, "balance_loss_clip": 1.02776587, "balance_loss_mlp": 1.01506662, "epoch": 0.2623778746430182, "flos": 65408918647680.0, "grad_norm": 0.9195643121956573, "language_loss": 0.61104208, "learning_rate": 3.4597808362609194e-06, "loss": 0.6316222, "num_input_tokens_seen": 94321285, "step": 4364, "time_per_iteration": 3.3122286796569824 }, { "auxiliary_loss_clip": 0.01151556, "auxiliary_loss_mlp": 0.01050336, "balance_loss_clip": 1.0550462, "balance_loss_mlp": 1.03201365, "epoch": 0.26243799789568617, "flos": 12604215254400.0, "grad_norm": 2.6922753747731387, "language_loss": 0.7223357, "learning_rate": 3.459514586533184e-06, "loss": 0.74435461, "num_input_tokens_seen": 94335420, "step": 4365, "time_per_iteration": 2.588611364364624 }, { "auxiliary_loss_clip": 0.01123747, "auxiliary_loss_mlp": 0.00776591, "balance_loss_clip": 1.05296087, "balance_loss_mlp": 1.00093484, "epoch": 0.26249812114835414, "flos": 28623821253120.0, "grad_norm": 1.9684942716361389, "language_loss": 0.77178609, "learning_rate": 3.459248281460509e-06, "loss": 0.79078948, "num_input_tokens_seen": 94357440, "step": 4366, "time_per_iteration": 2.7489407062530518 }, { "auxiliary_loss_clip": 0.01149499, "auxiliary_loss_mlp": 0.0104305, "balance_loss_clip": 1.05433846, "balance_loss_mlp": 1.02652764, "epoch": 0.2625582444010221, "flos": 14465393441280.0, "grad_norm": 1.9587652436204308, "language_loss": 0.76205176, "learning_rate": 3.4589819210529927e-06, "loss": 0.78397727, "num_input_tokens_seen": 94375690, "step": 4367, "time_per_iteration": 2.63778018951416 }, { "auxiliary_loss_clip": 0.01136158, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0523572, "balance_loss_mlp": 1.02903318, "epoch": 0.26261836765369007, "flos": 16613174246400.0, "grad_norm": 2.055472748506688, "language_loss": 0.69400585, "learning_rate": 3.458715505320736e-06, "loss": 0.71581888, "num_input_tokens_seen": 94393190, "step": 4368, "time_per_iteration": 2.6515018939971924 }, { "auxiliary_loss_clip": 0.01123905, "auxiliary_loss_mlp": 0.01045619, "balance_loss_clip": 1.05272579, "balance_loss_mlp": 1.02791643, "epoch": 0.26267849090635803, "flos": 20519932066560.0, "grad_norm": 1.8794244148025279, "language_loss": 0.79255176, "learning_rate": 3.458449034273841e-06, "loss": 0.81424701, "num_input_tokens_seen": 94410975, "step": 4369, "time_per_iteration": 2.717142343521118 }, { "auxiliary_loss_clip": 0.01119662, "auxiliary_loss_mlp": 0.01040752, "balance_loss_clip": 1.05190969, "balance_loss_mlp": 1.02344334, "epoch": 0.262738614159026, "flos": 21323936142720.0, "grad_norm": 4.796099217910503, "language_loss": 0.83591807, "learning_rate": 3.4581825079224133e-06, "loss": 0.85752219, "num_input_tokens_seen": 94429985, "step": 4370, "time_per_iteration": 2.742966890335083 }, { "auxiliary_loss_clip": 0.01137822, "auxiliary_loss_mlp": 0.01053822, "balance_loss_clip": 1.05178714, "balance_loss_mlp": 1.0345341, "epoch": 0.26279873741169396, "flos": 17603590930560.0, "grad_norm": 1.7275848609842401, "language_loss": 0.71854705, "learning_rate": 3.4579159262765575e-06, "loss": 0.7404635, "num_input_tokens_seen": 94448660, "step": 4371, "time_per_iteration": 2.691899538040161 }, { "auxiliary_loss_clip": 0.01062293, "auxiliary_loss_mlp": 0.01003561, "balance_loss_clip": 1.02797341, "balance_loss_mlp": 1.00147498, "epoch": 0.2628588606643619, "flos": 60949746587520.0, "grad_norm": 0.6802377941963699, "language_loss": 0.56387627, "learning_rate": 3.457649289346384e-06, "loss": 0.58453482, "num_input_tokens_seen": 94515630, "step": 4372, "time_per_iteration": 3.279158115386963 }, { "auxiliary_loss_clip": 0.01124406, "auxiliary_loss_mlp": 0.01038838, "balance_loss_clip": 1.05295706, "balance_loss_mlp": 1.02169585, "epoch": 0.2629189839170299, "flos": 27016315891200.0, "grad_norm": 1.9842369613103452, "language_loss": 0.77777553, "learning_rate": 3.4573825971420042e-06, "loss": 0.79940796, "num_input_tokens_seen": 94535385, "step": 4373, "time_per_iteration": 2.8367159366607666 }, { "auxiliary_loss_clip": 0.01104424, "auxiliary_loss_mlp": 0.01039426, "balance_loss_clip": 1.05070519, "balance_loss_mlp": 1.02314186, "epoch": 0.26297910716969786, "flos": 17019863009280.0, "grad_norm": 7.588420148526772, "language_loss": 0.71397603, "learning_rate": 3.4571158496735294e-06, "loss": 0.73541456, "num_input_tokens_seen": 94552650, "step": 4374, "time_per_iteration": 2.722332239151001 }, { "auxiliary_loss_clip": 0.0112606, "auxiliary_loss_mlp": 0.01045748, "balance_loss_clip": 1.05836225, "balance_loss_mlp": 1.02748489, "epoch": 0.2630392304223659, "flos": 24897370728960.0, "grad_norm": 1.8414201938467747, "language_loss": 0.81212163, "learning_rate": 3.4568490469510756e-06, "loss": 0.83383965, "num_input_tokens_seen": 94574075, "step": 4375, "time_per_iteration": 2.7654781341552734 }, { "auxiliary_loss_clip": 0.01118996, "auxiliary_loss_mlp": 0.01045139, "balance_loss_clip": 1.04959798, "balance_loss_mlp": 1.02901626, "epoch": 0.26309935367503384, "flos": 32854026067200.0, "grad_norm": 1.6461571134793078, "language_loss": 0.6613251, "learning_rate": 3.4565821889847603e-06, "loss": 0.68296647, "num_input_tokens_seen": 94594255, "step": 4376, "time_per_iteration": 2.778731107711792 }, { "auxiliary_loss_clip": 0.01096695, "auxiliary_loss_mlp": 0.0106417, "balance_loss_clip": 1.04752398, "balance_loss_mlp": 1.04587138, "epoch": 0.2631594769277018, "flos": 15887958652800.0, "grad_norm": 1.7628322447974545, "language_loss": 0.69351411, "learning_rate": 3.4563152757847026e-06, "loss": 0.71512282, "num_input_tokens_seen": 94611410, "step": 4377, "time_per_iteration": 2.7606706619262695 }, { "auxiliary_loss_clip": 0.01141095, "auxiliary_loss_mlp": 0.01043033, "balance_loss_clip": 1.0561285, "balance_loss_mlp": 1.02606952, "epoch": 0.2632196001803698, "flos": 50804943557760.0, "grad_norm": 2.1982489321824352, "language_loss": 0.79961169, "learning_rate": 3.4560483073610233e-06, "loss": 0.82145292, "num_input_tokens_seen": 94636575, "step": 4378, "time_per_iteration": 2.9000468254089355 }, { "auxiliary_loss_clip": 0.01127331, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.05713558, "balance_loss_mlp": 1.03063893, "epoch": 0.26327972343303774, "flos": 13733031041280.0, "grad_norm": 1.912468890890116, "language_loss": 0.76285684, "learning_rate": 3.455781283723846e-06, "loss": 0.78458679, "num_input_tokens_seen": 94654345, "step": 4379, "time_per_iteration": 2.6757192611694336 }, { "auxiliary_loss_clip": 0.01114814, "auxiliary_loss_mlp": 0.01043, "balance_loss_clip": 1.05360019, "balance_loss_mlp": 1.02465415, "epoch": 0.2633398466857057, "flos": 23769057732480.0, "grad_norm": 1.982346793660648, "language_loss": 0.77895945, "learning_rate": 3.4555142048832975e-06, "loss": 0.80053759, "num_input_tokens_seen": 94673985, "step": 4380, "time_per_iteration": 2.745392084121704 }, { "auxiliary_loss_clip": 0.01125918, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04945278, "balance_loss_mlp": 1.02351093, "epoch": 0.26339996993837367, "flos": 27600223380480.0, "grad_norm": 2.2040025999375215, "language_loss": 0.64148676, "learning_rate": 3.4552470708495036e-06, "loss": 0.66315508, "num_input_tokens_seen": 94693145, "step": 4381, "time_per_iteration": 2.8020689487457275 }, { "auxiliary_loss_clip": 0.01136752, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.05113709, "balance_loss_mlp": 1.02225995, "epoch": 0.26346009319104163, "flos": 16946317912320.0, "grad_norm": 1.9675616702193486, "language_loss": 0.82470775, "learning_rate": 3.454979881632595e-06, "loss": 0.8464632, "num_input_tokens_seen": 94710185, "step": 4382, "time_per_iteration": 2.66001558303833 }, { "auxiliary_loss_clip": 0.01106019, "auxiliary_loss_mlp": 0.01045742, "balance_loss_clip": 1.04899645, "balance_loss_mlp": 1.02726483, "epoch": 0.2635202164437096, "flos": 37232218915200.0, "grad_norm": 4.511875880791621, "language_loss": 0.70333207, "learning_rate": 3.4547126372427035e-06, "loss": 0.7248497, "num_input_tokens_seen": 94730280, "step": 4383, "time_per_iteration": 2.851227045059204 }, { "auxiliary_loss_clip": 0.01136676, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.05237031, "balance_loss_mlp": 1.0239253, "epoch": 0.26358033969637756, "flos": 20996359084800.0, "grad_norm": 3.019496854013466, "language_loss": 0.69455528, "learning_rate": 3.4544453376899638e-06, "loss": 0.71631902, "num_input_tokens_seen": 94748560, "step": 4384, "time_per_iteration": 2.670023202896118 }, { "auxiliary_loss_clip": 0.01135763, "auxiliary_loss_mlp": 0.01039573, "balance_loss_clip": 1.05114567, "balance_loss_mlp": 1.02275276, "epoch": 0.26364046294904553, "flos": 27746092512000.0, "grad_norm": 2.2712502599605036, "language_loss": 0.70067525, "learning_rate": 3.45417798298451e-06, "loss": 0.72242868, "num_input_tokens_seen": 94767570, "step": 4385, "time_per_iteration": 2.7232449054718018 }, { "auxiliary_loss_clip": 0.01112529, "auxiliary_loss_mlp": 0.0104946, "balance_loss_clip": 1.04893148, "balance_loss_mlp": 1.03190076, "epoch": 0.2637005862017135, "flos": 22893088757760.0, "grad_norm": 1.8128608655109948, "language_loss": 0.85684925, "learning_rate": 3.453910573136482e-06, "loss": 0.87846911, "num_input_tokens_seen": 94784985, "step": 4386, "time_per_iteration": 2.727924108505249 }, { "auxiliary_loss_clip": 0.01126521, "auxiliary_loss_mlp": 0.01046433, "balance_loss_clip": 1.0510478, "balance_loss_mlp": 1.02955282, "epoch": 0.26376070945438146, "flos": 15048834053760.0, "grad_norm": 2.174412940978395, "language_loss": 0.7796396, "learning_rate": 3.4536431081560196e-06, "loss": 0.80136907, "num_input_tokens_seen": 94802545, "step": 4387, "time_per_iteration": 2.666287660598755 }, { "auxiliary_loss_clip": 0.01134058, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.05609179, "balance_loss_mlp": 1.02537298, "epoch": 0.2638208327070494, "flos": 21141833166720.0, "grad_norm": 2.003302761742054, "language_loss": 0.76126039, "learning_rate": 3.453375588053264e-06, "loss": 0.78301507, "num_input_tokens_seen": 94820730, "step": 4388, "time_per_iteration": 2.6321358680725098 }, { "auxiliary_loss_clip": 0.01148944, "auxiliary_loss_mlp": 0.01036978, "balance_loss_clip": 1.05455542, "balance_loss_mlp": 1.02002645, "epoch": 0.26388095595971744, "flos": 21725597001600.0, "grad_norm": 2.534815675842734, "language_loss": 0.86675179, "learning_rate": 3.4531080128383617e-06, "loss": 0.88861108, "num_input_tokens_seen": 94839175, "step": 4389, "time_per_iteration": 2.6122422218322754 }, { "auxiliary_loss_clip": 0.01048602, "auxiliary_loss_mlp": 0.01002085, "balance_loss_clip": 1.03000987, "balance_loss_mlp": 0.99961758, "epoch": 0.2639410792123854, "flos": 65515537192320.0, "grad_norm": 0.8388510572165676, "language_loss": 0.60285747, "learning_rate": 3.452840382521457e-06, "loss": 0.62336433, "num_input_tokens_seen": 94898865, "step": 4390, "time_per_iteration": 3.1867401599884033 }, { "auxiliary_loss_clip": 0.01128567, "auxiliary_loss_mlp": 0.01040305, "balance_loss_clip": 1.05022383, "balance_loss_mlp": 1.02319825, "epoch": 0.2640012024650534, "flos": 23948574929280.0, "grad_norm": 1.6144448841655068, "language_loss": 0.77730125, "learning_rate": 3.4525726971127e-06, "loss": 0.79899001, "num_input_tokens_seen": 94917490, "step": 4391, "time_per_iteration": 2.707310676574707 }, { "auxiliary_loss_clip": 0.01031384, "auxiliary_loss_mlp": 0.00755302, "balance_loss_clip": 1.02553821, "balance_loss_mlp": 1.00244236, "epoch": 0.26406132571772134, "flos": 56441163369600.0, "grad_norm": 0.8840896383522404, "language_loss": 0.58758044, "learning_rate": 3.45230495662224e-06, "loss": 0.60544735, "num_input_tokens_seen": 94969065, "step": 4392, "time_per_iteration": 3.211859941482544 }, { "auxiliary_loss_clip": 0.01136937, "auxiliary_loss_mlp": 0.0105019, "balance_loss_clip": 1.05295539, "balance_loss_mlp": 1.03322649, "epoch": 0.2641214489703893, "flos": 22090557139200.0, "grad_norm": 1.9286153229889427, "language_loss": 0.68954027, "learning_rate": 3.4520371610602306e-06, "loss": 0.71141154, "num_input_tokens_seen": 94988540, "step": 4393, "time_per_iteration": 2.6483278274536133 }, { "auxiliary_loss_clip": 0.01140079, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.05395103, "balance_loss_mlp": 1.02398562, "epoch": 0.26418157222305727, "flos": 16544764794240.0, "grad_norm": 2.0454829511435193, "language_loss": 0.84071863, "learning_rate": 3.4517693104368267e-06, "loss": 0.86253464, "num_input_tokens_seen": 95004810, "step": 4394, "time_per_iteration": 4.3396079540252686 }, { "auxiliary_loss_clip": 0.01124083, "auxiliary_loss_mlp": 0.01045374, "balance_loss_clip": 1.04999089, "balance_loss_mlp": 1.02661061, "epoch": 0.26424169547572524, "flos": 18002486442240.0, "grad_norm": 2.096391063208514, "language_loss": 0.70044839, "learning_rate": 3.4515014047621856e-06, "loss": 0.72214299, "num_input_tokens_seen": 95024085, "step": 4395, "time_per_iteration": 2.8730056285858154 }, { "auxiliary_loss_clip": 0.01110387, "auxiliary_loss_mlp": 0.01037389, "balance_loss_clip": 1.04736662, "balance_loss_mlp": 1.02071214, "epoch": 0.2643018187283932, "flos": 16983162288000.0, "grad_norm": 2.1761517020490606, "language_loss": 0.86876452, "learning_rate": 3.4512334440464655e-06, "loss": 0.89024228, "num_input_tokens_seen": 95042515, "step": 4396, "time_per_iteration": 4.384250640869141 }, { "auxiliary_loss_clip": 0.01010716, "auxiliary_loss_mlp": 0.01021406, "balance_loss_clip": 1.02197146, "balance_loss_mlp": 1.01856887, "epoch": 0.26436194198106117, "flos": 59664359416320.0, "grad_norm": 0.7957760850485174, "language_loss": 0.55022657, "learning_rate": 3.4509654282998277e-06, "loss": 0.57054776, "num_input_tokens_seen": 95094835, "step": 4397, "time_per_iteration": 3.0656893253326416 }, { "auxiliary_loss_clip": 0.01132938, "auxiliary_loss_mlp": 0.01050463, "balance_loss_clip": 1.0485754, "balance_loss_mlp": 1.03357744, "epoch": 0.26442206523372913, "flos": 32921322197760.0, "grad_norm": 1.9110208887501443, "language_loss": 0.77881467, "learning_rate": 3.450697357532435e-06, "loss": 0.80064869, "num_input_tokens_seen": 95113480, "step": 4398, "time_per_iteration": 2.740917444229126 }, { "auxiliary_loss_clip": 0.01139914, "auxiliary_loss_mlp": 0.01040709, "balance_loss_clip": 1.05469537, "balance_loss_mlp": 1.02347112, "epoch": 0.2644821884863971, "flos": 21031300039680.0, "grad_norm": 1.7657486248278176, "language_loss": 0.67534482, "learning_rate": 3.4504292317544534e-06, "loss": 0.69715106, "num_input_tokens_seen": 95132580, "step": 4399, "time_per_iteration": 4.305487871170044 }, { "auxiliary_loss_clip": 0.01097219, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.04840231, "balance_loss_mlp": 1.02503681, "epoch": 0.26454231173906506, "flos": 20776801201920.0, "grad_norm": 1.6309197312133479, "language_loss": 0.86614597, "learning_rate": 3.4501610509760504e-06, "loss": 0.88753855, "num_input_tokens_seen": 95152375, "step": 4400, "time_per_iteration": 2.695883274078369 }, { "auxiliary_loss_clip": 0.01119339, "auxiliary_loss_mlp": 0.01039987, "balance_loss_clip": 1.0483284, "balance_loss_mlp": 1.0226419, "epoch": 0.264602434991733, "flos": 16618669027200.0, "grad_norm": 3.1942141071602546, "language_loss": 0.76518428, "learning_rate": 3.4498928152073944e-06, "loss": 0.78677756, "num_input_tokens_seen": 95170265, "step": 4401, "time_per_iteration": 2.69415545463562 }, { "auxiliary_loss_clip": 0.01100665, "auxiliary_loss_mlp": 0.01046326, "balance_loss_clip": 1.04473615, "balance_loss_mlp": 1.02758598, "epoch": 0.26466255824440105, "flos": 19062677295360.0, "grad_norm": 2.336049134907364, "language_loss": 0.88363832, "learning_rate": 3.4496245244586577e-06, "loss": 0.90510821, "num_input_tokens_seen": 95188655, "step": 4402, "time_per_iteration": 2.7073450088500977 }, { "auxiliary_loss_clip": 0.01105803, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.04894042, "balance_loss_mlp": 1.02327585, "epoch": 0.264722681497069, "flos": 22638554006400.0, "grad_norm": 1.7301089969072252, "language_loss": 0.7811445, "learning_rate": 3.4493561787400137e-06, "loss": 0.80260956, "num_input_tokens_seen": 95209615, "step": 4403, "time_per_iteration": 2.7213027477264404 }, { "auxiliary_loss_clip": 0.01128649, "auxiliary_loss_mlp": 0.01038032, "balance_loss_clip": 1.04674816, "balance_loss_mlp": 1.02050877, "epoch": 0.264782804749737, "flos": 22492253911680.0, "grad_norm": 2.1369132533571604, "language_loss": 0.88594282, "learning_rate": 3.4490877780616387e-06, "loss": 0.90760964, "num_input_tokens_seen": 95227810, "step": 4404, "time_per_iteration": 2.6888909339904785 }, { "auxiliary_loss_clip": 0.01123789, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.04607344, "balance_loss_mlp": 1.02416539, "epoch": 0.26484292800240494, "flos": 16800269212800.0, "grad_norm": 1.7519644069859235, "language_loss": 0.76134694, "learning_rate": 3.448819322433709e-06, "loss": 0.78299075, "num_input_tokens_seen": 95245890, "step": 4405, "time_per_iteration": 2.7172482013702393 }, { "auxiliary_loss_clip": 0.01148976, "auxiliary_loss_mlp": 0.01040198, "balance_loss_clip": 1.05348206, "balance_loss_mlp": 1.02266204, "epoch": 0.2649030512550729, "flos": 20449583280000.0, "grad_norm": 1.711457274305917, "language_loss": 0.69873697, "learning_rate": 3.4485508118664066e-06, "loss": 0.72062874, "num_input_tokens_seen": 95264955, "step": 4406, "time_per_iteration": 2.584300994873047 }, { "auxiliary_loss_clip": 0.01121151, "auxiliary_loss_mlp": 0.01050453, "balance_loss_clip": 1.05182838, "balance_loss_mlp": 1.03432453, "epoch": 0.2649631745077409, "flos": 22416123035520.0, "grad_norm": 1.7200250795424956, "language_loss": 0.83956587, "learning_rate": 3.448282246369912e-06, "loss": 0.86128193, "num_input_tokens_seen": 95284245, "step": 4407, "time_per_iteration": 2.731316328048706 }, { "auxiliary_loss_clip": 0.01108599, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 1.04695201, "balance_loss_mlp": 1.01501989, "epoch": 0.26502329776040884, "flos": 35116110927360.0, "grad_norm": 1.8896460113896294, "language_loss": 0.7597363, "learning_rate": 3.4480136259544084e-06, "loss": 0.78113985, "num_input_tokens_seen": 95307125, "step": 4408, "time_per_iteration": 2.8600730895996094 }, { "auxiliary_loss_clip": 0.01091919, "auxiliary_loss_mlp": 0.01044721, "balance_loss_clip": 1.04267502, "balance_loss_mlp": 1.02679181, "epoch": 0.2650834210130768, "flos": 38687498438400.0, "grad_norm": 1.7769050714437231, "language_loss": 0.70612216, "learning_rate": 3.447744950630084e-06, "loss": 0.72748852, "num_input_tokens_seen": 95329150, "step": 4409, "time_per_iteration": 2.936380386352539 }, { "auxiliary_loss_clip": 0.01131548, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.04774857, "balance_loss_mlp": 1.02218497, "epoch": 0.26514354426574477, "flos": 24716847951360.0, "grad_norm": 1.7357795205395667, "language_loss": 0.7337513, "learning_rate": 3.4474762204071253e-06, "loss": 0.75546867, "num_input_tokens_seen": 95349880, "step": 4410, "time_per_iteration": 2.7315077781677246 }, { "auxiliary_loss_clip": 0.01141374, "auxiliary_loss_mlp": 0.0104966, "balance_loss_clip": 1.05183268, "balance_loss_mlp": 1.03216028, "epoch": 0.26520366751841273, "flos": 20340055733760.0, "grad_norm": 1.8886288474708937, "language_loss": 0.73828322, "learning_rate": 3.4472074352957244e-06, "loss": 0.76019359, "num_input_tokens_seen": 95368570, "step": 4411, "time_per_iteration": 2.641920566558838 }, { "auxiliary_loss_clip": 0.01099594, "auxiliary_loss_mlp": 0.01041576, "balance_loss_clip": 1.04986739, "balance_loss_mlp": 1.02431464, "epoch": 0.2652637907710807, "flos": 22343870828160.0, "grad_norm": 1.9943391034693418, "language_loss": 0.82447588, "learning_rate": 3.446938595306071e-06, "loss": 0.84588754, "num_input_tokens_seen": 95387065, "step": 4412, "time_per_iteration": 2.8344247341156006 }, { "auxiliary_loss_clip": 0.01135402, "auxiliary_loss_mlp": 0.01052016, "balance_loss_clip": 1.05143464, "balance_loss_mlp": 1.03544593, "epoch": 0.26532391402374866, "flos": 19354235990400.0, "grad_norm": 1.775443234311944, "language_loss": 0.7446382, "learning_rate": 3.4466697004483622e-06, "loss": 0.76651239, "num_input_tokens_seen": 95406345, "step": 4413, "time_per_iteration": 2.657975196838379 }, { "auxiliary_loss_clip": 0.01056582, "auxiliary_loss_mlp": 0.01008584, "balance_loss_clip": 1.03258443, "balance_loss_mlp": 1.00659275, "epoch": 0.26538403727641663, "flos": 44787611422080.0, "grad_norm": 0.873557285042922, "language_loss": 0.56965125, "learning_rate": 3.446400750732793e-06, "loss": 0.59030288, "num_input_tokens_seen": 95463595, "step": 4414, "time_per_iteration": 3.1158244609832764 }, { "auxiliary_loss_clip": 0.01107803, "auxiliary_loss_mlp": 0.01046612, "balance_loss_clip": 1.04481411, "balance_loss_mlp": 1.03048313, "epoch": 0.26544416052908465, "flos": 28182119708160.0, "grad_norm": 1.5786807831647507, "language_loss": 0.74238014, "learning_rate": 3.4461317461695625e-06, "loss": 0.76392424, "num_input_tokens_seen": 95484115, "step": 4415, "time_per_iteration": 2.7223031520843506 }, { "auxiliary_loss_clip": 0.01095743, "auxiliary_loss_mlp": 0.01044325, "balance_loss_clip": 1.04215193, "balance_loss_mlp": 1.02402353, "epoch": 0.2655042837817526, "flos": 17565274097280.0, "grad_norm": 2.5102345694159016, "language_loss": 0.86855936, "learning_rate": 3.4458626867688707e-06, "loss": 0.88996005, "num_input_tokens_seen": 95501435, "step": 4416, "time_per_iteration": 2.7001683712005615 }, { "auxiliary_loss_clip": 0.01141467, "auxiliary_loss_mlp": 0.01046153, "balance_loss_clip": 1.05359149, "balance_loss_mlp": 1.02761602, "epoch": 0.2655644070344206, "flos": 23404636298880.0, "grad_norm": 1.6343137061510633, "language_loss": 0.76870787, "learning_rate": 3.4455935725409217e-06, "loss": 0.79058409, "num_input_tokens_seen": 95520135, "step": 4417, "time_per_iteration": 2.662196397781372 }, { "auxiliary_loss_clip": 0.01119441, "auxiliary_loss_mlp": 0.01041503, "balance_loss_clip": 1.04989183, "balance_loss_mlp": 1.02242982, "epoch": 0.26562453028708854, "flos": 26468462678400.0, "grad_norm": 1.6334113226277946, "language_loss": 0.80320108, "learning_rate": 3.4453244034959196e-06, "loss": 0.82481045, "num_input_tokens_seen": 95541705, "step": 4418, "time_per_iteration": 2.7742624282836914 }, { "auxiliary_loss_clip": 0.0113892, "auxiliary_loss_mlp": 0.01045476, "balance_loss_clip": 1.05182683, "balance_loss_mlp": 1.02721274, "epoch": 0.2656846535397565, "flos": 19207576759680.0, "grad_norm": 2.164903581235647, "language_loss": 0.67788607, "learning_rate": 3.445055179644071e-06, "loss": 0.69972998, "num_input_tokens_seen": 95560300, "step": 4419, "time_per_iteration": 2.6437718868255615 }, { "auxiliary_loss_clip": 0.01149692, "auxiliary_loss_mlp": 0.01046258, "balance_loss_clip": 1.05360699, "balance_loss_mlp": 1.02711296, "epoch": 0.2657447767924245, "flos": 30551325903360.0, "grad_norm": 1.9366129468869788, "language_loss": 0.79625547, "learning_rate": 3.444785900995585e-06, "loss": 0.81821501, "num_input_tokens_seen": 95580150, "step": 4420, "time_per_iteration": 2.6594905853271484 }, { "auxiliary_loss_clip": 0.01126984, "auxiliary_loss_mlp": 0.01053725, "balance_loss_clip": 1.05294895, "balance_loss_mlp": 1.03368592, "epoch": 0.26580490004509244, "flos": 20922742160640.0, "grad_norm": 1.9122536358412747, "language_loss": 0.81690109, "learning_rate": 3.444516567560673e-06, "loss": 0.83870822, "num_input_tokens_seen": 95597570, "step": 4421, "time_per_iteration": 2.681410551071167 }, { "auxiliary_loss_clip": 0.0113176, "auxiliary_loss_mlp": 0.01046737, "balance_loss_clip": 1.05015123, "balance_loss_mlp": 1.02904677, "epoch": 0.2658650232977604, "flos": 43945682584320.0, "grad_norm": 1.6112293393448585, "language_loss": 0.65704989, "learning_rate": 3.444247179349548e-06, "loss": 0.6788348, "num_input_tokens_seen": 95619415, "step": 4422, "time_per_iteration": 2.8766117095947266 }, { "auxiliary_loss_clip": 0.01130944, "auxiliary_loss_mlp": 0.01047224, "balance_loss_clip": 1.04903376, "balance_loss_mlp": 1.03039181, "epoch": 0.26592514655042837, "flos": 29716439109120.0, "grad_norm": 2.1017056533749896, "language_loss": 0.74229872, "learning_rate": 3.4439777363724252e-06, "loss": 0.76408041, "num_input_tokens_seen": 95639155, "step": 4423, "time_per_iteration": 2.6983659267425537 }, { "auxiliary_loss_clip": 0.01130559, "auxiliary_loss_mlp": 0.01057709, "balance_loss_clip": 1.04790974, "balance_loss_mlp": 1.03822982, "epoch": 0.26598526980309634, "flos": 46677730014720.0, "grad_norm": 1.6865310965149165, "language_loss": 0.77855694, "learning_rate": 3.443708238639522e-06, "loss": 0.80043966, "num_input_tokens_seen": 95663320, "step": 4424, "time_per_iteration": 2.900214433670044 }, { "auxiliary_loss_clip": 0.01132339, "auxiliary_loss_mlp": 0.01049395, "balance_loss_clip": 1.04963291, "balance_loss_mlp": 1.03181148, "epoch": 0.2660453930557643, "flos": 11509442582400.0, "grad_norm": 2.0755220631041684, "language_loss": 0.78940654, "learning_rate": 3.4434386861610573e-06, "loss": 0.81122386, "num_input_tokens_seen": 95680260, "step": 4425, "time_per_iteration": 2.6266820430755615 }, { "auxiliary_loss_clip": 0.01123867, "auxiliary_loss_mlp": 0.01043959, "balance_loss_clip": 1.05143404, "balance_loss_mlp": 1.02767467, "epoch": 0.26610551630843227, "flos": 24791578197120.0, "grad_norm": 1.5673316066045293, "language_loss": 0.80135047, "learning_rate": 3.4431690789472532e-06, "loss": 0.82302874, "num_input_tokens_seen": 95701140, "step": 4426, "time_per_iteration": 2.7015280723571777 }, { "auxiliary_loss_clip": 0.01150747, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.0554285, "balance_loss_mlp": 1.02678883, "epoch": 0.26616563956110023, "flos": 27636385397760.0, "grad_norm": 1.617839398314704, "language_loss": 0.77174348, "learning_rate": 3.442899417008333e-06, "loss": 0.79369569, "num_input_tokens_seen": 95722060, "step": 4427, "time_per_iteration": 2.6438984870910645 }, { "auxiliary_loss_clip": 0.01112968, "auxiliary_loss_mlp": 0.01037518, "balance_loss_clip": 1.05125654, "balance_loss_mlp": 1.02069747, "epoch": 0.26622576281376825, "flos": 28362893880960.0, "grad_norm": 1.5634759975385293, "language_loss": 0.76754683, "learning_rate": 3.4426297003545227e-06, "loss": 0.78905165, "num_input_tokens_seen": 95742495, "step": 4428, "time_per_iteration": 2.7695741653442383 }, { "auxiliary_loss_clip": 0.01114899, "auxiliary_loss_mlp": 0.00775922, "balance_loss_clip": 1.04922283, "balance_loss_mlp": 1.0008111, "epoch": 0.2662858860664362, "flos": 18041341979520.0, "grad_norm": 1.815928660217762, "language_loss": 0.82900071, "learning_rate": 3.4423599289960495e-06, "loss": 0.84790885, "num_input_tokens_seen": 95761510, "step": 4429, "time_per_iteration": 2.764183282852173 }, { "auxiliary_loss_clip": 0.01106492, "auxiliary_loss_mlp": 0.01039033, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.02201009, "epoch": 0.2663460093191042, "flos": 22745818995840.0, "grad_norm": 1.6463341595476202, "language_loss": 0.71996218, "learning_rate": 3.442090102943143e-06, "loss": 0.74141741, "num_input_tokens_seen": 95782385, "step": 4430, "time_per_iteration": 2.7244491577148438 }, { "auxiliary_loss_clip": 0.01148257, "auxiliary_loss_mlp": 0.01049268, "balance_loss_clip": 1.05231071, "balance_loss_mlp": 1.03068352, "epoch": 0.26640613257177215, "flos": 16508782344960.0, "grad_norm": 1.9574919733512919, "language_loss": 0.82021642, "learning_rate": 3.441820222206035e-06, "loss": 0.84219164, "num_input_tokens_seen": 95800595, "step": 4431, "time_per_iteration": 2.5910067558288574 }, { "auxiliary_loss_clip": 0.01143334, "auxiliary_loss_mlp": 0.01050031, "balance_loss_clip": 1.0540812, "balance_loss_mlp": 1.03141046, "epoch": 0.2664662558244401, "flos": 23075945919360.0, "grad_norm": 2.074794485495937, "language_loss": 0.76745522, "learning_rate": 3.44155028679496e-06, "loss": 0.7893889, "num_input_tokens_seen": 95818480, "step": 4432, "time_per_iteration": 2.6548166275024414 }, { "auxiliary_loss_clip": 0.01089372, "auxiliary_loss_mlp": 0.01052807, "balance_loss_clip": 1.04526138, "balance_loss_mlp": 1.03232694, "epoch": 0.2665263790771081, "flos": 23769273214080.0, "grad_norm": 1.872584196626497, "language_loss": 0.82903433, "learning_rate": 3.441280296720154e-06, "loss": 0.85045612, "num_input_tokens_seen": 95837205, "step": 4433, "time_per_iteration": 4.2740867137908936 }, { "auxiliary_loss_clip": 0.01142798, "auxiliary_loss_mlp": 0.01045231, "balance_loss_clip": 1.05565643, "balance_loss_mlp": 1.02671802, "epoch": 0.26658650232977604, "flos": 28001273708160.0, "grad_norm": 2.548777168378285, "language_loss": 0.76308644, "learning_rate": 3.441010251991854e-06, "loss": 0.78496677, "num_input_tokens_seen": 95858395, "step": 4434, "time_per_iteration": 4.203384160995483 }, { "auxiliary_loss_clip": 0.0114611, "auxiliary_loss_mlp": 0.01044925, "balance_loss_clip": 1.05197668, "balance_loss_mlp": 1.02772319, "epoch": 0.266646625582444, "flos": 22163635359360.0, "grad_norm": 2.3452347637055393, "language_loss": 0.82496321, "learning_rate": 3.440740152620301e-06, "loss": 0.84687358, "num_input_tokens_seen": 95877875, "step": 4435, "time_per_iteration": 4.102782964706421 }, { "auxiliary_loss_clip": 0.01104916, "auxiliary_loss_mlp": 0.01062101, "balance_loss_clip": 1.04567468, "balance_loss_mlp": 1.04245555, "epoch": 0.266706748835112, "flos": 27853537069440.0, "grad_norm": 1.994258420562806, "language_loss": 0.87634504, "learning_rate": 3.4404699986157376e-06, "loss": 0.89801526, "num_input_tokens_seen": 95895820, "step": 4436, "time_per_iteration": 2.8048155307769775 }, { "auxiliary_loss_clip": 0.01121439, "auxiliary_loss_mlp": 0.01047617, "balance_loss_clip": 1.04637265, "balance_loss_mlp": 1.03054643, "epoch": 0.26676687208777994, "flos": 25812123413760.0, "grad_norm": 1.4763923958478316, "language_loss": 0.787242, "learning_rate": 3.440199789988407e-06, "loss": 0.80893254, "num_input_tokens_seen": 95918025, "step": 4437, "time_per_iteration": 2.7382607460021973 }, { "auxiliary_loss_clip": 0.01093686, "auxiliary_loss_mlp": 0.01048829, "balance_loss_clip": 1.05000877, "balance_loss_mlp": 1.03117394, "epoch": 0.2668269953404479, "flos": 36064583504640.0, "grad_norm": 4.5178491997969115, "language_loss": 0.63910848, "learning_rate": 3.439929526748556e-06, "loss": 0.66053367, "num_input_tokens_seen": 95937725, "step": 4438, "time_per_iteration": 2.956014633178711 }, { "auxiliary_loss_clip": 0.01080658, "auxiliary_loss_mlp": 0.01047394, "balance_loss_clip": 1.0432179, "balance_loss_mlp": 1.02994168, "epoch": 0.26688711859311587, "flos": 26570987072640.0, "grad_norm": 1.84569516037299, "language_loss": 0.75897747, "learning_rate": 3.4396592089064334e-06, "loss": 0.78025794, "num_input_tokens_seen": 95956335, "step": 4439, "time_per_iteration": 4.428173065185547 }, { "auxiliary_loss_clip": 0.01089075, "auxiliary_loss_mlp": 0.01041089, "balance_loss_clip": 1.04845262, "balance_loss_mlp": 1.02181315, "epoch": 0.26694724184578383, "flos": 26761565658240.0, "grad_norm": 2.10654378697334, "language_loss": 0.7172367, "learning_rate": 3.4393888364722897e-06, "loss": 0.73853838, "num_input_tokens_seen": 95977135, "step": 4440, "time_per_iteration": 2.9196605682373047 }, { "auxiliary_loss_clip": 0.01124038, "auxiliary_loss_mlp": 0.01049644, "balance_loss_clip": 1.04784775, "balance_loss_mlp": 1.02931881, "epoch": 0.2670073650984518, "flos": 20959586536320.0, "grad_norm": 1.869180757677473, "language_loss": 0.66229129, "learning_rate": 3.439118409456376e-06, "loss": 0.68402815, "num_input_tokens_seen": 95995435, "step": 4441, "time_per_iteration": 2.666428804397583 }, { "auxiliary_loss_clip": 0.01137041, "auxiliary_loss_mlp": 0.01049045, "balance_loss_clip": 1.04973912, "balance_loss_mlp": 1.02953053, "epoch": 0.2670674883511198, "flos": 28366054277760.0, "grad_norm": 3.888081439634283, "language_loss": 0.76102316, "learning_rate": 3.4388479278689486e-06, "loss": 0.78288412, "num_input_tokens_seen": 96016340, "step": 4442, "time_per_iteration": 2.6413686275482178 }, { "auxiliary_loss_clip": 0.0100646, "auxiliary_loss_mlp": 0.0105848, "balance_loss_clip": 1.02694619, "balance_loss_mlp": 1.05538034, "epoch": 0.2671276116037878, "flos": 58971319430400.0, "grad_norm": 0.9410220376713593, "language_loss": 0.61210632, "learning_rate": 3.4385773917202637e-06, "loss": 0.63275576, "num_input_tokens_seen": 96071205, "step": 4443, "time_per_iteration": 3.2342116832733154 }, { "auxiliary_loss_clip": 0.01123665, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.05413401, "balance_loss_mlp": 1.02239847, "epoch": 0.26718773485645575, "flos": 43945072053120.0, "grad_norm": 1.5620381861600383, "language_loss": 0.76195556, "learning_rate": 3.4383068010205793e-06, "loss": 0.78359205, "num_input_tokens_seen": 96094240, "step": 4444, "time_per_iteration": 3.136178731918335 }, { "auxiliary_loss_clip": 0.01142711, "auxiliary_loss_mlp": 0.01040756, "balance_loss_clip": 1.05331576, "balance_loss_mlp": 1.0213964, "epoch": 0.2672478581091237, "flos": 25228323665280.0, "grad_norm": 1.6750833182703528, "language_loss": 0.80892444, "learning_rate": 3.438036155780158e-06, "loss": 0.83075905, "num_input_tokens_seen": 96114105, "step": 4445, "time_per_iteration": 2.660952091217041 }, { "auxiliary_loss_clip": 0.01124381, "auxiliary_loss_mlp": 0.01048514, "balance_loss_clip": 1.05190587, "balance_loss_mlp": 1.02901077, "epoch": 0.2673079813617917, "flos": 15268176455040.0, "grad_norm": 2.1125172985353533, "language_loss": 0.89060926, "learning_rate": 3.43776545600926e-06, "loss": 0.9123382, "num_input_tokens_seen": 96132140, "step": 4446, "time_per_iteration": 2.6609115600585938 }, { "auxiliary_loss_clip": 0.011447, "auxiliary_loss_mlp": 0.01053132, "balance_loss_clip": 1.05528426, "balance_loss_mlp": 1.03541803, "epoch": 0.26736810461445965, "flos": 25812733944960.0, "grad_norm": 2.4310086382368783, "language_loss": 0.67756736, "learning_rate": 3.437494701718153e-06, "loss": 0.69954574, "num_input_tokens_seen": 96152090, "step": 4447, "time_per_iteration": 2.6696949005126953 }, { "auxiliary_loss_clip": 0.01144309, "auxiliary_loss_mlp": 0.0104489, "balance_loss_clip": 1.05496442, "balance_loss_mlp": 1.02572155, "epoch": 0.2674282278671276, "flos": 24312709054080.0, "grad_norm": 1.9687667134305082, "language_loss": 0.830899, "learning_rate": 3.4372238929171026e-06, "loss": 0.85279107, "num_input_tokens_seen": 96170015, "step": 4448, "time_per_iteration": 2.639463424682617 }, { "auxiliary_loss_clip": 0.0111564, "auxiliary_loss_mlp": 0.01054364, "balance_loss_clip": 1.05101895, "balance_loss_mlp": 1.03557646, "epoch": 0.2674883511197956, "flos": 22815521337600.0, "grad_norm": 1.479052407292424, "language_loss": 0.84231561, "learning_rate": 3.436953029616378e-06, "loss": 0.8640157, "num_input_tokens_seen": 96188065, "step": 4449, "time_per_iteration": 2.812290906906128 }, { "auxiliary_loss_clip": 0.0113237, "auxiliary_loss_mlp": 0.01055905, "balance_loss_clip": 1.05103493, "balance_loss_mlp": 1.03552055, "epoch": 0.26754847437246354, "flos": 25370170473600.0, "grad_norm": 1.7379167843341312, "language_loss": 0.84231997, "learning_rate": 3.4366821118262506e-06, "loss": 0.86420268, "num_input_tokens_seen": 96205780, "step": 4450, "time_per_iteration": 2.7598626613616943 }, { "auxiliary_loss_clip": 0.01109743, "auxiliary_loss_mlp": 0.01057779, "balance_loss_clip": 1.04833305, "balance_loss_mlp": 1.04044628, "epoch": 0.2676085976251315, "flos": 20230420446720.0, "grad_norm": 8.035146429526597, "language_loss": 0.80842566, "learning_rate": 3.4364111395569937e-06, "loss": 0.83010095, "num_input_tokens_seen": 96224990, "step": 4451, "time_per_iteration": 2.7467129230499268 }, { "auxiliary_loss_clip": 0.01141732, "auxiliary_loss_mlp": 0.01055516, "balance_loss_clip": 1.0553689, "balance_loss_mlp": 1.0379324, "epoch": 0.26766872087779947, "flos": 28038225824640.0, "grad_norm": 1.6378235408468254, "language_loss": 0.86285019, "learning_rate": 3.436140112818882e-06, "loss": 0.88482267, "num_input_tokens_seen": 96245345, "step": 4452, "time_per_iteration": 2.7442660331726074 }, { "auxiliary_loss_clip": 0.01134475, "auxiliary_loss_mlp": 0.01047993, "balance_loss_clip": 1.05496478, "balance_loss_mlp": 1.02926481, "epoch": 0.26772884413046744, "flos": 18325179250560.0, "grad_norm": 2.119384740597093, "language_loss": 0.83521158, "learning_rate": 3.435869031622194e-06, "loss": 0.85703623, "num_input_tokens_seen": 96259000, "step": 4453, "time_per_iteration": 2.659623146057129 }, { "auxiliary_loss_clip": 0.01141347, "auxiliary_loss_mlp": 0.01063496, "balance_loss_clip": 1.05624223, "balance_loss_mlp": 1.04485118, "epoch": 0.2677889673831354, "flos": 22127509255680.0, "grad_norm": 1.8460317519144305, "language_loss": 0.79565918, "learning_rate": 3.435597895977208e-06, "loss": 0.8177076, "num_input_tokens_seen": 96277000, "step": 4454, "time_per_iteration": 2.6458942890167236 }, { "auxiliary_loss_clip": 0.01130641, "auxiliary_loss_mlp": 0.01056871, "balance_loss_clip": 1.05338597, "balance_loss_mlp": 1.03869116, "epoch": 0.2678490906358034, "flos": 23729699404800.0, "grad_norm": 1.5255880946203295, "language_loss": 0.7241919, "learning_rate": 3.435326705894206e-06, "loss": 0.74606699, "num_input_tokens_seen": 96297010, "step": 4455, "time_per_iteration": 2.7328429222106934 }, { "auxiliary_loss_clip": 0.01112613, "auxiliary_loss_mlp": 0.01052208, "balance_loss_clip": 1.04858243, "balance_loss_mlp": 1.03508949, "epoch": 0.2679092138884714, "flos": 21762872340480.0, "grad_norm": 1.5657028408886426, "language_loss": 0.74017322, "learning_rate": 3.435055461383471e-06, "loss": 0.76182139, "num_input_tokens_seen": 96315780, "step": 4456, "time_per_iteration": 2.700190544128418 }, { "auxiliary_loss_clip": 0.0114232, "auxiliary_loss_mlp": 0.01048809, "balance_loss_clip": 1.05394006, "balance_loss_mlp": 1.03033149, "epoch": 0.26796933714113935, "flos": 19861186590720.0, "grad_norm": 2.4373070589767774, "language_loss": 0.70647967, "learning_rate": 3.4347841624552896e-06, "loss": 0.72839093, "num_input_tokens_seen": 96333465, "step": 4457, "time_per_iteration": 2.6334941387176514 }, { "auxiliary_loss_clip": 0.01112923, "auxiliary_loss_mlp": 0.01063608, "balance_loss_clip": 1.05205595, "balance_loss_mlp": 1.04513049, "epoch": 0.2680294603938073, "flos": 20047886507520.0, "grad_norm": 1.8228045543818674, "language_loss": 0.7903617, "learning_rate": 3.4345128091199493e-06, "loss": 0.81212699, "num_input_tokens_seen": 96352005, "step": 4458, "time_per_iteration": 2.7377572059631348 }, { "auxiliary_loss_clip": 0.01030327, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.0366354, "balance_loss_mlp": 1.0414269, "epoch": 0.2680895836464753, "flos": 72113763052800.0, "grad_norm": 0.9600198584891941, "language_loss": 0.58691025, "learning_rate": 3.434241401387739e-06, "loss": 0.60765231, "num_input_tokens_seen": 96406265, "step": 4459, "time_per_iteration": 3.2385354042053223 }, { "auxiliary_loss_clip": 0.0108842, "auxiliary_loss_mlp": 0.01056025, "balance_loss_clip": 1.04306948, "balance_loss_mlp": 1.0379889, "epoch": 0.26814970689914325, "flos": 20449044576000.0, "grad_norm": 2.1196386888642382, "language_loss": 0.84988648, "learning_rate": 3.4339699392689507e-06, "loss": 0.87133086, "num_input_tokens_seen": 96425225, "step": 4460, "time_per_iteration": 2.767054319381714 }, { "auxiliary_loss_clip": 0.01134128, "auxiliary_loss_mlp": 0.01059054, "balance_loss_clip": 1.0525527, "balance_loss_mlp": 1.03916979, "epoch": 0.2682098301518112, "flos": 17566674727680.0, "grad_norm": 1.6839260392555548, "language_loss": 0.68334675, "learning_rate": 3.4336984227738796e-06, "loss": 0.70527858, "num_input_tokens_seen": 96443780, "step": 4461, "time_per_iteration": 2.7217342853546143 }, { "auxiliary_loss_clip": 0.0111525, "auxiliary_loss_mlp": 0.01054739, "balance_loss_clip": 1.05045152, "balance_loss_mlp": 1.03649962, "epoch": 0.2682699534044792, "flos": 18333259810560.0, "grad_norm": 1.7146103847032579, "language_loss": 0.67240328, "learning_rate": 3.43342685191282e-06, "loss": 0.69410318, "num_input_tokens_seen": 96464530, "step": 4462, "time_per_iteration": 2.730682134628296 }, { "auxiliary_loss_clip": 0.01116667, "auxiliary_loss_mlp": 0.01046675, "balance_loss_clip": 1.05230319, "balance_loss_mlp": 1.02710128, "epoch": 0.26833007665714714, "flos": 25301294144640.0, "grad_norm": 1.7796857642272712, "language_loss": 0.69503593, "learning_rate": 3.4331552266960705e-06, "loss": 0.71666932, "num_input_tokens_seen": 96483345, "step": 4463, "time_per_iteration": 2.738046407699585 }, { "auxiliary_loss_clip": 0.01118676, "auxiliary_loss_mlp": 0.01049589, "balance_loss_clip": 1.0492326, "balance_loss_mlp": 1.02862048, "epoch": 0.2683901999098151, "flos": 16099759198080.0, "grad_norm": 2.5866232358274277, "language_loss": 0.77943784, "learning_rate": 3.432883547133931e-06, "loss": 0.80112046, "num_input_tokens_seen": 96498305, "step": 4464, "time_per_iteration": 2.6794681549072266 }, { "auxiliary_loss_clip": 0.01133564, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.05244994, "balance_loss_mlp": 1.02410388, "epoch": 0.2684503231624831, "flos": 27308054154240.0, "grad_norm": 2.2986867036088285, "language_loss": 0.71375966, "learning_rate": 3.432611813236704e-06, "loss": 0.73552406, "num_input_tokens_seen": 96519740, "step": 4465, "time_per_iteration": 2.699575662612915 }, { "auxiliary_loss_clip": 0.01042347, "auxiliary_loss_mlp": 0.01001834, "balance_loss_clip": 1.02813911, "balance_loss_mlp": 0.9993788, "epoch": 0.26851044641515104, "flos": 71858007239040.0, "grad_norm": 0.7242654721351415, "language_loss": 0.53150702, "learning_rate": 3.4323400250146943e-06, "loss": 0.5519489, "num_input_tokens_seen": 96588870, "step": 4466, "time_per_iteration": 3.3984062671661377 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.0105552, "balance_loss_clip": 1.04732478, "balance_loss_mlp": 1.03381157, "epoch": 0.268570569667819, "flos": 18733771434240.0, "grad_norm": 2.1738333593055796, "language_loss": 0.74038142, "learning_rate": 3.4320681824782057e-06, "loss": 0.76212335, "num_input_tokens_seen": 96605100, "step": 4467, "time_per_iteration": 2.6631343364715576 }, { "auxiliary_loss_clip": 0.01126618, "auxiliary_loss_mlp": 0.00777879, "balance_loss_clip": 1.05088973, "balance_loss_mlp": 1.00093102, "epoch": 0.268630692920487, "flos": 18178376365440.0, "grad_norm": 3.586661477808892, "language_loss": 0.80481976, "learning_rate": 3.4317962856375493e-06, "loss": 0.82386476, "num_input_tokens_seen": 96621410, "step": 4468, "time_per_iteration": 2.64806866645813 }, { "auxiliary_loss_clip": 0.01059326, "auxiliary_loss_mlp": 0.01006331, "balance_loss_clip": 1.02527809, "balance_loss_mlp": 1.0036248, "epoch": 0.268690816173155, "flos": 68731768978560.0, "grad_norm": 0.8399316740346766, "language_loss": 0.59498715, "learning_rate": 3.4315243345030334e-06, "loss": 0.61564374, "num_input_tokens_seen": 96684810, "step": 4469, "time_per_iteration": 3.1989517211914062 }, { "auxiliary_loss_clip": 0.01156531, "auxiliary_loss_mlp": 0.01048741, "balance_loss_clip": 1.05689096, "balance_loss_mlp": 1.02854705, "epoch": 0.26875093942582295, "flos": 23293636295040.0, "grad_norm": 2.165956170420043, "language_loss": 0.82055074, "learning_rate": 3.431252329084972e-06, "loss": 0.84260345, "num_input_tokens_seen": 96701920, "step": 4470, "time_per_iteration": 2.6167352199554443 }, { "auxiliary_loss_clip": 0.01117064, "auxiliary_loss_mlp": 0.01054605, "balance_loss_clip": 1.04794455, "balance_loss_mlp": 1.03563929, "epoch": 0.2688110626784909, "flos": 21543458112000.0, "grad_norm": 1.6543166375172473, "language_loss": 0.82841349, "learning_rate": 3.4309802693936786e-06, "loss": 0.8501302, "num_input_tokens_seen": 96721260, "step": 4471, "time_per_iteration": 4.177881956100464 }, { "auxiliary_loss_clip": 0.01133274, "auxiliary_loss_mlp": 0.01045934, "balance_loss_clip": 1.05339766, "balance_loss_mlp": 1.02762365, "epoch": 0.2688711859311589, "flos": 28400600183040.0, "grad_norm": 2.017001756898941, "language_loss": 0.69309431, "learning_rate": 3.43070815543947e-06, "loss": 0.71488637, "num_input_tokens_seen": 96740385, "step": 4472, "time_per_iteration": 2.6611149311065674 }, { "auxiliary_loss_clip": 0.01150636, "auxiliary_loss_mlp": 0.01046679, "balance_loss_clip": 1.05448234, "balance_loss_mlp": 1.02882099, "epoch": 0.26893130918382685, "flos": 25994944661760.0, "grad_norm": 1.889152474147147, "language_loss": 0.67809618, "learning_rate": 3.4304359872326656e-06, "loss": 0.70006931, "num_input_tokens_seen": 96761860, "step": 4473, "time_per_iteration": 2.6570448875427246 }, { "auxiliary_loss_clip": 0.01123821, "auxiliary_loss_mlp": 0.01056077, "balance_loss_clip": 1.05778623, "balance_loss_mlp": 1.03800452, "epoch": 0.2689914324364948, "flos": 20339624770560.0, "grad_norm": 2.20378943201051, "language_loss": 0.82835853, "learning_rate": 3.4301637647835843e-06, "loss": 0.8501575, "num_input_tokens_seen": 96781890, "step": 4474, "time_per_iteration": 5.79376220703125 }, { "auxiliary_loss_clip": 0.01138349, "auxiliary_loss_mlp": 0.01055982, "balance_loss_clip": 1.05353034, "balance_loss_mlp": 1.03841054, "epoch": 0.2690515556891628, "flos": 19464553635840.0, "grad_norm": 2.404484364093812, "language_loss": 0.71004206, "learning_rate": 3.4298914881025494e-06, "loss": 0.73198539, "num_input_tokens_seen": 96800390, "step": 4475, "time_per_iteration": 2.5969674587249756 }, { "auxiliary_loss_clip": 0.01112288, "auxiliary_loss_mlp": 0.00776382, "balance_loss_clip": 1.05001771, "balance_loss_mlp": 1.00081563, "epoch": 0.26911167894183075, "flos": 18146631720960.0, "grad_norm": 1.8574153972172647, "language_loss": 0.73638999, "learning_rate": 3.4296191571998863e-06, "loss": 0.75527668, "num_input_tokens_seen": 96816685, "step": 4476, "time_per_iteration": 2.70358943939209 }, { "auxiliary_loss_clip": 0.01119256, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.05050373, "balance_loss_mlp": 1.02605665, "epoch": 0.2691718021944987, "flos": 19975131509760.0, "grad_norm": 1.5040704863343832, "language_loss": 0.80439913, "learning_rate": 3.429346772085922e-06, "loss": 0.82602954, "num_input_tokens_seen": 96836285, "step": 4477, "time_per_iteration": 4.313180208206177 }, { "auxiliary_loss_clip": 0.01097359, "auxiliary_loss_mlp": 0.0104976, "balance_loss_clip": 1.04965031, "balance_loss_mlp": 1.0309844, "epoch": 0.2692319254471667, "flos": 37447215770880.0, "grad_norm": 1.7971929656919947, "language_loss": 0.65181434, "learning_rate": 3.429074332770984e-06, "loss": 0.67328548, "num_input_tokens_seen": 96857745, "step": 4478, "time_per_iteration": 2.8882603645324707 }, { "auxiliary_loss_clip": 0.01130488, "auxiliary_loss_mlp": 0.01050401, "balance_loss_clip": 1.04841042, "balance_loss_mlp": 1.03163743, "epoch": 0.26929204869983464, "flos": 22127796564480.0, "grad_norm": 1.933707281531851, "language_loss": 0.80987537, "learning_rate": 3.4288018392654047e-06, "loss": 0.83168429, "num_input_tokens_seen": 96877295, "step": 4479, "time_per_iteration": 2.670370578765869 }, { "auxiliary_loss_clip": 0.01127626, "auxiliary_loss_mlp": 0.00776143, "balance_loss_clip": 1.05010593, "balance_loss_mlp": 1.0010041, "epoch": 0.2693521719525026, "flos": 19792813052160.0, "grad_norm": 16.364114673072947, "language_loss": 0.81205857, "learning_rate": 3.4285292915795166e-06, "loss": 0.83109629, "num_input_tokens_seen": 96896160, "step": 4480, "time_per_iteration": 2.687922954559326 }, { "auxiliary_loss_clip": 0.01098242, "auxiliary_loss_mlp": 0.01051142, "balance_loss_clip": 1.04720628, "balance_loss_mlp": 1.03243792, "epoch": 0.2694122952051706, "flos": 20994383836800.0, "grad_norm": 1.5167677573266813, "language_loss": 0.77982032, "learning_rate": 3.4282566897236543e-06, "loss": 0.80131412, "num_input_tokens_seen": 96915410, "step": 4481, "time_per_iteration": 2.783400058746338 }, { "auxiliary_loss_clip": 0.01138325, "auxiliary_loss_mlp": 0.01055373, "balance_loss_clip": 1.05098486, "balance_loss_mlp": 1.03693104, "epoch": 0.2694724184578386, "flos": 25849291011840.0, "grad_norm": 1.817845708033507, "language_loss": 0.74072635, "learning_rate": 3.4279840337081547e-06, "loss": 0.76266336, "num_input_tokens_seen": 96937865, "step": 4482, "time_per_iteration": 2.704923629760742 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01046467, "balance_loss_clip": 1.05258846, "balance_loss_mlp": 1.02826333, "epoch": 0.26953254171050656, "flos": 21726961718400.0, "grad_norm": 2.016330221700464, "language_loss": 0.72562164, "learning_rate": 3.4277113235433584e-06, "loss": 0.74733007, "num_input_tokens_seen": 96957710, "step": 4483, "time_per_iteration": 2.697889804840088 }, { "auxiliary_loss_clip": 0.0113896, "auxiliary_loss_mlp": 0.01056121, "balance_loss_clip": 1.04867983, "balance_loss_mlp": 1.03658295, "epoch": 0.2695926649631745, "flos": 19682926369920.0, "grad_norm": 2.3663265895203356, "language_loss": 0.86904967, "learning_rate": 3.427438559239605e-06, "loss": 0.89100051, "num_input_tokens_seen": 96975890, "step": 4484, "time_per_iteration": 2.6893441677093506 }, { "auxiliary_loss_clip": 0.01139698, "auxiliary_loss_mlp": 0.01049025, "balance_loss_clip": 1.05224931, "balance_loss_mlp": 1.03148949, "epoch": 0.2696527882158425, "flos": 32886596724480.0, "grad_norm": 1.783447205979712, "language_loss": 0.6663093, "learning_rate": 3.427165740807239e-06, "loss": 0.68819648, "num_input_tokens_seen": 96998595, "step": 4485, "time_per_iteration": 2.795172929763794 }, { "auxiliary_loss_clip": 0.01112833, "auxiliary_loss_mlp": 0.01053324, "balance_loss_clip": 1.04507363, "balance_loss_mlp": 1.03475094, "epoch": 0.26971291146851045, "flos": 12124843320960.0, "grad_norm": 2.5437851063433743, "language_loss": 0.73155308, "learning_rate": 3.426892868256604e-06, "loss": 0.75321472, "num_input_tokens_seen": 97013715, "step": 4486, "time_per_iteration": 2.6854116916656494 }, { "auxiliary_loss_clip": 0.01156209, "auxiliary_loss_mlp": 0.01047906, "balance_loss_clip": 1.05688012, "balance_loss_mlp": 1.03062034, "epoch": 0.2697730347211784, "flos": 22634459856000.0, "grad_norm": 2.2389379935408456, "language_loss": 0.84326887, "learning_rate": 3.4266199415980495e-06, "loss": 0.86531007, "num_input_tokens_seen": 97031570, "step": 4487, "time_per_iteration": 2.6117801666259766 }, { "auxiliary_loss_clip": 0.01127332, "auxiliary_loss_mlp": 0.0105083, "balance_loss_clip": 1.05733204, "balance_loss_mlp": 1.03228104, "epoch": 0.2698331579738464, "flos": 23513050523520.0, "grad_norm": 2.345170862120161, "language_loss": 0.7189706, "learning_rate": 3.4263469608419234e-06, "loss": 0.74075222, "num_input_tokens_seen": 97049815, "step": 4488, "time_per_iteration": 2.7384660243988037 }, { "auxiliary_loss_clip": 0.01074601, "auxiliary_loss_mlp": 0.01061378, "balance_loss_clip": 1.0494225, "balance_loss_mlp": 1.04040885, "epoch": 0.26989328122651435, "flos": 24641040297600.0, "grad_norm": 1.6359957516545125, "language_loss": 0.83725536, "learning_rate": 3.426073925998578e-06, "loss": 0.85861516, "num_input_tokens_seen": 97067570, "step": 4489, "time_per_iteration": 2.9274613857269287 }, { "auxiliary_loss_clip": 0.01129648, "auxiliary_loss_mlp": 0.01061235, "balance_loss_clip": 1.05630314, "balance_loss_mlp": 1.04203057, "epoch": 0.2699534044791823, "flos": 10772555068800.0, "grad_norm": 2.6678463269995785, "language_loss": 0.90056908, "learning_rate": 3.4258008370783656e-06, "loss": 0.9224779, "num_input_tokens_seen": 97082180, "step": 4490, "time_per_iteration": 2.9096486568450928 }, { "auxiliary_loss_clip": 0.01075397, "auxiliary_loss_mlp": 0.01052666, "balance_loss_clip": 1.04493999, "balance_loss_mlp": 1.03319883, "epoch": 0.2700135277318503, "flos": 36171597098880.0, "grad_norm": 2.0876908666200573, "language_loss": 0.73380542, "learning_rate": 3.4255276940916434e-06, "loss": 0.75508606, "num_input_tokens_seen": 97103470, "step": 4491, "time_per_iteration": 2.9016802310943604 }, { "auxiliary_loss_clip": 0.01156852, "auxiliary_loss_mlp": 0.01052294, "balance_loss_clip": 1.05944943, "balance_loss_mlp": 1.03453195, "epoch": 0.27007365098451824, "flos": 17418614866560.0, "grad_norm": 2.7575700534068783, "language_loss": 0.74795783, "learning_rate": 3.4252544970487676e-06, "loss": 0.77004933, "num_input_tokens_seen": 97118100, "step": 4492, "time_per_iteration": 2.6685187816619873 }, { "auxiliary_loss_clip": 0.01130467, "auxiliary_loss_mlp": 0.01050253, "balance_loss_clip": 1.05300546, "balance_loss_mlp": 1.03205013, "epoch": 0.2701337742371862, "flos": 23185688947200.0, "grad_norm": 3.551039047250381, "language_loss": 0.89015245, "learning_rate": 3.4249812459600986e-06, "loss": 0.91195965, "num_input_tokens_seen": 97136765, "step": 4493, "time_per_iteration": 2.7044742107391357 }, { "auxiliary_loss_clip": 0.01142037, "auxiliary_loss_mlp": 0.0104825, "balance_loss_clip": 1.05408192, "balance_loss_mlp": 1.03079772, "epoch": 0.2701938974898542, "flos": 24389450461440.0, "grad_norm": 1.665337194117132, "language_loss": 0.71139705, "learning_rate": 3.424707940835998e-06, "loss": 0.73329991, "num_input_tokens_seen": 97157470, "step": 4494, "time_per_iteration": 2.6299519538879395 }, { "auxiliary_loss_clip": 0.01120214, "auxiliary_loss_mlp": 0.01045805, "balance_loss_clip": 1.05193532, "balance_loss_mlp": 1.02893662, "epoch": 0.2702540207425222, "flos": 26214322976640.0, "grad_norm": 2.4718809008283045, "language_loss": 0.8642354, "learning_rate": 3.42443458168683e-06, "loss": 0.88589561, "num_input_tokens_seen": 97176905, "step": 4495, "time_per_iteration": 2.627389907836914 }, { "auxiliary_loss_clip": 0.01151814, "auxiliary_loss_mlp": 0.0105053, "balance_loss_clip": 1.05591631, "balance_loss_mlp": 1.03308964, "epoch": 0.27031414399519016, "flos": 22926377687040.0, "grad_norm": 2.1521214825296844, "language_loss": 0.76781964, "learning_rate": 3.424161168522959e-06, "loss": 0.78984308, "num_input_tokens_seen": 97196380, "step": 4496, "time_per_iteration": 2.5360703468322754 }, { "auxiliary_loss_clip": 0.01064272, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.03151321, "balance_loss_mlp": 1.04716671, "epoch": 0.2703742672478581, "flos": 63019780404480.0, "grad_norm": 0.7153442156657138, "language_loss": 0.50134224, "learning_rate": 3.423887701354754e-06, "loss": 0.52248067, "num_input_tokens_seen": 97260100, "step": 4497, "time_per_iteration": 3.1133949756622314 }, { "auxiliary_loss_clip": 0.01106563, "auxiliary_loss_mlp": 0.01051954, "balance_loss_clip": 1.05492568, "balance_loss_mlp": 1.03482318, "epoch": 0.2704343905005261, "flos": 18840820942080.0, "grad_norm": 2.421164292554959, "language_loss": 0.72386497, "learning_rate": 3.4236141801925847e-06, "loss": 0.74545014, "num_input_tokens_seen": 97277935, "step": 4498, "time_per_iteration": 2.7409775257110596 }, { "auxiliary_loss_clip": 0.01038432, "auxiliary_loss_mlp": 0.01028244, "balance_loss_clip": 1.0322926, "balance_loss_mlp": 1.02582395, "epoch": 0.27049451375319405, "flos": 71233412618880.0, "grad_norm": 0.7537228186848703, "language_loss": 0.5917033, "learning_rate": 3.4233406050468237e-06, "loss": 0.61237001, "num_input_tokens_seen": 97338845, "step": 4499, "time_per_iteration": 3.2331602573394775 }, { "auxiliary_loss_clip": 0.01124574, "auxiliary_loss_mlp": 0.01044613, "balance_loss_clip": 1.05154204, "balance_loss_mlp": 1.02593243, "epoch": 0.270554637005862, "flos": 24278594112000.0, "grad_norm": 2.1159538878254756, "language_loss": 0.73629957, "learning_rate": 3.4230669759278438e-06, "loss": 0.75799143, "num_input_tokens_seen": 97356640, "step": 4500, "time_per_iteration": 2.7513487339019775 }, { "auxiliary_loss_clip": 0.01116688, "auxiliary_loss_mlp": 0.01047016, "balance_loss_clip": 1.04657793, "balance_loss_mlp": 1.02878881, "epoch": 0.27061476025853, "flos": 17632318832640.0, "grad_norm": 2.8997006330289925, "language_loss": 0.81041664, "learning_rate": 3.4227932928460215e-06, "loss": 0.83205366, "num_input_tokens_seen": 97372585, "step": 4501, "time_per_iteration": 2.703014850616455 }, { "auxiliary_loss_clip": 0.01104056, "auxiliary_loss_mlp": 0.01053779, "balance_loss_clip": 1.04828477, "balance_loss_mlp": 1.03331053, "epoch": 0.27067488351119795, "flos": 22710123855360.0, "grad_norm": 4.2139696132912565, "language_loss": 0.7261312, "learning_rate": 3.422519555811735e-06, "loss": 0.74770957, "num_input_tokens_seen": 97393315, "step": 4502, "time_per_iteration": 2.732167959213257 }, { "auxiliary_loss_clip": 0.01129704, "auxiliary_loss_mlp": 0.01047167, "balance_loss_clip": 1.04821455, "balance_loss_mlp": 1.0268774, "epoch": 0.2707350067638659, "flos": 41719616087040.0, "grad_norm": 1.748421457410976, "language_loss": 0.67973912, "learning_rate": 3.4222457648353642e-06, "loss": 0.70150787, "num_input_tokens_seen": 97417860, "step": 4503, "time_per_iteration": 2.7950186729431152 }, { "auxiliary_loss_clip": 0.01100008, "auxiliary_loss_mlp": 0.01051668, "balance_loss_clip": 1.04750037, "balance_loss_mlp": 1.03180754, "epoch": 0.2707951300165339, "flos": 20193037367040.0, "grad_norm": 1.847411158173202, "language_loss": 0.67971921, "learning_rate": 3.4219719199272918e-06, "loss": 0.70123595, "num_input_tokens_seen": 97436780, "step": 4504, "time_per_iteration": 2.7830374240875244 }, { "auxiliary_loss_clip": 0.01142201, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.05604792, "balance_loss_mlp": 1.03451371, "epoch": 0.27085525326920185, "flos": 21433966479360.0, "grad_norm": 1.4870002594081857, "language_loss": 0.75395846, "learning_rate": 3.421698021097902e-06, "loss": 0.77590245, "num_input_tokens_seen": 97456190, "step": 4505, "time_per_iteration": 2.6758666038513184 }, { "auxiliary_loss_clip": 0.01155407, "auxiliary_loss_mlp": 0.01064618, "balance_loss_clip": 1.05439496, "balance_loss_mlp": 1.04436409, "epoch": 0.2709153765218698, "flos": 17675232606720.0, "grad_norm": 2.0635482699578254, "language_loss": 0.73474276, "learning_rate": 3.42142406835758e-06, "loss": 0.75694299, "num_input_tokens_seen": 97474545, "step": 4506, "time_per_iteration": 2.652395009994507 }, { "auxiliary_loss_clip": 0.01130629, "auxiliary_loss_mlp": 0.01053462, "balance_loss_clip": 1.05147469, "balance_loss_mlp": 1.0338285, "epoch": 0.2709754997745378, "flos": 24456243801600.0, "grad_norm": 2.6352592870517144, "language_loss": 0.80730569, "learning_rate": 3.421150061716715e-06, "loss": 0.82914662, "num_input_tokens_seen": 97494520, "step": 4507, "time_per_iteration": 2.7858307361602783 }, { "auxiliary_loss_clip": 0.01041671, "auxiliary_loss_mlp": 0.010698, "balance_loss_clip": 1.0261147, "balance_loss_mlp": 1.0667243, "epoch": 0.2710356230272058, "flos": 65210798206080.0, "grad_norm": 0.7655673562950965, "language_loss": 0.5085085, "learning_rate": 3.420876001185698e-06, "loss": 0.52962321, "num_input_tokens_seen": 97552455, "step": 4508, "time_per_iteration": 3.144418716430664 }, { "auxiliary_loss_clip": 0.01072779, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 1.04359698, "balance_loss_mlp": 1.02843356, "epoch": 0.27109574627987376, "flos": 25484438615040.0, "grad_norm": 1.9710162430227722, "language_loss": 0.74710357, "learning_rate": 3.4206018867749197e-06, "loss": 0.76829731, "num_input_tokens_seen": 97572650, "step": 4509, "time_per_iteration": 2.8052053451538086 }, { "auxiliary_loss_clip": 0.01130819, "auxiliary_loss_mlp": 0.01042284, "balance_loss_clip": 1.05107474, "balance_loss_mlp": 1.0254159, "epoch": 0.2711558695325417, "flos": 19682782715520.0, "grad_norm": 2.0468089657674353, "language_loss": 0.70937192, "learning_rate": 3.4203277184947757e-06, "loss": 0.73110294, "num_input_tokens_seen": 97591150, "step": 4510, "time_per_iteration": 2.6244139671325684 }, { "auxiliary_loss_clip": 0.01135912, "auxiliary_loss_mlp": 0.0103914, "balance_loss_clip": 1.05330467, "balance_loss_mlp": 1.02156901, "epoch": 0.2712159927852097, "flos": 18587758648320.0, "grad_norm": 2.4701723872261256, "language_loss": 0.70409644, "learning_rate": 3.4200534963556627e-06, "loss": 0.72584701, "num_input_tokens_seen": 97607410, "step": 4511, "time_per_iteration": 4.112820863723755 }, { "auxiliary_loss_clip": 0.0112023, "auxiliary_loss_mlp": 0.01049105, "balance_loss_clip": 1.048491, "balance_loss_mlp": 1.03115225, "epoch": 0.27127611603787766, "flos": 25630235919360.0, "grad_norm": 6.028868725677894, "language_loss": 0.81324005, "learning_rate": 3.419779220367979e-06, "loss": 0.83493352, "num_input_tokens_seen": 97626870, "step": 4512, "time_per_iteration": 4.285844087600708 }, { "auxiliary_loss_clip": 0.01147816, "auxiliary_loss_mlp": 0.01038614, "balance_loss_clip": 1.05365086, "balance_loss_mlp": 1.02323616, "epoch": 0.2713362392905456, "flos": 23148952312320.0, "grad_norm": 2.7707983308205053, "language_loss": 0.80467856, "learning_rate": 3.419504890542124e-06, "loss": 0.82654285, "num_input_tokens_seen": 97646595, "step": 4513, "time_per_iteration": 4.415290117263794 }, { "auxiliary_loss_clip": 0.01119685, "auxiliary_loss_mlp": 0.01044412, "balance_loss_clip": 1.04594898, "balance_loss_mlp": 1.02709103, "epoch": 0.2713963625432136, "flos": 18366045949440.0, "grad_norm": 1.8005970142501413, "language_loss": 0.88150048, "learning_rate": 3.4192305068885026e-06, "loss": 0.90314144, "num_input_tokens_seen": 97665485, "step": 4514, "time_per_iteration": 2.691697835922241 }, { "auxiliary_loss_clip": 0.01129072, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.05358005, "balance_loss_mlp": 1.03337574, "epoch": 0.27145648579588155, "flos": 22491751121280.0, "grad_norm": 1.6419144417830658, "language_loss": 0.91461927, "learning_rate": 3.418956069417517e-06, "loss": 0.93642819, "num_input_tokens_seen": 97683800, "step": 4515, "time_per_iteration": 2.6709890365600586 }, { "auxiliary_loss_clip": 0.01100451, "auxiliary_loss_mlp": 0.01057835, "balance_loss_clip": 1.04920852, "balance_loss_mlp": 1.03761721, "epoch": 0.2715166090485495, "flos": 19239177749760.0, "grad_norm": 2.0250040358395944, "language_loss": 0.74093282, "learning_rate": 3.4186815781395756e-06, "loss": 0.76251566, "num_input_tokens_seen": 97700505, "step": 4516, "time_per_iteration": 2.7001607418060303 }, { "auxiliary_loss_clip": 0.01136738, "auxiliary_loss_mlp": 0.01052795, "balance_loss_clip": 1.05046439, "balance_loss_mlp": 1.03483033, "epoch": 0.2715767323012175, "flos": 17709598944000.0, "grad_norm": 2.811509606055916, "language_loss": 0.75989574, "learning_rate": 3.4184070330650866e-06, "loss": 0.78179109, "num_input_tokens_seen": 97717410, "step": 4517, "time_per_iteration": 4.207966089248657 }, { "auxiliary_loss_clip": 0.01097642, "auxiliary_loss_mlp": 0.01058771, "balance_loss_clip": 1.04378986, "balance_loss_mlp": 1.03962636, "epoch": 0.27163685555388545, "flos": 22382834106240.0, "grad_norm": 2.3161178488466097, "language_loss": 0.77046895, "learning_rate": 3.4181324342044607e-06, "loss": 0.79203308, "num_input_tokens_seen": 97734545, "step": 4518, "time_per_iteration": 2.754009246826172 }, { "auxiliary_loss_clip": 0.01118909, "auxiliary_loss_mlp": 0.01047823, "balance_loss_clip": 1.05136919, "balance_loss_mlp": 1.03077579, "epoch": 0.2716969788065534, "flos": 22346708002560.0, "grad_norm": 2.717268994046331, "language_loss": 0.68388188, "learning_rate": 3.41785778156811e-06, "loss": 0.70554924, "num_input_tokens_seen": 97754000, "step": 4519, "time_per_iteration": 2.7800872325897217 }, { "auxiliary_loss_clip": 0.01134075, "auxiliary_loss_mlp": 0.01053278, "balance_loss_clip": 1.05009973, "balance_loss_mlp": 1.03611171, "epoch": 0.2717571020592214, "flos": 25228467319680.0, "grad_norm": 2.367483937305651, "language_loss": 0.75572526, "learning_rate": 3.417583075166451e-06, "loss": 0.7775988, "num_input_tokens_seen": 97772080, "step": 4520, "time_per_iteration": 2.694591760635376 }, { "auxiliary_loss_clip": 0.01138275, "auxiliary_loss_mlp": 0.0106095, "balance_loss_clip": 1.05209494, "balance_loss_mlp": 1.04226971, "epoch": 0.2718172253118894, "flos": 20189769229440.0, "grad_norm": 3.3698654303080935, "language_loss": 0.76434267, "learning_rate": 3.4173083150099e-06, "loss": 0.78633487, "num_input_tokens_seen": 97789370, "step": 4521, "time_per_iteration": 2.675443649291992 }, { "auxiliary_loss_clip": 0.01117262, "auxiliary_loss_mlp": 0.0106414, "balance_loss_clip": 1.04636955, "balance_loss_mlp": 1.04578209, "epoch": 0.27187734856455736, "flos": 14319129260160.0, "grad_norm": 2.1933848209734936, "language_loss": 0.75041616, "learning_rate": 3.417033501108875e-06, "loss": 0.77223015, "num_input_tokens_seen": 97807385, "step": 4522, "time_per_iteration": 2.769519329071045 }, { "auxiliary_loss_clip": 0.01151707, "auxiliary_loss_mlp": 0.01045506, "balance_loss_clip": 1.05433989, "balance_loss_mlp": 1.02813768, "epoch": 0.27193747181722533, "flos": 21107682311040.0, "grad_norm": 1.9328965147806931, "language_loss": 0.73074079, "learning_rate": 3.416758633473798e-06, "loss": 0.75271285, "num_input_tokens_seen": 97827930, "step": 4523, "time_per_iteration": 2.6642134189605713 }, { "auxiliary_loss_clip": 0.01120278, "auxiliary_loss_mlp": 0.01048373, "balance_loss_clip": 1.05034256, "balance_loss_mlp": 1.03014588, "epoch": 0.2719975950698933, "flos": 19682782715520.0, "grad_norm": 1.3899676528871532, "language_loss": 0.74113363, "learning_rate": 3.4164837121150915e-06, "loss": 0.76282012, "num_input_tokens_seen": 97847440, "step": 4524, "time_per_iteration": 2.6365647315979004 }, { "auxiliary_loss_clip": 0.0115251, "auxiliary_loss_mlp": 0.01059779, "balance_loss_clip": 1.05642283, "balance_loss_mlp": 1.04233861, "epoch": 0.27205771832256126, "flos": 24754482426240.0, "grad_norm": 1.6567279945506783, "language_loss": 0.7639389, "learning_rate": 3.4162087370431803e-06, "loss": 0.78606176, "num_input_tokens_seen": 97867620, "step": 4525, "time_per_iteration": 2.7116904258728027 }, { "auxiliary_loss_clip": 0.01133976, "auxiliary_loss_mlp": 0.01063183, "balance_loss_clip": 1.05110538, "balance_loss_mlp": 1.0458858, "epoch": 0.2721178415752292, "flos": 21755581879680.0, "grad_norm": 1.8049087044415455, "language_loss": 0.81449121, "learning_rate": 3.4159337082684926e-06, "loss": 0.8364628, "num_input_tokens_seen": 97884345, "step": 4526, "time_per_iteration": 2.583151340484619 }, { "auxiliary_loss_clip": 0.01150721, "auxiliary_loss_mlp": 0.01050593, "balance_loss_clip": 1.05157495, "balance_loss_mlp": 1.03235435, "epoch": 0.2721779648278972, "flos": 12676826597760.0, "grad_norm": 2.689071598576449, "language_loss": 0.77230763, "learning_rate": 3.4156586258014566e-06, "loss": 0.79432082, "num_input_tokens_seen": 97901500, "step": 4527, "time_per_iteration": 2.6060924530029297 }, { "auxiliary_loss_clip": 0.01109469, "auxiliary_loss_mlp": 0.00777538, "balance_loss_clip": 1.04898691, "balance_loss_mlp": 1.00073338, "epoch": 0.27223808808056515, "flos": 16253206099200.0, "grad_norm": 2.5564103940467313, "language_loss": 0.8187297, "learning_rate": 3.415383489652503e-06, "loss": 0.83759975, "num_input_tokens_seen": 97917800, "step": 4528, "time_per_iteration": 2.697845458984375 }, { "auxiliary_loss_clip": 0.01116518, "auxiliary_loss_mlp": 0.01058829, "balance_loss_clip": 1.05005443, "balance_loss_mlp": 1.04094744, "epoch": 0.2722982113332331, "flos": 27745805203200.0, "grad_norm": 1.774189879269534, "language_loss": 0.77156031, "learning_rate": 3.4151082998320666e-06, "loss": 0.7933138, "num_input_tokens_seen": 97937225, "step": 4529, "time_per_iteration": 2.75425124168396 }, { "auxiliary_loss_clip": 0.01123493, "auxiliary_loss_mlp": 0.01053103, "balance_loss_clip": 1.0518961, "balance_loss_mlp": 1.03634179, "epoch": 0.2723583345859011, "flos": 21726243446400.0, "grad_norm": 2.104422440945624, "language_loss": 0.82359695, "learning_rate": 3.4148330563505805e-06, "loss": 0.84536296, "num_input_tokens_seen": 97956845, "step": 4530, "time_per_iteration": 2.6822023391723633 }, { "auxiliary_loss_clip": 0.01136812, "auxiliary_loss_mlp": 0.01047087, "balance_loss_clip": 1.05334496, "balance_loss_mlp": 1.02971828, "epoch": 0.27241845783856905, "flos": 17347260499200.0, "grad_norm": 2.321764638586046, "language_loss": 0.91554427, "learning_rate": 3.4145577592184838e-06, "loss": 0.93738323, "num_input_tokens_seen": 97972465, "step": 4531, "time_per_iteration": 2.6979331970214844 }, { "auxiliary_loss_clip": 0.01138188, "auxiliary_loss_mlp": 0.01046663, "balance_loss_clip": 1.05187678, "balance_loss_mlp": 1.02856672, "epoch": 0.272478581091237, "flos": 24754302858240.0, "grad_norm": 1.9110068503115385, "language_loss": 0.76398945, "learning_rate": 3.4142824084462155e-06, "loss": 0.78583801, "num_input_tokens_seen": 97990770, "step": 4532, "time_per_iteration": 2.6663877964019775 }, { "auxiliary_loss_clip": 0.01113354, "auxiliary_loss_mlp": 0.01040904, "balance_loss_clip": 1.05224109, "balance_loss_mlp": 1.02386856, "epoch": 0.272538704343905, "flos": 17890624512000.0, "grad_norm": 2.311201731752709, "language_loss": 0.88514459, "learning_rate": 3.4140070040442162e-06, "loss": 0.90668714, "num_input_tokens_seen": 98005775, "step": 4533, "time_per_iteration": 2.693161725997925 }, { "auxiliary_loss_clip": 0.01122748, "auxiliary_loss_mlp": 0.01040937, "balance_loss_clip": 1.05127299, "balance_loss_mlp": 1.02398562, "epoch": 0.272598827596573, "flos": 22932016122240.0, "grad_norm": 2.2174577403643245, "language_loss": 0.71288157, "learning_rate": 3.413731546022929e-06, "loss": 0.73451841, "num_input_tokens_seen": 98025750, "step": 4534, "time_per_iteration": 2.7371840476989746 }, { "auxiliary_loss_clip": 0.01121649, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.05089378, "balance_loss_mlp": 1.02177453, "epoch": 0.27265895084924097, "flos": 24238409771520.0, "grad_norm": 1.6997646677502514, "language_loss": 0.91605014, "learning_rate": 3.4134560343928005e-06, "loss": 0.93766987, "num_input_tokens_seen": 98044955, "step": 4535, "time_per_iteration": 2.72127103805542 }, { "auxiliary_loss_clip": 0.0113065, "auxiliary_loss_mlp": 0.01045251, "balance_loss_clip": 1.05495596, "balance_loss_mlp": 1.02739298, "epoch": 0.27271907410190893, "flos": 27013155494400.0, "grad_norm": 1.6448383128638457, "language_loss": 0.72919363, "learning_rate": 3.4131804691642778e-06, "loss": 0.7509526, "num_input_tokens_seen": 98065860, "step": 4536, "time_per_iteration": 2.778991460800171 }, { "auxiliary_loss_clip": 0.01137601, "auxiliary_loss_mlp": 0.01044231, "balance_loss_clip": 1.05134857, "balance_loss_mlp": 1.02601612, "epoch": 0.2727791973545769, "flos": 34452588942720.0, "grad_norm": 1.7760428855271044, "language_loss": 0.71682841, "learning_rate": 3.41290485034781e-06, "loss": 0.73864675, "num_input_tokens_seen": 98085450, "step": 4537, "time_per_iteration": 2.7746009826660156 }, { "auxiliary_loss_clip": 0.01119602, "auxiliary_loss_mlp": 0.01042982, "balance_loss_clip": 1.04899096, "balance_loss_mlp": 1.02455187, "epoch": 0.27283932060724486, "flos": 15041723160960.0, "grad_norm": 2.103574663853892, "language_loss": 0.77419543, "learning_rate": 3.4126291779538485e-06, "loss": 0.79582125, "num_input_tokens_seen": 98099115, "step": 4538, "time_per_iteration": 2.6432113647460938 }, { "auxiliary_loss_clip": 0.011333, "auxiliary_loss_mlp": 0.01044735, "balance_loss_clip": 1.05075216, "balance_loss_mlp": 1.02784324, "epoch": 0.2728994438599128, "flos": 21652411040640.0, "grad_norm": 1.824827492408775, "language_loss": 0.90160263, "learning_rate": 3.412353451992847e-06, "loss": 0.923383, "num_input_tokens_seen": 98118415, "step": 4539, "time_per_iteration": 2.620088815689087 }, { "auxiliary_loss_clip": 0.0112346, "auxiliary_loss_mlp": 0.01044264, "balance_loss_clip": 1.04970992, "balance_loss_mlp": 1.0250001, "epoch": 0.2729595671125808, "flos": 17488424949120.0, "grad_norm": 1.7778813807473632, "language_loss": 0.88033229, "learning_rate": 3.4120776724752607e-06, "loss": 0.90200949, "num_input_tokens_seen": 98136300, "step": 4540, "time_per_iteration": 2.7115092277526855 }, { "auxiliary_loss_clip": 0.01139055, "auxiliary_loss_mlp": 0.00775653, "balance_loss_clip": 1.0515871, "balance_loss_mlp": 1.00068974, "epoch": 0.27301969036524876, "flos": 19318145800320.0, "grad_norm": 3.2240434674097758, "language_loss": 0.82471287, "learning_rate": 3.4118018394115476e-06, "loss": 0.84385997, "num_input_tokens_seen": 98154580, "step": 4541, "time_per_iteration": 2.6112682819366455 }, { "auxiliary_loss_clip": 0.01123955, "auxiliary_loss_mlp": 0.01045117, "balance_loss_clip": 1.05166435, "balance_loss_mlp": 1.02798617, "epoch": 0.2730798136179167, "flos": 21065666376960.0, "grad_norm": 2.102491799578544, "language_loss": 0.79535306, "learning_rate": 3.4115259528121678e-06, "loss": 0.81704378, "num_input_tokens_seen": 98173115, "step": 4542, "time_per_iteration": 2.7202932834625244 }, { "auxiliary_loss_clip": 0.01130053, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.05406725, "balance_loss_mlp": 1.02263296, "epoch": 0.2731399368705847, "flos": 19171737964800.0, "grad_norm": 1.955696716620197, "language_loss": 0.89326978, "learning_rate": 3.411250012687582e-06, "loss": 0.91496956, "num_input_tokens_seen": 98190260, "step": 4543, "time_per_iteration": 2.6846654415130615 }, { "auxiliary_loss_clip": 0.01118776, "auxiliary_loss_mlp": 0.00776653, "balance_loss_clip": 1.04913735, "balance_loss_mlp": 1.00080073, "epoch": 0.27320006012325265, "flos": 18290130554880.0, "grad_norm": 2.4410785724718997, "language_loss": 0.64012986, "learning_rate": 3.410974019048255e-06, "loss": 0.65908414, "num_input_tokens_seen": 98207115, "step": 4544, "time_per_iteration": 2.6373775005340576 }, { "auxiliary_loss_clip": 0.01123945, "auxiliary_loss_mlp": 0.01044578, "balance_loss_clip": 1.05455351, "balance_loss_mlp": 1.02582633, "epoch": 0.2732601833759206, "flos": 34860929731200.0, "grad_norm": 3.5876362405970643, "language_loss": 0.69788039, "learning_rate": 3.410697971904651e-06, "loss": 0.71956557, "num_input_tokens_seen": 98230610, "step": 4545, "time_per_iteration": 2.7943291664123535 }, { "auxiliary_loss_clip": 0.0103839, "auxiliary_loss_mlp": 0.01023664, "balance_loss_clip": 1.02576709, "balance_loss_mlp": 1.02123213, "epoch": 0.2733203066285886, "flos": 53910824762880.0, "grad_norm": 0.7314456658795918, "language_loss": 0.61636353, "learning_rate": 3.4104218712672383e-06, "loss": 0.63698411, "num_input_tokens_seen": 98293585, "step": 4546, "time_per_iteration": 3.2244455814361572 }, { "auxiliary_loss_clip": 0.0105925, "auxiliary_loss_mlp": 0.01053726, "balance_loss_clip": 1.04915786, "balance_loss_mlp": 1.03472424, "epoch": 0.2733804298812566, "flos": 20660378244480.0, "grad_norm": 1.905103737754333, "language_loss": 0.6467241, "learning_rate": 3.410145717146488e-06, "loss": 0.66785389, "num_input_tokens_seen": 98311680, "step": 4547, "time_per_iteration": 2.7815287113189697 }, { "auxiliary_loss_clip": 0.01123347, "auxiliary_loss_mlp": 0.00774125, "balance_loss_clip": 1.05267262, "balance_loss_mlp": 1.00081313, "epoch": 0.27344055313392457, "flos": 25884339707520.0, "grad_norm": 1.90846373489731, "language_loss": 0.77248073, "learning_rate": 3.4098695095528694e-06, "loss": 0.79145551, "num_input_tokens_seen": 98330770, "step": 4548, "time_per_iteration": 2.8113017082214355 }, { "auxiliary_loss_clip": 0.01122557, "auxiliary_loss_mlp": 0.01050902, "balance_loss_clip": 1.05430245, "balance_loss_mlp": 1.03526139, "epoch": 0.27350067638659253, "flos": 22929753565440.0, "grad_norm": 1.9713428286290122, "language_loss": 0.82792878, "learning_rate": 3.4095932484968585e-06, "loss": 0.84966338, "num_input_tokens_seen": 98349860, "step": 4549, "time_per_iteration": 2.6938650608062744 }, { "auxiliary_loss_clip": 0.01135405, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.04898036, "balance_loss_mlp": 1.02902281, "epoch": 0.2735607996392605, "flos": 16574821499520.0, "grad_norm": 3.4543610040263655, "language_loss": 0.71193838, "learning_rate": 3.4093169339889305e-06, "loss": 0.73377967, "num_input_tokens_seen": 98367040, "step": 4550, "time_per_iteration": 2.638643503189087 }, { "auxiliary_loss_clip": 0.01107347, "auxiliary_loss_mlp": 0.01042242, "balance_loss_clip": 1.05066109, "balance_loss_mlp": 1.02569556, "epoch": 0.27362092289192846, "flos": 19645291895040.0, "grad_norm": 3.3050607953849576, "language_loss": 0.78899491, "learning_rate": 3.409040566039563e-06, "loss": 0.81049079, "num_input_tokens_seen": 98384010, "step": 4551, "time_per_iteration": 4.352613210678101 }, { "auxiliary_loss_clip": 0.01107945, "auxiliary_loss_mlp": 0.01052105, "balance_loss_clip": 1.04898548, "balance_loss_mlp": 1.03342533, "epoch": 0.27368104614459643, "flos": 17639142416640.0, "grad_norm": 2.480443972085862, "language_loss": 0.71220398, "learning_rate": 3.4087641446592362e-06, "loss": 0.73380452, "num_input_tokens_seen": 98399625, "step": 4552, "time_per_iteration": 4.194540739059448 }, { "auxiliary_loss_clip": 0.01123037, "auxiliary_loss_mlp": 0.01045225, "balance_loss_clip": 1.05144608, "balance_loss_mlp": 1.0275104, "epoch": 0.2737411693972644, "flos": 21580015178880.0, "grad_norm": 2.1026303213651967, "language_loss": 0.71636003, "learning_rate": 3.408487669858431e-06, "loss": 0.73804259, "num_input_tokens_seen": 98417310, "step": 4553, "time_per_iteration": 2.7323882579803467 }, { "auxiliary_loss_clip": 0.01134032, "auxiliary_loss_mlp": 0.01045217, "balance_loss_clip": 1.05039358, "balance_loss_mlp": 1.02658415, "epoch": 0.27380129264993236, "flos": 25484043565440.0, "grad_norm": 1.7325126580228065, "language_loss": 0.58917797, "learning_rate": 3.4082111416476337e-06, "loss": 0.6109705, "num_input_tokens_seen": 98438670, "step": 4554, "time_per_iteration": 2.7384533882141113 }, { "auxiliary_loss_clip": 0.01129927, "auxiliary_loss_mlp": 0.01042216, "balance_loss_clip": 1.05440903, "balance_loss_mlp": 1.02400088, "epoch": 0.2738614159026003, "flos": 18661196004480.0, "grad_norm": 1.7915916386168997, "language_loss": 0.73645991, "learning_rate": 3.4079345600373275e-06, "loss": 0.75818133, "num_input_tokens_seen": 98456060, "step": 4555, "time_per_iteration": 2.742417335510254 }, { "auxiliary_loss_clip": 0.01141373, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.0561738, "balance_loss_mlp": 1.02152658, "epoch": 0.2739215391552683, "flos": 23477139901440.0, "grad_norm": 2.8904145278515303, "language_loss": 0.77755523, "learning_rate": 3.407657925038002e-06, "loss": 0.79936051, "num_input_tokens_seen": 98473765, "step": 4556, "time_per_iteration": 4.419378280639648 }, { "auxiliary_loss_clip": 0.01150896, "auxiliary_loss_mlp": 0.01049261, "balance_loss_clip": 1.05645621, "balance_loss_mlp": 1.02959132, "epoch": 0.27398166240793626, "flos": 17128636369920.0, "grad_norm": 7.460972643049535, "language_loss": 0.82236463, "learning_rate": 3.4073812366601473e-06, "loss": 0.84436619, "num_input_tokens_seen": 98490590, "step": 4557, "time_per_iteration": 2.6087756156921387 }, { "auxiliary_loss_clip": 0.01089746, "auxiliary_loss_mlp": 0.01046447, "balance_loss_clip": 1.04229808, "balance_loss_mlp": 1.02811229, "epoch": 0.2740417856606042, "flos": 23404744039680.0, "grad_norm": 2.034332886344347, "language_loss": 0.7293033, "learning_rate": 3.4071044949142547e-06, "loss": 0.75066525, "num_input_tokens_seen": 98510590, "step": 4558, "time_per_iteration": 2.7908921241760254 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.01051481, "balance_loss_clip": 1.05215442, "balance_loss_mlp": 1.03334939, "epoch": 0.2741019089132722, "flos": 12780428400000.0, "grad_norm": 2.134307291688894, "language_loss": 0.67842996, "learning_rate": 3.406827699810819e-06, "loss": 0.70020014, "num_input_tokens_seen": 98527875, "step": 4559, "time_per_iteration": 2.7246246337890625 }, { "auxiliary_loss_clip": 0.01121642, "auxiliary_loss_mlp": 0.01055203, "balance_loss_clip": 1.04958165, "balance_loss_mlp": 1.03646374, "epoch": 0.27416203216594015, "flos": 20631542601600.0, "grad_norm": 2.095192605103166, "language_loss": 0.7249226, "learning_rate": 3.4065508513603353e-06, "loss": 0.74669105, "num_input_tokens_seen": 98547575, "step": 4560, "time_per_iteration": 2.634526252746582 }, { "auxiliary_loss_clip": 0.01131443, "auxiliary_loss_mlp": 0.01049928, "balance_loss_clip": 1.05592251, "balance_loss_mlp": 1.03115225, "epoch": 0.27422215541860817, "flos": 26541576812160.0, "grad_norm": 2.095026193088577, "language_loss": 0.81413525, "learning_rate": 3.406273949573303e-06, "loss": 0.83594894, "num_input_tokens_seen": 98566290, "step": 4561, "time_per_iteration": 2.711106538772583 }, { "auxiliary_loss_clip": 0.01156737, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.05919766, "balance_loss_mlp": 1.02688003, "epoch": 0.27428227867127614, "flos": 23331163029120.0, "grad_norm": 1.7066421621801435, "language_loss": 0.75436246, "learning_rate": 3.4059969944602214e-06, "loss": 0.77636886, "num_input_tokens_seen": 98586255, "step": 4562, "time_per_iteration": 2.699544668197632 }, { "auxiliary_loss_clip": 0.01155238, "auxiliary_loss_mlp": 0.01038722, "balance_loss_clip": 1.06035113, "balance_loss_mlp": 1.02138865, "epoch": 0.2743424019239441, "flos": 23035115134080.0, "grad_norm": 1.784616644228294, "language_loss": 0.74751598, "learning_rate": 3.4057199860315928e-06, "loss": 0.76945561, "num_input_tokens_seen": 98606030, "step": 4563, "time_per_iteration": 2.788313627243042 }, { "auxiliary_loss_clip": 0.01119321, "auxiliary_loss_mlp": 0.01048987, "balance_loss_clip": 1.04918432, "balance_loss_mlp": 1.02912664, "epoch": 0.27440252517661207, "flos": 21981101420160.0, "grad_norm": 1.7657560231579414, "language_loss": 0.63026172, "learning_rate": 3.4054429242979213e-06, "loss": 0.65194476, "num_input_tokens_seen": 98625225, "step": 4564, "time_per_iteration": 2.810922145843506 }, { "auxiliary_loss_clip": 0.01128901, "auxiliary_loss_mlp": 0.01046032, "balance_loss_clip": 1.05438292, "balance_loss_mlp": 1.02732766, "epoch": 0.27446264842928003, "flos": 40187451502080.0, "grad_norm": 1.9571814389681148, "language_loss": 0.78683448, "learning_rate": 3.4051658092697135e-06, "loss": 0.8085838, "num_input_tokens_seen": 98649470, "step": 4565, "time_per_iteration": 2.846803665161133 }, { "auxiliary_loss_clip": 0.01095875, "auxiliary_loss_mlp": 0.01050978, "balance_loss_clip": 1.04981828, "balance_loss_mlp": 1.03370428, "epoch": 0.274522771681948, "flos": 13479681438720.0, "grad_norm": 2.4708024317398003, "language_loss": 0.68715227, "learning_rate": 3.404888640957477e-06, "loss": 0.70862079, "num_input_tokens_seen": 98666915, "step": 4566, "time_per_iteration": 2.714352607727051 }, { "auxiliary_loss_clip": 0.01142259, "auxiliary_loss_mlp": 0.01049797, "balance_loss_clip": 1.05835438, "balance_loss_mlp": 1.03326273, "epoch": 0.27458289493461596, "flos": 28622133313920.0, "grad_norm": 2.1203833431876435, "language_loss": 0.60966527, "learning_rate": 3.404611419371723e-06, "loss": 0.63158584, "num_input_tokens_seen": 98688240, "step": 4567, "time_per_iteration": 2.71791934967041 }, { "auxiliary_loss_clip": 0.01135855, "auxiliary_loss_mlp": 0.01047435, "balance_loss_clip": 1.05527198, "balance_loss_mlp": 1.02756321, "epoch": 0.2746430181872839, "flos": 20119815492480.0, "grad_norm": 4.134990661591929, "language_loss": 0.82529241, "learning_rate": 3.4043341445229627e-06, "loss": 0.84712529, "num_input_tokens_seen": 98708245, "step": 4568, "time_per_iteration": 2.6779236793518066 }, { "auxiliary_loss_clip": 0.01141648, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.06012177, "balance_loss_mlp": 1.01916456, "epoch": 0.2747031414399519, "flos": 20193468330240.0, "grad_norm": 2.0524329167860254, "language_loss": 0.68425417, "learning_rate": 3.4040568164217117e-06, "loss": 0.70604521, "num_input_tokens_seen": 98724575, "step": 4569, "time_per_iteration": 2.6595280170440674 }, { "auxiliary_loss_clip": 0.0111585, "auxiliary_loss_mlp": 0.01047943, "balance_loss_clip": 1.04627442, "balance_loss_mlp": 1.02938235, "epoch": 0.27476326469261986, "flos": 13516346246400.0, "grad_norm": 2.9457223850766283, "language_loss": 0.70966327, "learning_rate": 3.4037794350784848e-06, "loss": 0.73130119, "num_input_tokens_seen": 98740700, "step": 4570, "time_per_iteration": 2.7404215335845947 }, { "auxiliary_loss_clip": 0.01035018, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.03062916, "balance_loss_mlp": 1.02521896, "epoch": 0.2748233879452878, "flos": 65937127121280.0, "grad_norm": 0.7294499123437721, "language_loss": 0.55835986, "learning_rate": 3.4035020005038014e-06, "loss": 0.57898545, "num_input_tokens_seen": 98803030, "step": 4571, "time_per_iteration": 3.369403123855591 }, { "auxiliary_loss_clip": 0.01096573, "auxiliary_loss_mlp": 0.0104917, "balance_loss_clip": 1.0493505, "balance_loss_mlp": 1.03134847, "epoch": 0.2748835111979558, "flos": 17384212615680.0, "grad_norm": 2.8212366896407772, "language_loss": 0.78388298, "learning_rate": 3.4032245127081812e-06, "loss": 0.80534041, "num_input_tokens_seen": 98820505, "step": 4572, "time_per_iteration": 2.835817813873291 }, { "auxiliary_loss_clip": 0.01145371, "auxiliary_loss_mlp": 0.01038852, "balance_loss_clip": 1.05474758, "balance_loss_mlp": 1.02365255, "epoch": 0.27494363445062375, "flos": 23587565287680.0, "grad_norm": 3.882915196153325, "language_loss": 0.8126958, "learning_rate": 3.402946971702147e-06, "loss": 0.83453798, "num_input_tokens_seen": 98842150, "step": 4573, "time_per_iteration": 2.709415912628174 }, { "auxiliary_loss_clip": 0.01135124, "auxiliary_loss_mlp": 0.01042886, "balance_loss_clip": 1.0529685, "balance_loss_mlp": 1.0252434, "epoch": 0.2750037577032918, "flos": 17164582905600.0, "grad_norm": 1.740498780022663, "language_loss": 0.79043669, "learning_rate": 3.402669377496223e-06, "loss": 0.81221676, "num_input_tokens_seen": 98861050, "step": 4574, "time_per_iteration": 2.651921272277832 }, { "auxiliary_loss_clip": 0.01104251, "auxiliary_loss_mlp": 0.01052183, "balance_loss_clip": 1.05164313, "balance_loss_mlp": 1.03518367, "epoch": 0.27506388095595974, "flos": 24491903028480.0, "grad_norm": 2.03666793953709, "language_loss": 0.74517256, "learning_rate": 3.402391730100936e-06, "loss": 0.76673687, "num_input_tokens_seen": 98879695, "step": 4575, "time_per_iteration": 2.7622992992401123 }, { "auxiliary_loss_clip": 0.01126178, "auxiliary_loss_mlp": 0.01042992, "balance_loss_clip": 1.05188203, "balance_loss_mlp": 1.02700627, "epoch": 0.2751240042086277, "flos": 38764706722560.0, "grad_norm": 2.5671977719319745, "language_loss": 0.71951419, "learning_rate": 3.402114029526814e-06, "loss": 0.74120593, "num_input_tokens_seen": 98902035, "step": 4576, "time_per_iteration": 2.85740065574646 }, { "auxiliary_loss_clip": 0.01102681, "auxiliary_loss_mlp": 0.00778132, "balance_loss_clip": 1.0506314, "balance_loss_mlp": 1.00075579, "epoch": 0.27518412746129567, "flos": 26907039740160.0, "grad_norm": 1.8050360629969575, "language_loss": 0.73217857, "learning_rate": 3.4018362757843866e-06, "loss": 0.7509867, "num_input_tokens_seen": 98921835, "step": 4577, "time_per_iteration": 2.9024770259857178 }, { "auxiliary_loss_clip": 0.01130618, "auxiliary_loss_mlp": 0.01043838, "balance_loss_clip": 1.05657601, "balance_loss_mlp": 1.02571797, "epoch": 0.27524425071396363, "flos": 24900531125760.0, "grad_norm": 1.7818656930434014, "language_loss": 0.76073247, "learning_rate": 3.401558468884188e-06, "loss": 0.78247702, "num_input_tokens_seen": 98939610, "step": 4578, "time_per_iteration": 2.7173874378204346 }, { "auxiliary_loss_clip": 0.01120877, "auxiliary_loss_mlp": 0.01047646, "balance_loss_clip": 1.05252147, "balance_loss_mlp": 1.02741659, "epoch": 0.2753043739666316, "flos": 26288047641600.0, "grad_norm": 2.6134371594901773, "language_loss": 0.66563278, "learning_rate": 3.4012806088367516e-06, "loss": 0.68731803, "num_input_tokens_seen": 98962250, "step": 4579, "time_per_iteration": 2.730104446411133 }, { "auxiliary_loss_clip": 0.01113502, "auxiliary_loss_mlp": 0.01058443, "balance_loss_clip": 1.04683816, "balance_loss_mlp": 1.03911948, "epoch": 0.27536449721929956, "flos": 24206772867840.0, "grad_norm": 1.8779975195253575, "language_loss": 0.80174518, "learning_rate": 3.4010026956526137e-06, "loss": 0.82346463, "num_input_tokens_seen": 98981845, "step": 4580, "time_per_iteration": 2.8395349979400635 }, { "auxiliary_loss_clip": 0.01141995, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.05684924, "balance_loss_mlp": 1.02942991, "epoch": 0.27542462047196753, "flos": 19537272720000.0, "grad_norm": 1.5301552660019138, "language_loss": 0.67242241, "learning_rate": 3.4007247293423137e-06, "loss": 0.69434267, "num_input_tokens_seen": 99001855, "step": 4581, "time_per_iteration": 2.788644552230835 }, { "auxiliary_loss_clip": 0.01132258, "auxiliary_loss_mlp": 0.0104746, "balance_loss_clip": 1.0560689, "balance_loss_mlp": 1.03050864, "epoch": 0.2754847437246355, "flos": 14319165173760.0, "grad_norm": 1.785645052077455, "language_loss": 0.77915615, "learning_rate": 3.400446709916392e-06, "loss": 0.80095327, "num_input_tokens_seen": 99019880, "step": 4582, "time_per_iteration": 2.730393409729004 }, { "auxiliary_loss_clip": 0.0110084, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.05119133, "balance_loss_mlp": 1.02575767, "epoch": 0.27554486697730346, "flos": 18838773866880.0, "grad_norm": 1.737971373642785, "language_loss": 0.84479475, "learning_rate": 3.4001686373853895e-06, "loss": 0.86622572, "num_input_tokens_seen": 99037570, "step": 4583, "time_per_iteration": 2.7274270057678223 }, { "auxiliary_loss_clip": 0.01139632, "auxiliary_loss_mlp": 0.01044098, "balance_loss_clip": 1.05364764, "balance_loss_mlp": 1.02693176, "epoch": 0.2756049902299714, "flos": 22382295402240.0, "grad_norm": 1.6883560409679848, "language_loss": 0.67007428, "learning_rate": 3.3998905117598528e-06, "loss": 0.69191158, "num_input_tokens_seen": 99056875, "step": 4584, "time_per_iteration": 2.643176794052124 }, { "auxiliary_loss_clip": 0.01080495, "auxiliary_loss_mlp": 0.01054092, "balance_loss_clip": 1.04106402, "balance_loss_mlp": 1.03475666, "epoch": 0.2756651134826394, "flos": 19573901614080.0, "grad_norm": 1.8352571769398758, "language_loss": 0.77349764, "learning_rate": 3.399612333050327e-06, "loss": 0.79484355, "num_input_tokens_seen": 99074685, "step": 4585, "time_per_iteration": 2.6824886798858643 }, { "auxiliary_loss_clip": 0.01142822, "auxiliary_loss_mlp": 0.00775816, "balance_loss_clip": 1.05703616, "balance_loss_mlp": 1.00084651, "epoch": 0.27572523673530736, "flos": 23586559706880.0, "grad_norm": 1.697985370469672, "language_loss": 0.7201665, "learning_rate": 3.399334101267362e-06, "loss": 0.73935288, "num_input_tokens_seen": 99095300, "step": 4586, "time_per_iteration": 2.672872304916382 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.01038583, "balance_loss_clip": 1.05329537, "balance_loss_mlp": 1.02184618, "epoch": 0.2757853599879754, "flos": 22820118278400.0, "grad_norm": 2.166019285475688, "language_loss": 0.80385983, "learning_rate": 3.3990558164215073e-06, "loss": 0.82550168, "num_input_tokens_seen": 99115965, "step": 4587, "time_per_iteration": 2.716212272644043 }, { "auxiliary_loss_clip": 0.01139286, "auxiliary_loss_mlp": 0.0104661, "balance_loss_clip": 1.05435753, "balance_loss_mlp": 1.02916992, "epoch": 0.27584548324064334, "flos": 18551704371840.0, "grad_norm": 3.416975868515595, "language_loss": 0.83000016, "learning_rate": 3.398777478523316e-06, "loss": 0.85185915, "num_input_tokens_seen": 99134265, "step": 4588, "time_per_iteration": 2.6104485988616943 }, { "auxiliary_loss_clip": 0.01109827, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.04756808, "balance_loss_mlp": 1.02567828, "epoch": 0.2759056064933113, "flos": 23769883745280.0, "grad_norm": 1.3306263403060763, "language_loss": 0.75309169, "learning_rate": 3.398499087583342e-06, "loss": 0.77461863, "num_input_tokens_seen": 99156185, "step": 4589, "time_per_iteration": 4.333514928817749 }, { "auxiliary_loss_clip": 0.01138237, "auxiliary_loss_mlp": 0.01046648, "balance_loss_clip": 1.0555464, "balance_loss_mlp": 1.02944636, "epoch": 0.27596572974597927, "flos": 24281898163200.0, "grad_norm": 1.9812216556422375, "language_loss": 0.8860873, "learning_rate": 3.398220643612143e-06, "loss": 0.90793616, "num_input_tokens_seen": 99176735, "step": 4590, "time_per_iteration": 4.256460428237915 }, { "auxiliary_loss_clip": 0.01132985, "auxiliary_loss_mlp": 0.01048634, "balance_loss_clip": 1.05280411, "balance_loss_mlp": 1.03025222, "epoch": 0.27602585299864724, "flos": 35040985632000.0, "grad_norm": 1.594737426944321, "language_loss": 0.71265185, "learning_rate": 3.397942146620277e-06, "loss": 0.7344681, "num_input_tokens_seen": 99199765, "step": 4591, "time_per_iteration": 2.8263018131256104 }, { "auxiliary_loss_clip": 0.01114882, "auxiliary_loss_mlp": 0.01048296, "balance_loss_clip": 1.05395412, "balance_loss_mlp": 1.0301044, "epoch": 0.2760859762513152, "flos": 24309405002880.0, "grad_norm": 3.793452037579163, "language_loss": 0.80017495, "learning_rate": 3.3976635966183046e-06, "loss": 0.82180673, "num_input_tokens_seen": 99218435, "step": 4592, "time_per_iteration": 4.289790153503418 }, { "auxiliary_loss_clip": 0.01051224, "auxiliary_loss_mlp": 0.00755885, "balance_loss_clip": 1.02655387, "balance_loss_mlp": 1.00253439, "epoch": 0.27614609950398317, "flos": 71260739890560.0, "grad_norm": 0.710408868807485, "language_loss": 0.61613023, "learning_rate": 3.3973849936167886e-06, "loss": 0.63420129, "num_input_tokens_seen": 99276200, "step": 4593, "time_per_iteration": 3.201831817626953 }, { "auxiliary_loss_clip": 0.01130969, "auxiliary_loss_mlp": 0.01042983, "balance_loss_clip": 1.05307889, "balance_loss_mlp": 1.02640104, "epoch": 0.27620622275665113, "flos": 29674854138240.0, "grad_norm": 1.9659750468178385, "language_loss": 0.778301, "learning_rate": 3.3971063376262937e-06, "loss": 0.80004054, "num_input_tokens_seen": 99297625, "step": 4594, "time_per_iteration": 2.7222111225128174 }, { "auxiliary_loss_clip": 0.0113791, "auxiliary_loss_mlp": 0.01038649, "balance_loss_clip": 1.05557215, "balance_loss_mlp": 1.02168524, "epoch": 0.2762663460093191, "flos": 15378063137280.0, "grad_norm": 1.5118783378909677, "language_loss": 0.91944981, "learning_rate": 3.3968276286573866e-06, "loss": 0.9412154, "num_input_tokens_seen": 99315790, "step": 4595, "time_per_iteration": 4.290736198425293 }, { "auxiliary_loss_clip": 0.01134891, "auxiliary_loss_mlp": 0.01052323, "balance_loss_clip": 1.05374146, "balance_loss_mlp": 1.03413117, "epoch": 0.27632646926198706, "flos": 20704082117760.0, "grad_norm": 1.7744098894398055, "language_loss": 0.69208467, "learning_rate": 3.3965488667206353e-06, "loss": 0.71395689, "num_input_tokens_seen": 99334615, "step": 4596, "time_per_iteration": 2.7178540229797363 }, { "auxiliary_loss_clip": 0.01125254, "auxiliary_loss_mlp": 0.01048102, "balance_loss_clip": 1.05075955, "balance_loss_mlp": 1.02977943, "epoch": 0.276386592514655, "flos": 32813374849920.0, "grad_norm": 1.7305541104386353, "language_loss": 0.63536781, "learning_rate": 3.3962700518266113e-06, "loss": 0.65710139, "num_input_tokens_seen": 99356685, "step": 4597, "time_per_iteration": 2.7713348865509033 }, { "auxiliary_loss_clip": 0.01150233, "auxiliary_loss_mlp": 0.01046127, "balance_loss_clip": 1.05762243, "balance_loss_mlp": 1.02949786, "epoch": 0.276446715767323, "flos": 18551704371840.0, "grad_norm": 2.077440653118394, "language_loss": 0.86298984, "learning_rate": 3.395991183985887e-06, "loss": 0.8849535, "num_input_tokens_seen": 99374810, "step": 4598, "time_per_iteration": 2.6077804565429688 }, { "auxiliary_loss_clip": 0.01151532, "auxiliary_loss_mlp": 0.01046218, "balance_loss_clip": 1.0559516, "balance_loss_mlp": 1.02790797, "epoch": 0.27650683901999096, "flos": 22819615488000.0, "grad_norm": 2.6195813063936493, "language_loss": 0.79957914, "learning_rate": 3.395712263209037e-06, "loss": 0.82155669, "num_input_tokens_seen": 99391290, "step": 4599, "time_per_iteration": 2.67372989654541 }, { "auxiliary_loss_clip": 0.01127397, "auxiliary_loss_mlp": 0.01049332, "balance_loss_clip": 1.04922533, "balance_loss_mlp": 1.03152239, "epoch": 0.276566962272659, "flos": 21361534704000.0, "grad_norm": 1.7492576371751551, "language_loss": 0.78788924, "learning_rate": 3.395433289506639e-06, "loss": 0.80965656, "num_input_tokens_seen": 99409120, "step": 4600, "time_per_iteration": 2.7197396755218506 }, { "auxiliary_loss_clip": 0.01119636, "auxiliary_loss_mlp": 0.01049981, "balance_loss_clip": 1.05458808, "balance_loss_mlp": 1.03226674, "epoch": 0.27662708552532694, "flos": 17710604524800.0, "grad_norm": 2.9827767838021906, "language_loss": 0.7372371, "learning_rate": 3.3951542628892694e-06, "loss": 0.75893331, "num_input_tokens_seen": 99426180, "step": 4601, "time_per_iteration": 2.7212698459625244 }, { "auxiliary_loss_clip": 0.01137986, "auxiliary_loss_mlp": 0.01053484, "balance_loss_clip": 1.05503917, "balance_loss_mlp": 1.03514934, "epoch": 0.2766872087779949, "flos": 21252725429760.0, "grad_norm": 1.7018676665174548, "language_loss": 0.80055201, "learning_rate": 3.3948751833675113e-06, "loss": 0.82246667, "num_input_tokens_seen": 99447720, "step": 4602, "time_per_iteration": 2.6929776668548584 }, { "auxiliary_loss_clip": 0.01131471, "auxiliary_loss_mlp": 0.01060998, "balance_loss_clip": 1.05209374, "balance_loss_mlp": 1.04194784, "epoch": 0.2767473320306629, "flos": 12931900053120.0, "grad_norm": 2.3561631161543986, "language_loss": 0.77018148, "learning_rate": 3.3945960509519455e-06, "loss": 0.79210615, "num_input_tokens_seen": 99464720, "step": 4603, "time_per_iteration": 2.7761597633361816 }, { "auxiliary_loss_clip": 0.01118804, "auxiliary_loss_mlp": 0.01044782, "balance_loss_clip": 1.05331254, "balance_loss_mlp": 1.02858686, "epoch": 0.27680745528333084, "flos": 15012851604480.0, "grad_norm": 1.686999686787164, "language_loss": 0.81469357, "learning_rate": 3.3943168656531585e-06, "loss": 0.83632934, "num_input_tokens_seen": 99482310, "step": 4604, "time_per_iteration": 2.6715614795684814 }, { "auxiliary_loss_clip": 0.01096642, "auxiliary_loss_mlp": 0.0104217, "balance_loss_clip": 1.04733086, "balance_loss_mlp": 1.02428889, "epoch": 0.2768675785359988, "flos": 22637835734400.0, "grad_norm": 1.8500484413544072, "language_loss": 0.7021662, "learning_rate": 3.3940376274817363e-06, "loss": 0.72355425, "num_input_tokens_seen": 99501255, "step": 4605, "time_per_iteration": 2.824810266494751 }, { "auxiliary_loss_clip": 0.01051326, "auxiliary_loss_mlp": 0.01005015, "balance_loss_clip": 1.02826095, "balance_loss_mlp": 1.00244009, "epoch": 0.27692770178866677, "flos": 66130542881280.0, "grad_norm": 0.7013581781305706, "language_loss": 0.57222801, "learning_rate": 3.3937583364482673e-06, "loss": 0.59279138, "num_input_tokens_seen": 99568925, "step": 4606, "time_per_iteration": 3.288269519805908 }, { "auxiliary_loss_clip": 0.01125032, "auxiliary_loss_mlp": 0.01050719, "balance_loss_clip": 1.05177283, "balance_loss_mlp": 1.03280139, "epoch": 0.27698782504133473, "flos": 26464979059200.0, "grad_norm": 1.9503980757161308, "language_loss": 0.69579148, "learning_rate": 3.3934789925633424e-06, "loss": 0.71754897, "num_input_tokens_seen": 99588455, "step": 4607, "time_per_iteration": 2.7865042686462402 }, { "auxiliary_loss_clip": 0.0113039, "auxiliary_loss_mlp": 0.01040949, "balance_loss_clip": 1.05402029, "balance_loss_mlp": 1.0242002, "epoch": 0.2770479482940027, "flos": 25884806584320.0, "grad_norm": 1.5552750364168406, "language_loss": 0.69727945, "learning_rate": 3.393199595837555e-06, "loss": 0.71899283, "num_input_tokens_seen": 99609355, "step": 4608, "time_per_iteration": 2.7139909267425537 }, { "auxiliary_loss_clip": 0.0109619, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.04789758, "balance_loss_mlp": 1.024894, "epoch": 0.27710807154667066, "flos": 22857249962880.0, "grad_norm": 1.922338327624115, "language_loss": 0.73170602, "learning_rate": 3.392920146281499e-06, "loss": 0.75308412, "num_input_tokens_seen": 99628780, "step": 4609, "time_per_iteration": 2.8674490451812744 }, { "auxiliary_loss_clip": 0.01105896, "auxiliary_loss_mlp": 0.01054215, "balance_loss_clip": 1.04444993, "balance_loss_mlp": 1.03615475, "epoch": 0.27716819479933863, "flos": 17711071401600.0, "grad_norm": 2.284482242639661, "language_loss": 0.84028268, "learning_rate": 3.3926406439057714e-06, "loss": 0.86188376, "num_input_tokens_seen": 99644545, "step": 4610, "time_per_iteration": 2.6861605644226074 }, { "auxiliary_loss_clip": 0.01074905, "auxiliary_loss_mlp": 0.00781444, "balance_loss_clip": 1.04093325, "balance_loss_mlp": 1.00102568, "epoch": 0.2772283180520066, "flos": 19646046080640.0, "grad_norm": 2.0943450829127044, "language_loss": 0.68915951, "learning_rate": 3.3923610887209705e-06, "loss": 0.70772296, "num_input_tokens_seen": 99663125, "step": 4611, "time_per_iteration": 2.799345016479492 }, { "auxiliary_loss_clip": 0.01144902, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.05466819, "balance_loss_mlp": 1.02591395, "epoch": 0.27728844130467456, "flos": 21032628842880.0, "grad_norm": 2.6988182686748785, "language_loss": 0.73646772, "learning_rate": 3.392081480737698e-06, "loss": 0.75834239, "num_input_tokens_seen": 99682645, "step": 4612, "time_per_iteration": 2.643157720565796 }, { "auxiliary_loss_clip": 0.01139286, "auxiliary_loss_mlp": 0.00775997, "balance_loss_clip": 1.05283117, "balance_loss_mlp": 1.00099993, "epoch": 0.2773485645573425, "flos": 18989204025600.0, "grad_norm": 2.0654093622255436, "language_loss": 0.66356897, "learning_rate": 3.3918018199665563e-06, "loss": 0.68272179, "num_input_tokens_seen": 99700520, "step": 4613, "time_per_iteration": 2.6685144901275635 }, { "auxiliary_loss_clip": 0.01096758, "auxiliary_loss_mlp": 0.01051618, "balance_loss_clip": 1.04526055, "balance_loss_mlp": 1.03354573, "epoch": 0.27740868781001055, "flos": 21468440557440.0, "grad_norm": 1.5160858700983233, "language_loss": 0.79385912, "learning_rate": 3.39152210641815e-06, "loss": 0.8153429, "num_input_tokens_seen": 99720355, "step": 4614, "time_per_iteration": 2.82061505317688 }, { "auxiliary_loss_clip": 0.01129896, "auxiliary_loss_mlp": 0.01047714, "balance_loss_clip": 1.04873419, "balance_loss_mlp": 1.02978539, "epoch": 0.2774688110626785, "flos": 19827825834240.0, "grad_norm": 2.763943164845673, "language_loss": 0.80632633, "learning_rate": 3.3912423401030865e-06, "loss": 0.82810241, "num_input_tokens_seen": 99736090, "step": 4615, "time_per_iteration": 2.607448101043701 }, { "auxiliary_loss_clip": 0.01114657, "auxiliary_loss_mlp": 0.01051705, "balance_loss_clip": 1.04532576, "balance_loss_mlp": 1.03447962, "epoch": 0.2775289343153465, "flos": 18216226321920.0, "grad_norm": 2.3373471978129543, "language_loss": 0.646945, "learning_rate": 3.3909625210319735e-06, "loss": 0.66860855, "num_input_tokens_seen": 99751805, "step": 4616, "time_per_iteration": 2.693556308746338 }, { "auxiliary_loss_clip": 0.01133374, "auxiliary_loss_mlp": 0.01047225, "balance_loss_clip": 1.0536505, "balance_loss_mlp": 1.03001153, "epoch": 0.27758905756801444, "flos": 16472476673280.0, "grad_norm": 2.175848824107301, "language_loss": 0.82324976, "learning_rate": 3.3906826492154226e-06, "loss": 0.84505582, "num_input_tokens_seen": 99770610, "step": 4617, "time_per_iteration": 2.64677357673645 }, { "auxiliary_loss_clip": 0.01147475, "auxiliary_loss_mlp": 0.01049438, "balance_loss_clip": 1.05210304, "balance_loss_mlp": 1.03261721, "epoch": 0.2776491808206824, "flos": 18728240739840.0, "grad_norm": 2.8579401527932236, "language_loss": 0.77031851, "learning_rate": 3.3904027246640458e-06, "loss": 0.79228759, "num_input_tokens_seen": 99787305, "step": 4618, "time_per_iteration": 2.555001735687256 }, { "auxiliary_loss_clip": 0.01151182, "auxiliary_loss_mlp": 0.01042958, "balance_loss_clip": 1.05599475, "balance_loss_mlp": 1.0268048, "epoch": 0.27770930407335037, "flos": 28038189911040.0, "grad_norm": 1.6850470881083441, "language_loss": 0.85102153, "learning_rate": 3.390122747388459e-06, "loss": 0.87296283, "num_input_tokens_seen": 99808940, "step": 4619, "time_per_iteration": 2.753230094909668 }, { "auxiliary_loss_clip": 0.01121872, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.05075216, "balance_loss_mlp": 1.02592564, "epoch": 0.27776942732601834, "flos": 23549823072000.0, "grad_norm": 1.6763124645732197, "language_loss": 0.7707957, "learning_rate": 3.3898427173992778e-06, "loss": 0.79242951, "num_input_tokens_seen": 99829575, "step": 4620, "time_per_iteration": 2.7764816284179688 }, { "auxiliary_loss_clip": 0.01091863, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04290819, "balance_loss_mlp": 1.02517962, "epoch": 0.2778295505786863, "flos": 23908713811200.0, "grad_norm": 1.985202794634515, "language_loss": 0.78144193, "learning_rate": 3.389562634707122e-06, "loss": 0.80278563, "num_input_tokens_seen": 99847575, "step": 4621, "time_per_iteration": 2.740419387817383 }, { "auxiliary_loss_clip": 0.01113871, "auxiliary_loss_mlp": 0.01054223, "balance_loss_clip": 1.04857588, "balance_loss_mlp": 1.03642535, "epoch": 0.27788967383135427, "flos": 25554571920000.0, "grad_norm": 2.864120631038579, "language_loss": 0.87357259, "learning_rate": 3.389282499322611e-06, "loss": 0.89525354, "num_input_tokens_seen": 99864995, "step": 4622, "time_per_iteration": 2.8351151943206787 }, { "auxiliary_loss_clip": 0.01096216, "auxiliary_loss_mlp": 0.01052098, "balance_loss_clip": 1.0477345, "balance_loss_mlp": 1.0349195, "epoch": 0.27794979708402223, "flos": 16252631481600.0, "grad_norm": 1.7857472181098575, "language_loss": 0.81315404, "learning_rate": 3.389002311256369e-06, "loss": 0.83463717, "num_input_tokens_seen": 99881540, "step": 4623, "time_per_iteration": 2.7112133502960205 }, { "auxiliary_loss_clip": 0.01119674, "auxiliary_loss_mlp": 0.01043259, "balance_loss_clip": 1.05434608, "balance_loss_mlp": 1.02628374, "epoch": 0.2780099203366902, "flos": 20667632791680.0, "grad_norm": 2.1551340516102897, "language_loss": 0.80889726, "learning_rate": 3.3887220705190204e-06, "loss": 0.83052659, "num_input_tokens_seen": 99899595, "step": 4624, "time_per_iteration": 2.6492481231689453 }, { "auxiliary_loss_clip": 0.01112812, "auxiliary_loss_mlp": 0.0077763, "balance_loss_clip": 1.05008531, "balance_loss_mlp": 1.00092447, "epoch": 0.27807004358935816, "flos": 17739583822080.0, "grad_norm": 2.21671742511245, "language_loss": 0.76949263, "learning_rate": 3.388441777121191e-06, "loss": 0.78839707, "num_input_tokens_seen": 99913020, "step": 4625, "time_per_iteration": 2.6312057971954346 }, { "auxiliary_loss_clip": 0.01106879, "auxiliary_loss_mlp": 0.01046687, "balance_loss_clip": 1.04205859, "balance_loss_mlp": 1.02767277, "epoch": 0.2781301668420261, "flos": 16727119165440.0, "grad_norm": 1.790813282848893, "language_loss": 0.69947815, "learning_rate": 3.388161431073511e-06, "loss": 0.72101378, "num_input_tokens_seen": 99931405, "step": 4626, "time_per_iteration": 2.7656819820404053 }, { "auxiliary_loss_clip": 0.0110548, "auxiliary_loss_mlp": 0.01041917, "balance_loss_clip": 1.04827905, "balance_loss_mlp": 1.02385652, "epoch": 0.27819029009469415, "flos": 13844749317120.0, "grad_norm": 2.1086116607571546, "language_loss": 0.92367601, "learning_rate": 3.38788103238661e-06, "loss": 0.94515002, "num_input_tokens_seen": 99948100, "step": 4627, "time_per_iteration": 2.8608667850494385 }, { "auxiliary_loss_clip": 0.01149683, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.05388021, "balance_loss_mlp": 1.0248611, "epoch": 0.2782504133473621, "flos": 27089286370560.0, "grad_norm": 1.7290354122756755, "language_loss": 0.85490036, "learning_rate": 3.387600581071121e-06, "loss": 0.87680495, "num_input_tokens_seen": 99966470, "step": 4628, "time_per_iteration": 2.6468069553375244 }, { "auxiliary_loss_clip": 0.01114712, "auxiliary_loss_mlp": 0.0104202, "balance_loss_clip": 1.0482378, "balance_loss_mlp": 1.02509212, "epoch": 0.2783105366000301, "flos": 21068826773760.0, "grad_norm": 1.5106040860694088, "language_loss": 0.79246545, "learning_rate": 3.387320077137679e-06, "loss": 0.81403273, "num_input_tokens_seen": 99985930, "step": 4629, "time_per_iteration": 5.656833648681641 }, { "auxiliary_loss_clip": 0.01100825, "auxiliary_loss_mlp": 0.01040328, "balance_loss_clip": 1.04602218, "balance_loss_mlp": 1.02339983, "epoch": 0.27837065985269804, "flos": 26501823434880.0, "grad_norm": 1.5125577415085874, "language_loss": 0.84574991, "learning_rate": 3.3870395205969208e-06, "loss": 0.86716145, "num_input_tokens_seen": 100006235, "step": 4630, "time_per_iteration": 2.70917010307312 }, { "auxiliary_loss_clip": 0.01123828, "auxiliary_loss_mlp": 0.01038547, "balance_loss_clip": 1.04848623, "balance_loss_mlp": 1.02099967, "epoch": 0.278430783105366, "flos": 20223201813120.0, "grad_norm": 2.1016222667741857, "language_loss": 0.81134796, "learning_rate": 3.386758911459485e-06, "loss": 0.83297169, "num_input_tokens_seen": 100023655, "step": 4631, "time_per_iteration": 4.19342041015625 }, { "auxiliary_loss_clip": 0.01149092, "auxiliary_loss_mlp": 0.01049428, "balance_loss_clip": 1.05402875, "balance_loss_mlp": 1.03257155, "epoch": 0.278490906358034, "flos": 25592888753280.0, "grad_norm": 3.9436500565538295, "language_loss": 0.71196103, "learning_rate": 3.3864782497360126e-06, "loss": 0.7339462, "num_input_tokens_seen": 100043280, "step": 4632, "time_per_iteration": 2.620439291000366 }, { "auxiliary_loss_clip": 0.01132813, "auxiliary_loss_mlp": 0.01044268, "balance_loss_clip": 1.05435467, "balance_loss_mlp": 1.02798355, "epoch": 0.27855102961070194, "flos": 16171544528640.0, "grad_norm": 1.8243983980851597, "language_loss": 0.82563186, "learning_rate": 3.386197535437145e-06, "loss": 0.84740269, "num_input_tokens_seen": 100057690, "step": 4633, "time_per_iteration": 2.6531693935394287 }, { "auxiliary_loss_clip": 0.01122775, "auxiliary_loss_mlp": 0.01039803, "balance_loss_clip": 1.04714537, "balance_loss_mlp": 1.02130151, "epoch": 0.2786111528633699, "flos": 22927598749440.0, "grad_norm": 1.6667943176882647, "language_loss": 0.87727869, "learning_rate": 3.385916768573529e-06, "loss": 0.89890444, "num_input_tokens_seen": 100075875, "step": 4634, "time_per_iteration": 4.391691446304321 }, { "auxiliary_loss_clip": 0.01118626, "auxiliary_loss_mlp": 0.01042889, "balance_loss_clip": 1.04900146, "balance_loss_mlp": 1.02503181, "epoch": 0.27867127611603787, "flos": 23404205335680.0, "grad_norm": 1.8664238108113964, "language_loss": 0.7701081, "learning_rate": 3.38563594915581e-06, "loss": 0.79172325, "num_input_tokens_seen": 100092930, "step": 4635, "time_per_iteration": 2.7107748985290527 }, { "auxiliary_loss_clip": 0.01148262, "auxiliary_loss_mlp": 0.01044984, "balance_loss_clip": 1.05233121, "balance_loss_mlp": 1.02705491, "epoch": 0.27873139936870583, "flos": 19829010983040.0, "grad_norm": 1.6280540509164947, "language_loss": 0.65174443, "learning_rate": 3.385355077194637e-06, "loss": 0.67367697, "num_input_tokens_seen": 100110790, "step": 4636, "time_per_iteration": 2.660099744796753 }, { "auxiliary_loss_clip": 0.01134021, "auxiliary_loss_mlp": 0.01042528, "balance_loss_clip": 1.048437, "balance_loss_mlp": 1.0243845, "epoch": 0.2787915226213738, "flos": 17707659609600.0, "grad_norm": 2.8501862977667667, "language_loss": 0.83485681, "learning_rate": 3.3850741527006604e-06, "loss": 0.85662234, "num_input_tokens_seen": 100126970, "step": 4637, "time_per_iteration": 2.6234302520751953 }, { "auxiliary_loss_clip": 0.01117465, "auxiliary_loss_mlp": 0.01043194, "balance_loss_clip": 1.04580319, "balance_loss_mlp": 1.02658796, "epoch": 0.27885164587404176, "flos": 22090557139200.0, "grad_norm": 1.4481958644660236, "language_loss": 0.75996393, "learning_rate": 3.384793175684533e-06, "loss": 0.78157055, "num_input_tokens_seen": 100146720, "step": 4638, "time_per_iteration": 2.6488263607025146 }, { "auxiliary_loss_clip": 0.0113367, "auxiliary_loss_mlp": 0.01047522, "balance_loss_clip": 1.04905438, "balance_loss_mlp": 1.02935445, "epoch": 0.27891176912670973, "flos": 19207684500480.0, "grad_norm": 1.973043880665722, "language_loss": 0.71658665, "learning_rate": 3.38451214615691e-06, "loss": 0.73839855, "num_input_tokens_seen": 100165920, "step": 4639, "time_per_iteration": 2.606290817260742 }, { "auxiliary_loss_clip": 0.01134631, "auxiliary_loss_mlp": 0.01040486, "balance_loss_clip": 1.04905224, "balance_loss_mlp": 1.02213931, "epoch": 0.27897189237937775, "flos": 27600007898880.0, "grad_norm": 1.9413688357819885, "language_loss": 0.6546669, "learning_rate": 3.384231064128447e-06, "loss": 0.67641807, "num_input_tokens_seen": 100185525, "step": 4640, "time_per_iteration": 2.670572280883789 }, { "auxiliary_loss_clip": 0.01134835, "auxiliary_loss_mlp": 0.01040753, "balance_loss_clip": 1.05033112, "balance_loss_mlp": 1.02394438, "epoch": 0.2790320156320457, "flos": 21178210665600.0, "grad_norm": 2.0528630099938385, "language_loss": 0.72150993, "learning_rate": 3.383949929609804e-06, "loss": 0.74326581, "num_input_tokens_seen": 100204850, "step": 4641, "time_per_iteration": 2.693377733230591 }, { "auxiliary_loss_clip": 0.01112862, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.05076349, "balance_loss_mlp": 1.02322423, "epoch": 0.2790921388847137, "flos": 22783920347520.0, "grad_norm": 1.7365449070814052, "language_loss": 0.74695385, "learning_rate": 3.383668742611641e-06, "loss": 0.7685138, "num_input_tokens_seen": 100224520, "step": 4642, "time_per_iteration": 2.7462241649627686 }, { "auxiliary_loss_clip": 0.0111075, "auxiliary_loss_mlp": 0.01045242, "balance_loss_clip": 1.04543257, "balance_loss_mlp": 1.02603781, "epoch": 0.27915226213738165, "flos": 23400649889280.0, "grad_norm": 1.8272594017764643, "language_loss": 0.85924351, "learning_rate": 3.3833875031446205e-06, "loss": 0.88080341, "num_input_tokens_seen": 100243935, "step": 4643, "time_per_iteration": 2.725135564804077 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.01045051, "balance_loss_clip": 1.04933143, "balance_loss_mlp": 1.02697933, "epoch": 0.2792123853900496, "flos": 22747794243840.0, "grad_norm": 1.7474380366240072, "language_loss": 0.83161986, "learning_rate": 3.383106211219407e-06, "loss": 0.85312265, "num_input_tokens_seen": 100262290, "step": 4644, "time_per_iteration": 2.7356133460998535 }, { "auxiliary_loss_clip": 0.01135825, "auxiliary_loss_mlp": 0.01044339, "balance_loss_clip": 1.04996896, "balance_loss_mlp": 1.02672005, "epoch": 0.2792725086427176, "flos": 15049372757760.0, "grad_norm": 1.8326156585035789, "language_loss": 0.79077673, "learning_rate": 3.3828248668466673e-06, "loss": 0.81257844, "num_input_tokens_seen": 100280015, "step": 4645, "time_per_iteration": 2.6605966091156006 }, { "auxiliary_loss_clip": 0.01043101, "auxiliary_loss_mlp": 0.01005168, "balance_loss_clip": 1.02972245, "balance_loss_mlp": 1.00273657, "epoch": 0.27933263189538554, "flos": 62544861757440.0, "grad_norm": 0.7804050577208047, "language_loss": 0.62298429, "learning_rate": 3.3825434700370705e-06, "loss": 0.64346695, "num_input_tokens_seen": 100338935, "step": 4646, "time_per_iteration": 3.203944206237793 }, { "auxiliary_loss_clip": 0.01116876, "auxiliary_loss_mlp": 0.01036795, "balance_loss_clip": 1.05170095, "balance_loss_mlp": 1.02054703, "epoch": 0.2793927551480535, "flos": 25118365155840.0, "grad_norm": 1.6679902986930268, "language_loss": 0.89280778, "learning_rate": 3.3822620208012865e-06, "loss": 0.91434449, "num_input_tokens_seen": 100359905, "step": 4647, "time_per_iteration": 2.829617500305176 }, { "auxiliary_loss_clip": 0.0113911, "auxiliary_loss_mlp": 0.01047084, "balance_loss_clip": 1.05125523, "balance_loss_mlp": 1.02880919, "epoch": 0.27945287840072147, "flos": 21324582587520.0, "grad_norm": 1.8012650128540075, "language_loss": 0.86784112, "learning_rate": 3.381980519149988e-06, "loss": 0.88970304, "num_input_tokens_seen": 100376955, "step": 4648, "time_per_iteration": 2.632321357727051 }, { "auxiliary_loss_clip": 0.01134603, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.05110133, "balance_loss_mlp": 1.02733302, "epoch": 0.27951300165338944, "flos": 27450547407360.0, "grad_norm": 2.0026822782024705, "language_loss": 0.73003638, "learning_rate": 3.38169896509385e-06, "loss": 0.75183129, "num_input_tokens_seen": 100397545, "step": 4649, "time_per_iteration": 2.7211172580718994 }, { "auxiliary_loss_clip": 0.01111127, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.04752195, "balance_loss_mlp": 1.02557421, "epoch": 0.2795731249060574, "flos": 15159008044800.0, "grad_norm": 2.1164331968139325, "language_loss": 0.80629992, "learning_rate": 3.381417358643549e-06, "loss": 0.82786095, "num_input_tokens_seen": 100415080, "step": 4650, "time_per_iteration": 2.7502310276031494 }, { "auxiliary_loss_clip": 0.01039445, "auxiliary_loss_mlp": 0.00754956, "balance_loss_clip": 1.03124094, "balance_loss_mlp": 1.00203133, "epoch": 0.27963324815872537, "flos": 60120103178880.0, "grad_norm": 0.8151234776797575, "language_loss": 0.58806145, "learning_rate": 3.3811356998097624e-06, "loss": 0.60600549, "num_input_tokens_seen": 100471105, "step": 4651, "time_per_iteration": 3.2224526405334473 }, { "auxiliary_loss_clip": 0.01135312, "auxiliary_loss_mlp": 0.01047398, "balance_loss_clip": 1.04708123, "balance_loss_mlp": 1.02753818, "epoch": 0.27969337141139333, "flos": 21765960910080.0, "grad_norm": 1.7351399642666463, "language_loss": 0.74332011, "learning_rate": 3.3808539886031726e-06, "loss": 0.76514727, "num_input_tokens_seen": 100492520, "step": 4652, "time_per_iteration": 2.685736894607544 }, { "auxiliary_loss_clip": 0.01148943, "auxiliary_loss_mlp": 0.01045678, "balance_loss_clip": 1.05235481, "balance_loss_mlp": 1.02742696, "epoch": 0.27975349466406135, "flos": 39851398834560.0, "grad_norm": 2.2003219434248633, "language_loss": 0.79789567, "learning_rate": 3.380572225034461e-06, "loss": 0.81984192, "num_input_tokens_seen": 100512870, "step": 4653, "time_per_iteration": 2.7558584213256836 }, { "auxiliary_loss_clip": 0.01121239, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.04883742, "balance_loss_mlp": 1.03280401, "epoch": 0.2798136179167293, "flos": 21579799697280.0, "grad_norm": 2.080129868341082, "language_loss": 0.78903222, "learning_rate": 3.380290409114312e-06, "loss": 0.81074733, "num_input_tokens_seen": 100531655, "step": 4654, "time_per_iteration": 2.6496095657348633 }, { "auxiliary_loss_clip": 0.01101836, "auxiliary_loss_mlp": 0.01052085, "balance_loss_clip": 1.04982615, "balance_loss_mlp": 1.03267753, "epoch": 0.2798737411693973, "flos": 21537676022400.0, "grad_norm": 2.0985102630300134, "language_loss": 0.81319463, "learning_rate": 3.3800085408534127e-06, "loss": 0.83473378, "num_input_tokens_seen": 100548005, "step": 4655, "time_per_iteration": 2.742586135864258 }, { "auxiliary_loss_clip": 0.01112605, "auxiliary_loss_mlp": 0.00776867, "balance_loss_clip": 1.04759109, "balance_loss_mlp": 1.00071263, "epoch": 0.27993386442206525, "flos": 26981051713920.0, "grad_norm": 1.7515804597190672, "language_loss": 0.81455064, "learning_rate": 3.3797266202624506e-06, "loss": 0.83344543, "num_input_tokens_seen": 100567980, "step": 4656, "time_per_iteration": 2.796480894088745 }, { "auxiliary_loss_clip": 0.01120191, "auxiliary_loss_mlp": 0.01050328, "balance_loss_clip": 1.05115008, "balance_loss_mlp": 1.03204143, "epoch": 0.2799939876747332, "flos": 24349876652160.0, "grad_norm": 2.044588364139205, "language_loss": 0.83203471, "learning_rate": 3.3794446473521176e-06, "loss": 0.85373986, "num_input_tokens_seen": 100588630, "step": 4657, "time_per_iteration": 2.6785871982574463 }, { "auxiliary_loss_clip": 0.01111476, "auxiliary_loss_mlp": 0.01052182, "balance_loss_clip": 1.04937756, "balance_loss_mlp": 1.03294206, "epoch": 0.2800541109274012, "flos": 33656988648960.0, "grad_norm": 2.165484252442401, "language_loss": 0.63694274, "learning_rate": 3.379162622133105e-06, "loss": 0.65857935, "num_input_tokens_seen": 100608775, "step": 4658, "time_per_iteration": 2.879409074783325 }, { "auxiliary_loss_clip": 0.01136248, "auxiliary_loss_mlp": 0.010462, "balance_loss_clip": 1.0495683, "balance_loss_mlp": 1.02822304, "epoch": 0.28011423418006914, "flos": 21614417429760.0, "grad_norm": 1.7192056687926605, "language_loss": 0.78342974, "learning_rate": 3.3788805446161073e-06, "loss": 0.80525422, "num_input_tokens_seen": 100627975, "step": 4659, "time_per_iteration": 2.6989047527313232 }, { "auxiliary_loss_clip": 0.0111004, "auxiliary_loss_mlp": 0.01054733, "balance_loss_clip": 1.04974771, "balance_loss_mlp": 1.03588593, "epoch": 0.2801743574327371, "flos": 23112431159040.0, "grad_norm": 1.755148683242289, "language_loss": 0.79341501, "learning_rate": 3.3785984148118215e-06, "loss": 0.8150627, "num_input_tokens_seen": 100645430, "step": 4660, "time_per_iteration": 2.715477705001831 }, { "auxiliary_loss_clip": 0.01108147, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.05007386, "balance_loss_mlp": 1.02897, "epoch": 0.2802344806854051, "flos": 12641418766080.0, "grad_norm": 2.2526204230687115, "language_loss": 0.80604905, "learning_rate": 3.3783162327309453e-06, "loss": 0.82759559, "num_input_tokens_seen": 100663775, "step": 4661, "time_per_iteration": 2.7715258598327637 }, { "auxiliary_loss_clip": 0.01125452, "auxiliary_loss_mlp": 0.01056292, "balance_loss_clip": 1.05232596, "balance_loss_mlp": 1.03836262, "epoch": 0.28029460393807304, "flos": 37267878142080.0, "grad_norm": 1.5529278028038542, "language_loss": 0.79010582, "learning_rate": 3.3780339983841794e-06, "loss": 0.81192333, "num_input_tokens_seen": 100686085, "step": 4662, "time_per_iteration": 2.81427264213562 }, { "auxiliary_loss_clip": 0.01133119, "auxiliary_loss_mlp": 0.01052014, "balance_loss_clip": 1.05226839, "balance_loss_mlp": 1.03252363, "epoch": 0.280354727190741, "flos": 20741106061440.0, "grad_norm": 1.6202884167711182, "language_loss": 0.69617724, "learning_rate": 3.377751711782227e-06, "loss": 0.71802866, "num_input_tokens_seen": 100705135, "step": 4663, "time_per_iteration": 2.697368860244751 }, { "auxiliary_loss_clip": 0.01124677, "auxiliary_loss_mlp": 0.01049339, "balance_loss_clip": 1.05170035, "balance_loss_mlp": 1.03104067, "epoch": 0.28041485044340897, "flos": 21471026336640.0, "grad_norm": 1.9196144000248758, "language_loss": 0.77708608, "learning_rate": 3.377469372935791e-06, "loss": 0.79882622, "num_input_tokens_seen": 100724960, "step": 4664, "time_per_iteration": 2.7275149822235107 }, { "auxiliary_loss_clip": 0.01107718, "auxiliary_loss_mlp": 0.01048769, "balance_loss_clip": 1.0480299, "balance_loss_mlp": 1.03099537, "epoch": 0.28047497369607693, "flos": 14794263388800.0, "grad_norm": 1.999889511399453, "language_loss": 0.79593849, "learning_rate": 3.377186981855578e-06, "loss": 0.81750339, "num_input_tokens_seen": 100741995, "step": 4665, "time_per_iteration": 2.710507392883301 }, { "auxiliary_loss_clip": 0.01132609, "auxiliary_loss_mlp": 0.01044622, "balance_loss_clip": 1.04908824, "balance_loss_mlp": 1.02724159, "epoch": 0.2805350969487449, "flos": 23070738447360.0, "grad_norm": 1.8624041004678782, "language_loss": 0.81080002, "learning_rate": 3.3769045385522968e-06, "loss": 0.83257234, "num_input_tokens_seen": 100758985, "step": 4666, "time_per_iteration": 2.6129403114318848 }, { "auxiliary_loss_clip": 0.01108409, "auxiliary_loss_mlp": 0.01071225, "balance_loss_clip": 1.04823136, "balance_loss_mlp": 1.05097127, "epoch": 0.2805952202014129, "flos": 20479855466880.0, "grad_norm": 2.103406835637469, "language_loss": 0.84507895, "learning_rate": 3.376622043036658e-06, "loss": 0.86687529, "num_input_tokens_seen": 100777820, "step": 4667, "time_per_iteration": 2.7332448959350586 }, { "auxiliary_loss_clip": 0.01123034, "auxiliary_loss_mlp": 0.00775483, "balance_loss_clip": 1.05581784, "balance_loss_mlp": 1.00072694, "epoch": 0.2806553434540809, "flos": 27417330305280.0, "grad_norm": 3.1307253624061486, "language_loss": 0.79295927, "learning_rate": 3.376339495319373e-06, "loss": 0.81194448, "num_input_tokens_seen": 100798205, "step": 4668, "time_per_iteration": 5.80406928062439 }, { "auxiliary_loss_clip": 0.01086886, "auxiliary_loss_mlp": 0.01042603, "balance_loss_clip": 1.04659402, "balance_loss_mlp": 1.02432859, "epoch": 0.28071546670674885, "flos": 26505019745280.0, "grad_norm": 1.6340052887006857, "language_loss": 0.76323926, "learning_rate": 3.3760568954111563e-06, "loss": 0.7845341, "num_input_tokens_seen": 100819800, "step": 4669, "time_per_iteration": 2.909986734390259 }, { "auxiliary_loss_clip": 0.01135126, "auxiliary_loss_mlp": 0.01048727, "balance_loss_clip": 1.05091906, "balance_loss_mlp": 1.03104806, "epoch": 0.2807755899594168, "flos": 20558679863040.0, "grad_norm": 2.509610012971093, "language_loss": 0.79246378, "learning_rate": 3.375774243322725e-06, "loss": 0.81430233, "num_input_tokens_seen": 100837880, "step": 4670, "time_per_iteration": 4.177394866943359 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01050214, "balance_loss_clip": 1.04797912, "balance_loss_mlp": 1.03053236, "epoch": 0.2808357132120848, "flos": 24313319585280.0, "grad_norm": 2.7368773080153455, "language_loss": 0.79247916, "learning_rate": 3.3754915390647955e-06, "loss": 0.81404507, "num_input_tokens_seen": 100856350, "step": 4671, "time_per_iteration": 2.711390256881714 }, { "auxiliary_loss_clip": 0.01127751, "auxiliary_loss_mlp": 0.01045588, "balance_loss_clip": 1.05121446, "balance_loss_mlp": 1.02806473, "epoch": 0.28089583646475275, "flos": 26432408401920.0, "grad_norm": 1.6750085767967255, "language_loss": 0.74537772, "learning_rate": 3.37520878264809e-06, "loss": 0.76711112, "num_input_tokens_seen": 100876135, "step": 4672, "time_per_iteration": 2.661121129989624 }, { "auxiliary_loss_clip": 0.01124033, "auxiliary_loss_mlp": 0.01050888, "balance_loss_clip": 1.04696918, "balance_loss_mlp": 1.03130245, "epoch": 0.2809559597174207, "flos": 23111820627840.0, "grad_norm": 2.8450273884489805, "language_loss": 0.75648308, "learning_rate": 3.3749259740833286e-06, "loss": 0.77823234, "num_input_tokens_seen": 100894790, "step": 4673, "time_per_iteration": 2.672701120376587 }, { "auxiliary_loss_clip": 0.0113134, "auxiliary_loss_mlp": 0.01042591, "balance_loss_clip": 1.04937172, "balance_loss_mlp": 1.02492452, "epoch": 0.2810160829700887, "flos": 20923496346240.0, "grad_norm": 1.8533271967959946, "language_loss": 0.72668427, "learning_rate": 3.374643113381237e-06, "loss": 0.74842358, "num_input_tokens_seen": 100915100, "step": 4674, "time_per_iteration": 4.2516560554504395 }, { "auxiliary_loss_clip": 0.01138771, "auxiliary_loss_mlp": 0.01046386, "balance_loss_clip": 1.05174136, "balance_loss_mlp": 1.02751493, "epoch": 0.28107620622275664, "flos": 14355901808640.0, "grad_norm": 2.0688845921593377, "language_loss": 0.77195638, "learning_rate": 3.374360200552541e-06, "loss": 0.79380798, "num_input_tokens_seen": 100932795, "step": 4675, "time_per_iteration": 2.618218183517456 }, { "auxiliary_loss_clip": 0.01149881, "auxiliary_loss_mlp": 0.01047998, "balance_loss_clip": 1.05321908, "balance_loss_mlp": 1.02948523, "epoch": 0.2811363294754246, "flos": 20919078973440.0, "grad_norm": 1.9283078401930889, "language_loss": 0.70211101, "learning_rate": 3.374077235607968e-06, "loss": 0.7240898, "num_input_tokens_seen": 100950505, "step": 4676, "time_per_iteration": 2.59861159324646 }, { "auxiliary_loss_clip": 0.01144319, "auxiliary_loss_mlp": 0.01042342, "balance_loss_clip": 1.05481541, "balance_loss_mlp": 1.02517629, "epoch": 0.28119645272809257, "flos": 20594841880320.0, "grad_norm": 1.6132814643409343, "language_loss": 0.7048012, "learning_rate": 3.3737942185582487e-06, "loss": 0.72666782, "num_input_tokens_seen": 100968790, "step": 4677, "time_per_iteration": 2.6064453125 }, { "auxiliary_loss_clip": 0.01125461, "auxiliary_loss_mlp": 0.01047839, "balance_loss_clip": 1.04849231, "balance_loss_mlp": 1.02783537, "epoch": 0.28125657598076054, "flos": 25337420248320.0, "grad_norm": 1.5663130673511025, "language_loss": 0.639018, "learning_rate": 3.3735111494141153e-06, "loss": 0.66075099, "num_input_tokens_seen": 100990205, "step": 4678, "time_per_iteration": 2.6609809398651123 }, { "auxiliary_loss_clip": 0.01134563, "auxiliary_loss_mlp": 0.01050264, "balance_loss_clip": 1.05104351, "balance_loss_mlp": 1.03315794, "epoch": 0.2813166992334285, "flos": 24827093769600.0, "grad_norm": 5.827919401990006, "language_loss": 0.70568973, "learning_rate": 3.3732280281863013e-06, "loss": 0.72753799, "num_input_tokens_seen": 101009815, "step": 4679, "time_per_iteration": 2.7039310932159424 }, { "auxiliary_loss_clip": 0.01134537, "auxiliary_loss_mlp": 0.01040896, "balance_loss_clip": 1.05048108, "balance_loss_mlp": 1.02283621, "epoch": 0.2813768224860965, "flos": 21760753438080.0, "grad_norm": 2.2073803144691255, "language_loss": 0.74848735, "learning_rate": 3.3729448548855422e-06, "loss": 0.77024174, "num_input_tokens_seen": 101026780, "step": 4680, "time_per_iteration": 2.6897919178009033 }, { "auxiliary_loss_clip": 0.01149427, "auxiliary_loss_mlp": 0.01039945, "balance_loss_clip": 1.05414999, "balance_loss_mlp": 1.02363694, "epoch": 0.2814369457387645, "flos": 24316803204480.0, "grad_norm": 2.2743778704427267, "language_loss": 0.7719292, "learning_rate": 3.3726616295225774e-06, "loss": 0.793823, "num_input_tokens_seen": 101046215, "step": 4681, "time_per_iteration": 2.6178102493286133 }, { "auxiliary_loss_clip": 0.01138594, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.05333447, "balance_loss_mlp": 1.01864183, "epoch": 0.28149706899143245, "flos": 18515326872960.0, "grad_norm": 2.5230258038951723, "language_loss": 0.74197519, "learning_rate": 3.372378352108146e-06, "loss": 0.76373291, "num_input_tokens_seen": 101063365, "step": 4682, "time_per_iteration": 2.5892751216888428 }, { "auxiliary_loss_clip": 0.01145225, "auxiliary_loss_mlp": 0.01043744, "balance_loss_clip": 1.05250573, "balance_loss_mlp": 1.02619636, "epoch": 0.2815571922441004, "flos": 24863255786880.0, "grad_norm": 1.5493572746384299, "language_loss": 0.81096184, "learning_rate": 3.3720950226529894e-06, "loss": 0.83285153, "num_input_tokens_seen": 101083835, "step": 4683, "time_per_iteration": 2.6272947788238525 }, { "auxiliary_loss_clip": 0.01089095, "auxiliary_loss_mlp": 0.01048071, "balance_loss_clip": 1.04691851, "balance_loss_mlp": 1.02916479, "epoch": 0.2816173154967684, "flos": 19901622326400.0, "grad_norm": 1.5570192452178944, "language_loss": 0.76437271, "learning_rate": 3.371811641167852e-06, "loss": 0.78574431, "num_input_tokens_seen": 101101740, "step": 4684, "time_per_iteration": 2.7542243003845215 }, { "auxiliary_loss_clip": 0.01090035, "auxiliary_loss_mlp": 0.01043858, "balance_loss_clip": 1.04495156, "balance_loss_mlp": 1.02659678, "epoch": 0.28167743874943635, "flos": 17491333950720.0, "grad_norm": 3.250404845672824, "language_loss": 0.76287019, "learning_rate": 3.3715282076634807e-06, "loss": 0.78420913, "num_input_tokens_seen": 101120480, "step": 4685, "time_per_iteration": 2.724954843521118 }, { "auxiliary_loss_clip": 0.01116834, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.05042076, "balance_loss_mlp": 1.02820265, "epoch": 0.2817375620021043, "flos": 25302120157440.0, "grad_norm": 1.80192319881426, "language_loss": 0.75822544, "learning_rate": 3.3712447221506218e-06, "loss": 0.77984667, "num_input_tokens_seen": 101142910, "step": 4686, "time_per_iteration": 2.7375218868255615 }, { "auxiliary_loss_clip": 0.01113965, "auxiliary_loss_mlp": 0.01054481, "balance_loss_clip": 1.04542971, "balance_loss_mlp": 1.03530002, "epoch": 0.2817976852547723, "flos": 18693227957760.0, "grad_norm": 5.9534421572259095, "language_loss": 0.62298906, "learning_rate": 3.370961184640025e-06, "loss": 0.64467359, "num_input_tokens_seen": 101160030, "step": 4687, "time_per_iteration": 2.7273154258728027 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01052662, "balance_loss_clip": 1.05122471, "balance_loss_mlp": 1.03501928, "epoch": 0.28185780850744024, "flos": 22742263549440.0, "grad_norm": 3.512847657951686, "language_loss": 0.76642895, "learning_rate": 3.3706775951424433e-06, "loss": 0.78820634, "num_input_tokens_seen": 101177675, "step": 4688, "time_per_iteration": 2.6962485313415527 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01038903, "balance_loss_clip": 1.050143, "balance_loss_mlp": 1.0222497, "epoch": 0.2819179317601082, "flos": 14933919467520.0, "grad_norm": 2.029299855452059, "language_loss": 0.78377295, "learning_rate": 3.37039395366863e-06, "loss": 0.80527258, "num_input_tokens_seen": 101192225, "step": 4689, "time_per_iteration": 2.7611160278320312 }, { "auxiliary_loss_clip": 0.01101002, "auxiliary_loss_mlp": 0.01042004, "balance_loss_clip": 1.044873, "balance_loss_mlp": 1.02469492, "epoch": 0.2819780550127762, "flos": 23145325038720.0, "grad_norm": 1.6619977361488503, "language_loss": 0.78151089, "learning_rate": 3.37011026022934e-06, "loss": 0.80294096, "num_input_tokens_seen": 101210870, "step": 4690, "time_per_iteration": 2.8166253566741943 }, { "auxiliary_loss_clip": 0.01144307, "auxiliary_loss_mlp": 0.0077562, "balance_loss_clip": 1.04972041, "balance_loss_mlp": 1.00065684, "epoch": 0.28203817826544414, "flos": 21616356764160.0, "grad_norm": 1.8251699545436237, "language_loss": 0.87835205, "learning_rate": 3.369826514835332e-06, "loss": 0.8975513, "num_input_tokens_seen": 101229965, "step": 4691, "time_per_iteration": 2.755540609359741 }, { "auxiliary_loss_clip": 0.01120177, "auxiliary_loss_mlp": 0.01057161, "balance_loss_clip": 1.0480932, "balance_loss_mlp": 1.03866005, "epoch": 0.2820983015181121, "flos": 24026788794240.0, "grad_norm": 2.0164591316320086, "language_loss": 0.81783265, "learning_rate": 3.3695427174973654e-06, "loss": 0.83960605, "num_input_tokens_seen": 101250980, "step": 4692, "time_per_iteration": 2.766826868057251 }, { "auxiliary_loss_clip": 0.01108273, "auxiliary_loss_mlp": 0.01044592, "balance_loss_clip": 1.05000174, "balance_loss_mlp": 1.02690101, "epoch": 0.2821584247707801, "flos": 30007925976960.0, "grad_norm": 1.5153062693168577, "language_loss": 0.74520338, "learning_rate": 3.3692588682262022e-06, "loss": 0.76673198, "num_input_tokens_seen": 101273335, "step": 4693, "time_per_iteration": 2.833829402923584 }, { "auxiliary_loss_clip": 0.01107692, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.04546356, "balance_loss_mlp": 1.02018356, "epoch": 0.2822185480234481, "flos": 21396762967680.0, "grad_norm": 1.6139880108231377, "language_loss": 0.77396065, "learning_rate": 3.3689749670326046e-06, "loss": 0.79542327, "num_input_tokens_seen": 101292110, "step": 4694, "time_per_iteration": 2.6783409118652344 }, { "auxiliary_loss_clip": 0.01131719, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.05066633, "balance_loss_mlp": 1.02610695, "epoch": 0.28227867127611606, "flos": 27452809964160.0, "grad_norm": 2.1245298140537354, "language_loss": 0.67171001, "learning_rate": 3.3686910139273392e-06, "loss": 0.69346148, "num_input_tokens_seen": 101312815, "step": 4695, "time_per_iteration": 2.657508373260498 }, { "auxiliary_loss_clip": 0.01129418, "auxiliary_loss_mlp": 0.01047718, "balance_loss_clip": 1.05160189, "balance_loss_mlp": 1.02857292, "epoch": 0.282338794528784, "flos": 22593736811520.0, "grad_norm": 2.1132011275006297, "language_loss": 0.75410438, "learning_rate": 3.3684070089211736e-06, "loss": 0.77587581, "num_input_tokens_seen": 101329045, "step": 4696, "time_per_iteration": 2.6419622898101807 }, { "auxiliary_loss_clip": 0.01108873, "auxiliary_loss_mlp": 0.01050131, "balance_loss_clip": 1.04857826, "balance_loss_mlp": 1.03241634, "epoch": 0.282398917781452, "flos": 42010923386880.0, "grad_norm": 1.6547739374499746, "language_loss": 0.62379837, "learning_rate": 3.368122952024877e-06, "loss": 0.64538848, "num_input_tokens_seen": 101352715, "step": 4697, "time_per_iteration": 2.863271951675415 }, { "auxiliary_loss_clip": 0.01098306, "auxiliary_loss_mlp": 0.01038026, "balance_loss_clip": 1.04702902, "balance_loss_mlp": 1.0213964, "epoch": 0.28245904103411995, "flos": 23224724052480.0, "grad_norm": 1.3648463295211168, "language_loss": 0.73178887, "learning_rate": 3.3678388432492214e-06, "loss": 0.75315219, "num_input_tokens_seen": 101374640, "step": 4698, "time_per_iteration": 2.7437515258789062 }, { "auxiliary_loss_clip": 0.01138661, "auxiliary_loss_mlp": 0.01044687, "balance_loss_clip": 1.04783368, "balance_loss_mlp": 1.02820039, "epoch": 0.2825191642867879, "flos": 25374623760000.0, "grad_norm": 1.73143255072412, "language_loss": 0.75260699, "learning_rate": 3.3675546826049788e-06, "loss": 0.77444041, "num_input_tokens_seen": 101393595, "step": 4699, "time_per_iteration": 2.6352651119232178 }, { "auxiliary_loss_clip": 0.01130406, "auxiliary_loss_mlp": 0.01042781, "balance_loss_clip": 1.04642487, "balance_loss_mlp": 1.02379072, "epoch": 0.2825792875394559, "flos": 17236799199360.0, "grad_norm": 2.939003683920128, "language_loss": 0.80683541, "learning_rate": 3.3672704701029265e-06, "loss": 0.82856727, "num_input_tokens_seen": 101409265, "step": 4700, "time_per_iteration": 2.597543478012085 }, { "auxiliary_loss_clip": 0.01118395, "auxiliary_loss_mlp": 0.01052226, "balance_loss_clip": 1.05168593, "balance_loss_mlp": 1.03699148, "epoch": 0.28263941079212385, "flos": 26723967096960.0, "grad_norm": 1.8973185440197946, "language_loss": 0.82377315, "learning_rate": 3.3669862057538402e-06, "loss": 0.84547931, "num_input_tokens_seen": 101428365, "step": 4701, "time_per_iteration": 2.6613359451293945 }, { "auxiliary_loss_clip": 0.01079732, "auxiliary_loss_mlp": 0.01044955, "balance_loss_clip": 1.04725862, "balance_loss_mlp": 1.02782488, "epoch": 0.2826995340447918, "flos": 25921327737600.0, "grad_norm": 2.6106451650427913, "language_loss": 0.72911763, "learning_rate": 3.3667018895685004e-06, "loss": 0.75036454, "num_input_tokens_seen": 101447280, "step": 4702, "time_per_iteration": 2.927156448364258 }, { "auxiliary_loss_clip": 0.0114189, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.05118549, "balance_loss_mlp": 1.02240694, "epoch": 0.2827596572974598, "flos": 22379709623040.0, "grad_norm": 2.1110096252533754, "language_loss": 0.78497601, "learning_rate": 3.3664175215576886e-06, "loss": 0.80678773, "num_input_tokens_seen": 101465435, "step": 4703, "time_per_iteration": 2.603217124938965 }, { "auxiliary_loss_clip": 0.01115372, "auxiliary_loss_mlp": 0.01049407, "balance_loss_clip": 1.04668045, "balance_loss_mlp": 1.03100109, "epoch": 0.28281978055012774, "flos": 33547137880320.0, "grad_norm": 1.6207045759516274, "language_loss": 0.69310379, "learning_rate": 3.3661331017321867e-06, "loss": 0.71475154, "num_input_tokens_seen": 101486355, "step": 4704, "time_per_iteration": 2.737741708755493 }, { "auxiliary_loss_clip": 0.0110991, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.05106401, "balance_loss_mlp": 1.02204967, "epoch": 0.2828799038027957, "flos": 23440870143360.0, "grad_norm": 2.0629797483939893, "language_loss": 0.70487976, "learning_rate": 3.3658486301027807e-06, "loss": 0.72637939, "num_input_tokens_seen": 101505875, "step": 4705, "time_per_iteration": 2.7810943126678467 }, { "auxiliary_loss_clip": 0.01051193, "auxiliary_loss_mlp": 0.01011527, "balance_loss_clip": 1.02885246, "balance_loss_mlp": 1.00905895, "epoch": 0.2829400270554637, "flos": 69873690251520.0, "grad_norm": 0.7331461257989402, "language_loss": 0.59262896, "learning_rate": 3.3655641066802577e-06, "loss": 0.6132561, "num_input_tokens_seen": 101565045, "step": 4706, "time_per_iteration": 3.223500967025757 }, { "auxiliary_loss_clip": 0.01117208, "auxiliary_loss_mlp": 0.01042955, "balance_loss_clip": 1.04750693, "balance_loss_mlp": 1.02711248, "epoch": 0.2830001503081317, "flos": 24789028331520.0, "grad_norm": 1.4542369915695899, "language_loss": 0.82314008, "learning_rate": 3.365279531475407e-06, "loss": 0.84474176, "num_input_tokens_seen": 101585825, "step": 4707, "time_per_iteration": 5.995711326599121 }, { "auxiliary_loss_clip": 0.0112325, "auxiliary_loss_mlp": 0.01043198, "balance_loss_clip": 1.04714823, "balance_loss_mlp": 1.02451742, "epoch": 0.28306027356079966, "flos": 27669387018240.0, "grad_norm": 1.6937335335925583, "language_loss": 0.80196846, "learning_rate": 3.36499490449902e-06, "loss": 0.82363296, "num_input_tokens_seen": 101606105, "step": 4708, "time_per_iteration": 2.730365753173828 }, { "auxiliary_loss_clip": 0.01036827, "auxiliary_loss_mlp": 0.01004906, "balance_loss_clip": 1.0241586, "balance_loss_mlp": 1.00274837, "epoch": 0.2831203968134676, "flos": 60527938199040.0, "grad_norm": 0.8797441515413378, "language_loss": 0.62768304, "learning_rate": 3.3647102257618895e-06, "loss": 0.64810038, "num_input_tokens_seen": 101656875, "step": 4709, "time_per_iteration": 3.0734164714813232 }, { "auxiliary_loss_clip": 0.01113275, "auxiliary_loss_mlp": 0.01045412, "balance_loss_clip": 1.04819441, "balance_loss_mlp": 1.02711344, "epoch": 0.2831805200661356, "flos": 22054790171520.0, "grad_norm": 1.4416556980461737, "language_loss": 0.74092108, "learning_rate": 3.3644254952748103e-06, "loss": 0.76250798, "num_input_tokens_seen": 101676225, "step": 4710, "time_per_iteration": 4.214928388595581 }, { "auxiliary_loss_clip": 0.01108833, "auxiliary_loss_mlp": 0.01058426, "balance_loss_clip": 1.04568553, "balance_loss_mlp": 1.0393765, "epoch": 0.28324064331880355, "flos": 22600668136320.0, "grad_norm": 2.192994300890924, "language_loss": 0.7857554, "learning_rate": 3.364140713048579e-06, "loss": 0.80742794, "num_input_tokens_seen": 101693710, "step": 4711, "time_per_iteration": 2.9334824085235596 }, { "auxiliary_loss_clip": 0.01135754, "auxiliary_loss_mlp": 0.00775746, "balance_loss_clip": 1.05244637, "balance_loss_mlp": 1.00072622, "epoch": 0.2833007665714715, "flos": 30404127968640.0, "grad_norm": 2.328121287113732, "language_loss": 0.70832199, "learning_rate": 3.363855879093996e-06, "loss": 0.72743702, "num_input_tokens_seen": 101714010, "step": 4712, "time_per_iteration": 2.8570704460144043 }, { "auxiliary_loss_clip": 0.0114641, "auxiliary_loss_mlp": 0.01050688, "balance_loss_clip": 1.05171633, "balance_loss_mlp": 1.03284216, "epoch": 0.2833608898241395, "flos": 23549499849600.0, "grad_norm": 2.3843934106626157, "language_loss": 0.81725228, "learning_rate": 3.3635709934218605e-06, "loss": 0.83922327, "num_input_tokens_seen": 101732995, "step": 4713, "time_per_iteration": 4.343034029006958 }, { "auxiliary_loss_clip": 0.01120505, "auxiliary_loss_mlp": 0.01048075, "balance_loss_clip": 1.05054498, "balance_loss_mlp": 1.03044379, "epoch": 0.28342101307680745, "flos": 20266726118400.0, "grad_norm": 1.7964609324305687, "language_loss": 0.75316995, "learning_rate": 3.3632860560429766e-06, "loss": 0.77485573, "num_input_tokens_seen": 101751385, "step": 4714, "time_per_iteration": 2.656919479370117 }, { "auxiliary_loss_clip": 0.01129168, "auxiliary_loss_mlp": 0.01051102, "balance_loss_clip": 1.050372, "balance_loss_mlp": 1.03424633, "epoch": 0.2834811363294754, "flos": 30847050576000.0, "grad_norm": 1.4082553086863412, "language_loss": 0.78457153, "learning_rate": 3.3630010669681494e-06, "loss": 0.80637431, "num_input_tokens_seen": 101773825, "step": 4715, "time_per_iteration": 2.721869468688965 }, { "auxiliary_loss_clip": 0.01117334, "auxiliary_loss_mlp": 0.01046437, "balance_loss_clip": 1.04618871, "balance_loss_mlp": 1.0294199, "epoch": 0.2835412595821434, "flos": 22711021695360.0, "grad_norm": 1.791082386208426, "language_loss": 0.73825723, "learning_rate": 3.3627160262081845e-06, "loss": 0.75989497, "num_input_tokens_seen": 101791920, "step": 4716, "time_per_iteration": 2.689964532852173 }, { "auxiliary_loss_clip": 0.0111778, "auxiliary_loss_mlp": 0.01054857, "balance_loss_clip": 1.04580188, "balance_loss_mlp": 1.03397131, "epoch": 0.28360138283481134, "flos": 18077719478400.0, "grad_norm": 2.1425450832247868, "language_loss": 0.74293232, "learning_rate": 3.3624309337738917e-06, "loss": 0.76465869, "num_input_tokens_seen": 101809515, "step": 4717, "time_per_iteration": 2.653107166290283 }, { "auxiliary_loss_clip": 0.01112398, "auxiliary_loss_mlp": 0.01052347, "balance_loss_clip": 1.04736984, "balance_loss_mlp": 1.03526437, "epoch": 0.2836615060874793, "flos": 17854785717120.0, "grad_norm": 1.96982951308544, "language_loss": 0.67022157, "learning_rate": 3.3621457896760813e-06, "loss": 0.69186902, "num_input_tokens_seen": 101827735, "step": 4718, "time_per_iteration": 2.7287323474884033 }, { "auxiliary_loss_clip": 0.01119996, "auxiliary_loss_mlp": 0.01052629, "balance_loss_clip": 1.04606366, "balance_loss_mlp": 1.03479528, "epoch": 0.2837216293401473, "flos": 25740302169600.0, "grad_norm": 1.7409435577223806, "language_loss": 0.72453725, "learning_rate": 3.361860593925566e-06, "loss": 0.7462635, "num_input_tokens_seen": 101845970, "step": 4719, "time_per_iteration": 2.7101874351501465 }, { "auxiliary_loss_clip": 0.01129472, "auxiliary_loss_mlp": 0.01044, "balance_loss_clip": 1.04724336, "balance_loss_mlp": 1.02711964, "epoch": 0.2837817525928153, "flos": 20923532259840.0, "grad_norm": 1.8163652523997504, "language_loss": 0.80517805, "learning_rate": 3.3615753465331605e-06, "loss": 0.82691276, "num_input_tokens_seen": 101865040, "step": 4720, "time_per_iteration": 2.630380392074585 }, { "auxiliary_loss_clip": 0.01130938, "auxiliary_loss_mlp": 0.01047274, "balance_loss_clip": 1.04798317, "balance_loss_mlp": 1.02935672, "epoch": 0.28384187584548326, "flos": 18916700423040.0, "grad_norm": 2.340232614040239, "language_loss": 0.79146183, "learning_rate": 3.3612900475096817e-06, "loss": 0.81324387, "num_input_tokens_seen": 101883735, "step": 4721, "time_per_iteration": 2.6779117584228516 }, { "auxiliary_loss_clip": 0.01091324, "auxiliary_loss_mlp": 0.00778191, "balance_loss_clip": 1.04653215, "balance_loss_mlp": 1.00074911, "epoch": 0.2839019990981512, "flos": 27343964776320.0, "grad_norm": 1.7859505861297744, "language_loss": 0.82514244, "learning_rate": 3.3610046968659474e-06, "loss": 0.84383762, "num_input_tokens_seen": 101903025, "step": 4722, "time_per_iteration": 2.8601412773132324 }, { "auxiliary_loss_clip": 0.0114735, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.05396807, "balance_loss_mlp": 1.02641416, "epoch": 0.2839621223508192, "flos": 18114312458880.0, "grad_norm": 1.8976073667217488, "language_loss": 0.70048773, "learning_rate": 3.3607192946127785e-06, "loss": 0.72239512, "num_input_tokens_seen": 101922255, "step": 4723, "time_per_iteration": 2.6259007453918457 }, { "auxiliary_loss_clip": 0.0111455, "auxiliary_loss_mlp": 0.01051142, "balance_loss_clip": 1.04818106, "balance_loss_mlp": 1.03247368, "epoch": 0.28402224560348716, "flos": 26358360514560.0, "grad_norm": 1.540245146059843, "language_loss": 0.78676599, "learning_rate": 3.360433840760998e-06, "loss": 0.80842292, "num_input_tokens_seen": 101943100, "step": 4724, "time_per_iteration": 2.7364859580993652 }, { "auxiliary_loss_clip": 0.01116323, "auxiliary_loss_mlp": 0.01063488, "balance_loss_clip": 1.04846072, "balance_loss_mlp": 1.04442668, "epoch": 0.2840823688561551, "flos": 24060795995520.0, "grad_norm": 1.6728910575536384, "language_loss": 0.92433345, "learning_rate": 3.36014833532143e-06, "loss": 0.94613159, "num_input_tokens_seen": 101963160, "step": 4725, "time_per_iteration": 2.653244733810425 }, { "auxiliary_loss_clip": 0.01137335, "auxiliary_loss_mlp": 0.01047317, "balance_loss_clip": 1.05249703, "balance_loss_mlp": 1.02951932, "epoch": 0.2841424921088231, "flos": 29459821368960.0, "grad_norm": 1.5774329387244128, "language_loss": 0.88881439, "learning_rate": 3.3598627783049e-06, "loss": 0.91066098, "num_input_tokens_seen": 101984300, "step": 4726, "time_per_iteration": 2.6815872192382812 }, { "auxiliary_loss_clip": 0.01132666, "auxiliary_loss_mlp": 0.01049768, "balance_loss_clip": 1.05290008, "balance_loss_mlp": 1.03223181, "epoch": 0.28420261536149105, "flos": 48100367053440.0, "grad_norm": 2.008368257744288, "language_loss": 0.78913373, "learning_rate": 3.359577169722238e-06, "loss": 0.81095803, "num_input_tokens_seen": 102005765, "step": 4727, "time_per_iteration": 2.8668875694274902 }, { "auxiliary_loss_clip": 0.01134036, "auxiliary_loss_mlp": 0.01041813, "balance_loss_clip": 1.05225933, "balance_loss_mlp": 1.02603006, "epoch": 0.284262738614159, "flos": 25666146541440.0, "grad_norm": 2.1196929739552433, "language_loss": 0.66590458, "learning_rate": 3.3592915095842733e-06, "loss": 0.68766308, "num_input_tokens_seen": 102022755, "step": 4728, "time_per_iteration": 2.6871252059936523 }, { "auxiliary_loss_clip": 0.01111522, "auxiliary_loss_mlp": 0.01054966, "balance_loss_clip": 1.04948676, "balance_loss_mlp": 1.03766847, "epoch": 0.284322861866827, "flos": 19718980646400.0, "grad_norm": 1.7247901443745783, "language_loss": 0.76369143, "learning_rate": 3.3590057979018386e-06, "loss": 0.78535628, "num_input_tokens_seen": 102041850, "step": 4729, "time_per_iteration": 2.671739339828491 }, { "auxiliary_loss_clip": 0.01121198, "auxiliary_loss_mlp": 0.01054506, "balance_loss_clip": 1.05166233, "balance_loss_mlp": 1.03707767, "epoch": 0.28438298511949495, "flos": 23915250086400.0, "grad_norm": 1.8284571123244682, "language_loss": 0.67062581, "learning_rate": 3.3587200346857674e-06, "loss": 0.69238287, "num_input_tokens_seen": 102059500, "step": 4730, "time_per_iteration": 2.6957883834838867 }, { "auxiliary_loss_clip": 0.01120949, "auxiliary_loss_mlp": 0.01040777, "balance_loss_clip": 1.05008078, "balance_loss_mlp": 1.02283621, "epoch": 0.2844431083721629, "flos": 26067340523520.0, "grad_norm": 1.8142087038783352, "language_loss": 0.7456513, "learning_rate": 3.3584342199468965e-06, "loss": 0.76726854, "num_input_tokens_seen": 102080460, "step": 4731, "time_per_iteration": 2.7621212005615234 }, { "auxiliary_loss_clip": 0.01100065, "auxiliary_loss_mlp": 0.0104061, "balance_loss_clip": 1.04959893, "balance_loss_mlp": 1.02338386, "epoch": 0.2845032316248309, "flos": 25810435474560.0, "grad_norm": 1.4533231430590194, "language_loss": 0.83672202, "learning_rate": 3.3581483536960638e-06, "loss": 0.85812879, "num_input_tokens_seen": 102100950, "step": 4732, "time_per_iteration": 2.807701587677002 }, { "auxiliary_loss_clip": 0.01135958, "auxiliary_loss_mlp": 0.01049006, "balance_loss_clip": 1.05248308, "balance_loss_mlp": 1.03040957, "epoch": 0.2845633548774989, "flos": 19823192979840.0, "grad_norm": 2.88493918484894, "language_loss": 0.78892827, "learning_rate": 3.357862435944109e-06, "loss": 0.8107779, "num_input_tokens_seen": 102119345, "step": 4733, "time_per_iteration": 2.66524076461792 }, { "auxiliary_loss_clip": 0.01153472, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.05533004, "balance_loss_mlp": 1.02984452, "epoch": 0.28462347813016686, "flos": 23182815859200.0, "grad_norm": 2.2364375024988776, "language_loss": 0.71791029, "learning_rate": 3.357576466701875e-06, "loss": 0.73992205, "num_input_tokens_seen": 102139050, "step": 4734, "time_per_iteration": 2.6941637992858887 }, { "auxiliary_loss_clip": 0.01125779, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.05455363, "balance_loss_mlp": 1.01766825, "epoch": 0.2846836013828348, "flos": 18660477732480.0, "grad_norm": 1.8491255089189595, "language_loss": 0.73942113, "learning_rate": 3.3572904459802056e-06, "loss": 0.76103032, "num_input_tokens_seen": 102157935, "step": 4735, "time_per_iteration": 2.736027956008911 }, { "auxiliary_loss_clip": 0.01124029, "auxiliary_loss_mlp": 0.01048016, "balance_loss_clip": 1.05248201, "balance_loss_mlp": 1.03177929, "epoch": 0.2847437246355028, "flos": 14173511523840.0, "grad_norm": 1.7217440703764713, "language_loss": 0.79690897, "learning_rate": 3.357004373789946e-06, "loss": 0.81862932, "num_input_tokens_seen": 102175325, "step": 4736, "time_per_iteration": 2.7069075107574463 }, { "auxiliary_loss_clip": 0.01152237, "auxiliary_loss_mlp": 0.01048515, "balance_loss_clip": 1.0569663, "balance_loss_mlp": 1.03019249, "epoch": 0.28480384788817076, "flos": 29278364837760.0, "grad_norm": 2.5331890881723327, "language_loss": 0.59956342, "learning_rate": 3.3567182501419453e-06, "loss": 0.62157094, "num_input_tokens_seen": 102196625, "step": 4737, "time_per_iteration": 2.718904972076416 }, { "auxiliary_loss_clip": 0.01131951, "auxiliary_loss_mlp": 0.0104121, "balance_loss_clip": 1.05099404, "balance_loss_mlp": 1.02437758, "epoch": 0.2848639711408387, "flos": 22601314581120.0, "grad_norm": 1.8696274848062555, "language_loss": 0.86556888, "learning_rate": 3.356432075047052e-06, "loss": 0.88730049, "num_input_tokens_seen": 102214975, "step": 4738, "time_per_iteration": 2.719223976135254 }, { "auxiliary_loss_clip": 0.01127313, "auxiliary_loss_mlp": 0.01051123, "balance_loss_clip": 1.05986989, "balance_loss_mlp": 1.03207278, "epoch": 0.2849240943935067, "flos": 17599460866560.0, "grad_norm": 2.688438536338364, "language_loss": 0.90028232, "learning_rate": 3.356145848516118e-06, "loss": 0.92206669, "num_input_tokens_seen": 102231885, "step": 4739, "time_per_iteration": 2.674363851547241 }, { "auxiliary_loss_clip": 0.01136036, "auxiliary_loss_mlp": 0.01044124, "balance_loss_clip": 1.05522013, "balance_loss_mlp": 1.02627802, "epoch": 0.28498421764617465, "flos": 24862573428480.0, "grad_norm": 1.41783833400805, "language_loss": 0.7216897, "learning_rate": 3.355859570559998e-06, "loss": 0.74349129, "num_input_tokens_seen": 102252725, "step": 4740, "time_per_iteration": 2.688591957092285 }, { "auxiliary_loss_clip": 0.01130927, "auxiliary_loss_mlp": 0.010392, "balance_loss_clip": 1.05868936, "balance_loss_mlp": 1.02229571, "epoch": 0.2850443408988426, "flos": 22782555630720.0, "grad_norm": 3.325446081949271, "language_loss": 0.77782756, "learning_rate": 3.3555732411895477e-06, "loss": 0.79952878, "num_input_tokens_seen": 102271730, "step": 4741, "time_per_iteration": 2.6747119426727295 }, { "auxiliary_loss_clip": 0.01107503, "auxiliary_loss_mlp": 0.01048819, "balance_loss_clip": 1.04771924, "balance_loss_mlp": 1.03065109, "epoch": 0.2851044641515106, "flos": 18844053166080.0, "grad_norm": 1.6557809034578879, "language_loss": 0.75952959, "learning_rate": 3.3552868604156235e-06, "loss": 0.78109288, "num_input_tokens_seen": 102291325, "step": 4742, "time_per_iteration": 2.7584095001220703 }, { "auxiliary_loss_clip": 0.01151989, "auxiliary_loss_mlp": 0.01057399, "balance_loss_clip": 1.05341601, "balance_loss_mlp": 1.03720486, "epoch": 0.28516458740417855, "flos": 18880502492160.0, "grad_norm": 2.0538587827096713, "language_loss": 0.57376975, "learning_rate": 3.355000428249086e-06, "loss": 0.59586358, "num_input_tokens_seen": 102309000, "step": 4743, "time_per_iteration": 2.621572494506836 }, { "auxiliary_loss_clip": 0.01116239, "auxiliary_loss_mlp": 0.01056356, "balance_loss_clip": 1.05067348, "balance_loss_mlp": 1.03747356, "epoch": 0.2852247106568465, "flos": 25299821687040.0, "grad_norm": 1.6259491452975234, "language_loss": 0.74499846, "learning_rate": 3.354713944700797e-06, "loss": 0.76672441, "num_input_tokens_seen": 102329240, "step": 4744, "time_per_iteration": 2.8029959201812744 }, { "auxiliary_loss_clip": 0.01132324, "auxiliary_loss_mlp": 0.01047205, "balance_loss_clip": 1.05420351, "balance_loss_mlp": 1.03014612, "epoch": 0.2852848339095145, "flos": 11655383541120.0, "grad_norm": 2.4725597828733563, "language_loss": 0.77258176, "learning_rate": 3.3544274097816185e-06, "loss": 0.79437709, "num_input_tokens_seen": 102344440, "step": 4745, "time_per_iteration": 2.5961194038391113 }, { "auxiliary_loss_clip": 0.01124474, "auxiliary_loss_mlp": 0.01040571, "balance_loss_clip": 1.05262041, "balance_loss_mlp": 1.02427554, "epoch": 0.2853449571621825, "flos": 12933228856320.0, "grad_norm": 1.9164884333366974, "language_loss": 0.8275286, "learning_rate": 3.3541408235024173e-06, "loss": 0.84917903, "num_input_tokens_seen": 102360985, "step": 4746, "time_per_iteration": 4.211855411529541 }, { "auxiliary_loss_clip": 0.01101779, "auxiliary_loss_mlp": 0.01043428, "balance_loss_clip": 1.0488627, "balance_loss_mlp": 1.02497482, "epoch": 0.28540508041485046, "flos": 20010575255040.0, "grad_norm": 1.8281951571940926, "language_loss": 0.79537141, "learning_rate": 3.3538541858740604e-06, "loss": 0.81682348, "num_input_tokens_seen": 102380320, "step": 4747, "time_per_iteration": 4.276613712310791 }, { "auxiliary_loss_clip": 0.01046154, "auxiliary_loss_mlp": 0.01017989, "balance_loss_clip": 1.02844512, "balance_loss_mlp": 1.01572371, "epoch": 0.28546520366751843, "flos": 68139349966080.0, "grad_norm": 0.7754147669680839, "language_loss": 0.6049211, "learning_rate": 3.3535674969074173e-06, "loss": 0.62556255, "num_input_tokens_seen": 102439140, "step": 4748, "time_per_iteration": 3.0963478088378906 }, { "auxiliary_loss_clip": 0.01148062, "auxiliary_loss_mlp": 0.01048043, "balance_loss_clip": 1.05367923, "balance_loss_mlp": 1.03001821, "epoch": 0.2855253269201864, "flos": 13251540205440.0, "grad_norm": 2.39914017508816, "language_loss": 0.8061412, "learning_rate": 3.3532807566133592e-06, "loss": 0.82810223, "num_input_tokens_seen": 102450990, "step": 4749, "time_per_iteration": 4.199607610702515 }, { "auxiliary_loss_clip": 0.01135936, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.05160487, "balance_loss_mlp": 1.02788317, "epoch": 0.28558545017285436, "flos": 28620876337920.0, "grad_norm": 1.92101956988616, "language_loss": 0.70763719, "learning_rate": 3.3529939650027587e-06, "loss": 0.72944903, "num_input_tokens_seen": 102471820, "step": 4750, "time_per_iteration": 2.6975722312927246 }, { "auxiliary_loss_clip": 0.01132057, "auxiliary_loss_mlp": 0.0104367, "balance_loss_clip": 1.05308008, "balance_loss_mlp": 1.02660573, "epoch": 0.2856455734255223, "flos": 34130470752000.0, "grad_norm": 1.619747991653998, "language_loss": 0.81983078, "learning_rate": 3.3527071220864917e-06, "loss": 0.84158808, "num_input_tokens_seen": 102492625, "step": 4751, "time_per_iteration": 2.685194969177246 }, { "auxiliary_loss_clip": 0.01146027, "auxiliary_loss_mlp": 0.01046872, "balance_loss_clip": 1.0541997, "balance_loss_mlp": 1.03009951, "epoch": 0.2857056966781903, "flos": 39786149779200.0, "grad_norm": 2.1857777553010203, "language_loss": 0.80359828, "learning_rate": 3.3524202278754353e-06, "loss": 0.82552731, "num_input_tokens_seen": 102514145, "step": 4752, "time_per_iteration": 4.363154649734497 }, { "auxiliary_loss_clip": 0.01130862, "auxiliary_loss_mlp": 0.010456, "balance_loss_clip": 1.04920304, "balance_loss_mlp": 1.02675319, "epoch": 0.28576581993085826, "flos": 21872292145920.0, "grad_norm": 2.612706759191024, "language_loss": 0.78674287, "learning_rate": 3.3521332823804676e-06, "loss": 0.8085075, "num_input_tokens_seen": 102532365, "step": 4753, "time_per_iteration": 2.6128499507904053 }, { "auxiliary_loss_clip": 0.0114991, "auxiliary_loss_mlp": 0.01051658, "balance_loss_clip": 1.05356765, "balance_loss_mlp": 1.03166628, "epoch": 0.2858259431835262, "flos": 19091656592640.0, "grad_norm": 3.5161743537336596, "language_loss": 0.8947711, "learning_rate": 3.3518462856124704e-06, "loss": 0.91678679, "num_input_tokens_seen": 102548425, "step": 4754, "time_per_iteration": 2.5410687923431396 }, { "auxiliary_loss_clip": 0.01130155, "auxiliary_loss_mlp": 0.010468, "balance_loss_clip": 1.05048347, "balance_loss_mlp": 1.03026593, "epoch": 0.2858860664361942, "flos": 20334309557760.0, "grad_norm": 2.3617926288322724, "language_loss": 0.82039523, "learning_rate": 3.3515592375823267e-06, "loss": 0.84216481, "num_input_tokens_seen": 102566370, "step": 4755, "time_per_iteration": 2.6514527797698975 }, { "auxiliary_loss_clip": 0.01098878, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.04732597, "balance_loss_mlp": 1.03233767, "epoch": 0.28594618968886215, "flos": 24461738582400.0, "grad_norm": 1.6385978416895255, "language_loss": 0.83764589, "learning_rate": 3.351272138300922e-06, "loss": 0.8591305, "num_input_tokens_seen": 102588715, "step": 4756, "time_per_iteration": 2.7975916862487793 }, { "auxiliary_loss_clip": 0.01023363, "auxiliary_loss_mlp": 0.01007772, "balance_loss_clip": 1.01913142, "balance_loss_mlp": 1.00524473, "epoch": 0.2860063129415301, "flos": 71652850709760.0, "grad_norm": 0.8721113874523594, "language_loss": 0.6097033, "learning_rate": 3.350984987779142e-06, "loss": 0.63001466, "num_input_tokens_seen": 102656715, "step": 4757, "time_per_iteration": 3.406625986099243 }, { "auxiliary_loss_clip": 0.01147819, "auxiliary_loss_mlp": 0.01038916, "balance_loss_clip": 1.05585599, "balance_loss_mlp": 1.021595, "epoch": 0.2860664361941981, "flos": 20558679863040.0, "grad_norm": 2.030913944398288, "language_loss": 0.66206789, "learning_rate": 3.3506977860278756e-06, "loss": 0.68393528, "num_input_tokens_seen": 102676545, "step": 4758, "time_per_iteration": 2.589768648147583 }, { "auxiliary_loss_clip": 0.01133475, "auxiliary_loss_mlp": 0.01042694, "balance_loss_clip": 1.04988813, "balance_loss_mlp": 1.02581418, "epoch": 0.2861265594468661, "flos": 35996389534080.0, "grad_norm": 2.019963236438103, "language_loss": 0.63374877, "learning_rate": 3.3504105330580143e-06, "loss": 0.65551043, "num_input_tokens_seen": 102702875, "step": 4759, "time_per_iteration": 2.809325695037842 }, { "auxiliary_loss_clip": 0.01129183, "auxiliary_loss_mlp": 0.00777076, "balance_loss_clip": 1.04924989, "balance_loss_mlp": 1.00088644, "epoch": 0.28618668269953407, "flos": 20047419630720.0, "grad_norm": 1.9693348774443893, "language_loss": 0.74033993, "learning_rate": 3.3501232288804496e-06, "loss": 0.75940251, "num_input_tokens_seen": 102723160, "step": 4760, "time_per_iteration": 2.6797397136688232 }, { "auxiliary_loss_clip": 0.01124387, "auxiliary_loss_mlp": 0.01045022, "balance_loss_clip": 1.05517232, "balance_loss_mlp": 1.02849925, "epoch": 0.28624680595220203, "flos": 24971849579520.0, "grad_norm": 2.574168946313644, "language_loss": 0.72227889, "learning_rate": 3.3498358735060773e-06, "loss": 0.74397296, "num_input_tokens_seen": 102743855, "step": 4761, "time_per_iteration": 2.672394275665283 }, { "auxiliary_loss_clip": 0.01079005, "auxiliary_loss_mlp": 0.01049385, "balance_loss_clip": 1.04688287, "balance_loss_mlp": 1.03218305, "epoch": 0.28630692920487, "flos": 22492253911680.0, "grad_norm": 2.095293128310336, "language_loss": 0.74758703, "learning_rate": 3.349548466945793e-06, "loss": 0.76887095, "num_input_tokens_seen": 102761370, "step": 4762, "time_per_iteration": 2.8573946952819824 }, { "auxiliary_loss_clip": 0.01108257, "auxiliary_loss_mlp": 0.01044255, "balance_loss_clip": 1.05117726, "balance_loss_mlp": 1.02725577, "epoch": 0.28636705245753796, "flos": 21249888255360.0, "grad_norm": 1.4714690500952254, "language_loss": 0.76185489, "learning_rate": 3.349261009210496e-06, "loss": 0.78338003, "num_input_tokens_seen": 102780885, "step": 4763, "time_per_iteration": 2.7058494091033936 }, { "auxiliary_loss_clip": 0.01103052, "auxiliary_loss_mlp": 0.01041715, "balance_loss_clip": 1.0442332, "balance_loss_mlp": 1.0234046, "epoch": 0.28642717571020593, "flos": 24095772864000.0, "grad_norm": 2.250941696220621, "language_loss": 0.77264833, "learning_rate": 3.348973500311086e-06, "loss": 0.79409599, "num_input_tokens_seen": 102801000, "step": 4764, "time_per_iteration": 2.7363107204437256 }, { "auxiliary_loss_clip": 0.0111141, "auxiliary_loss_mlp": 0.01044325, "balance_loss_clip": 1.04883742, "balance_loss_mlp": 1.02520347, "epoch": 0.2864872989628739, "flos": 22601386408320.0, "grad_norm": 3.808468667851145, "language_loss": 0.71222258, "learning_rate": 3.348685940258466e-06, "loss": 0.73377991, "num_input_tokens_seen": 102820230, "step": 4765, "time_per_iteration": 2.7225682735443115 }, { "auxiliary_loss_clip": 0.01127531, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.0501802, "balance_loss_mlp": 1.02118707, "epoch": 0.28654742221554186, "flos": 32745073138560.0, "grad_norm": 1.6284115173108313, "language_loss": 0.76206756, "learning_rate": 3.3483983290635395e-06, "loss": 0.78371924, "num_input_tokens_seen": 102842670, "step": 4766, "time_per_iteration": 2.724776268005371 }, { "auxiliary_loss_clip": 0.01130255, "auxiliary_loss_mlp": 0.01038205, "balance_loss_clip": 1.0502758, "balance_loss_mlp": 1.02133691, "epoch": 0.2866075454682098, "flos": 26981626331520.0, "grad_norm": 1.7313176116986193, "language_loss": 0.77457404, "learning_rate": 3.348110666737214e-06, "loss": 0.79625863, "num_input_tokens_seen": 102864480, "step": 4767, "time_per_iteration": 2.7313742637634277 }, { "auxiliary_loss_clip": 0.0114162, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.05109096, "balance_loss_mlp": 1.02519727, "epoch": 0.2866676687208778, "flos": 23253847004160.0, "grad_norm": 1.7818476838857593, "language_loss": 0.65043855, "learning_rate": 3.3478229532903956e-06, "loss": 0.67228168, "num_input_tokens_seen": 102883740, "step": 4768, "time_per_iteration": 2.6173784732818604 }, { "auxiliary_loss_clip": 0.01123197, "auxiliary_loss_mlp": 0.01041331, "balance_loss_clip": 1.04803848, "balance_loss_mlp": 1.02385533, "epoch": 0.28672779197354575, "flos": 21579727870080.0, "grad_norm": 1.5842392137882455, "language_loss": 0.70497799, "learning_rate": 3.3475351887339967e-06, "loss": 0.7266233, "num_input_tokens_seen": 102902945, "step": 4769, "time_per_iteration": 2.627859115600586 }, { "auxiliary_loss_clip": 0.01078118, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.04276228, "balance_loss_mlp": 1.01722169, "epoch": 0.2867879152262137, "flos": 19865568049920.0, "grad_norm": 1.555057890983365, "language_loss": 0.74735439, "learning_rate": 3.3472473730789288e-06, "loss": 0.76847351, "num_input_tokens_seen": 102922405, "step": 4770, "time_per_iteration": 2.807286262512207 }, { "auxiliary_loss_clip": 0.01094623, "auxiliary_loss_mlp": 0.01041164, "balance_loss_clip": 1.04522562, "balance_loss_mlp": 1.02336657, "epoch": 0.2868480384788817, "flos": 28213325648640.0, "grad_norm": 2.2768786529491427, "language_loss": 0.6760053, "learning_rate": 3.3469595063361045e-06, "loss": 0.6973632, "num_input_tokens_seen": 102938980, "step": 4771, "time_per_iteration": 2.7709410190582275 }, { "auxiliary_loss_clip": 0.01041422, "auxiliary_loss_mlp": 0.01015109, "balance_loss_clip": 1.01907253, "balance_loss_mlp": 1.01243877, "epoch": 0.2869081617315497, "flos": 65424286690560.0, "grad_norm": 0.770068198596698, "language_loss": 0.56874299, "learning_rate": 3.3466715885164414e-06, "loss": 0.58930826, "num_input_tokens_seen": 103000405, "step": 4772, "time_per_iteration": 3.0978245735168457 }, { "auxiliary_loss_clip": 0.01067739, "auxiliary_loss_mlp": 0.0077878, "balance_loss_clip": 1.04115915, "balance_loss_mlp": 1.00089169, "epoch": 0.28696828498421767, "flos": 18660729127680.0, "grad_norm": 2.7874039039613345, "language_loss": 0.82870376, "learning_rate": 3.346383619630856e-06, "loss": 0.84716898, "num_input_tokens_seen": 103017970, "step": 4773, "time_per_iteration": 2.7716143131256104 }, { "auxiliary_loss_clip": 0.0114188, "auxiliary_loss_mlp": 0.01043405, "balance_loss_clip": 1.04776216, "balance_loss_mlp": 1.02553546, "epoch": 0.28702840823688563, "flos": 23659745667840.0, "grad_norm": 11.069053071667042, "language_loss": 0.77580261, "learning_rate": 3.34609559969027e-06, "loss": 0.79765546, "num_input_tokens_seen": 103036385, "step": 4774, "time_per_iteration": 2.604790687561035 }, { "auxiliary_loss_clip": 0.01119567, "auxiliary_loss_mlp": 0.01042061, "balance_loss_clip": 1.04915977, "balance_loss_mlp": 1.02414346, "epoch": 0.2870885314895536, "flos": 13804744544640.0, "grad_norm": 1.9103573283121942, "language_loss": 0.73611873, "learning_rate": 3.3458075287056034e-06, "loss": 0.75773501, "num_input_tokens_seen": 103052170, "step": 4775, "time_per_iteration": 2.6234211921691895 }, { "auxiliary_loss_clip": 0.01133151, "auxiliary_loss_mlp": 0.01045326, "balance_loss_clip": 1.04905081, "balance_loss_mlp": 1.02782607, "epoch": 0.28714865474222157, "flos": 17786771314560.0, "grad_norm": 1.6535491049734306, "language_loss": 0.88343942, "learning_rate": 3.34551940668778e-06, "loss": 0.9052242, "num_input_tokens_seen": 103070510, "step": 4776, "time_per_iteration": 2.6941640377044678 }, { "auxiliary_loss_clip": 0.01132773, "auxiliary_loss_mlp": 0.0104327, "balance_loss_clip": 1.05156159, "balance_loss_mlp": 1.02712941, "epoch": 0.28720877799488953, "flos": 15997486199040.0, "grad_norm": 1.7321020140737395, "language_loss": 0.74257779, "learning_rate": 3.345231233647726e-06, "loss": 0.76433825, "num_input_tokens_seen": 103089590, "step": 4777, "time_per_iteration": 2.645650863647461 }, { "auxiliary_loss_clip": 0.01126691, "auxiliary_loss_mlp": 0.01045293, "balance_loss_clip": 1.05245948, "balance_loss_mlp": 1.02812648, "epoch": 0.2872689012475575, "flos": 20923137210240.0, "grad_norm": 1.9446580110028222, "language_loss": 0.80069196, "learning_rate": 3.3449430095963696e-06, "loss": 0.82241178, "num_input_tokens_seen": 103109080, "step": 4778, "time_per_iteration": 2.7606308460235596 }, { "auxiliary_loss_clip": 0.01123482, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.05461526, "balance_loss_mlp": 1.02750611, "epoch": 0.28732902450022546, "flos": 21325121291520.0, "grad_norm": 1.7560492266469991, "language_loss": 0.7396307, "learning_rate": 3.3446547345446386e-06, "loss": 0.76131058, "num_input_tokens_seen": 103127755, "step": 4779, "time_per_iteration": 2.831167221069336 }, { "auxiliary_loss_clip": 0.01122102, "auxiliary_loss_mlp": 0.01043876, "balance_loss_clip": 1.04866719, "balance_loss_mlp": 1.0262928, "epoch": 0.2873891477528934, "flos": 20850382212480.0, "grad_norm": 1.5882306223862566, "language_loss": 0.76327771, "learning_rate": 3.3443664085034656e-06, "loss": 0.7849375, "num_input_tokens_seen": 103147035, "step": 4780, "time_per_iteration": 2.6548538208007812 }, { "auxiliary_loss_clip": 0.01102465, "auxiliary_loss_mlp": 0.01042038, "balance_loss_clip": 1.04413557, "balance_loss_mlp": 1.02517641, "epoch": 0.2874492710055614, "flos": 17420051410560.0, "grad_norm": 1.5896497572299877, "language_loss": 0.81445092, "learning_rate": 3.344078031483784e-06, "loss": 0.83589596, "num_input_tokens_seen": 103165410, "step": 4781, "time_per_iteration": 2.6422417163848877 }, { "auxiliary_loss_clip": 0.01109573, "auxiliary_loss_mlp": 0.01045358, "balance_loss_clip": 1.05339658, "balance_loss_mlp": 1.0277034, "epoch": 0.28750939425822936, "flos": 13406818700160.0, "grad_norm": 1.8389370421072637, "language_loss": 0.86738765, "learning_rate": 3.3437896034965283e-06, "loss": 0.888937, "num_input_tokens_seen": 103183710, "step": 4782, "time_per_iteration": 2.7507951259613037 }, { "auxiliary_loss_clip": 0.01113582, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.05343366, "balance_loss_mlp": 1.02604771, "epoch": 0.2875695175108973, "flos": 21870029589120.0, "grad_norm": 1.5283433651606986, "language_loss": 0.71153063, "learning_rate": 3.3435011245526357e-06, "loss": 0.73309994, "num_input_tokens_seen": 103203790, "step": 4783, "time_per_iteration": 2.7166218757629395 }, { "auxiliary_loss_clip": 0.0112343, "auxiliary_loss_mlp": 0.01047879, "balance_loss_clip": 1.05475473, "balance_loss_mlp": 1.030761, "epoch": 0.2876296407635653, "flos": 26245457089920.0, "grad_norm": 1.6861942701171202, "language_loss": 0.76872855, "learning_rate": 3.343212594663047e-06, "loss": 0.79044163, "num_input_tokens_seen": 103223925, "step": 4784, "time_per_iteration": 2.693665027618408 }, { "auxiliary_loss_clip": 0.01095423, "auxiliary_loss_mlp": 0.01053931, "balance_loss_clip": 1.04587293, "balance_loss_mlp": 1.03514349, "epoch": 0.28768976401623325, "flos": 25373654092800.0, "grad_norm": 4.596098798847224, "language_loss": 0.75646108, "learning_rate": 3.3429240138387015e-06, "loss": 0.77795458, "num_input_tokens_seen": 103244760, "step": 4785, "time_per_iteration": 4.380687236785889 }, { "auxiliary_loss_clip": 0.01144615, "auxiliary_loss_mlp": 0.01048905, "balance_loss_clip": 1.0532378, "balance_loss_mlp": 1.03213263, "epoch": 0.28774988726890127, "flos": 30664372982400.0, "grad_norm": 2.434913324661012, "language_loss": 0.83660555, "learning_rate": 3.3426353820905425e-06, "loss": 0.85854077, "num_input_tokens_seen": 103261995, "step": 4786, "time_per_iteration": 4.138700723648071 }, { "auxiliary_loss_clip": 0.01113505, "auxiliary_loss_mlp": 0.0077478, "balance_loss_clip": 1.05201936, "balance_loss_mlp": 1.00095487, "epoch": 0.28781001052156924, "flos": 20595452411520.0, "grad_norm": 1.8737605513707083, "language_loss": 0.80388975, "learning_rate": 3.342346699429516e-06, "loss": 0.82277262, "num_input_tokens_seen": 103279780, "step": 4787, "time_per_iteration": 2.7030651569366455 }, { "auxiliary_loss_clip": 0.01120528, "auxiliary_loss_mlp": 0.01039353, "balance_loss_clip": 1.0489651, "balance_loss_mlp": 1.02212751, "epoch": 0.2878701337742372, "flos": 26542330997760.0, "grad_norm": 1.8370986188087255, "language_loss": 0.83052301, "learning_rate": 3.3420579658665677e-06, "loss": 0.85212183, "num_input_tokens_seen": 103300580, "step": 4788, "time_per_iteration": 2.7650442123413086 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01044904, "balance_loss_clip": 1.0567044, "balance_loss_mlp": 1.0279882, "epoch": 0.28793025702690517, "flos": 28146855530880.0, "grad_norm": 7.859878454786593, "language_loss": 0.73045379, "learning_rate": 3.3417691814126468e-06, "loss": 0.75201148, "num_input_tokens_seen": 103320430, "step": 4789, "time_per_iteration": 4.340694189071655 }, { "auxiliary_loss_clip": 0.01123471, "auxiliary_loss_mlp": 0.01042567, "balance_loss_clip": 1.04852343, "balance_loss_mlp": 1.02599669, "epoch": 0.28799038027957313, "flos": 23805471144960.0, "grad_norm": 1.7615007973154742, "language_loss": 0.84425223, "learning_rate": 3.341480346078704e-06, "loss": 0.86591256, "num_input_tokens_seen": 103337695, "step": 4790, "time_per_iteration": 2.6953821182250977 }, { "auxiliary_loss_clip": 0.01136004, "auxiliary_loss_mlp": 0.01049022, "balance_loss_clip": 1.05240703, "balance_loss_mlp": 1.03145027, "epoch": 0.2880505035322411, "flos": 22344122223360.0, "grad_norm": 1.743209341690147, "language_loss": 0.78031182, "learning_rate": 3.3411914598756922e-06, "loss": 0.80216199, "num_input_tokens_seen": 103357010, "step": 4791, "time_per_iteration": 4.299259424209595 }, { "auxiliary_loss_clip": 0.01120123, "auxiliary_loss_mlp": 0.01036962, "balance_loss_clip": 1.05015528, "balance_loss_mlp": 1.01999843, "epoch": 0.28811062678490906, "flos": 18004246208640.0, "grad_norm": 2.2148694233914474, "language_loss": 0.70164073, "learning_rate": 3.3409025228145654e-06, "loss": 0.72321159, "num_input_tokens_seen": 103375600, "step": 4792, "time_per_iteration": 2.646732807159424 }, { "auxiliary_loss_clip": 0.01107079, "auxiliary_loss_mlp": 0.01037734, "balance_loss_clip": 1.05645919, "balance_loss_mlp": 1.02149773, "epoch": 0.28817075003757703, "flos": 22090880361600.0, "grad_norm": 1.9192442052106609, "language_loss": 0.79200894, "learning_rate": 3.3406135349062812e-06, "loss": 0.81345713, "num_input_tokens_seen": 103395225, "step": 4793, "time_per_iteration": 2.765010356903076 }, { "auxiliary_loss_clip": 0.01117839, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.05114603, "balance_loss_mlp": 1.02235532, "epoch": 0.288230873290245, "flos": 41683130847360.0, "grad_norm": 1.7689864288971164, "language_loss": 0.78136635, "learning_rate": 3.340324496161797e-06, "loss": 0.80292487, "num_input_tokens_seen": 103417245, "step": 4794, "time_per_iteration": 2.868473529815674 }, { "auxiliary_loss_clip": 0.01134193, "auxiliary_loss_mlp": 0.0104583, "balance_loss_clip": 1.05259347, "balance_loss_mlp": 1.02856886, "epoch": 0.28829099654291296, "flos": 18624423456000.0, "grad_norm": 2.1692523829597063, "language_loss": 0.8320052, "learning_rate": 3.340035406592074e-06, "loss": 0.85380542, "num_input_tokens_seen": 103435500, "step": 4795, "time_per_iteration": 2.6216471195220947 }, { "auxiliary_loss_clip": 0.01126764, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.05043364, "balance_loss_mlp": 1.0279845, "epoch": 0.2883511197955809, "flos": 24674832017280.0, "grad_norm": 2.290853867887048, "language_loss": 0.74744678, "learning_rate": 3.339746266208074e-06, "loss": 0.76915002, "num_input_tokens_seen": 103451040, "step": 4796, "time_per_iteration": 2.6819822788238525 }, { "auxiliary_loss_clip": 0.01136938, "auxiliary_loss_mlp": 0.01040822, "balance_loss_clip": 1.05140758, "balance_loss_mlp": 1.02221298, "epoch": 0.2884112430482489, "flos": 23112143850240.0, "grad_norm": 1.9890524806298786, "language_loss": 0.73144913, "learning_rate": 3.3394570750207614e-06, "loss": 0.7532267, "num_input_tokens_seen": 103471330, "step": 4797, "time_per_iteration": 2.666097640991211 }, { "auxiliary_loss_clip": 0.01104454, "auxiliary_loss_mlp": 0.00775335, "balance_loss_clip": 1.04594803, "balance_loss_mlp": 1.00097072, "epoch": 0.28847136630091685, "flos": 16873347432960.0, "grad_norm": 1.9324008515617646, "language_loss": 0.74650872, "learning_rate": 3.3391678330411017e-06, "loss": 0.76530659, "num_input_tokens_seen": 103488060, "step": 4798, "time_per_iteration": 2.7281830310821533 }, { "auxiliary_loss_clip": 0.0113412, "auxiliary_loss_mlp": 0.01043523, "balance_loss_clip": 1.04996431, "balance_loss_mlp": 1.02463984, "epoch": 0.2885314895535849, "flos": 25657527277440.0, "grad_norm": 3.037553219769834, "language_loss": 0.66004431, "learning_rate": 3.3388785402800642e-06, "loss": 0.68182075, "num_input_tokens_seen": 103503600, "step": 4799, "time_per_iteration": 2.6416096687316895 }, { "auxiliary_loss_clip": 0.01144575, "auxiliary_loss_mlp": 0.01049843, "balance_loss_clip": 1.05205584, "balance_loss_mlp": 1.03268862, "epoch": 0.28859161280625284, "flos": 21107251347840.0, "grad_norm": 1.7946911133370596, "language_loss": 0.8231616, "learning_rate": 3.3385891967486178e-06, "loss": 0.84510577, "num_input_tokens_seen": 103524195, "step": 4800, "time_per_iteration": 2.704357624053955 }, { "auxiliary_loss_clip": 0.01105166, "auxiliary_loss_mlp": 0.01040519, "balance_loss_clip": 1.04861474, "balance_loss_mlp": 1.02392507, "epoch": 0.2886517360589208, "flos": 26469540086400.0, "grad_norm": 1.5930665564066124, "language_loss": 0.9080106, "learning_rate": 3.3382998024577347e-06, "loss": 0.92946744, "num_input_tokens_seen": 103545235, "step": 4801, "time_per_iteration": 2.8163902759552 }, { "auxiliary_loss_clip": 0.01119221, "auxiliary_loss_mlp": 0.00775037, "balance_loss_clip": 1.05178905, "balance_loss_mlp": 1.0008862, "epoch": 0.28871185931158877, "flos": 25265275781760.0, "grad_norm": 2.098995863955026, "language_loss": 0.74342406, "learning_rate": 3.33801035741839e-06, "loss": 0.76236671, "num_input_tokens_seen": 103563305, "step": 4802, "time_per_iteration": 2.8244271278381348 }, { "auxiliary_loss_clip": 0.01029511, "auxiliary_loss_mlp": 0.01004263, "balance_loss_clip": 1.02472734, "balance_loss_mlp": 1.00193822, "epoch": 0.28877198256425674, "flos": 66665431284480.0, "grad_norm": 0.7780596068321518, "language_loss": 0.62987334, "learning_rate": 3.337720861641558e-06, "loss": 0.65021104, "num_input_tokens_seen": 103625025, "step": 4803, "time_per_iteration": 3.299269676208496 }, { "auxiliary_loss_clip": 0.01083739, "auxiliary_loss_mlp": 0.01051002, "balance_loss_clip": 1.03981495, "balance_loss_mlp": 1.03369915, "epoch": 0.2888321058169247, "flos": 20303031790080.0, "grad_norm": 1.8528386679599225, "language_loss": 0.71095157, "learning_rate": 3.3374313151382165e-06, "loss": 0.73229897, "num_input_tokens_seen": 103644235, "step": 4804, "time_per_iteration": 2.762883424758911 }, { "auxiliary_loss_clip": 0.01135071, "auxiliary_loss_mlp": 0.01047534, "balance_loss_clip": 1.05108273, "balance_loss_mlp": 1.0289135, "epoch": 0.28889222906959267, "flos": 25516721963520.0, "grad_norm": 1.926588918304246, "language_loss": 0.67916834, "learning_rate": 3.337141717919346e-06, "loss": 0.70099443, "num_input_tokens_seen": 103664700, "step": 4805, "time_per_iteration": 2.6848111152648926 }, { "auxiliary_loss_clip": 0.01135111, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.05359602, "balance_loss_mlp": 1.03029394, "epoch": 0.28895235232226063, "flos": 32671312560000.0, "grad_norm": 1.4381182508216341, "language_loss": 0.69720542, "learning_rate": 3.3368520699959272e-06, "loss": 0.71902293, "num_input_tokens_seen": 103686595, "step": 4806, "time_per_iteration": 2.762458562850952 }, { "auxiliary_loss_clip": 0.01120642, "auxiliary_loss_mlp": 0.01052311, "balance_loss_clip": 1.05073118, "balance_loss_mlp": 1.03559768, "epoch": 0.2890124755749286, "flos": 29714679342720.0, "grad_norm": 1.4600495853323927, "language_loss": 0.71255589, "learning_rate": 3.3365623713789443e-06, "loss": 0.73428547, "num_input_tokens_seen": 103707525, "step": 4807, "time_per_iteration": 2.740931987762451 }, { "auxiliary_loss_clip": 0.01106054, "auxiliary_loss_mlp": 0.01043407, "balance_loss_clip": 1.05087459, "balance_loss_mlp": 1.02625299, "epoch": 0.28907259882759656, "flos": 22674464628480.0, "grad_norm": 1.6111027163793539, "language_loss": 0.81489629, "learning_rate": 3.336272622079382e-06, "loss": 0.83639085, "num_input_tokens_seen": 103727905, "step": 4808, "time_per_iteration": 2.722787380218506 }, { "auxiliary_loss_clip": 0.01098162, "auxiliary_loss_mlp": 0.01048507, "balance_loss_clip": 1.04795146, "balance_loss_mlp": 1.03160298, "epoch": 0.2891327220802645, "flos": 22566050403840.0, "grad_norm": 1.7874609682529725, "language_loss": 0.78304112, "learning_rate": 3.3359828221082276e-06, "loss": 0.80450785, "num_input_tokens_seen": 103748335, "step": 4809, "time_per_iteration": 2.742063522338867 }, { "auxiliary_loss_clip": 0.01091743, "auxiliary_loss_mlp": 0.01047553, "balance_loss_clip": 1.04519784, "balance_loss_mlp": 1.02924204, "epoch": 0.2891928453329325, "flos": 21652806090240.0, "grad_norm": 1.7709564567634208, "language_loss": 0.78864932, "learning_rate": 3.3356929714764714e-06, "loss": 0.81004226, "num_input_tokens_seen": 103767020, "step": 4810, "time_per_iteration": 2.7578415870666504 }, { "auxiliary_loss_clip": 0.01090252, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.04552603, "balance_loss_mlp": 1.02280235, "epoch": 0.28925296858560046, "flos": 23222102359680.0, "grad_norm": 1.6298276151024105, "language_loss": 0.76974982, "learning_rate": 3.3354030701951032e-06, "loss": 0.79104245, "num_input_tokens_seen": 103786355, "step": 4811, "time_per_iteration": 2.7336831092834473 }, { "auxiliary_loss_clip": 0.01132677, "auxiliary_loss_mlp": 0.01047674, "balance_loss_clip": 1.05356216, "balance_loss_mlp": 1.03038859, "epoch": 0.2893130918382685, "flos": 28621666437120.0, "grad_norm": 1.4740946425962824, "language_loss": 0.77044773, "learning_rate": 3.335113118275117e-06, "loss": 0.79225123, "num_input_tokens_seen": 103809345, "step": 4812, "time_per_iteration": 2.745115280151367 }, { "auxiliary_loss_clip": 0.01024348, "auxiliary_loss_mlp": 0.01009076, "balance_loss_clip": 1.02794337, "balance_loss_mlp": 1.00728762, "epoch": 0.28937321509093644, "flos": 72301288982400.0, "grad_norm": 0.8337141037006477, "language_loss": 0.60292435, "learning_rate": 3.3348231157275085e-06, "loss": 0.62325859, "num_input_tokens_seen": 103871180, "step": 4813, "time_per_iteration": 3.3592262268066406 }, { "auxiliary_loss_clip": 0.01094544, "auxiliary_loss_mlp": 0.01044805, "balance_loss_clip": 1.0431211, "balance_loss_mlp": 1.02734065, "epoch": 0.2894333383436044, "flos": 16216397637120.0, "grad_norm": 3.1340543474440623, "language_loss": 0.82301223, "learning_rate": 3.3345330625632725e-06, "loss": 0.84440577, "num_input_tokens_seen": 103889040, "step": 4814, "time_per_iteration": 2.7069244384765625 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01052591, "balance_loss_clip": 1.05051374, "balance_loss_mlp": 1.03556752, "epoch": 0.2894934615962724, "flos": 24828278918400.0, "grad_norm": 1.6672038490985601, "language_loss": 0.73249441, "learning_rate": 3.3342429587934094e-06, "loss": 0.75403512, "num_input_tokens_seen": 103910380, "step": 4815, "time_per_iteration": 2.764214515686035 }, { "auxiliary_loss_clip": 0.01131126, "auxiliary_loss_mlp": 0.01045124, "balance_loss_clip": 1.05259883, "balance_loss_mlp": 1.02997231, "epoch": 0.28955358484894034, "flos": 20449978329600.0, "grad_norm": 1.9821106518618066, "language_loss": 0.70783043, "learning_rate": 3.3339528044289198e-06, "loss": 0.72959292, "num_input_tokens_seen": 103929955, "step": 4816, "time_per_iteration": 2.7809629440307617 }, { "auxiliary_loss_clip": 0.01119261, "auxiliary_loss_mlp": 0.01048806, "balance_loss_clip": 1.04862189, "balance_loss_mlp": 1.03097248, "epoch": 0.2896137081016083, "flos": 22565188477440.0, "grad_norm": 2.3636227133284122, "language_loss": 0.7445122, "learning_rate": 3.3336625994808055e-06, "loss": 0.76619279, "num_input_tokens_seen": 103948020, "step": 4817, "time_per_iteration": 2.829183578491211 }, { "auxiliary_loss_clip": 0.01108198, "auxiliary_loss_mlp": 0.01054129, "balance_loss_clip": 1.05107522, "balance_loss_mlp": 1.03633142, "epoch": 0.28967383135427627, "flos": 26687948734080.0, "grad_norm": 1.8479613371686012, "language_loss": 0.76190692, "learning_rate": 3.3333723439600723e-06, "loss": 0.78353024, "num_input_tokens_seen": 103968740, "step": 4818, "time_per_iteration": 2.827925443649292 }, { "auxiliary_loss_clip": 0.01074516, "auxiliary_loss_mlp": 0.01041914, "balance_loss_clip": 1.04805899, "balance_loss_mlp": 1.02477193, "epoch": 0.28973395460694423, "flos": 15558262692480.0, "grad_norm": 1.9558897556763024, "language_loss": 0.80060315, "learning_rate": 3.3330820378777263e-06, "loss": 0.82176751, "num_input_tokens_seen": 103986005, "step": 4819, "time_per_iteration": 2.8941574096679688 }, { "auxiliary_loss_clip": 0.01110223, "auxiliary_loss_mlp": 0.01048219, "balance_loss_clip": 1.0494163, "balance_loss_mlp": 1.02931273, "epoch": 0.2897940778596122, "flos": 18697465762560.0, "grad_norm": 1.8074124972104149, "language_loss": 0.78504574, "learning_rate": 3.332791681244776e-06, "loss": 0.80663019, "num_input_tokens_seen": 104005070, "step": 4820, "time_per_iteration": 2.7016515731811523 }, { "auxiliary_loss_clip": 0.01096478, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.04924846, "balance_loss_mlp": 1.02028775, "epoch": 0.28985420111228016, "flos": 18770292587520.0, "grad_norm": 2.105369007151224, "language_loss": 0.72925651, "learning_rate": 3.332501274072231e-06, "loss": 0.7505917, "num_input_tokens_seen": 104022945, "step": 4821, "time_per_iteration": 2.743091583251953 }, { "auxiliary_loss_clip": 0.01132782, "auxiliary_loss_mlp": 0.01040556, "balance_loss_clip": 1.05055594, "balance_loss_mlp": 1.02290142, "epoch": 0.28991432436494813, "flos": 23069840607360.0, "grad_norm": 2.331696646407205, "language_loss": 0.71962738, "learning_rate": 3.332210816371104e-06, "loss": 0.74136078, "num_input_tokens_seen": 104042080, "step": 4822, "time_per_iteration": 2.768996477127075 }, { "auxiliary_loss_clip": 0.01128837, "auxiliary_loss_mlp": 0.01048176, "balance_loss_clip": 1.05237818, "balance_loss_mlp": 1.03142738, "epoch": 0.2899744476176161, "flos": 17603195880960.0, "grad_norm": 1.8111020118629353, "language_loss": 0.662521, "learning_rate": 3.3319203081524102e-06, "loss": 0.68429112, "num_input_tokens_seen": 104060975, "step": 4823, "time_per_iteration": 2.733591318130493 }, { "auxiliary_loss_clip": 0.01107872, "auxiliary_loss_mlp": 0.01042255, "balance_loss_clip": 1.04404497, "balance_loss_mlp": 1.02588761, "epoch": 0.29003457087028406, "flos": 22309360836480.0, "grad_norm": 4.579803152663717, "language_loss": 0.81162238, "learning_rate": 3.331629749427164e-06, "loss": 0.83312368, "num_input_tokens_seen": 104081395, "step": 4824, "time_per_iteration": 4.278540849685669 }, { "auxiliary_loss_clip": 0.01143667, "auxiliary_loss_mlp": 0.01043888, "balance_loss_clip": 1.05104661, "balance_loss_mlp": 1.025828, "epoch": 0.2900946941229521, "flos": 21944975316480.0, "grad_norm": 2.265114761106369, "language_loss": 0.72592747, "learning_rate": 3.331339140206385e-06, "loss": 0.74780297, "num_input_tokens_seen": 104099995, "step": 4825, "time_per_iteration": 4.177908658981323 }, { "auxiliary_loss_clip": 0.01147795, "auxiliary_loss_mlp": 0.01036998, "balance_loss_clip": 1.05434549, "balance_loss_mlp": 1.01930714, "epoch": 0.29015481737562004, "flos": 17932173569280.0, "grad_norm": 2.216571865047856, "language_loss": 0.73680669, "learning_rate": 3.331048480501092e-06, "loss": 0.75865459, "num_input_tokens_seen": 104118930, "step": 4826, "time_per_iteration": 2.6371700763702393 }, { "auxiliary_loss_clip": 0.0113072, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05073726, "balance_loss_mlp": 1.02483773, "epoch": 0.290214940628288, "flos": 22783525297920.0, "grad_norm": 2.324527624383577, "language_loss": 0.68556225, "learning_rate": 3.3307577703223073e-06, "loss": 0.70727527, "num_input_tokens_seen": 104136940, "step": 4827, "time_per_iteration": 2.6447484493255615 }, { "auxiliary_loss_clip": 0.01125924, "auxiliary_loss_mlp": 0.0104453, "balance_loss_clip": 1.04981911, "balance_loss_mlp": 1.02650571, "epoch": 0.290275063880956, "flos": 20006481104640.0, "grad_norm": 1.8485927197530279, "language_loss": 0.80266023, "learning_rate": 3.3304670096810545e-06, "loss": 0.82436466, "num_input_tokens_seen": 104154280, "step": 4828, "time_per_iteration": 4.131803274154663 }, { "auxiliary_loss_clip": 0.01144317, "auxiliary_loss_mlp": 0.01049939, "balance_loss_clip": 1.05393863, "balance_loss_mlp": 1.03288054, "epoch": 0.29033518713362394, "flos": 22053605022720.0, "grad_norm": 1.8003854621941846, "language_loss": 0.80658895, "learning_rate": 3.33017619858836e-06, "loss": 0.8285315, "num_input_tokens_seen": 104172605, "step": 4829, "time_per_iteration": 2.760899066925049 }, { "auxiliary_loss_clip": 0.011197, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.05093288, "balance_loss_mlp": 1.02680826, "epoch": 0.2903953103862919, "flos": 25630056351360.0, "grad_norm": 1.5734536519128175, "language_loss": 0.82911146, "learning_rate": 3.329885337055249e-06, "loss": 0.85074902, "num_input_tokens_seen": 104194120, "step": 4830, "time_per_iteration": 4.403480529785156 }, { "auxiliary_loss_clip": 0.01137563, "auxiliary_loss_mlp": 0.01048934, "balance_loss_clip": 1.05430257, "balance_loss_mlp": 1.03155351, "epoch": 0.29045543363895987, "flos": 16945851035520.0, "grad_norm": 2.2586543311689486, "language_loss": 0.79236752, "learning_rate": 3.3295944250927546e-06, "loss": 0.81423253, "num_input_tokens_seen": 104210875, "step": 4831, "time_per_iteration": 2.6066412925720215 }, { "auxiliary_loss_clip": 0.01143728, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.05470276, "balance_loss_mlp": 1.03000546, "epoch": 0.29051555689162784, "flos": 26395492199040.0, "grad_norm": 1.9694662738232038, "language_loss": 0.7459774, "learning_rate": 3.3293034627119055e-06, "loss": 0.76787293, "num_input_tokens_seen": 104229875, "step": 4832, "time_per_iteration": 2.8411331176757812 }, { "auxiliary_loss_clip": 0.01122405, "auxiliary_loss_mlp": 0.01037758, "balance_loss_clip": 1.05429769, "balance_loss_mlp": 1.02335787, "epoch": 0.2905756801442958, "flos": 21103875469440.0, "grad_norm": 1.979215737756815, "language_loss": 0.76150024, "learning_rate": 3.329012449923736e-06, "loss": 0.78310186, "num_input_tokens_seen": 104250405, "step": 4833, "time_per_iteration": 2.7510006427764893 }, { "auxiliary_loss_clip": 0.01107016, "auxiliary_loss_mlp": 0.01040024, "balance_loss_clip": 1.04580688, "balance_loss_mlp": 1.02383542, "epoch": 0.29063580339696377, "flos": 15706071158400.0, "grad_norm": 1.7715964188803632, "language_loss": 0.64404124, "learning_rate": 3.3287213867392813e-06, "loss": 0.66551173, "num_input_tokens_seen": 104269185, "step": 4834, "time_per_iteration": 2.6475064754486084 }, { "auxiliary_loss_clip": 0.01117159, "auxiliary_loss_mlp": 0.01032155, "balance_loss_clip": 1.05111325, "balance_loss_mlp": 1.01724815, "epoch": 0.29069592664963173, "flos": 24644990793600.0, "grad_norm": 1.4640588842294755, "language_loss": 0.71717769, "learning_rate": 3.3284302731695783e-06, "loss": 0.73867083, "num_input_tokens_seen": 104289400, "step": 4835, "time_per_iteration": 2.6991324424743652 }, { "auxiliary_loss_clip": 0.01117393, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.04881835, "balance_loss_mlp": 1.02187634, "epoch": 0.2907560499022997, "flos": 24973753000320.0, "grad_norm": 1.657223137158586, "language_loss": 0.79492378, "learning_rate": 3.3281391092256668e-06, "loss": 0.81647086, "num_input_tokens_seen": 104310485, "step": 4836, "time_per_iteration": 2.7060084342956543 }, { "auxiliary_loss_clip": 0.01107347, "auxiliary_loss_mlp": 0.01045193, "balance_loss_clip": 1.05334711, "balance_loss_mlp": 1.02744293, "epoch": 0.29081617315496766, "flos": 18657496903680.0, "grad_norm": 1.9442300400082562, "language_loss": 0.81372344, "learning_rate": 3.3278478949185865e-06, "loss": 0.83524883, "num_input_tokens_seen": 104327330, "step": 4837, "time_per_iteration": 2.640610933303833 }, { "auxiliary_loss_clip": 0.01116355, "auxiliary_loss_mlp": 0.01039398, "balance_loss_clip": 1.04938102, "balance_loss_mlp": 1.0233283, "epoch": 0.2908762964076356, "flos": 35331035955840.0, "grad_norm": 6.209911556378307, "language_loss": 0.67358792, "learning_rate": 3.327556630259381e-06, "loss": 0.69514549, "num_input_tokens_seen": 104350350, "step": 4838, "time_per_iteration": 2.758422374725342 }, { "auxiliary_loss_clip": 0.01147958, "auxiliary_loss_mlp": 0.00775113, "balance_loss_clip": 1.05402315, "balance_loss_mlp": 1.00096607, "epoch": 0.29093641966030365, "flos": 23076305055360.0, "grad_norm": 1.5628414298261506, "language_loss": 0.71139944, "learning_rate": 3.327265315259095e-06, "loss": 0.73063016, "num_input_tokens_seen": 104369995, "step": 4839, "time_per_iteration": 2.683349132537842 }, { "auxiliary_loss_clip": 0.0114095, "auxiliary_loss_mlp": 0.01036937, "balance_loss_clip": 1.04966319, "balance_loss_mlp": 1.02147555, "epoch": 0.2909965429129716, "flos": 35955415094400.0, "grad_norm": 1.9403130873020338, "language_loss": 0.7539593, "learning_rate": 3.326973949928776e-06, "loss": 0.77573812, "num_input_tokens_seen": 104392285, "step": 4840, "time_per_iteration": 2.696808099746704 }, { "auxiliary_loss_clip": 0.01093571, "auxiliary_loss_mlp": 0.01045095, "balance_loss_clip": 1.04470551, "balance_loss_mlp": 1.02825069, "epoch": 0.2910566661656396, "flos": 30880231764480.0, "grad_norm": 1.7841334294021773, "language_loss": 0.60546595, "learning_rate": 3.326682534279471e-06, "loss": 0.62685257, "num_input_tokens_seen": 104412640, "step": 4841, "time_per_iteration": 2.74575138092041 }, { "auxiliary_loss_clip": 0.01120271, "auxiliary_loss_mlp": 0.01039624, "balance_loss_clip": 1.04983509, "balance_loss_mlp": 1.02288651, "epoch": 0.29111678941830754, "flos": 30010188533760.0, "grad_norm": 1.408353605568525, "language_loss": 0.71321762, "learning_rate": 3.326391068322232e-06, "loss": 0.73481655, "num_input_tokens_seen": 104435245, "step": 4842, "time_per_iteration": 2.7568962574005127 }, { "auxiliary_loss_clip": 0.01130885, "auxiliary_loss_mlp": 0.01037088, "balance_loss_clip": 1.05042899, "balance_loss_mlp": 1.02191257, "epoch": 0.2911769126709755, "flos": 22857393617280.0, "grad_norm": 2.1183002067983585, "language_loss": 0.73610562, "learning_rate": 3.3260995520681098e-06, "loss": 0.75778532, "num_input_tokens_seen": 104455395, "step": 4843, "time_per_iteration": 2.6703171730041504 }, { "auxiliary_loss_clip": 0.0108851, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.04775739, "balance_loss_mlp": 1.02058005, "epoch": 0.2912370359236435, "flos": 21650507619840.0, "grad_norm": 4.868884277111801, "language_loss": 0.58445942, "learning_rate": 3.3258079855281602e-06, "loss": 0.60570699, "num_input_tokens_seen": 104473350, "step": 4844, "time_per_iteration": 2.7461965084075928 }, { "auxiliary_loss_clip": 0.01138917, "auxiliary_loss_mlp": 0.01039428, "balance_loss_clip": 1.05586743, "balance_loss_mlp": 1.0222863, "epoch": 0.29129715917631144, "flos": 22893340152960.0, "grad_norm": 1.9200815982611392, "language_loss": 0.86459565, "learning_rate": 3.3255163687134396e-06, "loss": 0.88637912, "num_input_tokens_seen": 104492265, "step": 4845, "time_per_iteration": 2.711101770401001 }, { "auxiliary_loss_clip": 0.01115849, "auxiliary_loss_mlp": 0.01052584, "balance_loss_clip": 1.05018926, "balance_loss_mlp": 1.03505993, "epoch": 0.2913572824289794, "flos": 22674464628480.0, "grad_norm": 1.7226223126663984, "language_loss": 0.67067879, "learning_rate": 3.3252247016350046e-06, "loss": 0.69236308, "num_input_tokens_seen": 104510755, "step": 4846, "time_per_iteration": 2.698076009750366 }, { "auxiliary_loss_clip": 0.01120746, "auxiliary_loss_mlp": 0.01040428, "balance_loss_clip": 1.05198884, "balance_loss_mlp": 1.02457917, "epoch": 0.29141740568164737, "flos": 23107403255040.0, "grad_norm": 1.9884880347168128, "language_loss": 0.70629871, "learning_rate": 3.3249329843039166e-06, "loss": 0.7279104, "num_input_tokens_seen": 104530830, "step": 4847, "time_per_iteration": 2.6693859100341797 }, { "auxiliary_loss_clip": 0.01129385, "auxiliary_loss_mlp": 0.01036362, "balance_loss_clip": 1.0490911, "balance_loss_mlp": 1.02048314, "epoch": 0.29147752893431533, "flos": 23587026583680.0, "grad_norm": 1.4444788582363046, "language_loss": 0.73975939, "learning_rate": 3.324641216731237e-06, "loss": 0.76141691, "num_input_tokens_seen": 104550115, "step": 4848, "time_per_iteration": 2.779012680053711 }, { "auxiliary_loss_clip": 0.0112526, "auxiliary_loss_mlp": 0.01051811, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.03391802, "epoch": 0.2915376521869833, "flos": 20591968792320.0, "grad_norm": 3.067540232947916, "language_loss": 0.76738584, "learning_rate": 3.3243493989280295e-06, "loss": 0.7891565, "num_input_tokens_seen": 104566255, "step": 4849, "time_per_iteration": 2.6103999614715576 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01041862, "balance_loss_clip": 1.04718697, "balance_loss_mlp": 1.02541125, "epoch": 0.29159777543965126, "flos": 20811490761600.0, "grad_norm": 1.7266499063872853, "language_loss": 0.78276592, "learning_rate": 3.3240575309053596e-06, "loss": 0.80442822, "num_input_tokens_seen": 104585235, "step": 4850, "time_per_iteration": 2.6395609378814697 }, { "auxiliary_loss_clip": 0.01111964, "auxiliary_loss_mlp": 0.01038044, "balance_loss_clip": 1.04907775, "balance_loss_mlp": 1.0209378, "epoch": 0.29165789869231923, "flos": 24244155947520.0, "grad_norm": 1.8024770323318549, "language_loss": 0.7657702, "learning_rate": 3.323765612674296e-06, "loss": 0.78727031, "num_input_tokens_seen": 104605315, "step": 4851, "time_per_iteration": 2.7265985012054443 }, { "auxiliary_loss_clip": 0.01132156, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.052459, "balance_loss_mlp": 1.03083527, "epoch": 0.29171802194498725, "flos": 28949925853440.0, "grad_norm": 1.3639310788782566, "language_loss": 0.77680421, "learning_rate": 3.3234736442459078e-06, "loss": 0.7985822, "num_input_tokens_seen": 104626055, "step": 4852, "time_per_iteration": 2.7161712646484375 }, { "auxiliary_loss_clip": 0.01120344, "auxiliary_loss_mlp": 0.01051407, "balance_loss_clip": 1.05108476, "balance_loss_mlp": 1.03523064, "epoch": 0.2917781451976552, "flos": 22598226011520.0, "grad_norm": 1.6397145219173752, "language_loss": 0.7816534, "learning_rate": 3.3231816256312665e-06, "loss": 0.80337089, "num_input_tokens_seen": 104646005, "step": 4853, "time_per_iteration": 2.748053789138794 }, { "auxiliary_loss_clip": 0.01108012, "auxiliary_loss_mlp": 0.01041349, "balance_loss_clip": 1.04923177, "balance_loss_mlp": 1.02535105, "epoch": 0.2918382684503232, "flos": 21574448570880.0, "grad_norm": 2.273586870261815, "language_loss": 0.8791436, "learning_rate": 3.322889556841445e-06, "loss": 0.90063715, "num_input_tokens_seen": 104661620, "step": 4854, "time_per_iteration": 2.7663791179656982 }, { "auxiliary_loss_clip": 0.01128591, "auxiliary_loss_mlp": 0.01054226, "balance_loss_clip": 1.05255818, "balance_loss_mlp": 1.03502131, "epoch": 0.29189839170299114, "flos": 24353503925760.0, "grad_norm": 1.7143523369489482, "language_loss": 0.86374146, "learning_rate": 3.322597437887519e-06, "loss": 0.88556957, "num_input_tokens_seen": 104681445, "step": 4855, "time_per_iteration": 2.613903284072876 }, { "auxiliary_loss_clip": 0.01039808, "auxiliary_loss_mlp": 0.01005184, "balance_loss_clip": 1.02170599, "balance_loss_mlp": 1.00303864, "epoch": 0.2919585149556591, "flos": 71316726215040.0, "grad_norm": 0.7954079009769616, "language_loss": 0.60148996, "learning_rate": 3.322305268780566e-06, "loss": 0.6219399, "num_input_tokens_seen": 104747945, "step": 4856, "time_per_iteration": 3.273501396179199 }, { "auxiliary_loss_clip": 0.01115701, "auxiliary_loss_mlp": 0.00774991, "balance_loss_clip": 1.04708552, "balance_loss_mlp": 1.00107539, "epoch": 0.2920186382083271, "flos": 15633208419840.0, "grad_norm": 1.7540806356878256, "language_loss": 0.6825304, "learning_rate": 3.322013049531664e-06, "loss": 0.70143735, "num_input_tokens_seen": 104766225, "step": 4857, "time_per_iteration": 2.6799964904785156 }, { "auxiliary_loss_clip": 0.01129839, "auxiliary_loss_mlp": 0.00774071, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.00106227, "epoch": 0.29207876146099504, "flos": 28366018364160.0, "grad_norm": 1.9069678720023968, "language_loss": 0.83446503, "learning_rate": 3.321720780151895e-06, "loss": 0.85350412, "num_input_tokens_seen": 104785345, "step": 4858, "time_per_iteration": 2.7004997730255127 }, { "auxiliary_loss_clip": 0.01143419, "auxiliary_loss_mlp": 0.01047414, "balance_loss_clip": 1.05265319, "balance_loss_mlp": 1.03119004, "epoch": 0.292138884713663, "flos": 21870963342720.0, "grad_norm": 1.7162042036272904, "language_loss": 0.77357888, "learning_rate": 3.321428460652342e-06, "loss": 0.79548717, "num_input_tokens_seen": 104804560, "step": 4859, "time_per_iteration": 2.5901620388031006 }, { "auxiliary_loss_clip": 0.01105726, "auxiliary_loss_mlp": 0.01044957, "balance_loss_clip": 1.05237806, "balance_loss_mlp": 1.02816057, "epoch": 0.29219900796633097, "flos": 20992552243200.0, "grad_norm": 2.2554676354860246, "language_loss": 0.68046212, "learning_rate": 3.3211360910440885e-06, "loss": 0.70196903, "num_input_tokens_seen": 104821105, "step": 4860, "time_per_iteration": 2.7831058502197266 }, { "auxiliary_loss_clip": 0.01117304, "auxiliary_loss_mlp": 0.01041096, "balance_loss_clip": 1.05229402, "balance_loss_mlp": 1.02662396, "epoch": 0.29225913121899894, "flos": 35004608133120.0, "grad_norm": 2.539974445673703, "language_loss": 0.75258791, "learning_rate": 3.320843671338222e-06, "loss": 0.77417195, "num_input_tokens_seen": 104841440, "step": 4861, "time_per_iteration": 2.7506070137023926 }, { "auxiliary_loss_clip": 0.01128031, "auxiliary_loss_mlp": 0.0105121, "balance_loss_clip": 1.04845262, "balance_loss_mlp": 1.03620112, "epoch": 0.2923192544716669, "flos": 13515663888000.0, "grad_norm": 3.0942357088370245, "language_loss": 0.91498685, "learning_rate": 3.320551201545832e-06, "loss": 0.93677926, "num_input_tokens_seen": 104858210, "step": 4862, "time_per_iteration": 2.589700937271118 }, { "auxiliary_loss_clip": 0.01131947, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.05090141, "balance_loss_mlp": 1.02786124, "epoch": 0.29237937772433487, "flos": 19463512141440.0, "grad_norm": 2.2124063953391464, "language_loss": 0.73112279, "learning_rate": 3.320258681678008e-06, "loss": 0.75287139, "num_input_tokens_seen": 104875620, "step": 4863, "time_per_iteration": 4.142335653305054 }, { "auxiliary_loss_clip": 0.01061699, "auxiliary_loss_mlp": 0.01044676, "balance_loss_clip": 1.04478168, "balance_loss_mlp": 1.02934611, "epoch": 0.29243950097700283, "flos": 20850597694080.0, "grad_norm": 1.893468710780351, "language_loss": 0.77841508, "learning_rate": 3.319966111745842e-06, "loss": 0.79947883, "num_input_tokens_seen": 104894600, "step": 4864, "time_per_iteration": 4.309613943099976 }, { "auxiliary_loss_clip": 0.01102707, "auxiliary_loss_mlp": 0.01050983, "balance_loss_clip": 1.04593945, "balance_loss_mlp": 1.03424644, "epoch": 0.29249962422967085, "flos": 23584225322880.0, "grad_norm": 1.5703024458168264, "language_loss": 0.81861019, "learning_rate": 3.319673491760429e-06, "loss": 0.84014714, "num_input_tokens_seen": 104914530, "step": 4865, "time_per_iteration": 2.762397527694702 }, { "auxiliary_loss_clip": 0.0109576, "auxiliary_loss_mlp": 0.01046651, "balance_loss_clip": 1.05265307, "balance_loss_mlp": 1.02924657, "epoch": 0.2925597474823388, "flos": 22273342473600.0, "grad_norm": 2.2072447614425554, "language_loss": 0.85522473, "learning_rate": 3.3193808217328645e-06, "loss": 0.87664878, "num_input_tokens_seen": 104933460, "step": 4866, "time_per_iteration": 2.8033764362335205 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04811919, "balance_loss_mlp": 1.02410054, "epoch": 0.2926198707350068, "flos": 34456108475520.0, "grad_norm": 1.7213351696608077, "language_loss": 0.75498515, "learning_rate": 3.3190881016742476e-06, "loss": 0.7764926, "num_input_tokens_seen": 104954495, "step": 4867, "time_per_iteration": 4.2950732707977295 }, { "auxiliary_loss_clip": 0.01083116, "auxiliary_loss_mlp": 0.01052463, "balance_loss_clip": 1.04825687, "balance_loss_mlp": 1.03576183, "epoch": 0.29267999398767475, "flos": 20704153944960.0, "grad_norm": 1.9203033465249189, "language_loss": 0.73236179, "learning_rate": 3.3187953315956776e-06, "loss": 0.75371754, "num_input_tokens_seen": 104971915, "step": 4868, "time_per_iteration": 2.775538921356201 }, { "auxiliary_loss_clip": 0.01091396, "auxiliary_loss_mlp": 0.01045538, "balance_loss_clip": 1.04888034, "balance_loss_mlp": 1.02836001, "epoch": 0.2927401172403427, "flos": 18368667642240.0, "grad_norm": 1.663889887662616, "language_loss": 0.74540651, "learning_rate": 3.3185025115082566e-06, "loss": 0.76677585, "num_input_tokens_seen": 104991335, "step": 4869, "time_per_iteration": 2.734683036804199 }, { "auxiliary_loss_clip": 0.01116568, "auxiliary_loss_mlp": 0.01040323, "balance_loss_clip": 1.050179, "balance_loss_mlp": 1.02405143, "epoch": 0.2928002404930107, "flos": 26104041244800.0, "grad_norm": 1.5721867242720646, "language_loss": 0.76492888, "learning_rate": 3.318209641423088e-06, "loss": 0.78649783, "num_input_tokens_seen": 105012015, "step": 4870, "time_per_iteration": 4.413575649261475 }, { "auxiliary_loss_clip": 0.01133789, "auxiliary_loss_mlp": 0.0105055, "balance_loss_clip": 1.05237079, "balance_loss_mlp": 1.0328114, "epoch": 0.29286036374567864, "flos": 21324726241920.0, "grad_norm": 2.0174334678237655, "language_loss": 0.6773119, "learning_rate": 3.3179167213512777e-06, "loss": 0.69915527, "num_input_tokens_seen": 105031460, "step": 4871, "time_per_iteration": 2.68796706199646 }, { "auxiliary_loss_clip": 0.01112736, "auxiliary_loss_mlp": 0.01051475, "balance_loss_clip": 1.04638386, "balance_loss_mlp": 1.03515494, "epoch": 0.2929204869983466, "flos": 29569492569600.0, "grad_norm": 4.945083241782643, "language_loss": 0.77463269, "learning_rate": 3.317623751303933e-06, "loss": 0.79627478, "num_input_tokens_seen": 105052965, "step": 4872, "time_per_iteration": 2.7679827213287354 }, { "auxiliary_loss_clip": 0.01078644, "auxiliary_loss_mlp": 0.01045822, "balance_loss_clip": 1.0468123, "balance_loss_mlp": 1.0273211, "epoch": 0.2929806102510146, "flos": 19058259922560.0, "grad_norm": 1.9468785945114855, "language_loss": 0.72814691, "learning_rate": 3.317330731292164e-06, "loss": 0.74939156, "num_input_tokens_seen": 105071840, "step": 4873, "time_per_iteration": 2.8704919815063477 }, { "auxiliary_loss_clip": 0.01135073, "auxiliary_loss_mlp": 0.01044722, "balance_loss_clip": 1.0525651, "balance_loss_mlp": 1.02705503, "epoch": 0.29304073350368254, "flos": 21944221130880.0, "grad_norm": 1.9420707280566882, "language_loss": 0.78093398, "learning_rate": 3.3170376613270812e-06, "loss": 0.80273187, "num_input_tokens_seen": 105089445, "step": 4874, "time_per_iteration": 2.6573073863983154 }, { "auxiliary_loss_clip": 0.01093774, "auxiliary_loss_mlp": 0.01045077, "balance_loss_clip": 1.05151463, "balance_loss_mlp": 1.02790475, "epoch": 0.2931008567563505, "flos": 15450818135040.0, "grad_norm": 1.8901262824755785, "language_loss": 0.77336359, "learning_rate": 3.3167445414197985e-06, "loss": 0.794752, "num_input_tokens_seen": 105106210, "step": 4875, "time_per_iteration": 2.6960959434509277 }, { "auxiliary_loss_clip": 0.01141436, "auxiliary_loss_mlp": 0.01038673, "balance_loss_clip": 1.05718327, "balance_loss_mlp": 1.02218604, "epoch": 0.29316098000901847, "flos": 16983162288000.0, "grad_norm": 1.556341262673854, "language_loss": 0.69037539, "learning_rate": 3.316451371581431e-06, "loss": 0.71217644, "num_input_tokens_seen": 105124200, "step": 4876, "time_per_iteration": 2.6719844341278076 }, { "auxiliary_loss_clip": 0.01121768, "auxiliary_loss_mlp": 0.01047732, "balance_loss_clip": 1.04729414, "balance_loss_mlp": 1.03105509, "epoch": 0.29322110326168643, "flos": 16357705741440.0, "grad_norm": 2.0371531421747466, "language_loss": 0.82111382, "learning_rate": 3.316158151823096e-06, "loss": 0.84280884, "num_input_tokens_seen": 105140400, "step": 4877, "time_per_iteration": 2.632293462753296 }, { "auxiliary_loss_clip": 0.01139233, "auxiliary_loss_mlp": 0.01040634, "balance_loss_clip": 1.05428672, "balance_loss_mlp": 1.02392054, "epoch": 0.29328122651435445, "flos": 13990869843840.0, "grad_norm": 3.614839551588232, "language_loss": 0.67366385, "learning_rate": 3.315864882155911e-06, "loss": 0.69546252, "num_input_tokens_seen": 105157535, "step": 4878, "time_per_iteration": 2.5839362144470215 }, { "auxiliary_loss_clip": 0.01100237, "auxiliary_loss_mlp": 0.01045253, "balance_loss_clip": 1.04628241, "balance_loss_mlp": 1.02817595, "epoch": 0.2933413497670224, "flos": 25264593423360.0, "grad_norm": 2.0985622071445063, "language_loss": 0.73632258, "learning_rate": 3.3155715625909982e-06, "loss": 0.75777751, "num_input_tokens_seen": 105175185, "step": 4879, "time_per_iteration": 2.738429307937622 }, { "auxiliary_loss_clip": 0.01104776, "auxiliary_loss_mlp": 0.00776504, "balance_loss_clip": 1.05266857, "balance_loss_mlp": 1.00116253, "epoch": 0.2934014730196904, "flos": 32123746656000.0, "grad_norm": 1.8172867500477656, "language_loss": 0.66441375, "learning_rate": 3.3152781931394803e-06, "loss": 0.68322659, "num_input_tokens_seen": 105194540, "step": 4880, "time_per_iteration": 2.7889339923858643 }, { "auxiliary_loss_clip": 0.01130875, "auxiliary_loss_mlp": 0.01049004, "balance_loss_clip": 1.05021453, "balance_loss_mlp": 1.03249359, "epoch": 0.29346159627235835, "flos": 24352498344960.0, "grad_norm": 1.9971358437235982, "language_loss": 0.70130688, "learning_rate": 3.314984773812481e-06, "loss": 0.72310567, "num_input_tokens_seen": 105213215, "step": 4881, "time_per_iteration": 2.705906629562378 }, { "auxiliary_loss_clip": 0.01112418, "auxiliary_loss_mlp": 0.00775734, "balance_loss_clip": 1.04823685, "balance_loss_mlp": 1.00119698, "epoch": 0.2935217195250263, "flos": 22746752749440.0, "grad_norm": 1.8949601379230998, "language_loss": 0.83497417, "learning_rate": 3.314691304621127e-06, "loss": 0.85385573, "num_input_tokens_seen": 105231585, "step": 4882, "time_per_iteration": 2.715853691101074 }, { "auxiliary_loss_clip": 0.01148283, "auxiliary_loss_mlp": 0.01045596, "balance_loss_clip": 1.05350292, "balance_loss_mlp": 1.02825117, "epoch": 0.2935818427776943, "flos": 21725561088000.0, "grad_norm": 2.6750396503443827, "language_loss": 0.71433568, "learning_rate": 3.314397785576548e-06, "loss": 0.73627448, "num_input_tokens_seen": 105250120, "step": 4883, "time_per_iteration": 2.629642963409424 }, { "auxiliary_loss_clip": 0.01123143, "auxiliary_loss_mlp": 0.01040743, "balance_loss_clip": 1.05262315, "balance_loss_mlp": 1.0230521, "epoch": 0.29364196603036224, "flos": 23804968354560.0, "grad_norm": 2.1262053984109226, "language_loss": 0.92650437, "learning_rate": 3.3141042166898726e-06, "loss": 0.94814324, "num_input_tokens_seen": 105266065, "step": 4884, "time_per_iteration": 2.727379322052002 }, { "auxiliary_loss_clip": 0.01138638, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.05512667, "balance_loss_mlp": 1.0232085, "epoch": 0.2937020892830302, "flos": 23470064922240.0, "grad_norm": 2.19754538449792, "language_loss": 0.73535883, "learning_rate": 3.313810597972234e-06, "loss": 0.75714231, "num_input_tokens_seen": 105282155, "step": 4885, "time_per_iteration": 2.706212043762207 }, { "auxiliary_loss_clip": 0.01124089, "auxiliary_loss_mlp": 0.01045234, "balance_loss_clip": 1.04882109, "balance_loss_mlp": 1.02791286, "epoch": 0.2937622125356982, "flos": 24272740195200.0, "grad_norm": 2.8259058407064566, "language_loss": 0.84815478, "learning_rate": 3.3135169294347655e-06, "loss": 0.86984795, "num_input_tokens_seen": 105299225, "step": 4886, "time_per_iteration": 2.651383876800537 }, { "auxiliary_loss_clip": 0.01112051, "auxiliary_loss_mlp": 0.01040147, "balance_loss_clip": 1.04674077, "balance_loss_mlp": 1.023839, "epoch": 0.29382233578836614, "flos": 20662461233280.0, "grad_norm": 2.312079302728887, "language_loss": 0.77030611, "learning_rate": 3.313223211088603e-06, "loss": 0.7918281, "num_input_tokens_seen": 105315710, "step": 4887, "time_per_iteration": 2.8299317359924316 }, { "auxiliary_loss_clip": 0.01121167, "auxiliary_loss_mlp": 0.01044419, "balance_loss_clip": 1.05137563, "balance_loss_mlp": 1.02809978, "epoch": 0.2938824590410341, "flos": 16545052103040.0, "grad_norm": 4.814706857660641, "language_loss": 0.79822707, "learning_rate": 3.3129294429448855e-06, "loss": 0.81988299, "num_input_tokens_seen": 105333505, "step": 4888, "time_per_iteration": 2.6942543983459473 }, { "auxiliary_loss_clip": 0.01114672, "auxiliary_loss_mlp": 0.01035208, "balance_loss_clip": 1.05101824, "balance_loss_mlp": 1.01886487, "epoch": 0.29394258229370207, "flos": 37925474382720.0, "grad_norm": 1.8060574020422921, "language_loss": 0.55514884, "learning_rate": 3.3126356250147517e-06, "loss": 0.57664764, "num_input_tokens_seen": 105355605, "step": 4889, "time_per_iteration": 2.838529586791992 }, { "auxiliary_loss_clip": 0.01136079, "auxiliary_loss_mlp": 0.01040242, "balance_loss_clip": 1.05230045, "balance_loss_mlp": 1.02257514, "epoch": 0.29400270554637004, "flos": 20044690197120.0, "grad_norm": 1.9006309093473746, "language_loss": 0.84414017, "learning_rate": 3.3123417573093434e-06, "loss": 0.86590338, "num_input_tokens_seen": 105374225, "step": 4890, "time_per_iteration": 2.653601884841919 }, { "auxiliary_loss_clip": 0.01138833, "auxiliary_loss_mlp": 0.01044226, "balance_loss_clip": 1.05449104, "balance_loss_mlp": 1.02767992, "epoch": 0.294062828799038, "flos": 15266380775040.0, "grad_norm": 2.3284792525221625, "language_loss": 0.72417939, "learning_rate": 3.3120478398398046e-06, "loss": 0.74600995, "num_input_tokens_seen": 105391565, "step": 4891, "time_per_iteration": 2.6499764919281006 }, { "auxiliary_loss_clip": 0.01148906, "auxiliary_loss_mlp": 0.01046245, "balance_loss_clip": 1.05517375, "balance_loss_mlp": 1.02797008, "epoch": 0.294122952051706, "flos": 22747147799040.0, "grad_norm": 1.6858898954482169, "language_loss": 0.77310836, "learning_rate": 3.3117538726172797e-06, "loss": 0.7950598, "num_input_tokens_seen": 105409840, "step": 4892, "time_per_iteration": 2.6123669147491455 }, { "auxiliary_loss_clip": 0.01143283, "auxiliary_loss_mlp": 0.01036481, "balance_loss_clip": 1.05147183, "balance_loss_mlp": 1.01932704, "epoch": 0.294183075304374, "flos": 24972891073920.0, "grad_norm": 1.8056938004749827, "language_loss": 0.77826709, "learning_rate": 3.3114598556529164e-06, "loss": 0.80006474, "num_input_tokens_seen": 105428645, "step": 4893, "time_per_iteration": 2.6142194271087646 }, { "auxiliary_loss_clip": 0.01106286, "auxiliary_loss_mlp": 0.01045871, "balance_loss_clip": 1.0508399, "balance_loss_mlp": 1.02912164, "epoch": 0.29424319855704195, "flos": 30952986762240.0, "grad_norm": 3.6552959609210944, "language_loss": 0.85032988, "learning_rate": 3.311165788957864e-06, "loss": 0.87185144, "num_input_tokens_seen": 105447480, "step": 4894, "time_per_iteration": 2.837883234024048 }, { "auxiliary_loss_clip": 0.01131513, "auxiliary_loss_mlp": 0.01038131, "balance_loss_clip": 1.05098557, "balance_loss_mlp": 1.02169216, "epoch": 0.2943033218097099, "flos": 15231583474560.0, "grad_norm": 3.570255241204836, "language_loss": 0.90650308, "learning_rate": 3.310871672543274e-06, "loss": 0.92819947, "num_input_tokens_seen": 105464600, "step": 4895, "time_per_iteration": 2.588153839111328 }, { "auxiliary_loss_clip": 0.01138224, "auxiliary_loss_mlp": 0.01045554, "balance_loss_clip": 1.05338621, "balance_loss_mlp": 1.02777958, "epoch": 0.2943634450623779, "flos": 21725884310400.0, "grad_norm": 1.7548452829513195, "language_loss": 0.86612183, "learning_rate": 3.3105775064202982e-06, "loss": 0.88795966, "num_input_tokens_seen": 105481510, "step": 4896, "time_per_iteration": 2.6405279636383057 }, { "auxiliary_loss_clip": 0.01142594, "auxiliary_loss_mlp": 0.01053714, "balance_loss_clip": 1.05662429, "balance_loss_mlp": 1.03620195, "epoch": 0.29442356831504585, "flos": 22602104680320.0, "grad_norm": 2.0549220420715906, "language_loss": 0.73394442, "learning_rate": 3.3102832906000924e-06, "loss": 0.75590742, "num_input_tokens_seen": 105501390, "step": 4897, "time_per_iteration": 2.6669554710388184 }, { "auxiliary_loss_clip": 0.01128563, "auxiliary_loss_mlp": 0.01050668, "balance_loss_clip": 1.04556203, "balance_loss_mlp": 1.03214252, "epoch": 0.2944836915677138, "flos": 20011401267840.0, "grad_norm": 2.0814872266581426, "language_loss": 0.74344778, "learning_rate": 3.309989025093813e-06, "loss": 0.76524007, "num_input_tokens_seen": 105519600, "step": 4898, "time_per_iteration": 2.6286890506744385 }, { "auxiliary_loss_clip": 0.01140269, "auxiliary_loss_mlp": 0.01047883, "balance_loss_clip": 1.05775058, "balance_loss_mlp": 1.02880955, "epoch": 0.2945438148203818, "flos": 20045875345920.0, "grad_norm": 2.610474436320842, "language_loss": 0.70560962, "learning_rate": 3.309694709912618e-06, "loss": 0.72749114, "num_input_tokens_seen": 105535970, "step": 4899, "time_per_iteration": 2.6050777435302734 }, { "auxiliary_loss_clip": 0.01122842, "auxiliary_loss_mlp": 0.00775757, "balance_loss_clip": 1.05115175, "balance_loss_mlp": 1.00110114, "epoch": 0.29460393807304974, "flos": 23733542160000.0, "grad_norm": 2.6981557529788587, "language_loss": 0.78938496, "learning_rate": 3.3094003450676685e-06, "loss": 0.80837095, "num_input_tokens_seen": 105556735, "step": 4900, "time_per_iteration": 2.7517058849334717 }, { "auxiliary_loss_clip": 0.0110429, "auxiliary_loss_mlp": 0.01059395, "balance_loss_clip": 1.04257679, "balance_loss_mlp": 1.03992808, "epoch": 0.2946640613257177, "flos": 14976079056000.0, "grad_norm": 1.7286923709762618, "language_loss": 0.80861294, "learning_rate": 3.3091059305701268e-06, "loss": 0.83024979, "num_input_tokens_seen": 105574875, "step": 4901, "time_per_iteration": 2.58297061920166 }, { "auxiliary_loss_clip": 0.01114064, "auxiliary_loss_mlp": 0.01035256, "balance_loss_clip": 1.05081403, "balance_loss_mlp": 1.01993775, "epoch": 0.2947241845783857, "flos": 24243904552320.0, "grad_norm": 2.2236242529025954, "language_loss": 0.57768303, "learning_rate": 3.308811466431157e-06, "loss": 0.59917623, "num_input_tokens_seen": 105594225, "step": 4902, "time_per_iteration": 2.6765553951263428 }, { "auxiliary_loss_clip": 0.01122886, "auxiliary_loss_mlp": 0.01044406, "balance_loss_clip": 1.05165744, "balance_loss_mlp": 1.02809834, "epoch": 0.29478430783105364, "flos": 19938394874880.0, "grad_norm": 1.6365628527843905, "language_loss": 0.7553789, "learning_rate": 3.308516952661925e-06, "loss": 0.77705181, "num_input_tokens_seen": 105614000, "step": 4903, "time_per_iteration": 5.72201132774353 }, { "auxiliary_loss_clip": 0.01117125, "auxiliary_loss_mlp": 0.01054328, "balance_loss_clip": 1.05058551, "balance_loss_mlp": 1.03506362, "epoch": 0.2948444310837216, "flos": 27381347856000.0, "grad_norm": 1.79479894391178, "language_loss": 0.62782186, "learning_rate": 3.3082223892736e-06, "loss": 0.64953631, "num_input_tokens_seen": 105634575, "step": 4904, "time_per_iteration": 2.7290875911712646 }, { "auxiliary_loss_clip": 0.01135143, "auxiliary_loss_mlp": 0.01043669, "balance_loss_clip": 1.05146813, "balance_loss_mlp": 1.02669382, "epoch": 0.2949045543363896, "flos": 23405462311680.0, "grad_norm": 1.4755442774564356, "language_loss": 0.73145443, "learning_rate": 3.3079277762773496e-06, "loss": 0.75324261, "num_input_tokens_seen": 105654385, "step": 4905, "time_per_iteration": 2.6482555866241455 }, { "auxiliary_loss_clip": 0.01112476, "auxiliary_loss_mlp": 0.01046266, "balance_loss_clip": 1.05017638, "balance_loss_mlp": 1.028265, "epoch": 0.2949646775890576, "flos": 23951483930880.0, "grad_norm": 1.7800977730713317, "language_loss": 0.8199898, "learning_rate": 3.3076331136843476e-06, "loss": 0.84157723, "num_input_tokens_seen": 105673570, "step": 4906, "time_per_iteration": 2.737182378768921 }, { "auxiliary_loss_clip": 0.01094663, "auxiliary_loss_mlp": 0.01040505, "balance_loss_clip": 1.04579425, "balance_loss_mlp": 1.02372003, "epoch": 0.29502480084172555, "flos": 22784315397120.0, "grad_norm": 2.8763815934933867, "language_loss": 0.87373984, "learning_rate": 3.3073384015057667e-06, "loss": 0.89509153, "num_input_tokens_seen": 105691940, "step": 4907, "time_per_iteration": 4.367825746536255 }, { "auxiliary_loss_clip": 0.01149393, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.05400407, "balance_loss_mlp": 1.02501488, "epoch": 0.2950849240943935, "flos": 19646656611840.0, "grad_norm": 2.047818146937445, "language_loss": 0.81910521, "learning_rate": 3.307043639752782e-06, "loss": 0.84103584, "num_input_tokens_seen": 105709825, "step": 4908, "time_per_iteration": 2.578582525253296 }, { "auxiliary_loss_clip": 0.01055582, "auxiliary_loss_mlp": 0.01003419, "balance_loss_clip": 1.02453518, "balance_loss_mlp": 1.00138056, "epoch": 0.2951450473470615, "flos": 71002829260800.0, "grad_norm": 0.7982723827999523, "language_loss": 0.57287854, "learning_rate": 3.3067488284365728e-06, "loss": 0.59346855, "num_input_tokens_seen": 105766880, "step": 4909, "time_per_iteration": 4.640491247177124 }, { "auxiliary_loss_clip": 0.01135445, "auxiliary_loss_mlp": 0.00774301, "balance_loss_clip": 1.05580318, "balance_loss_mlp": 1.00097156, "epoch": 0.29520517059972945, "flos": 22966310632320.0, "grad_norm": 1.756295161453336, "language_loss": 0.87018639, "learning_rate": 3.3064539675683163e-06, "loss": 0.88928384, "num_input_tokens_seen": 105786875, "step": 4910, "time_per_iteration": 2.642312526702881 }, { "auxiliary_loss_clip": 0.01131096, "auxiliary_loss_mlp": 0.0104303, "balance_loss_clip": 1.05359542, "balance_loss_mlp": 1.02744913, "epoch": 0.2952652938523974, "flos": 20485673470080.0, "grad_norm": 1.692596753939278, "language_loss": 0.73332304, "learning_rate": 3.3061590571591946e-06, "loss": 0.75506431, "num_input_tokens_seen": 105805315, "step": 4911, "time_per_iteration": 2.6130573749542236 }, { "auxiliary_loss_clip": 0.01132917, "auxiliary_loss_mlp": 0.01038473, "balance_loss_clip": 1.05330253, "balance_loss_mlp": 1.02193832, "epoch": 0.2953254171050654, "flos": 19646584784640.0, "grad_norm": 1.8009313294920104, "language_loss": 0.89653587, "learning_rate": 3.3058640972203904e-06, "loss": 0.91824973, "num_input_tokens_seen": 105825125, "step": 4912, "time_per_iteration": 2.660090684890747 }, { "auxiliary_loss_clip": 0.01114053, "auxiliary_loss_mlp": 0.010529, "balance_loss_clip": 1.0482899, "balance_loss_mlp": 1.03503084, "epoch": 0.29538554035773334, "flos": 22747973811840.0, "grad_norm": 1.3579869674800176, "language_loss": 0.83175462, "learning_rate": 3.3055690877630894e-06, "loss": 0.85342413, "num_input_tokens_seen": 105846085, "step": 4913, "time_per_iteration": 2.743364095687866 }, { "auxiliary_loss_clip": 0.01142468, "auxiliary_loss_mlp": 0.01043093, "balance_loss_clip": 1.04977608, "balance_loss_mlp": 1.02690446, "epoch": 0.2954456636104013, "flos": 21871861182720.0, "grad_norm": 1.9704695859403116, "language_loss": 0.76919919, "learning_rate": 3.3052740287984765e-06, "loss": 0.79105484, "num_input_tokens_seen": 105865400, "step": 4914, "time_per_iteration": 2.6778385639190674 }, { "auxiliary_loss_clip": 0.01121315, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.05064511, "balance_loss_mlp": 1.02818418, "epoch": 0.2955057868630693, "flos": 40442560871040.0, "grad_norm": 1.678810736285401, "language_loss": 0.81829619, "learning_rate": 3.3049789203377424e-06, "loss": 0.8399632, "num_input_tokens_seen": 105887920, "step": 4915, "time_per_iteration": 2.9347212314605713 }, { "auxiliary_loss_clip": 0.01068117, "auxiliary_loss_mlp": 0.01044435, "balance_loss_clip": 1.04405856, "balance_loss_mlp": 1.02722168, "epoch": 0.29556591011573724, "flos": 22564506119040.0, "grad_norm": 2.129336551193515, "language_loss": 0.84701812, "learning_rate": 3.3046837623920772e-06, "loss": 0.86814368, "num_input_tokens_seen": 105904035, "step": 4916, "time_per_iteration": 2.9183273315429688 }, { "auxiliary_loss_clip": 0.01125851, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.04655123, "balance_loss_mlp": 1.01975429, "epoch": 0.2956260333684052, "flos": 22089300163200.0, "grad_norm": 2.1082729468541683, "language_loss": 0.69490808, "learning_rate": 3.3043885549726723e-06, "loss": 0.71653348, "num_input_tokens_seen": 105922685, "step": 4917, "time_per_iteration": 2.7400357723236084 }, { "auxiliary_loss_clip": 0.01123659, "auxiliary_loss_mlp": 0.01038633, "balance_loss_clip": 1.05140972, "balance_loss_mlp": 1.02214622, "epoch": 0.2956861566210732, "flos": 16435488643200.0, "grad_norm": 2.699189623646437, "language_loss": 0.91076934, "learning_rate": 3.3040932980907226e-06, "loss": 0.93239224, "num_input_tokens_seen": 105940425, "step": 4918, "time_per_iteration": 2.7343270778656006 }, { "auxiliary_loss_clip": 0.01147937, "auxiliary_loss_mlp": 0.01043258, "balance_loss_clip": 1.0551039, "balance_loss_mlp": 1.02629495, "epoch": 0.2957462798737412, "flos": 25812087500160.0, "grad_norm": 1.9388581576792214, "language_loss": 0.72399175, "learning_rate": 3.303797991757425e-06, "loss": 0.74590373, "num_input_tokens_seen": 105960550, "step": 4919, "time_per_iteration": 2.718583822250366 }, { "auxiliary_loss_clip": 0.01119627, "auxiliary_loss_mlp": 0.01045651, "balance_loss_clip": 1.04843163, "balance_loss_mlp": 1.02838945, "epoch": 0.29580640312640916, "flos": 16690849407360.0, "grad_norm": 1.8826298231205452, "language_loss": 0.75919485, "learning_rate": 3.3035026359839763e-06, "loss": 0.78084767, "num_input_tokens_seen": 105978820, "step": 4920, "time_per_iteration": 2.7425734996795654 }, { "auxiliary_loss_clip": 0.01121739, "auxiliary_loss_mlp": 0.01052293, "balance_loss_clip": 1.05511427, "balance_loss_mlp": 1.03449547, "epoch": 0.2958665263790771, "flos": 23945594100480.0, "grad_norm": 5.307541834842734, "language_loss": 0.69020098, "learning_rate": 3.3032072307815774e-06, "loss": 0.71194124, "num_input_tokens_seen": 105997545, "step": 4921, "time_per_iteration": 2.7755305767059326 }, { "auxiliary_loss_clip": 0.01120164, "auxiliary_loss_mlp": 0.01043, "balance_loss_clip": 1.05075121, "balance_loss_mlp": 1.02453458, "epoch": 0.2959266496317451, "flos": 18478410670080.0, "grad_norm": 1.8488664920888758, "language_loss": 0.7462194, "learning_rate": 3.3029117761614298e-06, "loss": 0.767851, "num_input_tokens_seen": 106015320, "step": 4922, "time_per_iteration": 2.740687131881714 }, { "auxiliary_loss_clip": 0.01152013, "auxiliary_loss_mlp": 0.00775382, "balance_loss_clip": 1.05429566, "balance_loss_mlp": 1.00129843, "epoch": 0.29598677288441305, "flos": 25957489754880.0, "grad_norm": 1.7662799143188246, "language_loss": 0.77148855, "learning_rate": 3.302616272134737e-06, "loss": 0.79076254, "num_input_tokens_seen": 106034555, "step": 4923, "time_per_iteration": 2.664875030517578 }, { "auxiliary_loss_clip": 0.01117655, "auxiliary_loss_mlp": 0.01042537, "balance_loss_clip": 1.05065989, "balance_loss_mlp": 1.0247035, "epoch": 0.296046896137081, "flos": 25155999630720.0, "grad_norm": 1.7775190737024398, "language_loss": 0.86232758, "learning_rate": 3.3023207187127042e-06, "loss": 0.88392955, "num_input_tokens_seen": 106054200, "step": 4924, "time_per_iteration": 2.7413501739501953 }, { "auxiliary_loss_clip": 0.01132544, "auxiliary_loss_mlp": 0.01038356, "balance_loss_clip": 1.05098939, "balance_loss_mlp": 1.02114248, "epoch": 0.296107019389749, "flos": 21761148487680.0, "grad_norm": 1.479657736715748, "language_loss": 0.82050943, "learning_rate": 3.3020251159065396e-06, "loss": 0.84221852, "num_input_tokens_seen": 106074700, "step": 4925, "time_per_iteration": 2.676556348800659 }, { "auxiliary_loss_clip": 0.01078547, "auxiliary_loss_mlp": 0.01051683, "balance_loss_clip": 1.04153097, "balance_loss_mlp": 1.03283572, "epoch": 0.29616714264241695, "flos": 17960039544960.0, "grad_norm": 2.5440905583969697, "language_loss": 0.86138272, "learning_rate": 3.301729463727452e-06, "loss": 0.88268495, "num_input_tokens_seen": 106091415, "step": 4926, "time_per_iteration": 2.675780773162842 }, { "auxiliary_loss_clip": 0.01108502, "auxiliary_loss_mlp": 0.01035423, "balance_loss_clip": 1.04910469, "balance_loss_mlp": 1.0193243, "epoch": 0.2962272658950849, "flos": 15012779777280.0, "grad_norm": 2.332235960138756, "language_loss": 0.85897464, "learning_rate": 3.3014337621866527e-06, "loss": 0.88041389, "num_input_tokens_seen": 106109135, "step": 4927, "time_per_iteration": 2.7407169342041016 }, { "auxiliary_loss_clip": 0.01131541, "auxiliary_loss_mlp": 0.01039363, "balance_loss_clip": 1.05158448, "balance_loss_mlp": 1.02312613, "epoch": 0.2962873891477529, "flos": 14720861946240.0, "grad_norm": 3.581765820174834, "language_loss": 0.80772752, "learning_rate": 3.3011380112953553e-06, "loss": 0.8294366, "num_input_tokens_seen": 106125750, "step": 4928, "time_per_iteration": 2.6719777584075928 }, { "auxiliary_loss_clip": 0.01123889, "auxiliary_loss_mlp": 0.01043191, "balance_loss_clip": 1.04852009, "balance_loss_mlp": 1.02346206, "epoch": 0.29634751240042084, "flos": 26723787528960.0, "grad_norm": 2.79065826833615, "language_loss": 0.7313869, "learning_rate": 3.300842211064773e-06, "loss": 0.75305772, "num_input_tokens_seen": 106142835, "step": 4929, "time_per_iteration": 2.75266695022583 }, { "auxiliary_loss_clip": 0.0112132, "auxiliary_loss_mlp": 0.01054118, "balance_loss_clip": 1.0495156, "balance_loss_mlp": 1.03481805, "epoch": 0.2964076356530888, "flos": 14571293713920.0, "grad_norm": 2.360375509218164, "language_loss": 0.71534413, "learning_rate": 3.3005463615061246e-06, "loss": 0.73709846, "num_input_tokens_seen": 106160680, "step": 4930, "time_per_iteration": 2.799149990081787 }, { "auxiliary_loss_clip": 0.01028509, "auxiliary_loss_mlp": 0.01003992, "balance_loss_clip": 1.03094876, "balance_loss_mlp": 1.00229919, "epoch": 0.29646775890575683, "flos": 63104315063040.0, "grad_norm": 0.8053244370028285, "language_loss": 0.6061247, "learning_rate": 3.3002504626306275e-06, "loss": 0.6264497, "num_input_tokens_seen": 106224415, "step": 4931, "time_per_iteration": 3.218900442123413 }, { "auxiliary_loss_clip": 0.01007041, "auxiliary_loss_mlp": 0.01005936, "balance_loss_clip": 1.02247667, "balance_loss_mlp": 1.00395727, "epoch": 0.2965278821584248, "flos": 63067686168960.0, "grad_norm": 0.7408573754586586, "language_loss": 0.52380091, "learning_rate": 3.2999545144495023e-06, "loss": 0.54393071, "num_input_tokens_seen": 106279140, "step": 4932, "time_per_iteration": 3.26432728767395 }, { "auxiliary_loss_clip": 0.01129633, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.04917526, "balance_loss_mlp": 1.02584457, "epoch": 0.29658800541109276, "flos": 23768734510080.0, "grad_norm": 2.012094119717185, "language_loss": 0.81540775, "learning_rate": 3.299658516973972e-06, "loss": 0.83712846, "num_input_tokens_seen": 106298190, "step": 4933, "time_per_iteration": 2.804293155670166 }, { "auxiliary_loss_clip": 0.01092845, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.04405773, "balance_loss_mlp": 1.01966333, "epoch": 0.2966481286637607, "flos": 23988543788160.0, "grad_norm": 1.916542141573101, "language_loss": 0.75165296, "learning_rate": 3.299362470215261e-06, "loss": 0.77295041, "num_input_tokens_seen": 106319065, "step": 4934, "time_per_iteration": 2.797697067260742 }, { "auxiliary_loss_clip": 0.01126398, "auxiliary_loss_mlp": 0.01047716, "balance_loss_clip": 1.04985118, "balance_loss_mlp": 1.03013301, "epoch": 0.2967082519164287, "flos": 17165157523200.0, "grad_norm": 1.8491505675561635, "language_loss": 0.62093496, "learning_rate": 3.299066374184594e-06, "loss": 0.64267612, "num_input_tokens_seen": 106338040, "step": 4935, "time_per_iteration": 2.6466407775878906 }, { "auxiliary_loss_clip": 0.01129018, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.05052114, "balance_loss_mlp": 1.02452123, "epoch": 0.29676837516909665, "flos": 29387712816000.0, "grad_norm": 1.4269626202910053, "language_loss": 0.79485404, "learning_rate": 3.2987702288932e-06, "loss": 0.81656075, "num_input_tokens_seen": 106358900, "step": 4936, "time_per_iteration": 2.7333009243011475 }, { "auxiliary_loss_clip": 0.01100808, "auxiliary_loss_mlp": 0.01048756, "balance_loss_clip": 1.04970682, "balance_loss_mlp": 1.03040934, "epoch": 0.2968284984217646, "flos": 34751222616960.0, "grad_norm": 1.5951903019521643, "language_loss": 0.73993498, "learning_rate": 3.298474034352309e-06, "loss": 0.76143062, "num_input_tokens_seen": 106381805, "step": 4937, "time_per_iteration": 2.853935718536377 }, { "auxiliary_loss_clip": 0.01094789, "auxiliary_loss_mlp": 0.01038743, "balance_loss_clip": 1.05060768, "balance_loss_mlp": 1.0209924, "epoch": 0.2968886216744326, "flos": 21544104556800.0, "grad_norm": 1.654578873057457, "language_loss": 0.78373563, "learning_rate": 3.2981777905731526e-06, "loss": 0.80507094, "num_input_tokens_seen": 106402365, "step": 4938, "time_per_iteration": 2.803147077560425 }, { "auxiliary_loss_clip": 0.0111878, "auxiliary_loss_mlp": 0.01048023, "balance_loss_clip": 1.05193913, "balance_loss_mlp": 1.02931857, "epoch": 0.29694874492710055, "flos": 12787323811200.0, "grad_norm": 2.4827377035181013, "language_loss": 0.76842266, "learning_rate": 3.297881497566964e-06, "loss": 0.79009068, "num_input_tokens_seen": 106419800, "step": 4939, "time_per_iteration": 2.8867270946502686 }, { "auxiliary_loss_clip": 0.0111051, "auxiliary_loss_mlp": 0.01041172, "balance_loss_clip": 1.04666841, "balance_loss_mlp": 1.02361226, "epoch": 0.2970088681797685, "flos": 24569973239040.0, "grad_norm": 1.8055035581570296, "language_loss": 0.78354549, "learning_rate": 3.297585155344979e-06, "loss": 0.80506229, "num_input_tokens_seen": 106440300, "step": 4940, "time_per_iteration": 2.783046245574951 }, { "auxiliary_loss_clip": 0.01117762, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.0486958, "balance_loss_mlp": 1.01876736, "epoch": 0.2970689914324365, "flos": 23659171050240.0, "grad_norm": 1.6305550110852276, "language_loss": 0.75628781, "learning_rate": 3.297288763918435e-06, "loss": 0.77784479, "num_input_tokens_seen": 106460035, "step": 4941, "time_per_iteration": 2.74379825592041 }, { "auxiliary_loss_clip": 0.01138083, "auxiliary_loss_mlp": 0.01051629, "balance_loss_clip": 1.05272233, "balance_loss_mlp": 1.03276968, "epoch": 0.29712911468510445, "flos": 39670301439360.0, "grad_norm": 2.3053326725865313, "language_loss": 0.74158287, "learning_rate": 3.2969923232985712e-06, "loss": 0.76347995, "num_input_tokens_seen": 106481095, "step": 4942, "time_per_iteration": 4.468350410461426 }, { "auxiliary_loss_clip": 0.01111068, "auxiliary_loss_mlp": 0.0104429, "balance_loss_clip": 1.05172181, "balance_loss_mlp": 1.02589595, "epoch": 0.2971892379377724, "flos": 26395312631040.0, "grad_norm": 2.42728921351593, "language_loss": 0.702492, "learning_rate": 3.2966958334966287e-06, "loss": 0.72404563, "num_input_tokens_seen": 106501590, "step": 4943, "time_per_iteration": 4.2555251121521 }, { "auxiliary_loss_clip": 0.01124177, "auxiliary_loss_mlp": 0.01041442, "balance_loss_clip": 1.04988825, "balance_loss_mlp": 1.02360821, "epoch": 0.2972493611904404, "flos": 17603195880960.0, "grad_norm": 2.221197725988377, "language_loss": 0.795506, "learning_rate": 3.2963992945238497e-06, "loss": 0.81716216, "num_input_tokens_seen": 106519430, "step": 4944, "time_per_iteration": 2.6572201251983643 }, { "auxiliary_loss_clip": 0.0111705, "auxiliary_loss_mlp": 0.01041351, "balance_loss_clip": 1.04914248, "balance_loss_mlp": 1.02521038, "epoch": 0.2973094844431084, "flos": 20412774817920.0, "grad_norm": 2.187472317578873, "language_loss": 0.83260202, "learning_rate": 3.2961027063914795e-06, "loss": 0.85418606, "num_input_tokens_seen": 106535870, "step": 4945, "time_per_iteration": 2.6700363159179688 }, { "auxiliary_loss_clip": 0.01090371, "auxiliary_loss_mlp": 0.01039575, "balance_loss_clip": 1.04623246, "balance_loss_mlp": 1.02256417, "epoch": 0.29736960769577636, "flos": 17493488766720.0, "grad_norm": 1.8830005833778707, "language_loss": 0.67067397, "learning_rate": 3.2958060691107654e-06, "loss": 0.69197345, "num_input_tokens_seen": 106553560, "step": 4946, "time_per_iteration": 4.29357385635376 }, { "auxiliary_loss_clip": 0.01127819, "auxiliary_loss_mlp": 0.00777134, "balance_loss_clip": 1.04997563, "balance_loss_mlp": 1.00115252, "epoch": 0.2974297309484443, "flos": 26103969417600.0, "grad_norm": 1.879721590970614, "language_loss": 0.73877805, "learning_rate": 3.2955093826929547e-06, "loss": 0.75782764, "num_input_tokens_seen": 106574115, "step": 4947, "time_per_iteration": 2.657038450241089 }, { "auxiliary_loss_clip": 0.01109701, "auxiliary_loss_mlp": 0.01045546, "balance_loss_clip": 1.04896843, "balance_loss_mlp": 1.02705622, "epoch": 0.2974898542011123, "flos": 25666433850240.0, "grad_norm": 2.0989098852090633, "language_loss": 0.73522758, "learning_rate": 3.2952126471492985e-06, "loss": 0.75678003, "num_input_tokens_seen": 106593070, "step": 4948, "time_per_iteration": 4.4359636306762695 }, { "auxiliary_loss_clip": 0.01139863, "auxiliary_loss_mlp": 0.01040301, "balance_loss_clip": 1.04885721, "balance_loss_mlp": 1.02332592, "epoch": 0.29754997745378026, "flos": 18661339658880.0, "grad_norm": 2.06615582769113, "language_loss": 0.8397494, "learning_rate": 3.2949158624910497e-06, "loss": 0.86155105, "num_input_tokens_seen": 106610695, "step": 4949, "time_per_iteration": 2.6052157878875732 }, { "auxiliary_loss_clip": 0.01128522, "auxiliary_loss_mlp": 0.01041578, "balance_loss_clip": 1.04901218, "balance_loss_mlp": 1.02459633, "epoch": 0.2976101007064482, "flos": 22274599449600.0, "grad_norm": 2.2184783420455814, "language_loss": 0.71360326, "learning_rate": 3.2946190287294603e-06, "loss": 0.73530424, "num_input_tokens_seen": 106631300, "step": 4950, "time_per_iteration": 2.678953170776367 }, { "auxiliary_loss_clip": 0.01095366, "auxiliary_loss_mlp": 0.01039981, "balance_loss_clip": 1.04944646, "balance_loss_mlp": 1.0239712, "epoch": 0.2976702239591162, "flos": 21945657674880.0, "grad_norm": 3.098719098855731, "language_loss": 0.82645297, "learning_rate": 3.294322145875789e-06, "loss": 0.84780639, "num_input_tokens_seen": 106650065, "step": 4951, "time_per_iteration": 2.7566003799438477 }, { "auxiliary_loss_clip": 0.01118264, "auxiliary_loss_mlp": 0.01039186, "balance_loss_clip": 1.04655933, "balance_loss_mlp": 1.02190065, "epoch": 0.29773034721178415, "flos": 24637197542400.0, "grad_norm": 15.690000260498868, "language_loss": 0.74144769, "learning_rate": 3.2940252139412912e-06, "loss": 0.76302218, "num_input_tokens_seen": 106668230, "step": 4952, "time_per_iteration": 2.7019882202148438 }, { "auxiliary_loss_clip": 0.01063128, "auxiliary_loss_mlp": 0.01049349, "balance_loss_clip": 1.0433315, "balance_loss_mlp": 1.03133702, "epoch": 0.2977904704644521, "flos": 20557566541440.0, "grad_norm": 1.6701113978494808, "language_loss": 0.84251344, "learning_rate": 3.293728232937228e-06, "loss": 0.86363828, "num_input_tokens_seen": 106687785, "step": 4953, "time_per_iteration": 2.9622793197631836 }, { "auxiliary_loss_clip": 0.01120636, "auxiliary_loss_mlp": 0.01040588, "balance_loss_clip": 1.04966831, "balance_loss_mlp": 1.02428031, "epoch": 0.2978505937171201, "flos": 18916449027840.0, "grad_norm": 2.301918041259246, "language_loss": 0.74366152, "learning_rate": 3.2934312028748597e-06, "loss": 0.76527375, "num_input_tokens_seen": 106706875, "step": 4954, "time_per_iteration": 2.767455577850342 }, { "auxiliary_loss_clip": 0.01138563, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.04899216, "balance_loss_mlp": 1.02028275, "epoch": 0.29791071696978805, "flos": 19317750750720.0, "grad_norm": 2.0603039788066155, "language_loss": 0.75687683, "learning_rate": 3.293134123765452e-06, "loss": 0.77862525, "num_input_tokens_seen": 106725105, "step": 4955, "time_per_iteration": 2.638389825820923 }, { "auxiliary_loss_clip": 0.01094257, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.04760742, "balance_loss_mlp": 1.02505171, "epoch": 0.297970840222456, "flos": 18806813740800.0, "grad_norm": 2.358195616275362, "language_loss": 0.72600436, "learning_rate": 3.2928369956202684e-06, "loss": 0.74737054, "num_input_tokens_seen": 106744780, "step": 4956, "time_per_iteration": 2.777873992919922 }, { "auxiliary_loss_clip": 0.01134603, "auxiliary_loss_mlp": 0.0104754, "balance_loss_clip": 1.04957581, "balance_loss_mlp": 1.02930105, "epoch": 0.298030963475124, "flos": 22852760762880.0, "grad_norm": 2.0297274127598435, "language_loss": 0.79068756, "learning_rate": 3.2925398184505754e-06, "loss": 0.81250894, "num_input_tokens_seen": 106764670, "step": 4957, "time_per_iteration": 2.719581365585327 }, { "auxiliary_loss_clip": 0.01134843, "auxiliary_loss_mlp": 0.01041974, "balance_loss_clip": 1.05054235, "balance_loss_mlp": 1.02383018, "epoch": 0.298091086727792, "flos": 21868485304320.0, "grad_norm": 1.706880580606115, "language_loss": 0.70570725, "learning_rate": 3.2922425922676437e-06, "loss": 0.7274754, "num_input_tokens_seen": 106783695, "step": 4958, "time_per_iteration": 2.613697052001953 }, { "auxiliary_loss_clip": 0.01108077, "auxiliary_loss_mlp": 0.0104267, "balance_loss_clip": 1.05166888, "balance_loss_mlp": 1.0253129, "epoch": 0.29815120998045996, "flos": 21175014355200.0, "grad_norm": 1.5383051389102413, "language_loss": 0.78736448, "learning_rate": 3.291945317082743e-06, "loss": 0.80887192, "num_input_tokens_seen": 106803150, "step": 4959, "time_per_iteration": 2.751455545425415 }, { "auxiliary_loss_clip": 0.01129828, "auxiliary_loss_mlp": 0.01045919, "balance_loss_clip": 1.04906321, "balance_loss_mlp": 1.0290029, "epoch": 0.29821133323312793, "flos": 19896271200000.0, "grad_norm": 1.6624120752671379, "language_loss": 0.79747117, "learning_rate": 3.291647992907147e-06, "loss": 0.81922865, "num_input_tokens_seen": 106820705, "step": 4960, "time_per_iteration": 2.6345505714416504 }, { "auxiliary_loss_clip": 0.01110987, "auxiliary_loss_mlp": 0.01052912, "balance_loss_clip": 1.04863763, "balance_loss_mlp": 1.03449416, "epoch": 0.2982714564857959, "flos": 12750766744320.0, "grad_norm": 2.376132196895137, "language_loss": 0.73364639, "learning_rate": 3.291350619752129e-06, "loss": 0.75528538, "num_input_tokens_seen": 106837335, "step": 4961, "time_per_iteration": 2.725008010864258 }, { "auxiliary_loss_clip": 0.01130001, "auxiliary_loss_mlp": 0.0104294, "balance_loss_clip": 1.04824948, "balance_loss_mlp": 1.02640533, "epoch": 0.29833157973846386, "flos": 22271905929600.0, "grad_norm": 2.036560430862295, "language_loss": 0.62106621, "learning_rate": 3.291053197628967e-06, "loss": 0.64279556, "num_input_tokens_seen": 106856250, "step": 4962, "time_per_iteration": 2.690870523452759 }, { "auxiliary_loss_clip": 0.01128362, "auxiliary_loss_mlp": 0.01051341, "balance_loss_clip": 1.05034256, "balance_loss_mlp": 1.03310251, "epoch": 0.2983917029911318, "flos": 15372999319680.0, "grad_norm": 2.046461333274312, "language_loss": 0.82866591, "learning_rate": 3.2907557265489375e-06, "loss": 0.85046291, "num_input_tokens_seen": 106873370, "step": 4963, "time_per_iteration": 2.637723207473755 }, { "auxiliary_loss_clip": 0.01112844, "auxiliary_loss_mlp": 0.01044675, "balance_loss_clip": 1.05338502, "balance_loss_mlp": 1.0272826, "epoch": 0.2984518262437998, "flos": 15377632174080.0, "grad_norm": 2.580714695656121, "language_loss": 0.65933317, "learning_rate": 3.290458206523322e-06, "loss": 0.68090838, "num_input_tokens_seen": 106890330, "step": 4964, "time_per_iteration": 2.7210114002227783 }, { "auxiliary_loss_clip": 0.01128428, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.04990005, "balance_loss_mlp": 1.02345669, "epoch": 0.29851194949646775, "flos": 18108458542080.0, "grad_norm": 1.8191471944851214, "language_loss": 0.71093529, "learning_rate": 3.2901606375634015e-06, "loss": 0.73261172, "num_input_tokens_seen": 106909190, "step": 4965, "time_per_iteration": 2.7070064544677734 }, { "auxiliary_loss_clip": 0.01151396, "auxiliary_loss_mlp": 0.01056357, "balance_loss_clip": 1.05813003, "balance_loss_mlp": 1.03827357, "epoch": 0.2985720727491357, "flos": 22018233104640.0, "grad_norm": 2.164601494744612, "language_loss": 0.65952027, "learning_rate": 3.289863019680461e-06, "loss": 0.68159783, "num_input_tokens_seen": 106927825, "step": 4966, "time_per_iteration": 2.5820860862731934 }, { "auxiliary_loss_clip": 0.01148496, "auxiliary_loss_mlp": 0.01042183, "balance_loss_clip": 1.05610132, "balance_loss_mlp": 1.02496934, "epoch": 0.2986321960018037, "flos": 13041355772160.0, "grad_norm": 5.631297794621363, "language_loss": 0.73553479, "learning_rate": 3.289565352885785e-06, "loss": 0.75744158, "num_input_tokens_seen": 106943155, "step": 4967, "time_per_iteration": 2.558378219604492 }, { "auxiliary_loss_clip": 0.01110231, "auxiliary_loss_mlp": 0.01041561, "balance_loss_clip": 1.04339898, "balance_loss_mlp": 1.02440643, "epoch": 0.29869231925447165, "flos": 14465034305280.0, "grad_norm": 2.07351823246568, "language_loss": 0.71246195, "learning_rate": 3.2892676371906614e-06, "loss": 0.73397982, "num_input_tokens_seen": 106960295, "step": 4968, "time_per_iteration": 2.663163900375366 }, { "auxiliary_loss_clip": 0.01124763, "auxiliary_loss_mlp": 0.01043588, "balance_loss_clip": 1.04864979, "balance_loss_mlp": 1.02545607, "epoch": 0.2987524425071396, "flos": 31650228639360.0, "grad_norm": 2.159507035183752, "language_loss": 0.76744419, "learning_rate": 3.2889698726063805e-06, "loss": 0.78912771, "num_input_tokens_seen": 106982870, "step": 4969, "time_per_iteration": 2.729922294616699 }, { "auxiliary_loss_clip": 0.0114364, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.05239987, "balance_loss_mlp": 1.02054322, "epoch": 0.2988125657598076, "flos": 21433427775360.0, "grad_norm": 2.2724385668179936, "language_loss": 0.69836891, "learning_rate": 3.2886720591442327e-06, "loss": 0.72016788, "num_input_tokens_seen": 107002405, "step": 4970, "time_per_iteration": 2.6299381256103516 }, { "auxiliary_loss_clip": 0.01135061, "auxiliary_loss_mlp": 0.01048009, "balance_loss_clip": 1.05199289, "balance_loss_mlp": 1.02973413, "epoch": 0.2988726890124756, "flos": 18076965292800.0, "grad_norm": 2.0648779209654258, "language_loss": 0.85228848, "learning_rate": 3.2883741968155103e-06, "loss": 0.87411916, "num_input_tokens_seen": 107017310, "step": 4971, "time_per_iteration": 2.6508536338806152 }, { "auxiliary_loss_clip": 0.01112297, "auxiliary_loss_mlp": 0.01054091, "balance_loss_clip": 1.04895663, "balance_loss_mlp": 1.03510106, "epoch": 0.29893281226514357, "flos": 21755653706880.0, "grad_norm": 2.125047221260382, "language_loss": 0.79404521, "learning_rate": 3.2880762856315107e-06, "loss": 0.81570905, "num_input_tokens_seen": 107034645, "step": 4972, "time_per_iteration": 2.7924270629882812 }, { "auxiliary_loss_clip": 0.01145651, "auxiliary_loss_mlp": 0.01050789, "balance_loss_clip": 1.05367875, "balance_loss_mlp": 1.03427887, "epoch": 0.29899293551781153, "flos": 16836718538880.0, "grad_norm": 2.200462139835186, "language_loss": 0.85242772, "learning_rate": 3.2877783256035285e-06, "loss": 0.87439215, "num_input_tokens_seen": 107051125, "step": 4973, "time_per_iteration": 2.5249850749969482 }, { "auxiliary_loss_clip": 0.011108, "auxiliary_loss_mlp": 0.0104405, "balance_loss_clip": 1.04758012, "balance_loss_mlp": 1.02664554, "epoch": 0.2990530587704795, "flos": 11729215946880.0, "grad_norm": 2.0029664307268664, "language_loss": 0.77612329, "learning_rate": 3.287480316742863e-06, "loss": 0.79767179, "num_input_tokens_seen": 107068815, "step": 4974, "time_per_iteration": 2.6555633544921875 }, { "auxiliary_loss_clip": 0.01115732, "auxiliary_loss_mlp": 0.00779073, "balance_loss_clip": 1.04864824, "balance_loss_mlp": 1.00132942, "epoch": 0.29911318202314746, "flos": 28039877850240.0, "grad_norm": 1.735885031779611, "language_loss": 0.72557616, "learning_rate": 3.287182259060815e-06, "loss": 0.74452424, "num_input_tokens_seen": 107090420, "step": 4975, "time_per_iteration": 2.826773166656494 }, { "auxiliary_loss_clip": 0.01137332, "auxiliary_loss_mlp": 0.01043625, "balance_loss_clip": 1.05628741, "balance_loss_mlp": 1.02561235, "epoch": 0.2991733052758154, "flos": 18733555952640.0, "grad_norm": 2.282255680734404, "language_loss": 0.76357341, "learning_rate": 3.286884152568687e-06, "loss": 0.78538299, "num_input_tokens_seen": 107107255, "step": 4976, "time_per_iteration": 2.7506988048553467 }, { "auxiliary_loss_clip": 0.01130399, "auxiliary_loss_mlp": 0.01046525, "balance_loss_clip": 1.0515976, "balance_loss_mlp": 1.02988303, "epoch": 0.2992334285284834, "flos": 15559160532480.0, "grad_norm": 2.005019372487673, "language_loss": 0.86173046, "learning_rate": 3.2865859972777827e-06, "loss": 0.88349968, "num_input_tokens_seen": 107123840, "step": 4977, "time_per_iteration": 2.665029764175415 }, { "auxiliary_loss_clip": 0.01118345, "auxiliary_loss_mlp": 0.01041325, "balance_loss_clip": 1.05032945, "balance_loss_mlp": 1.02443314, "epoch": 0.29929355178115136, "flos": 21797561900160.0, "grad_norm": 1.7658271873172786, "language_loss": 0.68290305, "learning_rate": 3.2862877931994088e-06, "loss": 0.70449972, "num_input_tokens_seen": 107143475, "step": 4978, "time_per_iteration": 2.8401222229003906 }, { "auxiliary_loss_clip": 0.011259, "auxiliary_loss_mlp": 0.0104045, "balance_loss_clip": 1.05556107, "balance_loss_mlp": 1.02268767, "epoch": 0.2993536750338193, "flos": 21178533888000.0, "grad_norm": 2.254262103488659, "language_loss": 0.76281357, "learning_rate": 3.2859895403448726e-06, "loss": 0.78447711, "num_input_tokens_seen": 107161725, "step": 4979, "time_per_iteration": 2.7814600467681885 }, { "auxiliary_loss_clip": 0.01090165, "auxiliary_loss_mlp": 0.0104942, "balance_loss_clip": 1.04378402, "balance_loss_mlp": 1.03001285, "epoch": 0.2994137982864873, "flos": 32122130544000.0, "grad_norm": 2.1261514095664253, "language_loss": 0.68627954, "learning_rate": 3.285691238725484e-06, "loss": 0.70767546, "num_input_tokens_seen": 107183935, "step": 4980, "time_per_iteration": 2.891620635986328 }, { "auxiliary_loss_clip": 0.01130184, "auxiliary_loss_mlp": 0.00774942, "balance_loss_clip": 1.0525018, "balance_loss_mlp": 1.00121665, "epoch": 0.29947392153915525, "flos": 21105419754240.0, "grad_norm": 2.1372298066204114, "language_loss": 0.73153281, "learning_rate": 3.285392888352555e-06, "loss": 0.75058407, "num_input_tokens_seen": 107204285, "step": 4981, "time_per_iteration": 5.394481420516968 }, { "auxiliary_loss_clip": 0.01131964, "auxiliary_loss_mlp": 0.0103921, "balance_loss_clip": 1.0491364, "balance_loss_mlp": 1.02280653, "epoch": 0.2995340447918232, "flos": 21542632099200.0, "grad_norm": 1.6530173596529, "language_loss": 0.86516619, "learning_rate": 3.2850944892373987e-06, "loss": 0.88687789, "num_input_tokens_seen": 107225265, "step": 4982, "time_per_iteration": 4.269104480743408 }, { "auxiliary_loss_clip": 0.01122605, "auxiliary_loss_mlp": 0.01045235, "balance_loss_clip": 1.05186415, "balance_loss_mlp": 1.02632844, "epoch": 0.2995941680444912, "flos": 16725143917440.0, "grad_norm": 2.446225936700185, "language_loss": 0.86517423, "learning_rate": 3.2847960413913307e-06, "loss": 0.88685262, "num_input_tokens_seen": 107241335, "step": 4983, "time_per_iteration": 2.844748020172119 }, { "auxiliary_loss_clip": 0.01127565, "auxiliary_loss_mlp": 0.01041992, "balance_loss_clip": 1.05255556, "balance_loss_mlp": 1.02594662, "epoch": 0.2996542912971592, "flos": 20923496346240.0, "grad_norm": 2.024163877740881, "language_loss": 0.78712893, "learning_rate": 3.284497544825668e-06, "loss": 0.80882448, "num_input_tokens_seen": 107259375, "step": 4984, "time_per_iteration": 2.6945550441741943 }, { "auxiliary_loss_clip": 0.01110139, "auxiliary_loss_mlp": 0.01046002, "balance_loss_clip": 1.0492574, "balance_loss_mlp": 1.02761972, "epoch": 0.29971441454982717, "flos": 25079868754560.0, "grad_norm": 1.5529534411437271, "language_loss": 0.78736818, "learning_rate": 3.2841989995517303e-06, "loss": 0.8089295, "num_input_tokens_seen": 107279890, "step": 4985, "time_per_iteration": 2.8082690238952637 }, { "auxiliary_loss_clip": 0.01083189, "auxiliary_loss_mlp": 0.01050178, "balance_loss_clip": 1.04330277, "balance_loss_mlp": 1.02925658, "epoch": 0.29977453780249513, "flos": 52555911840000.0, "grad_norm": 2.2301347819864112, "language_loss": 0.72089684, "learning_rate": 3.283900405580837e-06, "loss": 0.74223053, "num_input_tokens_seen": 107303430, "step": 4986, "time_per_iteration": 4.54891562461853 }, { "auxiliary_loss_clip": 0.01119419, "auxiliary_loss_mlp": 0.01047564, "balance_loss_clip": 1.04838538, "balance_loss_mlp": 1.03007603, "epoch": 0.2998346610551631, "flos": 22237144542720.0, "grad_norm": 2.1453051702670787, "language_loss": 0.73143345, "learning_rate": 3.283601762924312e-06, "loss": 0.75310332, "num_input_tokens_seen": 107323700, "step": 4987, "time_per_iteration": 4.324375152587891 }, { "auxiliary_loss_clip": 0.01111213, "auxiliary_loss_mlp": 0.01039103, "balance_loss_clip": 1.04803324, "balance_loss_mlp": 1.0233314, "epoch": 0.29989478430783106, "flos": 16873203778560.0, "grad_norm": 2.095598578062247, "language_loss": 0.80221194, "learning_rate": 3.2833030715934793e-06, "loss": 0.82371509, "num_input_tokens_seen": 107341965, "step": 4988, "time_per_iteration": 2.772221565246582 }, { "auxiliary_loss_clip": 0.01114945, "auxiliary_loss_mlp": 0.00777889, "balance_loss_clip": 1.04905486, "balance_loss_mlp": 1.0013597, "epoch": 0.29995490756049903, "flos": 23768878164480.0, "grad_norm": 1.6966696236855432, "language_loss": 0.70858777, "learning_rate": 3.2830043315996658e-06, "loss": 0.72751617, "num_input_tokens_seen": 107362615, "step": 4989, "time_per_iteration": 2.7470130920410156 }, { "auxiliary_loss_clip": 0.0110827, "auxiliary_loss_mlp": 0.01046589, "balance_loss_clip": 1.0506041, "balance_loss_mlp": 1.02906489, "epoch": 0.300015030813167, "flos": 14465321614080.0, "grad_norm": 1.9545100728262668, "language_loss": 0.85589516, "learning_rate": 3.282705542954199e-06, "loss": 0.87744367, "num_input_tokens_seen": 107378980, "step": 4990, "time_per_iteration": 2.808276414871216 }, { "auxiliary_loss_clip": 0.01133569, "auxiliary_loss_mlp": 0.0103974, "balance_loss_clip": 1.05172086, "balance_loss_mlp": 1.02152538, "epoch": 0.30007515406583496, "flos": 25191982080000.0, "grad_norm": 1.8023870470649808, "language_loss": 0.67019355, "learning_rate": 3.28240670566841e-06, "loss": 0.69192666, "num_input_tokens_seen": 107397640, "step": 4991, "time_per_iteration": 2.7097268104553223 }, { "auxiliary_loss_clip": 0.0112021, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.04660511, "balance_loss_mlp": 1.02248883, "epoch": 0.3001352773185029, "flos": 19391188106880.0, "grad_norm": 1.684252307124257, "language_loss": 0.78640115, "learning_rate": 3.28210781975363e-06, "loss": 0.80801708, "num_input_tokens_seen": 107416020, "step": 4992, "time_per_iteration": 2.66925311088562 }, { "auxiliary_loss_clip": 0.01143243, "auxiliary_loss_mlp": 0.01041924, "balance_loss_clip": 1.05240428, "balance_loss_mlp": 1.02457952, "epoch": 0.3001954005711709, "flos": 21543853161600.0, "grad_norm": 2.3134173579188175, "language_loss": 0.82057947, "learning_rate": 3.281808885221193e-06, "loss": 0.84243113, "num_input_tokens_seen": 107436340, "step": 4993, "time_per_iteration": 2.613849639892578 }, { "auxiliary_loss_clip": 0.01096023, "auxiliary_loss_mlp": 0.01048917, "balance_loss_clip": 1.04667079, "balance_loss_mlp": 1.02997458, "epoch": 0.30025552382383885, "flos": 17384320356480.0, "grad_norm": 2.1042579138834197, "language_loss": 0.86142659, "learning_rate": 3.2815099020824345e-06, "loss": 0.88287598, "num_input_tokens_seen": 107454585, "step": 4994, "time_per_iteration": 2.703126907348633 }, { "auxiliary_loss_clip": 0.01118329, "auxiliary_loss_mlp": 0.01041975, "balance_loss_clip": 1.05592799, "balance_loss_mlp": 1.02504694, "epoch": 0.3003156470765068, "flos": 29533330552320.0, "grad_norm": 1.5905866784601752, "language_loss": 0.80834931, "learning_rate": 3.2812108703486924e-06, "loss": 0.82995236, "num_input_tokens_seen": 107477180, "step": 4995, "time_per_iteration": 2.8100333213806152 }, { "auxiliary_loss_clip": 0.01117939, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.05073023, "balance_loss_mlp": 1.02623129, "epoch": 0.3003757703291748, "flos": 43646402465280.0, "grad_norm": 1.9490007813217745, "language_loss": 0.67086798, "learning_rate": 3.2809117900313055e-06, "loss": 0.69248348, "num_input_tokens_seen": 107500250, "step": 4996, "time_per_iteration": 2.989062786102295 }, { "auxiliary_loss_clip": 0.01114657, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.04888701, "balance_loss_mlp": 1.02449584, "epoch": 0.30043589358184275, "flos": 22528380015360.0, "grad_norm": 4.4692930536610245, "language_loss": 0.75825363, "learning_rate": 3.280612661141615e-06, "loss": 0.7798208, "num_input_tokens_seen": 107520070, "step": 4997, "time_per_iteration": 2.733402967453003 }, { "auxiliary_loss_clip": 0.01131118, "auxiliary_loss_mlp": 0.0104737, "balance_loss_clip": 1.05176449, "balance_loss_mlp": 1.03149128, "epoch": 0.30049601683451077, "flos": 20995892208000.0, "grad_norm": 2.0588160995259197, "language_loss": 0.78425241, "learning_rate": 3.2803134836909646e-06, "loss": 0.80603731, "num_input_tokens_seen": 107539285, "step": 4998, "time_per_iteration": 2.7973837852478027 }, { "auxiliary_loss_clip": 0.011392, "auxiliary_loss_mlp": 0.01044927, "balance_loss_clip": 1.05180395, "balance_loss_mlp": 1.0287745, "epoch": 0.30055614008717874, "flos": 23916004272000.0, "grad_norm": 18.871291300313036, "language_loss": 0.73622382, "learning_rate": 3.2800142576906985e-06, "loss": 0.7580651, "num_input_tokens_seen": 107560260, "step": 4999, "time_per_iteration": 2.7197916507720947 }, { "auxiliary_loss_clip": 0.01131684, "auxiliary_loss_mlp": 0.01044515, "balance_loss_clip": 1.05033612, "balance_loss_mlp": 1.02750361, "epoch": 0.3006162633398467, "flos": 19169798630400.0, "grad_norm": 1.6090337016392804, "language_loss": 0.75454789, "learning_rate": 3.2797149831521626e-06, "loss": 0.77630985, "num_input_tokens_seen": 107579260, "step": 5000, "time_per_iteration": 2.688054323196411 }, { "auxiliary_loss_clip": 0.01138443, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.0505259, "balance_loss_mlp": 1.02564812, "epoch": 0.30067638659251467, "flos": 14679241061760.0, "grad_norm": 1.7985326326547535, "language_loss": 0.81841409, "learning_rate": 3.2794156600867073e-06, "loss": 0.84020931, "num_input_tokens_seen": 107595245, "step": 5001, "time_per_iteration": 2.6519837379455566 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.01048602, "balance_loss_clip": 1.05139947, "balance_loss_mlp": 1.03068447, "epoch": 0.30073650984518263, "flos": 23368007404800.0, "grad_norm": 1.8684342377814658, "language_loss": 0.7999261, "learning_rate": 3.2791162885056815e-06, "loss": 0.82169974, "num_input_tokens_seen": 107613985, "step": 5002, "time_per_iteration": 2.6749327182769775 }, { "auxiliary_loss_clip": 0.01091983, "auxiliary_loss_mlp": 0.0104282, "balance_loss_clip": 1.04869151, "balance_loss_mlp": 1.02431834, "epoch": 0.3007966330978506, "flos": 22966633854720.0, "grad_norm": 1.9577039368374018, "language_loss": 0.70993537, "learning_rate": 3.2788168684204376e-06, "loss": 0.73128337, "num_input_tokens_seen": 107631435, "step": 5003, "time_per_iteration": 2.908494472503662 }, { "auxiliary_loss_clip": 0.01110546, "auxiliary_loss_mlp": 0.01043883, "balance_loss_clip": 1.05014396, "balance_loss_mlp": 1.02643037, "epoch": 0.30085675635051856, "flos": 27818452460160.0, "grad_norm": 1.956987555909332, "language_loss": 0.70556092, "learning_rate": 3.27851739984233e-06, "loss": 0.72710526, "num_input_tokens_seen": 107650530, "step": 5004, "time_per_iteration": 2.8064236640930176 }, { "auxiliary_loss_clip": 0.01119172, "auxiliary_loss_mlp": 0.01045143, "balance_loss_clip": 1.05067444, "balance_loss_mlp": 1.02800083, "epoch": 0.3009168796031865, "flos": 10882729059840.0, "grad_norm": 2.8453259041050805, "language_loss": 0.81459486, "learning_rate": 3.278217882782715e-06, "loss": 0.83623803, "num_input_tokens_seen": 107662240, "step": 5005, "time_per_iteration": 2.633951425552368 }, { "auxiliary_loss_clip": 0.01130639, "auxiliary_loss_mlp": 0.01043853, "balance_loss_clip": 1.0514015, "balance_loss_mlp": 1.02742577, "epoch": 0.3009770028558545, "flos": 23805399317760.0, "grad_norm": 3.7156546302240043, "language_loss": 0.74672973, "learning_rate": 3.2779183172529497e-06, "loss": 0.76847464, "num_input_tokens_seen": 107680330, "step": 5006, "time_per_iteration": 2.7556662559509277 }, { "auxiliary_loss_clip": 0.01101239, "auxiliary_loss_mlp": 0.00775371, "balance_loss_clip": 1.04850578, "balance_loss_mlp": 1.00104856, "epoch": 0.30103712610852246, "flos": 26468211283200.0, "grad_norm": 2.0504029481480153, "language_loss": 0.71090448, "learning_rate": 3.2776187032643932e-06, "loss": 0.72967064, "num_input_tokens_seen": 107700020, "step": 5007, "time_per_iteration": 2.83591365814209 }, { "auxiliary_loss_clip": 0.01129575, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.05173922, "balance_loss_mlp": 1.0206027, "epoch": 0.3010972493611904, "flos": 22856459863680.0, "grad_norm": 2.302333802055736, "language_loss": 0.76504552, "learning_rate": 3.2773190408284075e-06, "loss": 0.78672242, "num_input_tokens_seen": 107718575, "step": 5008, "time_per_iteration": 2.7624082565307617 }, { "auxiliary_loss_clip": 0.0112694, "auxiliary_loss_mlp": 0.01039735, "balance_loss_clip": 1.05119205, "balance_loss_mlp": 1.02284265, "epoch": 0.3011573726138584, "flos": 24053685102720.0, "grad_norm": 1.840633361886899, "language_loss": 0.84215975, "learning_rate": 3.2770193299563564e-06, "loss": 0.86382657, "num_input_tokens_seen": 107738635, "step": 5009, "time_per_iteration": 2.7053475379943848 }, { "auxiliary_loss_clip": 0.01135722, "auxiliary_loss_mlp": 0.0104281, "balance_loss_clip": 1.05079174, "balance_loss_mlp": 1.02389145, "epoch": 0.30121749586652635, "flos": 20259687052800.0, "grad_norm": 1.970244045667646, "language_loss": 0.83804011, "learning_rate": 3.276719570659604e-06, "loss": 0.85982549, "num_input_tokens_seen": 107753415, "step": 5010, "time_per_iteration": 2.677002429962158 }, { "auxiliary_loss_clip": 0.01108582, "auxiliary_loss_mlp": 0.01038214, "balance_loss_clip": 1.04942024, "balance_loss_mlp": 1.02294374, "epoch": 0.3012776191191944, "flos": 26943058103040.0, "grad_norm": 2.3216326772862246, "language_loss": 0.85401523, "learning_rate": 3.2764197629495176e-06, "loss": 0.87548327, "num_input_tokens_seen": 107773840, "step": 5011, "time_per_iteration": 2.807887077331543 }, { "auxiliary_loss_clip": 0.01119452, "auxiliary_loss_mlp": 0.01044648, "balance_loss_clip": 1.04522014, "balance_loss_mlp": 1.02680194, "epoch": 0.30133774237186234, "flos": 20412307941120.0, "grad_norm": 2.58081844210284, "language_loss": 0.72122502, "learning_rate": 3.2761199068374656e-06, "loss": 0.74286604, "num_input_tokens_seen": 107792020, "step": 5012, "time_per_iteration": 2.689375400543213 }, { "auxiliary_loss_clip": 0.01127162, "auxiliary_loss_mlp": 0.01042946, "balance_loss_clip": 1.04826403, "balance_loss_mlp": 1.02628016, "epoch": 0.3013978656245303, "flos": 19792453916160.0, "grad_norm": 2.871668468467944, "language_loss": 0.88278735, "learning_rate": 3.275820002334819e-06, "loss": 0.90448833, "num_input_tokens_seen": 107809595, "step": 5013, "time_per_iteration": 2.6482350826263428 }, { "auxiliary_loss_clip": 0.01110184, "auxiliary_loss_mlp": 0.01050326, "balance_loss_clip": 1.04318821, "balance_loss_mlp": 1.0286417, "epoch": 0.30145798887719827, "flos": 16249650652800.0, "grad_norm": 1.8756845710135603, "language_loss": 0.82593644, "learning_rate": 3.2755200494529496e-06, "loss": 0.84754151, "num_input_tokens_seen": 107827230, "step": 5014, "time_per_iteration": 2.6681008338928223 }, { "auxiliary_loss_clip": 0.01092673, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.04461288, "balance_loss_mlp": 1.03045392, "epoch": 0.30151811212986623, "flos": 24571733005440.0, "grad_norm": 1.7101695757694795, "language_loss": 0.68239003, "learning_rate": 3.2752200482032323e-06, "loss": 0.7037937, "num_input_tokens_seen": 107847195, "step": 5015, "time_per_iteration": 2.725411891937256 }, { "auxiliary_loss_clip": 0.01110447, "auxiliary_loss_mlp": 0.01043819, "balance_loss_clip": 1.0448432, "balance_loss_mlp": 1.02652168, "epoch": 0.3015782353825342, "flos": 21872076664320.0, "grad_norm": 2.2766913154728625, "language_loss": 0.74497074, "learning_rate": 3.2749199985970436e-06, "loss": 0.76651341, "num_input_tokens_seen": 107866420, "step": 5016, "time_per_iteration": 2.710721492767334 }, { "auxiliary_loss_clip": 0.01133464, "auxiliary_loss_mlp": 0.01041604, "balance_loss_clip": 1.05026031, "balance_loss_mlp": 1.02444994, "epoch": 0.30163835863520216, "flos": 28769331248640.0, "grad_norm": 1.7847015072033203, "language_loss": 0.65504754, "learning_rate": 3.2746199006457603e-06, "loss": 0.67679822, "num_input_tokens_seen": 107889090, "step": 5017, "time_per_iteration": 2.7239317893981934 }, { "auxiliary_loss_clip": 0.01091977, "auxiliary_loss_mlp": 0.01057247, "balance_loss_clip": 1.04233074, "balance_loss_mlp": 1.03813791, "epoch": 0.30169848188787013, "flos": 22966202891520.0, "grad_norm": 2.1696927992492783, "language_loss": 0.68739498, "learning_rate": 3.2743197543607628e-06, "loss": 0.70888722, "num_input_tokens_seen": 107907520, "step": 5018, "time_per_iteration": 2.6655359268188477 }, { "auxiliary_loss_clip": 0.01135218, "auxiliary_loss_mlp": 0.01042787, "balance_loss_clip": 1.0482893, "balance_loss_mlp": 1.02783799, "epoch": 0.3017586051405381, "flos": 21835268202240.0, "grad_norm": 1.9457029488983892, "language_loss": 0.78853333, "learning_rate": 3.2740195597534327e-06, "loss": 0.8103134, "num_input_tokens_seen": 107925650, "step": 5019, "time_per_iteration": 2.669679641723633 }, { "auxiliary_loss_clip": 0.01112458, "auxiliary_loss_mlp": 0.01044161, "balance_loss_clip": 1.04863656, "balance_loss_mlp": 1.02766263, "epoch": 0.30181872839320606, "flos": 22160403135360.0, "grad_norm": 3.674249330665847, "language_loss": 0.70038712, "learning_rate": 3.2737193168351527e-06, "loss": 0.72195333, "num_input_tokens_seen": 107943975, "step": 5020, "time_per_iteration": 2.704000234603882 }, { "auxiliary_loss_clip": 0.01143422, "auxiliary_loss_mlp": 0.01049684, "balance_loss_clip": 1.05071819, "balance_loss_mlp": 1.03320909, "epoch": 0.301878851645874, "flos": 18114168804480.0, "grad_norm": 5.641410405732297, "language_loss": 0.78549969, "learning_rate": 3.2734190256173085e-06, "loss": 0.80743068, "num_input_tokens_seen": 107962950, "step": 5021, "time_per_iteration": 4.521278142929077 }, { "auxiliary_loss_clip": 0.01129372, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.04859924, "balance_loss_mlp": 1.01572752, "epoch": 0.301938974898542, "flos": 17602226213760.0, "grad_norm": 3.308202374048827, "language_loss": 0.75482392, "learning_rate": 3.2731186861112877e-06, "loss": 0.77643561, "num_input_tokens_seen": 107979700, "step": 5022, "time_per_iteration": 4.1478235721588135 }, { "auxiliary_loss_clip": 0.01141828, "auxiliary_loss_mlp": 0.01043797, "balance_loss_clip": 1.04905522, "balance_loss_mlp": 1.02676249, "epoch": 0.30199909815120995, "flos": 11181219079680.0, "grad_norm": 1.7715139184612991, "language_loss": 0.69534874, "learning_rate": 3.2728182983284793e-06, "loss": 0.71720505, "num_input_tokens_seen": 107996645, "step": 5023, "time_per_iteration": 2.582491636276245 }, { "auxiliary_loss_clip": 0.01112614, "auxiliary_loss_mlp": 0.01040881, "balance_loss_clip": 1.04434311, "balance_loss_mlp": 1.02471602, "epoch": 0.302059221403878, "flos": 21907843632000.0, "grad_norm": 4.128865002464027, "language_loss": 0.71400636, "learning_rate": 3.2725178622802724e-06, "loss": 0.73554134, "num_input_tokens_seen": 108015020, "step": 5024, "time_per_iteration": 2.6789708137512207 }, { "auxiliary_loss_clip": 0.01125475, "auxiliary_loss_mlp": 0.01051317, "balance_loss_clip": 1.04789031, "balance_loss_mlp": 1.03441346, "epoch": 0.30211934465654594, "flos": 26396390039040.0, "grad_norm": 2.5352325664815396, "language_loss": 0.73949707, "learning_rate": 3.272217377978061e-06, "loss": 0.76126498, "num_input_tokens_seen": 108036430, "step": 5025, "time_per_iteration": 2.7021281719207764 }, { "auxiliary_loss_clip": 0.01129438, "auxiliary_loss_mlp": 0.01049255, "balance_loss_clip": 1.05115628, "balance_loss_mlp": 1.03333473, "epoch": 0.3021794679092139, "flos": 23400470321280.0, "grad_norm": 1.5312912087399582, "language_loss": 0.67339373, "learning_rate": 3.2719168454332387e-06, "loss": 0.69518065, "num_input_tokens_seen": 108054250, "step": 5026, "time_per_iteration": 4.172817230224609 }, { "auxiliary_loss_clip": 0.01131398, "auxiliary_loss_mlp": 0.01045765, "balance_loss_clip": 1.05058789, "balance_loss_mlp": 1.02871835, "epoch": 0.30223959116188187, "flos": 20260979942400.0, "grad_norm": 1.8656003857402752, "language_loss": 0.84821522, "learning_rate": 3.2716162646572034e-06, "loss": 0.86998689, "num_input_tokens_seen": 108071495, "step": 5027, "time_per_iteration": 2.66186785697937 }, { "auxiliary_loss_clip": 0.01104085, "auxiliary_loss_mlp": 0.01045706, "balance_loss_clip": 1.04686451, "balance_loss_mlp": 1.03030431, "epoch": 0.30229971441454984, "flos": 26687840993280.0, "grad_norm": 1.633485895123786, "language_loss": 0.78574622, "learning_rate": 3.271315635661351e-06, "loss": 0.80724418, "num_input_tokens_seen": 108092135, "step": 5028, "time_per_iteration": 4.454678297042847 }, { "auxiliary_loss_clip": 0.01113383, "auxiliary_loss_mlp": 0.01048022, "balance_loss_clip": 1.04682207, "balance_loss_mlp": 1.03115392, "epoch": 0.3023598376672178, "flos": 34345323953280.0, "grad_norm": 1.9340935936746968, "language_loss": 0.77085543, "learning_rate": 3.2710149584570826e-06, "loss": 0.79246956, "num_input_tokens_seen": 108112945, "step": 5029, "time_per_iteration": 2.841707229614258 }, { "auxiliary_loss_clip": 0.01111921, "auxiliary_loss_mlp": 0.01048937, "balance_loss_clip": 1.04846191, "balance_loss_mlp": 1.02920818, "epoch": 0.30241996091988577, "flos": 23112143850240.0, "grad_norm": 2.1432001376374257, "language_loss": 0.8240397, "learning_rate": 3.2707142330557993e-06, "loss": 0.84564829, "num_input_tokens_seen": 108130325, "step": 5030, "time_per_iteration": 2.8557751178741455 }, { "auxiliary_loss_clip": 0.01090897, "auxiliary_loss_mlp": 0.00775419, "balance_loss_clip": 1.04519463, "balance_loss_mlp": 1.00112486, "epoch": 0.30248008417255373, "flos": 19390002958080.0, "grad_norm": 2.2374457582531098, "language_loss": 0.6987617, "learning_rate": 3.270413459468905e-06, "loss": 0.71742487, "num_input_tokens_seen": 108150300, "step": 5031, "time_per_iteration": 2.7827746868133545 }, { "auxiliary_loss_clip": 0.01121676, "auxiliary_loss_mlp": 0.01044463, "balance_loss_clip": 1.04549253, "balance_loss_mlp": 1.02800059, "epoch": 0.3025402074252217, "flos": 23769704177280.0, "grad_norm": 1.8685207024800563, "language_loss": 0.82324117, "learning_rate": 3.2701126377078047e-06, "loss": 0.84490258, "num_input_tokens_seen": 108170330, "step": 5032, "time_per_iteration": 2.6529927253723145 }, { "auxiliary_loss_clip": 0.01104945, "auxiliary_loss_mlp": 0.01059072, "balance_loss_clip": 1.05129266, "balance_loss_mlp": 1.03951025, "epoch": 0.30260033067788966, "flos": 25994118648960.0, "grad_norm": 2.130148669813867, "language_loss": 0.73156881, "learning_rate": 3.269811767783906e-06, "loss": 0.75320899, "num_input_tokens_seen": 108191265, "step": 5033, "time_per_iteration": 2.7259597778320312 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.01049397, "balance_loss_clip": 1.04687023, "balance_loss_mlp": 1.03221893, "epoch": 0.3026604539305576, "flos": 25374551932800.0, "grad_norm": 1.564237149834404, "language_loss": 0.74164939, "learning_rate": 3.2695108497086185e-06, "loss": 0.76338559, "num_input_tokens_seen": 108211615, "step": 5034, "time_per_iteration": 2.674745798110962 }, { "auxiliary_loss_clip": 0.01140313, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.04939198, "balance_loss_mlp": 1.02224064, "epoch": 0.3027205771832256, "flos": 25812733944960.0, "grad_norm": 1.8295549596836873, "language_loss": 0.72133434, "learning_rate": 3.269209883493352e-06, "loss": 0.74312872, "num_input_tokens_seen": 108231080, "step": 5035, "time_per_iteration": 2.6429855823516846 }, { "auxiliary_loss_clip": 0.01123118, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.04499483, "balance_loss_mlp": 1.02267289, "epoch": 0.30278070043589356, "flos": 27344539393920.0, "grad_norm": 2.468501372591198, "language_loss": 0.86918867, "learning_rate": 3.2689088691495196e-06, "loss": 0.89080417, "num_input_tokens_seen": 108251125, "step": 5036, "time_per_iteration": 2.6735007762908936 }, { "auxiliary_loss_clip": 0.01097642, "auxiliary_loss_mlp": 0.01051442, "balance_loss_clip": 1.04504728, "balance_loss_mlp": 1.0331912, "epoch": 0.3028408236885616, "flos": 24786227070720.0, "grad_norm": 2.859596651876304, "language_loss": 0.77406383, "learning_rate": 3.268607806688536e-06, "loss": 0.79555464, "num_input_tokens_seen": 108272545, "step": 5037, "time_per_iteration": 2.7311182022094727 }, { "auxiliary_loss_clip": 0.01102304, "auxiliary_loss_mlp": 0.01044604, "balance_loss_clip": 1.0462358, "balance_loss_mlp": 1.02683008, "epoch": 0.30290094694122954, "flos": 12932474670720.0, "grad_norm": 2.32450780354164, "language_loss": 0.77307165, "learning_rate": 3.268306696121816e-06, "loss": 0.79454064, "num_input_tokens_seen": 108289725, "step": 5038, "time_per_iteration": 2.677525043487549 }, { "auxiliary_loss_clip": 0.01113965, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.04819584, "balance_loss_mlp": 1.02067804, "epoch": 0.3029610701938975, "flos": 25916443488000.0, "grad_norm": 2.1234468188232976, "language_loss": 0.74140579, "learning_rate": 3.2680055374607804e-06, "loss": 0.76291645, "num_input_tokens_seen": 108310690, "step": 5039, "time_per_iteration": 2.7086853981018066 }, { "auxiliary_loss_clip": 0.01137739, "auxiliary_loss_mlp": 0.00774651, "balance_loss_clip": 1.05068994, "balance_loss_mlp": 1.00113058, "epoch": 0.3030211934465655, "flos": 21980993679360.0, "grad_norm": 2.3826017374700372, "language_loss": 0.79777801, "learning_rate": 3.267704330716847e-06, "loss": 0.81690192, "num_input_tokens_seen": 108328905, "step": 5040, "time_per_iteration": 2.665175199508667 }, { "auxiliary_loss_clip": 0.01114198, "auxiliary_loss_mlp": 0.01038229, "balance_loss_clip": 1.04937124, "balance_loss_mlp": 1.02279687, "epoch": 0.30308131669923344, "flos": 20991977625600.0, "grad_norm": 1.7800027985776907, "language_loss": 0.81872481, "learning_rate": 3.267403075901438e-06, "loss": 0.84024912, "num_input_tokens_seen": 108346680, "step": 5041, "time_per_iteration": 2.6471712589263916 }, { "auxiliary_loss_clip": 0.01018002, "auxiliary_loss_mlp": 0.01004656, "balance_loss_clip": 1.0244385, "balance_loss_mlp": 1.00277221, "epoch": 0.3031414399519014, "flos": 60548875827840.0, "grad_norm": 0.7715538683836823, "language_loss": 0.59505904, "learning_rate": 3.267101773025978e-06, "loss": 0.61528552, "num_input_tokens_seen": 108413885, "step": 5042, "time_per_iteration": 3.3167309761047363 }, { "auxiliary_loss_clip": 0.0114486, "auxiliary_loss_mlp": 0.01036647, "balance_loss_clip": 1.05319929, "balance_loss_mlp": 1.01940918, "epoch": 0.30320156320456937, "flos": 21907664064000.0, "grad_norm": 1.838538817411587, "language_loss": 0.71149278, "learning_rate": 3.266800422101892e-06, "loss": 0.73330784, "num_input_tokens_seen": 108433640, "step": 5043, "time_per_iteration": 2.6266753673553467 }, { "auxiliary_loss_clip": 0.01095086, "auxiliary_loss_mlp": 0.01036293, "balance_loss_clip": 1.04519725, "balance_loss_mlp": 1.01948404, "epoch": 0.30326168645723733, "flos": 21652770176640.0, "grad_norm": 3.620919115388089, "language_loss": 0.69573802, "learning_rate": 3.266499023140606e-06, "loss": 0.71705186, "num_input_tokens_seen": 108452640, "step": 5044, "time_per_iteration": 2.7561492919921875 }, { "auxiliary_loss_clip": 0.01127659, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.05019724, "balance_loss_mlp": 1.02335382, "epoch": 0.3033218097099053, "flos": 21871286565120.0, "grad_norm": 1.3797061223764004, "language_loss": 0.77188826, "learning_rate": 3.2661975761535513e-06, "loss": 0.79356289, "num_input_tokens_seen": 108472470, "step": 5045, "time_per_iteration": 2.6529667377471924 }, { "auxiliary_loss_clip": 0.01141388, "auxiliary_loss_mlp": 0.00775246, "balance_loss_clip": 1.05165195, "balance_loss_mlp": 1.00136316, "epoch": 0.30338193296257326, "flos": 27089717333760.0, "grad_norm": 1.772786200303907, "language_loss": 0.72473782, "learning_rate": 3.2658960811521564e-06, "loss": 0.74390417, "num_input_tokens_seen": 108493025, "step": 5046, "time_per_iteration": 2.8433380126953125 }, { "auxiliary_loss_clip": 0.01131475, "auxiliary_loss_mlp": 0.01040342, "balance_loss_clip": 1.04979491, "balance_loss_mlp": 1.02119732, "epoch": 0.30344205621524123, "flos": 19534363718400.0, "grad_norm": 1.7729778222487513, "language_loss": 0.81406343, "learning_rate": 3.2655945381478564e-06, "loss": 0.83578163, "num_input_tokens_seen": 108513480, "step": 5047, "time_per_iteration": 2.6653506755828857 }, { "auxiliary_loss_clip": 0.01078955, "auxiliary_loss_mlp": 0.01042974, "balance_loss_clip": 1.04126537, "balance_loss_mlp": 1.02565265, "epoch": 0.3035021794679092, "flos": 23910976368000.0, "grad_norm": 2.0012909108595287, "language_loss": 0.7191782, "learning_rate": 3.265292947152084e-06, "loss": 0.74039751, "num_input_tokens_seen": 108533155, "step": 5048, "time_per_iteration": 2.7198410034179688 }, { "auxiliary_loss_clip": 0.01117557, "auxiliary_loss_mlp": 0.01037944, "balance_loss_clip": 1.04860258, "balance_loss_mlp": 1.02263796, "epoch": 0.30356230272057716, "flos": 16143606725760.0, "grad_norm": 1.6260333435769418, "language_loss": 0.75220919, "learning_rate": 3.2649913081762763e-06, "loss": 0.77376425, "num_input_tokens_seen": 108551900, "step": 5049, "time_per_iteration": 2.6649906635284424 }, { "auxiliary_loss_clip": 0.01131404, "auxiliary_loss_mlp": 0.01035526, "balance_loss_clip": 1.04947305, "balance_loss_mlp": 1.01907563, "epoch": 0.3036224259732452, "flos": 28914697589760.0, "grad_norm": 1.5855456549340856, "language_loss": 0.82088244, "learning_rate": 3.2646896212318717e-06, "loss": 0.84255171, "num_input_tokens_seen": 108574005, "step": 5050, "time_per_iteration": 2.657400131225586 }, { "auxiliary_loss_clip": 0.01106158, "auxiliary_loss_mlp": 0.0103828, "balance_loss_clip": 1.05031502, "balance_loss_mlp": 1.02079201, "epoch": 0.30368254922591315, "flos": 21105599322240.0, "grad_norm": 2.7844840544166436, "language_loss": 0.74196702, "learning_rate": 3.2643878863303106e-06, "loss": 0.7634114, "num_input_tokens_seen": 108592715, "step": 5051, "time_per_iteration": 2.8018569946289062 }, { "auxiliary_loss_clip": 0.01079332, "auxiliary_loss_mlp": 0.00775567, "balance_loss_clip": 1.04338145, "balance_loss_mlp": 1.00118661, "epoch": 0.3037426724785811, "flos": 23002293081600.0, "grad_norm": 1.6849730779493737, "language_loss": 0.76015687, "learning_rate": 3.264086103483033e-06, "loss": 0.77870589, "num_input_tokens_seen": 108611770, "step": 5052, "time_per_iteration": 2.9220657348632812 }, { "auxiliary_loss_clip": 0.01143047, "auxiliary_loss_mlp": 0.01043624, "balance_loss_clip": 1.0504849, "balance_loss_mlp": 1.02656555, "epoch": 0.3038027957312491, "flos": 15632705629440.0, "grad_norm": 2.421175308310746, "language_loss": 0.82370055, "learning_rate": 3.2637842727014836e-06, "loss": 0.84556723, "num_input_tokens_seen": 108629070, "step": 5053, "time_per_iteration": 2.5955326557159424 }, { "auxiliary_loss_clip": 0.01113702, "auxiliary_loss_mlp": 0.01042002, "balance_loss_clip": 1.0471338, "balance_loss_mlp": 1.02475214, "epoch": 0.30386291898391704, "flos": 12713994195840.0, "grad_norm": 1.8307418288785484, "language_loss": 0.70979112, "learning_rate": 3.2634823939971083e-06, "loss": 0.73134822, "num_input_tokens_seen": 108646315, "step": 5054, "time_per_iteration": 2.7001569271087646 }, { "auxiliary_loss_clip": 0.01140964, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.05088401, "balance_loss_mlp": 1.0225668, "epoch": 0.303923042236585, "flos": 26359437922560.0, "grad_norm": 2.314538095600907, "language_loss": 0.69049591, "learning_rate": 3.2631804673813545e-06, "loss": 0.71230358, "num_input_tokens_seen": 108665920, "step": 5055, "time_per_iteration": 2.6685287952423096 }, { "auxiliary_loss_clip": 0.01113325, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.04871488, "balance_loss_mlp": 1.01880479, "epoch": 0.30398316548925297, "flos": 19719232041600.0, "grad_norm": 1.959915959447654, "language_loss": 0.67298615, "learning_rate": 3.2628784928656707e-06, "loss": 0.69448292, "num_input_tokens_seen": 108683485, "step": 5056, "time_per_iteration": 2.6933648586273193 }, { "auxiliary_loss_clip": 0.01110454, "auxiliary_loss_mlp": 0.01043223, "balance_loss_clip": 1.04604077, "balance_loss_mlp": 1.02673686, "epoch": 0.30404328874192094, "flos": 24239846315520.0, "grad_norm": 1.7045430221851803, "language_loss": 0.82544303, "learning_rate": 3.262576470461507e-06, "loss": 0.84697986, "num_input_tokens_seen": 108702700, "step": 5057, "time_per_iteration": 2.740187406539917 }, { "auxiliary_loss_clip": 0.01115402, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.04719019, "balance_loss_mlp": 1.0222472, "epoch": 0.3041034119945889, "flos": 24498942094080.0, "grad_norm": 1.8459128585017135, "language_loss": 0.88849652, "learning_rate": 3.2622744001803176e-06, "loss": 0.91004193, "num_input_tokens_seen": 108721860, "step": 5058, "time_per_iteration": 2.7015340328216553 }, { "auxiliary_loss_clip": 0.01102971, "auxiliary_loss_mlp": 0.01047692, "balance_loss_clip": 1.04598641, "balance_loss_mlp": 1.03040063, "epoch": 0.30416353524725687, "flos": 28288881907200.0, "grad_norm": 7.837576661900421, "language_loss": 0.71809238, "learning_rate": 3.2619722820335564e-06, "loss": 0.73959899, "num_input_tokens_seen": 108743215, "step": 5059, "time_per_iteration": 2.7542827129364014 }, { "auxiliary_loss_clip": 0.01083101, "auxiliary_loss_mlp": 0.01042605, "balance_loss_clip": 1.04435182, "balance_loss_mlp": 1.02670228, "epoch": 0.30422365849992483, "flos": 23660392112640.0, "grad_norm": 2.424944175434462, "language_loss": 0.73316336, "learning_rate": 3.26167011603268e-06, "loss": 0.7544204, "num_input_tokens_seen": 108765505, "step": 5060, "time_per_iteration": 4.655209541320801 }, { "auxiliary_loss_clip": 0.01140365, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.05072367, "balance_loss_mlp": 1.02234221, "epoch": 0.3042837817525928, "flos": 22998773548800.0, "grad_norm": 2.6284704346086, "language_loss": 0.77279079, "learning_rate": 3.2613679021891463e-06, "loss": 0.79457664, "num_input_tokens_seen": 108783370, "step": 5061, "time_per_iteration": 4.1857099533081055 }, { "auxiliary_loss_clip": 0.01105214, "auxiliary_loss_mlp": 0.01039505, "balance_loss_clip": 1.05216312, "balance_loss_mlp": 1.02225542, "epoch": 0.30434390500526076, "flos": 22082332924800.0, "grad_norm": 1.9238999634605745, "language_loss": 0.81891274, "learning_rate": 3.261065640514415e-06, "loss": 0.84035993, "num_input_tokens_seen": 108797430, "step": 5062, "time_per_iteration": 2.7250373363494873 }, { "auxiliary_loss_clip": 0.01132809, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.04662633, "balance_loss_mlp": 1.02098203, "epoch": 0.3044040282579287, "flos": 25483504861440.0, "grad_norm": 1.8479376829176948, "language_loss": 0.74707627, "learning_rate": 3.2607633310199483e-06, "loss": 0.76876783, "num_input_tokens_seen": 108816945, "step": 5063, "time_per_iteration": 2.6387155055999756 }, { "auxiliary_loss_clip": 0.01126143, "auxiliary_loss_mlp": 0.00775405, "balance_loss_clip": 1.04923415, "balance_loss_mlp": 1.00135541, "epoch": 0.30446415151059675, "flos": 21945478106880.0, "grad_norm": 1.691336757602503, "language_loss": 0.84400523, "learning_rate": 3.26046097371721e-06, "loss": 0.86302078, "num_input_tokens_seen": 108836615, "step": 5064, "time_per_iteration": 2.645256519317627 }, { "auxiliary_loss_clip": 0.01125608, "auxiliary_loss_mlp": 0.01040172, "balance_loss_clip": 1.04725182, "balance_loss_mlp": 1.02311337, "epoch": 0.3045242747632647, "flos": 16435416816000.0, "grad_norm": 2.198572989748056, "language_loss": 0.76257896, "learning_rate": 3.2601585686176655e-06, "loss": 0.78423673, "num_input_tokens_seen": 108855165, "step": 5065, "time_per_iteration": 4.119553565979004 }, { "auxiliary_loss_clip": 0.01110206, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04441273, "balance_loss_mlp": 1.0260098, "epoch": 0.3045843980159327, "flos": 31540341957120.0, "grad_norm": 1.985168773674731, "language_loss": 0.62328786, "learning_rate": 3.2598561157327814e-06, "loss": 0.64482433, "num_input_tokens_seen": 108874690, "step": 5066, "time_per_iteration": 4.380331516265869 }, { "auxiliary_loss_clip": 0.01112307, "auxiliary_loss_mlp": 0.0104907, "balance_loss_clip": 1.04790235, "balance_loss_mlp": 1.03186774, "epoch": 0.30464452126860064, "flos": 17853636481920.0, "grad_norm": 2.188592288059769, "language_loss": 0.83193344, "learning_rate": 3.2595536150740265e-06, "loss": 0.85354722, "num_input_tokens_seen": 108893140, "step": 5067, "time_per_iteration": 2.628598213195801 }, { "auxiliary_loss_clip": 0.01136833, "auxiliary_loss_mlp": 0.01045137, "balance_loss_clip": 1.04994464, "balance_loss_mlp": 1.02904344, "epoch": 0.3047046445212686, "flos": 20631398947200.0, "grad_norm": 4.883769852075586, "language_loss": 0.62878895, "learning_rate": 3.259251066652873e-06, "loss": 0.65060866, "num_input_tokens_seen": 108911880, "step": 5068, "time_per_iteration": 2.583193302154541 }, { "auxiliary_loss_clip": 0.01127244, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04866779, "balance_loss_mlp": 1.02316117, "epoch": 0.3047647677739366, "flos": 21287594557440.0, "grad_norm": 4.297243307498397, "language_loss": 0.74780715, "learning_rate": 3.258948470480793e-06, "loss": 0.7694723, "num_input_tokens_seen": 108930440, "step": 5069, "time_per_iteration": 2.643608570098877 }, { "auxiliary_loss_clip": 0.01103787, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.04608154, "balance_loss_mlp": 1.02922475, "epoch": 0.30482489102660454, "flos": 20995928121600.0, "grad_norm": 1.9753352797934713, "language_loss": 0.75726902, "learning_rate": 3.258645826569261e-06, "loss": 0.77875942, "num_input_tokens_seen": 108949125, "step": 5070, "time_per_iteration": 2.715672016143799 }, { "auxiliary_loss_clip": 0.01140483, "auxiliary_loss_mlp": 0.0077507, "balance_loss_clip": 1.04843533, "balance_loss_mlp": 1.0012939, "epoch": 0.3048850142792725, "flos": 26290812988800.0, "grad_norm": 1.7281078039111346, "language_loss": 0.81636953, "learning_rate": 3.2583431349297527e-06, "loss": 0.83552504, "num_input_tokens_seen": 108972190, "step": 5071, "time_per_iteration": 2.635542869567871 }, { "auxiliary_loss_clip": 0.01108476, "auxiliary_loss_mlp": 0.01045674, "balance_loss_clip": 1.04286063, "balance_loss_mlp": 1.02776885, "epoch": 0.30494513753194047, "flos": 22346241125760.0, "grad_norm": 2.0085610287172173, "language_loss": 0.76208484, "learning_rate": 3.2580403955737467e-06, "loss": 0.78362632, "num_input_tokens_seen": 108990325, "step": 5072, "time_per_iteration": 2.6662180423736572 }, { "auxiliary_loss_clip": 0.01099158, "auxiliary_loss_mlp": 0.01044752, "balance_loss_clip": 1.04694605, "balance_loss_mlp": 1.02821743, "epoch": 0.30500526078460843, "flos": 19537667769600.0, "grad_norm": 1.8424983506970039, "language_loss": 0.70873296, "learning_rate": 3.257737608512723e-06, "loss": 0.7301721, "num_input_tokens_seen": 109009505, "step": 5073, "time_per_iteration": 2.815281867980957 }, { "auxiliary_loss_clip": 0.01133011, "auxiliary_loss_mlp": 0.01055026, "balance_loss_clip": 1.05032837, "balance_loss_mlp": 1.03757334, "epoch": 0.3050653840372764, "flos": 14465321614080.0, "grad_norm": 2.0666195830085434, "language_loss": 0.76370406, "learning_rate": 3.257434773758163e-06, "loss": 0.78558439, "num_input_tokens_seen": 109026350, "step": 5074, "time_per_iteration": 2.748568534851074 }, { "auxiliary_loss_clip": 0.01115721, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.04921389, "balance_loss_mlp": 1.02149391, "epoch": 0.30512550728994436, "flos": 24243796811520.0, "grad_norm": 1.8649350467458667, "language_loss": 0.74393201, "learning_rate": 3.25713189132155e-06, "loss": 0.76546526, "num_input_tokens_seen": 109044165, "step": 5075, "time_per_iteration": 2.7015154361724854 }, { "auxiliary_loss_clip": 0.01141745, "auxiliary_loss_mlp": 0.01047345, "balance_loss_clip": 1.0498178, "balance_loss_mlp": 1.02825916, "epoch": 0.30518563054261233, "flos": 16360542915840.0, "grad_norm": 2.030111139920667, "language_loss": 0.75904357, "learning_rate": 3.2568289612143703e-06, "loss": 0.78093445, "num_input_tokens_seen": 109060665, "step": 5076, "time_per_iteration": 2.5811965465545654 }, { "auxiliary_loss_clip": 0.01116901, "auxiliary_loss_mlp": 0.01040641, "balance_loss_clip": 1.04864156, "balance_loss_mlp": 1.02466679, "epoch": 0.30524575379528035, "flos": 21579584215680.0, "grad_norm": 1.6479970241835653, "language_loss": 0.79240596, "learning_rate": 3.25652598344811e-06, "loss": 0.81398141, "num_input_tokens_seen": 109080035, "step": 5077, "time_per_iteration": 2.680205821990967 }, { "auxiliary_loss_clip": 0.01087088, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.04356635, "balance_loss_mlp": 1.01881564, "epoch": 0.3053058770479483, "flos": 16545231671040.0, "grad_norm": 1.6765288024346336, "language_loss": 0.74525034, "learning_rate": 3.256222958034259e-06, "loss": 0.76645821, "num_input_tokens_seen": 109097385, "step": 5078, "time_per_iteration": 2.7247111797332764 }, { "auxiliary_loss_clip": 0.01085086, "auxiliary_loss_mlp": 0.01054049, "balance_loss_clip": 1.04356313, "balance_loss_mlp": 1.03728211, "epoch": 0.3053660003006163, "flos": 12312907954560.0, "grad_norm": 1.7442741256404064, "language_loss": 0.66648543, "learning_rate": 3.255919884984307e-06, "loss": 0.68787676, "num_input_tokens_seen": 109115495, "step": 5079, "time_per_iteration": 2.746490716934204 }, { "auxiliary_loss_clip": 0.01127155, "auxiliary_loss_mlp": 0.01040504, "balance_loss_clip": 1.04811811, "balance_loss_mlp": 1.0248282, "epoch": 0.30542612355328425, "flos": 23112287504640.0, "grad_norm": 2.3583709354228213, "language_loss": 0.79841697, "learning_rate": 3.2556167643097477e-06, "loss": 0.82009357, "num_input_tokens_seen": 109134235, "step": 5080, "time_per_iteration": 2.7156612873077393 }, { "auxiliary_loss_clip": 0.01124116, "auxiliary_loss_mlp": 0.00772863, "balance_loss_clip": 1.04919219, "balance_loss_mlp": 1.00125837, "epoch": 0.3054862468059522, "flos": 24389450461440.0, "grad_norm": 2.2636550763480074, "language_loss": 0.81280053, "learning_rate": 3.255313596022074e-06, "loss": 0.8317703, "num_input_tokens_seen": 109152760, "step": 5081, "time_per_iteration": 2.6763248443603516 }, { "auxiliary_loss_clip": 0.01120003, "auxiliary_loss_mlp": 0.01044443, "balance_loss_clip": 1.04644883, "balance_loss_mlp": 1.02843297, "epoch": 0.3055463700586202, "flos": 29386096704000.0, "grad_norm": 7.924214405919456, "language_loss": 0.71839154, "learning_rate": 3.255010380132783e-06, "loss": 0.74003601, "num_input_tokens_seen": 109173925, "step": 5082, "time_per_iteration": 2.7159903049468994 }, { "auxiliary_loss_clip": 0.0112721, "auxiliary_loss_mlp": 0.01043614, "balance_loss_clip": 1.04611564, "balance_loss_mlp": 1.02554226, "epoch": 0.30560649331128814, "flos": 25591775431680.0, "grad_norm": 2.25447896755926, "language_loss": 0.73108822, "learning_rate": 3.2547071166533736e-06, "loss": 0.75279647, "num_input_tokens_seen": 109192510, "step": 5083, "time_per_iteration": 2.646739959716797 }, { "auxiliary_loss_clip": 0.01107487, "auxiliary_loss_mlp": 0.00775151, "balance_loss_clip": 1.04263341, "balance_loss_mlp": 1.00127327, "epoch": 0.3056666165639561, "flos": 19128321400320.0, "grad_norm": 1.7470718607902291, "language_loss": 0.71378291, "learning_rate": 3.254403805595344e-06, "loss": 0.73260927, "num_input_tokens_seen": 109210885, "step": 5084, "time_per_iteration": 2.6846230030059814 }, { "auxiliary_loss_clip": 0.01099017, "auxiliary_loss_mlp": 0.01047221, "balance_loss_clip": 1.04366112, "balance_loss_mlp": 1.02929187, "epoch": 0.30572673981662407, "flos": 15523860441600.0, "grad_norm": 1.8852357422602322, "language_loss": 0.78966236, "learning_rate": 3.2541004469701962e-06, "loss": 0.81112474, "num_input_tokens_seen": 109229180, "step": 5085, "time_per_iteration": 2.7193636894226074 }, { "auxiliary_loss_clip": 0.01130512, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.04483652, "balance_loss_mlp": 1.01910806, "epoch": 0.30578686306929204, "flos": 21506541909120.0, "grad_norm": 1.9742516674355037, "language_loss": 0.78476739, "learning_rate": 3.2537970407894342e-06, "loss": 0.80641937, "num_input_tokens_seen": 109249510, "step": 5086, "time_per_iteration": 2.5860135555267334 }, { "auxiliary_loss_clip": 0.01103374, "auxiliary_loss_mlp": 0.01052848, "balance_loss_clip": 1.04314184, "balance_loss_mlp": 1.03509736, "epoch": 0.30584698632196, "flos": 20954271323520.0, "grad_norm": 1.8682002339545791, "language_loss": 0.76727784, "learning_rate": 3.253493587064563e-06, "loss": 0.78884006, "num_input_tokens_seen": 109268200, "step": 5087, "time_per_iteration": 2.732639789581299 }, { "auxiliary_loss_clip": 0.01125241, "auxiliary_loss_mlp": 0.01041401, "balance_loss_clip": 1.04509556, "balance_loss_mlp": 1.02450943, "epoch": 0.30590710957462797, "flos": 24681116897280.0, "grad_norm": 2.048016576932303, "language_loss": 0.72534674, "learning_rate": 3.2531900858070885e-06, "loss": 0.74701315, "num_input_tokens_seen": 109288370, "step": 5088, "time_per_iteration": 2.66654109954834 }, { "auxiliary_loss_clip": 0.01128516, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.04584277, "balance_loss_mlp": 1.02587295, "epoch": 0.30596723282729593, "flos": 17086907744640.0, "grad_norm": 2.359735204382993, "language_loss": 0.79327172, "learning_rate": 3.252886537028521e-06, "loss": 0.8149913, "num_input_tokens_seen": 109306730, "step": 5089, "time_per_iteration": 2.613231897354126 }, { "auxiliary_loss_clip": 0.01110444, "auxiliary_loss_mlp": 0.01041514, "balance_loss_clip": 1.04634953, "balance_loss_mlp": 1.02470577, "epoch": 0.30602735607996395, "flos": 22857106308480.0, "grad_norm": 1.8271327477144206, "language_loss": 0.77158219, "learning_rate": 3.2525829407403703e-06, "loss": 0.79310179, "num_input_tokens_seen": 109327360, "step": 5090, "time_per_iteration": 2.7469358444213867 }, { "auxiliary_loss_clip": 0.01116264, "auxiliary_loss_mlp": 0.01050158, "balance_loss_clip": 1.04506445, "balance_loss_mlp": 1.03317034, "epoch": 0.3060874793326319, "flos": 29861482227840.0, "grad_norm": 1.7853121536190235, "language_loss": 0.76108491, "learning_rate": 3.2522792969541488e-06, "loss": 0.78274912, "num_input_tokens_seen": 109348135, "step": 5091, "time_per_iteration": 2.7344727516174316 }, { "auxiliary_loss_clip": 0.01076722, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.04582906, "balance_loss_mlp": 1.02905178, "epoch": 0.3061476025852999, "flos": 20448577699200.0, "grad_norm": 1.9985396703734173, "language_loss": 0.71938324, "learning_rate": 3.2519756056813705e-06, "loss": 0.74064058, "num_input_tokens_seen": 109366220, "step": 5092, "time_per_iteration": 2.767212390899658 }, { "auxiliary_loss_clip": 0.01114871, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.04740167, "balance_loss_mlp": 1.0246855, "epoch": 0.30620772583796785, "flos": 19391475415680.0, "grad_norm": 3.231748461445431, "language_loss": 0.82655406, "learning_rate": 3.2516718669335522e-06, "loss": 0.84810787, "num_input_tokens_seen": 109385260, "step": 5093, "time_per_iteration": 2.705643892288208 }, { "auxiliary_loss_clip": 0.01136927, "auxiliary_loss_mlp": 0.00773786, "balance_loss_clip": 1.04842925, "balance_loss_mlp": 1.00142932, "epoch": 0.3062678490906358, "flos": 24024562151040.0, "grad_norm": 1.6185046249293755, "language_loss": 0.75340986, "learning_rate": 3.2513680807222114e-06, "loss": 0.77251703, "num_input_tokens_seen": 109405025, "step": 5094, "time_per_iteration": 2.6171963214874268 }, { "auxiliary_loss_clip": 0.01112613, "auxiliary_loss_mlp": 0.01042135, "balance_loss_clip": 1.04798305, "balance_loss_mlp": 1.02639914, "epoch": 0.3063279723433038, "flos": 19754639873280.0, "grad_norm": 2.1053112950674824, "language_loss": 0.75988996, "learning_rate": 3.251064247058868e-06, "loss": 0.7814374, "num_input_tokens_seen": 109422465, "step": 5095, "time_per_iteration": 2.7002673149108887 }, { "auxiliary_loss_clip": 0.0112272, "auxiliary_loss_mlp": 0.01043966, "balance_loss_clip": 1.04654729, "balance_loss_mlp": 1.0278492, "epoch": 0.30638809559597174, "flos": 22450022496000.0, "grad_norm": 8.237851994820396, "language_loss": 0.80608332, "learning_rate": 3.250760365955042e-06, "loss": 0.82775021, "num_input_tokens_seen": 109440575, "step": 5096, "time_per_iteration": 2.675551414489746 }, { "auxiliary_loss_clip": 0.01125431, "auxiliary_loss_mlp": 0.01036388, "balance_loss_clip": 1.04639602, "balance_loss_mlp": 1.02030659, "epoch": 0.3064482188486397, "flos": 17165157523200.0, "grad_norm": 3.1166257890970566, "language_loss": 0.81695235, "learning_rate": 3.250456437422258e-06, "loss": 0.83857059, "num_input_tokens_seen": 109459050, "step": 5097, "time_per_iteration": 2.6616358757019043 }, { "auxiliary_loss_clip": 0.01138165, "auxiliary_loss_mlp": 0.01042971, "balance_loss_clip": 1.04782009, "balance_loss_mlp": 1.02522099, "epoch": 0.3065083421013077, "flos": 23768483114880.0, "grad_norm": 2.1722798378639663, "language_loss": 0.78152639, "learning_rate": 3.250152461472041e-06, "loss": 0.80333775, "num_input_tokens_seen": 109475860, "step": 5098, "time_per_iteration": 2.581339120864868 }, { "auxiliary_loss_clip": 0.01093696, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.04763365, "balance_loss_mlp": 1.02302897, "epoch": 0.30656846535397564, "flos": 26431833784320.0, "grad_norm": 1.8342329708039284, "language_loss": 0.84488571, "learning_rate": 3.249848438115917e-06, "loss": 0.86622083, "num_input_tokens_seen": 109494760, "step": 5099, "time_per_iteration": 2.761580467224121 }, { "auxiliary_loss_clip": 0.0113763, "auxiliary_loss_mlp": 0.01044142, "balance_loss_clip": 1.04598331, "balance_loss_mlp": 1.02683902, "epoch": 0.3066285886066436, "flos": 26651786716800.0, "grad_norm": 1.7645297710058767, "language_loss": 0.85650218, "learning_rate": 3.2495443673654148e-06, "loss": 0.87831986, "num_input_tokens_seen": 109516480, "step": 5100, "time_per_iteration": 4.130753517150879 }, { "auxiliary_loss_clip": 0.01099546, "auxiliary_loss_mlp": 0.01040494, "balance_loss_clip": 1.04097986, "balance_loss_mlp": 1.02268374, "epoch": 0.30668871185931157, "flos": 15049947375360.0, "grad_norm": 1.8121599631247622, "language_loss": 0.78980827, "learning_rate": 3.249240249232065e-06, "loss": 0.81120867, "num_input_tokens_seen": 109534615, "step": 5101, "time_per_iteration": 4.324965000152588 }, { "auxiliary_loss_clip": 0.01102347, "auxiliary_loss_mlp": 0.01054476, "balance_loss_clip": 1.04654586, "balance_loss_mlp": 1.03549778, "epoch": 0.30674883511197953, "flos": 20082109190400.0, "grad_norm": 3.103169454759946, "language_loss": 0.8002606, "learning_rate": 3.2489360837273998e-06, "loss": 0.82182884, "num_input_tokens_seen": 109554040, "step": 5102, "time_per_iteration": 2.6799395084381104 }, { "auxiliary_loss_clip": 0.01142197, "auxiliary_loss_mlp": 0.01041215, "balance_loss_clip": 1.05097044, "balance_loss_mlp": 1.02254653, "epoch": 0.30680895836464755, "flos": 22893807029760.0, "grad_norm": 2.1213785434731416, "language_loss": 0.88774347, "learning_rate": 3.2486318708629532e-06, "loss": 0.90957761, "num_input_tokens_seen": 109574345, "step": 5103, "time_per_iteration": 2.65173077583313 }, { "auxiliary_loss_clip": 0.01117159, "auxiliary_loss_mlp": 0.01047865, "balance_loss_clip": 1.04379106, "balance_loss_mlp": 1.03051972, "epoch": 0.3068690816173155, "flos": 23696159080320.0, "grad_norm": 1.7904968866721789, "language_loss": 0.73977435, "learning_rate": 3.2483276106502607e-06, "loss": 0.7614246, "num_input_tokens_seen": 109593670, "step": 5104, "time_per_iteration": 4.15887975692749 }, { "auxiliary_loss_clip": 0.01124364, "auxiliary_loss_mlp": 0.00776702, "balance_loss_clip": 1.04378068, "balance_loss_mlp": 1.00128829, "epoch": 0.3069292048699835, "flos": 23551044134400.0, "grad_norm": 3.7241561762804496, "language_loss": 0.72777617, "learning_rate": 3.2480233031008605e-06, "loss": 0.74678683, "num_input_tokens_seen": 109613385, "step": 5105, "time_per_iteration": 2.657212972640991 }, { "auxiliary_loss_clip": 0.01112354, "auxiliary_loss_mlp": 0.01041782, "balance_loss_clip": 1.0451684, "balance_loss_mlp": 1.02401972, "epoch": 0.30698932812265145, "flos": 24531656405760.0, "grad_norm": 1.9297281358185925, "language_loss": 0.87290782, "learning_rate": 3.2477189482262916e-06, "loss": 0.89444917, "num_input_tokens_seen": 109632395, "step": 5106, "time_per_iteration": 4.409428119659424 }, { "auxiliary_loss_clip": 0.0110831, "auxiliary_loss_mlp": 0.01052851, "balance_loss_clip": 1.04540682, "balance_loss_mlp": 1.03390849, "epoch": 0.3070494513753194, "flos": 20996430912000.0, "grad_norm": 2.254355123120303, "language_loss": 0.71420276, "learning_rate": 3.2474145460380945e-06, "loss": 0.73581433, "num_input_tokens_seen": 109651380, "step": 5107, "time_per_iteration": 2.7320871353149414 }, { "auxiliary_loss_clip": 0.01101295, "auxiliary_loss_mlp": 0.0104767, "balance_loss_clip": 1.04618347, "balance_loss_mlp": 1.03034878, "epoch": 0.3071095746279874, "flos": 19025940660480.0, "grad_norm": 2.1230574515432705, "language_loss": 0.72282934, "learning_rate": 3.247110096547814e-06, "loss": 0.74431896, "num_input_tokens_seen": 109670240, "step": 5108, "time_per_iteration": 2.720196485519409 }, { "auxiliary_loss_clip": 0.01112658, "auxiliary_loss_mlp": 0.01040837, "balance_loss_clip": 1.04619241, "balance_loss_mlp": 1.02325416, "epoch": 0.30716969788065535, "flos": 21215521918080.0, "grad_norm": 3.0053852764205695, "language_loss": 0.8601433, "learning_rate": 3.2468055997669926e-06, "loss": 0.88167822, "num_input_tokens_seen": 109690810, "step": 5109, "time_per_iteration": 2.715580940246582 }, { "auxiliary_loss_clip": 0.01109383, "auxiliary_loss_mlp": 0.01036759, "balance_loss_clip": 1.04432368, "balance_loss_mlp": 1.02017736, "epoch": 0.3072298211333233, "flos": 25772765086080.0, "grad_norm": 1.7463183423202828, "language_loss": 0.67169911, "learning_rate": 3.2465010557071788e-06, "loss": 0.69316053, "num_input_tokens_seen": 109711145, "step": 5110, "time_per_iteration": 2.7133336067199707 }, { "auxiliary_loss_clip": 0.01126653, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.04854119, "balance_loss_mlp": 1.01736796, "epoch": 0.3072899443859913, "flos": 25848931875840.0, "grad_norm": 1.4548971516988844, "language_loss": 0.76673061, "learning_rate": 3.246196464379919e-06, "loss": 0.78833127, "num_input_tokens_seen": 109731425, "step": 5111, "time_per_iteration": 2.692505121231079 }, { "auxiliary_loss_clip": 0.01140411, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.04979658, "balance_loss_mlp": 1.02360249, "epoch": 0.30735006763865924, "flos": 25922800195200.0, "grad_norm": 3.7694679470365244, "language_loss": 0.67143333, "learning_rate": 3.245891825796765e-06, "loss": 0.69323719, "num_input_tokens_seen": 109752720, "step": 5112, "time_per_iteration": 2.6441125869750977 }, { "auxiliary_loss_clip": 0.01133822, "auxiliary_loss_mlp": 0.01044497, "balance_loss_clip": 1.05147326, "balance_loss_mlp": 1.02482784, "epoch": 0.3074101908913272, "flos": 30917004312960.0, "grad_norm": 2.062737517485213, "language_loss": 0.79524493, "learning_rate": 3.2455871399692678e-06, "loss": 0.81702805, "num_input_tokens_seen": 109772840, "step": 5113, "time_per_iteration": 2.7166647911071777 }, { "auxiliary_loss_clip": 0.01102438, "auxiliary_loss_mlp": 0.00774651, "balance_loss_clip": 1.04638815, "balance_loss_mlp": 1.00138378, "epoch": 0.30747031414399517, "flos": 18401058731520.0, "grad_norm": 2.08885217843665, "language_loss": 0.76926446, "learning_rate": 3.2452824069089815e-06, "loss": 0.78803539, "num_input_tokens_seen": 109790150, "step": 5114, "time_per_iteration": 2.6842217445373535 }, { "auxiliary_loss_clip": 0.01100955, "auxiliary_loss_mlp": 0.01034415, "balance_loss_clip": 1.0446732, "balance_loss_mlp": 1.01589036, "epoch": 0.30753043739666314, "flos": 22633166966400.0, "grad_norm": 2.179333764681939, "language_loss": 0.62607706, "learning_rate": 3.2449776266274623e-06, "loss": 0.64743078, "num_input_tokens_seen": 109807985, "step": 5115, "time_per_iteration": 2.7709848880767822 }, { "auxiliary_loss_clip": 0.0113067, "auxiliary_loss_mlp": 0.01041883, "balance_loss_clip": 1.04829907, "balance_loss_mlp": 1.02557516, "epoch": 0.3075905606493311, "flos": 27344072517120.0, "grad_norm": 2.4707888757665684, "language_loss": 0.82835108, "learning_rate": 3.2446727991362657e-06, "loss": 0.85007656, "num_input_tokens_seen": 109825920, "step": 5116, "time_per_iteration": 2.6891255378723145 }, { "auxiliary_loss_clip": 0.01115169, "auxiliary_loss_mlp": 0.01050095, "balance_loss_clip": 1.04928303, "balance_loss_mlp": 1.03291702, "epoch": 0.3076506839019991, "flos": 22090808534400.0, "grad_norm": 1.792550086960714, "language_loss": 0.75943851, "learning_rate": 3.244367924446952e-06, "loss": 0.78109109, "num_input_tokens_seen": 109846220, "step": 5117, "time_per_iteration": 2.6685919761657715 }, { "auxiliary_loss_clip": 0.01096356, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04583359, "balance_loss_mlp": 1.02309084, "epoch": 0.3077108071546671, "flos": 21289533891840.0, "grad_norm": 2.509228810910763, "language_loss": 0.71450555, "learning_rate": 3.2440630025710826e-06, "loss": 0.7358911, "num_input_tokens_seen": 109863870, "step": 5118, "time_per_iteration": 2.7360472679138184 }, { "auxiliary_loss_clip": 0.0109679, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.05069757, "balance_loss_mlp": 1.02279758, "epoch": 0.30777093040733505, "flos": 21430985650560.0, "grad_norm": 1.6950758291291428, "language_loss": 0.74499059, "learning_rate": 3.243758033520219e-06, "loss": 0.76635897, "num_input_tokens_seen": 109883500, "step": 5119, "time_per_iteration": 2.7963552474975586 }, { "auxiliary_loss_clip": 0.01133391, "auxiliary_loss_mlp": 0.01054336, "balance_loss_clip": 1.05088997, "balance_loss_mlp": 1.03520322, "epoch": 0.307831053660003, "flos": 23149275534720.0, "grad_norm": 2.3083726349779785, "language_loss": 0.79968077, "learning_rate": 3.243453017305926e-06, "loss": 0.821558, "num_input_tokens_seen": 109904620, "step": 5120, "time_per_iteration": 2.7600536346435547 }, { "auxiliary_loss_clip": 0.01127117, "auxiliary_loss_mlp": 0.01045491, "balance_loss_clip": 1.04772663, "balance_loss_mlp": 1.02994657, "epoch": 0.307891176912671, "flos": 17019755268480.0, "grad_norm": 1.7119475154385397, "language_loss": 0.79864663, "learning_rate": 3.24314795393977e-06, "loss": 0.8203727, "num_input_tokens_seen": 109922275, "step": 5121, "time_per_iteration": 2.6204211711883545 }, { "auxiliary_loss_clip": 0.01105091, "auxiliary_loss_mlp": 0.01039616, "balance_loss_clip": 1.04669154, "balance_loss_mlp": 1.02292657, "epoch": 0.30795130016533895, "flos": 27705046245120.0, "grad_norm": 1.4682711249191758, "language_loss": 0.82526803, "learning_rate": 3.242842843433319e-06, "loss": 0.84671509, "num_input_tokens_seen": 109944265, "step": 5122, "time_per_iteration": 2.7210805416107178 }, { "auxiliary_loss_clip": 0.01052784, "auxiliary_loss_mlp": 0.01010188, "balance_loss_clip": 1.03048515, "balance_loss_mlp": 1.00826919, "epoch": 0.3080114234180069, "flos": 69058699591680.0, "grad_norm": 0.7449761063336078, "language_loss": 0.58609217, "learning_rate": 3.242537685798143e-06, "loss": 0.60672188, "num_input_tokens_seen": 110014160, "step": 5123, "time_per_iteration": 3.303093433380127 }, { "auxiliary_loss_clip": 0.01133855, "auxiliary_loss_mlp": 0.00776294, "balance_loss_clip": 1.04937184, "balance_loss_mlp": 1.00136161, "epoch": 0.3080715466706749, "flos": 24060221377920.0, "grad_norm": 1.5927838238117058, "language_loss": 0.83550704, "learning_rate": 3.242232481045813e-06, "loss": 0.85460854, "num_input_tokens_seen": 110034865, "step": 5124, "time_per_iteration": 2.7226438522338867 }, { "auxiliary_loss_clip": 0.01143185, "auxiliary_loss_mlp": 0.01038734, "balance_loss_clip": 1.05123234, "balance_loss_mlp": 1.02206898, "epoch": 0.30813166992334284, "flos": 25848680480640.0, "grad_norm": 2.0767599752543657, "language_loss": 0.79332423, "learning_rate": 3.2419272291879035e-06, "loss": 0.81514347, "num_input_tokens_seen": 110052930, "step": 5125, "time_per_iteration": 2.6514153480529785 }, { "auxiliary_loss_clip": 0.01125892, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.04636812, "balance_loss_mlp": 1.01694369, "epoch": 0.3081917931760108, "flos": 20449619193600.0, "grad_norm": 1.764828299724452, "language_loss": 0.64689863, "learning_rate": 3.241621930235989e-06, "loss": 0.66851032, "num_input_tokens_seen": 110071765, "step": 5126, "time_per_iteration": 2.6408963203430176 }, { "auxiliary_loss_clip": 0.01099238, "auxiliary_loss_mlp": 0.01044536, "balance_loss_clip": 1.05009556, "balance_loss_mlp": 1.02698874, "epoch": 0.3082519164286788, "flos": 22166257052160.0, "grad_norm": 1.5302214532460006, "language_loss": 0.86800975, "learning_rate": 3.241316584201646e-06, "loss": 0.88944745, "num_input_tokens_seen": 110092660, "step": 5127, "time_per_iteration": 2.793318748474121 }, { "auxiliary_loss_clip": 0.01086461, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.04368591, "balance_loss_mlp": 1.02862501, "epoch": 0.30831203968134674, "flos": 28913404700160.0, "grad_norm": 1.6968110238499217, "language_loss": 0.69155616, "learning_rate": 3.2410111910964538e-06, "loss": 0.71287817, "num_input_tokens_seen": 110114960, "step": 5128, "time_per_iteration": 2.777060031890869 }, { "auxiliary_loss_clip": 0.01130807, "auxiliary_loss_mlp": 0.00775186, "balance_loss_clip": 1.05044532, "balance_loss_mlp": 1.00153518, "epoch": 0.3083721629340147, "flos": 25667726739840.0, "grad_norm": 1.7900045405252538, "language_loss": 0.71075535, "learning_rate": 3.240705750931993e-06, "loss": 0.7298153, "num_input_tokens_seen": 110135750, "step": 5129, "time_per_iteration": 2.7317588329315186 }, { "auxiliary_loss_clip": 0.01030892, "auxiliary_loss_mlp": 0.01007708, "balance_loss_clip": 1.0286324, "balance_loss_mlp": 1.00588405, "epoch": 0.3084322861866827, "flos": 68212679581440.0, "grad_norm": 0.8221299931057983, "language_loss": 0.59160221, "learning_rate": 3.240400263719846e-06, "loss": 0.61198819, "num_input_tokens_seen": 110189480, "step": 5130, "time_per_iteration": 3.2141849994659424 }, { "auxiliary_loss_clip": 0.01115906, "auxiliary_loss_mlp": 0.01041214, "balance_loss_clip": 1.04513061, "balance_loss_mlp": 1.02297497, "epoch": 0.3084924094393507, "flos": 20296495514880.0, "grad_norm": 2.986922621878904, "language_loss": 0.73292506, "learning_rate": 3.2400947294715957e-06, "loss": 0.75449622, "num_input_tokens_seen": 110206445, "step": 5131, "time_per_iteration": 2.6520204544067383 }, { "auxiliary_loss_clip": 0.01099541, "auxiliary_loss_mlp": 0.010345, "balance_loss_clip": 1.04438055, "balance_loss_mlp": 1.01822817, "epoch": 0.30855253269201866, "flos": 23949831905280.0, "grad_norm": 1.569237882810685, "language_loss": 0.71420097, "learning_rate": 3.2397891481988303e-06, "loss": 0.73554134, "num_input_tokens_seen": 110226845, "step": 5132, "time_per_iteration": 2.8439948558807373 }, { "auxiliary_loss_clip": 0.01134935, "auxiliary_loss_mlp": 0.00774998, "balance_loss_clip": 1.04922795, "balance_loss_mlp": 1.00131333, "epoch": 0.3086126559446866, "flos": 19281876042240.0, "grad_norm": 1.9070570981004293, "language_loss": 0.89846021, "learning_rate": 3.239483519913136e-06, "loss": 0.91755956, "num_input_tokens_seen": 110244095, "step": 5133, "time_per_iteration": 2.5872273445129395 }, { "auxiliary_loss_clip": 0.01122429, "auxiliary_loss_mlp": 0.01043613, "balance_loss_clip": 1.04856205, "balance_loss_mlp": 1.02580321, "epoch": 0.3086727791973546, "flos": 33760770019200.0, "grad_norm": 1.7209646054950307, "language_loss": 0.67267555, "learning_rate": 3.239177844626102e-06, "loss": 0.69433594, "num_input_tokens_seen": 110264240, "step": 5134, "time_per_iteration": 2.7872183322906494 }, { "auxiliary_loss_clip": 0.01124541, "auxiliary_loss_mlp": 0.01041364, "balance_loss_clip": 1.04777277, "balance_loss_mlp": 1.02393556, "epoch": 0.30873290245002255, "flos": 16034151006720.0, "grad_norm": 1.9145067593542924, "language_loss": 0.82794344, "learning_rate": 3.2388721223493197e-06, "loss": 0.84960246, "num_input_tokens_seen": 110282450, "step": 5135, "time_per_iteration": 2.6355140209198 }, { "auxiliary_loss_clip": 0.01026512, "auxiliary_loss_mlp": 0.01003035, "balance_loss_clip": 1.02417064, "balance_loss_mlp": 1.00113988, "epoch": 0.3087930257026905, "flos": 65048304055680.0, "grad_norm": 0.6923211570832432, "language_loss": 0.55314827, "learning_rate": 3.2385663530943824e-06, "loss": 0.57344365, "num_input_tokens_seen": 110343715, "step": 5136, "time_per_iteration": 3.31300687789917 }, { "auxiliary_loss_clip": 0.01118007, "auxiliary_loss_mlp": 0.00775624, "balance_loss_clip": 1.04826593, "balance_loss_mlp": 1.00124264, "epoch": 0.3088531489553585, "flos": 74738829824640.0, "grad_norm": 2.038560176689262, "language_loss": 0.76524079, "learning_rate": 3.2382605368728852e-06, "loss": 0.78417706, "num_input_tokens_seen": 110368430, "step": 5137, "time_per_iteration": 3.1237831115722656 }, { "auxiliary_loss_clip": 0.01102933, "auxiliary_loss_mlp": 0.010362, "balance_loss_clip": 1.04592168, "balance_loss_mlp": 1.02058411, "epoch": 0.30891327220802645, "flos": 21142300043520.0, "grad_norm": 1.655645044155811, "language_loss": 0.80083114, "learning_rate": 3.237954673696424e-06, "loss": 0.82222247, "num_input_tokens_seen": 110386735, "step": 5138, "time_per_iteration": 2.775902509689331 }, { "auxiliary_loss_clip": 0.01078807, "auxiliary_loss_mlp": 0.0104514, "balance_loss_clip": 1.03953338, "balance_loss_mlp": 1.02583957, "epoch": 0.3089733954606944, "flos": 25664494515840.0, "grad_norm": 1.3823165076112356, "language_loss": 0.81288958, "learning_rate": 3.2376487635765983e-06, "loss": 0.8341291, "num_input_tokens_seen": 110406820, "step": 5139, "time_per_iteration": 4.48141074180603 }, { "auxiliary_loss_clip": 0.01127056, "auxiliary_loss_mlp": 0.01044845, "balance_loss_clip": 1.04565382, "balance_loss_mlp": 1.02575994, "epoch": 0.3090335187133624, "flos": 19427350124160.0, "grad_norm": 2.1511159973406593, "language_loss": 0.77260494, "learning_rate": 3.2373428065250067e-06, "loss": 0.79432398, "num_input_tokens_seen": 110424225, "step": 5140, "time_per_iteration": 4.1141037940979 }, { "auxiliary_loss_clip": 0.01099157, "auxiliary_loss_mlp": 0.01048812, "balance_loss_clip": 1.04282403, "balance_loss_mlp": 1.03233695, "epoch": 0.30909364196603034, "flos": 20011329440640.0, "grad_norm": 1.77105935640331, "language_loss": 0.78806967, "learning_rate": 3.237036802553252e-06, "loss": 0.80954939, "num_input_tokens_seen": 110443310, "step": 5141, "time_per_iteration": 2.6497676372528076 }, { "auxiliary_loss_clip": 0.01119702, "auxiliary_loss_mlp": 0.0104967, "balance_loss_clip": 1.04679799, "balance_loss_mlp": 1.03138292, "epoch": 0.3091537652186983, "flos": 19677575243520.0, "grad_norm": 2.261971688212118, "language_loss": 0.86853915, "learning_rate": 3.2367307516729377e-06, "loss": 0.89023286, "num_input_tokens_seen": 110460215, "step": 5142, "time_per_iteration": 2.635495662689209 }, { "auxiliary_loss_clip": 0.01127738, "auxiliary_loss_mlp": 0.01048033, "balance_loss_clip": 1.04709148, "balance_loss_mlp": 1.03136778, "epoch": 0.3092138884713663, "flos": 17020042577280.0, "grad_norm": 1.7222677689082588, "language_loss": 0.79352587, "learning_rate": 3.23642465389567e-06, "loss": 0.81528366, "num_input_tokens_seen": 110479385, "step": 5143, "time_per_iteration": 2.672196388244629 }, { "auxiliary_loss_clip": 0.01108121, "auxiliary_loss_mlp": 0.01046466, "balance_loss_clip": 1.04830873, "balance_loss_mlp": 1.02858496, "epoch": 0.3092740117240343, "flos": 25009986844800.0, "grad_norm": 1.849759687088619, "language_loss": 0.72079581, "learning_rate": 3.236118509233055e-06, "loss": 0.7423417, "num_input_tokens_seen": 110499885, "step": 5144, "time_per_iteration": 4.2138121128082275 }, { "auxiliary_loss_clip": 0.01130266, "auxiliary_loss_mlp": 0.0105055, "balance_loss_clip": 1.04617548, "balance_loss_mlp": 1.03297877, "epoch": 0.30933413497670226, "flos": 25590410714880.0, "grad_norm": 1.9804845877808144, "language_loss": 0.74328083, "learning_rate": 3.235812317696702e-06, "loss": 0.76508898, "num_input_tokens_seen": 110519690, "step": 5145, "time_per_iteration": 4.315273761749268 }, { "auxiliary_loss_clip": 0.01110927, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.04372048, "balance_loss_mlp": 1.02788365, "epoch": 0.3093942582293702, "flos": 24389665943040.0, "grad_norm": 1.6657569174801012, "language_loss": 0.76391518, "learning_rate": 3.2355060792982224e-06, "loss": 0.78547978, "num_input_tokens_seen": 110540520, "step": 5146, "time_per_iteration": 2.7259135246276855 }, { "auxiliary_loss_clip": 0.0111122, "auxiliary_loss_mlp": 0.01042459, "balance_loss_clip": 1.04380584, "balance_loss_mlp": 1.02553141, "epoch": 0.3094543814820382, "flos": 19646441130240.0, "grad_norm": 2.148705061921787, "language_loss": 0.66899967, "learning_rate": 3.2351997940492286e-06, "loss": 0.6905365, "num_input_tokens_seen": 110557950, "step": 5147, "time_per_iteration": 2.6804444789886475 }, { "auxiliary_loss_clip": 0.01132642, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.04998684, "balance_loss_mlp": 1.0238843, "epoch": 0.30951450473470615, "flos": 25663812157440.0, "grad_norm": 2.0634223914225585, "language_loss": 0.74823105, "learning_rate": 3.2348934619613346e-06, "loss": 0.76996237, "num_input_tokens_seen": 110578215, "step": 5148, "time_per_iteration": 2.637509346008301 }, { "auxiliary_loss_clip": 0.0113505, "auxiliary_loss_mlp": 0.01047495, "balance_loss_clip": 1.0492146, "balance_loss_mlp": 1.02901721, "epoch": 0.3095746279873741, "flos": 12020415505920.0, "grad_norm": 2.1367843023537287, "language_loss": 0.73082036, "learning_rate": 3.2345870830461567e-06, "loss": 0.75264585, "num_input_tokens_seen": 110592990, "step": 5149, "time_per_iteration": 2.6134157180786133 }, { "auxiliary_loss_clip": 0.01097892, "auxiliary_loss_mlp": 0.0104428, "balance_loss_clip": 1.04601955, "balance_loss_mlp": 1.02615988, "epoch": 0.3096347512400421, "flos": 23623044946560.0, "grad_norm": 2.0797901111423274, "language_loss": 0.845025, "learning_rate": 3.2342806573153132e-06, "loss": 0.86644673, "num_input_tokens_seen": 110612130, "step": 5150, "time_per_iteration": 2.7804181575775146 }, { "auxiliary_loss_clip": 0.01086512, "auxiliary_loss_mlp": 0.01047133, "balance_loss_clip": 1.04168093, "balance_loss_mlp": 1.02820301, "epoch": 0.30969487449271005, "flos": 22529313768960.0, "grad_norm": 1.8768941622145223, "language_loss": 0.78431082, "learning_rate": 3.233974184780424e-06, "loss": 0.80564725, "num_input_tokens_seen": 110632045, "step": 5151, "time_per_iteration": 2.7539470195770264 }, { "auxiliary_loss_clip": 0.01131879, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.04880977, "balance_loss_mlp": 1.02362132, "epoch": 0.309754997745378, "flos": 15267925059840.0, "grad_norm": 1.9606136965084777, "language_loss": 0.67416716, "learning_rate": 3.2336676654531084e-06, "loss": 0.69590038, "num_input_tokens_seen": 110649340, "step": 5152, "time_per_iteration": 2.579238176345825 }, { "auxiliary_loss_clip": 0.01080518, "auxiliary_loss_mlp": 0.01045921, "balance_loss_clip": 1.04402971, "balance_loss_mlp": 1.02807546, "epoch": 0.309815120998046, "flos": 26979291947520.0, "grad_norm": 5.6670540450328355, "language_loss": 0.8251189, "learning_rate": 3.2333610993449926e-06, "loss": 0.84638333, "num_input_tokens_seen": 110668450, "step": 5153, "time_per_iteration": 2.792285203933716 }, { "auxiliary_loss_clip": 0.01113849, "auxiliary_loss_mlp": 0.00775793, "balance_loss_clip": 1.04663801, "balance_loss_mlp": 1.00127769, "epoch": 0.30987524425071394, "flos": 21143161969920.0, "grad_norm": 1.937189485762574, "language_loss": 0.73793215, "learning_rate": 3.2330544864676997e-06, "loss": 0.75682855, "num_input_tokens_seen": 110689410, "step": 5154, "time_per_iteration": 2.678454875946045 }, { "auxiliary_loss_clip": 0.01132509, "auxiliary_loss_mlp": 0.0103738, "balance_loss_clip": 1.0507983, "balance_loss_mlp": 1.02009416, "epoch": 0.3099353675033819, "flos": 15268284195840.0, "grad_norm": 2.1601099672999586, "language_loss": 0.76069349, "learning_rate": 3.232747826832858e-06, "loss": 0.78239238, "num_input_tokens_seen": 110707350, "step": 5155, "time_per_iteration": 2.577634334564209 }, { "auxiliary_loss_clip": 0.01131155, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.05483913, "balance_loss_mlp": 1.02283418, "epoch": 0.30999549075604993, "flos": 15413794191360.0, "grad_norm": 2.044896457109867, "language_loss": 0.79096609, "learning_rate": 3.232441120452094e-06, "loss": 0.81268191, "num_input_tokens_seen": 110724910, "step": 5156, "time_per_iteration": 2.628363609313965 }, { "auxiliary_loss_clip": 0.01127429, "auxiliary_loss_mlp": 0.01047381, "balance_loss_clip": 1.04775023, "balance_loss_mlp": 1.02779543, "epoch": 0.3100556140087179, "flos": 23184539712000.0, "grad_norm": 2.468311845454126, "language_loss": 0.74950963, "learning_rate": 3.23213436733704e-06, "loss": 0.77125776, "num_input_tokens_seen": 110744010, "step": 5157, "time_per_iteration": 2.6231181621551514 }, { "auxiliary_loss_clip": 0.01108321, "auxiliary_loss_mlp": 0.01042715, "balance_loss_clip": 1.04868615, "balance_loss_mlp": 1.02634752, "epoch": 0.31011573726138586, "flos": 25742169676800.0, "grad_norm": 1.6453166696914168, "language_loss": 0.69648343, "learning_rate": 3.231827567499327e-06, "loss": 0.71799374, "num_input_tokens_seen": 110765835, "step": 5158, "time_per_iteration": 2.734889030456543 }, { "auxiliary_loss_clip": 0.01095116, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.04443944, "balance_loss_mlp": 1.0301435, "epoch": 0.3101758605140538, "flos": 20011329440640.0, "grad_norm": 1.9329481500014836, "language_loss": 0.84861457, "learning_rate": 3.2315207209505896e-06, "loss": 0.87002677, "num_input_tokens_seen": 110784655, "step": 5159, "time_per_iteration": 2.665311813354492 }, { "auxiliary_loss_clip": 0.01116498, "auxiliary_loss_mlp": 0.01046065, "balance_loss_clip": 1.04710639, "balance_loss_mlp": 1.02877951, "epoch": 0.3102359837667218, "flos": 19135683688320.0, "grad_norm": 1.9614748869944683, "language_loss": 0.85129201, "learning_rate": 3.231213827702462e-06, "loss": 0.87291765, "num_input_tokens_seen": 110802545, "step": 5160, "time_per_iteration": 2.597130298614502 }, { "auxiliary_loss_clip": 0.01133056, "auxiliary_loss_mlp": 0.01042602, "balance_loss_clip": 1.0520395, "balance_loss_mlp": 1.02582884, "epoch": 0.31029610701938976, "flos": 22265405568000.0, "grad_norm": 1.9459577302566504, "language_loss": 0.75555152, "learning_rate": 3.230906887766584e-06, "loss": 0.77730811, "num_input_tokens_seen": 110820265, "step": 5161, "time_per_iteration": 2.583240032196045 }, { "auxiliary_loss_clip": 0.0113313, "auxiliary_loss_mlp": 0.01045414, "balance_loss_clip": 1.05046988, "balance_loss_mlp": 1.02797401, "epoch": 0.3103562302720577, "flos": 20805349536000.0, "grad_norm": 1.9938857241338979, "language_loss": 0.8156144, "learning_rate": 3.2305999011545924e-06, "loss": 0.83739984, "num_input_tokens_seen": 110836195, "step": 5162, "time_per_iteration": 2.495689630508423 }, { "auxiliary_loss_clip": 0.01128762, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.02450919, "epoch": 0.3104163535247257, "flos": 22344158136960.0, "grad_norm": 1.777649785974679, "language_loss": 0.82892883, "learning_rate": 3.2302928678781295e-06, "loss": 0.85061604, "num_input_tokens_seen": 110856420, "step": 5163, "time_per_iteration": 2.591036081314087 }, { "auxiliary_loss_clip": 0.01147486, "auxiliary_loss_mlp": 0.01044526, "balance_loss_clip": 1.05307984, "balance_loss_mlp": 1.0273242, "epoch": 0.31047647677739365, "flos": 21689363157120.0, "grad_norm": 1.875247009463239, "language_loss": 0.76131678, "learning_rate": 3.2299857879488376e-06, "loss": 0.78323686, "num_input_tokens_seen": 110876650, "step": 5164, "time_per_iteration": 2.5745677947998047 }, { "auxiliary_loss_clip": 0.01103275, "auxiliary_loss_mlp": 0.01046349, "balance_loss_clip": 1.04969811, "balance_loss_mlp": 1.02880108, "epoch": 0.3105366000300616, "flos": 18917275040640.0, "grad_norm": 3.462886730904856, "language_loss": 0.74514711, "learning_rate": 3.2296786613783626e-06, "loss": 0.7666434, "num_input_tokens_seen": 110894445, "step": 5165, "time_per_iteration": 2.724846124649048 }, { "auxiliary_loss_clip": 0.01100578, "auxiliary_loss_mlp": 0.01057021, "balance_loss_clip": 1.04695523, "balance_loss_mlp": 1.03841233, "epoch": 0.3105967232827296, "flos": 18260397072000.0, "grad_norm": 1.6273273492295701, "language_loss": 0.75827682, "learning_rate": 3.229371488178348e-06, "loss": 0.77985275, "num_input_tokens_seen": 110912855, "step": 5166, "time_per_iteration": 2.7309961318969727 }, { "auxiliary_loss_clip": 0.01121318, "auxiliary_loss_mlp": 0.01043526, "balance_loss_clip": 1.04969096, "balance_loss_mlp": 1.02665818, "epoch": 0.31065684653539755, "flos": 17672144037120.0, "grad_norm": 2.1635307284170833, "language_loss": 0.73621917, "learning_rate": 3.229064268360444e-06, "loss": 0.75786763, "num_input_tokens_seen": 110928025, "step": 5167, "time_per_iteration": 2.623375654220581 }, { "auxiliary_loss_clip": 0.01007539, "auxiliary_loss_mlp": 0.01008435, "balance_loss_clip": 1.02476823, "balance_loss_mlp": 1.0059557, "epoch": 0.3107169697880655, "flos": 68531996511360.0, "grad_norm": 0.7113763854018822, "language_loss": 0.53030008, "learning_rate": 3.2287570019362997e-06, "loss": 0.55045986, "num_input_tokens_seen": 110992215, "step": 5168, "time_per_iteration": 3.3115129470825195 }, { "auxiliary_loss_clip": 0.01138497, "auxiliary_loss_mlp": 0.01050074, "balance_loss_clip": 1.05561399, "balance_loss_mlp": 1.03151321, "epoch": 0.3107770930407335, "flos": 13188733274880.0, "grad_norm": 3.621905149464154, "language_loss": 0.79032969, "learning_rate": 3.2284496889175668e-06, "loss": 0.81221539, "num_input_tokens_seen": 111010400, "step": 5169, "time_per_iteration": 2.595463514328003 }, { "auxiliary_loss_clip": 0.01121822, "auxiliary_loss_mlp": 0.01047209, "balance_loss_clip": 1.04804373, "balance_loss_mlp": 1.02937579, "epoch": 0.3108372162934015, "flos": 31580849520000.0, "grad_norm": 1.57130024638105, "language_loss": 0.64071, "learning_rate": 3.2281423293158986e-06, "loss": 0.66240036, "num_input_tokens_seen": 111033960, "step": 5170, "time_per_iteration": 2.746469497680664 }, { "auxiliary_loss_clip": 0.0110491, "auxiliary_loss_mlp": 0.00776539, "balance_loss_clip": 1.04874384, "balance_loss_mlp": 1.00120461, "epoch": 0.31089733954606946, "flos": 28729829266560.0, "grad_norm": 2.172069963879317, "language_loss": 0.7723515, "learning_rate": 3.22783492314295e-06, "loss": 0.79116607, "num_input_tokens_seen": 111053265, "step": 5171, "time_per_iteration": 2.776974678039551 }, { "auxiliary_loss_clip": 0.01100832, "auxiliary_loss_mlp": 0.01048172, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.03055298, "epoch": 0.3109574627987374, "flos": 19683249592320.0, "grad_norm": 1.830523579545495, "language_loss": 0.84020013, "learning_rate": 3.2275274704103785e-06, "loss": 0.86169016, "num_input_tokens_seen": 111071130, "step": 5172, "time_per_iteration": 2.718118906021118 }, { "auxiliary_loss_clip": 0.01091688, "auxiliary_loss_mlp": 0.01045541, "balance_loss_clip": 1.04622412, "balance_loss_mlp": 1.02706313, "epoch": 0.3110175860514054, "flos": 14683981656960.0, "grad_norm": 1.9540355263753015, "language_loss": 0.83730888, "learning_rate": 3.227219971129842e-06, "loss": 0.8586812, "num_input_tokens_seen": 111089560, "step": 5173, "time_per_iteration": 2.735163927078247 }, { "auxiliary_loss_clip": 0.01145239, "auxiliary_loss_mlp": 0.01042621, "balance_loss_clip": 1.05589437, "balance_loss_mlp": 1.02656341, "epoch": 0.31107770930407336, "flos": 25739655724800.0, "grad_norm": 3.2612368513370495, "language_loss": 0.83354348, "learning_rate": 3.226912425313001e-06, "loss": 0.85542202, "num_input_tokens_seen": 111109960, "step": 5174, "time_per_iteration": 2.65226411819458 }, { "auxiliary_loss_clip": 0.01122854, "auxiliary_loss_mlp": 0.01046101, "balance_loss_clip": 1.05162597, "balance_loss_mlp": 1.02928042, "epoch": 0.3111378325567413, "flos": 19208259118080.0, "grad_norm": 1.9777752297496725, "language_loss": 0.85181922, "learning_rate": 3.2266048329715183e-06, "loss": 0.87350869, "num_input_tokens_seen": 111127960, "step": 5175, "time_per_iteration": 2.6930692195892334 }, { "auxiliary_loss_clip": 0.01087659, "auxiliary_loss_mlp": 0.01044685, "balance_loss_clip": 1.04638839, "balance_loss_mlp": 1.02623129, "epoch": 0.3111979558094093, "flos": 23696374561920.0, "grad_norm": 1.845729409399547, "language_loss": 0.82990116, "learning_rate": 3.2262971941170575e-06, "loss": 0.8512246, "num_input_tokens_seen": 111146730, "step": 5176, "time_per_iteration": 2.7975289821624756 }, { "auxiliary_loss_clip": 0.01126555, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04662132, "balance_loss_mlp": 1.02361798, "epoch": 0.31125807906207725, "flos": 21033023892480.0, "grad_norm": 1.9258407965023028, "language_loss": 0.8096348, "learning_rate": 3.2259895087612837e-06, "loss": 0.83132547, "num_input_tokens_seen": 111166295, "step": 5177, "time_per_iteration": 2.6275687217712402 }, { "auxiliary_loss_clip": 0.01134117, "auxiliary_loss_mlp": 0.0077682, "balance_loss_clip": 1.05381465, "balance_loss_mlp": 1.00119591, "epoch": 0.3113182023147452, "flos": 23076628277760.0, "grad_norm": 1.6855068015846089, "language_loss": 0.80707169, "learning_rate": 3.2256817769158657e-06, "loss": 0.82618099, "num_input_tokens_seen": 111185665, "step": 5178, "time_per_iteration": 4.142611742019653 }, { "auxiliary_loss_clip": 0.01119942, "auxiliary_loss_mlp": 0.01047667, "balance_loss_clip": 1.05289316, "balance_loss_mlp": 1.03076327, "epoch": 0.3113783255674132, "flos": 11838994888320.0, "grad_norm": 2.5880769767242633, "language_loss": 0.80990803, "learning_rate": 3.225373998592471e-06, "loss": 0.83158416, "num_input_tokens_seen": 111201615, "step": 5179, "time_per_iteration": 2.6429331302642822 }, { "auxiliary_loss_clip": 0.01112505, "auxiliary_loss_mlp": 0.01048581, "balance_loss_clip": 1.05353093, "balance_loss_mlp": 1.03139079, "epoch": 0.31143844882008115, "flos": 16289547684480.0, "grad_norm": 2.4201759029551813, "language_loss": 0.78532577, "learning_rate": 3.2250661738027715e-06, "loss": 0.80693662, "num_input_tokens_seen": 111220515, "step": 5180, "time_per_iteration": 4.1918723583221436 }, { "auxiliary_loss_clip": 0.01107686, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.05114985, "balance_loss_mlp": 1.02011788, "epoch": 0.3114985720727491, "flos": 23217792727680.0, "grad_norm": 1.6775849826612523, "language_loss": 0.83088589, "learning_rate": 3.22475830255844e-06, "loss": 0.85233486, "num_input_tokens_seen": 111240395, "step": 5181, "time_per_iteration": 2.760340929031372 }, { "auxiliary_loss_clip": 0.01110614, "auxiliary_loss_mlp": 0.01044232, "balance_loss_clip": 1.04879427, "balance_loss_mlp": 1.02881861, "epoch": 0.3115586953254171, "flos": 30044626698240.0, "grad_norm": 1.766790552230027, "language_loss": 0.74396992, "learning_rate": 3.2244503848711516e-06, "loss": 0.76551843, "num_input_tokens_seen": 111261100, "step": 5182, "time_per_iteration": 2.7501730918884277 }, { "auxiliary_loss_clip": 0.01093489, "auxiliary_loss_mlp": 0.00776946, "balance_loss_clip": 1.04811049, "balance_loss_mlp": 1.00152898, "epoch": 0.3116188185780851, "flos": 25666326109440.0, "grad_norm": 2.03695228940596, "language_loss": 0.70169222, "learning_rate": 3.2241424207525815e-06, "loss": 0.72039658, "num_input_tokens_seen": 111281320, "step": 5183, "time_per_iteration": 4.26041579246521 }, { "auxiliary_loss_clip": 0.01017812, "auxiliary_loss_mlp": 0.01006564, "balance_loss_clip": 1.01984847, "balance_loss_mlp": 1.00418019, "epoch": 0.31167894183075306, "flos": 69510058917120.0, "grad_norm": 0.9394459872440335, "language_loss": 0.59573013, "learning_rate": 3.223834410214408e-06, "loss": 0.61597383, "num_input_tokens_seen": 111341405, "step": 5184, "time_per_iteration": 4.992337226867676 }, { "auxiliary_loss_clip": 0.01115495, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.04588842, "balance_loss_mlp": 1.03422523, "epoch": 0.31173906508342103, "flos": 14939845211520.0, "grad_norm": 2.48453112640368, "language_loss": 0.70156622, "learning_rate": 3.223526353268311e-06, "loss": 0.72323, "num_input_tokens_seen": 111358975, "step": 5185, "time_per_iteration": 2.6406824588775635 }, { "auxiliary_loss_clip": 0.01122412, "auxiliary_loss_mlp": 0.01051261, "balance_loss_clip": 1.05447555, "balance_loss_mlp": 1.03405905, "epoch": 0.311799188336089, "flos": 16176033728640.0, "grad_norm": 2.8983279272522853, "language_loss": 0.63588691, "learning_rate": 3.2232182499259725e-06, "loss": 0.65762365, "num_input_tokens_seen": 111375845, "step": 5186, "time_per_iteration": 2.683971881866455 }, { "auxiliary_loss_clip": 0.01126858, "auxiliary_loss_mlp": 0.01049881, "balance_loss_clip": 1.05240881, "balance_loss_mlp": 1.03145099, "epoch": 0.31185931158875696, "flos": 25009627708800.0, "grad_norm": 2.2127415604209335, "language_loss": 0.86427295, "learning_rate": 3.2229101001990747e-06, "loss": 0.88604033, "num_input_tokens_seen": 111394150, "step": 5187, "time_per_iteration": 2.6983299255371094 }, { "auxiliary_loss_clip": 0.01146114, "auxiliary_loss_mlp": 0.0077496, "balance_loss_clip": 1.05417776, "balance_loss_mlp": 1.00131774, "epoch": 0.3119194348414249, "flos": 37232901273600.0, "grad_norm": 1.653121843679143, "language_loss": 0.63481069, "learning_rate": 3.2226019040993036e-06, "loss": 0.6540215, "num_input_tokens_seen": 111418355, "step": 5188, "time_per_iteration": 2.6974728107452393 }, { "auxiliary_loss_clip": 0.01106256, "auxiliary_loss_mlp": 0.01044626, "balance_loss_clip": 1.05064225, "balance_loss_mlp": 1.02799582, "epoch": 0.3119795580940929, "flos": 15012779777280.0, "grad_norm": 2.578497111530561, "language_loss": 0.83241487, "learning_rate": 3.222293661638346e-06, "loss": 0.85392368, "num_input_tokens_seen": 111435445, "step": 5189, "time_per_iteration": 2.6956889629364014 }, { "auxiliary_loss_clip": 0.01031008, "auxiliary_loss_mlp": 0.01045956, "balance_loss_clip": 1.03804195, "balance_loss_mlp": 1.02812243, "epoch": 0.31203968134676086, "flos": 15998168557440.0, "grad_norm": 1.8156368008577992, "language_loss": 0.79266763, "learning_rate": 3.22198537282789e-06, "loss": 0.81343722, "num_input_tokens_seen": 111453430, "step": 5190, "time_per_iteration": 3.0180671215057373 }, { "auxiliary_loss_clip": 0.01086186, "auxiliary_loss_mlp": 0.01053443, "balance_loss_clip": 1.04333639, "balance_loss_mlp": 1.03413141, "epoch": 0.3120998045994288, "flos": 23837359443840.0, "grad_norm": 1.571307617405072, "language_loss": 0.75174087, "learning_rate": 3.2216770376796262e-06, "loss": 0.77313721, "num_input_tokens_seen": 111475325, "step": 5191, "time_per_iteration": 3.0170204639434814 }, { "auxiliary_loss_clip": 0.01043661, "auxiliary_loss_mlp": 0.00755081, "balance_loss_clip": 1.02154636, "balance_loss_mlp": 1.00261629, "epoch": 0.3121599278520968, "flos": 69184205712000.0, "grad_norm": 0.8534965117798614, "language_loss": 0.63942307, "learning_rate": 3.221368656205247e-06, "loss": 0.6574105, "num_input_tokens_seen": 111533960, "step": 5192, "time_per_iteration": 3.288938045501709 }, { "auxiliary_loss_clip": 0.01133662, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.05246997, "balance_loss_mlp": 1.02569187, "epoch": 0.31222005110476475, "flos": 23806368984960.0, "grad_norm": 1.9226654053779162, "language_loss": 0.7976644, "learning_rate": 3.221060228416446e-06, "loss": 0.81943566, "num_input_tokens_seen": 111554055, "step": 5193, "time_per_iteration": 2.758859157562256 }, { "auxiliary_loss_clip": 0.01117628, "auxiliary_loss_mlp": 0.01054751, "balance_loss_clip": 1.04916263, "balance_loss_mlp": 1.03508139, "epoch": 0.3122801743574327, "flos": 25226132935680.0, "grad_norm": 2.5170295869133024, "language_loss": 0.72488689, "learning_rate": 3.2207517543249183e-06, "loss": 0.74661064, "num_input_tokens_seen": 111574305, "step": 5194, "time_per_iteration": 2.69765567779541 }, { "auxiliary_loss_clip": 0.01144699, "auxiliary_loss_mlp": 0.01044476, "balance_loss_clip": 1.05394197, "balance_loss_mlp": 1.02819204, "epoch": 0.3123402976101007, "flos": 22966490200320.0, "grad_norm": 1.775027795968239, "language_loss": 0.76423192, "learning_rate": 3.2204432339423616e-06, "loss": 0.78612363, "num_input_tokens_seen": 111595680, "step": 5195, "time_per_iteration": 2.665656566619873 }, { "auxiliary_loss_clip": 0.01144607, "auxiliary_loss_mlp": 0.01042079, "balance_loss_clip": 1.05148935, "balance_loss_mlp": 1.02544916, "epoch": 0.3124004208627687, "flos": 25192089820800.0, "grad_norm": 1.4414001308378115, "language_loss": 0.78089559, "learning_rate": 3.220134667280476e-06, "loss": 0.80276251, "num_input_tokens_seen": 111618135, "step": 5196, "time_per_iteration": 2.682476282119751 }, { "auxiliary_loss_clip": 0.01032618, "auxiliary_loss_mlp": 0.00755246, "balance_loss_clip": 1.02237272, "balance_loss_mlp": 1.00273037, "epoch": 0.31246054411543667, "flos": 67485165517440.0, "grad_norm": 0.794984063014186, "language_loss": 0.54770386, "learning_rate": 3.2198260543509613e-06, "loss": 0.56558245, "num_input_tokens_seen": 111682220, "step": 5197, "time_per_iteration": 3.24509334564209 }, { "auxiliary_loss_clip": 0.01144094, "auxiliary_loss_mlp": 0.01042495, "balance_loss_clip": 1.0547365, "balance_loss_mlp": 1.02586555, "epoch": 0.31252066736810463, "flos": 17858520731520.0, "grad_norm": 1.8260094290654212, "language_loss": 0.66137004, "learning_rate": 3.21951739516552e-06, "loss": 0.68323588, "num_input_tokens_seen": 111700815, "step": 5198, "time_per_iteration": 2.5970942974090576 }, { "auxiliary_loss_clip": 0.01102297, "auxiliary_loss_mlp": 0.01047482, "balance_loss_clip": 1.0459094, "balance_loss_mlp": 1.02898037, "epoch": 0.3125807906207726, "flos": 18475034791680.0, "grad_norm": 2.530729988117139, "language_loss": 0.6949119, "learning_rate": 3.219208689735857e-06, "loss": 0.71640968, "num_input_tokens_seen": 111718195, "step": 5199, "time_per_iteration": 2.6682288646698 }, { "auxiliary_loss_clip": 0.01132634, "auxiliary_loss_mlp": 0.01050152, "balance_loss_clip": 1.04906189, "balance_loss_mlp": 1.03258061, "epoch": 0.31264091387344056, "flos": 18946541646720.0, "grad_norm": 1.8087592578592666, "language_loss": 0.78480452, "learning_rate": 3.2188999380736785e-06, "loss": 0.8066324, "num_input_tokens_seen": 111734440, "step": 5200, "time_per_iteration": 2.6664814949035645 }, { "auxiliary_loss_clip": 0.01132139, "auxiliary_loss_mlp": 0.01037041, "balance_loss_clip": 1.05233109, "balance_loss_mlp": 1.02036345, "epoch": 0.3127010371261085, "flos": 21468512384640.0, "grad_norm": 2.0480479984687214, "language_loss": 0.83231741, "learning_rate": 3.2185911401906917e-06, "loss": 0.85400921, "num_input_tokens_seen": 111751960, "step": 5201, "time_per_iteration": 2.674558401107788 }, { "auxiliary_loss_clip": 0.01144703, "auxiliary_loss_mlp": 0.01045083, "balance_loss_clip": 1.05244124, "balance_loss_mlp": 1.02697527, "epoch": 0.3127611603787765, "flos": 15336047203200.0, "grad_norm": 3.6217323271444037, "language_loss": 0.6910159, "learning_rate": 3.2182822960986072e-06, "loss": 0.71291375, "num_input_tokens_seen": 111769585, "step": 5202, "time_per_iteration": 2.563164710998535 }, { "auxiliary_loss_clip": 0.01146715, "auxiliary_loss_mlp": 0.01041598, "balance_loss_clip": 1.05293012, "balance_loss_mlp": 1.02608871, "epoch": 0.31282128363144446, "flos": 17602980399360.0, "grad_norm": 1.898082303559049, "language_loss": 0.84124672, "learning_rate": 3.2179734058091358e-06, "loss": 0.86312985, "num_input_tokens_seen": 111787880, "step": 5203, "time_per_iteration": 2.6024506092071533 }, { "auxiliary_loss_clip": 0.01086755, "auxiliary_loss_mlp": 0.01049344, "balance_loss_clip": 1.04461396, "balance_loss_mlp": 1.03139079, "epoch": 0.3128814068841124, "flos": 26756753235840.0, "grad_norm": 2.246749233698224, "language_loss": 0.61165982, "learning_rate": 3.2176644693339913e-06, "loss": 0.63302082, "num_input_tokens_seen": 111805950, "step": 5204, "time_per_iteration": 2.748486042022705 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.04439998, "balance_loss_mlp": 1.02722907, "epoch": 0.3129415301367804, "flos": 22272372806400.0, "grad_norm": 1.6432390116063589, "language_loss": 0.65875763, "learning_rate": 3.217355486684887e-06, "loss": 0.68024528, "num_input_tokens_seen": 111826135, "step": 5205, "time_per_iteration": 2.717499256134033 }, { "auxiliary_loss_clip": 0.01134026, "auxiliary_loss_mlp": 0.01046734, "balance_loss_clip": 1.05126929, "balance_loss_mlp": 1.02849531, "epoch": 0.31300165338944835, "flos": 26464907232000.0, "grad_norm": 1.6106510494401134, "language_loss": 0.76811433, "learning_rate": 3.2170464578735414e-06, "loss": 0.78992188, "num_input_tokens_seen": 111844700, "step": 5206, "time_per_iteration": 2.642439603805542 }, { "auxiliary_loss_clip": 0.01140688, "auxiliary_loss_mlp": 0.01041131, "balance_loss_clip": 1.04956853, "balance_loss_mlp": 1.02448893, "epoch": 0.3130617766421163, "flos": 21944652094080.0, "grad_norm": 2.214530025407602, "language_loss": 0.83204615, "learning_rate": 3.216737382911672e-06, "loss": 0.85386431, "num_input_tokens_seen": 111861585, "step": 5207, "time_per_iteration": 2.616652727127075 }, { "auxiliary_loss_clip": 0.01127002, "auxiliary_loss_mlp": 0.0104831, "balance_loss_clip": 1.0502398, "balance_loss_mlp": 1.0328126, "epoch": 0.3131218998947843, "flos": 23292774368640.0, "grad_norm": 1.5207985149404841, "language_loss": 0.71359724, "learning_rate": 3.216428261810999e-06, "loss": 0.73535037, "num_input_tokens_seen": 111882950, "step": 5208, "time_per_iteration": 2.674813747406006 }, { "auxiliary_loss_clip": 0.01120564, "auxiliary_loss_mlp": 0.01045064, "balance_loss_clip": 1.04862344, "balance_loss_mlp": 1.02827978, "epoch": 0.3131820231474523, "flos": 21139642437120.0, "grad_norm": 1.848256205390157, "language_loss": 0.74558908, "learning_rate": 3.2161190945832445e-06, "loss": 0.76724535, "num_input_tokens_seen": 111901640, "step": 5209, "time_per_iteration": 2.7193644046783447 }, { "auxiliary_loss_clip": 0.01140035, "auxiliary_loss_mlp": 0.01045727, "balance_loss_clip": 1.04733396, "balance_loss_mlp": 1.02937174, "epoch": 0.31324214640012027, "flos": 23909863046400.0, "grad_norm": 2.0633998475681135, "language_loss": 0.77254915, "learning_rate": 3.2158098812401325e-06, "loss": 0.79440677, "num_input_tokens_seen": 111919615, "step": 5210, "time_per_iteration": 2.6212270259857178 }, { "auxiliary_loss_clip": 0.01125553, "auxiliary_loss_mlp": 0.01039925, "balance_loss_clip": 1.047261, "balance_loss_mlp": 1.02385592, "epoch": 0.31330226965278823, "flos": 22236929061120.0, "grad_norm": 1.9577389211395706, "language_loss": 0.79128736, "learning_rate": 3.2155006217933874e-06, "loss": 0.81294215, "num_input_tokens_seen": 111938485, "step": 5211, "time_per_iteration": 2.6618316173553467 }, { "auxiliary_loss_clip": 0.01132257, "auxiliary_loss_mlp": 0.01042587, "balance_loss_clip": 1.05107522, "balance_loss_mlp": 1.02768588, "epoch": 0.3133623929054562, "flos": 19753993428480.0, "grad_norm": 2.4581961413264195, "language_loss": 0.79612064, "learning_rate": 3.2151913162547367e-06, "loss": 0.81786901, "num_input_tokens_seen": 111956425, "step": 5212, "time_per_iteration": 2.81793475151062 }, { "auxiliary_loss_clip": 0.01125931, "auxiliary_loss_mlp": 0.01053393, "balance_loss_clip": 1.05156052, "balance_loss_mlp": 1.03576159, "epoch": 0.31342251615812416, "flos": 27162256849920.0, "grad_norm": 2.69561664367352, "language_loss": 0.71024299, "learning_rate": 3.2148819646359097e-06, "loss": 0.73203623, "num_input_tokens_seen": 111975915, "step": 5213, "time_per_iteration": 2.6739485263824463 }, { "auxiliary_loss_clip": 0.01132672, "auxiliary_loss_mlp": 0.01045903, "balance_loss_clip": 1.05284989, "balance_loss_mlp": 1.02961898, "epoch": 0.31348263941079213, "flos": 20229809915520.0, "grad_norm": 1.9828215257111186, "language_loss": 0.77684069, "learning_rate": 3.2145725669486374e-06, "loss": 0.79862642, "num_input_tokens_seen": 111995055, "step": 5214, "time_per_iteration": 2.6108171939849854 }, { "auxiliary_loss_clip": 0.01099316, "auxiliary_loss_mlp": 0.01038553, "balance_loss_clip": 1.0522778, "balance_loss_mlp": 1.02317524, "epoch": 0.3135427626634601, "flos": 24607643627520.0, "grad_norm": 2.2634840816113075, "language_loss": 0.8300609, "learning_rate": 3.2142631232046517e-06, "loss": 0.8514396, "num_input_tokens_seen": 112015830, "step": 5215, "time_per_iteration": 2.77897047996521 }, { "auxiliary_loss_clip": 0.01131919, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.05089617, "balance_loss_mlp": 1.02375078, "epoch": 0.31360288591612806, "flos": 20959873845120.0, "grad_norm": 2.280765330466862, "language_loss": 0.79540187, "learning_rate": 3.213953633415686e-06, "loss": 0.81713033, "num_input_tokens_seen": 112035065, "step": 5216, "time_per_iteration": 2.675492763519287 }, { "auxiliary_loss_clip": 0.01119434, "auxiliary_loss_mlp": 0.01049814, "balance_loss_clip": 1.04817545, "balance_loss_mlp": 1.03174222, "epoch": 0.313663009168796, "flos": 26980513009920.0, "grad_norm": 1.97082305961493, "language_loss": 0.69007474, "learning_rate": 3.213644097593477e-06, "loss": 0.7117672, "num_input_tokens_seen": 112058405, "step": 5217, "time_per_iteration": 2.7360196113586426 }, { "auxiliary_loss_clip": 0.01121348, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.04833519, "balance_loss_mlp": 1.02275062, "epoch": 0.313723132421464, "flos": 18040911016320.0, "grad_norm": 1.7253432561329243, "language_loss": 0.81228399, "learning_rate": 3.2133345157497624e-06, "loss": 0.83388406, "num_input_tokens_seen": 112076420, "step": 5218, "time_per_iteration": 4.393778562545776 }, { "auxiliary_loss_clip": 0.01139073, "auxiliary_loss_mlp": 0.01041023, "balance_loss_clip": 1.04819143, "balance_loss_mlp": 1.02422082, "epoch": 0.31378325567413196, "flos": 22488913946880.0, "grad_norm": 2.6452768271158167, "language_loss": 0.69128895, "learning_rate": 3.2130248878962813e-06, "loss": 0.71308994, "num_input_tokens_seen": 112090775, "step": 5219, "time_per_iteration": 4.162578344345093 }, { "auxiliary_loss_clip": 0.01117748, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.04879618, "balance_loss_mlp": 1.0287652, "epoch": 0.3138433789267999, "flos": 22419247518720.0, "grad_norm": 5.057996341652072, "language_loss": 0.80019122, "learning_rate": 3.2127152140447747e-06, "loss": 0.82181168, "num_input_tokens_seen": 112110980, "step": 5220, "time_per_iteration": 2.693300247192383 }, { "auxiliary_loss_clip": 0.01133002, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.05214572, "balance_loss_mlp": 1.0220139, "epoch": 0.3139035021794679, "flos": 13005912026880.0, "grad_norm": 1.7918234828134079, "language_loss": 0.72575235, "learning_rate": 3.212405494206986e-06, "loss": 0.74746263, "num_input_tokens_seen": 112129020, "step": 5221, "time_per_iteration": 2.6918861865997314 }, { "auxiliary_loss_clip": 0.01105754, "auxiliary_loss_mlp": 0.0104005, "balance_loss_clip": 1.04538214, "balance_loss_mlp": 1.02435017, "epoch": 0.31396362543213585, "flos": 16945994689920.0, "grad_norm": 1.7850671432610508, "language_loss": 0.82097268, "learning_rate": 3.2120957283946588e-06, "loss": 0.84243071, "num_input_tokens_seen": 112147865, "step": 5222, "time_per_iteration": 4.193262100219727 }, { "auxiliary_loss_clip": 0.01136096, "auxiliary_loss_mlp": 0.01044943, "balance_loss_clip": 1.05302894, "balance_loss_mlp": 1.02764595, "epoch": 0.31402374868480387, "flos": 20156731695360.0, "grad_norm": 2.3946225731958073, "language_loss": 0.70159894, "learning_rate": 3.2117859166195407e-06, "loss": 0.7234093, "num_input_tokens_seen": 112166745, "step": 5223, "time_per_iteration": 2.642608642578125 }, { "auxiliary_loss_clip": 0.01120375, "auxiliary_loss_mlp": 0.00773089, "balance_loss_clip": 1.04545665, "balance_loss_mlp": 1.0012387, "epoch": 0.31408387193747184, "flos": 21251073404160.0, "grad_norm": 1.5662600408509175, "language_loss": 0.80818307, "learning_rate": 3.211476058893379e-06, "loss": 0.82711768, "num_input_tokens_seen": 112185895, "step": 5224, "time_per_iteration": 4.334134101867676 }, { "auxiliary_loss_clip": 0.0113849, "auxiliary_loss_mlp": 0.01044903, "balance_loss_clip": 1.05376673, "balance_loss_mlp": 1.02807033, "epoch": 0.3141439951901398, "flos": 27484267299840.0, "grad_norm": 2.581635190586104, "language_loss": 0.57647121, "learning_rate": 3.2111661552279243e-06, "loss": 0.59830517, "num_input_tokens_seen": 112204465, "step": 5225, "time_per_iteration": 2.680227041244507 }, { "auxiliary_loss_clip": 0.01086502, "auxiliary_loss_mlp": 0.01032759, "balance_loss_clip": 1.04252625, "balance_loss_mlp": 1.0179472, "epoch": 0.31420411844280777, "flos": 17852235851520.0, "grad_norm": 2.0500851879408577, "language_loss": 0.81726074, "learning_rate": 3.2108562056349273e-06, "loss": 0.83845341, "num_input_tokens_seen": 112221635, "step": 5226, "time_per_iteration": 2.8080878257751465 }, { "auxiliary_loss_clip": 0.01123539, "auxiliary_loss_mlp": 0.01053238, "balance_loss_clip": 1.04718053, "balance_loss_mlp": 1.03557122, "epoch": 0.31426424169547573, "flos": 21616967295360.0, "grad_norm": 1.8156350578732643, "language_loss": 0.7435357, "learning_rate": 3.210546210126141e-06, "loss": 0.76530349, "num_input_tokens_seen": 112241240, "step": 5227, "time_per_iteration": 2.6420040130615234 }, { "auxiliary_loss_clip": 0.01128154, "auxiliary_loss_mlp": 0.01036288, "balance_loss_clip": 1.05315053, "balance_loss_mlp": 1.01981306, "epoch": 0.3143243649481437, "flos": 30920631586560.0, "grad_norm": 1.9798889840887306, "language_loss": 0.6779027, "learning_rate": 3.2102361687133213e-06, "loss": 0.69954711, "num_input_tokens_seen": 112262350, "step": 5228, "time_per_iteration": 2.6904454231262207 }, { "auxiliary_loss_clip": 0.01116854, "auxiliary_loss_mlp": 0.01042698, "balance_loss_clip": 1.04812217, "balance_loss_mlp": 1.02755868, "epoch": 0.31438448820081166, "flos": 22821411168000.0, "grad_norm": 2.2592581290101648, "language_loss": 0.802086, "learning_rate": 3.2099260814082254e-06, "loss": 0.82368147, "num_input_tokens_seen": 112283710, "step": 5229, "time_per_iteration": 2.720972776412964 }, { "auxiliary_loss_clip": 0.01116185, "auxiliary_loss_mlp": 0.01034979, "balance_loss_clip": 1.04888391, "balance_loss_mlp": 1.01917148, "epoch": 0.3144446114534796, "flos": 23292127923840.0, "grad_norm": 2.206396959728329, "language_loss": 0.69972271, "learning_rate": 3.209615948222611e-06, "loss": 0.72123438, "num_input_tokens_seen": 112304285, "step": 5230, "time_per_iteration": 2.69555401802063 }, { "auxiliary_loss_clip": 0.01094216, "auxiliary_loss_mlp": 0.01051308, "balance_loss_clip": 1.042889, "balance_loss_mlp": 1.03331971, "epoch": 0.3145047347061476, "flos": 31355976424320.0, "grad_norm": 11.083232715551919, "language_loss": 0.79441226, "learning_rate": 3.209305769168239e-06, "loss": 0.81586754, "num_input_tokens_seen": 112325110, "step": 5231, "time_per_iteration": 2.742414712905884 }, { "auxiliary_loss_clip": 0.01111136, "auxiliary_loss_mlp": 0.01044032, "balance_loss_clip": 1.05004621, "balance_loss_mlp": 1.02751017, "epoch": 0.31456485795881556, "flos": 10889552643840.0, "grad_norm": 68.21693219117104, "language_loss": 0.84846044, "learning_rate": 3.2089955442568704e-06, "loss": 0.87001216, "num_input_tokens_seen": 112339855, "step": 5232, "time_per_iteration": 2.681541919708252 }, { "auxiliary_loss_clip": 0.01082351, "auxiliary_loss_mlp": 0.01063678, "balance_loss_clip": 1.04169703, "balance_loss_mlp": 1.04589176, "epoch": 0.3146249812114835, "flos": 17092438439040.0, "grad_norm": 1.732593505271442, "language_loss": 0.79899549, "learning_rate": 3.2086852735002692e-06, "loss": 0.82045579, "num_input_tokens_seen": 112358480, "step": 5233, "time_per_iteration": 2.7261524200439453 }, { "auxiliary_loss_clip": 0.01095476, "auxiliary_loss_mlp": 0.01043701, "balance_loss_clip": 1.04795146, "balance_loss_mlp": 1.02775121, "epoch": 0.3146851044641515, "flos": 55291442889600.0, "grad_norm": 1.8884411146751285, "language_loss": 0.71124369, "learning_rate": 3.2083749569102024e-06, "loss": 0.73263544, "num_input_tokens_seen": 112382350, "step": 5234, "time_per_iteration": 3.0071427822113037 }, { "auxiliary_loss_clip": 0.01105209, "auxiliary_loss_mlp": 0.01036666, "balance_loss_clip": 1.05008078, "balance_loss_mlp": 1.02060878, "epoch": 0.31474522771681945, "flos": 27015884928000.0, "grad_norm": 2.1537517260325396, "language_loss": 0.72106552, "learning_rate": 3.2080645944984356e-06, "loss": 0.74248433, "num_input_tokens_seen": 112400260, "step": 5235, "time_per_iteration": 2.7347464561462402 }, { "auxiliary_loss_clip": 0.011281, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.0479089, "balance_loss_mlp": 1.0225656, "epoch": 0.3148053509694875, "flos": 21251935330560.0, "grad_norm": 2.047935998004664, "language_loss": 0.78640145, "learning_rate": 3.2077541862767384e-06, "loss": 0.80806667, "num_input_tokens_seen": 112419400, "step": 5236, "time_per_iteration": 2.6480181217193604 }, { "auxiliary_loss_clip": 0.01142531, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04929006, "balance_loss_mlp": 1.02536416, "epoch": 0.31486547422215544, "flos": 31248675521280.0, "grad_norm": 1.8469097199945863, "language_loss": 0.75903904, "learning_rate": 3.207443732256881e-06, "loss": 0.78088653, "num_input_tokens_seen": 112440825, "step": 5237, "time_per_iteration": 2.7113847732543945 }, { "auxiliary_loss_clip": 0.01133953, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.04817045, "balance_loss_mlp": 1.02128255, "epoch": 0.3149255974748234, "flos": 19828615933440.0, "grad_norm": 2.176202072112168, "language_loss": 0.79725033, "learning_rate": 3.2071332324506372e-06, "loss": 0.81894737, "num_input_tokens_seen": 112459180, "step": 5238, "time_per_iteration": 2.649968147277832 }, { "auxiliary_loss_clip": 0.01046118, "auxiliary_loss_mlp": 0.01018852, "balance_loss_clip": 1.02561212, "balance_loss_mlp": 1.01676548, "epoch": 0.31498572072749137, "flos": 67683965339520.0, "grad_norm": 0.8324046464960934, "language_loss": 0.67913729, "learning_rate": 3.2068226868697795e-06, "loss": 0.69978696, "num_input_tokens_seen": 112516680, "step": 5239, "time_per_iteration": 3.130643606185913 }, { "auxiliary_loss_clip": 0.01121581, "auxiliary_loss_mlp": 0.01043617, "balance_loss_clip": 1.04828835, "balance_loss_mlp": 1.02528274, "epoch": 0.31504584398015933, "flos": 19793136274560.0, "grad_norm": 2.4702861290170235, "language_loss": 0.82906926, "learning_rate": 3.2065120955260846e-06, "loss": 0.85072124, "num_input_tokens_seen": 112535895, "step": 5240, "time_per_iteration": 2.6314027309417725 }, { "auxiliary_loss_clip": 0.0111196, "auxiliary_loss_mlp": 0.0077379, "balance_loss_clip": 1.04708409, "balance_loss_mlp": 1.00132334, "epoch": 0.3151059672328273, "flos": 26615409217920.0, "grad_norm": 1.6854261536361361, "language_loss": 0.81405544, "learning_rate": 3.2062014584313302e-06, "loss": 0.83291298, "num_input_tokens_seen": 112557490, "step": 5241, "time_per_iteration": 2.7245657444000244 }, { "auxiliary_loss_clip": 0.01138561, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.05094576, "balance_loss_mlp": 1.0230633, "epoch": 0.31516609048549526, "flos": 24204438483840.0, "grad_norm": 1.7554610875937957, "language_loss": 0.74513441, "learning_rate": 3.2058907755972956e-06, "loss": 0.7669059, "num_input_tokens_seen": 112577075, "step": 5242, "time_per_iteration": 2.5925803184509277 }, { "auxiliary_loss_clip": 0.01106752, "auxiliary_loss_mlp": 0.01039069, "balance_loss_clip": 1.04686832, "balance_loss_mlp": 1.02230775, "epoch": 0.31522621373816323, "flos": 25958710817280.0, "grad_norm": 12.905078117761404, "language_loss": 0.73457384, "learning_rate": 3.2055800470357626e-06, "loss": 0.75603199, "num_input_tokens_seen": 112597620, "step": 5243, "time_per_iteration": 2.721261739730835 }, { "auxiliary_loss_clip": 0.01126602, "auxiliary_loss_mlp": 0.01041378, "balance_loss_clip": 1.04783881, "balance_loss_mlp": 1.02524936, "epoch": 0.3152863369908312, "flos": 21908813299200.0, "grad_norm": 2.079273463581607, "language_loss": 0.6462577, "learning_rate": 3.205269272758513e-06, "loss": 0.66793752, "num_input_tokens_seen": 112617150, "step": 5244, "time_per_iteration": 2.6753153800964355 }, { "auxiliary_loss_clip": 0.01087107, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.04454994, "balance_loss_mlp": 1.02158141, "epoch": 0.31534646024349916, "flos": 16281072074880.0, "grad_norm": 2.126512737541558, "language_loss": 0.91117549, "learning_rate": 3.2049584527773313e-06, "loss": 0.93242127, "num_input_tokens_seen": 112631090, "step": 5245, "time_per_iteration": 2.717316150665283 }, { "auxiliary_loss_clip": 0.01129236, "auxiliary_loss_mlp": 0.01046116, "balance_loss_clip": 1.04892504, "balance_loss_mlp": 1.02911687, "epoch": 0.3154065834961671, "flos": 24717243000960.0, "grad_norm": 2.0341104694483296, "language_loss": 0.75199413, "learning_rate": 3.2046475871040048e-06, "loss": 0.77374756, "num_input_tokens_seen": 112651220, "step": 5246, "time_per_iteration": 2.738969564437866 }, { "auxiliary_loss_clip": 0.01139621, "auxiliary_loss_mlp": 0.01044826, "balance_loss_clip": 1.04860735, "balance_loss_mlp": 1.027946, "epoch": 0.3154667067488351, "flos": 35371148469120.0, "grad_norm": 1.7161631839732394, "language_loss": 0.61524433, "learning_rate": 3.204336675750321e-06, "loss": 0.63708878, "num_input_tokens_seen": 112671560, "step": 5247, "time_per_iteration": 2.714258909225464 }, { "auxiliary_loss_clip": 0.01129569, "auxiliary_loss_mlp": 0.0104508, "balance_loss_clip": 1.04842138, "balance_loss_mlp": 1.0283072, "epoch": 0.31552683000150306, "flos": 17456464823040.0, "grad_norm": 2.438581052681848, "language_loss": 0.82096362, "learning_rate": 3.2040257187280693e-06, "loss": 0.84271014, "num_input_tokens_seen": 112689790, "step": 5248, "time_per_iteration": 2.6235198974609375 }, { "auxiliary_loss_clip": 0.01121718, "auxiliary_loss_mlp": 0.01047358, "balance_loss_clip": 1.04964209, "balance_loss_mlp": 1.0292145, "epoch": 0.3155869532541711, "flos": 18405763413120.0, "grad_norm": 5.654706808285272, "language_loss": 0.84601712, "learning_rate": 3.2037147160490423e-06, "loss": 0.86770785, "num_input_tokens_seen": 112708265, "step": 5249, "time_per_iteration": 2.664454698562622 }, { "auxiliary_loss_clip": 0.01105599, "auxiliary_loss_mlp": 0.01040266, "balance_loss_clip": 1.04724038, "balance_loss_mlp": 1.02252758, "epoch": 0.31564707650683904, "flos": 21579763783680.0, "grad_norm": 2.1333510394712034, "language_loss": 0.85412121, "learning_rate": 3.2034036677250322e-06, "loss": 0.87557989, "num_input_tokens_seen": 112727820, "step": 5250, "time_per_iteration": 2.7892768383026123 }, { "auxiliary_loss_clip": 0.01110748, "auxiliary_loss_mlp": 0.01044305, "balance_loss_clip": 1.04626083, "balance_loss_mlp": 1.02721059, "epoch": 0.315707199759507, "flos": 21030976817280.0, "grad_norm": 3.250818956981283, "language_loss": 0.68651402, "learning_rate": 3.203092573767835e-06, "loss": 0.70806456, "num_input_tokens_seen": 112743140, "step": 5251, "time_per_iteration": 2.660738468170166 }, { "auxiliary_loss_clip": 0.01141131, "auxiliary_loss_mlp": 0.01040852, "balance_loss_clip": 1.05063367, "balance_loss_mlp": 1.02374566, "epoch": 0.31576732301217497, "flos": 26828861788800.0, "grad_norm": 1.6959923935223091, "language_loss": 0.79367268, "learning_rate": 3.202781434189246e-06, "loss": 0.81549257, "num_input_tokens_seen": 112764705, "step": 5252, "time_per_iteration": 2.6600146293640137 }, { "auxiliary_loss_clip": 0.01123952, "auxiliary_loss_mlp": 0.01055554, "balance_loss_clip": 1.04919744, "balance_loss_mlp": 1.03742182, "epoch": 0.31582744626484294, "flos": 22711165349760.0, "grad_norm": 1.5850214403847396, "language_loss": 0.74167955, "learning_rate": 3.202470249001066e-06, "loss": 0.76347458, "num_input_tokens_seen": 112785310, "step": 5253, "time_per_iteration": 2.6831557750701904 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01042879, "balance_loss_clip": 1.04685211, "balance_loss_mlp": 1.02571261, "epoch": 0.3158875695175109, "flos": 23951914894080.0, "grad_norm": 1.8578399335985847, "language_loss": 0.73295557, "learning_rate": 3.2021590182150924e-06, "loss": 0.75456059, "num_input_tokens_seen": 112802905, "step": 5254, "time_per_iteration": 2.664445161819458 }, { "auxiliary_loss_clip": 0.0112999, "auxiliary_loss_mlp": 0.0104166, "balance_loss_clip": 1.04998255, "balance_loss_mlp": 1.02442837, "epoch": 0.31594769277017887, "flos": 13261883322240.0, "grad_norm": 1.9116991379626416, "language_loss": 0.77497417, "learning_rate": 3.201847741843128e-06, "loss": 0.7966907, "num_input_tokens_seen": 112820305, "step": 5255, "time_per_iteration": 2.5817084312438965 }, { "auxiliary_loss_clip": 0.01116092, "auxiliary_loss_mlp": 0.01045862, "balance_loss_clip": 1.0481391, "balance_loss_mlp": 1.02718151, "epoch": 0.31600781602284683, "flos": 23368258800000.0, "grad_norm": 2.396272573281143, "language_loss": 0.7821492, "learning_rate": 3.2015364198969772e-06, "loss": 0.80376875, "num_input_tokens_seen": 112841185, "step": 5256, "time_per_iteration": 2.6798577308654785 }, { "auxiliary_loss_clip": 0.0109858, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.04874921, "balance_loss_mlp": 1.02676511, "epoch": 0.3160679392755148, "flos": 19828580019840.0, "grad_norm": 1.575034121408654, "language_loss": 0.71175283, "learning_rate": 3.2012250523884453e-06, "loss": 0.73316103, "num_input_tokens_seen": 112860570, "step": 5257, "time_per_iteration": 4.252342462539673 }, { "auxiliary_loss_clip": 0.01132481, "auxiliary_loss_mlp": 0.01043271, "balance_loss_clip": 1.05120182, "balance_loss_mlp": 1.02524674, "epoch": 0.31612806252818276, "flos": 20193216935040.0, "grad_norm": 2.0196036815267036, "language_loss": 0.76539034, "learning_rate": 3.2009136393293393e-06, "loss": 0.78714788, "num_input_tokens_seen": 112877975, "step": 5258, "time_per_iteration": 4.240477085113525 }, { "auxiliary_loss_clip": 0.01110908, "auxiliary_loss_mlp": 0.01047088, "balance_loss_clip": 1.04727268, "balance_loss_mlp": 1.02917099, "epoch": 0.31618818578085073, "flos": 24235967646720.0, "grad_norm": 3.2354010090655403, "language_loss": 0.72901475, "learning_rate": 3.200602180731467e-06, "loss": 0.75059474, "num_input_tokens_seen": 112896170, "step": 5259, "time_per_iteration": 2.726944923400879 }, { "auxiliary_loss_clip": 0.01117115, "auxiliary_loss_mlp": 0.00776982, "balance_loss_clip": 1.04983401, "balance_loss_mlp": 1.0013001, "epoch": 0.3162483090335187, "flos": 25081844002560.0, "grad_norm": 2.1961272089612307, "language_loss": 0.66124642, "learning_rate": 3.20029067660664e-06, "loss": 0.68018734, "num_input_tokens_seen": 112916180, "step": 5260, "time_per_iteration": 2.7605621814727783 }, { "auxiliary_loss_clip": 0.01130372, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.04645884, "balance_loss_mlp": 1.02016842, "epoch": 0.31630843228618666, "flos": 26323383646080.0, "grad_norm": 1.8277182943015604, "language_loss": 0.71989, "learning_rate": 3.1999791269666706e-06, "loss": 0.74156475, "num_input_tokens_seen": 112936745, "step": 5261, "time_per_iteration": 4.231431484222412 }, { "auxiliary_loss_clip": 0.01044321, "auxiliary_loss_mlp": 0.01007323, "balance_loss_clip": 1.02311194, "balance_loss_mlp": 1.00424767, "epoch": 0.3163685555388547, "flos": 66758441552640.0, "grad_norm": 0.7429950107461195, "language_loss": 0.50646758, "learning_rate": 3.1996675318233716e-06, "loss": 0.5269841, "num_input_tokens_seen": 112994845, "step": 5262, "time_per_iteration": 3.232384443283081 }, { "auxiliary_loss_clip": 0.01131333, "auxiliary_loss_mlp": 0.01046761, "balance_loss_clip": 1.05222106, "balance_loss_mlp": 1.02932084, "epoch": 0.31642867879152264, "flos": 25995662933760.0, "grad_norm": 1.5863649349069382, "language_loss": 0.85187083, "learning_rate": 3.19935589118856e-06, "loss": 0.8736518, "num_input_tokens_seen": 113015125, "step": 5263, "time_per_iteration": 4.33522629737854 }, { "auxiliary_loss_clip": 0.01112644, "auxiliary_loss_mlp": 0.01048382, "balance_loss_clip": 1.04875994, "balance_loss_mlp": 1.03256297, "epoch": 0.3164888020441906, "flos": 25774955815680.0, "grad_norm": 1.550008856477613, "language_loss": 0.81648135, "learning_rate": 3.1990442050740535e-06, "loss": 0.83809161, "num_input_tokens_seen": 113035535, "step": 5264, "time_per_iteration": 2.8155312538146973 }, { "auxiliary_loss_clip": 0.01121259, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04812968, "balance_loss_mlp": 1.02431464, "epoch": 0.3165489252968586, "flos": 19756220071680.0, "grad_norm": 2.234025317189389, "language_loss": 0.78969181, "learning_rate": 3.19873247349167e-06, "loss": 0.81132656, "num_input_tokens_seen": 113052720, "step": 5265, "time_per_iteration": 2.6533524990081787 }, { "auxiliary_loss_clip": 0.0113452, "auxiliary_loss_mlp": 0.01049591, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.03144741, "epoch": 0.31660904854952654, "flos": 23183929180800.0, "grad_norm": 1.789116232573577, "language_loss": 0.74705631, "learning_rate": 3.1984206964532307e-06, "loss": 0.76889741, "num_input_tokens_seen": 113071435, "step": 5266, "time_per_iteration": 2.66683292388916 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.0104338, "balance_loss_clip": 1.04636073, "balance_loss_mlp": 1.02660751, "epoch": 0.3166691718021945, "flos": 20408501099520.0, "grad_norm": 2.507852328081816, "language_loss": 0.79178059, "learning_rate": 3.1981088739705585e-06, "loss": 0.81329834, "num_input_tokens_seen": 113088645, "step": 5267, "time_per_iteration": 2.6870310306549072 }, { "auxiliary_loss_clip": 0.0103642, "auxiliary_loss_mlp": 0.01002482, "balance_loss_clip": 1.02563763, "balance_loss_mlp": 1.00002623, "epoch": 0.31672929505486247, "flos": 70144781172480.0, "grad_norm": 0.7343006553516018, "language_loss": 0.57840127, "learning_rate": 3.197797006055478e-06, "loss": 0.59879029, "num_input_tokens_seen": 113152775, "step": 5268, "time_per_iteration": 3.211494207382202 }, { "auxiliary_loss_clip": 0.01144761, "auxiliary_loss_mlp": 0.01044165, "balance_loss_clip": 1.0517385, "balance_loss_mlp": 1.02729666, "epoch": 0.31678941830753043, "flos": 14355758154240.0, "grad_norm": 2.2657818682072146, "language_loss": 0.73009932, "learning_rate": 3.197485092719815e-06, "loss": 0.75198865, "num_input_tokens_seen": 113171410, "step": 5269, "time_per_iteration": 2.5840115547180176 }, { "auxiliary_loss_clip": 0.01108492, "auxiliary_loss_mlp": 0.01049824, "balance_loss_clip": 1.0489136, "balance_loss_mlp": 1.03283644, "epoch": 0.3168495415601984, "flos": 22747722416640.0, "grad_norm": 2.2273308320264995, "language_loss": 0.79972744, "learning_rate": 3.1971731339753973e-06, "loss": 0.82131052, "num_input_tokens_seen": 113189965, "step": 5270, "time_per_iteration": 2.858154535293579 }, { "auxiliary_loss_clip": 0.01146892, "auxiliary_loss_mlp": 0.01050124, "balance_loss_clip": 1.05206418, "balance_loss_mlp": 1.03207529, "epoch": 0.31690966481286637, "flos": 20115254465280.0, "grad_norm": 9.25747726986636, "language_loss": 0.7941646, "learning_rate": 3.1968611298340545e-06, "loss": 0.81613475, "num_input_tokens_seen": 113206355, "step": 5271, "time_per_iteration": 2.6510884761810303 }, { "auxiliary_loss_clip": 0.01144344, "auxiliary_loss_mlp": 0.01040088, "balance_loss_clip": 1.05230093, "balance_loss_mlp": 1.02269578, "epoch": 0.31696978806553433, "flos": 21178928937600.0, "grad_norm": 1.806612869692892, "language_loss": 0.72429144, "learning_rate": 3.1965490803076173e-06, "loss": 0.74613577, "num_input_tokens_seen": 113225440, "step": 5272, "time_per_iteration": 2.6807363033294678 }, { "auxiliary_loss_clip": 0.01123855, "auxiliary_loss_mlp": 0.01052611, "balance_loss_clip": 1.04942703, "balance_loss_mlp": 1.03365636, "epoch": 0.3170299113182023, "flos": 42997030439040.0, "grad_norm": 2.241731745129767, "language_loss": 0.69146693, "learning_rate": 3.1962369854079194e-06, "loss": 0.71323156, "num_input_tokens_seen": 113248840, "step": 5273, "time_per_iteration": 2.9202728271484375 }, { "auxiliary_loss_clip": 0.01128467, "auxiliary_loss_mlp": 0.00775845, "balance_loss_clip": 1.04869509, "balance_loss_mlp": 1.00146461, "epoch": 0.31709003457087026, "flos": 24460158384000.0, "grad_norm": 1.872718303622414, "language_loss": 0.67764306, "learning_rate": 3.195924845146795e-06, "loss": 0.69668615, "num_input_tokens_seen": 113269630, "step": 5274, "time_per_iteration": 2.6541714668273926 }, { "auxiliary_loss_clip": 0.01092683, "auxiliary_loss_mlp": 0.0106112, "balance_loss_clip": 1.04346347, "balance_loss_mlp": 1.04305935, "epoch": 0.3171501578235382, "flos": 24135310759680.0, "grad_norm": 1.7402048894999724, "language_loss": 0.80815518, "learning_rate": 3.195612659536081e-06, "loss": 0.8296932, "num_input_tokens_seen": 113291200, "step": 5275, "time_per_iteration": 2.840696096420288 }, { "auxiliary_loss_clip": 0.0113287, "auxiliary_loss_mlp": 0.01047853, "balance_loss_clip": 1.04862475, "balance_loss_mlp": 1.02979279, "epoch": 0.31721028107620625, "flos": 18879712392960.0, "grad_norm": 2.28886723118271, "language_loss": 0.72418922, "learning_rate": 3.1953004285876147e-06, "loss": 0.74599648, "num_input_tokens_seen": 113310170, "step": 5276, "time_per_iteration": 2.6426591873168945 }, { "auxiliary_loss_clip": 0.01122606, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.05439019, "balance_loss_mlp": 1.02588356, "epoch": 0.3172704043288742, "flos": 23147874904320.0, "grad_norm": 1.4542936031710312, "language_loss": 0.77923822, "learning_rate": 3.194988152313236e-06, "loss": 0.80087811, "num_input_tokens_seen": 113331140, "step": 5277, "time_per_iteration": 2.7192864418029785 }, { "auxiliary_loss_clip": 0.01113098, "auxiliary_loss_mlp": 0.01054598, "balance_loss_clip": 1.04708886, "balance_loss_mlp": 1.03432024, "epoch": 0.3173305275815422, "flos": 17858520731520.0, "grad_norm": 2.071832444797603, "language_loss": 0.79029107, "learning_rate": 3.1946758307247878e-06, "loss": 0.81196797, "num_input_tokens_seen": 113350030, "step": 5278, "time_per_iteration": 2.606973648071289 }, { "auxiliary_loss_clip": 0.01041198, "auxiliary_loss_mlp": 0.01006121, "balance_loss_clip": 1.02207565, "balance_loss_mlp": 1.00391531, "epoch": 0.31739065083421014, "flos": 59973476883840.0, "grad_norm": 0.8783580735908582, "language_loss": 0.62817574, "learning_rate": 3.1943634638341114e-06, "loss": 0.64864898, "num_input_tokens_seen": 113395820, "step": 5279, "time_per_iteration": 2.998594284057617 }, { "auxiliary_loss_clip": 0.01146927, "auxiliary_loss_mlp": 0.01055699, "balance_loss_clip": 1.05080009, "balance_loss_mlp": 1.03651857, "epoch": 0.3174507740868781, "flos": 23800981944960.0, "grad_norm": 1.4881688285488497, "language_loss": 0.80855167, "learning_rate": 3.194051051653053e-06, "loss": 0.83057791, "num_input_tokens_seen": 113416835, "step": 5280, "time_per_iteration": 2.662240743637085 }, { "auxiliary_loss_clip": 0.0110603, "auxiliary_loss_mlp": 0.01050191, "balance_loss_clip": 1.04850507, "balance_loss_mlp": 1.0339663, "epoch": 0.31751089733954607, "flos": 27638899349760.0, "grad_norm": 1.6411021360183768, "language_loss": 0.77964067, "learning_rate": 3.19373859419346e-06, "loss": 0.80120289, "num_input_tokens_seen": 113440850, "step": 5281, "time_per_iteration": 2.8303840160369873 }, { "auxiliary_loss_clip": 0.01119054, "auxiliary_loss_mlp": 0.0103955, "balance_loss_clip": 1.04812443, "balance_loss_mlp": 1.02194262, "epoch": 0.31757102059221404, "flos": 23769273214080.0, "grad_norm": 2.6184534699054116, "language_loss": 0.78539747, "learning_rate": 3.193426091467179e-06, "loss": 0.80698353, "num_input_tokens_seen": 113461000, "step": 5282, "time_per_iteration": 2.75915265083313 }, { "auxiliary_loss_clip": 0.01122553, "auxiliary_loss_mlp": 0.01050996, "balance_loss_clip": 1.0517695, "balance_loss_mlp": 1.03284001, "epoch": 0.317631143844882, "flos": 25264521596160.0, "grad_norm": 1.8901773671102746, "language_loss": 0.67857707, "learning_rate": 3.193113543486061e-06, "loss": 0.70031261, "num_input_tokens_seen": 113480820, "step": 5283, "time_per_iteration": 2.710601329803467 }, { "auxiliary_loss_clip": 0.01039071, "auxiliary_loss_mlp": 0.01003581, "balance_loss_clip": 1.02084279, "balance_loss_mlp": 1.00145948, "epoch": 0.31769126709754997, "flos": 55825939221120.0, "grad_norm": 0.7284643981615322, "language_loss": 0.52787578, "learning_rate": 3.192800950261958e-06, "loss": 0.54830229, "num_input_tokens_seen": 113536910, "step": 5284, "time_per_iteration": 3.1312994956970215 }, { "auxiliary_loss_clip": 0.01123508, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.05256152, "balance_loss_mlp": 1.02529633, "epoch": 0.31775139035021793, "flos": 16690562098560.0, "grad_norm": 1.6358492252526933, "language_loss": 0.70703542, "learning_rate": 3.1924883118067235e-06, "loss": 0.72868699, "num_input_tokens_seen": 113555480, "step": 5285, "time_per_iteration": 2.66414213180542 }, { "auxiliary_loss_clip": 0.01051594, "auxiliary_loss_mlp": 0.01001353, "balance_loss_clip": 1.02112103, "balance_loss_mlp": 0.99919558, "epoch": 0.3178115136028859, "flos": 64227241019520.0, "grad_norm": 0.8795363824150627, "language_loss": 0.60495377, "learning_rate": 3.1921756281322123e-06, "loss": 0.62548316, "num_input_tokens_seen": 113616790, "step": 5286, "time_per_iteration": 3.1636195182800293 }, { "auxiliary_loss_clip": 0.01145219, "auxiliary_loss_mlp": 0.01047411, "balance_loss_clip": 1.05137587, "balance_loss_mlp": 1.02995849, "epoch": 0.31787163685555386, "flos": 18697465762560.0, "grad_norm": 10.257300688850748, "language_loss": 0.72160053, "learning_rate": 3.1918628992502826e-06, "loss": 0.74352682, "num_input_tokens_seen": 113635320, "step": 5287, "time_per_iteration": 2.628863573074341 }, { "auxiliary_loss_clip": 0.01132987, "auxiliary_loss_mlp": 0.0105662, "balance_loss_clip": 1.04966712, "balance_loss_mlp": 1.03823805, "epoch": 0.31793176010822183, "flos": 21324762155520.0, "grad_norm": 2.3229849512265126, "language_loss": 0.75706261, "learning_rate": 3.191550125172792e-06, "loss": 0.77895868, "num_input_tokens_seen": 113654000, "step": 5288, "time_per_iteration": 2.7565319538116455 }, { "auxiliary_loss_clip": 0.01128698, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 1.02223587, "epoch": 0.31799188336088985, "flos": 20958688696320.0, "grad_norm": 3.550043827117326, "language_loss": 0.87827504, "learning_rate": 3.1912373059116007e-06, "loss": 0.89993572, "num_input_tokens_seen": 113672375, "step": 5289, "time_per_iteration": 2.6671485900878906 }, { "auxiliary_loss_clip": 0.01126628, "auxiliary_loss_mlp": 0.01039655, "balance_loss_clip": 1.05225897, "balance_loss_mlp": 1.02443218, "epoch": 0.3180520066135578, "flos": 22491930689280.0, "grad_norm": 1.767762146387748, "language_loss": 0.68103814, "learning_rate": 3.190924441478572e-06, "loss": 0.70270097, "num_input_tokens_seen": 113692385, "step": 5290, "time_per_iteration": 2.6986947059631348 }, { "auxiliary_loss_clip": 0.01120385, "auxiliary_loss_mlp": 0.01046806, "balance_loss_clip": 1.04791737, "balance_loss_mlp": 1.02924609, "epoch": 0.3181121298662258, "flos": 27235335070080.0, "grad_norm": 2.1353951835610303, "language_loss": 0.80298805, "learning_rate": 3.1906115318855687e-06, "loss": 0.82465994, "num_input_tokens_seen": 113712145, "step": 5291, "time_per_iteration": 2.67692494392395 }, { "auxiliary_loss_clip": 0.01112404, "auxiliary_loss_mlp": 0.01038285, "balance_loss_clip": 1.05768418, "balance_loss_mlp": 1.02066636, "epoch": 0.31817225311889374, "flos": 23180158252800.0, "grad_norm": 4.0426741537939614, "language_loss": 0.79877901, "learning_rate": 3.1902985771444577e-06, "loss": 0.82028592, "num_input_tokens_seen": 113731435, "step": 5292, "time_per_iteration": 2.8386974334716797 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.05076253, "balance_loss_mlp": 1.0233407, "epoch": 0.3182323763715617, "flos": 23258803080960.0, "grad_norm": 1.5696258430885255, "language_loss": 0.74754488, "learning_rate": 3.1899855772671043e-06, "loss": 0.7691924, "num_input_tokens_seen": 113750825, "step": 5293, "time_per_iteration": 2.651566982269287 }, { "auxiliary_loss_clip": 0.01129161, "auxiliary_loss_mlp": 0.01045458, "balance_loss_clip": 1.05253696, "balance_loss_mlp": 1.03027081, "epoch": 0.3182924996242297, "flos": 29016683280000.0, "grad_norm": 1.9205945835079516, "language_loss": 0.74100351, "learning_rate": 3.189672532265379e-06, "loss": 0.76274973, "num_input_tokens_seen": 113770010, "step": 5294, "time_per_iteration": 2.6593024730682373 }, { "auxiliary_loss_clip": 0.01145372, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.05254447, "balance_loss_mlp": 1.02166462, "epoch": 0.31835262287689764, "flos": 20449188230400.0, "grad_norm": 3.618714545146935, "language_loss": 0.76019043, "learning_rate": 3.189359442151152e-06, "loss": 0.78203136, "num_input_tokens_seen": 113788640, "step": 5295, "time_per_iteration": 2.597567558288574 }, { "auxiliary_loss_clip": 0.01110615, "auxiliary_loss_mlp": 0.01046432, "balance_loss_clip": 1.04994202, "balance_loss_mlp": 1.02979052, "epoch": 0.3184127461295656, "flos": 25119478477440.0, "grad_norm": 2.278908740959458, "language_loss": 0.69146252, "learning_rate": 3.189046306936296e-06, "loss": 0.71303296, "num_input_tokens_seen": 113809515, "step": 5296, "time_per_iteration": 4.286029100418091 }, { "auxiliary_loss_clip": 0.01115954, "auxiliary_loss_mlp": 0.01043279, "balance_loss_clip": 1.04866266, "balance_loss_mlp": 1.02709007, "epoch": 0.31847286938223357, "flos": 25551231955200.0, "grad_norm": 1.7786470593469696, "language_loss": 0.77374327, "learning_rate": 3.1887331266326846e-06, "loss": 0.79533565, "num_input_tokens_seen": 113829770, "step": 5297, "time_per_iteration": 4.164870023727417 }, { "auxiliary_loss_clip": 0.0111312, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.05341816, "balance_loss_mlp": 1.01857328, "epoch": 0.31853299263490154, "flos": 27782470010880.0, "grad_norm": 2.4185702861431104, "language_loss": 0.79294181, "learning_rate": 3.1884199012521942e-06, "loss": 0.81443709, "num_input_tokens_seen": 113849320, "step": 5298, "time_per_iteration": 2.761035919189453 }, { "auxiliary_loss_clip": 0.01127152, "auxiliary_loss_mlp": 0.01052383, "balance_loss_clip": 1.05250955, "balance_loss_mlp": 1.0361588, "epoch": 0.3185931158875695, "flos": 22706747976960.0, "grad_norm": 2.109744523678234, "language_loss": 0.74082595, "learning_rate": 3.1881066308067016e-06, "loss": 0.76262128, "num_input_tokens_seen": 113867860, "step": 5299, "time_per_iteration": 2.6674296855926514 }, { "auxiliary_loss_clip": 0.01133842, "auxiliary_loss_mlp": 0.01048899, "balance_loss_clip": 1.05652189, "balance_loss_mlp": 1.03213775, "epoch": 0.31865323914023747, "flos": 24571517523840.0, "grad_norm": 2.0125699214837627, "language_loss": 0.78636098, "learning_rate": 3.1877933153080873e-06, "loss": 0.80818832, "num_input_tokens_seen": 113886375, "step": 5300, "time_per_iteration": 2.721202850341797 }, { "auxiliary_loss_clip": 0.01119633, "auxiliary_loss_mlp": 0.01050293, "balance_loss_clip": 1.04830885, "balance_loss_mlp": 1.03297138, "epoch": 0.31871336239290543, "flos": 18186564666240.0, "grad_norm": 1.8639511619571896, "language_loss": 0.83660495, "learning_rate": 3.1874799547682304e-06, "loss": 0.8583042, "num_input_tokens_seen": 113904065, "step": 5301, "time_per_iteration": 4.22704291343689 }, { "auxiliary_loss_clip": 0.01131996, "auxiliary_loss_mlp": 0.01049945, "balance_loss_clip": 1.05371821, "balance_loss_mlp": 1.03263569, "epoch": 0.31877348564557345, "flos": 21826756679040.0, "grad_norm": 2.3173946845583444, "language_loss": 0.77328432, "learning_rate": 3.187166549199015e-06, "loss": 0.79510373, "num_input_tokens_seen": 113918415, "step": 5302, "time_per_iteration": 2.6678919792175293 }, { "auxiliary_loss_clip": 0.011364, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.04891157, "balance_loss_mlp": 1.02270818, "epoch": 0.3188336088982414, "flos": 22015252275840.0, "grad_norm": 2.352282677018458, "language_loss": 0.79816842, "learning_rate": 3.1868530986123255e-06, "loss": 0.81993073, "num_input_tokens_seen": 113938135, "step": 5303, "time_per_iteration": 4.289660453796387 }, { "auxiliary_loss_clip": 0.0113563, "auxiliary_loss_mlp": 0.01045445, "balance_loss_clip": 1.05256605, "balance_loss_mlp": 1.02739668, "epoch": 0.3188937321509094, "flos": 20047886507520.0, "grad_norm": 2.03328242361333, "language_loss": 0.72914493, "learning_rate": 3.186539603020047e-06, "loss": 0.7509557, "num_input_tokens_seen": 113957125, "step": 5304, "time_per_iteration": 2.6123225688934326 }, { "auxiliary_loss_clip": 0.01106707, "auxiliary_loss_mlp": 0.01038113, "balance_loss_clip": 1.04701817, "balance_loss_mlp": 1.02234125, "epoch": 0.31895385540357735, "flos": 25848105863040.0, "grad_norm": 2.816339992135166, "language_loss": 0.71918428, "learning_rate": 3.186226062434068e-06, "loss": 0.74063241, "num_input_tokens_seen": 113974875, "step": 5305, "time_per_iteration": 2.7341108322143555 }, { "auxiliary_loss_clip": 0.01120594, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.05007052, "balance_loss_mlp": 1.0271126, "epoch": 0.3190139786562453, "flos": 23477714519040.0, "grad_norm": 2.1368418928112067, "language_loss": 0.64082253, "learning_rate": 3.1859124768662778e-06, "loss": 0.66245496, "num_input_tokens_seen": 113994450, "step": 5306, "time_per_iteration": 2.678497791290283 }, { "auxiliary_loss_clip": 0.01113987, "auxiliary_loss_mlp": 0.01046306, "balance_loss_clip": 1.04777002, "balance_loss_mlp": 1.02913976, "epoch": 0.3190741019089133, "flos": 29095543589760.0, "grad_norm": 2.249856956834014, "language_loss": 0.7981708, "learning_rate": 3.1855988463285678e-06, "loss": 0.81977379, "num_input_tokens_seen": 114013945, "step": 5307, "time_per_iteration": 2.684825897216797 }, { "auxiliary_loss_clip": 0.01110939, "auxiliary_loss_mlp": 0.01046246, "balance_loss_clip": 1.04708028, "balance_loss_mlp": 1.02869821, "epoch": 0.31913422516158124, "flos": 17129534209920.0, "grad_norm": 1.891192054321282, "language_loss": 0.77413881, "learning_rate": 3.1852851708328308e-06, "loss": 0.79571068, "num_input_tokens_seen": 114031375, "step": 5308, "time_per_iteration": 2.62485408782959 }, { "auxiliary_loss_clip": 0.01142071, "auxiliary_loss_mlp": 0.01050679, "balance_loss_clip": 1.05399549, "balance_loss_mlp": 1.03109312, "epoch": 0.3191943484142492, "flos": 16069846147200.0, "grad_norm": 3.6914677983836586, "language_loss": 0.73960984, "learning_rate": 3.184971450390961e-06, "loss": 0.76153737, "num_input_tokens_seen": 114048465, "step": 5309, "time_per_iteration": 2.6268463134765625 }, { "auxiliary_loss_clip": 0.01134349, "auxiliary_loss_mlp": 0.01035267, "balance_loss_clip": 1.05286658, "balance_loss_mlp": 1.01932931, "epoch": 0.3192544716669172, "flos": 22966166977920.0, "grad_norm": 1.9182514579370458, "language_loss": 0.82652342, "learning_rate": 3.184657685014856e-06, "loss": 0.84821963, "num_input_tokens_seen": 114068415, "step": 5310, "time_per_iteration": 2.649099111557007 }, { "auxiliary_loss_clip": 0.01116653, "auxiliary_loss_mlp": 0.01039176, "balance_loss_clip": 1.04808259, "balance_loss_mlp": 1.02340484, "epoch": 0.31931459491958514, "flos": 26870339018880.0, "grad_norm": 2.200225110342558, "language_loss": 0.78296745, "learning_rate": 3.184343874716412e-06, "loss": 0.80452585, "num_input_tokens_seen": 114088565, "step": 5311, "time_per_iteration": 2.7054250240325928 }, { "auxiliary_loss_clip": 0.01106724, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.04822886, "balance_loss_mlp": 1.01952648, "epoch": 0.3193747181722531, "flos": 21836525178240.0, "grad_norm": 2.0057857548781883, "language_loss": 0.84169972, "learning_rate": 3.1840300195075295e-06, "loss": 0.86313581, "num_input_tokens_seen": 114107160, "step": 5312, "time_per_iteration": 2.749263048171997 }, { "auxiliary_loss_clip": 0.01093899, "auxiliary_loss_mlp": 0.01053441, "balance_loss_clip": 1.04266024, "balance_loss_mlp": 1.03477311, "epoch": 0.31943484142492107, "flos": 18324999682560.0, "grad_norm": 3.6700749085790063, "language_loss": 0.78648412, "learning_rate": 3.1837161194001102e-06, "loss": 0.80795753, "num_input_tokens_seen": 114123420, "step": 5313, "time_per_iteration": 2.720930814743042 }, { "auxiliary_loss_clip": 0.01130677, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.05141878, "balance_loss_mlp": 1.0219605, "epoch": 0.31949496467758903, "flos": 21615818060160.0, "grad_norm": 2.386195329240294, "language_loss": 0.86217451, "learning_rate": 3.183402174406057e-06, "loss": 0.88386285, "num_input_tokens_seen": 114139230, "step": 5314, "time_per_iteration": 2.6785764694213867 }, { "auxiliary_loss_clip": 0.01116655, "auxiliary_loss_mlp": 0.01050856, "balance_loss_clip": 1.04983997, "balance_loss_mlp": 1.03231871, "epoch": 0.31955508793025705, "flos": 21760214734080.0, "grad_norm": 1.996028492072791, "language_loss": 0.79866767, "learning_rate": 3.1830881845372747e-06, "loss": 0.82034278, "num_input_tokens_seen": 114159290, "step": 5315, "time_per_iteration": 2.723097085952759 }, { "auxiliary_loss_clip": 0.0110521, "auxiliary_loss_mlp": 0.01063258, "balance_loss_clip": 1.04667854, "balance_loss_mlp": 1.04386258, "epoch": 0.319615211182925, "flos": 17164331510400.0, "grad_norm": 2.2633227615123275, "language_loss": 0.67312729, "learning_rate": 3.18277414980567e-06, "loss": 0.69481194, "num_input_tokens_seen": 114177655, "step": 5316, "time_per_iteration": 2.7841827869415283 }, { "auxiliary_loss_clip": 0.01131119, "auxiliary_loss_mlp": 0.01046731, "balance_loss_clip": 1.05015874, "balance_loss_mlp": 1.03126907, "epoch": 0.319675334435593, "flos": 28112812416000.0, "grad_norm": 1.540647016415601, "language_loss": 0.69375229, "learning_rate": 3.1824600702231515e-06, "loss": 0.71553081, "num_input_tokens_seen": 114200880, "step": 5317, "time_per_iteration": 2.7080705165863037 }, { "auxiliary_loss_clip": 0.01036788, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.02571428, "balance_loss_mlp": 1.03117692, "epoch": 0.31973545768826095, "flos": 69501119408640.0, "grad_norm": 0.7974882454120521, "language_loss": 0.53049421, "learning_rate": 3.182145945801628e-06, "loss": 0.55119646, "num_input_tokens_seen": 114267145, "step": 5318, "time_per_iteration": 3.5072765350341797 }, { "auxiliary_loss_clip": 0.0114058, "auxiliary_loss_mlp": 0.01041014, "balance_loss_clip": 1.05322218, "balance_loss_mlp": 1.02509975, "epoch": 0.3197955809409289, "flos": 13699203408000.0, "grad_norm": 3.679429868734815, "language_loss": 0.84239668, "learning_rate": 3.181831776553012e-06, "loss": 0.86421257, "num_input_tokens_seen": 114284630, "step": 5319, "time_per_iteration": 2.6148228645324707 }, { "auxiliary_loss_clip": 0.0112589, "auxiliary_loss_mlp": 0.01041338, "balance_loss_clip": 1.04876614, "balance_loss_mlp": 1.02552485, "epoch": 0.3198557041935969, "flos": 33218124278400.0, "grad_norm": 1.684363339069699, "language_loss": 0.63463295, "learning_rate": 3.1815175624892165e-06, "loss": 0.65630519, "num_input_tokens_seen": 114305830, "step": 5320, "time_per_iteration": 2.7444913387298584 }, { "auxiliary_loss_clip": 0.01120865, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.05072045, "balance_loss_mlp": 1.02682114, "epoch": 0.31991582744626484, "flos": 23732033788800.0, "grad_norm": 2.113040492667506, "language_loss": 0.70552826, "learning_rate": 3.1812033036221567e-06, "loss": 0.72716618, "num_input_tokens_seen": 114325165, "step": 5321, "time_per_iteration": 2.7078404426574707 }, { "auxiliary_loss_clip": 0.01151862, "auxiliary_loss_mlp": 0.00776802, "balance_loss_clip": 1.05639851, "balance_loss_mlp": 1.00126243, "epoch": 0.3199759506989328, "flos": 18550842445440.0, "grad_norm": 2.699319417691227, "language_loss": 0.8659147, "learning_rate": 3.180888999963749e-06, "loss": 0.88520133, "num_input_tokens_seen": 114341310, "step": 5322, "time_per_iteration": 2.5562047958374023 }, { "auxiliary_loss_clip": 0.01119411, "auxiliary_loss_mlp": 0.01038951, "balance_loss_clip": 1.05106568, "balance_loss_mlp": 1.02265561, "epoch": 0.3200360739516008, "flos": 22418888382720.0, "grad_norm": 1.7451682184714292, "language_loss": 0.83021653, "learning_rate": 3.1805746515259123e-06, "loss": 0.85180014, "num_input_tokens_seen": 114360355, "step": 5323, "time_per_iteration": 2.6323180198669434 }, { "auxiliary_loss_clip": 0.01129356, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.05092812, "balance_loss_mlp": 1.02440214, "epoch": 0.32009619720426874, "flos": 20595236929920.0, "grad_norm": 1.6785162629315, "language_loss": 0.77686846, "learning_rate": 3.1802602583205663e-06, "loss": 0.79857814, "num_input_tokens_seen": 114379220, "step": 5324, "time_per_iteration": 2.6361289024353027 }, { "auxiliary_loss_clip": 0.01115575, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.04754376, "balance_loss_mlp": 1.01861751, "epoch": 0.3201563204569367, "flos": 18147637301760.0, "grad_norm": 1.9010400542588533, "language_loss": 0.80500418, "learning_rate": 3.1799458203596333e-06, "loss": 0.82651764, "num_input_tokens_seen": 114396365, "step": 5325, "time_per_iteration": 2.681349277496338 }, { "auxiliary_loss_clip": 0.01133585, "auxiliary_loss_mlp": 0.01039966, "balance_loss_clip": 1.05378425, "balance_loss_mlp": 1.02394414, "epoch": 0.32021644370960467, "flos": 31684235840640.0, "grad_norm": 1.7412856997403743, "language_loss": 0.74817789, "learning_rate": 3.179631337655037e-06, "loss": 0.76991343, "num_input_tokens_seen": 114416780, "step": 5326, "time_per_iteration": 2.6932616233825684 }, { "auxiliary_loss_clip": 0.01103829, "auxiliary_loss_mlp": 0.0104309, "balance_loss_clip": 1.05045807, "balance_loss_mlp": 1.02659154, "epoch": 0.32027656696227264, "flos": 26865921646080.0, "grad_norm": 1.642662123916105, "language_loss": 0.80796289, "learning_rate": 3.179316810218701e-06, "loss": 0.82943213, "num_input_tokens_seen": 114437405, "step": 5327, "time_per_iteration": 2.7527899742126465 }, { "auxiliary_loss_clip": 0.01115203, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.05185604, "balance_loss_mlp": 1.02162015, "epoch": 0.32033669021494066, "flos": 24169928492160.0, "grad_norm": 1.846540372387515, "language_loss": 0.77796161, "learning_rate": 3.179002238062554e-06, "loss": 0.79949659, "num_input_tokens_seen": 114458505, "step": 5328, "time_per_iteration": 2.7631096839904785 }, { "auxiliary_loss_clip": 0.01087281, "auxiliary_loss_mlp": 0.01043102, "balance_loss_clip": 1.0453198, "balance_loss_mlp": 1.0245527, "epoch": 0.3203968134676086, "flos": 24460768915200.0, "grad_norm": 1.6837826518335735, "language_loss": 0.74184239, "learning_rate": 3.178687621198524e-06, "loss": 0.76314622, "num_input_tokens_seen": 114479050, "step": 5329, "time_per_iteration": 2.7749221324920654 }, { "auxiliary_loss_clip": 0.01110066, "auxiliary_loss_mlp": 0.01036662, "balance_loss_clip": 1.04650402, "balance_loss_mlp": 1.02133203, "epoch": 0.3204569367202766, "flos": 18004713085440.0, "grad_norm": 1.7163505659405243, "language_loss": 0.71138644, "learning_rate": 3.1783729596385415e-06, "loss": 0.73285371, "num_input_tokens_seen": 114497415, "step": 5330, "time_per_iteration": 2.655578136444092 }, { "auxiliary_loss_clip": 0.01093261, "auxiliary_loss_mlp": 0.01053955, "balance_loss_clip": 1.05082417, "balance_loss_mlp": 1.03379714, "epoch": 0.32051705997294455, "flos": 30589678650240.0, "grad_norm": 1.6854796065505788, "language_loss": 0.80175424, "learning_rate": 3.1780582533945376e-06, "loss": 0.82322645, "num_input_tokens_seen": 114518785, "step": 5331, "time_per_iteration": 2.851639747619629 }, { "auxiliary_loss_clip": 0.01040347, "auxiliary_loss_mlp": 0.01008357, "balance_loss_clip": 1.02573299, "balance_loss_mlp": 1.0059495, "epoch": 0.3205771832256125, "flos": 68417979765120.0, "grad_norm": 0.8321512232204817, "language_loss": 0.57821107, "learning_rate": 3.177743502478447e-06, "loss": 0.59869808, "num_input_tokens_seen": 114577710, "step": 5332, "time_per_iteration": 3.1104307174682617 }, { "auxiliary_loss_clip": 0.01104131, "auxiliary_loss_mlp": 0.01038271, "balance_loss_clip": 1.04842329, "balance_loss_mlp": 1.02194548, "epoch": 0.3206373064782805, "flos": 30443953173120.0, "grad_norm": 1.7127909178457088, "language_loss": 0.72918129, "learning_rate": 3.177428706902205e-06, "loss": 0.75060534, "num_input_tokens_seen": 114598640, "step": 5333, "time_per_iteration": 2.7683963775634766 }, { "auxiliary_loss_clip": 0.01118957, "auxiliary_loss_mlp": 0.01043487, "balance_loss_clip": 1.04778981, "balance_loss_mlp": 1.02685761, "epoch": 0.32069742973094845, "flos": 22054502862720.0, "grad_norm": 2.1728626414536767, "language_loss": 0.70592654, "learning_rate": 3.1771138666777485e-06, "loss": 0.72755098, "num_input_tokens_seen": 114618780, "step": 5334, "time_per_iteration": 2.6861116886138916 }, { "auxiliary_loss_clip": 0.01100969, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.04742825, "balance_loss_mlp": 1.02536023, "epoch": 0.3207575529836164, "flos": 22054000072320.0, "grad_norm": 2.526978692505362, "language_loss": 0.77161503, "learning_rate": 3.1767989818170156e-06, "loss": 0.79304117, "num_input_tokens_seen": 114637525, "step": 5335, "time_per_iteration": 4.33164381980896 }, { "auxiliary_loss_clip": 0.01130469, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.05087018, "balance_loss_mlp": 1.02213204, "epoch": 0.3208176762362844, "flos": 34057536186240.0, "grad_norm": 1.6997548644452432, "language_loss": 0.68414462, "learning_rate": 3.1764840523319477e-06, "loss": 0.7058323, "num_input_tokens_seen": 114659705, "step": 5336, "time_per_iteration": 2.840373992919922 }, { "auxiliary_loss_clip": 0.01102432, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.04495001, "balance_loss_mlp": 1.03862596, "epoch": 0.32087779948895234, "flos": 21798711135360.0, "grad_norm": 1.733261513029939, "language_loss": 0.78828537, "learning_rate": 3.176169078234487e-06, "loss": 0.8098622, "num_input_tokens_seen": 114678340, "step": 5337, "time_per_iteration": 4.268811464309692 }, { "auxiliary_loss_clip": 0.01121282, "auxiliary_loss_mlp": 0.01039712, "balance_loss_clip": 1.04696417, "balance_loss_mlp": 1.02512085, "epoch": 0.3209379227416203, "flos": 21434110133760.0, "grad_norm": 2.1583979373304194, "language_loss": 0.74322718, "learning_rate": 3.1758540595365766e-06, "loss": 0.76483715, "num_input_tokens_seen": 114696980, "step": 5338, "time_per_iteration": 2.6442766189575195 }, { "auxiliary_loss_clip": 0.01119062, "auxiliary_loss_mlp": 0.01047297, "balance_loss_clip": 1.04633641, "balance_loss_mlp": 1.03078675, "epoch": 0.3209980459942883, "flos": 25849075530240.0, "grad_norm": 2.118549362741933, "language_loss": 0.62622869, "learning_rate": 3.1755389962501626e-06, "loss": 0.64789224, "num_input_tokens_seen": 114717330, "step": 5339, "time_per_iteration": 2.684843063354492 }, { "auxiliary_loss_clip": 0.01141698, "auxiliary_loss_mlp": 0.01046177, "balance_loss_clip": 1.05127931, "balance_loss_mlp": 1.02954674, "epoch": 0.32105816924695624, "flos": 19099162535040.0, "grad_norm": 2.480509085809345, "language_loss": 0.81685597, "learning_rate": 3.175223888387192e-06, "loss": 0.83873475, "num_input_tokens_seen": 114736320, "step": 5340, "time_per_iteration": 4.130942344665527 }, { "auxiliary_loss_clip": 0.01110441, "auxiliary_loss_mlp": 0.01050741, "balance_loss_clip": 1.04820514, "balance_loss_mlp": 1.03462362, "epoch": 0.3211182924996242, "flos": 16581860565120.0, "grad_norm": 2.326860742494733, "language_loss": 0.76571834, "learning_rate": 3.1749087359596137e-06, "loss": 0.78733015, "num_input_tokens_seen": 114754575, "step": 5341, "time_per_iteration": 2.7302300930023193 }, { "auxiliary_loss_clip": 0.01101828, "auxiliary_loss_mlp": 0.01044591, "balance_loss_clip": 1.04797173, "balance_loss_mlp": 1.02840281, "epoch": 0.3211784157522922, "flos": 22672202071680.0, "grad_norm": 1.680960149410583, "language_loss": 0.79268491, "learning_rate": 3.1745935389793786e-06, "loss": 0.81414914, "num_input_tokens_seen": 114773590, "step": 5342, "time_per_iteration": 4.462036609649658 }, { "auxiliary_loss_clip": 0.01118478, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.05000186, "balance_loss_mlp": 1.02876329, "epoch": 0.3212385390049602, "flos": 20558787603840.0, "grad_norm": 3.232512085646521, "language_loss": 0.74449253, "learning_rate": 3.174278297458438e-06, "loss": 0.76613677, "num_input_tokens_seen": 114790775, "step": 5343, "time_per_iteration": 2.7057244777679443 }, { "auxiliary_loss_clip": 0.01080228, "auxiliary_loss_mlp": 0.0104431, "balance_loss_clip": 1.04317784, "balance_loss_mlp": 1.02704811, "epoch": 0.32129866225762815, "flos": 24791147233920.0, "grad_norm": 1.672847320129023, "language_loss": 0.82661629, "learning_rate": 3.173963011408748e-06, "loss": 0.84786165, "num_input_tokens_seen": 114809835, "step": 5344, "time_per_iteration": 2.801013231277466 }, { "auxiliary_loss_clip": 0.01088811, "auxiliary_loss_mlp": 0.01042568, "balance_loss_clip": 1.04556143, "balance_loss_mlp": 1.02565217, "epoch": 0.3213587855102961, "flos": 18366871962240.0, "grad_norm": 22.33494793204904, "language_loss": 0.79863501, "learning_rate": 3.173647680842262e-06, "loss": 0.81994879, "num_input_tokens_seen": 114826505, "step": 5345, "time_per_iteration": 2.743778944015503 }, { "auxiliary_loss_clip": 0.01114864, "auxiliary_loss_mlp": 0.01041047, "balance_loss_clip": 1.04774046, "balance_loss_mlp": 1.02507281, "epoch": 0.3214189087629641, "flos": 27015992668800.0, "grad_norm": 2.095379605818748, "language_loss": 0.83340824, "learning_rate": 3.1733323057709384e-06, "loss": 0.85496742, "num_input_tokens_seen": 114846140, "step": 5346, "time_per_iteration": 2.8187026977539062 }, { "auxiliary_loss_clip": 0.01110187, "auxiliary_loss_mlp": 0.01045041, "balance_loss_clip": 1.04783988, "balance_loss_mlp": 1.02797008, "epoch": 0.32147903201563205, "flos": 23148269953920.0, "grad_norm": 1.6371928172660764, "language_loss": 0.81853002, "learning_rate": 3.1730168862067366e-06, "loss": 0.84008235, "num_input_tokens_seen": 114866660, "step": 5347, "time_per_iteration": 2.724003553390503 }, { "auxiliary_loss_clip": 0.0112676, "auxiliary_loss_mlp": 0.01047135, "balance_loss_clip": 1.048388, "balance_loss_mlp": 1.02891994, "epoch": 0.3215391552683, "flos": 16580747243520.0, "grad_norm": 4.152516057334243, "language_loss": 0.80263776, "learning_rate": 3.1727014221616164e-06, "loss": 0.8243767, "num_input_tokens_seen": 114882820, "step": 5348, "time_per_iteration": 2.6249122619628906 }, { "auxiliary_loss_clip": 0.01113488, "auxiliary_loss_mlp": 0.0105622, "balance_loss_clip": 1.04640627, "balance_loss_mlp": 1.03931606, "epoch": 0.321599278520968, "flos": 17821820010240.0, "grad_norm": 2.570277900111974, "language_loss": 0.85020632, "learning_rate": 3.172385913647542e-06, "loss": 0.87190342, "num_input_tokens_seen": 114900745, "step": 5349, "time_per_iteration": 2.6685211658477783 }, { "auxiliary_loss_clip": 0.01113139, "auxiliary_loss_mlp": 0.0104332, "balance_loss_clip": 1.04840457, "balance_loss_mlp": 1.02644002, "epoch": 0.32165940177363594, "flos": 16251769555200.0, "grad_norm": 2.7209437086115282, "language_loss": 0.80619532, "learning_rate": 3.172070360676475e-06, "loss": 0.82775992, "num_input_tokens_seen": 114917940, "step": 5350, "time_per_iteration": 2.6857874393463135 }, { "auxiliary_loss_clip": 0.01128309, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.05025196, "balance_loss_mlp": 1.02955103, "epoch": 0.3217195250263039, "flos": 27599900158080.0, "grad_norm": 5.5112684101117395, "language_loss": 0.80060112, "learning_rate": 3.1717547632603828e-06, "loss": 0.82233858, "num_input_tokens_seen": 114937735, "step": 5351, "time_per_iteration": 2.68406081199646 }, { "auxiliary_loss_clip": 0.01104774, "auxiliary_loss_mlp": 0.01045518, "balance_loss_clip": 1.04905438, "balance_loss_mlp": 1.02811348, "epoch": 0.3217796482789719, "flos": 21470595373440.0, "grad_norm": 2.189681121413186, "language_loss": 0.75826663, "learning_rate": 3.1714391214112326e-06, "loss": 0.7797696, "num_input_tokens_seen": 114956630, "step": 5352, "time_per_iteration": 2.7035396099090576 }, { "auxiliary_loss_clip": 0.0109763, "auxiliary_loss_mlp": 0.01043305, "balance_loss_clip": 1.04897571, "balance_loss_mlp": 1.02579308, "epoch": 0.32183977153163984, "flos": 21215593745280.0, "grad_norm": 2.4508783518814807, "language_loss": 0.81992233, "learning_rate": 3.1711234351409933e-06, "loss": 0.84133166, "num_input_tokens_seen": 114976470, "step": 5353, "time_per_iteration": 2.731339931488037 }, { "auxiliary_loss_clip": 0.01074627, "auxiliary_loss_mlp": 0.0104331, "balance_loss_clip": 1.04917347, "balance_loss_mlp": 1.02605999, "epoch": 0.3218998947843078, "flos": 24608182331520.0, "grad_norm": 2.2390857397461246, "language_loss": 0.73474252, "learning_rate": 3.1708077044616365e-06, "loss": 0.75592184, "num_input_tokens_seen": 114996710, "step": 5354, "time_per_iteration": 2.8337595462799072 }, { "auxiliary_loss_clip": 0.01103547, "auxiliary_loss_mlp": 0.01039731, "balance_loss_clip": 1.04475546, "balance_loss_mlp": 1.02428102, "epoch": 0.3219600180369758, "flos": 22270577126400.0, "grad_norm": 1.8690515367544651, "language_loss": 0.83792925, "learning_rate": 3.1704919293851334e-06, "loss": 0.85936201, "num_input_tokens_seen": 115015775, "step": 5355, "time_per_iteration": 2.7299652099609375 }, { "auxiliary_loss_clip": 0.01146025, "auxiliary_loss_mlp": 0.01046795, "balance_loss_clip": 1.05450225, "balance_loss_mlp": 1.03032064, "epoch": 0.3220201412896438, "flos": 14939126939520.0, "grad_norm": 1.9705527058452093, "language_loss": 0.70895493, "learning_rate": 3.1701761099234597e-06, "loss": 0.73088312, "num_input_tokens_seen": 115034265, "step": 5356, "time_per_iteration": 2.638268232345581 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.04954576, "balance_loss_mlp": 1.02245283, "epoch": 0.32208026454231176, "flos": 22667389649280.0, "grad_norm": 2.5241040535813095, "language_loss": 0.67760962, "learning_rate": 3.1698602460885903e-06, "loss": 0.69903815, "num_input_tokens_seen": 115051945, "step": 5357, "time_per_iteration": 2.7816576957702637 }, { "auxiliary_loss_clip": 0.01037625, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.0279882, "balance_loss_mlp": 1.02722347, "epoch": 0.3221403877949797, "flos": 64605130053120.0, "grad_norm": 0.7244200234208643, "language_loss": 0.58319688, "learning_rate": 3.1695443378925035e-06, "loss": 0.60386384, "num_input_tokens_seen": 115119090, "step": 5358, "time_per_iteration": 3.3341448307037354 }, { "auxiliary_loss_clip": 0.01076802, "auxiliary_loss_mlp": 0.01044493, "balance_loss_clip": 1.04142976, "balance_loss_mlp": 1.0270052, "epoch": 0.3222005110476477, "flos": 20157019004160.0, "grad_norm": 2.2322811787478427, "language_loss": 0.83184302, "learning_rate": 3.1692283853471777e-06, "loss": 0.85305595, "num_input_tokens_seen": 115137755, "step": 5359, "time_per_iteration": 2.836543083190918 }, { "auxiliary_loss_clip": 0.01129966, "auxiliary_loss_mlp": 0.01035598, "balance_loss_clip": 1.04800034, "balance_loss_mlp": 1.01938617, "epoch": 0.32226063430031565, "flos": 22674177319680.0, "grad_norm": 2.0261007556732964, "language_loss": 0.79563689, "learning_rate": 3.168912388464595e-06, "loss": 0.81729257, "num_input_tokens_seen": 115158150, "step": 5360, "time_per_iteration": 2.66043758392334 }, { "auxiliary_loss_clip": 0.01045199, "auxiliary_loss_mlp": 0.01009155, "balance_loss_clip": 1.02352595, "balance_loss_mlp": 1.00706911, "epoch": 0.3223207575529836, "flos": 63828525075840.0, "grad_norm": 0.6569282603798298, "language_loss": 0.56928504, "learning_rate": 3.168596347256737e-06, "loss": 0.58982855, "num_input_tokens_seen": 115212755, "step": 5361, "time_per_iteration": 3.007119655609131 }, { "auxiliary_loss_clip": 0.01078785, "auxiliary_loss_mlp": 0.01049092, "balance_loss_clip": 1.04366553, "balance_loss_mlp": 1.03166366, "epoch": 0.3223808808056516, "flos": 26870123537280.0, "grad_norm": 3.2787914187636495, "language_loss": 0.71563178, "learning_rate": 3.168280261735588e-06, "loss": 0.73691058, "num_input_tokens_seen": 115233090, "step": 5362, "time_per_iteration": 2.8345048427581787 }, { "auxiliary_loss_clip": 0.0112485, "auxiliary_loss_mlp": 0.01053523, "balance_loss_clip": 1.04899716, "balance_loss_mlp": 1.03670287, "epoch": 0.32244100405831955, "flos": 26761350176640.0, "grad_norm": 2.1292104037374773, "language_loss": 0.74106693, "learning_rate": 3.167964131913135e-06, "loss": 0.76285076, "num_input_tokens_seen": 115252645, "step": 5363, "time_per_iteration": 2.70552659034729 }, { "auxiliary_loss_clip": 0.01134941, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.05024791, "balance_loss_mlp": 1.02637601, "epoch": 0.3225011273109875, "flos": 23803029020160.0, "grad_norm": 3.812297759050374, "language_loss": 0.77379405, "learning_rate": 3.167647957801365e-06, "loss": 0.7955696, "num_input_tokens_seen": 115269085, "step": 5364, "time_per_iteration": 2.66058087348938 }, { "auxiliary_loss_clip": 0.01120766, "auxiliary_loss_mlp": 0.01042612, "balance_loss_clip": 1.05058861, "balance_loss_mlp": 1.02468252, "epoch": 0.3225612505636555, "flos": 17274505501440.0, "grad_norm": 3.514939630870356, "language_loss": 0.76727009, "learning_rate": 3.1673317394122672e-06, "loss": 0.78890389, "num_input_tokens_seen": 115286470, "step": 5365, "time_per_iteration": 2.6493194103240967 }, { "auxiliary_loss_clip": 0.01124156, "auxiliary_loss_mlp": 0.01048476, "balance_loss_clip": 1.05429566, "balance_loss_mlp": 1.03201342, "epoch": 0.32262137381632344, "flos": 23366247638400.0, "grad_norm": 7.419360933702927, "language_loss": 0.76938248, "learning_rate": 3.1670154767578333e-06, "loss": 0.79110885, "num_input_tokens_seen": 115307000, "step": 5366, "time_per_iteration": 2.6984689235687256 }, { "auxiliary_loss_clip": 0.01110868, "auxiliary_loss_mlp": 0.01044399, "balance_loss_clip": 1.04554594, "balance_loss_mlp": 1.02792382, "epoch": 0.3226814970689914, "flos": 23258803080960.0, "grad_norm": 2.2843777844497453, "language_loss": 0.71972823, "learning_rate": 3.166699169850055e-06, "loss": 0.74128091, "num_input_tokens_seen": 115325925, "step": 5367, "time_per_iteration": 2.6944496631622314 }, { "auxiliary_loss_clip": 0.01138096, "auxiliary_loss_mlp": 0.01043716, "balance_loss_clip": 1.05035067, "balance_loss_mlp": 1.0286001, "epoch": 0.32274162032165943, "flos": 16395196561920.0, "grad_norm": 13.04054524246424, "language_loss": 0.74414504, "learning_rate": 3.1663828187009274e-06, "loss": 0.76596308, "num_input_tokens_seen": 115343705, "step": 5368, "time_per_iteration": 2.670567750930786 }, { "auxiliary_loss_clip": 0.01103298, "auxiliary_loss_mlp": 0.01049074, "balance_loss_clip": 1.04370904, "balance_loss_mlp": 1.0322659, "epoch": 0.3228017435743274, "flos": 27855081354240.0, "grad_norm": 1.655769512058306, "language_loss": 0.78693509, "learning_rate": 3.1660664233224467e-06, "loss": 0.80845881, "num_input_tokens_seen": 115364170, "step": 5369, "time_per_iteration": 2.777437448501587 }, { "auxiliary_loss_clip": 0.01099309, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.04874706, "balance_loss_mlp": 1.0222764, "epoch": 0.32286186682699536, "flos": 19608770741760.0, "grad_norm": 13.189929997499553, "language_loss": 0.83189309, "learning_rate": 3.16574998372661e-06, "loss": 0.85326445, "num_input_tokens_seen": 115382495, "step": 5370, "time_per_iteration": 2.734342336654663 }, { "auxiliary_loss_clip": 0.01141788, "auxiliary_loss_mlp": 0.01044735, "balance_loss_clip": 1.05202413, "balance_loss_mlp": 1.0291779, "epoch": 0.3229219900796633, "flos": 24134017870080.0, "grad_norm": 3.3293058605981614, "language_loss": 0.8288244, "learning_rate": 3.1654334999254177e-06, "loss": 0.85068965, "num_input_tokens_seen": 115399450, "step": 5371, "time_per_iteration": 2.620091676712036 }, { "auxiliary_loss_clip": 0.01133164, "auxiliary_loss_mlp": 0.00776239, "balance_loss_clip": 1.05046356, "balance_loss_mlp": 1.00122416, "epoch": 0.3229821133323313, "flos": 17748705876480.0, "grad_norm": 3.1117013800624993, "language_loss": 0.8852632, "learning_rate": 3.1651169719308695e-06, "loss": 0.90435725, "num_input_tokens_seen": 115417700, "step": 5372, "time_per_iteration": 2.673567056655884 }, { "auxiliary_loss_clip": 0.01140269, "auxiliary_loss_mlp": 0.01049295, "balance_loss_clip": 1.05098414, "balance_loss_mlp": 1.03341591, "epoch": 0.32304223658499925, "flos": 22346025644160.0, "grad_norm": 2.7114986433136727, "language_loss": 0.73388374, "learning_rate": 3.1648003997549694e-06, "loss": 0.75577939, "num_input_tokens_seen": 115435840, "step": 5373, "time_per_iteration": 2.6910293102264404 }, { "auxiliary_loss_clip": 0.0110976, "auxiliary_loss_mlp": 0.01044756, "balance_loss_clip": 1.04653084, "balance_loss_mlp": 1.02873468, "epoch": 0.3231023598376672, "flos": 18478302929280.0, "grad_norm": 2.3161305262959573, "language_loss": 0.81114149, "learning_rate": 3.1644837834097214e-06, "loss": 0.83268672, "num_input_tokens_seen": 115454210, "step": 5374, "time_per_iteration": 2.666707992553711 }, { "auxiliary_loss_clip": 0.01095169, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.0438931, "balance_loss_mlp": 1.02254975, "epoch": 0.3231624830903352, "flos": 27636313570560.0, "grad_norm": 2.1309099752285863, "language_loss": 0.87817222, "learning_rate": 3.1641671229071317e-06, "loss": 0.89951062, "num_input_tokens_seen": 115471785, "step": 5375, "time_per_iteration": 4.252593994140625 }, { "auxiliary_loss_clip": 0.01140942, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.04865098, "balance_loss_mlp": 1.01960015, "epoch": 0.32322260634300315, "flos": 21726423014400.0, "grad_norm": 2.12002794330764, "language_loss": 0.75837636, "learning_rate": 3.1638504182592076e-06, "loss": 0.78014749, "num_input_tokens_seen": 115491405, "step": 5376, "time_per_iteration": 2.64569091796875 }, { "auxiliary_loss_clip": 0.01100111, "auxiliary_loss_mlp": 0.01037893, "balance_loss_clip": 1.04745007, "balance_loss_mlp": 1.0227654, "epoch": 0.3232827295956711, "flos": 22637656166400.0, "grad_norm": 16.356053535517315, "language_loss": 0.66570163, "learning_rate": 3.1635336694779594e-06, "loss": 0.68708175, "num_input_tokens_seen": 115511555, "step": 5377, "time_per_iteration": 4.228315591812134 }, { "auxiliary_loss_clip": 0.01103406, "auxiliary_loss_mlp": 0.01059488, "balance_loss_clip": 1.04591548, "balance_loss_mlp": 1.04070055, "epoch": 0.3233428528483391, "flos": 26322593546880.0, "grad_norm": 1.5026052482517693, "language_loss": 0.72276354, "learning_rate": 3.1632168765753982e-06, "loss": 0.74439251, "num_input_tokens_seen": 115532860, "step": 5378, "time_per_iteration": 2.7754812240600586 }, { "auxiliary_loss_clip": 0.0112205, "auxiliary_loss_mlp": 0.0103656, "balance_loss_clip": 1.04869092, "balance_loss_mlp": 1.0214678, "epoch": 0.32340297610100704, "flos": 28585217111040.0, "grad_norm": 2.7898138283200344, "language_loss": 0.82221997, "learning_rate": 3.1629000395635357e-06, "loss": 0.84380603, "num_input_tokens_seen": 115553850, "step": 5379, "time_per_iteration": 2.672743320465088 }, { "auxiliary_loss_clip": 0.01130962, "auxiliary_loss_mlp": 0.01035985, "balance_loss_clip": 1.04864693, "balance_loss_mlp": 1.02083325, "epoch": 0.323463099353675, "flos": 30773792787840.0, "grad_norm": 1.5555457678220286, "language_loss": 0.78895414, "learning_rate": 3.162583158454388e-06, "loss": 0.81062359, "num_input_tokens_seen": 115575530, "step": 5380, "time_per_iteration": 4.130786180496216 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01044026, "balance_loss_clip": 1.04988194, "balance_loss_mlp": 1.0286541, "epoch": 0.32352322260634303, "flos": 25228610974080.0, "grad_norm": 1.7365933554134192, "language_loss": 0.76877856, "learning_rate": 3.1622662332599697e-06, "loss": 0.79046834, "num_input_tokens_seen": 115594885, "step": 5381, "time_per_iteration": 2.6297740936279297 }, { "auxiliary_loss_clip": 0.01122723, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.0485673, "balance_loss_mlp": 1.02333474, "epoch": 0.323583345859011, "flos": 23330480670720.0, "grad_norm": 1.9510545380996942, "language_loss": 0.71868116, "learning_rate": 3.1619492639922998e-06, "loss": 0.7402842, "num_input_tokens_seen": 115614080, "step": 5382, "time_per_iteration": 4.239168167114258 }, { "auxiliary_loss_clip": 0.01114051, "auxiliary_loss_mlp": 0.01051511, "balance_loss_clip": 1.0454843, "balance_loss_mlp": 1.03392792, "epoch": 0.32364346911167896, "flos": 26207499392640.0, "grad_norm": 2.5669193665709815, "language_loss": 0.70947385, "learning_rate": 3.1616322506633964e-06, "loss": 0.73112947, "num_input_tokens_seen": 115632820, "step": 5383, "time_per_iteration": 2.701462507247925 }, { "auxiliary_loss_clip": 0.01123558, "auxiliary_loss_mlp": 0.01038956, "balance_loss_clip": 1.04770291, "balance_loss_mlp": 1.02382779, "epoch": 0.3237035923643469, "flos": 23695764030720.0, "grad_norm": 1.9442688765107798, "language_loss": 0.78333974, "learning_rate": 3.161315193285283e-06, "loss": 0.8049649, "num_input_tokens_seen": 115652860, "step": 5384, "time_per_iteration": 2.6939637660980225 }, { "auxiliary_loss_clip": 0.01078749, "auxiliary_loss_mlp": 0.01050129, "balance_loss_clip": 1.04298878, "balance_loss_mlp": 1.03203273, "epoch": 0.3237637156170149, "flos": 14428728633600.0, "grad_norm": 2.1298780259276575, "language_loss": 0.75396919, "learning_rate": 3.16099809186998e-06, "loss": 0.77525795, "num_input_tokens_seen": 115670940, "step": 5385, "time_per_iteration": 2.7813403606414795 }, { "auxiliary_loss_clip": 0.0111287, "auxiliary_loss_mlp": 0.01040739, "balance_loss_clip": 1.04995322, "balance_loss_mlp": 1.0248363, "epoch": 0.32382383886968286, "flos": 31062981185280.0, "grad_norm": 2.042597717530735, "language_loss": 0.71488941, "learning_rate": 3.1606809464295145e-06, "loss": 0.73642552, "num_input_tokens_seen": 115691155, "step": 5386, "time_per_iteration": 2.754636526107788 }, { "auxiliary_loss_clip": 0.01142583, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.0499016, "balance_loss_mlp": 1.02334547, "epoch": 0.3238839621223508, "flos": 23256935573760.0, "grad_norm": 5.057227062214219, "language_loss": 0.94889075, "learning_rate": 3.1603637569759095e-06, "loss": 0.97071928, "num_input_tokens_seen": 115710340, "step": 5387, "time_per_iteration": 2.6547048091888428 }, { "auxiliary_loss_clip": 0.01133488, "auxiliary_loss_mlp": 0.01044118, "balance_loss_clip": 1.05193102, "balance_loss_mlp": 1.02696419, "epoch": 0.3239440853750188, "flos": 22964658606720.0, "grad_norm": 10.717385990424205, "language_loss": 0.77620786, "learning_rate": 3.1600465235211956e-06, "loss": 0.79798394, "num_input_tokens_seen": 115726745, "step": 5388, "time_per_iteration": 2.657205820083618 }, { "auxiliary_loss_clip": 0.01111832, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.04523969, "balance_loss_mlp": 1.01978493, "epoch": 0.32400420862768675, "flos": 36246614653440.0, "grad_norm": 2.237731185409586, "language_loss": 0.71233571, "learning_rate": 3.1597292460774006e-06, "loss": 0.73382103, "num_input_tokens_seen": 115749385, "step": 5389, "time_per_iteration": 2.799731731414795 }, { "auxiliary_loss_clip": 0.01099836, "auxiliary_loss_mlp": 0.01038996, "balance_loss_clip": 1.04759645, "balance_loss_mlp": 1.02302158, "epoch": 0.3240643318803547, "flos": 21616500418560.0, "grad_norm": 1.8547230503773184, "language_loss": 0.80461568, "learning_rate": 3.159411924656557e-06, "loss": 0.82600403, "num_input_tokens_seen": 115768105, "step": 5390, "time_per_iteration": 2.703913450241089 }, { "auxiliary_loss_clip": 0.01112322, "auxiliary_loss_mlp": 0.01050073, "balance_loss_clip": 1.04881656, "balance_loss_mlp": 1.0330621, "epoch": 0.3241244551330227, "flos": 23295611543040.0, "grad_norm": 4.514534114801655, "language_loss": 0.72674775, "learning_rate": 3.1590945592706967e-06, "loss": 0.74837172, "num_input_tokens_seen": 115787340, "step": 5391, "time_per_iteration": 2.8789660930633545 }, { "auxiliary_loss_clip": 0.01110171, "auxiliary_loss_mlp": 0.01040459, "balance_loss_clip": 1.04422975, "balance_loss_mlp": 1.02517664, "epoch": 0.32418457838569065, "flos": 14097236993280.0, "grad_norm": 2.092129040046021, "language_loss": 0.77347648, "learning_rate": 3.158777149931855e-06, "loss": 0.79498285, "num_input_tokens_seen": 115805565, "step": 5392, "time_per_iteration": 2.6689188480377197 }, { "auxiliary_loss_clip": 0.01112252, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.04517519, "balance_loss_mlp": 1.03289127, "epoch": 0.3242447016383586, "flos": 29752672953600.0, "grad_norm": 1.9207699243041063, "language_loss": 0.62606925, "learning_rate": 3.158459696652067e-06, "loss": 0.6477111, "num_input_tokens_seen": 115826725, "step": 5393, "time_per_iteration": 2.758423328399658 }, { "auxiliary_loss_clip": 0.01122257, "auxiliary_loss_mlp": 0.01043934, "balance_loss_clip": 1.04730856, "balance_loss_mlp": 1.02770925, "epoch": 0.3243048248910266, "flos": 24351205455360.0, "grad_norm": 1.583732116281239, "language_loss": 0.82284617, "learning_rate": 3.158142199443371e-06, "loss": 0.84450811, "num_input_tokens_seen": 115846955, "step": 5394, "time_per_iteration": 2.6715636253356934 }, { "auxiliary_loss_clip": 0.01111969, "auxiliary_loss_mlp": 0.01045824, "balance_loss_clip": 1.04729748, "balance_loss_mlp": 1.03120947, "epoch": 0.3243649481436946, "flos": 24353037048960.0, "grad_norm": 1.873068954405441, "language_loss": 0.817029, "learning_rate": 3.1578246583178076e-06, "loss": 0.83860689, "num_input_tokens_seen": 115865975, "step": 5395, "time_per_iteration": 2.7120518684387207 }, { "auxiliary_loss_clip": 0.01126983, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.0519104, "balance_loss_mlp": 1.02413607, "epoch": 0.32442507139636256, "flos": 22925228451840.0, "grad_norm": 1.8441183317386671, "language_loss": 0.83172363, "learning_rate": 3.157507073287417e-06, "loss": 0.85338825, "num_input_tokens_seen": 115884950, "step": 5396, "time_per_iteration": 2.6589252948760986 }, { "auxiliary_loss_clip": 0.0110371, "auxiliary_loss_mlp": 0.01053141, "balance_loss_clip": 1.04818082, "balance_loss_mlp": 1.03462827, "epoch": 0.32448519464903053, "flos": 22200192426240.0, "grad_norm": 2.3735724483298553, "language_loss": 0.75721765, "learning_rate": 3.1571894443642414e-06, "loss": 0.77878618, "num_input_tokens_seen": 115904170, "step": 5397, "time_per_iteration": 2.7118513584136963 }, { "auxiliary_loss_clip": 0.01104001, "auxiliary_loss_mlp": 0.0104059, "balance_loss_clip": 1.04970932, "balance_loss_mlp": 1.02504468, "epoch": 0.3245453179016985, "flos": 18838450644480.0, "grad_norm": 7.349892433890134, "language_loss": 0.67359912, "learning_rate": 3.1568717715603263e-06, "loss": 0.69504505, "num_input_tokens_seen": 115919255, "step": 5398, "time_per_iteration": 2.690317153930664 }, { "auxiliary_loss_clip": 0.01111486, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.04846239, "balance_loss_mlp": 1.01784301, "epoch": 0.32460544115436646, "flos": 21178390233600.0, "grad_norm": 1.692830304346276, "language_loss": 0.73074687, "learning_rate": 3.156554054887718e-06, "loss": 0.7521975, "num_input_tokens_seen": 115938535, "step": 5399, "time_per_iteration": 2.754539728164673 }, { "auxiliary_loss_clip": 0.01101582, "auxiliary_loss_mlp": 0.01036858, "balance_loss_clip": 1.04522848, "balance_loss_mlp": 1.02056217, "epoch": 0.3246655644070344, "flos": 21981137333760.0, "grad_norm": 2.780796864612311, "language_loss": 0.71580744, "learning_rate": 3.1562362943584645e-06, "loss": 0.7371918, "num_input_tokens_seen": 115955005, "step": 5400, "time_per_iteration": 2.707712173461914 }, { "auxiliary_loss_clip": 0.01127225, "auxiliary_loss_mlp": 0.01040347, "balance_loss_clip": 1.0472424, "balance_loss_mlp": 1.02469516, "epoch": 0.3247256876597024, "flos": 32159729105280.0, "grad_norm": 2.1905750946262805, "language_loss": 0.79769576, "learning_rate": 3.155918489984614e-06, "loss": 0.81937146, "num_input_tokens_seen": 115975305, "step": 5401, "time_per_iteration": 2.7813303470611572 }, { "auxiliary_loss_clip": 0.01109499, "auxiliary_loss_mlp": 0.01041329, "balance_loss_clip": 1.04414558, "balance_loss_mlp": 1.02341187, "epoch": 0.32478581091237035, "flos": 20997544233600.0, "grad_norm": 4.743153882711402, "language_loss": 0.87785316, "learning_rate": 3.1556006417782196e-06, "loss": 0.89936143, "num_input_tokens_seen": 115994810, "step": 5402, "time_per_iteration": 2.7685606479644775 }, { "auxiliary_loss_clip": 0.01078796, "auxiliary_loss_mlp": 0.01044786, "balance_loss_clip": 1.03948891, "balance_loss_mlp": 1.02792931, "epoch": 0.3248459341650383, "flos": 17924990849280.0, "grad_norm": 4.964706141121962, "language_loss": 0.84572911, "learning_rate": 3.155282749751332e-06, "loss": 0.86696494, "num_input_tokens_seen": 116011095, "step": 5403, "time_per_iteration": 2.7299063205718994 }, { "auxiliary_loss_clip": 0.01104053, "auxiliary_loss_mlp": 0.01045074, "balance_loss_clip": 1.04597795, "balance_loss_mlp": 1.03049469, "epoch": 0.3249060574177063, "flos": 24535606901760.0, "grad_norm": 3.7265891750540785, "language_loss": 0.87614954, "learning_rate": 3.154964813916007e-06, "loss": 0.89764082, "num_input_tokens_seen": 116028805, "step": 5404, "time_per_iteration": 2.7740931510925293 }, { "auxiliary_loss_clip": 0.01125798, "auxiliary_loss_mlp": 0.01043439, "balance_loss_clip": 1.04930234, "balance_loss_mlp": 1.02685738, "epoch": 0.32496618067037425, "flos": 25994765093760.0, "grad_norm": 2.5497237434599964, "language_loss": 0.72717422, "learning_rate": 3.1546468342843008e-06, "loss": 0.74886656, "num_input_tokens_seen": 116047765, "step": 5405, "time_per_iteration": 2.6756839752197266 }, { "auxiliary_loss_clip": 0.01098309, "auxiliary_loss_mlp": 0.01039466, "balance_loss_clip": 1.04964566, "balance_loss_mlp": 1.02390265, "epoch": 0.3250263039230422, "flos": 19573757959680.0, "grad_norm": 1.6968031771183532, "language_loss": 0.82927752, "learning_rate": 3.1543288108682707e-06, "loss": 0.8506552, "num_input_tokens_seen": 116068385, "step": 5406, "time_per_iteration": 2.728217124938965 }, { "auxiliary_loss_clip": 0.01136878, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.05117011, "balance_loss_mlp": 1.01728487, "epoch": 0.3250864271757102, "flos": 16763640318720.0, "grad_norm": 1.9312900503750694, "language_loss": 0.87836796, "learning_rate": 3.1540107436799764e-06, "loss": 0.90005869, "num_input_tokens_seen": 116085350, "step": 5407, "time_per_iteration": 2.5519261360168457 }, { "auxiliary_loss_clip": 0.01112002, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.04575169, "balance_loss_mlp": 1.02506793, "epoch": 0.3251465504283782, "flos": 27819458040960.0, "grad_norm": 1.6044550363094983, "language_loss": 0.69804603, "learning_rate": 3.153692632731479e-06, "loss": 0.71957088, "num_input_tokens_seen": 116107560, "step": 5408, "time_per_iteration": 2.7141807079315186 }, { "auxiliary_loss_clip": 0.01131975, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.05021083, "balance_loss_mlp": 1.01977742, "epoch": 0.32520667368104617, "flos": 19063144172160.0, "grad_norm": 10.423580562540607, "language_loss": 0.77558911, "learning_rate": 3.153374478034841e-06, "loss": 0.79726762, "num_input_tokens_seen": 116125980, "step": 5409, "time_per_iteration": 2.644792318344116 }, { "auxiliary_loss_clip": 0.01079567, "auxiliary_loss_mlp": 0.01043858, "balance_loss_clip": 1.03893065, "balance_loss_mlp": 1.0280745, "epoch": 0.32526679693371413, "flos": 29382146208000.0, "grad_norm": 2.0524453166640146, "language_loss": 0.83282518, "learning_rate": 3.1530562796021285e-06, "loss": 0.85405946, "num_input_tokens_seen": 116146530, "step": 5410, "time_per_iteration": 2.846480131149292 }, { "auxiliary_loss_clip": 0.01086095, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.04789686, "balance_loss_mlp": 1.02272296, "epoch": 0.3253269201863821, "flos": 20704513080960.0, "grad_norm": 1.6475099523255856, "language_loss": 0.7081182, "learning_rate": 3.152738037445405e-06, "loss": 0.72935545, "num_input_tokens_seen": 116165695, "step": 5411, "time_per_iteration": 2.779330253601074 }, { "auxiliary_loss_clip": 0.0108148, "auxiliary_loss_mlp": 0.01041588, "balance_loss_clip": 1.04331398, "balance_loss_mlp": 1.02688956, "epoch": 0.32538704343905006, "flos": 29094142959360.0, "grad_norm": 1.6354124554173295, "language_loss": 0.82894456, "learning_rate": 3.1524197515767403e-06, "loss": 0.85017526, "num_input_tokens_seen": 116185375, "step": 5412, "time_per_iteration": 2.7841992378234863 }, { "auxiliary_loss_clip": 0.01106895, "auxiliary_loss_mlp": 0.01041599, "balance_loss_clip": 1.04730868, "balance_loss_mlp": 1.02430189, "epoch": 0.325447166691718, "flos": 24676124906880.0, "grad_norm": 1.867437266565155, "language_loss": 0.80913842, "learning_rate": 3.152101422008203e-06, "loss": 0.83062339, "num_input_tokens_seen": 116204335, "step": 5413, "time_per_iteration": 2.7533957958221436 }, { "auxiliary_loss_clip": 0.01115005, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.04923081, "balance_loss_mlp": 1.02155089, "epoch": 0.325507289944386, "flos": 21543134889600.0, "grad_norm": 3.355430774898342, "language_loss": 0.76891947, "learning_rate": 3.151783048751864e-06, "loss": 0.79045498, "num_input_tokens_seen": 116222840, "step": 5414, "time_per_iteration": 4.331217527389526 }, { "auxiliary_loss_clip": 0.01030644, "auxiliary_loss_mlp": 0.01012699, "balance_loss_clip": 1.02726388, "balance_loss_mlp": 1.01063681, "epoch": 0.32556741319705396, "flos": 71518722347520.0, "grad_norm": 0.9066964616955783, "language_loss": 0.63865513, "learning_rate": 3.1514646318197965e-06, "loss": 0.65908855, "num_input_tokens_seen": 116274940, "step": 5415, "time_per_iteration": 3.172816753387451 }, { "auxiliary_loss_clip": 0.01088465, "auxiliary_loss_mlp": 0.01038606, "balance_loss_clip": 1.04119301, "balance_loss_mlp": 1.02279866, "epoch": 0.3256275364497219, "flos": 23732428838400.0, "grad_norm": 1.52454367487569, "language_loss": 0.74014068, "learning_rate": 3.151146171224075e-06, "loss": 0.76141143, "num_input_tokens_seen": 116297300, "step": 5416, "time_per_iteration": 4.326166868209839 }, { "auxiliary_loss_clip": 0.01062287, "auxiliary_loss_mlp": 0.0100407, "balance_loss_clip": 1.03045964, "balance_loss_mlp": 1.00160217, "epoch": 0.3256876597023899, "flos": 67289199891840.0, "grad_norm": 0.7686966052914506, "language_loss": 0.57851374, "learning_rate": 3.1508276669767757e-06, "loss": 0.59917736, "num_input_tokens_seen": 116362370, "step": 5417, "time_per_iteration": 3.2102463245391846 }, { "auxiliary_loss_clip": 0.01040835, "auxiliary_loss_mlp": 0.01012103, "balance_loss_clip": 1.02768993, "balance_loss_mlp": 1.00975466, "epoch": 0.32574778295505785, "flos": 71282323964160.0, "grad_norm": 0.7997987203444133, "language_loss": 0.63392216, "learning_rate": 3.150509119089975e-06, "loss": 0.65445155, "num_input_tokens_seen": 116430365, "step": 5418, "time_per_iteration": 4.847350120544434 }, { "auxiliary_loss_clip": 0.01110249, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.05171919, "balance_loss_mlp": 1.02794838, "epoch": 0.3258079062077258, "flos": 20776370238720.0, "grad_norm": 2.0985111563442325, "language_loss": 0.69086784, "learning_rate": 3.1501905275757537e-06, "loss": 0.71240497, "num_input_tokens_seen": 116447525, "step": 5419, "time_per_iteration": 2.6837174892425537 }, { "auxiliary_loss_clip": 0.0112744, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.05152702, "balance_loss_mlp": 1.02099252, "epoch": 0.3258680294603938, "flos": 22235456603520.0, "grad_norm": 1.6553118170887535, "language_loss": 0.77041519, "learning_rate": 3.1498718924461926e-06, "loss": 0.79206121, "num_input_tokens_seen": 116466310, "step": 5420, "time_per_iteration": 2.690243721008301 }, { "auxiliary_loss_clip": 0.01124221, "auxiliary_loss_mlp": 0.00774579, "balance_loss_clip": 1.04583097, "balance_loss_mlp": 1.00118852, "epoch": 0.3259281527130618, "flos": 26979974305920.0, "grad_norm": 1.6758047570714483, "language_loss": 0.8033973, "learning_rate": 3.1495532137133736e-06, "loss": 0.82238531, "num_input_tokens_seen": 116487825, "step": 5421, "time_per_iteration": 4.346652984619141 }, { "auxiliary_loss_clip": 0.01133401, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.04982162, "balance_loss_mlp": 1.0212909, "epoch": 0.32598827596572977, "flos": 26214251149440.0, "grad_norm": 1.7368751669124027, "language_loss": 0.75101721, "learning_rate": 3.149234491389381e-06, "loss": 0.77270067, "num_input_tokens_seen": 116509950, "step": 5422, "time_per_iteration": 2.698486566543579 }, { "auxiliary_loss_clip": 0.01104722, "auxiliary_loss_mlp": 0.00773675, "balance_loss_clip": 1.04894829, "balance_loss_mlp": 1.00120938, "epoch": 0.32604839921839773, "flos": 17639752947840.0, "grad_norm": 2.1580318636917384, "language_loss": 0.63323581, "learning_rate": 3.1489157254863026e-06, "loss": 0.65201974, "num_input_tokens_seen": 116527695, "step": 5423, "time_per_iteration": 2.7364964485168457 }, { "auxiliary_loss_clip": 0.01098661, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.04357564, "balance_loss_mlp": 1.01884615, "epoch": 0.3261085224710657, "flos": 23622721724160.0, "grad_norm": 1.5676988826806029, "language_loss": 0.74530792, "learning_rate": 3.148596916016224e-06, "loss": 0.76661909, "num_input_tokens_seen": 116547800, "step": 5424, "time_per_iteration": 2.695530652999878 }, { "auxiliary_loss_clip": 0.0110482, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.04803681, "balance_loss_mlp": 1.02199221, "epoch": 0.32616864572373366, "flos": 23260455106560.0, "grad_norm": 1.6667522289255576, "language_loss": 0.77194774, "learning_rate": 3.1482780629912355e-06, "loss": 0.79335308, "num_input_tokens_seen": 116568460, "step": 5425, "time_per_iteration": 2.6649699211120605 }, { "auxiliary_loss_clip": 0.01106187, "auxiliary_loss_mlp": 0.01040306, "balance_loss_clip": 1.04740202, "balance_loss_mlp": 1.02368808, "epoch": 0.32622876897640163, "flos": 25593427457280.0, "grad_norm": 2.8883064562409744, "language_loss": 0.78262472, "learning_rate": 3.147959166423428e-06, "loss": 0.80408967, "num_input_tokens_seen": 116588705, "step": 5426, "time_per_iteration": 2.7820892333984375 }, { "auxiliary_loss_clip": 0.01088898, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.04331303, "balance_loss_mlp": 1.01889908, "epoch": 0.3262888922290696, "flos": 22418996123520.0, "grad_norm": 1.9267107865215556, "language_loss": 0.74485052, "learning_rate": 3.147640226324893e-06, "loss": 0.76609194, "num_input_tokens_seen": 116608845, "step": 5427, "time_per_iteration": 2.7831003665924072 }, { "auxiliary_loss_clip": 0.01103791, "auxiliary_loss_mlp": 0.01041786, "balance_loss_clip": 1.04539597, "balance_loss_mlp": 1.02549028, "epoch": 0.32634901548173756, "flos": 19718908819200.0, "grad_norm": 6.869638277775165, "language_loss": 0.79136658, "learning_rate": 3.1473212427077266e-06, "loss": 0.81282234, "num_input_tokens_seen": 116628145, "step": 5428, "time_per_iteration": 2.7186481952667236 }, { "auxiliary_loss_clip": 0.01121911, "auxiliary_loss_mlp": 0.01040908, "balance_loss_clip": 1.04629314, "balance_loss_mlp": 1.02576876, "epoch": 0.3264091387344055, "flos": 16142924367360.0, "grad_norm": 5.016107817785842, "language_loss": 0.71130025, "learning_rate": 3.147002215584023e-06, "loss": 0.7329284, "num_input_tokens_seen": 116646920, "step": 5429, "time_per_iteration": 2.6733968257904053 }, { "auxiliary_loss_clip": 0.01098408, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.04658663, "balance_loss_mlp": 1.0212121, "epoch": 0.3264692619870735, "flos": 16399075230720.0, "grad_norm": 1.7379615094125744, "language_loss": 0.78620625, "learning_rate": 3.146683144965881e-06, "loss": 0.80754858, "num_input_tokens_seen": 116665100, "step": 5430, "time_per_iteration": 2.7313849925994873 }, { "auxiliary_loss_clip": 0.01084979, "auxiliary_loss_mlp": 0.01043143, "balance_loss_clip": 1.04809749, "balance_loss_mlp": 1.02660871, "epoch": 0.32652938523974145, "flos": 22382331315840.0, "grad_norm": 3.4420441965814477, "language_loss": 0.84279943, "learning_rate": 3.146364030865399e-06, "loss": 0.86408061, "num_input_tokens_seen": 116682205, "step": 5431, "time_per_iteration": 2.720797300338745 }, { "auxiliary_loss_clip": 0.01117845, "auxiliary_loss_mlp": 0.01034908, "balance_loss_clip": 1.04730058, "balance_loss_mlp": 1.02067482, "epoch": 0.3265895084924094, "flos": 21908059113600.0, "grad_norm": 1.9482899767939774, "language_loss": 0.70736587, "learning_rate": 3.146044873294678e-06, "loss": 0.7288934, "num_input_tokens_seen": 116702575, "step": 5432, "time_per_iteration": 2.6805124282836914 }, { "auxiliary_loss_clip": 0.01073417, "auxiliary_loss_mlp": 0.01042634, "balance_loss_clip": 1.04051948, "balance_loss_mlp": 1.02625418, "epoch": 0.3266496317450774, "flos": 16067152627200.0, "grad_norm": 1.6263283854003907, "language_loss": 0.84160507, "learning_rate": 3.1457256722658203e-06, "loss": 0.86276555, "num_input_tokens_seen": 116720885, "step": 5433, "time_per_iteration": 2.733450174331665 }, { "auxiliary_loss_clip": 0.01110224, "auxiliary_loss_mlp": 0.01031776, "balance_loss_clip": 1.04831946, "balance_loss_mlp": 1.01733375, "epoch": 0.3267097549977454, "flos": 22528236360960.0, "grad_norm": 1.8752055231309104, "language_loss": 0.860237, "learning_rate": 3.145406427790931e-06, "loss": 0.881657, "num_input_tokens_seen": 116740395, "step": 5434, "time_per_iteration": 2.6711690425872803 }, { "auxiliary_loss_clip": 0.01115762, "auxiliary_loss_mlp": 0.0104022, "balance_loss_clip": 1.04894018, "balance_loss_mlp": 1.02460361, "epoch": 0.32676987825041337, "flos": 27270419679360.0, "grad_norm": 2.089345873834278, "language_loss": 0.87845808, "learning_rate": 3.1450871398821147e-06, "loss": 0.90001786, "num_input_tokens_seen": 116758870, "step": 5435, "time_per_iteration": 2.7342183589935303 }, { "auxiliary_loss_clip": 0.01137287, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.05190301, "balance_loss_mlp": 1.02256095, "epoch": 0.32683000150308134, "flos": 11508257433600.0, "grad_norm": 3.0926239838125595, "language_loss": 0.7645883, "learning_rate": 3.144767808551479e-06, "loss": 0.78633732, "num_input_tokens_seen": 116773440, "step": 5436, "time_per_iteration": 2.648062229156494 }, { "auxiliary_loss_clip": 0.01137346, "auxiliary_loss_mlp": 0.01034933, "balance_loss_clip": 1.0532552, "balance_loss_mlp": 1.02046728, "epoch": 0.3268901247557493, "flos": 25630200005760.0, "grad_norm": 1.7720337367532448, "language_loss": 0.71802473, "learning_rate": 3.144448433811134e-06, "loss": 0.73974752, "num_input_tokens_seen": 116794375, "step": 5437, "time_per_iteration": 2.680525541305542 }, { "auxiliary_loss_clip": 0.01095966, "auxiliary_loss_mlp": 0.0104222, "balance_loss_clip": 1.04542243, "balance_loss_mlp": 1.02445781, "epoch": 0.32695024800841727, "flos": 24860849575680.0, "grad_norm": 1.7134236857074348, "language_loss": 0.63728261, "learning_rate": 3.144129015673189e-06, "loss": 0.65866441, "num_input_tokens_seen": 116815095, "step": 5438, "time_per_iteration": 2.7343454360961914 }, { "auxiliary_loss_clip": 0.01128746, "auxiliary_loss_mlp": 0.01039734, "balance_loss_clip": 1.05383801, "balance_loss_mlp": 1.02468967, "epoch": 0.32701037126108523, "flos": 28839249072000.0, "grad_norm": 3.854723832885701, "language_loss": 0.74629039, "learning_rate": 3.1438095541497576e-06, "loss": 0.76797515, "num_input_tokens_seen": 116836630, "step": 5439, "time_per_iteration": 2.6859002113342285 }, { "auxiliary_loss_clip": 0.0113034, "auxiliary_loss_mlp": 0.0104413, "balance_loss_clip": 1.05407321, "balance_loss_mlp": 1.02773881, "epoch": 0.3270704945137532, "flos": 27965075777280.0, "grad_norm": 3.9922367032947634, "language_loss": 0.74743968, "learning_rate": 3.1434900492529527e-06, "loss": 0.76918435, "num_input_tokens_seen": 116856880, "step": 5440, "time_per_iteration": 2.6785733699798584 }, { "auxiliary_loss_clip": 0.01124529, "auxiliary_loss_mlp": 0.00773254, "balance_loss_clip": 1.05180979, "balance_loss_mlp": 1.00108397, "epoch": 0.32713061776642116, "flos": 23690700213120.0, "grad_norm": 2.2888111794693033, "language_loss": 0.84642965, "learning_rate": 3.1431705009948914e-06, "loss": 0.86540747, "num_input_tokens_seen": 116873770, "step": 5441, "time_per_iteration": 2.692375421524048 }, { "auxiliary_loss_clip": 0.01126517, "auxiliary_loss_mlp": 0.01042941, "balance_loss_clip": 1.05065203, "balance_loss_mlp": 1.02715778, "epoch": 0.3271907410190891, "flos": 22455625017600.0, "grad_norm": 3.048730330719705, "language_loss": 0.86782062, "learning_rate": 3.1428509093876897e-06, "loss": 0.88951516, "num_input_tokens_seen": 116891225, "step": 5442, "time_per_iteration": 2.6678872108459473 }, { "auxiliary_loss_clip": 0.01105154, "auxiliary_loss_mlp": 0.01041235, "balance_loss_clip": 1.05088091, "balance_loss_mlp": 1.02450991, "epoch": 0.3272508642717571, "flos": 22820118278400.0, "grad_norm": 2.240879974234663, "language_loss": 0.77471602, "learning_rate": 3.1425312744434668e-06, "loss": 0.79617989, "num_input_tokens_seen": 116912300, "step": 5443, "time_per_iteration": 2.715407133102417 }, { "auxiliary_loss_clip": 0.01109692, "auxiliary_loss_mlp": 0.00773391, "balance_loss_clip": 1.05144906, "balance_loss_mlp": 1.00102162, "epoch": 0.32731098752442506, "flos": 11801360413440.0, "grad_norm": 2.595112113661144, "language_loss": 0.81782895, "learning_rate": 3.142211596174343e-06, "loss": 0.83665979, "num_input_tokens_seen": 116929425, "step": 5444, "time_per_iteration": 2.7483620643615723 }, { "auxiliary_loss_clip": 0.0109768, "auxiliary_loss_mlp": 0.01042359, "balance_loss_clip": 1.05127132, "balance_loss_mlp": 1.02671897, "epoch": 0.327371110777093, "flos": 21027780506880.0, "grad_norm": 2.0540771727134786, "language_loss": 0.59668452, "learning_rate": 3.1418918745924423e-06, "loss": 0.61808491, "num_input_tokens_seen": 116948255, "step": 5445, "time_per_iteration": 2.7937049865722656 }, { "auxiliary_loss_clip": 0.01134371, "auxiliary_loss_mlp": 0.01045479, "balance_loss_clip": 1.05779314, "balance_loss_mlp": 1.02935553, "epoch": 0.327431234029761, "flos": 19062102677760.0, "grad_norm": 2.705344105300375, "language_loss": 0.88343978, "learning_rate": 3.1415721097098865e-06, "loss": 0.90523833, "num_input_tokens_seen": 116964905, "step": 5446, "time_per_iteration": 2.586451292037964 }, { "auxiliary_loss_clip": 0.01135097, "auxiliary_loss_mlp": 0.01041409, "balance_loss_clip": 1.0612191, "balance_loss_mlp": 1.02387285, "epoch": 0.32749135728242895, "flos": 25849219184640.0, "grad_norm": 2.2697780368090883, "language_loss": 0.79279661, "learning_rate": 3.141252301538802e-06, "loss": 0.81456167, "num_input_tokens_seen": 116983650, "step": 5447, "time_per_iteration": 2.744072198867798 }, { "auxiliary_loss_clip": 0.01107571, "auxiliary_loss_mlp": 0.00773964, "balance_loss_clip": 1.04747021, "balance_loss_mlp": 1.00110793, "epoch": 0.327551480535097, "flos": 20120533764480.0, "grad_norm": 1.8015667711206929, "language_loss": 0.73182315, "learning_rate": 3.1409324500913157e-06, "loss": 0.75063848, "num_input_tokens_seen": 117003265, "step": 5448, "time_per_iteration": 2.6825077533721924 }, { "auxiliary_loss_clip": 0.01142648, "auxiliary_loss_mlp": 0.01042295, "balance_loss_clip": 1.05620432, "balance_loss_mlp": 1.02694106, "epoch": 0.32761160378776494, "flos": 28803553931520.0, "grad_norm": 1.4660761852129829, "language_loss": 0.67103487, "learning_rate": 3.1406125553795567e-06, "loss": 0.69288433, "num_input_tokens_seen": 117025370, "step": 5449, "time_per_iteration": 2.682499885559082 }, { "auxiliary_loss_clip": 0.0110995, "auxiliary_loss_mlp": 0.010411, "balance_loss_clip": 1.0542469, "balance_loss_mlp": 1.02627623, "epoch": 0.3276717270404329, "flos": 26937778803840.0, "grad_norm": 3.4023702964270943, "language_loss": 0.65110958, "learning_rate": 3.1402926174156556e-06, "loss": 0.67262006, "num_input_tokens_seen": 117044350, "step": 5450, "time_per_iteration": 2.7582857608795166 }, { "auxiliary_loss_clip": 0.0113136, "auxiliary_loss_mlp": 0.01045713, "balance_loss_clip": 1.05517817, "balance_loss_mlp": 1.03021002, "epoch": 0.32773185029310087, "flos": 25338425829120.0, "grad_norm": 1.5880234750249043, "language_loss": 0.77630055, "learning_rate": 3.1399726362117437e-06, "loss": 0.79807132, "num_input_tokens_seen": 117064450, "step": 5451, "time_per_iteration": 2.6543071269989014 }, { "auxiliary_loss_clip": 0.01131184, "auxiliary_loss_mlp": 0.01044056, "balance_loss_clip": 1.05428064, "balance_loss_mlp": 1.02809358, "epoch": 0.32779197354576883, "flos": 26391721271040.0, "grad_norm": 1.913131066587778, "language_loss": 0.70510584, "learning_rate": 3.1396526117799555e-06, "loss": 0.7268582, "num_input_tokens_seen": 117083060, "step": 5452, "time_per_iteration": 2.6963608264923096 }, { "auxiliary_loss_clip": 0.01112229, "auxiliary_loss_mlp": 0.01036592, "balance_loss_clip": 1.048841, "balance_loss_mlp": 1.02223349, "epoch": 0.3278520967984368, "flos": 24899381890560.0, "grad_norm": 2.6287596248848013, "language_loss": 0.78730083, "learning_rate": 3.1393325441324256e-06, "loss": 0.80878907, "num_input_tokens_seen": 117101860, "step": 5453, "time_per_iteration": 4.197263479232788 }, { "auxiliary_loss_clip": 0.01130585, "auxiliary_loss_mlp": 0.01035536, "balance_loss_clip": 1.0526675, "balance_loss_mlp": 1.02026486, "epoch": 0.32791222005110476, "flos": 29752996176000.0, "grad_norm": 5.184832608635382, "language_loss": 0.75771177, "learning_rate": 3.1390124332812916e-06, "loss": 0.77937293, "num_input_tokens_seen": 117123100, "step": 5454, "time_per_iteration": 2.7643721103668213 }, { "auxiliary_loss_clip": 0.01070253, "auxiliary_loss_mlp": 0.01047697, "balance_loss_clip": 1.03818846, "balance_loss_mlp": 1.03363037, "epoch": 0.32797234330377273, "flos": 16508064072960.0, "grad_norm": 2.8017119157252703, "language_loss": 0.76891404, "learning_rate": 3.1386922792386924e-06, "loss": 0.79009354, "num_input_tokens_seen": 117140515, "step": 5455, "time_per_iteration": 4.402290105819702 }, { "auxiliary_loss_clip": 0.01131084, "auxiliary_loss_mlp": 0.01042542, "balance_loss_clip": 1.05241477, "balance_loss_mlp": 1.02624655, "epoch": 0.3280324665564407, "flos": 26577918397440.0, "grad_norm": 1.6426536912861747, "language_loss": 0.74021912, "learning_rate": 3.138372082016768e-06, "loss": 0.76195538, "num_input_tokens_seen": 117161485, "step": 5456, "time_per_iteration": 2.821965217590332 }, { "auxiliary_loss_clip": 0.01140062, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.05334985, "balance_loss_mlp": 1.03212523, "epoch": 0.32809258980910866, "flos": 22929969047040.0, "grad_norm": 1.7597936582740754, "language_loss": 0.78038168, "learning_rate": 3.1380518416276596e-06, "loss": 0.80225635, "num_input_tokens_seen": 117181870, "step": 5457, "time_per_iteration": 2.703756093978882 }, { "auxiliary_loss_clip": 0.01104649, "auxiliary_loss_mlp": 0.01042509, "balance_loss_clip": 1.04943132, "balance_loss_mlp": 1.02752471, "epoch": 0.3281527130617766, "flos": 22783848520320.0, "grad_norm": 5.102364490559591, "language_loss": 0.79493362, "learning_rate": 3.1377315580835115e-06, "loss": 0.81640518, "num_input_tokens_seen": 117201380, "step": 5458, "time_per_iteration": 4.307415962219238 }, { "auxiliary_loss_clip": 0.01124323, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.05467916, "balance_loss_mlp": 1.02362311, "epoch": 0.3282128363144446, "flos": 21250678354560.0, "grad_norm": 1.6160363150508943, "language_loss": 0.73029429, "learning_rate": 3.1374112313964686e-06, "loss": 0.7519297, "num_input_tokens_seen": 117221040, "step": 5459, "time_per_iteration": 2.678131341934204 }, { "auxiliary_loss_clip": 0.01118921, "auxiliary_loss_mlp": 0.01041188, "balance_loss_clip": 1.05190325, "balance_loss_mlp": 1.02591753, "epoch": 0.32827295956711255, "flos": 30843064166400.0, "grad_norm": 2.011905165126453, "language_loss": 0.84018445, "learning_rate": 3.1370908615786783e-06, "loss": 0.86178553, "num_input_tokens_seen": 117241395, "step": 5460, "time_per_iteration": 5.767046213150024 }, { "auxiliary_loss_clip": 0.01138817, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.05174541, "balance_loss_mlp": 1.02029121, "epoch": 0.3283330828197806, "flos": 25915006944000.0, "grad_norm": 1.9959413021835115, "language_loss": 0.76553524, "learning_rate": 3.136770448642288e-06, "loss": 0.78727543, "num_input_tokens_seen": 117259340, "step": 5461, "time_per_iteration": 2.673659086227417 }, { "auxiliary_loss_clip": 0.01121607, "auxiliary_loss_mlp": 0.01042243, "balance_loss_clip": 1.05065536, "balance_loss_mlp": 1.02489805, "epoch": 0.32839320607244854, "flos": 38582065042560.0, "grad_norm": 2.148112131584704, "language_loss": 0.62898672, "learning_rate": 3.1364499925994484e-06, "loss": 0.65062523, "num_input_tokens_seen": 117282375, "step": 5462, "time_per_iteration": 2.789217472076416 }, { "auxiliary_loss_clip": 0.01136727, "auxiliary_loss_mlp": 0.0077334, "balance_loss_clip": 1.05279326, "balance_loss_mlp": 1.00113511, "epoch": 0.3284533293251165, "flos": 26650888876800.0, "grad_norm": 2.4415591889879056, "language_loss": 0.7805075, "learning_rate": 3.1361294934623115e-06, "loss": 0.79960817, "num_input_tokens_seen": 117303830, "step": 5463, "time_per_iteration": 2.6797146797180176 }, { "auxiliary_loss_clip": 0.01109773, "auxiliary_loss_mlp": 0.01040868, "balance_loss_clip": 1.05036163, "balance_loss_mlp": 1.02523983, "epoch": 0.32851345257778447, "flos": 15304158904320.0, "grad_norm": 1.8407799027990368, "language_loss": 0.70095646, "learning_rate": 3.1358089512430303e-06, "loss": 0.72246289, "num_input_tokens_seen": 117320665, "step": 5464, "time_per_iteration": 2.7286477088928223 }, { "auxiliary_loss_clip": 0.01130175, "auxiliary_loss_mlp": 0.01038523, "balance_loss_clip": 1.05659711, "balance_loss_mlp": 1.02327609, "epoch": 0.32857357583045244, "flos": 23513732881920.0, "grad_norm": 1.976060055551124, "language_loss": 0.72474623, "learning_rate": 3.1354883659537594e-06, "loss": 0.74643314, "num_input_tokens_seen": 117339795, "step": 5465, "time_per_iteration": 2.6666364669799805 }, { "auxiliary_loss_clip": 0.01113042, "auxiliary_loss_mlp": 0.01049431, "balance_loss_clip": 1.05094242, "balance_loss_mlp": 1.03334332, "epoch": 0.3286336990831204, "flos": 20995209849600.0, "grad_norm": 1.953344541818443, "language_loss": 0.832214, "learning_rate": 3.1351677376066567e-06, "loss": 0.8538388, "num_input_tokens_seen": 117359525, "step": 5466, "time_per_iteration": 2.7432901859283447 }, { "auxiliary_loss_clip": 0.01113455, "auxiliary_loss_mlp": 0.01041029, "balance_loss_clip": 1.04729056, "balance_loss_mlp": 1.02577055, "epoch": 0.32869382233578837, "flos": 23658811914240.0, "grad_norm": 1.7893036060845653, "language_loss": 0.79221183, "learning_rate": 3.134847066213879e-06, "loss": 0.8137567, "num_input_tokens_seen": 117380320, "step": 5467, "time_per_iteration": 2.701490879058838 }, { "auxiliary_loss_clip": 0.0111678, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.05045676, "balance_loss_mlp": 1.01759124, "epoch": 0.32875394558845633, "flos": 25336522408320.0, "grad_norm": 1.5411251384559923, "language_loss": 0.74338531, "learning_rate": 3.134526351787587e-06, "loss": 0.76488233, "num_input_tokens_seen": 117400695, "step": 5468, "time_per_iteration": 2.6820507049560547 }, { "auxiliary_loss_clip": 0.0111552, "auxiliary_loss_mlp": 0.01042549, "balance_loss_clip": 1.05065966, "balance_loss_mlp": 1.02476263, "epoch": 0.3288140688411243, "flos": 14903108576640.0, "grad_norm": 1.9818058078172698, "language_loss": 0.7869612, "learning_rate": 3.134205594339942e-06, "loss": 0.80854189, "num_input_tokens_seen": 117418800, "step": 5469, "time_per_iteration": 2.6281590461730957 }, { "auxiliary_loss_clip": 0.01104752, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.04863441, "balance_loss_mlp": 1.01838851, "epoch": 0.32887419209379226, "flos": 18551345235840.0, "grad_norm": 1.9383846382167882, "language_loss": 0.81744516, "learning_rate": 3.133884793883107e-06, "loss": 0.8388238, "num_input_tokens_seen": 117438220, "step": 5470, "time_per_iteration": 2.8643784523010254 }, { "auxiliary_loss_clip": 0.01140563, "auxiliary_loss_mlp": 0.01045939, "balance_loss_clip": 1.05232358, "balance_loss_mlp": 1.03021562, "epoch": 0.3289343153464602, "flos": 48105610439040.0, "grad_norm": 2.0914054865715768, "language_loss": 0.67699564, "learning_rate": 3.1335639504292478e-06, "loss": 0.69886065, "num_input_tokens_seen": 117462560, "step": 5471, "time_per_iteration": 2.851717948913574 }, { "auxiliary_loss_clip": 0.01148136, "auxiliary_loss_mlp": 0.01043561, "balance_loss_clip": 1.05701339, "balance_loss_mlp": 1.02594161, "epoch": 0.3289944385991282, "flos": 27600295207680.0, "grad_norm": 2.097557855250848, "language_loss": 0.64926231, "learning_rate": 3.1332430639905288e-06, "loss": 0.67117929, "num_input_tokens_seen": 117483665, "step": 5472, "time_per_iteration": 2.6586108207702637 }, { "auxiliary_loss_clip": 0.01128351, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.05333138, "balance_loss_mlp": 1.02850199, "epoch": 0.32905456185179616, "flos": 20120318282880.0, "grad_norm": 3.4668570750263155, "language_loss": 0.88257217, "learning_rate": 3.13292213457912e-06, "loss": 0.90431023, "num_input_tokens_seen": 117503565, "step": 5473, "time_per_iteration": 2.6792144775390625 }, { "auxiliary_loss_clip": 0.01103479, "auxiliary_loss_mlp": 0.01038881, "balance_loss_clip": 1.04814398, "balance_loss_mlp": 1.02123809, "epoch": 0.3291146851044642, "flos": 23180230080000.0, "grad_norm": 1.8710184691373295, "language_loss": 0.78193343, "learning_rate": 3.1326011622071903e-06, "loss": 0.80335701, "num_input_tokens_seen": 117521460, "step": 5474, "time_per_iteration": 2.739057779312134 }, { "auxiliary_loss_clip": 0.01038022, "auxiliary_loss_mlp": 0.01029239, "balance_loss_clip": 1.02788568, "balance_loss_mlp": 1.02673554, "epoch": 0.32917480835713214, "flos": 67621912594560.0, "grad_norm": 0.8109823017171686, "language_loss": 0.6018818, "learning_rate": 3.132280146886911e-06, "loss": 0.62255442, "num_input_tokens_seen": 117580550, "step": 5475, "time_per_iteration": 3.196384906768799 }, { "auxiliary_loss_clip": 0.01091837, "auxiliary_loss_mlp": 0.01057279, "balance_loss_clip": 1.04454446, "balance_loss_mlp": 1.03726411, "epoch": 0.3292349316098001, "flos": 27964537073280.0, "grad_norm": 4.962450920257536, "language_loss": 0.76504046, "learning_rate": 3.131959088630455e-06, "loss": 0.78653169, "num_input_tokens_seen": 117600645, "step": 5476, "time_per_iteration": 2.7369961738586426 }, { "auxiliary_loss_clip": 0.01100541, "auxiliary_loss_mlp": 0.01044762, "balance_loss_clip": 1.04824603, "balance_loss_mlp": 1.02946782, "epoch": 0.3292950548624681, "flos": 20263673462400.0, "grad_norm": 2.5019671735892937, "language_loss": 0.74746907, "learning_rate": 3.131637987449997e-06, "loss": 0.76892209, "num_input_tokens_seen": 117618880, "step": 5477, "time_per_iteration": 2.814467430114746 }, { "auxiliary_loss_clip": 0.01135692, "auxiliary_loss_mlp": 0.01042652, "balance_loss_clip": 1.05235898, "balance_loss_mlp": 1.02838814, "epoch": 0.32935517811513604, "flos": 20812999132800.0, "grad_norm": 3.9065130557825234, "language_loss": 0.75539625, "learning_rate": 3.131316843357713e-06, "loss": 0.77717972, "num_input_tokens_seen": 117636445, "step": 5478, "time_per_iteration": 2.730445384979248 }, { "auxiliary_loss_clip": 0.0112467, "auxiliary_loss_mlp": 0.01042056, "balance_loss_clip": 1.04921985, "balance_loss_mlp": 1.02750051, "epoch": 0.329415301367804, "flos": 18441853603200.0, "grad_norm": 2.855777191383278, "language_loss": 0.80462509, "learning_rate": 3.1309956563657807e-06, "loss": 0.82629234, "num_input_tokens_seen": 117653105, "step": 5479, "time_per_iteration": 2.6443796157836914 }, { "auxiliary_loss_clip": 0.01037863, "auxiliary_loss_mlp": 0.01000413, "balance_loss_clip": 1.02671266, "balance_loss_mlp": 0.99823159, "epoch": 0.32947542462047197, "flos": 66323024887680.0, "grad_norm": 0.7530723778079996, "language_loss": 0.56519568, "learning_rate": 3.1306744264863804e-06, "loss": 0.58557844, "num_input_tokens_seen": 117719225, "step": 5480, "time_per_iteration": 3.213240146636963 }, { "auxiliary_loss_clip": 0.01124019, "auxiliary_loss_mlp": 0.00774449, "balance_loss_clip": 1.04898739, "balance_loss_mlp": 1.00116146, "epoch": 0.32953554787313993, "flos": 23221599569280.0, "grad_norm": 1.7923941739082951, "language_loss": 0.77444887, "learning_rate": 3.1303531537316915e-06, "loss": 0.79343355, "num_input_tokens_seen": 117738725, "step": 5481, "time_per_iteration": 2.6905598640441895 }, { "auxiliary_loss_clip": 0.01119194, "auxiliary_loss_mlp": 0.01050738, "balance_loss_clip": 1.05167091, "balance_loss_mlp": 1.03557408, "epoch": 0.3295956711258079, "flos": 27009492307200.0, "grad_norm": 1.5874205685036498, "language_loss": 0.78222132, "learning_rate": 3.130031838113899e-06, "loss": 0.80392069, "num_input_tokens_seen": 117757765, "step": 5482, "time_per_iteration": 2.765235424041748 }, { "auxiliary_loss_clip": 0.01130055, "auxiliary_loss_mlp": 0.01052605, "balance_loss_clip": 1.05121589, "balance_loss_mlp": 1.03674388, "epoch": 0.32965579437847586, "flos": 19171702051200.0, "grad_norm": 2.9405789595849385, "language_loss": 0.73674762, "learning_rate": 3.129710479645185e-06, "loss": 0.75857425, "num_input_tokens_seen": 117776810, "step": 5483, "time_per_iteration": 2.624969005584717 }, { "auxiliary_loss_clip": 0.01122896, "auxiliary_loss_mlp": 0.01054419, "balance_loss_clip": 1.05069685, "balance_loss_mlp": 1.03886831, "epoch": 0.32971591763114383, "flos": 30482521401600.0, "grad_norm": 1.8706124903497952, "language_loss": 0.75649381, "learning_rate": 3.1293890783377366e-06, "loss": 0.77826691, "num_input_tokens_seen": 117797730, "step": 5484, "time_per_iteration": 2.7650864124298096 }, { "auxiliary_loss_clip": 0.01141223, "auxiliary_loss_mlp": 0.01053478, "balance_loss_clip": 1.05515027, "balance_loss_mlp": 1.03807664, "epoch": 0.3297760408838118, "flos": 16289583598080.0, "grad_norm": 72.4202789440072, "language_loss": 0.71719176, "learning_rate": 3.129067634203742e-06, "loss": 0.73913872, "num_input_tokens_seen": 117815365, "step": 5485, "time_per_iteration": 2.603039264678955 }, { "auxiliary_loss_clip": 0.01081054, "auxiliary_loss_mlp": 0.01052335, "balance_loss_clip": 1.04921818, "balance_loss_mlp": 1.03822041, "epoch": 0.32983616413647976, "flos": 29530924341120.0, "grad_norm": 1.6108204077161399, "language_loss": 0.80275488, "learning_rate": 3.128746147255388e-06, "loss": 0.82408869, "num_input_tokens_seen": 117836095, "step": 5486, "time_per_iteration": 2.8364202976226807 }, { "auxiliary_loss_clip": 0.01106188, "auxiliary_loss_mlp": 0.01053006, "balance_loss_clip": 1.04739475, "balance_loss_mlp": 1.03650784, "epoch": 0.3298962873891478, "flos": 20631398947200.0, "grad_norm": 2.173231613182175, "language_loss": 0.84374005, "learning_rate": 3.1284246175048683e-06, "loss": 0.86533195, "num_input_tokens_seen": 117854655, "step": 5487, "time_per_iteration": 2.7796428203582764 }, { "auxiliary_loss_clip": 0.01087509, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04317069, "balance_loss_mlp": 1.0379355, "epoch": 0.32995641064181574, "flos": 14976007228800.0, "grad_norm": 2.633362688401157, "language_loss": 0.74667275, "learning_rate": 3.1281030449643735e-06, "loss": 0.76809955, "num_input_tokens_seen": 117873300, "step": 5488, "time_per_iteration": 2.7173233032226562 }, { "auxiliary_loss_clip": 0.01143363, "auxiliary_loss_mlp": 0.01051325, "balance_loss_clip": 1.05679107, "balance_loss_mlp": 1.03563726, "epoch": 0.3300165338944837, "flos": 18661447399680.0, "grad_norm": 2.518818086418956, "language_loss": 0.71718305, "learning_rate": 3.127781429646098e-06, "loss": 0.7391299, "num_input_tokens_seen": 117891540, "step": 5489, "time_per_iteration": 2.6647188663482666 }, { "auxiliary_loss_clip": 0.01137372, "auxiliary_loss_mlp": 0.01044261, "balance_loss_clip": 1.05154073, "balance_loss_mlp": 1.02973497, "epoch": 0.3300766571471517, "flos": 25583730785280.0, "grad_norm": 6.067113992727344, "language_loss": 0.88346136, "learning_rate": 3.127459771562238e-06, "loss": 0.90527773, "num_input_tokens_seen": 117907690, "step": 5490, "time_per_iteration": 2.594193696975708 }, { "auxiliary_loss_clip": 0.01127009, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.05081856, "balance_loss_mlp": 1.02396214, "epoch": 0.33013678039981964, "flos": 11363501623680.0, "grad_norm": 5.091693260582257, "language_loss": 0.83396459, "learning_rate": 3.1271380707249907e-06, "loss": 0.85562241, "num_input_tokens_seen": 117925640, "step": 5491, "time_per_iteration": 2.6124439239501953 }, { "auxiliary_loss_clip": 0.01111643, "auxiliary_loss_mlp": 0.01048849, "balance_loss_clip": 1.05066538, "balance_loss_mlp": 1.03372788, "epoch": 0.3301969036524876, "flos": 24821203939200.0, "grad_norm": 1.9936853829327341, "language_loss": 0.77453989, "learning_rate": 3.126816327146554e-06, "loss": 0.79614484, "num_input_tokens_seen": 117944525, "step": 5492, "time_per_iteration": 4.26681923866272 }, { "auxiliary_loss_clip": 0.01144384, "auxiliary_loss_mlp": 0.01046422, "balance_loss_clip": 1.05559993, "balance_loss_mlp": 1.02987576, "epoch": 0.33025702690515557, "flos": 15961144613760.0, "grad_norm": 2.586093125227841, "language_loss": 0.74295127, "learning_rate": 3.12649454083913e-06, "loss": 0.76485932, "num_input_tokens_seen": 117962515, "step": 5493, "time_per_iteration": 2.572657585144043 }, { "auxiliary_loss_clip": 0.01007495, "auxiliary_loss_mlp": 0.01051184, "balance_loss_clip": 1.0238874, "balance_loss_mlp": 1.0491215, "epoch": 0.33031715015782354, "flos": 59416755989760.0, "grad_norm": 0.7952972655943692, "language_loss": 0.53981996, "learning_rate": 3.12617271181492e-06, "loss": 0.5604068, "num_input_tokens_seen": 118018780, "step": 5494, "time_per_iteration": 3.2123944759368896 }, { "auxiliary_loss_clip": 0.01114646, "auxiliary_loss_mlp": 0.0103786, "balance_loss_clip": 1.04879999, "balance_loss_mlp": 1.02241075, "epoch": 0.3303772734104915, "flos": 23184360144000.0, "grad_norm": 1.4867113292626302, "language_loss": 0.87236047, "learning_rate": 3.1258508400861276e-06, "loss": 0.89388549, "num_input_tokens_seen": 118038610, "step": 5495, "time_per_iteration": 4.180245637893677 }, { "auxiliary_loss_clip": 0.01104415, "auxiliary_loss_mlp": 0.0104461, "balance_loss_clip": 1.0520072, "balance_loss_mlp": 1.02813482, "epoch": 0.33043739666315947, "flos": 33071896010880.0, "grad_norm": 2.0634169818588157, "language_loss": 0.73468459, "learning_rate": 3.1255289256649587e-06, "loss": 0.7561748, "num_input_tokens_seen": 118055905, "step": 5496, "time_per_iteration": 2.816849946975708 }, { "auxiliary_loss_clip": 0.01107244, "auxiliary_loss_mlp": 0.01039897, "balance_loss_clip": 1.04852057, "balance_loss_mlp": 1.02469766, "epoch": 0.33049751991582743, "flos": 24895431394560.0, "grad_norm": 2.430684839051296, "language_loss": 0.72464252, "learning_rate": 3.1252069685636196e-06, "loss": 0.74611384, "num_input_tokens_seen": 118073695, "step": 5497, "time_per_iteration": 4.314718961715698 }, { "auxiliary_loss_clip": 0.01111966, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.05051875, "balance_loss_mlp": 1.02313733, "epoch": 0.3305576431684954, "flos": 29460575554560.0, "grad_norm": 1.9082848646705384, "language_loss": 0.804672, "learning_rate": 3.124884968794321e-06, "loss": 0.82617176, "num_input_tokens_seen": 118094030, "step": 5498, "time_per_iteration": 2.831347942352295 }, { "auxiliary_loss_clip": 0.01121599, "auxiliary_loss_mlp": 0.01041664, "balance_loss_clip": 1.04826963, "balance_loss_mlp": 1.02467656, "epoch": 0.33061776642116336, "flos": 22632305040000.0, "grad_norm": 2.0593804502858823, "language_loss": 0.75822198, "learning_rate": 3.12456292636927e-06, "loss": 0.77985466, "num_input_tokens_seen": 118111665, "step": 5499, "time_per_iteration": 4.880478858947754 }, { "auxiliary_loss_clip": 0.01119724, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.05307007, "balance_loss_mlp": 1.02016318, "epoch": 0.3306778896738313, "flos": 25776320532480.0, "grad_norm": 2.088317081581358, "language_loss": 0.78981787, "learning_rate": 3.124240841300681e-06, "loss": 0.81137192, "num_input_tokens_seen": 118132435, "step": 5500, "time_per_iteration": 2.7601048946380615 }, { "auxiliary_loss_clip": 0.01131843, "auxiliary_loss_mlp": 0.0103364, "balance_loss_clip": 1.0540576, "balance_loss_mlp": 1.01751041, "epoch": 0.33073801292649935, "flos": 36940552479360.0, "grad_norm": 8.499573931934933, "language_loss": 0.6655246, "learning_rate": 3.1239187136007665e-06, "loss": 0.68717939, "num_input_tokens_seen": 118155255, "step": 5501, "time_per_iteration": 2.7880568504333496 }, { "auxiliary_loss_clip": 0.01130024, "auxiliary_loss_mlp": 0.01044854, "balance_loss_clip": 1.05215073, "balance_loss_mlp": 1.02766418, "epoch": 0.3307981361791673, "flos": 12967738848000.0, "grad_norm": 2.417495150038941, "language_loss": 0.77221018, "learning_rate": 3.1235965432817417e-06, "loss": 0.79395902, "num_input_tokens_seen": 118169865, "step": 5502, "time_per_iteration": 2.621891736984253 }, { "auxiliary_loss_clip": 0.01120279, "auxiliary_loss_mlp": 0.01041312, "balance_loss_clip": 1.05816746, "balance_loss_mlp": 1.02508807, "epoch": 0.3308582594318353, "flos": 25374372364800.0, "grad_norm": 1.6870244228079128, "language_loss": 0.72882998, "learning_rate": 3.123274330355824e-06, "loss": 0.75044584, "num_input_tokens_seen": 118190760, "step": 5503, "time_per_iteration": 2.731391191482544 }, { "auxiliary_loss_clip": 0.01107126, "auxiliary_loss_mlp": 0.01042991, "balance_loss_clip": 1.04483843, "balance_loss_mlp": 1.02543116, "epoch": 0.33091838268450324, "flos": 26468570419200.0, "grad_norm": 1.6983408951831631, "language_loss": 0.75341403, "learning_rate": 3.12295207483523e-06, "loss": 0.77491516, "num_input_tokens_seen": 118213620, "step": 5504, "time_per_iteration": 2.734440565109253 }, { "auxiliary_loss_clip": 0.01116159, "auxiliary_loss_mlp": 0.01038384, "balance_loss_clip": 1.05076432, "balance_loss_mlp": 1.02267826, "epoch": 0.3309785059371712, "flos": 24971167221120.0, "grad_norm": 1.5921827086772462, "language_loss": 0.69537103, "learning_rate": 3.1226297767321816e-06, "loss": 0.71691644, "num_input_tokens_seen": 118235010, "step": 5505, "time_per_iteration": 2.7224769592285156 }, { "auxiliary_loss_clip": 0.0112242, "auxiliary_loss_mlp": 0.01050735, "balance_loss_clip": 1.04997373, "balance_loss_mlp": 1.03454661, "epoch": 0.3310386291898392, "flos": 20446710192000.0, "grad_norm": 1.6566524839278514, "language_loss": 0.81701219, "learning_rate": 3.122307436058899e-06, "loss": 0.83874375, "num_input_tokens_seen": 118255820, "step": 5506, "time_per_iteration": 2.6608633995056152 }, { "auxiliary_loss_clip": 0.01126393, "auxiliary_loss_mlp": 0.01036938, "balance_loss_clip": 1.05129898, "balance_loss_mlp": 1.02032042, "epoch": 0.33109875244250714, "flos": 23182672204800.0, "grad_norm": 2.1165262291534663, "language_loss": 0.7961843, "learning_rate": 3.121985052827606e-06, "loss": 0.81781757, "num_input_tokens_seen": 118274160, "step": 5507, "time_per_iteration": 2.6279826164245605 }, { "auxiliary_loss_clip": 0.01115407, "auxiliary_loss_mlp": 0.0104488, "balance_loss_clip": 1.04948068, "balance_loss_mlp": 1.02901316, "epoch": 0.3311588756951751, "flos": 24168384207360.0, "grad_norm": 1.8252383106416188, "language_loss": 0.71632457, "learning_rate": 3.1216626270505274e-06, "loss": 0.73792744, "num_input_tokens_seen": 118294385, "step": 5508, "time_per_iteration": 2.666274070739746 }, { "auxiliary_loss_clip": 0.01105407, "auxiliary_loss_mlp": 0.01035431, "balance_loss_clip": 1.04841506, "balance_loss_mlp": 1.02048194, "epoch": 0.33121899894784307, "flos": 28145742209280.0, "grad_norm": 2.0681023318662053, "language_loss": 0.71877921, "learning_rate": 3.12134015873989e-06, "loss": 0.74018759, "num_input_tokens_seen": 118313105, "step": 5509, "time_per_iteration": 2.9805185794830322 }, { "auxiliary_loss_clip": 0.01123913, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.05431342, "balance_loss_mlp": 1.02019095, "epoch": 0.33127912220051103, "flos": 29567660976000.0, "grad_norm": 1.690455092128618, "language_loss": 0.72850806, "learning_rate": 3.121017647907921e-06, "loss": 0.75010473, "num_input_tokens_seen": 118335250, "step": 5510, "time_per_iteration": 2.7012648582458496 }, { "auxiliary_loss_clip": 0.01097101, "auxiliary_loss_mlp": 0.01036395, "balance_loss_clip": 1.04754674, "balance_loss_mlp": 1.02099323, "epoch": 0.331339245453179, "flos": 14428836374400.0, "grad_norm": 2.529653220973509, "language_loss": 0.87842733, "learning_rate": 3.1206950945668508e-06, "loss": 0.89976227, "num_input_tokens_seen": 118351470, "step": 5511, "time_per_iteration": 2.699303150177002 }, { "auxiliary_loss_clip": 0.01077351, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.04569423, "balance_loss_mlp": 1.0232892, "epoch": 0.33139936870584696, "flos": 20887118847360.0, "grad_norm": 2.0800696693803404, "language_loss": 0.73301774, "learning_rate": 3.12037249872891e-06, "loss": 0.7541737, "num_input_tokens_seen": 118370970, "step": 5512, "time_per_iteration": 2.773071765899658 }, { "auxiliary_loss_clip": 0.01092657, "auxiliary_loss_mlp": 0.01037164, "balance_loss_clip": 1.04608238, "balance_loss_mlp": 1.02226281, "epoch": 0.33145949195851493, "flos": 36284356869120.0, "grad_norm": 28.686212163123738, "language_loss": 0.7188127, "learning_rate": 3.1200498604063317e-06, "loss": 0.74011087, "num_input_tokens_seen": 118393125, "step": 5513, "time_per_iteration": 2.832712411880493 }, { "auxiliary_loss_clip": 0.0110331, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.0480994, "balance_loss_mlp": 1.02052951, "epoch": 0.33151961521118295, "flos": 14279735018880.0, "grad_norm": 1.9100766123367274, "language_loss": 0.68260789, "learning_rate": 3.1197271796113507e-06, "loss": 0.70401114, "num_input_tokens_seen": 118410860, "step": 5514, "time_per_iteration": 2.62347674369812 }, { "auxiliary_loss_clip": 0.01111479, "auxiliary_loss_mlp": 0.01042546, "balance_loss_clip": 1.04936767, "balance_loss_mlp": 1.02481997, "epoch": 0.3315797384638509, "flos": 20774323163520.0, "grad_norm": 1.9179680687741931, "language_loss": 0.65994096, "learning_rate": 3.1194044563562026e-06, "loss": 0.68148118, "num_input_tokens_seen": 118429570, "step": 5515, "time_per_iteration": 2.6913952827453613 }, { "auxiliary_loss_clip": 0.01121539, "auxiliary_loss_mlp": 0.01039988, "balance_loss_clip": 1.04903245, "balance_loss_mlp": 1.02393019, "epoch": 0.3316398617165189, "flos": 24679464871680.0, "grad_norm": 1.8088538037879305, "language_loss": 0.69273043, "learning_rate": 3.1190816906531257e-06, "loss": 0.71434575, "num_input_tokens_seen": 118450285, "step": 5516, "time_per_iteration": 2.6469173431396484 }, { "auxiliary_loss_clip": 0.011287, "auxiliary_loss_mlp": 0.01039737, "balance_loss_clip": 1.05089724, "balance_loss_mlp": 1.02339315, "epoch": 0.33169998496918685, "flos": 18587974129920.0, "grad_norm": 3.871010712989623, "language_loss": 0.79914033, "learning_rate": 3.118758882514359e-06, "loss": 0.82082474, "num_input_tokens_seen": 118468270, "step": 5517, "time_per_iteration": 2.6387667655944824 }, { "auxiliary_loss_clip": 0.01113973, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.04587924, "balance_loss_mlp": 1.02412271, "epoch": 0.3317601082218548, "flos": 20193647898240.0, "grad_norm": 1.7856922866156533, "language_loss": 0.74043357, "learning_rate": 3.118436031952143e-06, "loss": 0.76197767, "num_input_tokens_seen": 118486615, "step": 5518, "time_per_iteration": 2.6136653423309326 }, { "auxiliary_loss_clip": 0.01035845, "auxiliary_loss_mlp": 0.0100663, "balance_loss_clip": 1.02549803, "balance_loss_mlp": 1.00447261, "epoch": 0.3318202314745228, "flos": 68974703637120.0, "grad_norm": 0.6165261089589951, "language_loss": 0.54330659, "learning_rate": 3.1181131389787206e-06, "loss": 0.56373143, "num_input_tokens_seen": 118553580, "step": 5519, "time_per_iteration": 3.3124027252197266 }, { "auxiliary_loss_clip": 0.01129225, "auxiliary_loss_mlp": 0.01042237, "balance_loss_clip": 1.05353975, "balance_loss_mlp": 1.02483273, "epoch": 0.33188035472719074, "flos": 21500113374720.0, "grad_norm": 2.4445902922344342, "language_loss": 0.78693354, "learning_rate": 3.117790203606336e-06, "loss": 0.80864823, "num_input_tokens_seen": 118570280, "step": 5520, "time_per_iteration": 2.680413246154785 }, { "auxiliary_loss_clip": 0.0111174, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.04981971, "balance_loss_mlp": 1.01946807, "epoch": 0.3319404779798587, "flos": 28870490926080.0, "grad_norm": 2.1205551001068645, "language_loss": 0.76597643, "learning_rate": 3.1174672258472344e-06, "loss": 0.78743839, "num_input_tokens_seen": 118590455, "step": 5521, "time_per_iteration": 2.7977516651153564 }, { "auxiliary_loss_clip": 0.01128356, "auxiliary_loss_mlp": 0.0104906, "balance_loss_clip": 1.0500772, "balance_loss_mlp": 1.0320611, "epoch": 0.33200060123252667, "flos": 23076915586560.0, "grad_norm": 5.546447388917159, "language_loss": 0.70404172, "learning_rate": 3.117144205713664e-06, "loss": 0.72581589, "num_input_tokens_seen": 118609495, "step": 5522, "time_per_iteration": 2.7343335151672363 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.01039333, "balance_loss_clip": 1.04872596, "balance_loss_mlp": 1.02413392, "epoch": 0.33206072448519464, "flos": 21142479611520.0, "grad_norm": 2.5717643633026133, "language_loss": 0.7406925, "learning_rate": 3.1168211432178735e-06, "loss": 0.76221192, "num_input_tokens_seen": 118628720, "step": 5523, "time_per_iteration": 2.6910529136657715 }, { "auxiliary_loss_clip": 0.01108522, "auxiliary_loss_mlp": 0.01039859, "balance_loss_clip": 1.04778576, "balance_loss_mlp": 1.02415287, "epoch": 0.3321208477378626, "flos": 13079097987840.0, "grad_norm": 1.7441145490896364, "language_loss": 0.82432246, "learning_rate": 3.116498038372114e-06, "loss": 0.8458063, "num_input_tokens_seen": 118645955, "step": 5524, "time_per_iteration": 2.747279405593872 }, { "auxiliary_loss_clip": 0.01094215, "auxiliary_loss_mlp": 0.00773366, "balance_loss_clip": 1.04763544, "balance_loss_mlp": 1.000983, "epoch": 0.33218097099053057, "flos": 21215414177280.0, "grad_norm": 1.8821817398487202, "language_loss": 0.83040905, "learning_rate": 3.116174891188636e-06, "loss": 0.84908485, "num_input_tokens_seen": 118665605, "step": 5525, "time_per_iteration": 2.7802865505218506 }, { "auxiliary_loss_clip": 0.01051991, "auxiliary_loss_mlp": 0.01009126, "balance_loss_clip": 1.02309918, "balance_loss_mlp": 1.00730228, "epoch": 0.33224109424319853, "flos": 64348979189760.0, "grad_norm": 0.7599038914172829, "language_loss": 0.52588648, "learning_rate": 3.1158517016796945e-06, "loss": 0.54649764, "num_input_tokens_seen": 118728155, "step": 5526, "time_per_iteration": 3.1430625915527344 }, { "auxiliary_loss_clip": 0.01100912, "auxiliary_loss_mlp": 0.00775153, "balance_loss_clip": 1.05235875, "balance_loss_mlp": 1.00101066, "epoch": 0.33230121749586655, "flos": 17346003523200.0, "grad_norm": 1.9434005693126541, "language_loss": 0.77540255, "learning_rate": 3.1155284698575445e-06, "loss": 0.79416323, "num_input_tokens_seen": 118743955, "step": 5527, "time_per_iteration": 2.779862403869629 }, { "auxiliary_loss_clip": 0.01095485, "auxiliary_loss_mlp": 0.01045396, "balance_loss_clip": 1.05338502, "balance_loss_mlp": 1.02997637, "epoch": 0.3323613407485345, "flos": 20997041443200.0, "grad_norm": 2.507974613956182, "language_loss": 0.7222321, "learning_rate": 3.1152051957344434e-06, "loss": 0.7436409, "num_input_tokens_seen": 118763275, "step": 5528, "time_per_iteration": 2.7340548038482666 }, { "auxiliary_loss_clip": 0.01112677, "auxiliary_loss_mlp": 0.01037789, "balance_loss_clip": 1.04796624, "balance_loss_mlp": 1.02333462, "epoch": 0.3324214640012025, "flos": 13152535344000.0, "grad_norm": 1.86583443755271, "language_loss": 0.82796729, "learning_rate": 3.1148818793226497e-06, "loss": 0.84947193, "num_input_tokens_seen": 118781110, "step": 5529, "time_per_iteration": 2.6532175540924072 }, { "auxiliary_loss_clip": 0.01113738, "auxiliary_loss_mlp": 0.00775289, "balance_loss_clip": 1.04990721, "balance_loss_mlp": 1.00095487, "epoch": 0.33248158725387045, "flos": 22273522041600.0, "grad_norm": 2.91854332756289, "language_loss": 0.69676769, "learning_rate": 3.114558520634423e-06, "loss": 0.71565795, "num_input_tokens_seen": 118800620, "step": 5530, "time_per_iteration": 2.708841323852539 }, { "auxiliary_loss_clip": 0.01126266, "auxiliary_loss_mlp": 0.01050268, "balance_loss_clip": 1.05040276, "balance_loss_mlp": 1.03394794, "epoch": 0.3325417105065384, "flos": 20740998320640.0, "grad_norm": 2.896961644373142, "language_loss": 0.75989115, "learning_rate": 3.1142351196820256e-06, "loss": 0.7816565, "num_input_tokens_seen": 118818725, "step": 5531, "time_per_iteration": 2.672736167907715 }, { "auxiliary_loss_clip": 0.01118495, "auxiliary_loss_mlp": 0.0104264, "balance_loss_clip": 1.05284333, "balance_loss_mlp": 1.0260222, "epoch": 0.3326018337592064, "flos": 24790536702720.0, "grad_norm": 2.0175366752259465, "language_loss": 0.73189509, "learning_rate": 3.1139116764777206e-06, "loss": 0.75350642, "num_input_tokens_seen": 118839390, "step": 5532, "time_per_iteration": 4.367426156997681 }, { "auxiliary_loss_clip": 0.0111545, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.0523479, "balance_loss_mlp": 1.01623583, "epoch": 0.33266195701187434, "flos": 14501699112960.0, "grad_norm": 2.031596721272471, "language_loss": 0.65847003, "learning_rate": 3.1135881910337735e-06, "loss": 0.67993426, "num_input_tokens_seen": 118856275, "step": 5533, "time_per_iteration": 2.66029691696167 }, { "auxiliary_loss_clip": 0.01080696, "auxiliary_loss_mlp": 0.01037858, "balance_loss_clip": 1.04513919, "balance_loss_mlp": 1.02147257, "epoch": 0.3327220802645423, "flos": 15304410299520.0, "grad_norm": 2.349847054242377, "language_loss": 0.71297956, "learning_rate": 3.113264663362451e-06, "loss": 0.73416501, "num_input_tokens_seen": 118873830, "step": 5534, "time_per_iteration": 4.27457070350647 }, { "auxiliary_loss_clip": 0.0109151, "auxiliary_loss_mlp": 0.01041219, "balance_loss_clip": 1.04982436, "balance_loss_mlp": 1.02534652, "epoch": 0.3327822035172103, "flos": 23477534951040.0, "grad_norm": 2.0777718313633997, "language_loss": 0.6718514, "learning_rate": 3.1129410934760204e-06, "loss": 0.69317865, "num_input_tokens_seen": 118891560, "step": 5535, "time_per_iteration": 2.774434804916382 }, { "auxiliary_loss_clip": 0.01126643, "auxiliary_loss_mlp": 0.00774026, "balance_loss_clip": 1.04974341, "balance_loss_mlp": 1.00099397, "epoch": 0.33284232676987824, "flos": 25374516019200.0, "grad_norm": 4.4518317449354905, "language_loss": 0.72757089, "learning_rate": 3.1126174813867517e-06, "loss": 0.74657756, "num_input_tokens_seen": 118910260, "step": 5536, "time_per_iteration": 4.211881399154663 }, { "auxiliary_loss_clip": 0.0112639, "auxiliary_loss_mlp": 0.01042922, "balance_loss_clip": 1.05097485, "balance_loss_mlp": 1.02740741, "epoch": 0.3329024500225462, "flos": 23694363400320.0, "grad_norm": 1.6494647990025764, "language_loss": 0.81951326, "learning_rate": 3.112293827106917e-06, "loss": 0.84120637, "num_input_tokens_seen": 118929985, "step": 5537, "time_per_iteration": 2.723938465118408 }, { "auxiliary_loss_clip": 0.01130953, "auxiliary_loss_mlp": 0.01041699, "balance_loss_clip": 1.05334187, "balance_loss_mlp": 1.02568924, "epoch": 0.33296257327521417, "flos": 31723163205120.0, "grad_norm": 2.0361349610506987, "language_loss": 0.71549797, "learning_rate": 3.111970130648789e-06, "loss": 0.73722446, "num_input_tokens_seen": 118951355, "step": 5538, "time_per_iteration": 4.913949489593506 }, { "auxiliary_loss_clip": 0.01120461, "auxiliary_loss_mlp": 0.01037376, "balance_loss_clip": 1.04746032, "balance_loss_mlp": 1.02189124, "epoch": 0.33302269652788213, "flos": 22744705674240.0, "grad_norm": 1.8849765474814903, "language_loss": 0.74648041, "learning_rate": 3.1116463920246424e-06, "loss": 0.76805872, "num_input_tokens_seen": 118970910, "step": 5539, "time_per_iteration": 2.7290310859680176 }, { "auxiliary_loss_clip": 0.01142521, "auxiliary_loss_mlp": 0.01045266, "balance_loss_clip": 1.05175686, "balance_loss_mlp": 1.02844524, "epoch": 0.33308281978055015, "flos": 11473747441920.0, "grad_norm": 1.7887365250144445, "language_loss": 0.71008205, "learning_rate": 3.1113226112467527e-06, "loss": 0.73195994, "num_input_tokens_seen": 118989200, "step": 5540, "time_per_iteration": 2.6340630054473877 }, { "auxiliary_loss_clip": 0.01121672, "auxiliary_loss_mlp": 0.01037813, "balance_loss_clip": 1.04614174, "balance_loss_mlp": 1.02212477, "epoch": 0.3331429430332181, "flos": 38213693112960.0, "grad_norm": 2.2050863595265535, "language_loss": 0.60332179, "learning_rate": 3.1109987883273983e-06, "loss": 0.62491661, "num_input_tokens_seen": 119011030, "step": 5541, "time_per_iteration": 2.9001681804656982 }, { "auxiliary_loss_clip": 0.01116142, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.04896498, "balance_loss_mlp": 1.02827907, "epoch": 0.3332030662858861, "flos": 22528667324160.0, "grad_norm": 1.8682676496278656, "language_loss": 0.68843257, "learning_rate": 3.1106749232788584e-06, "loss": 0.7100479, "num_input_tokens_seen": 119030620, "step": 5542, "time_per_iteration": 2.7336552143096924 }, { "auxiliary_loss_clip": 0.01125827, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.04983997, "balance_loss_mlp": 1.0241369, "epoch": 0.33326318953855405, "flos": 15997773507840.0, "grad_norm": 1.7424785130645766, "language_loss": 0.75545055, "learning_rate": 3.110351016113414e-06, "loss": 0.7771036, "num_input_tokens_seen": 119048015, "step": 5543, "time_per_iteration": 2.7098708152770996 }, { "auxiliary_loss_clip": 0.01059952, "auxiliary_loss_mlp": 0.01049723, "balance_loss_clip": 1.04679465, "balance_loss_mlp": 1.03153133, "epoch": 0.333323312791222, "flos": 25593535198080.0, "grad_norm": 1.720313350609618, "language_loss": 0.75207818, "learning_rate": 3.110027066843348e-06, "loss": 0.77317488, "num_input_tokens_seen": 119066280, "step": 5544, "time_per_iteration": 2.8580381870269775 }, { "auxiliary_loss_clip": 0.01131382, "auxiliary_loss_mlp": 0.01034467, "balance_loss_clip": 1.0470835, "balance_loss_mlp": 1.01900601, "epoch": 0.33338343604389, "flos": 25119550304640.0, "grad_norm": 1.8195187872515122, "language_loss": 0.70631826, "learning_rate": 3.1097030754809456e-06, "loss": 0.7279768, "num_input_tokens_seen": 119087680, "step": 5545, "time_per_iteration": 2.6675262451171875 }, { "auxiliary_loss_clip": 0.01090227, "auxiliary_loss_mlp": 0.01038197, "balance_loss_clip": 1.04591393, "balance_loss_mlp": 1.0225687, "epoch": 0.33344355929655795, "flos": 16947287579520.0, "grad_norm": 2.0475528286172615, "language_loss": 0.68962657, "learning_rate": 3.1093790420384894e-06, "loss": 0.7109108, "num_input_tokens_seen": 119105820, "step": 5546, "time_per_iteration": 2.6620733737945557 }, { "auxiliary_loss_clip": 0.01099462, "auxiliary_loss_mlp": 0.01039292, "balance_loss_clip": 1.04328573, "balance_loss_mlp": 1.02330589, "epoch": 0.3335036825492259, "flos": 27889591345920.0, "grad_norm": 1.6439201248410251, "language_loss": 0.64893299, "learning_rate": 3.1090549665282702e-06, "loss": 0.67032051, "num_input_tokens_seen": 119126630, "step": 5547, "time_per_iteration": 2.7897326946258545 }, { "auxiliary_loss_clip": 0.0111514, "auxiliary_loss_mlp": 0.0103407, "balance_loss_clip": 1.05108774, "balance_loss_mlp": 1.01957989, "epoch": 0.3335638058018939, "flos": 16179553261440.0, "grad_norm": 2.7266915889905765, "language_loss": 0.85475278, "learning_rate": 3.1087308489625742e-06, "loss": 0.8762449, "num_input_tokens_seen": 119143375, "step": 5548, "time_per_iteration": 2.691776990890503 }, { "auxiliary_loss_clip": 0.0112443, "auxiliary_loss_mlp": 0.01038689, "balance_loss_clip": 1.04759526, "balance_loss_mlp": 1.02190423, "epoch": 0.33362392905456184, "flos": 39896108288640.0, "grad_norm": 2.1593805374763466, "language_loss": 0.74996036, "learning_rate": 3.1084066893536945e-06, "loss": 0.77159154, "num_input_tokens_seen": 119166450, "step": 5549, "time_per_iteration": 2.778918743133545 }, { "auxiliary_loss_clip": 0.01129114, "auxiliary_loss_mlp": 0.01040153, "balance_loss_clip": 1.0509795, "balance_loss_mlp": 1.02330887, "epoch": 0.3336840523072298, "flos": 44271212567040.0, "grad_norm": 2.0942861782322577, "language_loss": 0.6826036, "learning_rate": 3.108082487713921e-06, "loss": 0.70429623, "num_input_tokens_seen": 119189645, "step": 5550, "time_per_iteration": 2.8417065143585205 }, { "auxiliary_loss_clip": 0.01094461, "auxiliary_loss_mlp": 0.01050862, "balance_loss_clip": 1.04752803, "balance_loss_mlp": 1.03398156, "epoch": 0.33374417555989777, "flos": 15085678429440.0, "grad_norm": 3.079168539029832, "language_loss": 0.60630679, "learning_rate": 3.1077582440555495e-06, "loss": 0.62776005, "num_input_tokens_seen": 119208045, "step": 5551, "time_per_iteration": 2.7206614017486572 }, { "auxiliary_loss_clip": 0.01096001, "auxiliary_loss_mlp": 0.01040976, "balance_loss_clip": 1.04871941, "balance_loss_mlp": 1.02429891, "epoch": 0.33380429881256574, "flos": 15849174942720.0, "grad_norm": 5.115117677651213, "language_loss": 0.70642906, "learning_rate": 3.1074339583908746e-06, "loss": 0.72779882, "num_input_tokens_seen": 119224910, "step": 5552, "time_per_iteration": 2.7452614307403564 }, { "auxiliary_loss_clip": 0.0109902, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.04360175, "balance_loss_mlp": 1.02150989, "epoch": 0.33386442206523376, "flos": 13480327883520.0, "grad_norm": 2.544991024269762, "language_loss": 0.82464319, "learning_rate": 3.107109630732192e-06, "loss": 0.84600323, "num_input_tokens_seen": 119243290, "step": 5553, "time_per_iteration": 2.755664110183716 }, { "auxiliary_loss_clip": 0.01115353, "auxiliary_loss_mlp": 0.00774656, "balance_loss_clip": 1.05034745, "balance_loss_mlp": 1.00092673, "epoch": 0.3339245453179017, "flos": 16690669839360.0, "grad_norm": 2.0139615227647343, "language_loss": 0.80920005, "learning_rate": 3.1067852610918017e-06, "loss": 0.82810014, "num_input_tokens_seen": 119261195, "step": 5554, "time_per_iteration": 2.701960563659668 }, { "auxiliary_loss_clip": 0.01127546, "auxiliary_loss_mlp": 0.01043388, "balance_loss_clip": 1.05171227, "balance_loss_mlp": 1.02820015, "epoch": 0.3339846685705697, "flos": 24610624456320.0, "grad_norm": 1.6473304910242343, "language_loss": 0.81187713, "learning_rate": 3.1064608494820032e-06, "loss": 0.83358645, "num_input_tokens_seen": 119282845, "step": 5555, "time_per_iteration": 2.697605609893799 }, { "auxiliary_loss_clip": 0.01120953, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.04721272, "balance_loss_mlp": 1.02425706, "epoch": 0.33404479182323765, "flos": 30953812775040.0, "grad_norm": 1.6543240081497628, "language_loss": 0.74369228, "learning_rate": 3.106136395915099e-06, "loss": 0.76529467, "num_input_tokens_seen": 119304430, "step": 5556, "time_per_iteration": 2.7341341972351074 }, { "auxiliary_loss_clip": 0.01124745, "auxiliary_loss_mlp": 0.0103615, "balance_loss_clip": 1.05016208, "balance_loss_mlp": 1.02102232, "epoch": 0.3341049150759056, "flos": 23513301918720.0, "grad_norm": 1.6367363007204896, "language_loss": 0.82058722, "learning_rate": 3.105811900403391e-06, "loss": 0.84219617, "num_input_tokens_seen": 119323830, "step": 5557, "time_per_iteration": 2.6798059940338135 }, { "auxiliary_loss_clip": 0.01115524, "auxiliary_loss_mlp": 0.01038861, "balance_loss_clip": 1.04990697, "balance_loss_mlp": 1.02333987, "epoch": 0.3341650383285736, "flos": 24026824707840.0, "grad_norm": 1.4529426900334401, "language_loss": 0.80220526, "learning_rate": 3.1054873629591855e-06, "loss": 0.82374907, "num_input_tokens_seen": 119346340, "step": 5558, "time_per_iteration": 2.760270118713379 }, { "auxiliary_loss_clip": 0.01108428, "auxiliary_loss_mlp": 0.01040994, "balance_loss_clip": 1.04822016, "balance_loss_mlp": 1.02628982, "epoch": 0.33422516158124155, "flos": 24901967669760.0, "grad_norm": 1.5625296304307381, "language_loss": 0.8137213, "learning_rate": 3.105162783594788e-06, "loss": 0.83521557, "num_input_tokens_seen": 119367285, "step": 5559, "time_per_iteration": 2.7685365676879883 }, { "auxiliary_loss_clip": 0.01096895, "auxiliary_loss_mlp": 0.01042951, "balance_loss_clip": 1.04609013, "balance_loss_mlp": 1.02726293, "epoch": 0.3342852848339095, "flos": 18333403464960.0, "grad_norm": 2.3834321283612003, "language_loss": 0.7164095, "learning_rate": 3.1048381623225074e-06, "loss": 0.73780799, "num_input_tokens_seen": 119385370, "step": 5560, "time_per_iteration": 2.721888780593872 }, { "auxiliary_loss_clip": 0.011201, "auxiliary_loss_mlp": 0.01043409, "balance_loss_clip": 1.05215085, "balance_loss_mlp": 1.02716064, "epoch": 0.3343454080865775, "flos": 30046530119040.0, "grad_norm": 2.1203222418546015, "language_loss": 0.75029516, "learning_rate": 3.1045134991546526e-06, "loss": 0.77193022, "num_input_tokens_seen": 119409150, "step": 5561, "time_per_iteration": 2.8445487022399902 }, { "auxiliary_loss_clip": 0.01115063, "auxiliary_loss_mlp": 0.01036711, "balance_loss_clip": 1.05170679, "balance_loss_mlp": 1.02177453, "epoch": 0.33440553133924544, "flos": 16398823835520.0, "grad_norm": 1.6036143049019338, "language_loss": 0.69467896, "learning_rate": 3.1041887941035355e-06, "loss": 0.71619672, "num_input_tokens_seen": 119426475, "step": 5562, "time_per_iteration": 2.664062023162842 }, { "auxiliary_loss_clip": 0.01125323, "auxiliary_loss_mlp": 0.01042082, "balance_loss_clip": 1.05125499, "balance_loss_mlp": 1.02763367, "epoch": 0.3344656545919134, "flos": 24242072958720.0, "grad_norm": 3.5139835262543504, "language_loss": 0.65094876, "learning_rate": 3.1038640471814685e-06, "loss": 0.67262286, "num_input_tokens_seen": 119446900, "step": 5563, "time_per_iteration": 2.70878529548645 }, { "auxiliary_loss_clip": 0.01078552, "auxiliary_loss_mlp": 0.01045974, "balance_loss_clip": 1.04751515, "balance_loss_mlp": 1.0296303, "epoch": 0.3345257778445814, "flos": 52118843149440.0, "grad_norm": 1.4983314251487456, "language_loss": 0.74106556, "learning_rate": 3.103539258400766e-06, "loss": 0.76231086, "num_input_tokens_seen": 119470945, "step": 5564, "time_per_iteration": 3.0751025676727295 }, { "auxiliary_loss_clip": 0.01035298, "auxiliary_loss_mlp": 0.01009529, "balance_loss_clip": 1.03294694, "balance_loss_mlp": 1.00762165, "epoch": 0.33458590109724934, "flos": 68048602254720.0, "grad_norm": 0.7758359845819034, "language_loss": 0.555296, "learning_rate": 3.103214427773745e-06, "loss": 0.57574433, "num_input_tokens_seen": 119529925, "step": 5565, "time_per_iteration": 3.2246947288513184 }, { "auxiliary_loss_clip": 0.01134316, "auxiliary_loss_mlp": 0.01036162, "balance_loss_clip": 1.05123055, "balance_loss_mlp": 1.02145183, "epoch": 0.3346460243499173, "flos": 37414788768000.0, "grad_norm": 2.332924120890769, "language_loss": 0.65000319, "learning_rate": 3.102889555312721e-06, "loss": 0.67170799, "num_input_tokens_seen": 119550700, "step": 5566, "time_per_iteration": 2.8920817375183105 }, { "auxiliary_loss_clip": 0.01115876, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.05134845, "balance_loss_mlp": 1.02252626, "epoch": 0.3347061476025853, "flos": 18697358021760.0, "grad_norm": 2.3005222539878436, "language_loss": 0.77525175, "learning_rate": 3.102564641030016e-06, "loss": 0.79678619, "num_input_tokens_seen": 119569295, "step": 5567, "time_per_iteration": 2.82244610786438 }, { "auxiliary_loss_clip": 0.01112911, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.0479182, "balance_loss_mlp": 1.02079725, "epoch": 0.3347662708552533, "flos": 13917827537280.0, "grad_norm": 1.7148039320536435, "language_loss": 0.76432139, "learning_rate": 3.102239684937949e-06, "loss": 0.78582156, "num_input_tokens_seen": 119587375, "step": 5568, "time_per_iteration": 2.689354181289673 }, { "auxiliary_loss_clip": 0.01099358, "auxiliary_loss_mlp": 0.01048314, "balance_loss_clip": 1.04898834, "balance_loss_mlp": 1.03163624, "epoch": 0.33482639410792125, "flos": 19750402068480.0, "grad_norm": 3.260707250765708, "language_loss": 0.70965171, "learning_rate": 3.101914687048842e-06, "loss": 0.73112851, "num_input_tokens_seen": 119604530, "step": 5569, "time_per_iteration": 2.747023344039917 }, { "auxiliary_loss_clip": 0.01099669, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.04569411, "balance_loss_mlp": 1.01819277, "epoch": 0.3348865173605892, "flos": 16102991422080.0, "grad_norm": 2.127450904564192, "language_loss": 0.89788258, "learning_rate": 3.10158964737502e-06, "loss": 0.91922712, "num_input_tokens_seen": 119621025, "step": 5570, "time_per_iteration": 2.810328960418701 }, { "auxiliary_loss_clip": 0.01098742, "auxiliary_loss_mlp": 0.01034906, "balance_loss_clip": 1.04593182, "balance_loss_mlp": 1.01970696, "epoch": 0.3349466406132572, "flos": 25008945350400.0, "grad_norm": 2.0196203016458245, "language_loss": 0.79848439, "learning_rate": 3.101264565928808e-06, "loss": 0.81982088, "num_input_tokens_seen": 119641725, "step": 5571, "time_per_iteration": 4.5300047397613525 }, { "auxiliary_loss_clip": 0.01052126, "auxiliary_loss_mlp": 0.00754598, "balance_loss_clip": 1.02251923, "balance_loss_mlp": 1.0014987, "epoch": 0.33500676386592515, "flos": 54319991564160.0, "grad_norm": 0.8956854098175919, "language_loss": 0.5596205, "learning_rate": 3.1009394427225335e-06, "loss": 0.57768774, "num_input_tokens_seen": 119693560, "step": 5572, "time_per_iteration": 3.0931503772735596 }, { "auxiliary_loss_clip": 0.01137277, "auxiliary_loss_mlp": 0.01047626, "balance_loss_clip": 1.05220318, "balance_loss_mlp": 1.03196192, "epoch": 0.3350668871185931, "flos": 26797332625920.0, "grad_norm": 2.019282888464976, "language_loss": 0.78090006, "learning_rate": 3.1006142777685257e-06, "loss": 0.8027491, "num_input_tokens_seen": 119712935, "step": 5573, "time_per_iteration": 2.710340738296509 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01046551, "balance_loss_clip": 1.05004358, "balance_loss_mlp": 1.02974284, "epoch": 0.3351270103712611, "flos": 33510508986240.0, "grad_norm": 3.3664569303363834, "language_loss": 0.7253201, "learning_rate": 3.1002890710791133e-06, "loss": 0.74684364, "num_input_tokens_seen": 119731680, "step": 5574, "time_per_iteration": 4.390132427215576 }, { "auxiliary_loss_clip": 0.01119913, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.04622221, "balance_loss_mlp": 1.01882839, "epoch": 0.33518713362392905, "flos": 26506240807680.0, "grad_norm": 1.806126996337021, "language_loss": 0.87605375, "learning_rate": 3.0999638226666287e-06, "loss": 0.89758873, "num_input_tokens_seen": 119752155, "step": 5575, "time_per_iteration": 2.6650984287261963 }, { "auxiliary_loss_clip": 0.01119423, "auxiliary_loss_mlp": 0.01044892, "balance_loss_clip": 1.05073953, "balance_loss_mlp": 1.02783298, "epoch": 0.335247256876597, "flos": 17232345912960.0, "grad_norm": 2.5292682388354404, "language_loss": 0.82834053, "learning_rate": 3.0996385325434063e-06, "loss": 0.84998369, "num_input_tokens_seen": 119769195, "step": 5576, "time_per_iteration": 4.143759727478027 }, { "auxiliary_loss_clip": 0.01126035, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.04928613, "balance_loss_mlp": 1.02584612, "epoch": 0.335307380129265, "flos": 25629373992960.0, "grad_norm": 2.62081807641563, "language_loss": 0.72970062, "learning_rate": 3.0993132007217806e-06, "loss": 0.75138342, "num_input_tokens_seen": 119786810, "step": 5577, "time_per_iteration": 4.264250755310059 }, { "auxiliary_loss_clip": 0.01102749, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.05250812, "balance_loss_mlp": 1.02409935, "epoch": 0.33536750338193294, "flos": 19680089195520.0, "grad_norm": 2.2461501835528255, "language_loss": 0.8147049, "learning_rate": 3.0989878272140883e-06, "loss": 0.83613431, "num_input_tokens_seen": 119805395, "step": 5578, "time_per_iteration": 2.748187780380249 }, { "auxiliary_loss_clip": 0.01072311, "auxiliary_loss_mlp": 0.0077377, "balance_loss_clip": 1.04737353, "balance_loss_mlp": 1.00086129, "epoch": 0.3354276266346009, "flos": 18332613365760.0, "grad_norm": 2.081067644088489, "language_loss": 0.72135395, "learning_rate": 3.0986624120326676e-06, "loss": 0.73981476, "num_input_tokens_seen": 119823135, "step": 5579, "time_per_iteration": 2.797891616821289 }, { "auxiliary_loss_clip": 0.0108369, "auxiliary_loss_mlp": 0.01042635, "balance_loss_clip": 1.04664183, "balance_loss_mlp": 1.02608919, "epoch": 0.3354877498872689, "flos": 17858556645120.0, "grad_norm": 2.1516301629227255, "language_loss": 0.81264424, "learning_rate": 3.0983369551898573e-06, "loss": 0.83390749, "num_input_tokens_seen": 119842265, "step": 5580, "time_per_iteration": 2.76359224319458 }, { "auxiliary_loss_clip": 0.01112891, "auxiliary_loss_mlp": 0.01034758, "balance_loss_clip": 1.04777932, "balance_loss_mlp": 1.01918936, "epoch": 0.3355478731399369, "flos": 24717745791360.0, "grad_norm": 1.787418199208594, "language_loss": 0.78071463, "learning_rate": 3.0980114566980003e-06, "loss": 0.80219114, "num_input_tokens_seen": 119862500, "step": 5581, "time_per_iteration": 2.6893699169158936 }, { "auxiliary_loss_clip": 0.01102381, "auxiliary_loss_mlp": 0.01044533, "balance_loss_clip": 1.04555583, "balance_loss_mlp": 1.02674723, "epoch": 0.33560799639260486, "flos": 16873886136960.0, "grad_norm": 3.5541134032025528, "language_loss": 0.74734783, "learning_rate": 3.0976859165694384e-06, "loss": 0.76881701, "num_input_tokens_seen": 119880160, "step": 5582, "time_per_iteration": 2.750110149383545 }, { "auxiliary_loss_clip": 0.01109205, "auxiliary_loss_mlp": 0.0104468, "balance_loss_clip": 1.04334664, "balance_loss_mlp": 1.02793145, "epoch": 0.3356681196452728, "flos": 18333511205760.0, "grad_norm": 2.0738327777636574, "language_loss": 0.82039702, "learning_rate": 3.0973603348165166e-06, "loss": 0.84193587, "num_input_tokens_seen": 119899040, "step": 5583, "time_per_iteration": 2.629065990447998 }, { "auxiliary_loss_clip": 0.01113126, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.04719925, "balance_loss_mlp": 1.0322051, "epoch": 0.3357282428979408, "flos": 34750612085760.0, "grad_norm": 2.1437775006956814, "language_loss": 0.77524137, "learning_rate": 3.097034711451581e-06, "loss": 0.79684973, "num_input_tokens_seen": 119921120, "step": 5584, "time_per_iteration": 2.9303438663482666 }, { "auxiliary_loss_clip": 0.01115168, "auxiliary_loss_mlp": 0.01043431, "balance_loss_clip": 1.04803944, "balance_loss_mlp": 1.02755225, "epoch": 0.33578836615060875, "flos": 21580087006080.0, "grad_norm": 1.8068970963649096, "language_loss": 0.76473475, "learning_rate": 3.0967090464869795e-06, "loss": 0.78632081, "num_input_tokens_seen": 119940165, "step": 5585, "time_per_iteration": 2.7168867588043213 }, { "auxiliary_loss_clip": 0.01120824, "auxiliary_loss_mlp": 0.01040676, "balance_loss_clip": 1.04579937, "balance_loss_mlp": 1.02442741, "epoch": 0.3358484894032767, "flos": 24530291688960.0, "grad_norm": 1.8490215812193886, "language_loss": 0.77754235, "learning_rate": 3.0963833399350608e-06, "loss": 0.79915732, "num_input_tokens_seen": 119959730, "step": 5586, "time_per_iteration": 2.88452410697937 }, { "auxiliary_loss_clip": 0.01100333, "auxiliary_loss_mlp": 0.01057166, "balance_loss_clip": 1.0484302, "balance_loss_mlp": 1.03673398, "epoch": 0.3359086126559447, "flos": 22455589104000.0, "grad_norm": 1.6698470723885088, "language_loss": 0.810045, "learning_rate": 3.0960575918081756e-06, "loss": 0.8316201, "num_input_tokens_seen": 119979315, "step": 5587, "time_per_iteration": 2.7335522174835205 }, { "auxiliary_loss_clip": 0.01130777, "auxiliary_loss_mlp": 0.01042735, "balance_loss_clip": 1.04809558, "balance_loss_mlp": 1.02837586, "epoch": 0.33596873590861265, "flos": 16543687386240.0, "grad_norm": 1.8626695130182664, "language_loss": 0.67307252, "learning_rate": 3.095731802118677e-06, "loss": 0.69480765, "num_input_tokens_seen": 119996140, "step": 5588, "time_per_iteration": 2.5910611152648926 }, { "auxiliary_loss_clip": 0.01113468, "auxiliary_loss_mlp": 0.00774774, "balance_loss_clip": 1.04702032, "balance_loss_mlp": 1.0007664, "epoch": 0.3360288591612806, "flos": 31175812782720.0, "grad_norm": 2.758181662666948, "language_loss": 0.70459288, "learning_rate": 3.095405970878919e-06, "loss": 0.72347522, "num_input_tokens_seen": 120017720, "step": 5589, "time_per_iteration": 2.7966625690460205 }, { "auxiliary_loss_clip": 0.01110605, "auxiliary_loss_mlp": 0.01046945, "balance_loss_clip": 1.04478765, "balance_loss_mlp": 1.02951634, "epoch": 0.3360889824139486, "flos": 23696913265920.0, "grad_norm": 6.820816752821097, "language_loss": 0.6717155, "learning_rate": 3.0950800981012567e-06, "loss": 0.69329101, "num_input_tokens_seen": 120036335, "step": 5590, "time_per_iteration": 2.804384231567383 }, { "auxiliary_loss_clip": 0.01107091, "auxiliary_loss_mlp": 0.01044113, "balance_loss_clip": 1.05176187, "balance_loss_mlp": 1.02741194, "epoch": 0.33614910566661654, "flos": 19318109886720.0, "grad_norm": 2.108159500929249, "language_loss": 0.731767, "learning_rate": 3.094754183798047e-06, "loss": 0.75327909, "num_input_tokens_seen": 120056120, "step": 5591, "time_per_iteration": 2.7423245906829834 }, { "auxiliary_loss_clip": 0.01132777, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04753232, "balance_loss_mlp": 1.02802432, "epoch": 0.3362092289192845, "flos": 16472261191680.0, "grad_norm": 2.4812698890164238, "language_loss": 0.6978277, "learning_rate": 3.0944282279816493e-06, "loss": 0.71958983, "num_input_tokens_seen": 120073650, "step": 5592, "time_per_iteration": 2.624565362930298 }, { "auxiliary_loss_clip": 0.01109265, "auxiliary_loss_mlp": 0.01035799, "balance_loss_clip": 1.0459764, "balance_loss_mlp": 1.02034986, "epoch": 0.33626935217195253, "flos": 24243581329920.0, "grad_norm": 2.2034044743639676, "language_loss": 0.76362681, "learning_rate": 3.094102230664423e-06, "loss": 0.78507739, "num_input_tokens_seen": 120093260, "step": 5593, "time_per_iteration": 2.7709946632385254 }, { "auxiliary_loss_clip": 0.01100555, "auxiliary_loss_mlp": 0.00775613, "balance_loss_clip": 1.04247713, "balance_loss_mlp": 1.00074506, "epoch": 0.3363294754246205, "flos": 19718765164800.0, "grad_norm": 2.2856177577930876, "language_loss": 0.7229932, "learning_rate": 3.093776191858731e-06, "loss": 0.74175489, "num_input_tokens_seen": 120111830, "step": 5594, "time_per_iteration": 2.7880120277404785 }, { "auxiliary_loss_clip": 0.01079557, "auxiliary_loss_mlp": 0.00778898, "balance_loss_clip": 1.04157269, "balance_loss_mlp": 1.00079668, "epoch": 0.33638959867728846, "flos": 22596286677120.0, "grad_norm": 3.2295215673950293, "language_loss": 0.79940557, "learning_rate": 3.0934501115769363e-06, "loss": 0.81799006, "num_input_tokens_seen": 120130470, "step": 5595, "time_per_iteration": 2.8623924255371094 }, { "auxiliary_loss_clip": 0.01111225, "auxiliary_loss_mlp": 0.01039348, "balance_loss_clip": 1.04694319, "balance_loss_mlp": 1.02456045, "epoch": 0.3364497219299564, "flos": 20994742972800.0, "grad_norm": 3.201033356603963, "language_loss": 0.81473815, "learning_rate": 3.0931239898314037e-06, "loss": 0.83624387, "num_input_tokens_seen": 120150735, "step": 5596, "time_per_iteration": 2.900319814682007 }, { "auxiliary_loss_clip": 0.01113286, "auxiliary_loss_mlp": 0.01044516, "balance_loss_clip": 1.04682481, "balance_loss_mlp": 1.02877986, "epoch": 0.3365098451826244, "flos": 25228610974080.0, "grad_norm": 1.642499178477658, "language_loss": 0.75647599, "learning_rate": 3.0927978266344995e-06, "loss": 0.778054, "num_input_tokens_seen": 120173230, "step": 5597, "time_per_iteration": 2.8402984142303467 }, { "auxiliary_loss_clip": 0.0112326, "auxiliary_loss_mlp": 0.01034747, "balance_loss_clip": 1.04734445, "balance_loss_mlp": 1.01902318, "epoch": 0.33656996843529235, "flos": 24571697091840.0, "grad_norm": 1.910742765655482, "language_loss": 0.78611934, "learning_rate": 3.0924716219985916e-06, "loss": 0.80769938, "num_input_tokens_seen": 120191860, "step": 5598, "time_per_iteration": 2.7380945682525635 }, { "auxiliary_loss_clip": 0.01141013, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.04969454, "balance_loss_mlp": 1.0235827, "epoch": 0.3366300916879603, "flos": 44091120752640.0, "grad_norm": 1.511676842650176, "language_loss": 0.6446076, "learning_rate": 3.0921453759360514e-06, "loss": 0.66643113, "num_input_tokens_seen": 120219195, "step": 5599, "time_per_iteration": 2.845017433166504 }, { "auxiliary_loss_clip": 0.01103042, "auxiliary_loss_mlp": 0.01054079, "balance_loss_clip": 1.04571164, "balance_loss_mlp": 1.03408813, "epoch": 0.3366902149406283, "flos": 13879869840000.0, "grad_norm": 3.0475721260430486, "language_loss": 0.8262403, "learning_rate": 3.091819088459249e-06, "loss": 0.84781146, "num_input_tokens_seen": 120232950, "step": 5600, "time_per_iteration": 2.690335512161255 }, { "auxiliary_loss_clip": 0.01128117, "auxiliary_loss_mlp": 0.01045257, "balance_loss_clip": 1.04780042, "balance_loss_mlp": 1.02822232, "epoch": 0.33675033819329625, "flos": 16253098358400.0, "grad_norm": 2.4530209101601037, "language_loss": 0.83457136, "learning_rate": 3.0914927595805573e-06, "loss": 0.856305, "num_input_tokens_seen": 120248865, "step": 5601, "time_per_iteration": 2.760735034942627 }, { "auxiliary_loss_clip": 0.01122256, "auxiliary_loss_mlp": 0.0103673, "balance_loss_clip": 1.04873729, "balance_loss_mlp": 1.02092862, "epoch": 0.3368104614459642, "flos": 17055809544960.0, "grad_norm": 2.1704904083215903, "language_loss": 0.83173311, "learning_rate": 3.0911663893123507e-06, "loss": 0.85332292, "num_input_tokens_seen": 120267820, "step": 5602, "time_per_iteration": 2.6818981170654297 }, { "auxiliary_loss_clip": 0.0113558, "auxiliary_loss_mlp": 0.01053921, "balance_loss_clip": 1.04765427, "balance_loss_mlp": 1.03756535, "epoch": 0.3368705846986322, "flos": 17858628472320.0, "grad_norm": 3.8525391607572477, "language_loss": 0.69046748, "learning_rate": 3.0908399776670048e-06, "loss": 0.71236247, "num_input_tokens_seen": 120286540, "step": 5603, "time_per_iteration": 2.6086158752441406 }, { "auxiliary_loss_clip": 0.01116527, "auxiliary_loss_mlp": 0.01042678, "balance_loss_clip": 1.04876411, "balance_loss_mlp": 1.02617979, "epoch": 0.33693070795130015, "flos": 22929502170240.0, "grad_norm": 1.5388557517073465, "language_loss": 0.83146536, "learning_rate": 3.090513524656898e-06, "loss": 0.85305738, "num_input_tokens_seen": 120307305, "step": 5604, "time_per_iteration": 2.7269375324249268 }, { "auxiliary_loss_clip": 0.01095396, "auxiliary_loss_mlp": 0.01043597, "balance_loss_clip": 1.04384422, "balance_loss_mlp": 1.02708673, "epoch": 0.3369908312039681, "flos": 22017443005440.0, "grad_norm": 1.634462052702842, "language_loss": 0.73473096, "learning_rate": 3.090187030294409e-06, "loss": 0.75612092, "num_input_tokens_seen": 120327845, "step": 5605, "time_per_iteration": 2.712197780609131 }, { "auxiliary_loss_clip": 0.0111786, "auxiliary_loss_mlp": 0.01038834, "balance_loss_clip": 1.04761815, "balance_loss_mlp": 1.02235925, "epoch": 0.33705095445663613, "flos": 11801970944640.0, "grad_norm": 3.8834830456250913, "language_loss": 0.83444858, "learning_rate": 3.089860494591919e-06, "loss": 0.85601556, "num_input_tokens_seen": 120343255, "step": 5606, "time_per_iteration": 2.6680989265441895 }, { "auxiliary_loss_clip": 0.01108557, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.04293787, "balance_loss_mlp": 1.02370059, "epoch": 0.3371110777093041, "flos": 25046400257280.0, "grad_norm": 2.0409696956182946, "language_loss": 0.67694759, "learning_rate": 3.089533917561809e-06, "loss": 0.69842374, "num_input_tokens_seen": 120361745, "step": 5607, "time_per_iteration": 2.8172407150268555 }, { "auxiliary_loss_clip": 0.01121964, "auxiliary_loss_mlp": 0.01053243, "balance_loss_clip": 1.04604626, "balance_loss_mlp": 1.03458667, "epoch": 0.33717120096197206, "flos": 26579031719040.0, "grad_norm": 1.9822534609557965, "language_loss": 0.70618403, "learning_rate": 3.089207299216464e-06, "loss": 0.72793615, "num_input_tokens_seen": 120380565, "step": 5608, "time_per_iteration": 2.669027090072632 }, { "auxiliary_loss_clip": 0.01055328, "auxiliary_loss_mlp": 0.01040575, "balance_loss_clip": 1.03931713, "balance_loss_mlp": 1.02449393, "epoch": 0.33723132421464, "flos": 15158541168000.0, "grad_norm": 1.931960515128334, "language_loss": 0.79290974, "learning_rate": 3.088880639568269e-06, "loss": 0.81386876, "num_input_tokens_seen": 120399235, "step": 5609, "time_per_iteration": 2.7859673500061035 }, { "auxiliary_loss_clip": 0.01124996, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.04914641, "balance_loss_mlp": 1.02387619, "epoch": 0.337291447467308, "flos": 23436093634560.0, "grad_norm": 1.7580059679361764, "language_loss": 0.82490408, "learning_rate": 3.0885539386296114e-06, "loss": 0.8465687, "num_input_tokens_seen": 120420095, "step": 5610, "time_per_iteration": 4.319208145141602 }, { "auxiliary_loss_clip": 0.01123032, "auxiliary_loss_mlp": 0.0104256, "balance_loss_clip": 1.0486002, "balance_loss_mlp": 1.02448845, "epoch": 0.33735157071997596, "flos": 17238163916160.0, "grad_norm": 2.0228863025134824, "language_loss": 0.82122159, "learning_rate": 3.088227196412879e-06, "loss": 0.84287751, "num_input_tokens_seen": 120437690, "step": 5611, "time_per_iteration": 2.6127841472625732 }, { "auxiliary_loss_clip": 0.01116485, "auxiliary_loss_mlp": 0.01045036, "balance_loss_clip": 1.04920387, "balance_loss_mlp": 1.02683246, "epoch": 0.3374116939726439, "flos": 28257388657920.0, "grad_norm": 2.0856936331065037, "language_loss": 0.79704899, "learning_rate": 3.0879004129304626e-06, "loss": 0.81866419, "num_input_tokens_seen": 120459240, "step": 5612, "time_per_iteration": 2.7237493991851807 }, { "auxiliary_loss_clip": 0.01076712, "auxiliary_loss_mlp": 0.01040315, "balance_loss_clip": 1.04079247, "balance_loss_mlp": 1.02410221, "epoch": 0.3374718172253119, "flos": 35919396731520.0, "grad_norm": 2.390785367991082, "language_loss": 0.70200634, "learning_rate": 3.087573588194753e-06, "loss": 0.7231766, "num_input_tokens_seen": 120481090, "step": 5613, "time_per_iteration": 4.43415379524231 }, { "auxiliary_loss_clip": 0.01118495, "auxiliary_loss_mlp": 0.01037291, "balance_loss_clip": 1.04903054, "balance_loss_mlp": 1.02097178, "epoch": 0.33753194047797985, "flos": 18186672407040.0, "grad_norm": 2.1929626699857585, "language_loss": 0.79407388, "learning_rate": 3.087246722218144e-06, "loss": 0.81563175, "num_input_tokens_seen": 120500045, "step": 5614, "time_per_iteration": 2.6484436988830566 }, { "auxiliary_loss_clip": 0.01105902, "auxiliary_loss_mlp": 0.01046863, "balance_loss_clip": 1.04512811, "balance_loss_mlp": 1.02796841, "epoch": 0.3375920637306478, "flos": 23148916398720.0, "grad_norm": 1.967540834348034, "language_loss": 0.91201901, "learning_rate": 3.086919815013031e-06, "loss": 0.93354666, "num_input_tokens_seen": 120521125, "step": 5615, "time_per_iteration": 4.486853361129761 }, { "auxiliary_loss_clip": 0.01119294, "auxiliary_loss_mlp": 0.01042109, "balance_loss_clip": 1.04542458, "balance_loss_mlp": 1.0265168, "epoch": 0.3376521869833158, "flos": 23112215677440.0, "grad_norm": 2.688104519924193, "language_loss": 0.80865037, "learning_rate": 3.086592866591809e-06, "loss": 0.83026439, "num_input_tokens_seen": 120539180, "step": 5616, "time_per_iteration": 2.693419933319092 }, { "auxiliary_loss_clip": 0.01132102, "auxiliary_loss_mlp": 0.00776249, "balance_loss_clip": 1.04987526, "balance_loss_mlp": 1.00074387, "epoch": 0.33771231023598375, "flos": 19274585581440.0, "grad_norm": 5.641479508637021, "language_loss": 0.83967853, "learning_rate": 3.0862658769668774e-06, "loss": 0.85876203, "num_input_tokens_seen": 120556280, "step": 5617, "time_per_iteration": 4.261611461639404 }, { "auxiliary_loss_clip": 0.01065047, "auxiliary_loss_mlp": 0.01048039, "balance_loss_clip": 1.0423851, "balance_loss_mlp": 1.030074, "epoch": 0.3377724334886517, "flos": 18150187167360.0, "grad_norm": 2.2609860925126117, "language_loss": 0.80159199, "learning_rate": 3.0859388461506343e-06, "loss": 0.82272285, "num_input_tokens_seen": 120575395, "step": 5618, "time_per_iteration": 2.8115389347076416 }, { "auxiliary_loss_clip": 0.01092947, "auxiliary_loss_mlp": 0.01037796, "balance_loss_clip": 1.04605365, "balance_loss_mlp": 1.02121353, "epoch": 0.3378325567413197, "flos": 25775997310080.0, "grad_norm": 1.9598490702889584, "language_loss": 0.7111814, "learning_rate": 3.085611774155481e-06, "loss": 0.73248887, "num_input_tokens_seen": 120596075, "step": 5619, "time_per_iteration": 2.86958909034729 }, { "auxiliary_loss_clip": 0.01116213, "auxiliary_loss_mlp": 0.01047745, "balance_loss_clip": 1.04749656, "balance_loss_mlp": 1.03167593, "epoch": 0.3378926799939877, "flos": 21317112558720.0, "grad_norm": 2.630730252639156, "language_loss": 0.70144761, "learning_rate": 3.085284660993821e-06, "loss": 0.72308713, "num_input_tokens_seen": 120614195, "step": 5620, "time_per_iteration": 2.6953368186950684 }, { "auxiliary_loss_clip": 0.01136416, "auxiliary_loss_mlp": 0.01047216, "balance_loss_clip": 1.05076015, "balance_loss_mlp": 1.03201699, "epoch": 0.33795280324665566, "flos": 24900028335360.0, "grad_norm": 1.8373178803043773, "language_loss": 0.67899036, "learning_rate": 3.084957506678058e-06, "loss": 0.70082676, "num_input_tokens_seen": 120634475, "step": 5621, "time_per_iteration": 2.6531872749328613 }, { "auxiliary_loss_clip": 0.0110792, "auxiliary_loss_mlp": 0.01044445, "balance_loss_clip": 1.04716897, "balance_loss_mlp": 1.02814865, "epoch": 0.33801292649932363, "flos": 24753943722240.0, "grad_norm": 1.7693089540657438, "language_loss": 0.82862681, "learning_rate": 3.0846303112205975e-06, "loss": 0.85015041, "num_input_tokens_seen": 120654980, "step": 5622, "time_per_iteration": 2.7764267921447754 }, { "auxiliary_loss_clip": 0.01097036, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.043239, "balance_loss_mlp": 1.02565813, "epoch": 0.3380730497519916, "flos": 26723967096960.0, "grad_norm": 7.015051283901371, "language_loss": 0.73815429, "learning_rate": 3.0843030746338464e-06, "loss": 0.75953692, "num_input_tokens_seen": 120676245, "step": 5623, "time_per_iteration": 2.7962961196899414 }, { "auxiliary_loss_clip": 0.0104645, "auxiliary_loss_mlp": 0.01031816, "balance_loss_clip": 1.03514934, "balance_loss_mlp": 1.0298605, "epoch": 0.33813317300465956, "flos": 70035756416640.0, "grad_norm": 0.757644747116446, "language_loss": 0.55002284, "learning_rate": 3.083975796930215e-06, "loss": 0.57080543, "num_input_tokens_seen": 120741965, "step": 5624, "time_per_iteration": 3.3495559692382812 }, { "auxiliary_loss_clip": 0.01091887, "auxiliary_loss_mlp": 0.01055525, "balance_loss_clip": 1.04508519, "balance_loss_mlp": 1.03704786, "epoch": 0.3381932962573275, "flos": 24097317148800.0, "grad_norm": 3.1490866232839876, "language_loss": 0.73299229, "learning_rate": 3.083648478122111e-06, "loss": 0.75446641, "num_input_tokens_seen": 120760410, "step": 5625, "time_per_iteration": 2.7474253177642822 }, { "auxiliary_loss_clip": 0.01127839, "auxiliary_loss_mlp": 0.01045252, "balance_loss_clip": 1.04838002, "balance_loss_mlp": 1.02828884, "epoch": 0.3382534195099955, "flos": 19278248768640.0, "grad_norm": 5.828984180477566, "language_loss": 0.70578009, "learning_rate": 3.0833211182219497e-06, "loss": 0.72751105, "num_input_tokens_seen": 120777705, "step": 5626, "time_per_iteration": 2.6597115993499756 }, { "auxiliary_loss_clip": 0.01108172, "auxiliary_loss_mlp": 0.01041744, "balance_loss_clip": 1.04509664, "balance_loss_mlp": 1.02605569, "epoch": 0.33831354276266346, "flos": 25226240676480.0, "grad_norm": 3.2927176036830574, "language_loss": 0.80853224, "learning_rate": 3.0829937172421425e-06, "loss": 0.83003139, "num_input_tokens_seen": 120798660, "step": 5627, "time_per_iteration": 2.730774402618408 }, { "auxiliary_loss_clip": 0.01131612, "auxiliary_loss_mlp": 0.0077564, "balance_loss_clip": 1.05286694, "balance_loss_mlp": 1.00064421, "epoch": 0.3383736660153314, "flos": 23112000195840.0, "grad_norm": 2.306116347111899, "language_loss": 0.80454439, "learning_rate": 3.0826662751951055e-06, "loss": 0.82361686, "num_input_tokens_seen": 120816705, "step": 5628, "time_per_iteration": 2.691471576690674 }, { "auxiliary_loss_clip": 0.01080566, "auxiliary_loss_mlp": 0.01046147, "balance_loss_clip": 1.04250276, "balance_loss_mlp": 1.02787185, "epoch": 0.3384337892679994, "flos": 23477139901440.0, "grad_norm": 3.64262689820424, "language_loss": 0.77174091, "learning_rate": 3.082338792093254e-06, "loss": 0.79300809, "num_input_tokens_seen": 120835375, "step": 5629, "time_per_iteration": 2.7564992904663086 }, { "auxiliary_loss_clip": 0.01116368, "auxiliary_loss_mlp": 0.01046104, "balance_loss_clip": 1.04699719, "balance_loss_mlp": 1.02819836, "epoch": 0.33849391252066735, "flos": 19425805839360.0, "grad_norm": 4.669184863549949, "language_loss": 0.84738326, "learning_rate": 3.0820112679490074e-06, "loss": 0.86900795, "num_input_tokens_seen": 120854260, "step": 5630, "time_per_iteration": 2.7284910678863525 }, { "auxiliary_loss_clip": 0.0108732, "auxiliary_loss_mlp": 0.01055965, "balance_loss_clip": 1.04692125, "balance_loss_mlp": 1.03889382, "epoch": 0.3385540357733353, "flos": 21064840364160.0, "grad_norm": 2.0951078731071204, "language_loss": 0.71627271, "learning_rate": 3.0816837027747857e-06, "loss": 0.73770559, "num_input_tokens_seen": 120871590, "step": 5631, "time_per_iteration": 2.7423501014709473 }, { "auxiliary_loss_clip": 0.01036653, "auxiliary_loss_mlp": 0.01008716, "balance_loss_clip": 1.02691352, "balance_loss_mlp": 1.00683236, "epoch": 0.3386141590260033, "flos": 69208013450880.0, "grad_norm": 0.8383263502294551, "language_loss": 0.56103444, "learning_rate": 3.0813560965830084e-06, "loss": 0.58148813, "num_input_tokens_seen": 120925550, "step": 5632, "time_per_iteration": 3.24780535697937 }, { "auxiliary_loss_clip": 0.01122742, "auxiliary_loss_mlp": 0.01038822, "balance_loss_clip": 1.05064476, "balance_loss_mlp": 1.02198935, "epoch": 0.3386742822786713, "flos": 25519487310720.0, "grad_norm": 1.5341010429525646, "language_loss": 0.80410492, "learning_rate": 3.0810284493861005e-06, "loss": 0.82572055, "num_input_tokens_seen": 120947620, "step": 5633, "time_per_iteration": 2.6492738723754883 }, { "auxiliary_loss_clip": 0.01099799, "auxiliary_loss_mlp": 0.01044702, "balance_loss_clip": 1.04435778, "balance_loss_mlp": 1.02854943, "epoch": 0.33873440553133927, "flos": 23623116773760.0, "grad_norm": 2.1401050060877997, "language_loss": 0.59013391, "learning_rate": 3.0807007611964855e-06, "loss": 0.61157894, "num_input_tokens_seen": 120965205, "step": 5634, "time_per_iteration": 2.7261369228363037 }, { "auxiliary_loss_clip": 0.01106157, "auxiliary_loss_mlp": 0.01040516, "balance_loss_clip": 1.04877985, "balance_loss_mlp": 1.02482784, "epoch": 0.33879452878400723, "flos": 17088882992640.0, "grad_norm": 1.8243057386875807, "language_loss": 0.92440355, "learning_rate": 3.080373032026589e-06, "loss": 0.94587028, "num_input_tokens_seen": 120983560, "step": 5635, "time_per_iteration": 2.627788782119751 }, { "auxiliary_loss_clip": 0.01091476, "auxiliary_loss_mlp": 0.01039192, "balance_loss_clip": 1.05005646, "balance_loss_mlp": 1.02288401, "epoch": 0.3388546520366752, "flos": 15742053607680.0, "grad_norm": 2.00681285666687, "language_loss": 0.75539577, "learning_rate": 3.0800452618888386e-06, "loss": 0.7767024, "num_input_tokens_seen": 121001400, "step": 5636, "time_per_iteration": 2.706772565841675 }, { "auxiliary_loss_clip": 0.0112617, "auxiliary_loss_mlp": 0.01044921, "balance_loss_clip": 1.05089188, "balance_loss_mlp": 1.02866137, "epoch": 0.33891477528934316, "flos": 22418744728320.0, "grad_norm": 1.7127540900641318, "language_loss": 0.83448696, "learning_rate": 3.0797174507956637e-06, "loss": 0.85619783, "num_input_tokens_seen": 121021760, "step": 5637, "time_per_iteration": 2.6864166259765625 }, { "auxiliary_loss_clip": 0.0109052, "auxiliary_loss_mlp": 0.01051499, "balance_loss_clip": 1.04899251, "balance_loss_mlp": 1.03193665, "epoch": 0.3389748985420111, "flos": 17274828723840.0, "grad_norm": 1.650296659926583, "language_loss": 0.70123053, "learning_rate": 3.079389598759495e-06, "loss": 0.72265071, "num_input_tokens_seen": 121041070, "step": 5638, "time_per_iteration": 2.7513418197631836 }, { "auxiliary_loss_clip": 0.01107421, "auxiliary_loss_mlp": 0.01049541, "balance_loss_clip": 1.0486834, "balance_loss_mlp": 1.0325892, "epoch": 0.3390350217946791, "flos": 27744979190400.0, "grad_norm": 3.471125425253904, "language_loss": 0.80819786, "learning_rate": 3.079061705792765e-06, "loss": 0.82976747, "num_input_tokens_seen": 121060890, "step": 5639, "time_per_iteration": 2.8025810718536377 }, { "auxiliary_loss_clip": 0.01143398, "auxiliary_loss_mlp": 0.01048836, "balance_loss_clip": 1.0533762, "balance_loss_mlp": 1.03158689, "epoch": 0.33909514504734706, "flos": 20339804338560.0, "grad_norm": 8.162571098362656, "language_loss": 0.67619336, "learning_rate": 3.078733771907907e-06, "loss": 0.69811565, "num_input_tokens_seen": 121079135, "step": 5640, "time_per_iteration": 2.662127733230591 }, { "auxiliary_loss_clip": 0.01114186, "auxiliary_loss_mlp": 0.01038526, "balance_loss_clip": 1.04930854, "balance_loss_mlp": 1.02196789, "epoch": 0.339155268300015, "flos": 14830030356480.0, "grad_norm": 1.6687164879604648, "language_loss": 0.69589841, "learning_rate": 3.0784057971173554e-06, "loss": 0.71742553, "num_input_tokens_seen": 121097685, "step": 5641, "time_per_iteration": 2.6596109867095947 }, { "auxiliary_loss_clip": 0.01142481, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.05451512, "balance_loss_mlp": 1.02698565, "epoch": 0.339215391552683, "flos": 26067951054720.0, "grad_norm": 2.4357287647671266, "language_loss": 0.87591994, "learning_rate": 3.0780777814335483e-06, "loss": 0.89777428, "num_input_tokens_seen": 121115640, "step": 5642, "time_per_iteration": 2.6347198486328125 }, { "auxiliary_loss_clip": 0.01117312, "auxiliary_loss_mlp": 0.01034931, "balance_loss_clip": 1.04759669, "balance_loss_mlp": 1.02112639, "epoch": 0.33927551480535095, "flos": 14574705505920.0, "grad_norm": 1.860184080586481, "language_loss": 0.83900917, "learning_rate": 3.077749724868924e-06, "loss": 0.86053157, "num_input_tokens_seen": 121132485, "step": 5643, "time_per_iteration": 2.678086042404175 }, { "auxiliary_loss_clip": 0.01107188, "auxiliary_loss_mlp": 0.01049417, "balance_loss_clip": 1.04616475, "balance_loss_mlp": 1.03295422, "epoch": 0.3393356380580189, "flos": 23805578885760.0, "grad_norm": 4.293096130940915, "language_loss": 0.76897138, "learning_rate": 3.077421627435922e-06, "loss": 0.79053748, "num_input_tokens_seen": 121152935, "step": 5644, "time_per_iteration": 2.6681976318359375 }, { "auxiliary_loss_clip": 0.01123, "auxiliary_loss_mlp": 0.01046638, "balance_loss_clip": 1.05055666, "balance_loss_mlp": 1.02978194, "epoch": 0.3393957613106869, "flos": 17347871030400.0, "grad_norm": 8.889141309374795, "language_loss": 0.62855232, "learning_rate": 3.0770934891469832e-06, "loss": 0.65024871, "num_input_tokens_seen": 121169835, "step": 5645, "time_per_iteration": 2.5976576805114746 }, { "auxiliary_loss_clip": 0.01123901, "auxiliary_loss_mlp": 0.01042398, "balance_loss_clip": 1.04963613, "balance_loss_mlp": 1.0272944, "epoch": 0.3394558845633549, "flos": 28433960939520.0, "grad_norm": 1.8158202042065192, "language_loss": 0.76223624, "learning_rate": 3.076765310014552e-06, "loss": 0.78389925, "num_input_tokens_seen": 121190290, "step": 5646, "time_per_iteration": 2.674058437347412 }, { "auxiliary_loss_clip": 0.01128511, "auxiliary_loss_mlp": 0.01049927, "balance_loss_clip": 1.05314088, "balance_loss_mlp": 1.03245091, "epoch": 0.33951600781602287, "flos": 22086929865600.0, "grad_norm": 2.6597837481337256, "language_loss": 0.78888249, "learning_rate": 3.0764370900510727e-06, "loss": 0.81066692, "num_input_tokens_seen": 121209060, "step": 5647, "time_per_iteration": 2.636462688446045 }, { "auxiliary_loss_clip": 0.01113432, "auxiliary_loss_mlp": 0.0077397, "balance_loss_clip": 1.05254745, "balance_loss_mlp": 1.00053275, "epoch": 0.33957613106869083, "flos": 23878262056320.0, "grad_norm": 2.0563114900155037, "language_loss": 0.77694631, "learning_rate": 3.0761088292689904e-06, "loss": 0.7958203, "num_input_tokens_seen": 121227480, "step": 5648, "time_per_iteration": 2.704535484313965 }, { "auxiliary_loss_clip": 0.00999132, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.03748918, "balance_loss_mlp": 1.02168012, "epoch": 0.3396362543213588, "flos": 71242642414080.0, "grad_norm": 0.7822172669689142, "language_loss": 0.56281364, "learning_rate": 3.075780527680754e-06, "loss": 0.58304083, "num_input_tokens_seen": 121291305, "step": 5649, "time_per_iteration": 3.6428561210632324 }, { "auxiliary_loss_clip": 0.01109513, "auxiliary_loss_mlp": 0.00776659, "balance_loss_clip": 1.04886901, "balance_loss_mlp": 1.00053644, "epoch": 0.33969637757402676, "flos": 25921615046400.0, "grad_norm": 1.4990429944851429, "language_loss": 0.85522908, "learning_rate": 3.0754521852988117e-06, "loss": 0.87409085, "num_input_tokens_seen": 121312740, "step": 5650, "time_per_iteration": 4.6250996589660645 }, { "auxiliary_loss_clip": 0.01125063, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.04845572, "balance_loss_mlp": 1.01392674, "epoch": 0.33975650082669473, "flos": 35261728663680.0, "grad_norm": 1.7009103293103713, "language_loss": 0.70462626, "learning_rate": 3.0751238021356152e-06, "loss": 0.7261681, "num_input_tokens_seen": 121334220, "step": 5651, "time_per_iteration": 3.0873425006866455 }, { "auxiliary_loss_clip": 0.01088353, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.04718101, "balance_loss_mlp": 1.02539587, "epoch": 0.3398166240793627, "flos": 16647001879680.0, "grad_norm": 2.657059560006321, "language_loss": 0.80932343, "learning_rate": 3.074795378203616e-06, "loss": 0.83062148, "num_input_tokens_seen": 121351870, "step": 5652, "time_per_iteration": 2.957105875015259 }, { "auxiliary_loss_clip": 0.01143187, "auxiliary_loss_mlp": 0.0104477, "balance_loss_clip": 1.05543184, "balance_loss_mlp": 1.0275445, "epoch": 0.33987674733203066, "flos": 24062196625920.0, "grad_norm": 2.181969038816262, "language_loss": 0.76847494, "learning_rate": 3.0744669135152685e-06, "loss": 0.79035449, "num_input_tokens_seen": 121373400, "step": 5653, "time_per_iteration": 4.277743816375732 }, { "auxiliary_loss_clip": 0.01117346, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.04708898, "balance_loss_mlp": 1.02475142, "epoch": 0.3399368705846986, "flos": 13250678279040.0, "grad_norm": 2.9108557214850217, "language_loss": 0.85412633, "learning_rate": 3.0741384080830278e-06, "loss": 0.8757109, "num_input_tokens_seen": 121385225, "step": 5654, "time_per_iteration": 4.243285179138184 }, { "auxiliary_loss_clip": 0.01118111, "auxiliary_loss_mlp": 0.01041226, "balance_loss_clip": 1.04521537, "balance_loss_mlp": 1.02490664, "epoch": 0.3399969938373666, "flos": 27012832272000.0, "grad_norm": 5.5024852924346765, "language_loss": 0.64919531, "learning_rate": 3.073809861919351e-06, "loss": 0.67078876, "num_input_tokens_seen": 121404735, "step": 5655, "time_per_iteration": 2.793121576309204 }, { "auxiliary_loss_clip": 0.01129599, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.05404055, "balance_loss_mlp": 1.02828872, "epoch": 0.34005711709003456, "flos": 28550096588160.0, "grad_norm": 1.7231624830718477, "language_loss": 0.7624622, "learning_rate": 3.073481275036697e-06, "loss": 0.78419423, "num_input_tokens_seen": 121426780, "step": 5656, "time_per_iteration": 2.739227056503296 }, { "auxiliary_loss_clip": 0.01102847, "auxiliary_loss_mlp": 0.01040319, "balance_loss_clip": 1.0458467, "balance_loss_mlp": 1.02364159, "epoch": 0.3401172403427025, "flos": 21617003208960.0, "grad_norm": 8.964185236965056, "language_loss": 0.82842731, "learning_rate": 3.073152647447525e-06, "loss": 0.849859, "num_input_tokens_seen": 121447245, "step": 5657, "time_per_iteration": 5.179774761199951 }, { "auxiliary_loss_clip": 0.01113742, "auxiliary_loss_mlp": 0.01048481, "balance_loss_clip": 1.05169284, "balance_loss_mlp": 1.03313899, "epoch": 0.3401773635953705, "flos": 25885776251520.0, "grad_norm": 1.8385093437954252, "language_loss": 0.85050905, "learning_rate": 3.0728239791642976e-06, "loss": 0.87213123, "num_input_tokens_seen": 121468165, "step": 5658, "time_per_iteration": 2.776137351989746 }, { "auxiliary_loss_clip": 0.01053106, "auxiliary_loss_mlp": 0.01016184, "balance_loss_clip": 1.03449082, "balance_loss_mlp": 1.01424086, "epoch": 0.3402374868480385, "flos": 65507995336320.0, "grad_norm": 0.825209949556337, "language_loss": 0.59988189, "learning_rate": 3.072495270199477e-06, "loss": 0.62057471, "num_input_tokens_seen": 121523795, "step": 5659, "time_per_iteration": 3.272684335708618 }, { "auxiliary_loss_clip": 0.01137862, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.05531621, "balance_loss_mlp": 1.02102888, "epoch": 0.34029761010070647, "flos": 24060580513920.0, "grad_norm": 2.521681543348545, "language_loss": 0.67763948, "learning_rate": 3.0721665205655284e-06, "loss": 0.69937897, "num_input_tokens_seen": 121542950, "step": 5660, "time_per_iteration": 2.699267864227295 }, { "auxiliary_loss_clip": 0.01142235, "auxiliary_loss_mlp": 0.010443, "balance_loss_clip": 1.05695057, "balance_loss_mlp": 1.02787328, "epoch": 0.34035773335337444, "flos": 27599720590080.0, "grad_norm": 1.9299535220965447, "language_loss": 0.67668259, "learning_rate": 3.071837730274918e-06, "loss": 0.69854796, "num_input_tokens_seen": 121562765, "step": 5661, "time_per_iteration": 2.647101402282715 }, { "auxiliary_loss_clip": 0.01119112, "auxiliary_loss_mlp": 0.01041902, "balance_loss_clip": 1.05479288, "balance_loss_mlp": 1.02634561, "epoch": 0.3404178566060424, "flos": 20812783651200.0, "grad_norm": 2.0521689983251954, "language_loss": 0.78806192, "learning_rate": 3.071508899340113e-06, "loss": 0.80967206, "num_input_tokens_seen": 121581610, "step": 5662, "time_per_iteration": 2.847168207168579 }, { "auxiliary_loss_clip": 0.01103563, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.05163002, "balance_loss_mlp": 1.02498698, "epoch": 0.34047797985871037, "flos": 26833566470400.0, "grad_norm": 2.226848836482441, "language_loss": 0.73531127, "learning_rate": 3.0711800277735833e-06, "loss": 0.75676656, "num_input_tokens_seen": 121601885, "step": 5663, "time_per_iteration": 2.8581340312957764 }, { "auxiliary_loss_clip": 0.01090462, "auxiliary_loss_mlp": 0.01035271, "balance_loss_clip": 1.04631042, "balance_loss_mlp": 1.02079868, "epoch": 0.34053810311137833, "flos": 19682639061120.0, "grad_norm": 1.7108226041633658, "language_loss": 0.86297357, "learning_rate": 3.0708511155877997e-06, "loss": 0.88423085, "num_input_tokens_seen": 121621335, "step": 5664, "time_per_iteration": 2.778038501739502 }, { "auxiliary_loss_clip": 0.01139377, "auxiliary_loss_mlp": 0.0103938, "balance_loss_clip": 1.05399597, "balance_loss_mlp": 1.0245564, "epoch": 0.3405982263640463, "flos": 21725740656000.0, "grad_norm": 2.2398696420560675, "language_loss": 0.68712831, "learning_rate": 3.070522162795235e-06, "loss": 0.70891583, "num_input_tokens_seen": 121641310, "step": 5665, "time_per_iteration": 2.688643217086792 }, { "auxiliary_loss_clip": 0.01138662, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.05278993, "balance_loss_mlp": 1.0229218, "epoch": 0.34065834961671426, "flos": 18041629288320.0, "grad_norm": 2.716291820837314, "language_loss": 0.73084486, "learning_rate": 3.0701931694083626e-06, "loss": 0.7526291, "num_input_tokens_seen": 121659625, "step": 5666, "time_per_iteration": 2.7325544357299805 }, { "auxiliary_loss_clip": 0.01128915, "auxiliary_loss_mlp": 0.01039671, "balance_loss_clip": 1.05135012, "balance_loss_mlp": 1.0244832, "epoch": 0.3407184728693822, "flos": 21397337585280.0, "grad_norm": 2.363121461769924, "language_loss": 0.72947341, "learning_rate": 3.0698641354396576e-06, "loss": 0.75115931, "num_input_tokens_seen": 121679205, "step": 5667, "time_per_iteration": 2.7143874168395996 }, { "auxiliary_loss_clip": 0.01042137, "auxiliary_loss_mlp": 0.01008076, "balance_loss_clip": 1.02401757, "balance_loss_mlp": 1.00638342, "epoch": 0.3407785961220502, "flos": 68688101018880.0, "grad_norm": 0.8313790259289849, "language_loss": 0.63259363, "learning_rate": 3.069535060901597e-06, "loss": 0.65309572, "num_input_tokens_seen": 121751085, "step": 5668, "time_per_iteration": 3.3907217979431152 }, { "auxiliary_loss_clip": 0.01036989, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.03961444, "balance_loss_mlp": 1.02808475, "epoch": 0.34083871937471816, "flos": 14064379027200.0, "grad_norm": 2.2447075161594365, "language_loss": 0.71795446, "learning_rate": 3.0692059458066596e-06, "loss": 0.73877549, "num_input_tokens_seen": 121768565, "step": 5669, "time_per_iteration": 2.941349983215332 }, { "auxiliary_loss_clip": 0.0110323, "auxiliary_loss_mlp": 0.00773367, "balance_loss_clip": 1.04966998, "balance_loss_mlp": 1.00054646, "epoch": 0.3408988426273861, "flos": 17085435287040.0, "grad_norm": 1.973306725053756, "language_loss": 0.80678529, "learning_rate": 3.0688767901673265e-06, "loss": 0.82555127, "num_input_tokens_seen": 121784925, "step": 5670, "time_per_iteration": 2.8877930641174316 }, { "auxiliary_loss_clip": 0.01088488, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.04484558, "balance_loss_mlp": 1.02111244, "epoch": 0.3409589658800541, "flos": 24024562151040.0, "grad_norm": 1.926244069219147, "language_loss": 0.77521646, "learning_rate": 3.068547593996078e-06, "loss": 0.79647315, "num_input_tokens_seen": 121804425, "step": 5671, "time_per_iteration": 2.886425256729126 }, { "auxiliary_loss_clip": 0.01138739, "auxiliary_loss_mlp": 0.0077388, "balance_loss_clip": 1.05301285, "balance_loss_mlp": 1.00052333, "epoch": 0.34101908913272205, "flos": 21142012734720.0, "grad_norm": 3.7152219569219427, "language_loss": 0.74220848, "learning_rate": 3.0682183573053974e-06, "loss": 0.76133466, "num_input_tokens_seen": 121825145, "step": 5672, "time_per_iteration": 2.751692056655884 }, { "auxiliary_loss_clip": 0.01121109, "auxiliary_loss_mlp": 0.01047405, "balance_loss_clip": 1.04886246, "balance_loss_mlp": 1.03089476, "epoch": 0.3410792123853901, "flos": 15702012921600.0, "grad_norm": 1.8011032028958165, "language_loss": 0.73721337, "learning_rate": 3.06788908010777e-06, "loss": 0.7588985, "num_input_tokens_seen": 121842185, "step": 5673, "time_per_iteration": 2.6628050804138184 }, { "auxiliary_loss_clip": 0.01126244, "auxiliary_loss_mlp": 0.01038975, "balance_loss_clip": 1.05143654, "balance_loss_mlp": 1.02362132, "epoch": 0.34113933563805804, "flos": 23036012974080.0, "grad_norm": 1.7591090628800392, "language_loss": 0.79972708, "learning_rate": 3.067559762415682e-06, "loss": 0.8213793, "num_input_tokens_seen": 121862260, "step": 5674, "time_per_iteration": 2.6803476810455322 }, { "auxiliary_loss_clip": 0.01054856, "auxiliary_loss_mlp": 0.01001466, "balance_loss_clip": 1.0258925, "balance_loss_mlp": 0.9994635, "epoch": 0.341199458890726, "flos": 69614235336960.0, "grad_norm": 0.7875282266281167, "language_loss": 0.56080592, "learning_rate": 3.0672304042416198e-06, "loss": 0.5813691, "num_input_tokens_seen": 121923560, "step": 5675, "time_per_iteration": 3.3068313598632812 }, { "auxiliary_loss_clip": 0.01115956, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.052145, "balance_loss_mlp": 1.0006851, "epoch": 0.34125958214339397, "flos": 22346348866560.0, "grad_norm": 1.6444328441844458, "language_loss": 0.78795338, "learning_rate": 3.0669010055980734e-06, "loss": 0.80684733, "num_input_tokens_seen": 121943515, "step": 5676, "time_per_iteration": 2.7983739376068115 }, { "auxiliary_loss_clip": 0.01120251, "auxiliary_loss_mlp": 0.01036846, "balance_loss_clip": 1.04593658, "balance_loss_mlp": 1.02024043, "epoch": 0.34131970539606193, "flos": 21871933009920.0, "grad_norm": 1.8897537275348075, "language_loss": 0.85468972, "learning_rate": 3.0665715664975357e-06, "loss": 0.8762607, "num_input_tokens_seen": 121962540, "step": 5677, "time_per_iteration": 2.698751449584961 }, { "auxiliary_loss_clip": 0.01109896, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.04772925, "balance_loss_mlp": 1.02586842, "epoch": 0.3413798286487299, "flos": 24935723475840.0, "grad_norm": 1.7514589696636707, "language_loss": 0.79352021, "learning_rate": 3.0662420869524966e-06, "loss": 0.81504107, "num_input_tokens_seen": 121979830, "step": 5678, "time_per_iteration": 2.731834650039673 }, { "auxiliary_loss_clip": 0.01123477, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.04799783, "balance_loss_mlp": 1.01833677, "epoch": 0.34143995190139786, "flos": 25374372364800.0, "grad_norm": 1.8765190883227818, "language_loss": 0.74821675, "learning_rate": 3.0659125669754506e-06, "loss": 0.76978606, "num_input_tokens_seen": 121999055, "step": 5679, "time_per_iteration": 2.7362489700317383 }, { "auxiliary_loss_clip": 0.01044772, "auxiliary_loss_mlp": 0.01004164, "balance_loss_clip": 1.02617037, "balance_loss_mlp": 1.00210214, "epoch": 0.34150007515406583, "flos": 67782578129280.0, "grad_norm": 0.716476818724812, "language_loss": 0.59445524, "learning_rate": 3.0655830065788923e-06, "loss": 0.61494464, "num_input_tokens_seen": 122067015, "step": 5680, "time_per_iteration": 3.241750955581665 }, { "auxiliary_loss_clip": 0.01108333, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.04563892, "balance_loss_mlp": 1.01804543, "epoch": 0.3415601984067338, "flos": 20302421258880.0, "grad_norm": 1.760771174406363, "language_loss": 0.72054088, "learning_rate": 3.0652534057753206e-06, "loss": 0.74195278, "num_input_tokens_seen": 122085295, "step": 5681, "time_per_iteration": 2.7306556701660156 }, { "auxiliary_loss_clip": 0.01109003, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.0462265, "balance_loss_mlp": 1.02786994, "epoch": 0.34162032165940176, "flos": 26031178506240.0, "grad_norm": 2.2327180896030443, "language_loss": 0.71463466, "learning_rate": 3.064923764577233e-06, "loss": 0.73615474, "num_input_tokens_seen": 122104020, "step": 5682, "time_per_iteration": 2.825296640396118 }, { "auxiliary_loss_clip": 0.01132395, "auxiliary_loss_mlp": 0.0104079, "balance_loss_clip": 1.04721618, "balance_loss_mlp": 1.02507806, "epoch": 0.3416804449120697, "flos": 28803338449920.0, "grad_norm": 1.5426603390069147, "language_loss": 0.84101224, "learning_rate": 3.0645940829971295e-06, "loss": 0.86274409, "num_input_tokens_seen": 122125080, "step": 5683, "time_per_iteration": 2.6654412746429443 }, { "auxiliary_loss_clip": 0.01112942, "auxiliary_loss_mlp": 0.01047099, "balance_loss_clip": 1.04768562, "balance_loss_mlp": 1.03113699, "epoch": 0.3417405681647377, "flos": 22601601889920.0, "grad_norm": 4.046428716645244, "language_loss": 0.70964772, "learning_rate": 3.0642643610475116e-06, "loss": 0.73124808, "num_input_tokens_seen": 122146350, "step": 5684, "time_per_iteration": 2.724592924118042 }, { "auxiliary_loss_clip": 0.01132202, "auxiliary_loss_mlp": 0.01038054, "balance_loss_clip": 1.04905093, "balance_loss_mlp": 1.02367699, "epoch": 0.34180069141740566, "flos": 24716237420160.0, "grad_norm": 1.9204482618269598, "language_loss": 0.74832582, "learning_rate": 3.0639345987408823e-06, "loss": 0.77002841, "num_input_tokens_seen": 122168085, "step": 5685, "time_per_iteration": 2.7046890258789062 }, { "auxiliary_loss_clip": 0.01114777, "auxiliary_loss_mlp": 0.0104831, "balance_loss_clip": 1.04522872, "balance_loss_mlp": 1.03261042, "epoch": 0.3418608146700737, "flos": 30518755246080.0, "grad_norm": 1.9200820074556442, "language_loss": 0.70611888, "learning_rate": 3.0636047960897468e-06, "loss": 0.72774971, "num_input_tokens_seen": 122191040, "step": 5686, "time_per_iteration": 2.7390410900115967 }, { "auxiliary_loss_clip": 0.01123208, "auxiliary_loss_mlp": 0.01044107, "balance_loss_clip": 1.04809284, "balance_loss_mlp": 1.02819252, "epoch": 0.34192093792274164, "flos": 15122343237120.0, "grad_norm": 2.0197354521106563, "language_loss": 0.77240539, "learning_rate": 3.06327495310661e-06, "loss": 0.79407853, "num_input_tokens_seen": 122209225, "step": 5687, "time_per_iteration": 2.6381263732910156 }, { "auxiliary_loss_clip": 0.01106353, "auxiliary_loss_mlp": 0.01040255, "balance_loss_clip": 1.04849195, "balance_loss_mlp": 1.02412593, "epoch": 0.3419810611754096, "flos": 13187799521280.0, "grad_norm": 3.7332163528162385, "language_loss": 0.8676976, "learning_rate": 3.062945069803981e-06, "loss": 0.88916373, "num_input_tokens_seen": 122226160, "step": 5688, "time_per_iteration": 2.647320508956909 }, { "auxiliary_loss_clip": 0.01119843, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.04928863, "balance_loss_mlp": 1.0255394, "epoch": 0.34204118442807757, "flos": 19536267139200.0, "grad_norm": 1.870477619822585, "language_loss": 0.79564822, "learning_rate": 3.0626151461943684e-06, "loss": 0.81726807, "num_input_tokens_seen": 122243115, "step": 5689, "time_per_iteration": 4.1660990715026855 }, { "auxiliary_loss_clip": 0.0112576, "auxiliary_loss_mlp": 0.01042306, "balance_loss_clip": 1.04875994, "balance_loss_mlp": 1.02580786, "epoch": 0.34210130768074554, "flos": 15194846839680.0, "grad_norm": 1.7530560995380315, "language_loss": 0.73215616, "learning_rate": 3.0622851822902834e-06, "loss": 0.75383675, "num_input_tokens_seen": 122261105, "step": 5690, "time_per_iteration": 2.699846029281616 }, { "auxiliary_loss_clip": 0.01115188, "auxiliary_loss_mlp": 0.01047594, "balance_loss_clip": 1.04381919, "balance_loss_mlp": 1.03121471, "epoch": 0.3421614309334135, "flos": 24936226266240.0, "grad_norm": 2.1339055209058184, "language_loss": 0.76036334, "learning_rate": 3.061955178104237e-06, "loss": 0.78199112, "num_input_tokens_seen": 122279995, "step": 5691, "time_per_iteration": 2.707598924636841 }, { "auxiliary_loss_clip": 0.01119412, "auxiliary_loss_mlp": 0.01042889, "balance_loss_clip": 1.04769242, "balance_loss_mlp": 1.02878046, "epoch": 0.34222155418608147, "flos": 21908633731200.0, "grad_norm": 1.9419180569645556, "language_loss": 0.68321705, "learning_rate": 3.0616251336487447e-06, "loss": 0.70484006, "num_input_tokens_seen": 122299070, "step": 5692, "time_per_iteration": 2.6876816749572754 }, { "auxiliary_loss_clip": 0.01123804, "auxiliary_loss_mlp": 0.01042902, "balance_loss_clip": 1.0481621, "balance_loss_mlp": 1.02660608, "epoch": 0.34228167743874943, "flos": 18114061063680.0, "grad_norm": 2.8342834288415504, "language_loss": 0.72458065, "learning_rate": 3.06129504893632e-06, "loss": 0.74624765, "num_input_tokens_seen": 122316800, "step": 5693, "time_per_iteration": 5.672837018966675 }, { "auxiliary_loss_clip": 0.01090312, "auxiliary_loss_mlp": 0.01043466, "balance_loss_clip": 1.0433774, "balance_loss_mlp": 1.02832651, "epoch": 0.3423418006914174, "flos": 21288600138240.0, "grad_norm": 1.9009541760697364, "language_loss": 0.75556326, "learning_rate": 3.0609649239794813e-06, "loss": 0.77690107, "num_input_tokens_seen": 122335275, "step": 5694, "time_per_iteration": 2.713236093521118 }, { "auxiliary_loss_clip": 0.01093804, "auxiliary_loss_mlp": 0.01036832, "balance_loss_clip": 1.04769742, "balance_loss_mlp": 1.02205038, "epoch": 0.34240192394408536, "flos": 19823480288640.0, "grad_norm": 2.1810058063417608, "language_loss": 0.79590774, "learning_rate": 3.060634758790747e-06, "loss": 0.81721413, "num_input_tokens_seen": 122353215, "step": 5695, "time_per_iteration": 2.7206506729125977 }, { "auxiliary_loss_clip": 0.01077977, "auxiliary_loss_mlp": 0.01043311, "balance_loss_clip": 1.04183137, "balance_loss_mlp": 1.02764642, "epoch": 0.3424620471967533, "flos": 24535535074560.0, "grad_norm": 1.8643380844369803, "language_loss": 0.73428202, "learning_rate": 3.060304553382635e-06, "loss": 0.75549489, "num_input_tokens_seen": 122372495, "step": 5696, "time_per_iteration": 4.777001857757568 }, { "auxiliary_loss_clip": 0.01088152, "auxiliary_loss_mlp": 0.01052674, "balance_loss_clip": 1.0424118, "balance_loss_mlp": 1.03569841, "epoch": 0.3425221704494213, "flos": 25848895962240.0, "grad_norm": 5.815439398629578, "language_loss": 0.71460104, "learning_rate": 3.0599743077676685e-06, "loss": 0.73600936, "num_input_tokens_seen": 122394600, "step": 5697, "time_per_iteration": 2.7620668411254883 }, { "auxiliary_loss_clip": 0.01108783, "auxiliary_loss_mlp": 0.01032533, "balance_loss_clip": 1.04925871, "balance_loss_mlp": 1.01740503, "epoch": 0.34258229370208926, "flos": 21540513196800.0, "grad_norm": 2.6993537181180316, "language_loss": 0.82170486, "learning_rate": 3.05964402195837e-06, "loss": 0.84311801, "num_input_tokens_seen": 122414700, "step": 5698, "time_per_iteration": 2.6930580139160156 }, { "auxiliary_loss_clip": 0.01077965, "auxiliary_loss_mlp": 0.01049711, "balance_loss_clip": 1.0451839, "balance_loss_mlp": 1.03073311, "epoch": 0.3426424169547573, "flos": 23652778429440.0, "grad_norm": 2.492082875954734, "language_loss": 0.68941295, "learning_rate": 3.0593136959672645e-06, "loss": 0.71068972, "num_input_tokens_seen": 122432760, "step": 5699, "time_per_iteration": 2.8604705333709717 }, { "auxiliary_loss_clip": 0.01113381, "auxiliary_loss_mlp": 0.01042187, "balance_loss_clip": 1.05009818, "balance_loss_mlp": 1.02698755, "epoch": 0.34270254020742524, "flos": 24644883052800.0, "grad_norm": 2.4799642493365046, "language_loss": 0.72708368, "learning_rate": 3.058983329806877e-06, "loss": 0.74863935, "num_input_tokens_seen": 122449105, "step": 5700, "time_per_iteration": 2.721219301223755 }, { "auxiliary_loss_clip": 0.01107869, "auxiliary_loss_mlp": 0.01033632, "balance_loss_clip": 1.05173492, "balance_loss_mlp": 1.01942825, "epoch": 0.3427626634600932, "flos": 20996754134400.0, "grad_norm": 1.8907099352771195, "language_loss": 0.81771016, "learning_rate": 3.0586529234897354e-06, "loss": 0.83912516, "num_input_tokens_seen": 122468700, "step": 5701, "time_per_iteration": 2.668776273727417 }, { "auxiliary_loss_clip": 0.01122749, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.05318427, "balance_loss_mlp": 1.02137566, "epoch": 0.3428227867127612, "flos": 21433786911360.0, "grad_norm": 1.8540703451937275, "language_loss": 0.71611702, "learning_rate": 3.0583224770283694e-06, "loss": 0.73770893, "num_input_tokens_seen": 122488160, "step": 5702, "time_per_iteration": 2.7413434982299805 }, { "auxiliary_loss_clip": 0.01034072, "auxiliary_loss_mlp": 0.0102117, "balance_loss_clip": 1.02648544, "balance_loss_mlp": 1.01936996, "epoch": 0.34288290996542914, "flos": 55731782695680.0, "grad_norm": 0.8291151185510042, "language_loss": 0.57455015, "learning_rate": 3.057991990435309e-06, "loss": 0.59510255, "num_input_tokens_seen": 122542890, "step": 5703, "time_per_iteration": 3.123619318008423 }, { "auxiliary_loss_clip": 0.01125899, "auxiliary_loss_mlp": 0.01044546, "balance_loss_clip": 1.05167961, "balance_loss_mlp": 1.02754664, "epoch": 0.3429430332180971, "flos": 20156803522560.0, "grad_norm": 2.054859273280662, "language_loss": 0.75049305, "learning_rate": 3.057661463723086e-06, "loss": 0.77219748, "num_input_tokens_seen": 122561770, "step": 5704, "time_per_iteration": 2.786344051361084 }, { "auxiliary_loss_clip": 0.01103715, "auxiliary_loss_mlp": 0.01039493, "balance_loss_clip": 1.05234969, "balance_loss_mlp": 1.02506232, "epoch": 0.34300315647076507, "flos": 17965857548160.0, "grad_norm": 1.921400910299184, "language_loss": 0.72367042, "learning_rate": 3.0573308969042346e-06, "loss": 0.74510252, "num_input_tokens_seen": 122580580, "step": 5705, "time_per_iteration": 2.7464826107025146 }, { "auxiliary_loss_clip": 0.01099266, "auxiliary_loss_mlp": 0.01035276, "balance_loss_clip": 1.05201912, "balance_loss_mlp": 1.01980281, "epoch": 0.34306327972343303, "flos": 22086822124800.0, "grad_norm": 2.585473080189318, "language_loss": 0.80016834, "learning_rate": 3.057000289991289e-06, "loss": 0.82151377, "num_input_tokens_seen": 122599810, "step": 5706, "time_per_iteration": 2.83493971824646 }, { "auxiliary_loss_clip": 0.01126183, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.05822873, "balance_loss_mlp": 1.02111542, "epoch": 0.343123402976101, "flos": 18442679616000.0, "grad_norm": 2.833985332828215, "language_loss": 0.83001584, "learning_rate": 3.056669642996787e-06, "loss": 0.85165167, "num_input_tokens_seen": 122616035, "step": 5707, "time_per_iteration": 2.6888725757598877 }, { "auxiliary_loss_clip": 0.01130807, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.05664158, "balance_loss_mlp": 1.02264881, "epoch": 0.34318352622876896, "flos": 17163685065600.0, "grad_norm": 1.6733576562987098, "language_loss": 0.75313264, "learning_rate": 3.056338955933266e-06, "loss": 0.7748242, "num_input_tokens_seen": 122633785, "step": 5708, "time_per_iteration": 2.655061960220337 }, { "auxiliary_loss_clip": 0.01105586, "auxiliary_loss_mlp": 0.01039807, "balance_loss_clip": 1.05063939, "balance_loss_mlp": 1.02357078, "epoch": 0.34324364948143693, "flos": 26688164215680.0, "grad_norm": 1.6008558791331946, "language_loss": 0.81187862, "learning_rate": 3.0560082288132662e-06, "loss": 0.83333254, "num_input_tokens_seen": 122652100, "step": 5709, "time_per_iteration": 2.7354934215545654 }, { "auxiliary_loss_clip": 0.01119071, "auxiliary_loss_mlp": 0.01043385, "balance_loss_clip": 1.0550828, "balance_loss_mlp": 1.02581382, "epoch": 0.3433037727341049, "flos": 21251576194560.0, "grad_norm": 2.1605529243452297, "language_loss": 0.79441178, "learning_rate": 3.055677461649329e-06, "loss": 0.81603634, "num_input_tokens_seen": 122669720, "step": 5710, "time_per_iteration": 2.757321834564209 }, { "auxiliary_loss_clip": 0.01130524, "auxiliary_loss_mlp": 0.01039861, "balance_loss_clip": 1.05363941, "balance_loss_mlp": 1.02329111, "epoch": 0.34336389598677286, "flos": 20629423699200.0, "grad_norm": 1.8403881586839854, "language_loss": 0.70303786, "learning_rate": 3.055346654453996e-06, "loss": 0.7247417, "num_input_tokens_seen": 122688715, "step": 5711, "time_per_iteration": 2.6535775661468506 }, { "auxiliary_loss_clip": 0.01106817, "auxiliary_loss_mlp": 0.00774858, "balance_loss_clip": 1.05299044, "balance_loss_mlp": 1.00072622, "epoch": 0.3434240192394409, "flos": 14538579402240.0, "grad_norm": 1.8401630077009354, "language_loss": 0.67124939, "learning_rate": 3.055015807239812e-06, "loss": 0.69006616, "num_input_tokens_seen": 122706970, "step": 5712, "time_per_iteration": 2.7115519046783447 }, { "auxiliary_loss_clip": 0.01051163, "auxiliary_loss_mlp": 0.01005713, "balance_loss_clip": 1.0511148, "balance_loss_mlp": 1.00409162, "epoch": 0.34348414249210885, "flos": 58051538841600.0, "grad_norm": 0.846630151399307, "language_loss": 0.58072996, "learning_rate": 3.0546849200193226e-06, "loss": 0.60129869, "num_input_tokens_seen": 122758095, "step": 5713, "time_per_iteration": 3.3988189697265625 }, { "auxiliary_loss_clip": 0.01142007, "auxiliary_loss_mlp": 0.01043862, "balance_loss_clip": 1.05782688, "balance_loss_mlp": 1.02813852, "epoch": 0.3435442657447768, "flos": 20704441253760.0, "grad_norm": 1.6506449407169241, "language_loss": 0.8079257, "learning_rate": 3.054353992805076e-06, "loss": 0.82978439, "num_input_tokens_seen": 122777815, "step": 5714, "time_per_iteration": 2.682537078857422 }, { "auxiliary_loss_clip": 0.01142274, "auxiliary_loss_mlp": 0.01042249, "balance_loss_clip": 1.0581255, "balance_loss_mlp": 1.02628696, "epoch": 0.3436043889974448, "flos": 22930256355840.0, "grad_norm": 2.1462767477025055, "language_loss": 0.72059911, "learning_rate": 3.05402302560962e-06, "loss": 0.74244434, "num_input_tokens_seen": 122797555, "step": 5715, "time_per_iteration": 2.6535134315490723 }, { "auxiliary_loss_clip": 0.01070037, "auxiliary_loss_mlp": 0.01002865, "balance_loss_clip": 1.0577507, "balance_loss_mlp": 1.00051689, "epoch": 0.34366451225011274, "flos": 58403285752320.0, "grad_norm": 0.9103705044251069, "language_loss": 0.65885556, "learning_rate": 3.053692018445505e-06, "loss": 0.67958462, "num_input_tokens_seen": 122863955, "step": 5716, "time_per_iteration": 3.205113172531128 }, { "auxiliary_loss_clip": 0.01124236, "auxiliary_loss_mlp": 0.0104266, "balance_loss_clip": 1.05416417, "balance_loss_mlp": 1.02718663, "epoch": 0.3437246355027807, "flos": 15596292216960.0, "grad_norm": 2.101112668121384, "language_loss": 0.74272031, "learning_rate": 3.0533609713252838e-06, "loss": 0.76438928, "num_input_tokens_seen": 122883000, "step": 5717, "time_per_iteration": 2.60300350189209 }, { "auxiliary_loss_clip": 0.01084832, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.05195725, "balance_loss_mlp": 1.02437937, "epoch": 0.34378475875544867, "flos": 27672260106240.0, "grad_norm": 1.8405555467441777, "language_loss": 0.75446129, "learning_rate": 3.0530298842615077e-06, "loss": 0.7757023, "num_input_tokens_seen": 122903265, "step": 5718, "time_per_iteration": 2.787687301635742 }, { "auxiliary_loss_clip": 0.01097103, "auxiliary_loss_mlp": 0.01043125, "balance_loss_clip": 1.04837775, "balance_loss_mlp": 1.02739501, "epoch": 0.34384488200811664, "flos": 31431496769280.0, "grad_norm": 1.9369525419747404, "language_loss": 0.63647246, "learning_rate": 3.052698757266734e-06, "loss": 0.65787476, "num_input_tokens_seen": 122923860, "step": 5719, "time_per_iteration": 2.8138949871063232 }, { "auxiliary_loss_clip": 0.01098152, "auxiliary_loss_mlp": 0.01040429, "balance_loss_clip": 1.05234158, "balance_loss_mlp": 1.02310777, "epoch": 0.3439050052607846, "flos": 24899920594560.0, "grad_norm": 1.8182809721987367, "language_loss": 0.73785692, "learning_rate": 3.0523675903535183e-06, "loss": 0.75924277, "num_input_tokens_seen": 122945305, "step": 5720, "time_per_iteration": 2.761371612548828 }, { "auxiliary_loss_clip": 0.01127909, "auxiliary_loss_mlp": 0.01052147, "balance_loss_clip": 1.056463, "balance_loss_mlp": 1.03434944, "epoch": 0.34396512851345257, "flos": 18150079426560.0, "grad_norm": 2.2267988645125896, "language_loss": 0.74087942, "learning_rate": 3.0520363835344173e-06, "loss": 0.76267999, "num_input_tokens_seen": 122962535, "step": 5721, "time_per_iteration": 2.6139280796051025 }, { "auxiliary_loss_clip": 0.0111919, "auxiliary_loss_mlp": 0.0077563, "balance_loss_clip": 1.05647993, "balance_loss_mlp": 1.00063252, "epoch": 0.34402525176612053, "flos": 16034438315520.0, "grad_norm": 2.313932715754647, "language_loss": 0.80464351, "learning_rate": 3.051705136821992e-06, "loss": 0.82359171, "num_input_tokens_seen": 122979750, "step": 5722, "time_per_iteration": 2.6886982917785645 }, { "auxiliary_loss_clip": 0.01092207, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.05326557, "balance_loss_mlp": 1.02348995, "epoch": 0.3440853750187885, "flos": 21178641628800.0, "grad_norm": 2.5095280683984984, "language_loss": 0.81647789, "learning_rate": 3.051373850228801e-06, "loss": 0.83778864, "num_input_tokens_seen": 122998955, "step": 5723, "time_per_iteration": 2.7464921474456787 }, { "auxiliary_loss_clip": 0.01099736, "auxiliary_loss_mlp": 0.0105726, "balance_loss_clip": 1.0488528, "balance_loss_mlp": 1.04023743, "epoch": 0.34414549827145646, "flos": 12677868092160.0, "grad_norm": 1.9897062128640133, "language_loss": 0.81431544, "learning_rate": 3.0510425237674096e-06, "loss": 0.83588541, "num_input_tokens_seen": 123016165, "step": 5724, "time_per_iteration": 2.7447471618652344 }, { "auxiliary_loss_clip": 0.01112954, "auxiliary_loss_mlp": 0.01047765, "balance_loss_clip": 1.05231178, "balance_loss_mlp": 1.03056324, "epoch": 0.3442056215241244, "flos": 31284514316160.0, "grad_norm": 1.858960952495153, "language_loss": 0.68913317, "learning_rate": 3.05071115745038e-06, "loss": 0.71074033, "num_input_tokens_seen": 123036900, "step": 5725, "time_per_iteration": 2.798987627029419 }, { "auxiliary_loss_clip": 0.01132971, "auxiliary_loss_mlp": 0.0105182, "balance_loss_clip": 1.05775714, "balance_loss_mlp": 1.03379524, "epoch": 0.34426574477679245, "flos": 23367289132800.0, "grad_norm": 1.4701315954442116, "language_loss": 0.6946882, "learning_rate": 3.0503797512902773e-06, "loss": 0.71653616, "num_input_tokens_seen": 123057480, "step": 5726, "time_per_iteration": 2.663766622543335 }, { "auxiliary_loss_clip": 0.01111868, "auxiliary_loss_mlp": 0.01038496, "balance_loss_clip": 1.05667615, "balance_loss_mlp": 1.02374983, "epoch": 0.3443258680294604, "flos": 24535427333760.0, "grad_norm": 2.4860883718983873, "language_loss": 0.73317868, "learning_rate": 3.0500483052996703e-06, "loss": 0.7546823, "num_input_tokens_seen": 123076890, "step": 5727, "time_per_iteration": 2.8002336025238037 }, { "auxiliary_loss_clip": 0.01097058, "auxiliary_loss_mlp": 0.01052204, "balance_loss_clip": 1.05053401, "balance_loss_mlp": 1.03590822, "epoch": 0.3443859912821284, "flos": 20230133137920.0, "grad_norm": 2.2067060616784815, "language_loss": 0.88451493, "learning_rate": 3.0497168194911257e-06, "loss": 0.90600753, "num_input_tokens_seen": 123092530, "step": 5728, "time_per_iteration": 2.703842878341675 }, { "auxiliary_loss_clip": 0.01089582, "auxiliary_loss_mlp": 0.01048379, "balance_loss_clip": 1.04858351, "balance_loss_mlp": 1.03266144, "epoch": 0.34444611453479634, "flos": 24316515895680.0, "grad_norm": 2.2135571419735904, "language_loss": 0.70018214, "learning_rate": 3.0493852938772143e-06, "loss": 0.72156173, "num_input_tokens_seen": 123110560, "step": 5729, "time_per_iteration": 4.360877275466919 }, { "auxiliary_loss_clip": 0.01124088, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.0525502, "balance_loss_mlp": 1.02208424, "epoch": 0.3445062377874643, "flos": 16983413683200.0, "grad_norm": 1.9483871766944658, "language_loss": 0.7435137, "learning_rate": 3.0490537284705078e-06, "loss": 0.76513231, "num_input_tokens_seen": 123128655, "step": 5730, "time_per_iteration": 2.6021499633789062 }, { "auxiliary_loss_clip": 0.01099617, "auxiliary_loss_mlp": 0.0105823, "balance_loss_clip": 1.04880106, "balance_loss_mlp": 1.04053974, "epoch": 0.3445663610401323, "flos": 20302708567680.0, "grad_norm": 2.1142556114368314, "language_loss": 0.7952323, "learning_rate": 3.048722123283578e-06, "loss": 0.81681079, "num_input_tokens_seen": 123145130, "step": 5731, "time_per_iteration": 4.273399114608765 }, { "auxiliary_loss_clip": 0.01130567, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.05617356, "balance_loss_mlp": 1.02793896, "epoch": 0.34462648429280024, "flos": 15888102307200.0, "grad_norm": 2.0299111477971334, "language_loss": 0.78609502, "learning_rate": 3.0483904783290006e-06, "loss": 0.80783606, "num_input_tokens_seen": 123162265, "step": 5732, "time_per_iteration": 4.672218322753906 }, { "auxiliary_loss_clip": 0.01037769, "auxiliary_loss_mlp": 0.0101237, "balance_loss_clip": 1.03788018, "balance_loss_mlp": 1.0106411, "epoch": 0.3446866075454682, "flos": 59311035285120.0, "grad_norm": 0.7456337544046427, "language_loss": 0.53537595, "learning_rate": 3.0480587936193505e-06, "loss": 0.55587733, "num_input_tokens_seen": 123218620, "step": 5733, "time_per_iteration": 3.322802782058716 }, { "auxiliary_loss_clip": 0.01122514, "auxiliary_loss_mlp": 0.01042066, "balance_loss_clip": 1.05675018, "balance_loss_mlp": 1.02577019, "epoch": 0.34474673079813617, "flos": 22343799000960.0, "grad_norm": 1.936820728476944, "language_loss": 0.832178, "learning_rate": 3.047727069167207e-06, "loss": 0.85382378, "num_input_tokens_seen": 123237325, "step": 5734, "time_per_iteration": 2.7426953315734863 }, { "auxiliary_loss_clip": 0.01120142, "auxiliary_loss_mlp": 0.0103601, "balance_loss_clip": 1.05517805, "balance_loss_mlp": 1.01988125, "epoch": 0.34480685405080413, "flos": 27670141203840.0, "grad_norm": 2.7764640699074077, "language_loss": 0.92655241, "learning_rate": 3.0473953049851478e-06, "loss": 0.94811392, "num_input_tokens_seen": 123258650, "step": 5735, "time_per_iteration": 4.536838054656982 }, { "auxiliary_loss_clip": 0.0110302, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.05774188, "balance_loss_mlp": 1.02492189, "epoch": 0.3448669773034721, "flos": 22456020067200.0, "grad_norm": 1.7508294751665012, "language_loss": 0.76571405, "learning_rate": 3.0470635010857533e-06, "loss": 0.78715694, "num_input_tokens_seen": 123277155, "step": 5736, "time_per_iteration": 2.784958600997925 }, { "auxiliary_loss_clip": 0.01122912, "auxiliary_loss_mlp": 0.0104053, "balance_loss_clip": 1.05683184, "balance_loss_mlp": 1.02396011, "epoch": 0.34492710055614006, "flos": 24936190352640.0, "grad_norm": 1.7983696926456887, "language_loss": 0.78327668, "learning_rate": 3.0467316574816064e-06, "loss": 0.80491114, "num_input_tokens_seen": 123297640, "step": 5737, "time_per_iteration": 2.709786891937256 }, { "auxiliary_loss_clip": 0.01083721, "auxiliary_loss_mlp": 0.0104406, "balance_loss_clip": 1.04379368, "balance_loss_mlp": 1.02520096, "epoch": 0.34498722380880803, "flos": 20120821073280.0, "grad_norm": 2.0055780284948375, "language_loss": 0.71544027, "learning_rate": 3.0463997741852893e-06, "loss": 0.73671806, "num_input_tokens_seen": 123314370, "step": 5738, "time_per_iteration": 2.779651165008545 }, { "auxiliary_loss_clip": 0.0110112, "auxiliary_loss_mlp": 0.01042892, "balance_loss_clip": 1.04991913, "balance_loss_mlp": 1.02520132, "epoch": 0.34504734706147605, "flos": 28438126917120.0, "grad_norm": 2.7751951344870562, "language_loss": 0.82324719, "learning_rate": 3.046067851209389e-06, "loss": 0.84468728, "num_input_tokens_seen": 123336085, "step": 5739, "time_per_iteration": 2.7953522205352783 }, { "auxiliary_loss_clip": 0.01104482, "auxiliary_loss_mlp": 0.01037335, "balance_loss_clip": 1.05071819, "balance_loss_mlp": 1.02132511, "epoch": 0.345107470314144, "flos": 22674464628480.0, "grad_norm": 1.8186717226973075, "language_loss": 0.83071041, "learning_rate": 3.0457358885664898e-06, "loss": 0.85212862, "num_input_tokens_seen": 123354460, "step": 5740, "time_per_iteration": 2.7530486583709717 }, { "auxiliary_loss_clip": 0.01130478, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.05699897, "balance_loss_mlp": 1.01901984, "epoch": 0.345167593566812, "flos": 20630716588800.0, "grad_norm": 2.1971165557092656, "language_loss": 0.7704618, "learning_rate": 3.045403886269181e-06, "loss": 0.79212344, "num_input_tokens_seen": 123373420, "step": 5741, "time_per_iteration": 2.6488983631134033 }, { "auxiliary_loss_clip": 0.01116686, "auxiliary_loss_mlp": 0.01038328, "balance_loss_clip": 1.05202794, "balance_loss_mlp": 1.02271724, "epoch": 0.34522771681947995, "flos": 26214358890240.0, "grad_norm": 1.629760829576741, "language_loss": 0.76972193, "learning_rate": 3.045071844330053e-06, "loss": 0.7912721, "num_input_tokens_seen": 123394730, "step": 5742, "time_per_iteration": 2.7333807945251465 }, { "auxiliary_loss_clip": 0.01133631, "auxiliary_loss_mlp": 0.01040013, "balance_loss_clip": 1.05862427, "balance_loss_mlp": 1.02371693, "epoch": 0.3452878400721479, "flos": 19062354072960.0, "grad_norm": 2.2460068376984523, "language_loss": 0.76135588, "learning_rate": 3.0447397627616955e-06, "loss": 0.78309238, "num_input_tokens_seen": 123412895, "step": 5743, "time_per_iteration": 2.677682638168335 }, { "auxiliary_loss_clip": 0.01128893, "auxiliary_loss_mlp": 0.01037178, "balance_loss_clip": 1.05570602, "balance_loss_mlp": 1.02171636, "epoch": 0.3453479633248159, "flos": 27929739772800.0, "grad_norm": 2.0501405423310097, "language_loss": 0.70481914, "learning_rate": 3.0444076415767016e-06, "loss": 0.72647989, "num_input_tokens_seen": 123432320, "step": 5744, "time_per_iteration": 2.7430574893951416 }, { "auxiliary_loss_clip": 0.01140382, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.05727339, "balance_loss_mlp": 1.01959133, "epoch": 0.34540808657748384, "flos": 19606113135360.0, "grad_norm": 2.271690731291802, "language_loss": 0.79658759, "learning_rate": 3.044075480787665e-06, "loss": 0.81835419, "num_input_tokens_seen": 123450980, "step": 5745, "time_per_iteration": 2.6587865352630615 }, { "auxiliary_loss_clip": 0.01092128, "auxiliary_loss_mlp": 0.01041398, "balance_loss_clip": 1.0486573, "balance_loss_mlp": 1.02435148, "epoch": 0.3454682098301518, "flos": 20411661496320.0, "grad_norm": 1.8194779915280654, "language_loss": 0.89049339, "learning_rate": 3.043743280407182e-06, "loss": 0.91182864, "num_input_tokens_seen": 123469365, "step": 5746, "time_per_iteration": 2.7314908504486084 }, { "auxiliary_loss_clip": 0.01133638, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.05554819, "balance_loss_mlp": 1.02101421, "epoch": 0.34552833308281977, "flos": 21325121291520.0, "grad_norm": 2.5554958969654136, "language_loss": 0.64851058, "learning_rate": 3.043411040447849e-06, "loss": 0.67023152, "num_input_tokens_seen": 123489425, "step": 5747, "time_per_iteration": 2.6858277320861816 }, { "auxiliary_loss_clip": 0.01119459, "auxiliary_loss_mlp": 0.01035118, "balance_loss_clip": 1.05213308, "balance_loss_mlp": 1.01928735, "epoch": 0.34558845633548774, "flos": 36243633824640.0, "grad_norm": 1.5633023430662023, "language_loss": 0.72855747, "learning_rate": 3.043078760922264e-06, "loss": 0.75010324, "num_input_tokens_seen": 123509970, "step": 5748, "time_per_iteration": 2.805250406265259 }, { "auxiliary_loss_clip": 0.01084714, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.05246413, "balance_loss_mlp": 1.01832819, "epoch": 0.3456485795881557, "flos": 22450561200000.0, "grad_norm": 1.6861475272665256, "language_loss": 0.7584126, "learning_rate": 3.042746441843029e-06, "loss": 0.7795862, "num_input_tokens_seen": 123531055, "step": 5749, "time_per_iteration": 2.8886258602142334 }, { "auxiliary_loss_clip": 0.01061531, "auxiliary_loss_mlp": 0.01002064, "balance_loss_clip": 1.05058503, "balance_loss_mlp": 1.00045478, "epoch": 0.34570870284082367, "flos": 62004299005440.0, "grad_norm": 0.8852783380527953, "language_loss": 0.62715566, "learning_rate": 3.0424140832227437e-06, "loss": 0.64779162, "num_input_tokens_seen": 123584720, "step": 5750, "time_per_iteration": 3.1283066272735596 }, { "auxiliary_loss_clip": 0.01110881, "auxiliary_loss_mlp": 0.01037788, "balance_loss_clip": 1.05210388, "balance_loss_mlp": 1.02242184, "epoch": 0.34576882609349163, "flos": 22782196494720.0, "grad_norm": 2.239830827663745, "language_loss": 0.80332017, "learning_rate": 3.042081685074012e-06, "loss": 0.82480681, "num_input_tokens_seen": 123604465, "step": 5751, "time_per_iteration": 2.721344470977783 }, { "auxiliary_loss_clip": 0.01135561, "auxiliary_loss_mlp": 0.01045926, "balance_loss_clip": 1.0536952, "balance_loss_mlp": 1.03101254, "epoch": 0.34582894934615965, "flos": 12348818576640.0, "grad_norm": 2.3847713847020744, "language_loss": 0.84148252, "learning_rate": 3.041749247409439e-06, "loss": 0.86329746, "num_input_tokens_seen": 123622320, "step": 5752, "time_per_iteration": 2.578984260559082 }, { "auxiliary_loss_clip": 0.01047286, "auxiliary_loss_mlp": 0.00754976, "balance_loss_clip": 1.0380801, "balance_loss_mlp": 1.00148225, "epoch": 0.3458890725988276, "flos": 70167691071360.0, "grad_norm": 0.7284359747550926, "language_loss": 0.6310631, "learning_rate": 3.0414167702416296e-06, "loss": 0.64908576, "num_input_tokens_seen": 123678010, "step": 5753, "time_per_iteration": 3.0907819271087646 }, { "auxiliary_loss_clip": 0.01112695, "auxiliary_loss_mlp": 0.01035981, "balance_loss_clip": 1.05358505, "balance_loss_mlp": 1.01956582, "epoch": 0.3459491958514956, "flos": 17092582093440.0, "grad_norm": 1.9590865283999213, "language_loss": 0.71000856, "learning_rate": 3.0410842535831914e-06, "loss": 0.73149538, "num_input_tokens_seen": 123696830, "step": 5754, "time_per_iteration": 2.7031564712524414 }, { "auxiliary_loss_clip": 0.01127989, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.05300486, "balance_loss_mlp": 1.02251959, "epoch": 0.34600931910416355, "flos": 16650952375680.0, "grad_norm": 2.56305874029915, "language_loss": 0.73286581, "learning_rate": 3.0407516974467343e-06, "loss": 0.75452608, "num_input_tokens_seen": 123714360, "step": 5755, "time_per_iteration": 2.656804084777832 }, { "auxiliary_loss_clip": 0.01122508, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.0504849, "balance_loss_mlp": 1.01791406, "epoch": 0.3460694423568315, "flos": 38546190334080.0, "grad_norm": 1.7746130503339408, "language_loss": 0.7232182, "learning_rate": 3.040419101844869e-06, "loss": 0.74477637, "num_input_tokens_seen": 123739250, "step": 5756, "time_per_iteration": 2.8805603981018066 }, { "auxiliary_loss_clip": 0.01055943, "auxiliary_loss_mlp": 0.01012753, "balance_loss_clip": 1.03647125, "balance_loss_mlp": 1.01088166, "epoch": 0.3461295656094995, "flos": 72081479704320.0, "grad_norm": 0.7176054236110851, "language_loss": 0.62659568, "learning_rate": 3.040086466790207e-06, "loss": 0.64728266, "num_input_tokens_seen": 123802845, "step": 5757, "time_per_iteration": 3.21248197555542 }, { "auxiliary_loss_clip": 0.0103445, "auxiliary_loss_mlp": 0.00755471, "balance_loss_clip": 1.03495657, "balance_loss_mlp": 1.0016396, "epoch": 0.34618968886216744, "flos": 65460089571840.0, "grad_norm": 0.8171010225304897, "language_loss": 0.59206927, "learning_rate": 3.039753792295362e-06, "loss": 0.60996854, "num_input_tokens_seen": 123861805, "step": 5758, "time_per_iteration": 3.2514266967773438 }, { "auxiliary_loss_clip": 0.01122832, "auxiliary_loss_mlp": 0.01042223, "balance_loss_clip": 1.05849838, "balance_loss_mlp": 1.02783418, "epoch": 0.3462498121148354, "flos": 23472542960640.0, "grad_norm": 1.8827972101732287, "language_loss": 0.71806967, "learning_rate": 3.0394210783729487e-06, "loss": 0.73972023, "num_input_tokens_seen": 123881820, "step": 5759, "time_per_iteration": 2.943061351776123 }, { "auxiliary_loss_clip": 0.0108272, "auxiliary_loss_mlp": 0.01061154, "balance_loss_clip": 1.0455631, "balance_loss_mlp": 1.04352307, "epoch": 0.3463099353675034, "flos": 24170790418560.0, "grad_norm": 1.9206924983950955, "language_loss": 0.83097923, "learning_rate": 3.0390883250355836e-06, "loss": 0.85241801, "num_input_tokens_seen": 123903700, "step": 5760, "time_per_iteration": 2.8922929763793945 }, { "auxiliary_loss_clip": 0.01029416, "auxiliary_loss_mlp": 0.01010127, "balance_loss_clip": 1.02909803, "balance_loss_mlp": 1.00855386, "epoch": 0.34637005862017134, "flos": 63700609766400.0, "grad_norm": 0.8149802448400086, "language_loss": 0.56472003, "learning_rate": 3.0387555322958865e-06, "loss": 0.58511543, "num_input_tokens_seen": 123960075, "step": 5761, "time_per_iteration": 3.274470567703247 }, { "auxiliary_loss_clip": 0.01122229, "auxiliary_loss_mlp": 0.00773416, "balance_loss_clip": 1.04931128, "balance_loss_mlp": 1.00069964, "epoch": 0.3464301818728393, "flos": 13145532192000.0, "grad_norm": 2.486389460519204, "language_loss": 0.94996566, "learning_rate": 3.038422700166474e-06, "loss": 0.96892214, "num_input_tokens_seen": 123975805, "step": 5762, "time_per_iteration": 2.636906623840332 }, { "auxiliary_loss_clip": 0.01106692, "auxiliary_loss_mlp": 0.0104127, "balance_loss_clip": 1.04844642, "balance_loss_mlp": 1.02467608, "epoch": 0.34649030512550727, "flos": 29315173299840.0, "grad_norm": 1.8335548533403485, "language_loss": 0.69540495, "learning_rate": 3.0380898286599692e-06, "loss": 0.71688455, "num_input_tokens_seen": 123997530, "step": 5763, "time_per_iteration": 2.8476505279541016 }, { "auxiliary_loss_clip": 0.01125911, "auxiliary_loss_mlp": 0.01051478, "balance_loss_clip": 1.04963946, "balance_loss_mlp": 1.03319085, "epoch": 0.34655042837817523, "flos": 23730884553600.0, "grad_norm": 2.0043623648961195, "language_loss": 0.83985734, "learning_rate": 3.0377569177889945e-06, "loss": 0.86163127, "num_input_tokens_seen": 124016375, "step": 5764, "time_per_iteration": 2.693847417831421 }, { "auxiliary_loss_clip": 0.01103367, "auxiliary_loss_mlp": 0.01039514, "balance_loss_clip": 1.04989028, "balance_loss_mlp": 1.02363563, "epoch": 0.34661055163084326, "flos": 22054215553920.0, "grad_norm": 2.2905956292147045, "language_loss": 0.6769501, "learning_rate": 3.0374239675661722e-06, "loss": 0.69837892, "num_input_tokens_seen": 124033975, "step": 5765, "time_per_iteration": 2.7656123638153076 }, { "auxiliary_loss_clip": 0.01108658, "auxiliary_loss_mlp": 0.01045242, "balance_loss_clip": 1.05017447, "balance_loss_mlp": 1.0279808, "epoch": 0.3466706748835112, "flos": 21799213925760.0, "grad_norm": 2.7236728572511653, "language_loss": 0.77394044, "learning_rate": 3.03709097800413e-06, "loss": 0.79547942, "num_input_tokens_seen": 124051930, "step": 5766, "time_per_iteration": 2.7095906734466553 }, { "auxiliary_loss_clip": 0.01078684, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.04552221, "balance_loss_mlp": 1.02113521, "epoch": 0.3467307981361792, "flos": 19461680547840.0, "grad_norm": 1.6543575607114767, "language_loss": 0.73547316, "learning_rate": 3.0367579491154943e-06, "loss": 0.75661922, "num_input_tokens_seen": 124071220, "step": 5767, "time_per_iteration": 2.8161730766296387 }, { "auxiliary_loss_clip": 0.01111822, "auxiliary_loss_mlp": 0.01043875, "balance_loss_clip": 1.05307102, "balance_loss_mlp": 1.02734113, "epoch": 0.34679092138884715, "flos": 24827452905600.0, "grad_norm": 2.2530154082607776, "language_loss": 0.7832194, "learning_rate": 3.036424880912893e-06, "loss": 0.80477637, "num_input_tokens_seen": 124090140, "step": 5768, "time_per_iteration": 4.265673875808716 }, { "auxiliary_loss_clip": 0.01050543, "auxiliary_loss_mlp": 0.01012109, "balance_loss_clip": 1.0320363, "balance_loss_mlp": 1.0104636, "epoch": 0.3468510446415151, "flos": 63236070149760.0, "grad_norm": 0.7741250202123364, "language_loss": 0.57502627, "learning_rate": 3.036091773408956e-06, "loss": 0.59565282, "num_input_tokens_seen": 124152025, "step": 5769, "time_per_iteration": 3.2264139652252197 }, { "auxiliary_loss_clip": 0.01107195, "auxiliary_loss_mlp": 0.01044629, "balance_loss_clip": 1.04818511, "balance_loss_mlp": 1.02630615, "epoch": 0.3469111678941831, "flos": 12120713256960.0, "grad_norm": 2.34841523993127, "language_loss": 0.85575318, "learning_rate": 3.0357586266163154e-06, "loss": 0.87727135, "num_input_tokens_seen": 124165795, "step": 5770, "time_per_iteration": 2.7029645442962646 }, { "auxiliary_loss_clip": 0.01034922, "auxiliary_loss_mlp": 0.01007496, "balance_loss_clip": 1.02998519, "balance_loss_mlp": 1.00527906, "epoch": 0.34697129114685105, "flos": 65934110378880.0, "grad_norm": 0.7677707974310557, "language_loss": 0.59758615, "learning_rate": 3.0354254405476036e-06, "loss": 0.6180104, "num_input_tokens_seen": 124222925, "step": 5771, "time_per_iteration": 4.5523951053619385 }, { "auxiliary_loss_clip": 0.01127175, "auxiliary_loss_mlp": 0.01049141, "balance_loss_clip": 1.05249262, "balance_loss_mlp": 1.03320241, "epoch": 0.347031414399519, "flos": 34454205054720.0, "grad_norm": 1.9048919633537342, "language_loss": 0.71560407, "learning_rate": 3.0350922152154557e-06, "loss": 0.73736715, "num_input_tokens_seen": 124240915, "step": 5772, "time_per_iteration": 2.8108439445495605 }, { "auxiliary_loss_clip": 0.01108886, "auxiliary_loss_mlp": 0.0077423, "balance_loss_clip": 1.05118012, "balance_loss_mlp": 1.00077164, "epoch": 0.347091537652187, "flos": 26944135511040.0, "grad_norm": 1.679823492532721, "language_loss": 0.764898, "learning_rate": 3.034758950632507e-06, "loss": 0.78372908, "num_input_tokens_seen": 124262770, "step": 5773, "time_per_iteration": 2.813775062561035 }, { "auxiliary_loss_clip": 0.01128178, "auxiliary_loss_mlp": 0.01043067, "balance_loss_clip": 1.05019748, "balance_loss_mlp": 1.02674699, "epoch": 0.34715166090485494, "flos": 21142228216320.0, "grad_norm": 5.389351496516036, "language_loss": 0.70094979, "learning_rate": 3.034425646811396e-06, "loss": 0.72266221, "num_input_tokens_seen": 124280950, "step": 5774, "time_per_iteration": 4.167816162109375 }, { "auxiliary_loss_clip": 0.01113209, "auxiliary_loss_mlp": 0.00774032, "balance_loss_clip": 1.05024052, "balance_loss_mlp": 1.00071549, "epoch": 0.3472117841575229, "flos": 23478001827840.0, "grad_norm": 1.6687380405540382, "language_loss": 0.76013231, "learning_rate": 3.0340923037647602e-06, "loss": 0.77900469, "num_input_tokens_seen": 124299540, "step": 5775, "time_per_iteration": 2.739729404449463 }, { "auxiliary_loss_clip": 0.01114926, "auxiliary_loss_mlp": 0.01046919, "balance_loss_clip": 1.0480268, "balance_loss_mlp": 1.02965736, "epoch": 0.34727190741019087, "flos": 17492806408320.0, "grad_norm": 2.598065011523741, "language_loss": 0.77565503, "learning_rate": 3.0337589215052404e-06, "loss": 0.79727352, "num_input_tokens_seen": 124316285, "step": 5776, "time_per_iteration": 2.7339272499084473 }, { "auxiliary_loss_clip": 0.01036494, "auxiliary_loss_mlp": 0.01014475, "balance_loss_clip": 1.02741766, "balance_loss_mlp": 1.01280594, "epoch": 0.34733203066285884, "flos": 65265491640960.0, "grad_norm": 0.8358378555600092, "language_loss": 0.63272905, "learning_rate": 3.033425500045478e-06, "loss": 0.65323877, "num_input_tokens_seen": 124376650, "step": 5777, "time_per_iteration": 3.257993459701538 }, { "auxiliary_loss_clip": 0.01098381, "auxiliary_loss_mlp": 0.01045801, "balance_loss_clip": 1.04933393, "balance_loss_mlp": 1.02975535, "epoch": 0.3473921539155268, "flos": 28658726294400.0, "grad_norm": 3.5330364681008755, "language_loss": 0.6504612, "learning_rate": 3.033092039398119e-06, "loss": 0.67190301, "num_input_tokens_seen": 124396475, "step": 5778, "time_per_iteration": 2.775846481323242 }, { "auxiliary_loss_clip": 0.01113961, "auxiliary_loss_mlp": 0.01054607, "balance_loss_clip": 1.04786038, "balance_loss_mlp": 1.03903246, "epoch": 0.3474522771681948, "flos": 40836895355520.0, "grad_norm": 2.3967507755094064, "language_loss": 0.71278334, "learning_rate": 3.0327585395758046e-06, "loss": 0.73446906, "num_input_tokens_seen": 124416480, "step": 5779, "time_per_iteration": 2.7915873527526855 }, { "auxiliary_loss_clip": 0.01142932, "auxiliary_loss_mlp": 0.01053692, "balance_loss_clip": 1.05395269, "balance_loss_mlp": 1.03762269, "epoch": 0.3475124004208628, "flos": 24608577381120.0, "grad_norm": 2.0452202029673043, "language_loss": 0.62873107, "learning_rate": 3.0324250005911837e-06, "loss": 0.65069735, "num_input_tokens_seen": 124435950, "step": 5780, "time_per_iteration": 2.6743876934051514 }, { "auxiliary_loss_clip": 0.01095736, "auxiliary_loss_mlp": 0.01050069, "balance_loss_clip": 1.04648292, "balance_loss_mlp": 1.03446484, "epoch": 0.34757252367353075, "flos": 22711309004160.0, "grad_norm": 1.6009150193459345, "language_loss": 0.72167897, "learning_rate": 3.0320914224569033e-06, "loss": 0.743137, "num_input_tokens_seen": 124455410, "step": 5781, "time_per_iteration": 2.749302625656128 }, { "auxiliary_loss_clip": 0.01073898, "auxiliary_loss_mlp": 0.01052117, "balance_loss_clip": 1.040519, "balance_loss_mlp": 1.03405714, "epoch": 0.3476326469261987, "flos": 19828184970240.0, "grad_norm": 2.5507599846278644, "language_loss": 0.76966107, "learning_rate": 3.031757805185612e-06, "loss": 0.79092121, "num_input_tokens_seen": 124474870, "step": 5782, "time_per_iteration": 2.801867723464966 }, { "auxiliary_loss_clip": 0.01108825, "auxiliary_loss_mlp": 0.01037018, "balance_loss_clip": 1.05032897, "balance_loss_mlp": 1.02193785, "epoch": 0.3476927701788667, "flos": 19938107566080.0, "grad_norm": 2.367934041085959, "language_loss": 0.62506068, "learning_rate": 3.0314241487899622e-06, "loss": 0.64651906, "num_input_tokens_seen": 124494105, "step": 5783, "time_per_iteration": 2.709778070449829 }, { "auxiliary_loss_clip": 0.01092863, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.04997683, "balance_loss_mlp": 1.0163672, "epoch": 0.34775289343153465, "flos": 20735108490240.0, "grad_norm": 1.7498214415914104, "language_loss": 0.88513505, "learning_rate": 3.031090453282605e-06, "loss": 0.90637398, "num_input_tokens_seen": 124512030, "step": 5784, "time_per_iteration": 2.769317150115967 }, { "auxiliary_loss_clip": 0.01089006, "auxiliary_loss_mlp": 0.01036783, "balance_loss_clip": 1.05206084, "balance_loss_mlp": 1.02097547, "epoch": 0.3478130166842026, "flos": 19354846521600.0, "grad_norm": 1.703369857104052, "language_loss": 0.81740022, "learning_rate": 3.0307567186761946e-06, "loss": 0.83865809, "num_input_tokens_seen": 124530980, "step": 5785, "time_per_iteration": 2.791860818862915 }, { "auxiliary_loss_clip": 0.01106676, "auxiliary_loss_mlp": 0.01040592, "balance_loss_clip": 1.04747128, "balance_loss_mlp": 1.02563095, "epoch": 0.3478731399368706, "flos": 22051198811520.0, "grad_norm": 1.689422515624071, "language_loss": 0.80540836, "learning_rate": 3.0304229449833862e-06, "loss": 0.82688099, "num_input_tokens_seen": 124549330, "step": 5786, "time_per_iteration": 2.7547576427459717 }, { "auxiliary_loss_clip": 0.0113505, "auxiliary_loss_mlp": 0.00773369, "balance_loss_clip": 1.05242872, "balance_loss_mlp": 1.00073981, "epoch": 0.34793326318953854, "flos": 18041449720320.0, "grad_norm": 2.7072955912962686, "language_loss": 0.74945676, "learning_rate": 3.030089132216836e-06, "loss": 0.76854098, "num_input_tokens_seen": 124567200, "step": 5787, "time_per_iteration": 2.592688798904419 }, { "auxiliary_loss_clip": 0.01102822, "auxiliary_loss_mlp": 0.00773627, "balance_loss_clip": 1.04294109, "balance_loss_mlp": 1.00074553, "epoch": 0.3479933864422065, "flos": 29314670509440.0, "grad_norm": 1.9068485918966191, "language_loss": 0.81542754, "learning_rate": 3.029755280389203e-06, "loss": 0.83419204, "num_input_tokens_seen": 124587025, "step": 5788, "time_per_iteration": 2.84395694732666 }, { "auxiliary_loss_clip": 0.01144785, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.0562067, "balance_loss_mlp": 1.02140832, "epoch": 0.3480535096948745, "flos": 20120713332480.0, "grad_norm": 2.2432452775203964, "language_loss": 0.85701168, "learning_rate": 3.029421389513147e-06, "loss": 0.87883425, "num_input_tokens_seen": 124605860, "step": 5789, "time_per_iteration": 2.630535125732422 }, { "auxiliary_loss_clip": 0.01130136, "auxiliary_loss_mlp": 0.01056162, "balance_loss_clip": 1.05231345, "balance_loss_mlp": 1.04007459, "epoch": 0.34811363294754244, "flos": 18548974938240.0, "grad_norm": 5.008598067350991, "language_loss": 0.8502599, "learning_rate": 3.029087459601328e-06, "loss": 0.87212288, "num_input_tokens_seen": 124624270, "step": 5790, "time_per_iteration": 2.6052823066711426 }, { "auxiliary_loss_clip": 0.01130643, "auxiliary_loss_mlp": 0.01044731, "balance_loss_clip": 1.05373776, "balance_loss_mlp": 1.02904904, "epoch": 0.3481737562002104, "flos": 26870303105280.0, "grad_norm": 1.9264082121319324, "language_loss": 0.80832046, "learning_rate": 3.0287534906664097e-06, "loss": 0.83007419, "num_input_tokens_seen": 124644005, "step": 5791, "time_per_iteration": 2.7190260887145996 }, { "auxiliary_loss_clip": 0.01125872, "auxiliary_loss_mlp": 0.0104286, "balance_loss_clip": 1.04968619, "balance_loss_mlp": 1.02690983, "epoch": 0.3482338794528784, "flos": 28908664104960.0, "grad_norm": 2.4373031068755022, "language_loss": 0.77855796, "learning_rate": 3.028419482721056e-06, "loss": 0.80024529, "num_input_tokens_seen": 124663020, "step": 5792, "time_per_iteration": 2.7223403453826904 }, { "auxiliary_loss_clip": 0.01108923, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.04401517, "balance_loss_mlp": 1.01922882, "epoch": 0.3482940027055464, "flos": 22200767043840.0, "grad_norm": 1.6684091148270528, "language_loss": 0.81824791, "learning_rate": 3.0280854357779325e-06, "loss": 0.8396861, "num_input_tokens_seen": 124682975, "step": 5793, "time_per_iteration": 2.84191632270813 }, { "auxiliary_loss_clip": 0.01124823, "auxiliary_loss_mlp": 0.01055766, "balance_loss_clip": 1.05077863, "balance_loss_mlp": 1.0392313, "epoch": 0.34835412595821436, "flos": 20302708567680.0, "grad_norm": 1.8786694421525794, "language_loss": 0.7607373, "learning_rate": 3.027751349849706e-06, "loss": 0.78254318, "num_input_tokens_seen": 124701340, "step": 5794, "time_per_iteration": 2.707648515701294 }, { "auxiliary_loss_clip": 0.01123664, "auxiliary_loss_mlp": 0.01044013, "balance_loss_clip": 1.04820764, "balance_loss_mlp": 1.02735913, "epoch": 0.3484142492108823, "flos": 20449691020800.0, "grad_norm": 2.79979085265216, "language_loss": 0.57190084, "learning_rate": 3.0274172249490456e-06, "loss": 0.59357756, "num_input_tokens_seen": 124719165, "step": 5795, "time_per_iteration": 2.6533401012420654 }, { "auxiliary_loss_clip": 0.01106011, "auxiliary_loss_mlp": 0.0103693, "balance_loss_clip": 1.04720807, "balance_loss_mlp": 1.02177811, "epoch": 0.3484743724635503, "flos": 24352929308160.0, "grad_norm": 2.0564463844351546, "language_loss": 0.82218957, "learning_rate": 3.0270830610886213e-06, "loss": 0.84361899, "num_input_tokens_seen": 124738670, "step": 5796, "time_per_iteration": 2.6823246479034424 }, { "auxiliary_loss_clip": 0.01120404, "auxiliary_loss_mlp": 0.01034067, "balance_loss_clip": 1.04927754, "balance_loss_mlp": 1.0192616, "epoch": 0.34853449571621825, "flos": 24353001135360.0, "grad_norm": 1.9927036097023587, "language_loss": 0.83429003, "learning_rate": 3.0267488582811033e-06, "loss": 0.85583472, "num_input_tokens_seen": 124758760, "step": 5797, "time_per_iteration": 2.7048346996307373 }, { "auxiliary_loss_clip": 0.01132676, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.05049801, "balance_loss_mlp": 1.02151191, "epoch": 0.3485946189688862, "flos": 27267690245760.0, "grad_norm": 1.9361964581914621, "language_loss": 0.73449033, "learning_rate": 3.026414616539167e-06, "loss": 0.75618768, "num_input_tokens_seen": 124777765, "step": 5798, "time_per_iteration": 2.6807782649993896 }, { "auxiliary_loss_clip": 0.01135458, "auxiliary_loss_mlp": 0.01044729, "balance_loss_clip": 1.04995012, "balance_loss_mlp": 1.02815914, "epoch": 0.3486547422215542, "flos": 20156695781760.0, "grad_norm": 2.5738259800272725, "language_loss": 0.76111758, "learning_rate": 3.026080335875485e-06, "loss": 0.78291941, "num_input_tokens_seen": 124796775, "step": 5799, "time_per_iteration": 2.629671096801758 }, { "auxiliary_loss_clip": 0.01073192, "auxiliary_loss_mlp": 0.01035978, "balance_loss_clip": 1.05208993, "balance_loss_mlp": 1.02083826, "epoch": 0.34871486547422215, "flos": 20230348619520.0, "grad_norm": 2.242229362705527, "language_loss": 0.75801086, "learning_rate": 3.025746016302734e-06, "loss": 0.77910256, "num_input_tokens_seen": 124815825, "step": 5800, "time_per_iteration": 3.047725200653076 }, { "auxiliary_loss_clip": 0.01112927, "auxiliary_loss_mlp": 0.00774006, "balance_loss_clip": 1.04720354, "balance_loss_mlp": 1.00079536, "epoch": 0.3487749887268901, "flos": 44053234882560.0, "grad_norm": 2.6257316922509286, "language_loss": 0.67468953, "learning_rate": 3.025411657833591e-06, "loss": 0.69355887, "num_input_tokens_seen": 124838420, "step": 5801, "time_per_iteration": 3.2364816665649414 }, { "auxiliary_loss_clip": 0.01103773, "auxiliary_loss_mlp": 0.010448, "balance_loss_clip": 1.04506934, "balance_loss_mlp": 1.028754, "epoch": 0.3488351119795581, "flos": 23295144666240.0, "grad_norm": 1.8428676315803219, "language_loss": 0.76738638, "learning_rate": 3.025077260480735e-06, "loss": 0.78887206, "num_input_tokens_seen": 124857320, "step": 5802, "time_per_iteration": 2.7959024906158447 }, { "auxiliary_loss_clip": 0.01053855, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.03989601, "balance_loss_mlp": 1.02219605, "epoch": 0.34889523523222604, "flos": 19934839428480.0, "grad_norm": 1.7816673584343024, "language_loss": 0.78991377, "learning_rate": 3.0247428242568474e-06, "loss": 0.81082606, "num_input_tokens_seen": 124875685, "step": 5803, "time_per_iteration": 2.8440747261047363 }, { "auxiliary_loss_clip": 0.01111548, "auxiliary_loss_mlp": 0.00774436, "balance_loss_clip": 1.04601288, "balance_loss_mlp": 1.00073576, "epoch": 0.348955358484894, "flos": 30446179816320.0, "grad_norm": 6.169621760932873, "language_loss": 0.67899323, "learning_rate": 3.0244083491746085e-06, "loss": 0.69785309, "num_input_tokens_seen": 124895960, "step": 5804, "time_per_iteration": 2.8011341094970703 }, { "auxiliary_loss_clip": 0.01109039, "auxiliary_loss_mlp": 0.01046207, "balance_loss_clip": 1.05153811, "balance_loss_mlp": 1.0306263, "epoch": 0.349015481737562, "flos": 17999972490240.0, "grad_norm": 1.9366950093174176, "language_loss": 0.75972986, "learning_rate": 3.024073835246702e-06, "loss": 0.78128237, "num_input_tokens_seen": 124914140, "step": 5805, "time_per_iteration": 2.735410213470459 }, { "auxiliary_loss_clip": 0.01085261, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.040416, "balance_loss_mlp": 1.0230304, "epoch": 0.34907560499023, "flos": 27198490694400.0, "grad_norm": 2.3089286954803194, "language_loss": 0.67154014, "learning_rate": 3.023739282485814e-06, "loss": 0.69277781, "num_input_tokens_seen": 124934180, "step": 5806, "time_per_iteration": 2.793893575668335 }, { "auxiliary_loss_clip": 0.01122813, "auxiliary_loss_mlp": 0.0104012, "balance_loss_clip": 1.05324221, "balance_loss_mlp": 1.02445614, "epoch": 0.34913572824289796, "flos": 30226873328640.0, "grad_norm": 1.5212397526739, "language_loss": 0.71703929, "learning_rate": 3.023404690904629e-06, "loss": 0.73866862, "num_input_tokens_seen": 124956060, "step": 5807, "time_per_iteration": 2.7225730419158936 }, { "auxiliary_loss_clip": 0.01135343, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.04923332, "balance_loss_mlp": 1.02102923, "epoch": 0.3491958514955659, "flos": 29971907614080.0, "grad_norm": 2.9062872704377125, "language_loss": 0.7383548, "learning_rate": 3.0230700605158364e-06, "loss": 0.76007676, "num_input_tokens_seen": 124976070, "step": 5808, "time_per_iteration": 4.38737154006958 }, { "auxiliary_loss_clip": 0.01133483, "auxiliary_loss_mlp": 0.01047071, "balance_loss_clip": 1.05228174, "balance_loss_mlp": 1.03241384, "epoch": 0.3492559747482339, "flos": 22783273902720.0, "grad_norm": 1.513097370663534, "language_loss": 0.84501046, "learning_rate": 3.0227353913321238e-06, "loss": 0.86681598, "num_input_tokens_seen": 124996995, "step": 5809, "time_per_iteration": 2.629246711730957 }, { "auxiliary_loss_clip": 0.01106316, "auxiliary_loss_mlp": 0.01034055, "balance_loss_clip": 1.04668331, "balance_loss_mlp": 1.01995289, "epoch": 0.34931609800090185, "flos": 26068022881920.0, "grad_norm": 2.856878325415132, "language_loss": 0.80759805, "learning_rate": 3.0224006833661835e-06, "loss": 0.82900178, "num_input_tokens_seen": 125015600, "step": 5810, "time_per_iteration": 2.815232276916504 }, { "auxiliary_loss_clip": 0.01134295, "auxiliary_loss_mlp": 0.01039591, "balance_loss_clip": 1.05105019, "balance_loss_mlp": 1.02539277, "epoch": 0.3493762212535698, "flos": 29242023252480.0, "grad_norm": 1.9587859815348794, "language_loss": 0.75694251, "learning_rate": 3.0220659366307057e-06, "loss": 0.7786814, "num_input_tokens_seen": 125035290, "step": 5811, "time_per_iteration": 4.295617580413818 }, { "auxiliary_loss_clip": 0.0111498, "auxiliary_loss_mlp": 0.01040701, "balance_loss_clip": 1.04791081, "balance_loss_mlp": 1.02616942, "epoch": 0.3494363445062378, "flos": 27126058919040.0, "grad_norm": 1.5951936061604581, "language_loss": 0.80199474, "learning_rate": 3.021731151138386e-06, "loss": 0.82355154, "num_input_tokens_seen": 125057130, "step": 5812, "time_per_iteration": 2.8571486473083496 }, { "auxiliary_loss_clip": 0.0106966, "auxiliary_loss_mlp": 0.01038506, "balance_loss_clip": 1.04193187, "balance_loss_mlp": 1.02299738, "epoch": 0.34949646775890575, "flos": 12276207233280.0, "grad_norm": 1.932575417997546, "language_loss": 0.69221139, "learning_rate": 3.021396326901918e-06, "loss": 0.71329308, "num_input_tokens_seen": 125073720, "step": 5813, "time_per_iteration": 4.446147441864014 }, { "auxiliary_loss_clip": 0.01101223, "auxiliary_loss_mlp": 0.00772918, "balance_loss_clip": 1.04168797, "balance_loss_mlp": 1.00074911, "epoch": 0.3495565910115737, "flos": 17165516659200.0, "grad_norm": 2.168508070197816, "language_loss": 0.76586467, "learning_rate": 3.0210614639339998e-06, "loss": 0.7846061, "num_input_tokens_seen": 125090635, "step": 5814, "time_per_iteration": 2.698594331741333 }, { "auxiliary_loss_clip": 0.01114737, "auxiliary_loss_mlp": 0.00773337, "balance_loss_clip": 1.05010188, "balance_loss_mlp": 1.00060046, "epoch": 0.3496167142642417, "flos": 26465661417600.0, "grad_norm": 1.9777422761312171, "language_loss": 0.84760284, "learning_rate": 3.020726562247328e-06, "loss": 0.86648357, "num_input_tokens_seen": 125110070, "step": 5815, "time_per_iteration": 2.7839486598968506 }, { "auxiliary_loss_clip": 0.01117022, "auxiliary_loss_mlp": 0.01031007, "balance_loss_clip": 1.04850423, "balance_loss_mlp": 1.01695168, "epoch": 0.34967683751690964, "flos": 17414843938560.0, "grad_norm": 2.1137892099104674, "language_loss": 0.77541941, "learning_rate": 3.0203916218546024e-06, "loss": 0.79689968, "num_input_tokens_seen": 125125730, "step": 5816, "time_per_iteration": 2.6244633197784424 }, { "auxiliary_loss_clip": 0.01122041, "auxiliary_loss_mlp": 0.01042966, "balance_loss_clip": 1.05198002, "balance_loss_mlp": 1.0282141, "epoch": 0.3497369607695776, "flos": 22600021691520.0, "grad_norm": 2.2643435778821246, "language_loss": 0.5898062, "learning_rate": 3.0200566427685246e-06, "loss": 0.61145627, "num_input_tokens_seen": 125146195, "step": 5817, "time_per_iteration": 2.676058530807495 }, { "auxiliary_loss_clip": 0.01065616, "auxiliary_loss_mlp": 0.01004328, "balance_loss_clip": 1.03704262, "balance_loss_mlp": 1.00290895, "epoch": 0.34979708402224563, "flos": 68529374818560.0, "grad_norm": 0.8661744616347857, "language_loss": 0.59915632, "learning_rate": 3.0197216250017975e-06, "loss": 0.61985576, "num_input_tokens_seen": 125207790, "step": 5818, "time_per_iteration": 3.2298331260681152 }, { "auxiliary_loss_clip": 0.0109396, "auxiliary_loss_mlp": 0.01044055, "balance_loss_clip": 1.04599476, "balance_loss_mlp": 1.02892733, "epoch": 0.3498572072749136, "flos": 18989634988800.0, "grad_norm": 2.0582091611638713, "language_loss": 0.83473527, "learning_rate": 3.019386568567123e-06, "loss": 0.85611546, "num_input_tokens_seen": 125226220, "step": 5819, "time_per_iteration": 2.6558237075805664 }, { "auxiliary_loss_clip": 0.01106439, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.04502416, "balance_loss_mlp": 1.01987886, "epoch": 0.34991733052758156, "flos": 27818883423360.0, "grad_norm": 1.848700539441483, "language_loss": 0.7078613, "learning_rate": 3.0190514734772083e-06, "loss": 0.72926915, "num_input_tokens_seen": 125247485, "step": 5820, "time_per_iteration": 2.703023672103882 }, { "auxiliary_loss_clip": 0.01122902, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04821718, "balance_loss_mlp": 1.02288496, "epoch": 0.3499774537802495, "flos": 33584197737600.0, "grad_norm": 1.691680241057735, "language_loss": 0.70418453, "learning_rate": 3.018716339744759e-06, "loss": 0.7257812, "num_input_tokens_seen": 125268625, "step": 5821, "time_per_iteration": 2.7258172035217285 }, { "auxiliary_loss_clip": 0.01128016, "auxiliary_loss_mlp": 0.01045237, "balance_loss_clip": 1.05040097, "balance_loss_mlp": 1.02945328, "epoch": 0.3500375770329175, "flos": 23476744851840.0, "grad_norm": 3.022669367007059, "language_loss": 0.73552108, "learning_rate": 3.0183811673824842e-06, "loss": 0.75725359, "num_input_tokens_seen": 125287530, "step": 5822, "time_per_iteration": 2.6288442611694336 }, { "auxiliary_loss_clip": 0.01111612, "auxiliary_loss_mlp": 0.01034787, "balance_loss_clip": 1.04867673, "balance_loss_mlp": 1.0193131, "epoch": 0.35009770028558546, "flos": 19026048401280.0, "grad_norm": 13.86145468617928, "language_loss": 0.78286207, "learning_rate": 3.018045956403094e-06, "loss": 0.80432606, "num_input_tokens_seen": 125307020, "step": 5823, "time_per_iteration": 2.585644245147705 }, { "auxiliary_loss_clip": 0.01050549, "auxiliary_loss_mlp": 0.01002993, "balance_loss_clip": 1.03169346, "balance_loss_mlp": 1.00141954, "epoch": 0.3501578235382534, "flos": 68351868783360.0, "grad_norm": 0.7268668465066358, "language_loss": 0.59232962, "learning_rate": 3.017710706819298e-06, "loss": 0.61286497, "num_input_tokens_seen": 125370445, "step": 5824, "time_per_iteration": 3.2155251502990723 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01041197, "balance_loss_clip": 1.04737854, "balance_loss_mlp": 1.02561092, "epoch": 0.3502179467909214, "flos": 21250893836160.0, "grad_norm": 3.9873136748139126, "language_loss": 0.84533477, "learning_rate": 3.017375418643811e-06, "loss": 0.86685359, "num_input_tokens_seen": 125388900, "step": 5825, "time_per_iteration": 2.687849998474121 }, { "auxiliary_loss_clip": 0.01123129, "auxiliary_loss_mlp": 0.00772852, "balance_loss_clip": 1.04982102, "balance_loss_mlp": 1.00084817, "epoch": 0.35027807004358935, "flos": 11942955826560.0, "grad_norm": 3.7970216760931654, "language_loss": 0.83272213, "learning_rate": 3.0170400918893464e-06, "loss": 0.85168195, "num_input_tokens_seen": 125402675, "step": 5826, "time_per_iteration": 2.623713970184326 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.04680669, "balance_loss_mlp": 1.0308249, "epoch": 0.3503381932962573, "flos": 21470918595840.0, "grad_norm": 1.799644232020304, "language_loss": 0.8068707, "learning_rate": 3.0167047265686186e-06, "loss": 0.82841766, "num_input_tokens_seen": 125421360, "step": 5827, "time_per_iteration": 2.7149739265441895 }, { "auxiliary_loss_clip": 0.01080927, "auxiliary_loss_mlp": 0.01041383, "balance_loss_clip": 1.04276204, "balance_loss_mlp": 1.02641606, "epoch": 0.3503983165489253, "flos": 21251109317760.0, "grad_norm": 3.105536532024743, "language_loss": 0.71077561, "learning_rate": 3.0163693226943467e-06, "loss": 0.73199868, "num_input_tokens_seen": 125440000, "step": 5828, "time_per_iteration": 2.7468550205230713 }, { "auxiliary_loss_clip": 0.01126682, "auxiliary_loss_mlp": 0.01050267, "balance_loss_clip": 1.05060673, "balance_loss_mlp": 1.0323143, "epoch": 0.35045843980159325, "flos": 27815723026560.0, "grad_norm": 2.750124615693701, "language_loss": 0.79695857, "learning_rate": 3.016033880279248e-06, "loss": 0.81872809, "num_input_tokens_seen": 125460390, "step": 5829, "time_per_iteration": 2.6937646865844727 }, { "auxiliary_loss_clip": 0.01096574, "auxiliary_loss_mlp": 0.01044418, "balance_loss_clip": 1.0481379, "balance_loss_mlp": 1.02766919, "epoch": 0.3505185630542612, "flos": 25921148169600.0, "grad_norm": 1.9090298023730403, "language_loss": 0.72606629, "learning_rate": 3.0156983993360417e-06, "loss": 0.74747616, "num_input_tokens_seen": 125478410, "step": 5830, "time_per_iteration": 2.7369346618652344 }, { "auxiliary_loss_clip": 0.01090166, "auxiliary_loss_mlp": 0.01037306, "balance_loss_clip": 1.04190445, "balance_loss_mlp": 1.02131414, "epoch": 0.35057868630692923, "flos": 20521763660160.0, "grad_norm": 2.5268343856675437, "language_loss": 0.88473773, "learning_rate": 3.0153628798774513e-06, "loss": 0.90601242, "num_input_tokens_seen": 125495975, "step": 5831, "time_per_iteration": 2.716801166534424 }, { "auxiliary_loss_clip": 0.01076431, "auxiliary_loss_mlp": 0.01046131, "balance_loss_clip": 1.04348278, "balance_loss_mlp": 1.03036547, "epoch": 0.3506388095595972, "flos": 20448649526400.0, "grad_norm": 2.8335622037275052, "language_loss": 0.78706706, "learning_rate": 3.0150273219161985e-06, "loss": 0.80829263, "num_input_tokens_seen": 125515035, "step": 5832, "time_per_iteration": 2.719874143600464 }, { "auxiliary_loss_clip": 0.01096023, "auxiliary_loss_mlp": 0.01049214, "balance_loss_clip": 1.04483593, "balance_loss_mlp": 1.0303669, "epoch": 0.35069893281226516, "flos": 23109665811840.0, "grad_norm": 2.771771323399588, "language_loss": 0.71084702, "learning_rate": 3.014691725465008e-06, "loss": 0.73229945, "num_input_tokens_seen": 125535555, "step": 5833, "time_per_iteration": 2.729029655456543 }, { "auxiliary_loss_clip": 0.0111933, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.04690456, "balance_loss_mlp": 1.02119827, "epoch": 0.35075905606493313, "flos": 27271999877760.0, "grad_norm": 1.4652984704802052, "language_loss": 0.80866987, "learning_rate": 3.014356090536606e-06, "loss": 0.830221, "num_input_tokens_seen": 125558195, "step": 5834, "time_per_iteration": 2.6999855041503906 }, { "auxiliary_loss_clip": 0.01086162, "auxiliary_loss_mlp": 0.01041057, "balance_loss_clip": 1.05142856, "balance_loss_mlp": 1.02516639, "epoch": 0.3508191793176011, "flos": 19128608709120.0, "grad_norm": 2.24398587431922, "language_loss": 0.84067535, "learning_rate": 3.0140204171437183e-06, "loss": 0.86194754, "num_input_tokens_seen": 125575375, "step": 5835, "time_per_iteration": 2.7401607036590576 }, { "auxiliary_loss_clip": 0.01072219, "auxiliary_loss_mlp": 0.0104369, "balance_loss_clip": 1.04324877, "balance_loss_mlp": 1.02816927, "epoch": 0.35087930257026906, "flos": 25557588662400.0, "grad_norm": 1.6286460178957367, "language_loss": 0.76643491, "learning_rate": 3.0136847052990754e-06, "loss": 0.78759408, "num_input_tokens_seen": 125596745, "step": 5836, "time_per_iteration": 2.767824649810791 }, { "auxiliary_loss_clip": 0.01095252, "auxiliary_loss_mlp": 0.01044499, "balance_loss_clip": 1.04785156, "balance_loss_mlp": 1.02751756, "epoch": 0.350939425822937, "flos": 18004246208640.0, "grad_norm": 2.0145924652365945, "language_loss": 0.77402902, "learning_rate": 3.0133489550154074e-06, "loss": 0.79542655, "num_input_tokens_seen": 125613980, "step": 5837, "time_per_iteration": 2.684300661087036 }, { "auxiliary_loss_clip": 0.01122261, "auxiliary_loss_mlp": 0.01044889, "balance_loss_clip": 1.04895687, "balance_loss_mlp": 1.02941537, "epoch": 0.350999549075605, "flos": 22273198819200.0, "grad_norm": 2.68275803808264, "language_loss": 0.67695981, "learning_rate": 3.0130131663054442e-06, "loss": 0.69863135, "num_input_tokens_seen": 125632100, "step": 5838, "time_per_iteration": 2.6679129600524902 }, { "auxiliary_loss_clip": 0.01133084, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.04808521, "balance_loss_mlp": 1.02538526, "epoch": 0.35105967232827295, "flos": 14392279307520.0, "grad_norm": 2.478699358378921, "language_loss": 0.83575064, "learning_rate": 3.0126773391819215e-06, "loss": 0.85749567, "num_input_tokens_seen": 125649190, "step": 5839, "time_per_iteration": 2.7186849117279053 }, { "auxiliary_loss_clip": 0.01125827, "auxiliary_loss_mlp": 0.01045138, "balance_loss_clip": 1.0484879, "balance_loss_mlp": 1.02930689, "epoch": 0.3511197955809409, "flos": 25082346792960.0, "grad_norm": 2.56286420283892, "language_loss": 0.58882701, "learning_rate": 3.012341473657572e-06, "loss": 0.61053669, "num_input_tokens_seen": 125668680, "step": 5840, "time_per_iteration": 2.7048165798187256 }, { "auxiliary_loss_clip": 0.01093858, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.0449121, "balance_loss_mlp": 1.02719963, "epoch": 0.3511799188336089, "flos": 25884160139520.0, "grad_norm": 2.762376787670534, "language_loss": 0.87442869, "learning_rate": 3.0120055697451322e-06, "loss": 0.89579934, "num_input_tokens_seen": 125686935, "step": 5841, "time_per_iteration": 2.763007402420044 }, { "auxiliary_loss_clip": 0.01116677, "auxiliary_loss_mlp": 0.01038697, "balance_loss_clip": 1.04990196, "balance_loss_mlp": 1.02083993, "epoch": 0.35124004208627685, "flos": 20083725302400.0, "grad_norm": 1.9868500880648916, "language_loss": 0.75116056, "learning_rate": 3.0116696274573406e-06, "loss": 0.77271438, "num_input_tokens_seen": 125707180, "step": 5842, "time_per_iteration": 2.703010082244873 }, { "auxiliary_loss_clip": 0.01124735, "auxiliary_loss_mlp": 0.01045785, "balance_loss_clip": 1.04863322, "balance_loss_mlp": 1.0302043, "epoch": 0.3513001653389448, "flos": 17783431349760.0, "grad_norm": 2.134458584945634, "language_loss": 0.68687361, "learning_rate": 3.0113336468069346e-06, "loss": 0.70857882, "num_input_tokens_seen": 125722780, "step": 5843, "time_per_iteration": 2.6459767818450928 }, { "auxiliary_loss_clip": 0.01135637, "auxiliary_loss_mlp": 0.01046534, "balance_loss_clip": 1.05054379, "balance_loss_mlp": 1.0305481, "epoch": 0.3513602885916128, "flos": 29387138198400.0, "grad_norm": 2.0610262324560984, "language_loss": 0.65392244, "learning_rate": 3.010997627806655e-06, "loss": 0.67574418, "num_input_tokens_seen": 125742110, "step": 5844, "time_per_iteration": 2.6542131900787354 }, { "auxiliary_loss_clip": 0.01119986, "auxiliary_loss_mlp": 0.01042575, "balance_loss_clip": 1.04791713, "balance_loss_mlp": 1.02620745, "epoch": 0.3514204118442808, "flos": 16179876483840.0, "grad_norm": 2.0120705985466394, "language_loss": 0.75180912, "learning_rate": 3.010661570469245e-06, "loss": 0.77343476, "num_input_tokens_seen": 125759980, "step": 5845, "time_per_iteration": 2.686753511428833 }, { "auxiliary_loss_clip": 0.01122626, "auxiliary_loss_mlp": 0.01043989, "balance_loss_clip": 1.0485301, "balance_loss_mlp": 1.02835488, "epoch": 0.35148053509694877, "flos": 23834665923840.0, "grad_norm": 4.021226487899694, "language_loss": 0.73548663, "learning_rate": 3.0103254748074465e-06, "loss": 0.7571528, "num_input_tokens_seen": 125772660, "step": 5846, "time_per_iteration": 2.67868971824646 }, { "auxiliary_loss_clip": 0.01094187, "auxiliary_loss_mlp": 0.01044379, "balance_loss_clip": 1.04565465, "balance_loss_mlp": 1.02834511, "epoch": 0.35154065834961673, "flos": 20991295267200.0, "grad_norm": 1.687499817432144, "language_loss": 0.756024, "learning_rate": 3.0099893408340046e-06, "loss": 0.77740967, "num_input_tokens_seen": 125791935, "step": 5847, "time_per_iteration": 2.749495267868042 }, { "auxiliary_loss_clip": 0.011087, "auxiliary_loss_mlp": 0.01034036, "balance_loss_clip": 1.04465413, "balance_loss_mlp": 1.01871789, "epoch": 0.3516007816022847, "flos": 33255471444480.0, "grad_norm": 2.8847551511625675, "language_loss": 0.71752924, "learning_rate": 3.009653168561666e-06, "loss": 0.73895657, "num_input_tokens_seen": 125813455, "step": 5848, "time_per_iteration": 4.367843151092529 }, { "auxiliary_loss_clip": 0.0111724, "auxiliary_loss_mlp": 0.01051356, "balance_loss_clip": 1.04754996, "balance_loss_mlp": 1.03528619, "epoch": 0.35166090485495266, "flos": 11726953390080.0, "grad_norm": 2.1303857634409455, "language_loss": 0.89211285, "learning_rate": 3.009316958003178e-06, "loss": 0.91379881, "num_input_tokens_seen": 125827660, "step": 5849, "time_per_iteration": 2.720156192779541 }, { "auxiliary_loss_clip": 0.01112345, "auxiliary_loss_mlp": 0.01035199, "balance_loss_clip": 1.04670548, "balance_loss_mlp": 1.01948714, "epoch": 0.3517210281076206, "flos": 22638446265600.0, "grad_norm": 5.671837642447228, "language_loss": 0.74645329, "learning_rate": 3.0089807091712897e-06, "loss": 0.76792872, "num_input_tokens_seen": 125846655, "step": 5850, "time_per_iteration": 5.769666910171509 }, { "auxiliary_loss_clip": 0.01124277, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.05061293, "balance_loss_mlp": 1.02304828, "epoch": 0.3517811513602886, "flos": 21322750993920.0, "grad_norm": 4.453824391316201, "language_loss": 0.75497609, "learning_rate": 3.0086444220787515e-06, "loss": 0.77661049, "num_input_tokens_seen": 125866290, "step": 5851, "time_per_iteration": 2.6903436183929443 }, { "auxiliary_loss_clip": 0.01109028, "auxiliary_loss_mlp": 0.01043585, "balance_loss_clip": 1.047647, "balance_loss_mlp": 1.02581048, "epoch": 0.35184127461295656, "flos": 21032880238080.0, "grad_norm": 2.6842208339362714, "language_loss": 0.8711859, "learning_rate": 3.0083080967383165e-06, "loss": 0.892712, "num_input_tokens_seen": 125884620, "step": 5852, "time_per_iteration": 4.37211275100708 }, { "auxiliary_loss_clip": 0.01134086, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.05088282, "balance_loss_mlp": 1.02020407, "epoch": 0.3519013978656245, "flos": 22455265881600.0, "grad_norm": 4.894656899057391, "language_loss": 0.67756367, "learning_rate": 3.007971733162737e-06, "loss": 0.69925427, "num_input_tokens_seen": 125902430, "step": 5853, "time_per_iteration": 2.6657445430755615 }, { "auxiliary_loss_clip": 0.0110992, "auxiliary_loss_mlp": 0.01035315, "balance_loss_clip": 1.04499912, "balance_loss_mlp": 1.01943672, "epoch": 0.3519615211182925, "flos": 13115295918720.0, "grad_norm": 1.9396695842158058, "language_loss": 0.80834955, "learning_rate": 3.0076353313647686e-06, "loss": 0.82980192, "num_input_tokens_seen": 125920570, "step": 5854, "time_per_iteration": 2.741804361343384 }, { "auxiliary_loss_clip": 0.0111683, "auxiliary_loss_mlp": 0.01035573, "balance_loss_clip": 1.05230534, "balance_loss_mlp": 1.02117872, "epoch": 0.35202164437096045, "flos": 19135144984320.0, "grad_norm": 2.236186864476635, "language_loss": 0.73234653, "learning_rate": 3.0072988913571666e-06, "loss": 0.75387061, "num_input_tokens_seen": 125939800, "step": 5855, "time_per_iteration": 2.730731725692749 }, { "auxiliary_loss_clip": 0.0113392, "auxiliary_loss_mlp": 0.01038425, "balance_loss_clip": 1.05024409, "balance_loss_mlp": 1.02407861, "epoch": 0.3520817676236284, "flos": 26542187343360.0, "grad_norm": 2.4482136775911427, "language_loss": 0.71000826, "learning_rate": 3.006962413152691e-06, "loss": 0.73173165, "num_input_tokens_seen": 125958720, "step": 5856, "time_per_iteration": 2.632906436920166 }, { "auxiliary_loss_clip": 0.01121339, "auxiliary_loss_mlp": 0.01047265, "balance_loss_clip": 1.0479008, "balance_loss_mlp": 1.03056359, "epoch": 0.3521418908762964, "flos": 44893472803200.0, "grad_norm": 1.9582827204032656, "language_loss": 0.61505377, "learning_rate": 3.0066258967640987e-06, "loss": 0.63673985, "num_input_tokens_seen": 125984310, "step": 5857, "time_per_iteration": 2.8992249965667725 }, { "auxiliary_loss_clip": 0.01126198, "auxiliary_loss_mlp": 0.0103782, "balance_loss_clip": 1.05141187, "balance_loss_mlp": 1.02197754, "epoch": 0.3522020141289644, "flos": 20187398931840.0, "grad_norm": 2.047463358229584, "language_loss": 0.73246485, "learning_rate": 3.006289342204152e-06, "loss": 0.75410509, "num_input_tokens_seen": 126002410, "step": 5858, "time_per_iteration": 2.6754567623138428 }, { "auxiliary_loss_clip": 0.01139705, "auxiliary_loss_mlp": 0.01044718, "balance_loss_clip": 1.05193448, "balance_loss_mlp": 1.028947, "epoch": 0.35226213738163237, "flos": 27563917708800.0, "grad_norm": 1.8174320112537778, "language_loss": 0.7662344, "learning_rate": 3.0059527494856126e-06, "loss": 0.78807867, "num_input_tokens_seen": 126022490, "step": 5859, "time_per_iteration": 2.6464414596557617 }, { "auxiliary_loss_clip": 0.01123734, "auxiliary_loss_mlp": 0.0104748, "balance_loss_clip": 1.05600715, "balance_loss_mlp": 1.03037381, "epoch": 0.35232226063430033, "flos": 22966310632320.0, "grad_norm": 2.0728265984729974, "language_loss": 0.71452159, "learning_rate": 3.0056161186212435e-06, "loss": 0.73623371, "num_input_tokens_seen": 126042895, "step": 5860, "time_per_iteration": 2.7567954063415527 }, { "auxiliary_loss_clip": 0.01107752, "auxiliary_loss_mlp": 0.01042463, "balance_loss_clip": 1.04505348, "balance_loss_mlp": 1.02517724, "epoch": 0.3523823838869683, "flos": 19168290259200.0, "grad_norm": 2.4820154826508896, "language_loss": 0.66456246, "learning_rate": 3.005279449623811e-06, "loss": 0.6860646, "num_input_tokens_seen": 126060130, "step": 5861, "time_per_iteration": 2.6954853534698486 }, { "auxiliary_loss_clip": 0.01114832, "auxiliary_loss_mlp": 0.01037396, "balance_loss_clip": 1.05085611, "balance_loss_mlp": 1.0220778, "epoch": 0.35244250713963626, "flos": 17930988420480.0, "grad_norm": 2.552495084661914, "language_loss": 0.66833258, "learning_rate": 3.0049427425060815e-06, "loss": 0.68985492, "num_input_tokens_seen": 126077850, "step": 5862, "time_per_iteration": 2.758626699447632 }, { "auxiliary_loss_clip": 0.01111543, "auxiliary_loss_mlp": 0.01046885, "balance_loss_clip": 1.04932082, "balance_loss_mlp": 1.02999306, "epoch": 0.35250263039230423, "flos": 21432529935360.0, "grad_norm": 2.001922070828984, "language_loss": 0.77027225, "learning_rate": 3.0046059972808215e-06, "loss": 0.79185653, "num_input_tokens_seen": 126095985, "step": 5863, "time_per_iteration": 2.692974328994751 }, { "auxiliary_loss_clip": 0.01124448, "auxiliary_loss_mlp": 0.01041257, "balance_loss_clip": 1.05029762, "balance_loss_mlp": 1.02602828, "epoch": 0.3525627536449722, "flos": 27416863428480.0, "grad_norm": 2.204178263750967, "language_loss": 0.75406265, "learning_rate": 3.0042692139608024e-06, "loss": 0.77571976, "num_input_tokens_seen": 126116070, "step": 5864, "time_per_iteration": 2.7303273677825928 }, { "auxiliary_loss_clip": 0.01124417, "auxiliary_loss_mlp": 0.01048097, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 1.03237331, "epoch": 0.35262287689764016, "flos": 24789818430720.0, "grad_norm": 2.3571129928423713, "language_loss": 0.79312253, "learning_rate": 3.003932392558793e-06, "loss": 0.81484771, "num_input_tokens_seen": 126135205, "step": 5865, "time_per_iteration": 2.6439075469970703 }, { "auxiliary_loss_clip": 0.01136688, "auxiliary_loss_mlp": 0.01047929, "balance_loss_clip": 1.05626893, "balance_loss_mlp": 1.03143001, "epoch": 0.3526830001503081, "flos": 17821604528640.0, "grad_norm": 2.261768767041389, "language_loss": 0.81215894, "learning_rate": 3.0035955330875677e-06, "loss": 0.83400512, "num_input_tokens_seen": 126151895, "step": 5866, "time_per_iteration": 2.649991035461426 }, { "auxiliary_loss_clip": 0.01095064, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.04940605, "balance_loss_mlp": 1.0227983, "epoch": 0.3527431234029761, "flos": 18078114528000.0, "grad_norm": 2.4092573216113182, "language_loss": 0.84224141, "learning_rate": 3.0032586355598986e-06, "loss": 0.86359721, "num_input_tokens_seen": 126168515, "step": 5867, "time_per_iteration": 2.7634172439575195 }, { "auxiliary_loss_clip": 0.01142449, "auxiliary_loss_mlp": 0.01051484, "balance_loss_clip": 1.05421114, "balance_loss_mlp": 1.03525996, "epoch": 0.35280324665564405, "flos": 19427350124160.0, "grad_norm": 1.8115003163784764, "language_loss": 0.74367464, "learning_rate": 3.0029216999885613e-06, "loss": 0.76561391, "num_input_tokens_seen": 126186460, "step": 5868, "time_per_iteration": 2.5986721515655518 }, { "auxiliary_loss_clip": 0.01131163, "auxiliary_loss_mlp": 0.01040977, "balance_loss_clip": 1.05391645, "balance_loss_mlp": 1.02457356, "epoch": 0.352863369908312, "flos": 21504027957120.0, "grad_norm": 1.9536193185751474, "language_loss": 0.6105355, "learning_rate": 3.0025847263863327e-06, "loss": 0.63225693, "num_input_tokens_seen": 126206170, "step": 5869, "time_per_iteration": 2.6737887859344482 }, { "auxiliary_loss_clip": 0.0112854, "auxiliary_loss_mlp": 0.01048512, "balance_loss_clip": 1.05128717, "balance_loss_mlp": 1.03254998, "epoch": 0.35292349316098, "flos": 22309504490880.0, "grad_norm": 2.4234624332717347, "language_loss": 0.74279565, "learning_rate": 3.0022477147659917e-06, "loss": 0.76456618, "num_input_tokens_seen": 126225605, "step": 5870, "time_per_iteration": 2.6921114921569824 }, { "auxiliary_loss_clip": 0.01126478, "auxiliary_loss_mlp": 0.01039703, "balance_loss_clip": 1.05037582, "balance_loss_mlp": 1.02376485, "epoch": 0.352983616413648, "flos": 33109745967360.0, "grad_norm": 1.6641276231491144, "language_loss": 0.71796882, "learning_rate": 3.001910665140316e-06, "loss": 0.73963058, "num_input_tokens_seen": 126250230, "step": 5871, "time_per_iteration": 2.8457682132720947 }, { "auxiliary_loss_clip": 0.01120204, "auxiliary_loss_mlp": 0.01040363, "balance_loss_clip": 1.04829907, "balance_loss_mlp": 1.02547359, "epoch": 0.35304373966631597, "flos": 18696603836160.0, "grad_norm": 2.0001362497177233, "language_loss": 0.73279023, "learning_rate": 3.0015735775220873e-06, "loss": 0.75439584, "num_input_tokens_seen": 126268315, "step": 5872, "time_per_iteration": 2.6763055324554443 }, { "auxiliary_loss_clip": 0.01114426, "auxiliary_loss_mlp": 0.0077352, "balance_loss_clip": 1.04808497, "balance_loss_mlp": 1.00056779, "epoch": 0.35310386291898394, "flos": 23364954748800.0, "grad_norm": 1.9067005964756008, "language_loss": 0.82472706, "learning_rate": 3.001236451924089e-06, "loss": 0.84360659, "num_input_tokens_seen": 126288390, "step": 5873, "time_per_iteration": 2.7487120628356934 }, { "auxiliary_loss_clip": 0.0111852, "auxiliary_loss_mlp": 0.01055173, "balance_loss_clip": 1.04805684, "balance_loss_mlp": 1.03743458, "epoch": 0.3531639861716519, "flos": 24461954064000.0, "grad_norm": 2.0747562837168956, "language_loss": 0.65867126, "learning_rate": 3.000899288359104e-06, "loss": 0.68040824, "num_input_tokens_seen": 126305750, "step": 5874, "time_per_iteration": 2.717100143432617 }, { "auxiliary_loss_clip": 0.01065517, "auxiliary_loss_mlp": 0.01018804, "balance_loss_clip": 1.04397154, "balance_loss_mlp": 1.01712346, "epoch": 0.35322410942431987, "flos": 70312446881280.0, "grad_norm": 0.7718710282270123, "language_loss": 0.61513722, "learning_rate": 3.000562086839917e-06, "loss": 0.63598049, "num_input_tokens_seen": 126362495, "step": 5875, "time_per_iteration": 3.1768009662628174 }, { "auxiliary_loss_clip": 0.0106968, "auxiliary_loss_mlp": 0.01053019, "balance_loss_clip": 1.04069328, "balance_loss_mlp": 1.03722405, "epoch": 0.35328423267698783, "flos": 19820894509440.0, "grad_norm": 1.9274751499515825, "language_loss": 0.79748046, "learning_rate": 3.0002248473793163e-06, "loss": 0.81870747, "num_input_tokens_seen": 126378320, "step": 5876, "time_per_iteration": 2.7911314964294434 }, { "auxiliary_loss_clip": 0.01038976, "auxiliary_loss_mlp": 0.00753375, "balance_loss_clip": 1.03853297, "balance_loss_mlp": 1.00146759, "epoch": 0.3533443559296558, "flos": 60826356391680.0, "grad_norm": 0.6715924709851474, "language_loss": 0.56771934, "learning_rate": 2.999887569990088e-06, "loss": 0.58564281, "num_input_tokens_seen": 126442735, "step": 5877, "time_per_iteration": 3.3190126419067383 }, { "auxiliary_loss_clip": 0.01106988, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.04755747, "balance_loss_mlp": 1.02150357, "epoch": 0.35340447918232376, "flos": 24755775315840.0, "grad_norm": 2.262624772342981, "language_loss": 0.72041059, "learning_rate": 2.999550254685024e-06, "loss": 0.74185729, "num_input_tokens_seen": 126463090, "step": 5878, "time_per_iteration": 2.769482135772705 }, { "auxiliary_loss_clip": 0.01111223, "auxiliary_loss_mlp": 0.01039233, "balance_loss_clip": 1.0494144, "balance_loss_mlp": 1.02333045, "epoch": 0.3534646024349917, "flos": 21796304924160.0, "grad_norm": 1.9529875004972157, "language_loss": 0.78282005, "learning_rate": 2.9992129014769136e-06, "loss": 0.80432463, "num_input_tokens_seen": 126482105, "step": 5879, "time_per_iteration": 2.7066614627838135 }, { "auxiliary_loss_clip": 0.01111375, "auxiliary_loss_mlp": 0.01046843, "balance_loss_clip": 1.05344558, "balance_loss_mlp": 1.0287354, "epoch": 0.3535247256876597, "flos": 20012119539840.0, "grad_norm": 2.4774809869114547, "language_loss": 0.63312674, "learning_rate": 2.9988755103785493e-06, "loss": 0.65470898, "num_input_tokens_seen": 126502125, "step": 5880, "time_per_iteration": 2.87187123298645 }, { "auxiliary_loss_clip": 0.01116729, "auxiliary_loss_mlp": 0.01037267, "balance_loss_clip": 1.05014002, "balance_loss_mlp": 1.02067327, "epoch": 0.35358484894032766, "flos": 18187929383040.0, "grad_norm": 2.079670586085082, "language_loss": 0.65503716, "learning_rate": 2.998538081402727e-06, "loss": 0.67657715, "num_input_tokens_seen": 126521950, "step": 5881, "time_per_iteration": 2.701570510864258 }, { "auxiliary_loss_clip": 0.01119778, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.05182576, "balance_loss_mlp": 1.02047253, "epoch": 0.3536449721929956, "flos": 22820369673600.0, "grad_norm": 1.437925300063569, "language_loss": 0.75797737, "learning_rate": 2.998200614562239e-06, "loss": 0.77953088, "num_input_tokens_seen": 126542445, "step": 5882, "time_per_iteration": 2.713350772857666 }, { "auxiliary_loss_clip": 0.01112568, "auxiliary_loss_mlp": 0.01044857, "balance_loss_clip": 1.0485872, "balance_loss_mlp": 1.02591491, "epoch": 0.3537050954456636, "flos": 26432336574720.0, "grad_norm": 2.160470372067537, "language_loss": 0.70095098, "learning_rate": 2.9978631098698847e-06, "loss": 0.72252524, "num_input_tokens_seen": 126560690, "step": 5883, "time_per_iteration": 2.77695631980896 }, { "auxiliary_loss_clip": 0.01107169, "auxiliary_loss_mlp": 0.01040706, "balance_loss_clip": 1.04937398, "balance_loss_mlp": 1.02364671, "epoch": 0.3537652186983316, "flos": 17197153562880.0, "grad_norm": 3.3935912100169117, "language_loss": 0.78052664, "learning_rate": 2.9975255673384614e-06, "loss": 0.80200535, "num_input_tokens_seen": 126577620, "step": 5884, "time_per_iteration": 2.8704800605773926 }, { "auxiliary_loss_clip": 0.0111409, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.05093837, "balance_loss_mlp": 1.02157819, "epoch": 0.3538253419509996, "flos": 19536769929600.0, "grad_norm": 1.9052381201351025, "language_loss": 0.7519542, "learning_rate": 2.9971879869807673e-06, "loss": 0.77346253, "num_input_tokens_seen": 126596235, "step": 5885, "time_per_iteration": 2.74930477142334 }, { "auxiliary_loss_clip": 0.01088229, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04355764, "balance_loss_mlp": 1.02321255, "epoch": 0.35388546520366754, "flos": 12128578335360.0, "grad_norm": 3.360136520151105, "language_loss": 0.83904099, "learning_rate": 2.996850368809606e-06, "loss": 0.86033243, "num_input_tokens_seen": 126612830, "step": 5886, "time_per_iteration": 2.9362361431121826 }, { "auxiliary_loss_clip": 0.01139122, "auxiliary_loss_mlp": 0.01039479, "balance_loss_clip": 1.05223978, "balance_loss_mlp": 1.02178788, "epoch": 0.3539455884563355, "flos": 19678149861120.0, "grad_norm": 2.3342407880968765, "language_loss": 0.78239143, "learning_rate": 2.9965127128377787e-06, "loss": 0.8041774, "num_input_tokens_seen": 126630910, "step": 5887, "time_per_iteration": 4.157519340515137 }, { "auxiliary_loss_clip": 0.01079386, "auxiliary_loss_mlp": 0.01047635, "balance_loss_clip": 1.04380405, "balance_loss_mlp": 1.03155398, "epoch": 0.35400571170900347, "flos": 18072045129600.0, "grad_norm": 3.4693260211189614, "language_loss": 0.65532601, "learning_rate": 2.996175019078089e-06, "loss": 0.67659628, "num_input_tokens_seen": 126648365, "step": 5888, "time_per_iteration": 2.7693519592285156 }, { "auxiliary_loss_clip": 0.01108859, "auxiliary_loss_mlp": 0.01038745, "balance_loss_clip": 1.04853678, "balance_loss_mlp": 1.02278328, "epoch": 0.35406583496167143, "flos": 26068058795520.0, "grad_norm": 2.324375134725136, "language_loss": 0.77100271, "learning_rate": 2.9958372875433437e-06, "loss": 0.7924788, "num_input_tokens_seen": 126667500, "step": 5889, "time_per_iteration": 4.211338996887207 }, { "auxiliary_loss_clip": 0.0110217, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.05017257, "balance_loss_mlp": 1.0262332, "epoch": 0.3541259582143394, "flos": 19792453916160.0, "grad_norm": 2.074151752869495, "language_loss": 0.81132901, "learning_rate": 2.9954995182463478e-06, "loss": 0.83276576, "num_input_tokens_seen": 126686820, "step": 5890, "time_per_iteration": 4.248823642730713 }, { "auxiliary_loss_clip": 0.01112591, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.04692972, "balance_loss_mlp": 1.01979923, "epoch": 0.35418608146700736, "flos": 24022084112640.0, "grad_norm": 1.8036187380252735, "language_loss": 0.79384875, "learning_rate": 2.99516171119991e-06, "loss": 0.81532121, "num_input_tokens_seen": 126706965, "step": 5891, "time_per_iteration": 4.335815668106079 }, { "auxiliary_loss_clip": 0.01099264, "auxiliary_loss_mlp": 0.01046084, "balance_loss_clip": 1.04669261, "balance_loss_mlp": 1.0285244, "epoch": 0.35424620471967533, "flos": 12385770693120.0, "grad_norm": 2.015603194975926, "language_loss": 0.73404211, "learning_rate": 2.9948238664168415e-06, "loss": 0.75549555, "num_input_tokens_seen": 126724015, "step": 5892, "time_per_iteration": 2.760498046875 }, { "auxiliary_loss_clip": 0.01112321, "auxiliary_loss_mlp": 0.01041472, "balance_loss_clip": 1.04650092, "balance_loss_mlp": 1.02434158, "epoch": 0.3543063279723433, "flos": 19673624747520.0, "grad_norm": 2.094655212929219, "language_loss": 0.6720162, "learning_rate": 2.9944859839099518e-06, "loss": 0.6935541, "num_input_tokens_seen": 126737565, "step": 5893, "time_per_iteration": 2.671706199645996 }, { "auxiliary_loss_clip": 0.01084647, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.04317796, "balance_loss_mlp": 1.02440834, "epoch": 0.35436645122501126, "flos": 21909208348800.0, "grad_norm": 1.9115541405313234, "language_loss": 0.69860309, "learning_rate": 2.9941480636920533e-06, "loss": 0.71986485, "num_input_tokens_seen": 126756095, "step": 5894, "time_per_iteration": 2.720066785812378 }, { "auxiliary_loss_clip": 0.01111006, "auxiliary_loss_mlp": 0.00773076, "balance_loss_clip": 1.04764175, "balance_loss_mlp": 1.00055242, "epoch": 0.3544265744776792, "flos": 21719527603200.0, "grad_norm": 1.7998653616668008, "language_loss": 0.74833035, "learning_rate": 2.9938101057759615e-06, "loss": 0.76717114, "num_input_tokens_seen": 126775455, "step": 5895, "time_per_iteration": 2.8295304775238037 }, { "auxiliary_loss_clip": 0.011052, "auxiliary_loss_mlp": 0.01040742, "balance_loss_clip": 1.04288006, "balance_loss_mlp": 1.02485108, "epoch": 0.3544866977303472, "flos": 21213223447680.0, "grad_norm": 2.053997857318945, "language_loss": 0.83762395, "learning_rate": 2.993472110174491e-06, "loss": 0.85908329, "num_input_tokens_seen": 126792320, "step": 5896, "time_per_iteration": 2.723158836364746 }, { "auxiliary_loss_clip": 0.01111237, "auxiliary_loss_mlp": 0.00773671, "balance_loss_clip": 1.04756641, "balance_loss_mlp": 1.0005331, "epoch": 0.35454682098301515, "flos": 29311402371840.0, "grad_norm": 1.7709518935889355, "language_loss": 0.70033729, "learning_rate": 2.9931340769004576e-06, "loss": 0.71918637, "num_input_tokens_seen": 126813680, "step": 5897, "time_per_iteration": 2.744617223739624 }, { "auxiliary_loss_clip": 0.01111293, "auxiliary_loss_mlp": 0.01046033, "balance_loss_clip": 1.04829669, "balance_loss_mlp": 1.02830625, "epoch": 0.3546069442356832, "flos": 24316587722880.0, "grad_norm": 3.0934933528513344, "language_loss": 0.81546402, "learning_rate": 2.9927960059666816e-06, "loss": 0.83703721, "num_input_tokens_seen": 126834395, "step": 5898, "time_per_iteration": 2.77911376953125 }, { "auxiliary_loss_clip": 0.0113395, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.04943967, "balance_loss_mlp": 1.02232838, "epoch": 0.35466706748835114, "flos": 22857285876480.0, "grad_norm": 5.100417261000322, "language_loss": 0.73975331, "learning_rate": 2.9924578973859804e-06, "loss": 0.7614674, "num_input_tokens_seen": 126855145, "step": 5899, "time_per_iteration": 2.6566851139068604 }, { "auxiliary_loss_clip": 0.0113747, "auxiliary_loss_mlp": 0.00772565, "balance_loss_clip": 1.04971743, "balance_loss_mlp": 1.00056052, "epoch": 0.3547271907410191, "flos": 28330107742080.0, "grad_norm": 1.7615083390778834, "language_loss": 0.79458243, "learning_rate": 2.9921197511711763e-06, "loss": 0.81368273, "num_input_tokens_seen": 126873790, "step": 5900, "time_per_iteration": 2.6658642292022705 }, { "auxiliary_loss_clip": 0.0111331, "auxiliary_loss_mlp": 0.01044824, "balance_loss_clip": 1.04659319, "balance_loss_mlp": 1.0288384, "epoch": 0.35478731399368707, "flos": 23514092017920.0, "grad_norm": 2.160550694830747, "language_loss": 0.81303531, "learning_rate": 2.991781567335093e-06, "loss": 0.83461666, "num_input_tokens_seen": 126892865, "step": 5901, "time_per_iteration": 2.711568593978882 }, { "auxiliary_loss_clip": 0.01125037, "auxiliary_loss_mlp": 0.00772744, "balance_loss_clip": 1.05092883, "balance_loss_mlp": 1.00049663, "epoch": 0.35484743724635504, "flos": 18624315715200.0, "grad_norm": 2.0558354102165373, "language_loss": 0.75869077, "learning_rate": 2.9914433458905525e-06, "loss": 0.7776686, "num_input_tokens_seen": 126911935, "step": 5902, "time_per_iteration": 2.6833012104034424 }, { "auxiliary_loss_clip": 0.01123978, "auxiliary_loss_mlp": 0.01036322, "balance_loss_clip": 1.04852581, "balance_loss_mlp": 1.02142096, "epoch": 0.354907560499023, "flos": 17384499924480.0, "grad_norm": 2.534328384273088, "language_loss": 0.70550704, "learning_rate": 2.991105086850381e-06, "loss": 0.72711003, "num_input_tokens_seen": 126930040, "step": 5903, "time_per_iteration": 2.689303159713745 }, { "auxiliary_loss_clip": 0.01128401, "auxiliary_loss_mlp": 0.01036477, "balance_loss_clip": 1.05025887, "balance_loss_mlp": 1.02051437, "epoch": 0.35496768375169097, "flos": 19208546426880.0, "grad_norm": 3.3775979872187203, "language_loss": 0.7448622, "learning_rate": 2.9907667902274053e-06, "loss": 0.76651096, "num_input_tokens_seen": 126948390, "step": 5904, "time_per_iteration": 2.6360747814178467 }, { "auxiliary_loss_clip": 0.01113034, "auxiliary_loss_mlp": 0.00772738, "balance_loss_clip": 1.04721618, "balance_loss_mlp": 1.000543, "epoch": 0.35502780700435893, "flos": 18332792933760.0, "grad_norm": 3.051840518778985, "language_loss": 0.78653091, "learning_rate": 2.9904284560344536e-06, "loss": 0.80538863, "num_input_tokens_seen": 126964905, "step": 5905, "time_per_iteration": 2.8539419174194336 }, { "auxiliary_loss_clip": 0.01101916, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.04842138, "balance_loss_mlp": 1.02486014, "epoch": 0.3550879302570269, "flos": 15448555578240.0, "grad_norm": 18.846860460510154, "language_loss": 0.72740704, "learning_rate": 2.990090084284356e-06, "loss": 0.74881542, "num_input_tokens_seen": 126982000, "step": 5906, "time_per_iteration": 2.7013392448425293 }, { "auxiliary_loss_clip": 0.01109726, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.04908431, "balance_loss_mlp": 1.02265012, "epoch": 0.35514805350969486, "flos": 21979197999360.0, "grad_norm": 1.821131131528883, "language_loss": 0.74746358, "learning_rate": 2.9897516749899426e-06, "loss": 0.76895893, "num_input_tokens_seen": 126998390, "step": 5907, "time_per_iteration": 2.7603847980499268 }, { "auxiliary_loss_clip": 0.01062812, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.03682017, "balance_loss_mlp": 1.02463293, "epoch": 0.3552081767623628, "flos": 29861949104640.0, "grad_norm": 3.0473905008627775, "language_loss": 0.7563526, "learning_rate": 2.989413228164047e-06, "loss": 0.77740943, "num_input_tokens_seen": 127020220, "step": 5908, "time_per_iteration": 2.8653454780578613 }, { "auxiliary_loss_clip": 0.01114185, "auxiliary_loss_mlp": 0.01042445, "balance_loss_clip": 1.05034626, "balance_loss_mlp": 1.02736473, "epoch": 0.3552683000150308, "flos": 26432264747520.0, "grad_norm": 2.926995842336842, "language_loss": 0.68243527, "learning_rate": 2.989074743819502e-06, "loss": 0.70400161, "num_input_tokens_seen": 127038585, "step": 5909, "time_per_iteration": 2.6967928409576416 }, { "auxiliary_loss_clip": 0.01120713, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.0503571, "balance_loss_mlp": 1.02271986, "epoch": 0.35532842326769876, "flos": 19785989468160.0, "grad_norm": 2.2169711344959864, "language_loss": 0.78605235, "learning_rate": 2.988736221969144e-06, "loss": 0.807634, "num_input_tokens_seen": 127056215, "step": 5910, "time_per_iteration": 2.65592885017395 }, { "auxiliary_loss_clip": 0.01111825, "auxiliary_loss_mlp": 0.01044022, "balance_loss_clip": 1.04383612, "balance_loss_mlp": 1.02745175, "epoch": 0.3553885465203668, "flos": 17239277237760.0, "grad_norm": 4.097628076705993, "language_loss": 0.71322721, "learning_rate": 2.98839766262581e-06, "loss": 0.73478568, "num_input_tokens_seen": 127075825, "step": 5911, "time_per_iteration": 2.6958134174346924 }, { "auxiliary_loss_clip": 0.01122761, "auxiliary_loss_mlp": 0.01041881, "balance_loss_clip": 1.04820287, "balance_loss_mlp": 1.02711153, "epoch": 0.35544866977303474, "flos": 14934350430720.0, "grad_norm": 2.592685980990988, "language_loss": 0.86703777, "learning_rate": 2.9880590658023366e-06, "loss": 0.88868415, "num_input_tokens_seen": 127091205, "step": 5912, "time_per_iteration": 2.615788221359253 }, { "auxiliary_loss_clip": 0.01113661, "auxiliary_loss_mlp": 0.01038659, "balance_loss_clip": 1.04849911, "balance_loss_mlp": 1.02413917, "epoch": 0.3555087930257027, "flos": 19756040503680.0, "grad_norm": 1.9602305341473392, "language_loss": 0.76948488, "learning_rate": 2.9877204315115646e-06, "loss": 0.79100811, "num_input_tokens_seen": 127109210, "step": 5913, "time_per_iteration": 2.7827799320220947 }, { "auxiliary_loss_clip": 0.01098195, "auxiliary_loss_mlp": 0.01036489, "balance_loss_clip": 1.04796672, "balance_loss_mlp": 1.02183783, "epoch": 0.3555689162783707, "flos": 21068252156160.0, "grad_norm": 1.6272917241322848, "language_loss": 0.82545209, "learning_rate": 2.9873817597663353e-06, "loss": 0.8467989, "num_input_tokens_seen": 127128400, "step": 5914, "time_per_iteration": 2.7242603302001953 }, { "auxiliary_loss_clip": 0.01137835, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.05178475, "balance_loss_mlp": 1.02247739, "epoch": 0.35562903953103864, "flos": 33069633454080.0, "grad_norm": 2.9034799926536, "language_loss": 0.70664769, "learning_rate": 2.98704305057949e-06, "loss": 0.72840279, "num_input_tokens_seen": 127149965, "step": 5915, "time_per_iteration": 2.6785290241241455 }, { "auxiliary_loss_clip": 0.01124956, "auxiliary_loss_mlp": 0.01042738, "balance_loss_clip": 1.04884696, "balance_loss_mlp": 1.02823067, "epoch": 0.3556891627837066, "flos": 20557853850240.0, "grad_norm": 1.7433450554379117, "language_loss": 0.76387751, "learning_rate": 2.9867043039638737e-06, "loss": 0.78555447, "num_input_tokens_seen": 127169865, "step": 5916, "time_per_iteration": 2.646141529083252 }, { "auxiliary_loss_clip": 0.01103991, "auxiliary_loss_mlp": 0.01039438, "balance_loss_clip": 1.04549897, "balance_loss_mlp": 1.02451277, "epoch": 0.35574928603637457, "flos": 20703327932160.0, "grad_norm": 1.7213233773991115, "language_loss": 0.88551259, "learning_rate": 2.986365519932332e-06, "loss": 0.9069469, "num_input_tokens_seen": 127188075, "step": 5917, "time_per_iteration": 2.735424757003784 }, { "auxiliary_loss_clip": 0.01057648, "auxiliary_loss_mlp": 0.01050179, "balance_loss_clip": 1.03888357, "balance_loss_mlp": 1.03190458, "epoch": 0.35580940928904253, "flos": 15194595444480.0, "grad_norm": 2.1986231946039916, "language_loss": 0.74800515, "learning_rate": 2.98602669849771e-06, "loss": 0.76908338, "num_input_tokens_seen": 127206065, "step": 5918, "time_per_iteration": 2.759612798690796 }, { "auxiliary_loss_clip": 0.01046226, "auxiliary_loss_mlp": 0.01004318, "balance_loss_clip": 1.03416467, "balance_loss_mlp": 1.00212467, "epoch": 0.3558695325417105, "flos": 58639145431680.0, "grad_norm": 0.9523078238877629, "language_loss": 0.63871694, "learning_rate": 2.985687839672857e-06, "loss": 0.65922242, "num_input_tokens_seen": 127257885, "step": 5919, "time_per_iteration": 2.974400281906128 }, { "auxiliary_loss_clip": 0.01125949, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.05126309, "balance_loss_mlp": 1.02168, "epoch": 0.35592965579437846, "flos": 22018233104640.0, "grad_norm": 2.3466450300124952, "language_loss": 0.73515332, "learning_rate": 2.9853489434706223e-06, "loss": 0.75679016, "num_input_tokens_seen": 127275550, "step": 5920, "time_per_iteration": 2.6402368545532227 }, { "auxiliary_loss_clip": 0.01092607, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.0452888, "balance_loss_mlp": 1.02082539, "epoch": 0.35598977904704643, "flos": 23367684182400.0, "grad_norm": 2.020155019062759, "language_loss": 0.76745147, "learning_rate": 2.985010009903857e-06, "loss": 0.78873557, "num_input_tokens_seen": 127295110, "step": 5921, "time_per_iteration": 2.7224855422973633 }, { "auxiliary_loss_clip": 0.01112186, "auxiliary_loss_mlp": 0.01038012, "balance_loss_clip": 1.04887438, "balance_loss_mlp": 1.0231111, "epoch": 0.3560499022997144, "flos": 17785334770560.0, "grad_norm": 2.0978128065546717, "language_loss": 0.68095905, "learning_rate": 2.9846710389854133e-06, "loss": 0.702461, "num_input_tokens_seen": 127312865, "step": 5922, "time_per_iteration": 2.6849706172943115 }, { "auxiliary_loss_clip": 0.01120912, "auxiliary_loss_mlp": 0.01035687, "balance_loss_clip": 1.04752564, "balance_loss_mlp": 1.02032125, "epoch": 0.35611002555238236, "flos": 20740459616640.0, "grad_norm": 3.470851899346702, "language_loss": 0.79121947, "learning_rate": 2.9843320307281454e-06, "loss": 0.81278539, "num_input_tokens_seen": 127331710, "step": 5923, "time_per_iteration": 2.659977436065674 }, { "auxiliary_loss_clip": 0.01118161, "auxiliary_loss_mlp": 0.01042419, "balance_loss_clip": 1.0530231, "balance_loss_mlp": 1.02770221, "epoch": 0.3561701488050504, "flos": 19462219251840.0, "grad_norm": 2.2084385051152946, "language_loss": 0.85266459, "learning_rate": 2.983992985144908e-06, "loss": 0.87427044, "num_input_tokens_seen": 127350950, "step": 5924, "time_per_iteration": 2.680994987487793 }, { "auxiliary_loss_clip": 0.01109604, "auxiliary_loss_mlp": 0.01046078, "balance_loss_clip": 1.04669881, "balance_loss_mlp": 1.02974653, "epoch": 0.35623027205771834, "flos": 30774942023040.0, "grad_norm": 3.12021389910605, "language_loss": 0.77619767, "learning_rate": 2.9836539022485578e-06, "loss": 0.79775453, "num_input_tokens_seen": 127369385, "step": 5925, "time_per_iteration": 2.854043960571289 }, { "auxiliary_loss_clip": 0.01078608, "auxiliary_loss_mlp": 0.01047631, "balance_loss_clip": 1.04546142, "balance_loss_mlp": 1.03274155, "epoch": 0.3562903953103863, "flos": 16981079299200.0, "grad_norm": 2.0406100546628108, "language_loss": 0.75402963, "learning_rate": 2.9833147820519535e-06, "loss": 0.77529198, "num_input_tokens_seen": 127386965, "step": 5926, "time_per_iteration": 4.347430467605591 }, { "auxiliary_loss_clip": 0.01110536, "auxiliary_loss_mlp": 0.00773423, "balance_loss_clip": 1.04907203, "balance_loss_mlp": 1.00041842, "epoch": 0.3563505185630543, "flos": 23839837482240.0, "grad_norm": 2.7011184644215254, "language_loss": 0.69563019, "learning_rate": 2.9829756245679544e-06, "loss": 0.71446979, "num_input_tokens_seen": 127406075, "step": 5927, "time_per_iteration": 2.8237216472625732 }, { "auxiliary_loss_clip": 0.01136293, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05083871, "balance_loss_mlp": 1.0256958, "epoch": 0.35641064181572224, "flos": 22273450214400.0, "grad_norm": 2.594343371199836, "language_loss": 0.79681075, "learning_rate": 2.9826364298094212e-06, "loss": 0.81857955, "num_input_tokens_seen": 127425350, "step": 5928, "time_per_iteration": 4.171353340148926 }, { "auxiliary_loss_clip": 0.01139765, "auxiliary_loss_mlp": 0.01040338, "balance_loss_clip": 1.05304861, "balance_loss_mlp": 1.02473354, "epoch": 0.3564707650683902, "flos": 23001251587200.0, "grad_norm": 1.4355701611092584, "language_loss": 0.81758744, "learning_rate": 2.982297197789215e-06, "loss": 0.83938849, "num_input_tokens_seen": 127446335, "step": 5929, "time_per_iteration": 4.3162572383880615 }, { "auxiliary_loss_clip": 0.01120871, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.04776335, "balance_loss_mlp": 1.02304602, "epoch": 0.35653088832105817, "flos": 14684268965760.0, "grad_norm": 1.9323399136404307, "language_loss": 0.70277226, "learning_rate": 2.981957928520201e-06, "loss": 0.72435665, "num_input_tokens_seen": 127462795, "step": 5930, "time_per_iteration": 2.6527109146118164 }, { "auxiliary_loss_clip": 0.01131875, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.05533779, "balance_loss_mlp": 1.02960742, "epoch": 0.35659101157372614, "flos": 23477068074240.0, "grad_norm": 2.2535070260025147, "language_loss": 0.6758765, "learning_rate": 2.981618622015244e-06, "loss": 0.69765162, "num_input_tokens_seen": 127482675, "step": 5931, "time_per_iteration": 4.3453147411346436 }, { "auxiliary_loss_clip": 0.0112554, "auxiliary_loss_mlp": 0.01040124, "balance_loss_clip": 1.04992425, "balance_loss_mlp": 1.02531803, "epoch": 0.3566511348263941, "flos": 26578672583040.0, "grad_norm": 1.9436277425022137, "language_loss": 0.67792088, "learning_rate": 2.981279278287211e-06, "loss": 0.69957745, "num_input_tokens_seen": 127502275, "step": 5932, "time_per_iteration": 2.700096368789673 }, { "auxiliary_loss_clip": 0.01082532, "auxiliary_loss_mlp": 0.01033095, "balance_loss_clip": 1.04578543, "balance_loss_mlp": 1.01849222, "epoch": 0.35671125807906207, "flos": 13115008609920.0, "grad_norm": 5.160615382495107, "language_loss": 0.78454852, "learning_rate": 2.980939897348969e-06, "loss": 0.80570471, "num_input_tokens_seen": 127520195, "step": 5933, "time_per_iteration": 2.6900391578674316 }, { "auxiliary_loss_clip": 0.01121777, "auxiliary_loss_mlp": 0.01052933, "balance_loss_clip": 1.0480361, "balance_loss_mlp": 1.03600574, "epoch": 0.35677138133173003, "flos": 33000577557120.0, "grad_norm": 1.6861574442761758, "language_loss": 0.69256425, "learning_rate": 2.980600479213388e-06, "loss": 0.7143113, "num_input_tokens_seen": 127544495, "step": 5934, "time_per_iteration": 2.7415738105773926 }, { "auxiliary_loss_clip": 0.01117054, "auxiliary_loss_mlp": 0.0077763, "balance_loss_clip": 1.05076528, "balance_loss_mlp": 1.00057197, "epoch": 0.356831504584398, "flos": 20777842696320.0, "grad_norm": 1.9577931058258786, "language_loss": 0.70848507, "learning_rate": 2.9802610238933384e-06, "loss": 0.72743189, "num_input_tokens_seen": 127563810, "step": 5935, "time_per_iteration": 2.689974069595337 }, { "auxiliary_loss_clip": 0.01105553, "auxiliary_loss_mlp": 0.01040367, "balance_loss_clip": 1.04790044, "balance_loss_mlp": 1.02414298, "epoch": 0.35689162783706596, "flos": 12165566365440.0, "grad_norm": 2.8406009493899567, "language_loss": 0.7755211, "learning_rate": 2.979921531401692e-06, "loss": 0.79698032, "num_input_tokens_seen": 127579065, "step": 5936, "time_per_iteration": 2.741913318634033 }, { "auxiliary_loss_clip": 0.0112859, "auxiliary_loss_mlp": 0.00773213, "balance_loss_clip": 1.05281317, "balance_loss_mlp": 1.00073922, "epoch": 0.356951751089734, "flos": 23841489507840.0, "grad_norm": 1.4219917851433757, "language_loss": 0.64282179, "learning_rate": 2.9795820017513242e-06, "loss": 0.66183978, "num_input_tokens_seen": 127599105, "step": 5937, "time_per_iteration": 2.698432207107544 }, { "auxiliary_loss_clip": 0.011437, "auxiliary_loss_mlp": 0.00773044, "balance_loss_clip": 1.05475211, "balance_loss_mlp": 1.00064254, "epoch": 0.35701187434240195, "flos": 11722176881280.0, "grad_norm": 3.0634993604384744, "language_loss": 0.78483748, "learning_rate": 2.9792424349551073e-06, "loss": 0.80400497, "num_input_tokens_seen": 127614940, "step": 5938, "time_per_iteration": 2.617074489593506 }, { "auxiliary_loss_clip": 0.01104152, "auxiliary_loss_mlp": 0.01042471, "balance_loss_clip": 1.05522823, "balance_loss_mlp": 1.0276773, "epoch": 0.3570719975950699, "flos": 24898879100160.0, "grad_norm": 1.4921508018011957, "language_loss": 0.8058449, "learning_rate": 2.9789028310259202e-06, "loss": 0.82731104, "num_input_tokens_seen": 127634960, "step": 5939, "time_per_iteration": 2.805285930633545 }, { "auxiliary_loss_clip": 0.01119857, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.05386829, "balance_loss_mlp": 1.02343178, "epoch": 0.3571321208477379, "flos": 25994836920960.0, "grad_norm": 2.412769849050775, "language_loss": 0.79263425, "learning_rate": 2.9785631899766395e-06, "loss": 0.81422341, "num_input_tokens_seen": 127654545, "step": 5940, "time_per_iteration": 2.729759693145752 }, { "auxiliary_loss_clip": 0.01122797, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.05434561, "balance_loss_mlp": 1.01836729, "epoch": 0.35719224410040584, "flos": 14501663199360.0, "grad_norm": 2.99992676537861, "language_loss": 0.72561693, "learning_rate": 2.9782235118201443e-06, "loss": 0.74720228, "num_input_tokens_seen": 127672320, "step": 5941, "time_per_iteration": 2.7407357692718506 }, { "auxiliary_loss_clip": 0.01131761, "auxiliary_loss_mlp": 0.01043456, "balance_loss_clip": 1.0537883, "balance_loss_mlp": 1.02636182, "epoch": 0.3572523673530738, "flos": 31175453646720.0, "grad_norm": 4.524453853263744, "language_loss": 0.64234614, "learning_rate": 2.9778837965693154e-06, "loss": 0.66409832, "num_input_tokens_seen": 127693315, "step": 5942, "time_per_iteration": 2.693835735321045 }, { "auxiliary_loss_clip": 0.01125006, "auxiliary_loss_mlp": 0.0104058, "balance_loss_clip": 1.05074191, "balance_loss_mlp": 1.02442718, "epoch": 0.3573124906057418, "flos": 15851976203520.0, "grad_norm": 1.88999720959261, "language_loss": 0.7433207, "learning_rate": 2.9775440442370354e-06, "loss": 0.76497656, "num_input_tokens_seen": 127711570, "step": 5943, "time_per_iteration": 2.6655383110046387 }, { "auxiliary_loss_clip": 0.0107084, "auxiliary_loss_mlp": 0.01002098, "balance_loss_clip": 1.04128122, "balance_loss_mlp": 1.000512, "epoch": 0.35737261385840974, "flos": 60822729118080.0, "grad_norm": 0.7930578325967097, "language_loss": 0.60739905, "learning_rate": 2.9772042548361867e-06, "loss": 0.62812841, "num_input_tokens_seen": 127772475, "step": 5944, "time_per_iteration": 3.257052421569824 }, { "auxiliary_loss_clip": 0.01113544, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.05017304, "balance_loss_mlp": 1.02329779, "epoch": 0.3574327371110777, "flos": 18843765857280.0, "grad_norm": 2.0176419730945554, "language_loss": 0.72310007, "learning_rate": 2.976864428379655e-06, "loss": 0.74462366, "num_input_tokens_seen": 127790940, "step": 5945, "time_per_iteration": 2.6320457458496094 }, { "auxiliary_loss_clip": 0.01113199, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.04710388, "balance_loss_mlp": 1.00053716, "epoch": 0.35749286036374567, "flos": 23549679417600.0, "grad_norm": 2.1873404124300655, "language_loss": 0.81147355, "learning_rate": 2.976524564880326e-06, "loss": 0.83034003, "num_input_tokens_seen": 127808275, "step": 5946, "time_per_iteration": 2.7045581340789795 }, { "auxiliary_loss_clip": 0.01142015, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.05382085, "balance_loss_mlp": 1.02568626, "epoch": 0.35755298361641363, "flos": 21105491581440.0, "grad_norm": 1.5286248167474699, "language_loss": 0.68842459, "learning_rate": 2.9761846643510882e-06, "loss": 0.71026313, "num_input_tokens_seen": 127828840, "step": 5947, "time_per_iteration": 2.6360325813293457 }, { "auxiliary_loss_clip": 0.01107164, "auxiliary_loss_mlp": 0.01039633, "balance_loss_clip": 1.04598188, "balance_loss_mlp": 1.02426696, "epoch": 0.3576131068690816, "flos": 19245031666560.0, "grad_norm": 4.061535671212192, "language_loss": 0.76024956, "learning_rate": 2.9758447268048297e-06, "loss": 0.78171754, "num_input_tokens_seen": 127846240, "step": 5948, "time_per_iteration": 2.6968884468078613 }, { "auxiliary_loss_clip": 0.01081903, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.04692364, "balance_loss_mlp": 1.0291121, "epoch": 0.35767323012174956, "flos": 28654703971200.0, "grad_norm": 1.8353415788349725, "language_loss": 0.70553362, "learning_rate": 2.9755047522544415e-06, "loss": 0.72679162, "num_input_tokens_seen": 127866880, "step": 5949, "time_per_iteration": 2.8849079608917236 }, { "auxiliary_loss_clip": 0.01113321, "auxiliary_loss_mlp": 0.01041031, "balance_loss_clip": 1.04892492, "balance_loss_mlp": 1.02688098, "epoch": 0.35773335337441753, "flos": 17085363459840.0, "grad_norm": 2.820547719587591, "language_loss": 0.77489066, "learning_rate": 2.9751647407128154e-06, "loss": 0.79643422, "num_input_tokens_seen": 127883560, "step": 5950, "time_per_iteration": 2.6595206260681152 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.01041981, "balance_loss_clip": 1.04834211, "balance_loss_mlp": 1.02592397, "epoch": 0.35779347662708555, "flos": 15888605097600.0, "grad_norm": 1.7233867228761917, "language_loss": 0.72746027, "learning_rate": 2.9748246921928445e-06, "loss": 0.74915326, "num_input_tokens_seen": 127902330, "step": 5951, "time_per_iteration": 2.6544554233551025 }, { "auxiliary_loss_clip": 0.01129333, "auxiliary_loss_mlp": 0.01041471, "balance_loss_clip": 1.05047357, "balance_loss_mlp": 1.0256753, "epoch": 0.3578535998797535, "flos": 28658834035200.0, "grad_norm": 2.2344429074284693, "language_loss": 0.69326741, "learning_rate": 2.9744846067074236e-06, "loss": 0.71497542, "num_input_tokens_seen": 127922325, "step": 5952, "time_per_iteration": 2.7666146755218506 }, { "auxiliary_loss_clip": 0.01080716, "auxiliary_loss_mlp": 0.01049645, "balance_loss_clip": 1.04122877, "balance_loss_mlp": 1.03411233, "epoch": 0.3579137231324215, "flos": 37852432076160.0, "grad_norm": 4.791743787800428, "language_loss": 0.69651616, "learning_rate": 2.974144484269449e-06, "loss": 0.71781975, "num_input_tokens_seen": 127942635, "step": 5953, "time_per_iteration": 2.900196075439453 }, { "auxiliary_loss_clip": 0.01113192, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.0476222, "balance_loss_mlp": 1.0198822, "epoch": 0.35797384638508944, "flos": 22346851656960.0, "grad_norm": 2.3015234956442394, "language_loss": 0.6670965, "learning_rate": 2.9738043248918175e-06, "loss": 0.68857497, "num_input_tokens_seen": 127962520, "step": 5954, "time_per_iteration": 2.7609100341796875 }, { "auxiliary_loss_clip": 0.011102, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04845512, "balance_loss_mlp": 1.02633798, "epoch": 0.3580339696377574, "flos": 13589711775360.0, "grad_norm": 1.9332002852280215, "language_loss": 0.74798024, "learning_rate": 2.9734641285874282e-06, "loss": 0.76948655, "num_input_tokens_seen": 127981180, "step": 5955, "time_per_iteration": 2.727787733078003 }, { "auxiliary_loss_clip": 0.01114534, "auxiliary_loss_mlp": 0.01039755, "balance_loss_clip": 1.04827058, "balance_loss_mlp": 1.02546179, "epoch": 0.3580940928904254, "flos": 23768231719680.0, "grad_norm": 1.745052650810224, "language_loss": 0.75871193, "learning_rate": 2.973123895369182e-06, "loss": 0.78025484, "num_input_tokens_seen": 127999725, "step": 5956, "time_per_iteration": 2.685006856918335 }, { "auxiliary_loss_clip": 0.01133387, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.05088747, "balance_loss_mlp": 1.0211376, "epoch": 0.35815421614309334, "flos": 19463871277440.0, "grad_norm": 4.15447674959345, "language_loss": 0.73543882, "learning_rate": 2.9727836252499805e-06, "loss": 0.75712276, "num_input_tokens_seen": 128018885, "step": 5957, "time_per_iteration": 2.6640098094940186 }, { "auxiliary_loss_clip": 0.01113163, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.04958355, "balance_loss_mlp": 1.02395511, "epoch": 0.3582143393957613, "flos": 23368186972800.0, "grad_norm": 3.3283201757671037, "language_loss": 0.70960939, "learning_rate": 2.972443318242726e-06, "loss": 0.73112065, "num_input_tokens_seen": 128037875, "step": 5958, "time_per_iteration": 2.6962838172912598 }, { "auxiliary_loss_clip": 0.01093969, "auxiliary_loss_mlp": 0.01038485, "balance_loss_clip": 1.04454029, "balance_loss_mlp": 1.02435875, "epoch": 0.35827446264842927, "flos": 26323275905280.0, "grad_norm": 2.5438119471533494, "language_loss": 0.88630176, "learning_rate": 2.972102974360324e-06, "loss": 0.90762633, "num_input_tokens_seen": 128056045, "step": 5959, "time_per_iteration": 2.713508129119873 }, { "auxiliary_loss_clip": 0.0113447, "auxiliary_loss_mlp": 0.010399, "balance_loss_clip": 1.05009389, "balance_loss_mlp": 1.02511787, "epoch": 0.35833458590109724, "flos": 30446610779520.0, "grad_norm": 2.2010810744211486, "language_loss": 0.58033586, "learning_rate": 2.971762593615679e-06, "loss": 0.60207957, "num_input_tokens_seen": 128077815, "step": 5960, "time_per_iteration": 2.685009479522705 }, { "auxiliary_loss_clip": 0.0113445, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.04900908, "balance_loss_mlp": 1.0255897, "epoch": 0.3583947091537652, "flos": 14829886702080.0, "grad_norm": 2.9088839798225035, "language_loss": 0.75860739, "learning_rate": 2.9714221760216993e-06, "loss": 0.7803694, "num_input_tokens_seen": 128095460, "step": 5961, "time_per_iteration": 2.591665506362915 }, { "auxiliary_loss_clip": 0.01103629, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.04985154, "balance_loss_mlp": 1.022223, "epoch": 0.35845483240643317, "flos": 34240644743040.0, "grad_norm": 1.7962139278871543, "language_loss": 0.70392656, "learning_rate": 2.971081721591294e-06, "loss": 0.72533739, "num_input_tokens_seen": 128118605, "step": 5962, "time_per_iteration": 2.78696346282959 }, { "auxiliary_loss_clip": 0.01116632, "auxiliary_loss_mlp": 0.01038106, "balance_loss_clip": 1.0513072, "balance_loss_mlp": 1.02532077, "epoch": 0.35851495565910113, "flos": 20960089326720.0, "grad_norm": 3.937600501619356, "language_loss": 0.75052911, "learning_rate": 2.9707412303373716e-06, "loss": 0.77207649, "num_input_tokens_seen": 128139205, "step": 5963, "time_per_iteration": 2.779210090637207 }, { "auxiliary_loss_clip": 0.01136067, "auxiliary_loss_mlp": 0.01044967, "balance_loss_clip": 1.05189323, "balance_loss_mlp": 1.03017306, "epoch": 0.35857507891176915, "flos": 22309863626880.0, "grad_norm": 3.7087256254692305, "language_loss": 0.78717148, "learning_rate": 2.9704007022728447e-06, "loss": 0.80898178, "num_input_tokens_seen": 128158765, "step": 5964, "time_per_iteration": 2.598621368408203 }, { "auxiliary_loss_clip": 0.01112011, "auxiliary_loss_mlp": 0.01041333, "balance_loss_clip": 1.05019569, "balance_loss_mlp": 1.02534723, "epoch": 0.3586352021644371, "flos": 23367863750400.0, "grad_norm": 2.0226045347569857, "language_loss": 0.66572571, "learning_rate": 2.970060137410626e-06, "loss": 0.6872592, "num_input_tokens_seen": 128177850, "step": 5965, "time_per_iteration": 2.684847116470337 }, { "auxiliary_loss_clip": 0.01132652, "auxiliary_loss_mlp": 0.0077213, "balance_loss_clip": 1.04819942, "balance_loss_mlp": 1.00052619, "epoch": 0.3586953254171051, "flos": 27849227437440.0, "grad_norm": 2.180178648475794, "language_loss": 0.79150963, "learning_rate": 2.9697195357636294e-06, "loss": 0.81055743, "num_input_tokens_seen": 128196925, "step": 5966, "time_per_iteration": 4.321925163269043 }, { "auxiliary_loss_clip": 0.01076497, "auxiliary_loss_mlp": 0.01042048, "balance_loss_clip": 1.04272628, "balance_loss_mlp": 1.02573991, "epoch": 0.35875544866977305, "flos": 19500500171520.0, "grad_norm": 2.3639555115609663, "language_loss": 0.91201752, "learning_rate": 2.9693788973447715e-06, "loss": 0.93320298, "num_input_tokens_seen": 128213955, "step": 5967, "time_per_iteration": 2.7455573081970215 }, { "auxiliary_loss_clip": 0.01101026, "auxiliary_loss_mlp": 0.01053293, "balance_loss_clip": 1.04794097, "balance_loss_mlp": 1.03494644, "epoch": 0.358815571922441, "flos": 21471134077440.0, "grad_norm": 5.4514250686274695, "language_loss": 0.80356693, "learning_rate": 2.9690382221669682e-06, "loss": 0.82511014, "num_input_tokens_seen": 128232980, "step": 5968, "time_per_iteration": 4.176758766174316 }, { "auxiliary_loss_clip": 0.01109306, "auxiliary_loss_mlp": 0.01052187, "balance_loss_clip": 1.04507756, "balance_loss_mlp": 1.03602266, "epoch": 0.358875695175109, "flos": 21835411856640.0, "grad_norm": 2.18425096992674, "language_loss": 0.8341769, "learning_rate": 2.9686975102431384e-06, "loss": 0.85579193, "num_input_tokens_seen": 128252795, "step": 5969, "time_per_iteration": 4.278231382369995 }, { "auxiliary_loss_clip": 0.01089525, "auxiliary_loss_mlp": 0.01034474, "balance_loss_clip": 1.04389262, "balance_loss_mlp": 1.0201571, "epoch": 0.35893581842777694, "flos": 32011633330560.0, "grad_norm": 2.040075228447558, "language_loss": 0.72608048, "learning_rate": 2.968356761586202e-06, "loss": 0.74732047, "num_input_tokens_seen": 128273115, "step": 5970, "time_per_iteration": 2.7784154415130615 }, { "auxiliary_loss_clip": 0.01110616, "auxiliary_loss_mlp": 0.01033542, "balance_loss_clip": 1.04673791, "balance_loss_mlp": 1.01868832, "epoch": 0.3589959416804449, "flos": 20485817124480.0, "grad_norm": 1.7975318028216438, "language_loss": 0.79562962, "learning_rate": 2.9680159762090805e-06, "loss": 0.8170712, "num_input_tokens_seen": 128292220, "step": 5971, "time_per_iteration": 4.519066333770752 }, { "auxiliary_loss_clip": 0.01098267, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.04956031, "balance_loss_mlp": 1.02766144, "epoch": 0.3590560649331129, "flos": 16180666583040.0, "grad_norm": 1.754965992567408, "language_loss": 0.78217793, "learning_rate": 2.967675154124696e-06, "loss": 0.80359125, "num_input_tokens_seen": 128310305, "step": 5972, "time_per_iteration": 2.7724227905273438 }, { "auxiliary_loss_clip": 0.01092509, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.04198921, "balance_loss_mlp": 1.02043509, "epoch": 0.35911618818578084, "flos": 20375391738240.0, "grad_norm": 2.4812117519320287, "language_loss": 0.8120966, "learning_rate": 2.9673342953459722e-06, "loss": 0.83337677, "num_input_tokens_seen": 128328305, "step": 5973, "time_per_iteration": 2.8266379833221436 }, { "auxiliary_loss_clip": 0.01042329, "auxiliary_loss_mlp": 0.01005341, "balance_loss_clip": 1.03088689, "balance_loss_mlp": 1.0036602, "epoch": 0.3591763114384488, "flos": 41236691685120.0, "grad_norm": 0.9056618080123127, "language_loss": 0.56743383, "learning_rate": 2.9669933998858355e-06, "loss": 0.58791053, "num_input_tokens_seen": 128378380, "step": 5974, "time_per_iteration": 3.0758044719696045 }, { "auxiliary_loss_clip": 0.01126274, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.04946434, "balance_loss_mlp": 1.02339661, "epoch": 0.35923643469111677, "flos": 18695454600960.0, "grad_norm": 2.5569125412900022, "language_loss": 0.68787563, "learning_rate": 2.9666524677572114e-06, "loss": 0.70951241, "num_input_tokens_seen": 128394315, "step": 5975, "time_per_iteration": 2.657576084136963 }, { "auxiliary_loss_clip": 0.01134392, "auxiliary_loss_mlp": 0.01038612, "balance_loss_clip": 1.04914975, "balance_loss_mlp": 1.02426553, "epoch": 0.35929655794378473, "flos": 25009950931200.0, "grad_norm": 1.804443520579843, "language_loss": 0.79982442, "learning_rate": 2.96631149897303e-06, "loss": 0.82155442, "num_input_tokens_seen": 128414515, "step": 5976, "time_per_iteration": 2.6197311878204346 }, { "auxiliary_loss_clip": 0.01074524, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.04337287, "balance_loss_mlp": 1.02404785, "epoch": 0.35935668119645275, "flos": 14975576265600.0, "grad_norm": 1.9714674470262432, "language_loss": 0.78818405, "learning_rate": 2.9659704935462194e-06, "loss": 0.8093304, "num_input_tokens_seen": 128430615, "step": 5977, "time_per_iteration": 2.735844612121582 }, { "auxiliary_loss_clip": 0.01094647, "auxiliary_loss_mlp": 0.01041851, "balance_loss_clip": 1.04511654, "balance_loss_mlp": 1.02789736, "epoch": 0.3594168044491207, "flos": 21178138838400.0, "grad_norm": 2.560014574379112, "language_loss": 0.79859221, "learning_rate": 2.9656294514897102e-06, "loss": 0.8199572, "num_input_tokens_seen": 128449480, "step": 5978, "time_per_iteration": 2.704134941101074 }, { "auxiliary_loss_clip": 0.01135434, "auxiliary_loss_mlp": 0.00773692, "balance_loss_clip": 1.04890609, "balance_loss_mlp": 1.00073409, "epoch": 0.3594769277017887, "flos": 27672152365440.0, "grad_norm": 4.868201977342703, "language_loss": 0.68310702, "learning_rate": 2.965288372816436e-06, "loss": 0.70219827, "num_input_tokens_seen": 128471465, "step": 5979, "time_per_iteration": 2.667222499847412 }, { "auxiliary_loss_clip": 0.01105596, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.04548645, "balance_loss_mlp": 1.01876652, "epoch": 0.35953705095445665, "flos": 23002328995200.0, "grad_norm": 6.298210491387724, "language_loss": 0.67445302, "learning_rate": 2.9649472575393296e-06, "loss": 0.69584739, "num_input_tokens_seen": 128490645, "step": 5980, "time_per_iteration": 2.6262974739074707 }, { "auxiliary_loss_clip": 0.01113802, "auxiliary_loss_mlp": 0.01040029, "balance_loss_clip": 1.04725266, "balance_loss_mlp": 1.02324414, "epoch": 0.3595971742071246, "flos": 25513992529920.0, "grad_norm": 1.8251567017824133, "language_loss": 0.71328801, "learning_rate": 2.964606105671327e-06, "loss": 0.73482633, "num_input_tokens_seen": 128510225, "step": 5981, "time_per_iteration": 2.696676254272461 }, { "auxiliary_loss_clip": 0.01109039, "auxiliary_loss_mlp": 0.01041685, "balance_loss_clip": 1.04872131, "balance_loss_mlp": 1.02498353, "epoch": 0.3596572974597926, "flos": 29862559635840.0, "grad_norm": 2.0089481436352767, "language_loss": 0.71294796, "learning_rate": 2.9642649172253635e-06, "loss": 0.73445523, "num_input_tokens_seen": 128530195, "step": 5982, "time_per_iteration": 2.7264244556427 }, { "auxiliary_loss_clip": 0.01114107, "auxiliary_loss_mlp": 0.01046667, "balance_loss_clip": 1.04542398, "balance_loss_mlp": 1.03115773, "epoch": 0.35971742071246054, "flos": 23112538899840.0, "grad_norm": 1.8520970942870048, "language_loss": 0.75614822, "learning_rate": 2.9639236922143786e-06, "loss": 0.77775598, "num_input_tokens_seen": 128549990, "step": 5983, "time_per_iteration": 2.6827449798583984 }, { "auxiliary_loss_clip": 0.01140239, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.0510025, "balance_loss_mlp": 1.02626991, "epoch": 0.3597775439651285, "flos": 16725359399040.0, "grad_norm": 17.088734777986428, "language_loss": 0.76256114, "learning_rate": 2.96358243065131e-06, "loss": 0.78439057, "num_input_tokens_seen": 128567925, "step": 5984, "time_per_iteration": 2.695389747619629 }, { "auxiliary_loss_clip": 0.01117847, "auxiliary_loss_mlp": 0.00772256, "balance_loss_clip": 1.04583967, "balance_loss_mlp": 1.00047541, "epoch": 0.3598376672177965, "flos": 19719483436800.0, "grad_norm": 1.8513392555770956, "language_loss": 0.86111921, "learning_rate": 2.9632411325490993e-06, "loss": 0.88002026, "num_input_tokens_seen": 128585655, "step": 5985, "time_per_iteration": 2.6440985202789307 }, { "auxiliary_loss_clip": 0.01117958, "auxiliary_loss_mlp": 0.01045892, "balance_loss_clip": 1.04564977, "balance_loss_mlp": 1.03012037, "epoch": 0.35989779047046444, "flos": 17311529445120.0, "grad_norm": 2.5721307867834406, "language_loss": 0.72770452, "learning_rate": 2.9628997979206884e-06, "loss": 0.74934304, "num_input_tokens_seen": 128604820, "step": 5986, "time_per_iteration": 2.6169698238372803 }, { "auxiliary_loss_clip": 0.01100506, "auxiliary_loss_mlp": 0.01039862, "balance_loss_clip": 1.04264784, "balance_loss_mlp": 1.02473474, "epoch": 0.3599579137231324, "flos": 22711237176960.0, "grad_norm": 2.1943162754876497, "language_loss": 0.73883474, "learning_rate": 2.9625584267790204e-06, "loss": 0.76023847, "num_input_tokens_seen": 128623070, "step": 5987, "time_per_iteration": 2.72385573387146 }, { "auxiliary_loss_clip": 0.0114047, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.05135727, "balance_loss_mlp": 1.02456188, "epoch": 0.36001803697580037, "flos": 20959873845120.0, "grad_norm": 2.225645474388546, "language_loss": 0.69665354, "learning_rate": 2.9622170191370404e-06, "loss": 0.71846086, "num_input_tokens_seen": 128642430, "step": 5988, "time_per_iteration": 2.6040101051330566 }, { "auxiliary_loss_clip": 0.01127132, "auxiliary_loss_mlp": 0.01043358, "balance_loss_clip": 1.04819822, "balance_loss_mlp": 1.0278132, "epoch": 0.36007816022846834, "flos": 20485565729280.0, "grad_norm": 2.281223653114012, "language_loss": 0.73300481, "learning_rate": 2.9618755750076953e-06, "loss": 0.75470972, "num_input_tokens_seen": 128661285, "step": 5989, "time_per_iteration": 2.6532981395721436 }, { "auxiliary_loss_clip": 0.01089891, "auxiliary_loss_mlp": 0.01037817, "balance_loss_clip": 1.04161119, "balance_loss_mlp": 1.02237916, "epoch": 0.36013828348113636, "flos": 28001237794560.0, "grad_norm": 3.1935134184936156, "language_loss": 0.79950285, "learning_rate": 2.961534094403931e-06, "loss": 0.82077992, "num_input_tokens_seen": 128682210, "step": 5990, "time_per_iteration": 2.785142421722412 }, { "auxiliary_loss_clip": 0.01123339, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.04714704, "balance_loss_mlp": 1.01775789, "epoch": 0.3601984067338043, "flos": 20082181017600.0, "grad_norm": 2.506195073342272, "language_loss": 0.83875644, "learning_rate": 2.961192577338698e-06, "loss": 0.86032414, "num_input_tokens_seen": 128700445, "step": 5991, "time_per_iteration": 2.6310808658599854 }, { "auxiliary_loss_clip": 0.01111044, "auxiliary_loss_mlp": 0.01045829, "balance_loss_clip": 1.04896092, "balance_loss_mlp": 1.03068912, "epoch": 0.3602585299864723, "flos": 18617599872000.0, "grad_norm": 2.314320245159203, "language_loss": 0.75628942, "learning_rate": 2.9608510238249463e-06, "loss": 0.77785814, "num_input_tokens_seen": 128716855, "step": 5992, "time_per_iteration": 2.6698272228240967 }, { "auxiliary_loss_clip": 0.01134951, "auxiliary_loss_mlp": 0.01039412, "balance_loss_clip": 1.04993188, "balance_loss_mlp": 1.02385557, "epoch": 0.36031865323914025, "flos": 19573003774080.0, "grad_norm": 2.1820524355734072, "language_loss": 0.76886415, "learning_rate": 2.960509433875627e-06, "loss": 0.79060775, "num_input_tokens_seen": 128735835, "step": 5993, "time_per_iteration": 2.5999341011047363 }, { "auxiliary_loss_clip": 0.01111748, "auxiliary_loss_mlp": 0.01054388, "balance_loss_clip": 1.04750419, "balance_loss_mlp": 1.03762674, "epoch": 0.3603787764918082, "flos": 17490615678720.0, "grad_norm": 1.8546706349055275, "language_loss": 0.74672681, "learning_rate": 2.9601678075036943e-06, "loss": 0.76838815, "num_input_tokens_seen": 128752465, "step": 5994, "time_per_iteration": 2.6691155433654785 }, { "auxiliary_loss_clip": 0.01095118, "auxiliary_loss_mlp": 0.01038312, "balance_loss_clip": 1.0480628, "balance_loss_mlp": 1.02331567, "epoch": 0.3604388997444762, "flos": 15523393564800.0, "grad_norm": 2.7696142346579666, "language_loss": 0.68887782, "learning_rate": 2.9598261447221024e-06, "loss": 0.71021217, "num_input_tokens_seen": 128770865, "step": 5995, "time_per_iteration": 2.7497267723083496 }, { "auxiliary_loss_clip": 0.01104395, "auxiliary_loss_mlp": 0.01046311, "balance_loss_clip": 1.04338932, "balance_loss_mlp": 1.03031349, "epoch": 0.36049902299714415, "flos": 17310883000320.0, "grad_norm": 2.2305093143222248, "language_loss": 0.82564914, "learning_rate": 2.9594844455438057e-06, "loss": 0.84715617, "num_input_tokens_seen": 128789730, "step": 5996, "time_per_iteration": 2.7227983474731445 }, { "auxiliary_loss_clip": 0.01135369, "auxiliary_loss_mlp": 0.0103828, "balance_loss_clip": 1.04974842, "balance_loss_mlp": 1.02300954, "epoch": 0.3605591462498121, "flos": 17056025026560.0, "grad_norm": 2.068995609090248, "language_loss": 0.73795009, "learning_rate": 2.959142709981763e-06, "loss": 0.75968659, "num_input_tokens_seen": 128806610, "step": 5997, "time_per_iteration": 2.572842836380005 }, { "auxiliary_loss_clip": 0.01121916, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.0482775, "balance_loss_mlp": 1.0226686, "epoch": 0.3606192695024801, "flos": 16836862193280.0, "grad_norm": 2.7116535757300215, "language_loss": 0.69209671, "learning_rate": 2.9588009380489337e-06, "loss": 0.71368217, "num_input_tokens_seen": 128824830, "step": 5998, "time_per_iteration": 2.604459047317505 }, { "auxiliary_loss_clip": 0.01085406, "auxiliary_loss_mlp": 0.01041904, "balance_loss_clip": 1.04395008, "balance_loss_mlp": 1.02565587, "epoch": 0.36067939275514804, "flos": 12129655743360.0, "grad_norm": 2.6293691676304745, "language_loss": 0.76580822, "learning_rate": 2.9584591297582758e-06, "loss": 0.78708136, "num_input_tokens_seen": 128838170, "step": 5999, "time_per_iteration": 2.6671667098999023 }, { "auxiliary_loss_clip": 0.01098137, "auxiliary_loss_mlp": 0.01040783, "balance_loss_clip": 1.04674315, "balance_loss_mlp": 1.02590609, "epoch": 0.360739516007816, "flos": 18041449720320.0, "grad_norm": 1.8157116334206203, "language_loss": 0.78264523, "learning_rate": 2.9581172851227516e-06, "loss": 0.80403441, "num_input_tokens_seen": 128855625, "step": 6000, "time_per_iteration": 2.743117332458496 }, { "auxiliary_loss_clip": 0.01095162, "auxiliary_loss_mlp": 0.01036289, "balance_loss_clip": 1.04705954, "balance_loss_mlp": 1.02203155, "epoch": 0.360799639260484, "flos": 18549800951040.0, "grad_norm": 1.8701006971713747, "language_loss": 0.78316295, "learning_rate": 2.9577754041553243e-06, "loss": 0.80447751, "num_input_tokens_seen": 128873540, "step": 6001, "time_per_iteration": 2.7342417240142822 }, { "auxiliary_loss_clip": 0.01130356, "auxiliary_loss_mlp": 0.0077146, "balance_loss_clip": 1.04727733, "balance_loss_mlp": 1.00072694, "epoch": 0.36085976251315194, "flos": 19682028529920.0, "grad_norm": 3.3927220139250056, "language_loss": 0.83151853, "learning_rate": 2.9574334868689575e-06, "loss": 0.8505367, "num_input_tokens_seen": 128889925, "step": 6002, "time_per_iteration": 2.6884238719940186 }, { "auxiliary_loss_clip": 0.01101804, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.04249346, "balance_loss_mlp": 1.02011156, "epoch": 0.3609198857658199, "flos": 24198943703040.0, "grad_norm": 2.135208430409031, "language_loss": 0.90677911, "learning_rate": 2.9570915332766165e-06, "loss": 0.92812997, "num_input_tokens_seen": 128906890, "step": 6003, "time_per_iteration": 2.666738986968994 }, { "auxiliary_loss_clip": 0.01036783, "auxiliary_loss_mlp": 0.0101378, "balance_loss_clip": 1.03707922, "balance_loss_mlp": 1.01194429, "epoch": 0.3609800090184879, "flos": 57115995160320.0, "grad_norm": 0.8844533830179444, "language_loss": 0.53396428, "learning_rate": 2.9567495433912693e-06, "loss": 0.55446988, "num_input_tokens_seen": 128965940, "step": 6004, "time_per_iteration": 3.1421444416046143 }, { "auxiliary_loss_clip": 0.01112391, "auxiliary_loss_mlp": 0.00772771, "balance_loss_clip": 1.04665363, "balance_loss_mlp": 1.00050342, "epoch": 0.3610401322711559, "flos": 20811239366400.0, "grad_norm": 2.085214899207264, "language_loss": 0.77743608, "learning_rate": 2.956407517225883e-06, "loss": 0.79628766, "num_input_tokens_seen": 128985835, "step": 6005, "time_per_iteration": 4.196998596191406 }, { "auxiliary_loss_clip": 0.01114373, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.04545391, "balance_loss_mlp": 1.02866125, "epoch": 0.36110025552382385, "flos": 13699167494400.0, "grad_norm": 1.984756598411705, "language_loss": 0.78795588, "learning_rate": 2.956065454793429e-06, "loss": 0.80953228, "num_input_tokens_seen": 129003120, "step": 6006, "time_per_iteration": 2.642446517944336 }, { "auxiliary_loss_clip": 0.01135515, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.04913247, "balance_loss_mlp": 1.02116823, "epoch": 0.3611603787764918, "flos": 22455014486400.0, "grad_norm": 3.6522767524231248, "language_loss": 0.84766537, "learning_rate": 2.955723356106876e-06, "loss": 0.86939454, "num_input_tokens_seen": 129021645, "step": 6007, "time_per_iteration": 4.38408637046814 }, { "auxiliary_loss_clip": 0.01120706, "auxiliary_loss_mlp": 0.01035853, "balance_loss_clip": 1.05059266, "balance_loss_mlp": 1.01940203, "epoch": 0.3612205020291598, "flos": 20886651970560.0, "grad_norm": 2.20663208121776, "language_loss": 0.72179425, "learning_rate": 2.955381221179198e-06, "loss": 0.7433598, "num_input_tokens_seen": 129038375, "step": 6008, "time_per_iteration": 4.262283802032471 }, { "auxiliary_loss_clip": 0.01118211, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.04345882, "balance_loss_mlp": 1.02150559, "epoch": 0.36128062528182775, "flos": 15741981780480.0, "grad_norm": 7.815944525258205, "language_loss": 0.83056295, "learning_rate": 2.955039050023368e-06, "loss": 0.85210377, "num_input_tokens_seen": 129056235, "step": 6009, "time_per_iteration": 2.643824577331543 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.01045676, "balance_loss_clip": 1.04862237, "balance_loss_mlp": 1.03013086, "epoch": 0.3613407485344957, "flos": 16764502245120.0, "grad_norm": 2.1132167438001166, "language_loss": 0.7616573, "learning_rate": 2.954696842652362e-06, "loss": 0.7831707, "num_input_tokens_seen": 129072405, "step": 6010, "time_per_iteration": 4.361377000808716 }, { "auxiliary_loss_clip": 0.01104786, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.04665053, "balance_loss_mlp": 1.02091312, "epoch": 0.3614008717871637, "flos": 20371189847040.0, "grad_norm": 1.759609272436165, "language_loss": 0.83214396, "learning_rate": 2.9543545990791554e-06, "loss": 0.85354757, "num_input_tokens_seen": 129090225, "step": 6011, "time_per_iteration": 2.679145574569702 }, { "auxiliary_loss_clip": 0.01141696, "auxiliary_loss_mlp": 0.01041601, "balance_loss_clip": 1.05070031, "balance_loss_mlp": 1.02562666, "epoch": 0.36146099503983165, "flos": 22776665800320.0, "grad_norm": 2.194420173883677, "language_loss": 0.62446111, "learning_rate": 2.954012319316727e-06, "loss": 0.64629406, "num_input_tokens_seen": 129107685, "step": 6012, "time_per_iteration": 2.6012516021728516 }, { "auxiliary_loss_clip": 0.01106556, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.04518831, "balance_loss_mlp": 1.02368951, "epoch": 0.3615211182924996, "flos": 22996654646400.0, "grad_norm": 1.831524666449312, "language_loss": 0.8381623, "learning_rate": 2.9536700033780565e-06, "loss": 0.85961026, "num_input_tokens_seen": 129125315, "step": 6013, "time_per_iteration": 2.7191901206970215 }, { "auxiliary_loss_clip": 0.01131608, "auxiliary_loss_mlp": 0.01040321, "balance_loss_clip": 1.04590511, "balance_loss_mlp": 1.02466893, "epoch": 0.3615812415451676, "flos": 16648079287680.0, "grad_norm": 3.6755742539930285, "language_loss": 0.91541535, "learning_rate": 2.9533276512761228e-06, "loss": 0.93713462, "num_input_tokens_seen": 129141600, "step": 6014, "time_per_iteration": 2.714121103286743 }, { "auxiliary_loss_clip": 0.01131507, "auxiliary_loss_mlp": 0.01042414, "balance_loss_clip": 1.0463829, "balance_loss_mlp": 1.0268693, "epoch": 0.36164136479783554, "flos": 21320093387520.0, "grad_norm": 2.2181121985150094, "language_loss": 0.73578274, "learning_rate": 2.95298526302391e-06, "loss": 0.75752199, "num_input_tokens_seen": 129160665, "step": 6015, "time_per_iteration": 2.668600082397461 }, { "auxiliary_loss_clip": 0.0105036, "auxiliary_loss_mlp": 0.01047702, "balance_loss_clip": 1.03610015, "balance_loss_mlp": 1.02980912, "epoch": 0.3617014880505035, "flos": 24169569356160.0, "grad_norm": 2.2662955263586158, "language_loss": 0.64756966, "learning_rate": 2.9526428386344e-06, "loss": 0.66855025, "num_input_tokens_seen": 129179220, "step": 6016, "time_per_iteration": 2.8753597736358643 }, { "auxiliary_loss_clip": 0.01127577, "auxiliary_loss_mlp": 0.01039172, "balance_loss_clip": 1.05000329, "balance_loss_mlp": 1.02170801, "epoch": 0.3617616113031715, "flos": 39014824101120.0, "grad_norm": 2.0483319793753343, "language_loss": 0.71927178, "learning_rate": 2.9523003781205785e-06, "loss": 0.74093938, "num_input_tokens_seen": 129200385, "step": 6017, "time_per_iteration": 2.8195903301239014 }, { "auxiliary_loss_clip": 0.01123165, "auxiliary_loss_mlp": 0.01043013, "balance_loss_clip": 1.04506993, "balance_loss_mlp": 1.02724147, "epoch": 0.3618217345558395, "flos": 12130840892160.0, "grad_norm": 2.196881428409859, "language_loss": 0.73543239, "learning_rate": 2.9519578814954307e-06, "loss": 0.7570942, "num_input_tokens_seen": 129217395, "step": 6018, "time_per_iteration": 2.6454639434814453 }, { "auxiliary_loss_clip": 0.01088616, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.0470562, "balance_loss_mlp": 1.02079058, "epoch": 0.36188185780850746, "flos": 24935005203840.0, "grad_norm": 2.8373114264415222, "language_loss": 0.69157374, "learning_rate": 2.9516153487719448e-06, "loss": 0.71282017, "num_input_tokens_seen": 129238940, "step": 6019, "time_per_iteration": 2.824361801147461 }, { "auxiliary_loss_clip": 0.0111438, "auxiliary_loss_mlp": 0.0103897, "balance_loss_clip": 1.04542887, "balance_loss_mlp": 1.02275765, "epoch": 0.3619419810611754, "flos": 20958832350720.0, "grad_norm": 3.405770043894724, "language_loss": 0.76428473, "learning_rate": 2.95127277996311e-06, "loss": 0.78581828, "num_input_tokens_seen": 129258240, "step": 6020, "time_per_iteration": 2.6757993698120117 }, { "auxiliary_loss_clip": 0.01124662, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.04899478, "balance_loss_mlp": 1.02512705, "epoch": 0.3620021043138434, "flos": 22528882805760.0, "grad_norm": 2.1413312386751606, "language_loss": 0.73802006, "learning_rate": 2.9509301750819156e-06, "loss": 0.7596817, "num_input_tokens_seen": 129279040, "step": 6021, "time_per_iteration": 2.6422386169433594 }, { "auxiliary_loss_clip": 0.01094575, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.04502845, "balance_loss_mlp": 1.02170944, "epoch": 0.36206222756651135, "flos": 15596687266560.0, "grad_norm": 8.65046906858069, "language_loss": 0.80683851, "learning_rate": 2.9505875341413533e-06, "loss": 0.82814515, "num_input_tokens_seen": 129295415, "step": 6022, "time_per_iteration": 2.7069809436798096 }, { "auxiliary_loss_clip": 0.0112144, "auxiliary_loss_mlp": 0.01034482, "balance_loss_clip": 1.04967427, "balance_loss_mlp": 1.02036762, "epoch": 0.3621223508191793, "flos": 23587170238080.0, "grad_norm": 1.6359940708258738, "language_loss": 0.81630391, "learning_rate": 2.950244857154417e-06, "loss": 0.83786309, "num_input_tokens_seen": 129312620, "step": 6023, "time_per_iteration": 2.676196575164795 }, { "auxiliary_loss_clip": 0.01115391, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.04994166, "balance_loss_mlp": 1.02266037, "epoch": 0.3621824740718473, "flos": 22309899540480.0, "grad_norm": 2.238629896510925, "language_loss": 0.79401833, "learning_rate": 2.9499021441341e-06, "loss": 0.81555158, "num_input_tokens_seen": 129331825, "step": 6024, "time_per_iteration": 2.6479294300079346 }, { "auxiliary_loss_clip": 0.01098352, "auxiliary_loss_mlp": 0.01041698, "balance_loss_clip": 1.04168642, "balance_loss_mlp": 1.02567625, "epoch": 0.36224259732451525, "flos": 16763640318720.0, "grad_norm": 2.1016508822119517, "language_loss": 0.74409318, "learning_rate": 2.9495593950933997e-06, "loss": 0.76549369, "num_input_tokens_seen": 129350400, "step": 6025, "time_per_iteration": 2.720113515853882 }, { "auxiliary_loss_clip": 0.01121634, "auxiliary_loss_mlp": 0.00772492, "balance_loss_clip": 1.04758501, "balance_loss_mlp": 1.00045466, "epoch": 0.3623027205771832, "flos": 23149742411520.0, "grad_norm": 1.7192758683210898, "language_loss": 0.72363192, "learning_rate": 2.9492166100453107e-06, "loss": 0.74257314, "num_input_tokens_seen": 129371155, "step": 6026, "time_per_iteration": 2.647515296936035 }, { "auxiliary_loss_clip": 0.01130763, "auxiliary_loss_mlp": 0.01045791, "balance_loss_clip": 1.05090141, "balance_loss_mlp": 1.0300554, "epoch": 0.3623628438298512, "flos": 28549162834560.0, "grad_norm": 3.1509295844270166, "language_loss": 0.79584157, "learning_rate": 2.948873789002833e-06, "loss": 0.81760705, "num_input_tokens_seen": 129391230, "step": 6027, "time_per_iteration": 2.666778802871704 }, { "auxiliary_loss_clip": 0.01112806, "auxiliary_loss_mlp": 0.01044567, "balance_loss_clip": 1.04690945, "balance_loss_mlp": 1.02730584, "epoch": 0.36242296708251914, "flos": 25484941405440.0, "grad_norm": 2.036912075012155, "language_loss": 0.67857373, "learning_rate": 2.9485309319789667e-06, "loss": 0.70014751, "num_input_tokens_seen": 129410065, "step": 6028, "time_per_iteration": 2.721635103225708 }, { "auxiliary_loss_clip": 0.01093428, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.04534137, "balance_loss_mlp": 1.02493429, "epoch": 0.3624830903351871, "flos": 16290373697280.0, "grad_norm": 2.040296243102333, "language_loss": 0.85588348, "learning_rate": 2.9481880389867117e-06, "loss": 0.87721586, "num_input_tokens_seen": 129428655, "step": 6029, "time_per_iteration": 2.768638849258423 }, { "auxiliary_loss_clip": 0.01097178, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.04583371, "balance_loss_mlp": 1.02534389, "epoch": 0.36254321358785513, "flos": 18296307694080.0, "grad_norm": 1.826841085229912, "language_loss": 0.72638077, "learning_rate": 2.9478451100390714e-06, "loss": 0.74775726, "num_input_tokens_seen": 129447845, "step": 6030, "time_per_iteration": 2.6222145557403564 }, { "auxiliary_loss_clip": 0.01111443, "auxiliary_loss_mlp": 0.0104401, "balance_loss_clip": 1.0471518, "balance_loss_mlp": 1.02635479, "epoch": 0.3626033368405231, "flos": 14865294533760.0, "grad_norm": 2.682823168265615, "language_loss": 0.74219912, "learning_rate": 2.94750214514905e-06, "loss": 0.76375365, "num_input_tokens_seen": 129463275, "step": 6031, "time_per_iteration": 2.62003493309021 }, { "auxiliary_loss_clip": 0.01090216, "auxiliary_loss_mlp": 0.01046109, "balance_loss_clip": 1.04174352, "balance_loss_mlp": 1.03031349, "epoch": 0.36266346009319106, "flos": 22306595489280.0, "grad_norm": 2.122404426395552, "language_loss": 0.72930032, "learning_rate": 2.9471591443296516e-06, "loss": 0.75066358, "num_input_tokens_seen": 129483205, "step": 6032, "time_per_iteration": 2.7382266521453857 }, { "auxiliary_loss_clip": 0.01089342, "auxiliary_loss_mlp": 0.0104871, "balance_loss_clip": 1.0457828, "balance_loss_mlp": 1.03320134, "epoch": 0.362723583345859, "flos": 18222331633920.0, "grad_norm": 2.0052695882675895, "language_loss": 0.77577424, "learning_rate": 2.946816107593884e-06, "loss": 0.79715478, "num_input_tokens_seen": 129499885, "step": 6033, "time_per_iteration": 2.712574005126953 }, { "auxiliary_loss_clip": 0.01011518, "auxiliary_loss_mlp": 0.01010455, "balance_loss_clip": 1.02346182, "balance_loss_mlp": 1.00881004, "epoch": 0.362783706598527, "flos": 68499174458880.0, "grad_norm": 0.775881514372135, "language_loss": 0.6472615, "learning_rate": 2.9464730349547547e-06, "loss": 0.66748118, "num_input_tokens_seen": 129561885, "step": 6034, "time_per_iteration": 3.33389949798584 }, { "auxiliary_loss_clip": 0.0111586, "auxiliary_loss_mlp": 0.01039589, "balance_loss_clip": 1.04362679, "balance_loss_mlp": 1.02373409, "epoch": 0.36284382985119495, "flos": 26576589594240.0, "grad_norm": 2.348469757016237, "language_loss": 0.89869213, "learning_rate": 2.946129926425273e-06, "loss": 0.9202466, "num_input_tokens_seen": 129582325, "step": 6035, "time_per_iteration": 2.661137580871582 }, { "auxiliary_loss_clip": 0.01112128, "auxiliary_loss_mlp": 0.01040682, "balance_loss_clip": 1.04810882, "balance_loss_mlp": 1.02445734, "epoch": 0.3629039531038629, "flos": 20156767608960.0, "grad_norm": 1.7965494412259506, "language_loss": 0.73480749, "learning_rate": 2.9457867820184496e-06, "loss": 0.75633562, "num_input_tokens_seen": 129600350, "step": 6036, "time_per_iteration": 2.627746105194092 }, { "auxiliary_loss_clip": 0.01118939, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.0476563, "balance_loss_mlp": 1.01825309, "epoch": 0.3629640763565309, "flos": 18625716345600.0, "grad_norm": 2.247638401714898, "language_loss": 0.75895989, "learning_rate": 2.945443601747297e-06, "loss": 0.78049135, "num_input_tokens_seen": 129618425, "step": 6037, "time_per_iteration": 2.6763134002685547 }, { "auxiliary_loss_clip": 0.01117432, "auxiliary_loss_mlp": 0.0105958, "balance_loss_clip": 1.04722893, "balance_loss_mlp": 1.04149556, "epoch": 0.36302419960919885, "flos": 19571459489280.0, "grad_norm": 1.7641921793444904, "language_loss": 0.78425813, "learning_rate": 2.945100385624828e-06, "loss": 0.80602825, "num_input_tokens_seen": 129636750, "step": 6038, "time_per_iteration": 2.6576154232025146 }, { "auxiliary_loss_clip": 0.01042272, "auxiliary_loss_mlp": 0.01000075, "balance_loss_clip": 1.02576721, "balance_loss_mlp": 0.99842948, "epoch": 0.3630843228618668, "flos": 63797606444160.0, "grad_norm": 0.8328343708327894, "language_loss": 0.63371962, "learning_rate": 2.9447571336640573e-06, "loss": 0.6541431, "num_input_tokens_seen": 129699030, "step": 6039, "time_per_iteration": 3.268035650253296 }, { "auxiliary_loss_clip": 0.01108663, "auxiliary_loss_mlp": 0.01052032, "balance_loss_clip": 1.04687905, "balance_loss_mlp": 1.03485394, "epoch": 0.3631444461145348, "flos": 21835160461440.0, "grad_norm": 2.83972356132426, "language_loss": 0.71349055, "learning_rate": 2.944413845878002e-06, "loss": 0.73509747, "num_input_tokens_seen": 129717135, "step": 6040, "time_per_iteration": 2.7468066215515137 }, { "auxiliary_loss_clip": 0.01129452, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.05027485, "balance_loss_mlp": 1.02372289, "epoch": 0.36320456936720275, "flos": 21722041555200.0, "grad_norm": 1.6017927687359714, "language_loss": 0.81615877, "learning_rate": 2.9440705222796783e-06, "loss": 0.83785057, "num_input_tokens_seen": 129735940, "step": 6041, "time_per_iteration": 2.6624767780303955 }, { "auxiliary_loss_clip": 0.01116373, "auxiliary_loss_mlp": 0.01037475, "balance_loss_clip": 1.04789138, "balance_loss_mlp": 1.02039289, "epoch": 0.3632646926198707, "flos": 17019072910080.0, "grad_norm": 6.335898198250863, "language_loss": 0.83848882, "learning_rate": 2.943727162882107e-06, "loss": 0.86002731, "num_input_tokens_seen": 129752790, "step": 6042, "time_per_iteration": 2.6279616355895996 }, { "auxiliary_loss_clip": 0.01113831, "auxiliary_loss_mlp": 0.01045895, "balance_loss_clip": 1.04817295, "balance_loss_mlp": 1.03020668, "epoch": 0.36332481587253873, "flos": 23331163029120.0, "grad_norm": 1.8194124872693949, "language_loss": 0.78401059, "learning_rate": 2.9433837676983064e-06, "loss": 0.80560786, "num_input_tokens_seen": 129773655, "step": 6043, "time_per_iteration": 4.221862077713013 }, { "auxiliary_loss_clip": 0.01111193, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.05454051, "balance_loss_mlp": 1.02078581, "epoch": 0.3633849391252067, "flos": 10743539857920.0, "grad_norm": 2.743973887678544, "language_loss": 0.65664518, "learning_rate": 2.943040336741298e-06, "loss": 0.67812526, "num_input_tokens_seen": 129791605, "step": 6044, "time_per_iteration": 2.7301173210144043 }, { "auxiliary_loss_clip": 0.01109397, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.04838157, "balance_loss_mlp": 1.02035475, "epoch": 0.36344506237787466, "flos": 25849147357440.0, "grad_norm": 2.5365479968338187, "language_loss": 0.81149542, "learning_rate": 2.9426968700241066e-06, "loss": 0.83293915, "num_input_tokens_seen": 129811075, "step": 6045, "time_per_iteration": 2.6896753311157227 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.01045503, "balance_loss_clip": 1.04706383, "balance_loss_mlp": 1.02923083, "epoch": 0.3635051856305426, "flos": 30154046503680.0, "grad_norm": 2.400629400498793, "language_loss": 0.65010375, "learning_rate": 2.942353367559755e-06, "loss": 0.67158914, "num_input_tokens_seen": 129833755, "step": 6046, "time_per_iteration": 2.800321578979492 }, { "auxiliary_loss_clip": 0.01102544, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.0467155, "balance_loss_mlp": 1.02399993, "epoch": 0.3635653088832106, "flos": 22198396746240.0, "grad_norm": 2.172977049503826, "language_loss": 0.77142686, "learning_rate": 2.9420098293612692e-06, "loss": 0.79284167, "num_input_tokens_seen": 129854475, "step": 6047, "time_per_iteration": 4.274283170700073 }, { "auxiliary_loss_clip": 0.01137356, "auxiliary_loss_mlp": 0.01047564, "balance_loss_clip": 1.05142486, "balance_loss_mlp": 1.02983761, "epoch": 0.36362543213587856, "flos": 24787053083520.0, "grad_norm": 1.922622021112015, "language_loss": 0.79610157, "learning_rate": 2.9416662554416767e-06, "loss": 0.81795079, "num_input_tokens_seen": 129873530, "step": 6048, "time_per_iteration": 4.283480644226074 }, { "auxiliary_loss_clip": 0.01037942, "auxiliary_loss_mlp": 0.01005664, "balance_loss_clip": 1.01860034, "balance_loss_mlp": 1.00387573, "epoch": 0.3636855553885465, "flos": 62526369231360.0, "grad_norm": 0.749844121463454, "language_loss": 0.52550006, "learning_rate": 2.9413226458140054e-06, "loss": 0.54593611, "num_input_tokens_seen": 129940400, "step": 6049, "time_per_iteration": 3.2647299766540527 }, { "auxiliary_loss_clip": 0.01105759, "auxiliary_loss_mlp": 0.01042028, "balance_loss_clip": 1.04831481, "balance_loss_mlp": 1.02467084, "epoch": 0.3637456786412145, "flos": 24060652341120.0, "grad_norm": 9.722138117523357, "language_loss": 0.8628068, "learning_rate": 2.9409790004912845e-06, "loss": 0.88428462, "num_input_tokens_seen": 129958635, "step": 6050, "time_per_iteration": 2.744236469268799 }, { "auxiliary_loss_clip": 0.01120328, "auxiliary_loss_mlp": 0.00772785, "balance_loss_clip": 1.04944158, "balance_loss_mlp": 1.0004611, "epoch": 0.36380580189388245, "flos": 16691495852160.0, "grad_norm": 3.109361789309709, "language_loss": 0.78116536, "learning_rate": 2.940635319486546e-06, "loss": 0.80009651, "num_input_tokens_seen": 129977685, "step": 6051, "time_per_iteration": 2.6305320262908936 }, { "auxiliary_loss_clip": 0.01127196, "auxiliary_loss_mlp": 0.01040856, "balance_loss_clip": 1.04900503, "balance_loss_mlp": 1.02559745, "epoch": 0.3638659251465504, "flos": 25114091437440.0, "grad_norm": 1.9275322741448784, "language_loss": 0.82526582, "learning_rate": 2.940291602812822e-06, "loss": 0.84694636, "num_input_tokens_seen": 129997530, "step": 6052, "time_per_iteration": 2.711794853210449 }, { "auxiliary_loss_clip": 0.01100415, "auxiliary_loss_mlp": 0.01036967, "balance_loss_clip": 1.04675376, "balance_loss_mlp": 1.02270949, "epoch": 0.3639260483992184, "flos": 23003011353600.0, "grad_norm": 1.7820298413079305, "language_loss": 0.72085792, "learning_rate": 2.939947850483145e-06, "loss": 0.74223173, "num_input_tokens_seen": 130017955, "step": 6053, "time_per_iteration": 2.725600481033325 }, { "auxiliary_loss_clip": 0.01015406, "auxiliary_loss_mlp": 0.01003631, "balance_loss_clip": 1.0300014, "balance_loss_mlp": 1.00155663, "epoch": 0.36398617165188635, "flos": 70716011160960.0, "grad_norm": 0.7712310074836012, "language_loss": 0.61214095, "learning_rate": 2.9396040625105532e-06, "loss": 0.63233131, "num_input_tokens_seen": 130074275, "step": 6054, "time_per_iteration": 3.3252007961273193 }, { "auxiliary_loss_clip": 0.0111079, "auxiliary_loss_mlp": 0.01038999, "balance_loss_clip": 1.04735899, "balance_loss_mlp": 1.02214301, "epoch": 0.3640462949045543, "flos": 22235456603520.0, "grad_norm": 2.93078334140581, "language_loss": 0.75820959, "learning_rate": 2.9392602389080802e-06, "loss": 0.77970749, "num_input_tokens_seen": 130091375, "step": 6055, "time_per_iteration": 2.656001091003418 }, { "auxiliary_loss_clip": 0.0113529, "auxiliary_loss_mlp": 0.01041525, "balance_loss_clip": 1.04910016, "balance_loss_mlp": 1.02581286, "epoch": 0.3641064181572223, "flos": 21543529939200.0, "grad_norm": 1.6734377169093124, "language_loss": 0.7533145, "learning_rate": 2.938916379688765e-06, "loss": 0.77508265, "num_input_tokens_seen": 130111595, "step": 6056, "time_per_iteration": 2.654418468475342 }, { "auxiliary_loss_clip": 0.01121707, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.055071, "balance_loss_mlp": 1.02337217, "epoch": 0.3641665414098903, "flos": 22273306560000.0, "grad_norm": 2.035168503846255, "language_loss": 0.80473512, "learning_rate": 2.9385724848656468e-06, "loss": 0.82633936, "num_input_tokens_seen": 130131440, "step": 6057, "time_per_iteration": 2.7347753047943115 }, { "auxiliary_loss_clip": 0.01107128, "auxiliary_loss_mlp": 0.01039802, "balance_loss_clip": 1.04495037, "balance_loss_mlp": 1.02438855, "epoch": 0.36422666466255826, "flos": 28329676778880.0, "grad_norm": 2.043030499006847, "language_loss": 0.80264485, "learning_rate": 2.9382285544517647e-06, "loss": 0.8241142, "num_input_tokens_seen": 130151375, "step": 6058, "time_per_iteration": 2.695674180984497 }, { "auxiliary_loss_clip": 0.01102831, "auxiliary_loss_mlp": 0.00772601, "balance_loss_clip": 1.04357934, "balance_loss_mlp": 1.00046432, "epoch": 0.36428678791522623, "flos": 24170503109760.0, "grad_norm": 2.032310914115462, "language_loss": 0.84994543, "learning_rate": 2.9378845884601636e-06, "loss": 0.86869979, "num_input_tokens_seen": 130169960, "step": 6059, "time_per_iteration": 2.6912410259246826 }, { "auxiliary_loss_clip": 0.01093721, "auxiliary_loss_mlp": 0.01039242, "balance_loss_clip": 1.04318213, "balance_loss_mlp": 1.02287483, "epoch": 0.3643469111678942, "flos": 22528451842560.0, "grad_norm": 5.903326132338396, "language_loss": 0.87806225, "learning_rate": 2.937540586903884e-06, "loss": 0.89939183, "num_input_tokens_seen": 130189800, "step": 6060, "time_per_iteration": 2.713115692138672 }, { "auxiliary_loss_clip": 0.01125791, "auxiliary_loss_mlp": 0.01040312, "balance_loss_clip": 1.0498302, "balance_loss_mlp": 1.02388453, "epoch": 0.36440703442056216, "flos": 19426595938560.0, "grad_norm": 2.3521788015610805, "language_loss": 0.66954017, "learning_rate": 2.937196549795971e-06, "loss": 0.69120121, "num_input_tokens_seen": 130206370, "step": 6061, "time_per_iteration": 2.8435866832733154 }, { "auxiliary_loss_clip": 0.0111942, "auxiliary_loss_mlp": 0.01038694, "balance_loss_clip": 1.05207086, "balance_loss_mlp": 1.02260041, "epoch": 0.3644671576732301, "flos": 18040515966720.0, "grad_norm": 2.5119296796020354, "language_loss": 0.75012159, "learning_rate": 2.9368524771494718e-06, "loss": 0.77170277, "num_input_tokens_seen": 130224445, "step": 6062, "time_per_iteration": 2.659853935241699 }, { "auxiliary_loss_clip": 0.01108402, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.04851866, "balance_loss_mlp": 1.01628149, "epoch": 0.3645272809258981, "flos": 21542811667200.0, "grad_norm": 2.568706719167558, "language_loss": 0.72070628, "learning_rate": 2.936508368977432e-06, "loss": 0.74213189, "num_input_tokens_seen": 130245380, "step": 6063, "time_per_iteration": 2.7098159790039062 }, { "auxiliary_loss_clip": 0.01118768, "auxiliary_loss_mlp": 0.010373, "balance_loss_clip": 1.04472148, "balance_loss_mlp": 1.02187479, "epoch": 0.36458740417856605, "flos": 22746860490240.0, "grad_norm": 2.3511982692020936, "language_loss": 0.68179435, "learning_rate": 2.936164225292901e-06, "loss": 0.70335501, "num_input_tokens_seen": 130265575, "step": 6064, "time_per_iteration": 2.6513044834136963 }, { "auxiliary_loss_clip": 0.01116627, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.04925466, "balance_loss_mlp": 1.02988076, "epoch": 0.364647527431234, "flos": 26140670138880.0, "grad_norm": 1.9840367281230236, "language_loss": 0.74147421, "learning_rate": 2.9358200461089297e-06, "loss": 0.76309836, "num_input_tokens_seen": 130286195, "step": 6065, "time_per_iteration": 2.764556407928467 }, { "auxiliary_loss_clip": 0.0111688, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.04924774, "balance_loss_mlp": 1.02306008, "epoch": 0.364707650683902, "flos": 31029907737600.0, "grad_norm": 2.0108238901766042, "language_loss": 0.75444913, "learning_rate": 2.9354758314385676e-06, "loss": 0.77602255, "num_input_tokens_seen": 130306095, "step": 6066, "time_per_iteration": 2.749293088912964 }, { "auxiliary_loss_clip": 0.01121102, "auxiliary_loss_mlp": 0.01034674, "balance_loss_clip": 1.04859555, "balance_loss_mlp": 1.02010643, "epoch": 0.36476777393656995, "flos": 19572896033280.0, "grad_norm": 2.8385875288429587, "language_loss": 0.76480901, "learning_rate": 2.9351315812948684e-06, "loss": 0.78636676, "num_input_tokens_seen": 130324685, "step": 6067, "time_per_iteration": 2.619833469390869 }, { "auxiliary_loss_clip": 0.01135088, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.05067635, "balance_loss_mlp": 1.02401567, "epoch": 0.3648278971892379, "flos": 17748849530880.0, "grad_norm": 2.2214902441228563, "language_loss": 0.71036232, "learning_rate": 2.934787295690886e-06, "loss": 0.73209393, "num_input_tokens_seen": 130343855, "step": 6068, "time_per_iteration": 2.633678674697876 }, { "auxiliary_loss_clip": 0.01119276, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.0432384, "balance_loss_mlp": 1.02402711, "epoch": 0.3648880204419059, "flos": 17931167988480.0, "grad_norm": 2.184109901605664, "language_loss": 0.74421692, "learning_rate": 2.9344429746396755e-06, "loss": 0.76580441, "num_input_tokens_seen": 130362320, "step": 6069, "time_per_iteration": 2.6463425159454346 }, { "auxiliary_loss_clip": 0.01115147, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.04814148, "balance_loss_mlp": 1.02237022, "epoch": 0.3649481436945739, "flos": 22638266697600.0, "grad_norm": 1.8874088651190308, "language_loss": 0.66247845, "learning_rate": 2.9340986181542945e-06, "loss": 0.68401062, "num_input_tokens_seen": 130383165, "step": 6070, "time_per_iteration": 2.70835280418396 }, { "auxiliary_loss_clip": 0.01118852, "auxiliary_loss_mlp": 0.01036547, "balance_loss_clip": 1.04837227, "balance_loss_mlp": 1.02161574, "epoch": 0.36500826694724187, "flos": 21579656042880.0, "grad_norm": 1.882521473859371, "language_loss": 0.74406028, "learning_rate": 2.9337542262477994e-06, "loss": 0.76561427, "num_input_tokens_seen": 130402425, "step": 6071, "time_per_iteration": 2.6479921340942383 }, { "auxiliary_loss_clip": 0.0112348, "auxiliary_loss_mlp": 0.01037332, "balance_loss_clip": 1.04683149, "balance_loss_mlp": 1.02142978, "epoch": 0.36506839019990983, "flos": 13772533023360.0, "grad_norm": 1.9443656652026238, "language_loss": 0.88592315, "learning_rate": 2.9334097989332506e-06, "loss": 0.9075312, "num_input_tokens_seen": 130419440, "step": 6072, "time_per_iteration": 2.641340732574463 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.0495832, "balance_loss_mlp": 1.02225924, "epoch": 0.3651285134525778, "flos": 17274972378240.0, "grad_norm": 2.382408041683643, "language_loss": 0.72436309, "learning_rate": 2.9330653362237094e-06, "loss": 0.7459538, "num_input_tokens_seen": 130438495, "step": 6073, "time_per_iteration": 2.6814513206481934 }, { "auxiliary_loss_clip": 0.01067321, "auxiliary_loss_mlp": 0.01042007, "balance_loss_clip": 1.04483008, "balance_loss_mlp": 1.0249722, "epoch": 0.36518863670524576, "flos": 21907987286400.0, "grad_norm": 3.1332797030940913, "language_loss": 0.66850221, "learning_rate": 2.932720838132236e-06, "loss": 0.68959546, "num_input_tokens_seen": 130455575, "step": 6074, "time_per_iteration": 2.7943460941314697 }, { "auxiliary_loss_clip": 0.01103652, "auxiliary_loss_mlp": 0.01037343, "balance_loss_clip": 1.04833269, "balance_loss_mlp": 1.02238262, "epoch": 0.3652487599579137, "flos": 27122180250240.0, "grad_norm": 1.5371260958261816, "language_loss": 0.72812623, "learning_rate": 2.9323763046718954e-06, "loss": 0.74953616, "num_input_tokens_seen": 130476385, "step": 6075, "time_per_iteration": 2.7581374645233154 }, { "auxiliary_loss_clip": 0.01100578, "auxiliary_loss_mlp": 0.01046604, "balance_loss_clip": 1.04679585, "balance_loss_mlp": 1.03011715, "epoch": 0.3653088832105817, "flos": 19755573626880.0, "grad_norm": 2.1248471900324186, "language_loss": 0.89377797, "learning_rate": 2.9320317358557524e-06, "loss": 0.91524976, "num_input_tokens_seen": 130493630, "step": 6076, "time_per_iteration": 2.7085182666778564 }, { "auxiliary_loss_clip": 0.01125287, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.0504595, "balance_loss_mlp": 1.02784586, "epoch": 0.36536900646324966, "flos": 13115008609920.0, "grad_norm": 2.218138292044272, "language_loss": 0.69377828, "learning_rate": 2.931687131696872e-06, "loss": 0.71547067, "num_input_tokens_seen": 130510735, "step": 6077, "time_per_iteration": 2.6516926288604736 }, { "auxiliary_loss_clip": 0.01063406, "auxiliary_loss_mlp": 0.01003112, "balance_loss_clip": 1.03200221, "balance_loss_mlp": 1.00121677, "epoch": 0.3654291297159176, "flos": 71100472383360.0, "grad_norm": 0.7484778409156561, "language_loss": 0.61802375, "learning_rate": 2.9313424922083224e-06, "loss": 0.63868892, "num_input_tokens_seen": 130577050, "step": 6078, "time_per_iteration": 3.2192225456237793 }, { "auxiliary_loss_clip": 0.01105852, "auxiliary_loss_mlp": 0.01053011, "balance_loss_clip": 1.04234397, "balance_loss_mlp": 1.03565383, "epoch": 0.3654892529685856, "flos": 23617478338560.0, "grad_norm": 2.6620805395927283, "language_loss": 0.78445792, "learning_rate": 2.930997817403173e-06, "loss": 0.80604661, "num_input_tokens_seen": 130593780, "step": 6079, "time_per_iteration": 2.6616902351379395 }, { "auxiliary_loss_clip": 0.01129934, "auxiliary_loss_mlp": 0.01040158, "balance_loss_clip": 1.05226243, "balance_loss_mlp": 1.02386224, "epoch": 0.36554937622125355, "flos": 43470799850880.0, "grad_norm": 2.4767906644356037, "language_loss": 0.62662333, "learning_rate": 2.9306531072944913e-06, "loss": 0.64832425, "num_input_tokens_seen": 130615510, "step": 6080, "time_per_iteration": 2.8651509284973145 }, { "auxiliary_loss_clip": 0.01108292, "auxiliary_loss_mlp": 0.01042236, "balance_loss_clip": 1.04737091, "balance_loss_mlp": 1.02529645, "epoch": 0.3656094994739215, "flos": 23294641875840.0, "grad_norm": 3.1314387429818327, "language_loss": 0.67686033, "learning_rate": 2.930308361895352e-06, "loss": 0.69836557, "num_input_tokens_seen": 130635410, "step": 6081, "time_per_iteration": 2.707031011581421 }, { "auxiliary_loss_clip": 0.01112746, "auxiliary_loss_mlp": 0.00773158, "balance_loss_clip": 1.04989302, "balance_loss_mlp": 1.00033236, "epoch": 0.3656696227265895, "flos": 24571984400640.0, "grad_norm": 1.5854068035466964, "language_loss": 0.74755692, "learning_rate": 2.9299635812188257e-06, "loss": 0.76641595, "num_input_tokens_seen": 130657725, "step": 6082, "time_per_iteration": 2.7261881828308105 }, { "auxiliary_loss_clip": 0.01072732, "auxiliary_loss_mlp": 0.00772597, "balance_loss_clip": 1.04222691, "balance_loss_mlp": 1.00042963, "epoch": 0.3657297459792575, "flos": 27928375056000.0, "grad_norm": 2.051480252043875, "language_loss": 0.82956016, "learning_rate": 2.929618765277987e-06, "loss": 0.8480134, "num_input_tokens_seen": 130678360, "step": 6083, "time_per_iteration": 4.360748529434204 }, { "auxiliary_loss_clip": 0.01041394, "auxiliary_loss_mlp": 0.01001412, "balance_loss_clip": 1.02900386, "balance_loss_mlp": 0.99936181, "epoch": 0.36578986923192547, "flos": 67392622126080.0, "grad_norm": 0.8163771270511553, "language_loss": 0.59314513, "learning_rate": 2.9292739140859125e-06, "loss": 0.61357319, "num_input_tokens_seen": 130742110, "step": 6084, "time_per_iteration": 3.3273561000823975 }, { "auxiliary_loss_clip": 0.0109183, "auxiliary_loss_mlp": 0.0104143, "balance_loss_clip": 1.04496968, "balance_loss_mlp": 1.02570593, "epoch": 0.36584999248459343, "flos": 20227511445120.0, "grad_norm": 3.4329037043843478, "language_loss": 0.72791892, "learning_rate": 2.9289290276556767e-06, "loss": 0.74925154, "num_input_tokens_seen": 130759870, "step": 6085, "time_per_iteration": 2.7221856117248535 }, { "auxiliary_loss_clip": 0.01101549, "auxiliary_loss_mlp": 0.01038512, "balance_loss_clip": 1.04982924, "balance_loss_mlp": 1.02383745, "epoch": 0.3659101157372614, "flos": 19062461813760.0, "grad_norm": 2.636651052815632, "language_loss": 0.77860379, "learning_rate": 2.9285841060003604e-06, "loss": 0.80000436, "num_input_tokens_seen": 130778510, "step": 6086, "time_per_iteration": 4.265977621078491 }, { "auxiliary_loss_clip": 0.0111591, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.04616153, "balance_loss_mlp": 1.01771855, "epoch": 0.36597023898992936, "flos": 30810708990720.0, "grad_norm": 1.8562986050024126, "language_loss": 0.76759315, "learning_rate": 2.9282391491330416e-06, "loss": 0.78907776, "num_input_tokens_seen": 130798535, "step": 6087, "time_per_iteration": 4.227373123168945 }, { "auxiliary_loss_clip": 0.01081855, "auxiliary_loss_mlp": 0.01042282, "balance_loss_clip": 1.04556108, "balance_loss_mlp": 1.02589023, "epoch": 0.36603036224259733, "flos": 20521799573760.0, "grad_norm": 2.2476274891892474, "language_loss": 0.71063232, "learning_rate": 2.9278941570668002e-06, "loss": 0.73187363, "num_input_tokens_seen": 130816655, "step": 6088, "time_per_iteration": 4.3080058097839355 }, { "auxiliary_loss_clip": 0.01136094, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.05314517, "balance_loss_mlp": 1.02267289, "epoch": 0.3660904854952653, "flos": 38329397798400.0, "grad_norm": 1.6318023186273214, "language_loss": 0.79717827, "learning_rate": 2.92754912981472e-06, "loss": 0.81893623, "num_input_tokens_seen": 130841225, "step": 6089, "time_per_iteration": 2.782954216003418 }, { "auxiliary_loss_clip": 0.01099767, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.04514015, "balance_loss_mlp": 1.02220643, "epoch": 0.36615060874793326, "flos": 21835555511040.0, "grad_norm": 2.0312735397290043, "language_loss": 0.71617413, "learning_rate": 2.927204067389884e-06, "loss": 0.73753607, "num_input_tokens_seen": 130861050, "step": 6090, "time_per_iteration": 2.7414958477020264 }, { "auxiliary_loss_clip": 0.01105933, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.05133104, "balance_loss_mlp": 1.03305852, "epoch": 0.3662107320006012, "flos": 16581537342720.0, "grad_norm": 2.037307676788604, "language_loss": 0.74434924, "learning_rate": 2.9268589698053763e-06, "loss": 0.7658866, "num_input_tokens_seen": 130879775, "step": 6091, "time_per_iteration": 2.628554344177246 }, { "auxiliary_loss_clip": 0.01076087, "auxiliary_loss_mlp": 0.01042935, "balance_loss_clip": 1.04836047, "balance_loss_mlp": 1.02728868, "epoch": 0.3662708552532692, "flos": 20958365473920.0, "grad_norm": 2.1960531931019682, "language_loss": 0.73387206, "learning_rate": 2.926513837074284e-06, "loss": 0.75506234, "num_input_tokens_seen": 130898070, "step": 6092, "time_per_iteration": 2.7320556640625 }, { "auxiliary_loss_clip": 0.01127006, "auxiliary_loss_mlp": 0.01044139, "balance_loss_clip": 1.04809344, "balance_loss_mlp": 1.02796876, "epoch": 0.36633097850593715, "flos": 21902707987200.0, "grad_norm": 1.9967925590844784, "language_loss": 0.77662504, "learning_rate": 2.9261686692096942e-06, "loss": 0.79833645, "num_input_tokens_seen": 130915250, "step": 6093, "time_per_iteration": 2.721311092376709 }, { "auxiliary_loss_clip": 0.01124005, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.04696584, "balance_loss_mlp": 1.02686548, "epoch": 0.3663911017586051, "flos": 32854133808000.0, "grad_norm": 1.926436620767835, "language_loss": 0.7455743, "learning_rate": 2.925823466224696e-06, "loss": 0.76723486, "num_input_tokens_seen": 130936995, "step": 6094, "time_per_iteration": 2.767188310623169 }, { "auxiliary_loss_clip": 0.01142303, "auxiliary_loss_mlp": 0.01055832, "balance_loss_clip": 1.05334711, "balance_loss_mlp": 1.03969133, "epoch": 0.3664512250112731, "flos": 27271748482560.0, "grad_norm": 1.743331442809004, "language_loss": 0.79444361, "learning_rate": 2.9254782281323785e-06, "loss": 0.81642497, "num_input_tokens_seen": 130957970, "step": 6095, "time_per_iteration": 2.718632459640503 }, { "auxiliary_loss_clip": 0.01118218, "auxiliary_loss_mlp": 0.00774719, "balance_loss_clip": 1.05141842, "balance_loss_mlp": 1.00037265, "epoch": 0.3665113482639411, "flos": 17784436930560.0, "grad_norm": 3.4988865885900178, "language_loss": 0.73592722, "learning_rate": 2.925132954945834e-06, "loss": 0.75485659, "num_input_tokens_seen": 130974915, "step": 6096, "time_per_iteration": 2.674382448196411 }, { "auxiliary_loss_clip": 0.01099743, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04458702, "balance_loss_mlp": 1.02355742, "epoch": 0.36657147151660907, "flos": 27854614477440.0, "grad_norm": 2.41624095312735, "language_loss": 0.67081815, "learning_rate": 2.924787646678155e-06, "loss": 0.69220531, "num_input_tokens_seen": 130995745, "step": 6097, "time_per_iteration": 2.789118766784668 }, { "auxiliary_loss_clip": 0.01077673, "auxiliary_loss_mlp": 0.01038362, "balance_loss_clip": 1.04489172, "balance_loss_mlp": 1.02268624, "epoch": 0.36663159476927704, "flos": 25374013228800.0, "grad_norm": 1.4796406838499911, "language_loss": 0.77679402, "learning_rate": 2.9244423033424365e-06, "loss": 0.79795432, "num_input_tokens_seen": 131015545, "step": 6098, "time_per_iteration": 2.7803733348846436 }, { "auxiliary_loss_clip": 0.01122346, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.04734826, "balance_loss_mlp": 1.02987766, "epoch": 0.366691718021945, "flos": 21357225072000.0, "grad_norm": 1.744595499322522, "language_loss": 0.73707491, "learning_rate": 2.9240969249517723e-06, "loss": 0.75875127, "num_input_tokens_seen": 131033990, "step": 6099, "time_per_iteration": 2.6809163093566895 }, { "auxiliary_loss_clip": 0.01111202, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.04759586, "balance_loss_mlp": 1.02931285, "epoch": 0.36675184127461297, "flos": 16800376953600.0, "grad_norm": 1.8475933970370078, "language_loss": 0.84773195, "learning_rate": 2.9237515115192602e-06, "loss": 0.86928654, "num_input_tokens_seen": 131050710, "step": 6100, "time_per_iteration": 2.6730356216430664 }, { "auxiliary_loss_clip": 0.01102438, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.04448223, "balance_loss_mlp": 1.02181566, "epoch": 0.36681196452728093, "flos": 21906514828800.0, "grad_norm": 3.9532097547953104, "language_loss": 0.70893979, "learning_rate": 2.9234060630579992e-06, "loss": 0.73033994, "num_input_tokens_seen": 131071435, "step": 6101, "time_per_iteration": 2.7369589805603027 }, { "auxiliary_loss_clip": 0.01111262, "auxiliary_loss_mlp": 0.01052791, "balance_loss_clip": 1.05096185, "balance_loss_mlp": 1.0361371, "epoch": 0.3668720877799489, "flos": 17712436118400.0, "grad_norm": 2.286737474315047, "language_loss": 0.76634502, "learning_rate": 2.9230605795810865e-06, "loss": 0.7879855, "num_input_tokens_seen": 131088775, "step": 6102, "time_per_iteration": 2.7081708908081055 }, { "auxiliary_loss_clip": 0.01131629, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.0524683, "balance_loss_mlp": 1.02050483, "epoch": 0.36693221103261686, "flos": 47045455499520.0, "grad_norm": 4.369253140908342, "language_loss": 0.70019859, "learning_rate": 2.922715061101625e-06, "loss": 0.72188866, "num_input_tokens_seen": 131112800, "step": 6103, "time_per_iteration": 2.8610281944274902 }, { "auxiliary_loss_clip": 0.01093091, "auxiliary_loss_mlp": 0.0103895, "balance_loss_clip": 1.04730344, "balance_loss_mlp": 1.02283263, "epoch": 0.3669923342852848, "flos": 15960929132160.0, "grad_norm": 3.0883152470965842, "language_loss": 0.72272754, "learning_rate": 2.922369507632716e-06, "loss": 0.744048, "num_input_tokens_seen": 131131150, "step": 6104, "time_per_iteration": 2.7520432472229004 }, { "auxiliary_loss_clip": 0.01127975, "auxiliary_loss_mlp": 0.01036046, "balance_loss_clip": 1.05017686, "balance_loss_mlp": 1.01940393, "epoch": 0.3670524575379528, "flos": 19974485064960.0, "grad_norm": 2.1608886453477947, "language_loss": 0.81461251, "learning_rate": 2.9220239191874617e-06, "loss": 0.83625269, "num_input_tokens_seen": 131150365, "step": 6105, "time_per_iteration": 2.7565362453460693 }, { "auxiliary_loss_clip": 0.0114363, "auxiliary_loss_mlp": 0.01041522, "balance_loss_clip": 1.05170739, "balance_loss_mlp": 1.02526236, "epoch": 0.36711258079062076, "flos": 25702955003520.0, "grad_norm": 1.7202629897198451, "language_loss": 0.81035495, "learning_rate": 2.9216782957789692e-06, "loss": 0.83220649, "num_input_tokens_seen": 131169310, "step": 6106, "time_per_iteration": 2.73502779006958 }, { "auxiliary_loss_clip": 0.01035121, "auxiliary_loss_mlp": 0.00753905, "balance_loss_clip": 1.03131676, "balance_loss_mlp": 1.00104892, "epoch": 0.3671727040432887, "flos": 60772743342720.0, "grad_norm": 0.6921927745874564, "language_loss": 0.59176284, "learning_rate": 2.9213326374203426e-06, "loss": 0.60965312, "num_input_tokens_seen": 131232900, "step": 6107, "time_per_iteration": 3.2754647731781006 }, { "auxiliary_loss_clip": 0.01111272, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.04770529, "balance_loss_mlp": 1.02058864, "epoch": 0.3672328272959567, "flos": 18661303745280.0, "grad_norm": 1.8102661289525128, "language_loss": 0.74492711, "learning_rate": 2.92098694412469e-06, "loss": 0.76639688, "num_input_tokens_seen": 131250920, "step": 6108, "time_per_iteration": 2.730562448501587 }, { "auxiliary_loss_clip": 0.01129123, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.04957151, "balance_loss_mlp": 1.02196801, "epoch": 0.3672929505486247, "flos": 15049049535360.0, "grad_norm": 2.04949693656995, "language_loss": 0.72790694, "learning_rate": 2.9206412159051213e-06, "loss": 0.7495752, "num_input_tokens_seen": 131267910, "step": 6109, "time_per_iteration": 2.6488542556762695 }, { "auxiliary_loss_clip": 0.01065451, "auxiliary_loss_mlp": 0.01040533, "balance_loss_clip": 1.04156637, "balance_loss_mlp": 1.02426052, "epoch": 0.3673530738012927, "flos": 20589347099520.0, "grad_norm": 4.856830375229604, "language_loss": 0.53295934, "learning_rate": 2.920295452774744e-06, "loss": 0.55401909, "num_input_tokens_seen": 131287150, "step": 6110, "time_per_iteration": 2.8366596698760986 }, { "auxiliary_loss_clip": 0.01123878, "auxiliary_loss_mlp": 0.01039006, "balance_loss_clip": 1.04783487, "balance_loss_mlp": 1.02253747, "epoch": 0.36741319705396064, "flos": 21689830033920.0, "grad_norm": 1.6516494205850427, "language_loss": 0.80507129, "learning_rate": 2.919949654746672e-06, "loss": 0.82670015, "num_input_tokens_seen": 131308225, "step": 6111, "time_per_iteration": 2.7537708282470703 }, { "auxiliary_loss_clip": 0.01083524, "auxiliary_loss_mlp": 0.01044306, "balance_loss_clip": 1.04381704, "balance_loss_mlp": 1.02897525, "epoch": 0.3674733203066286, "flos": 29862200499840.0, "grad_norm": 1.7980410764958656, "language_loss": 0.72401643, "learning_rate": 2.9196038218340163e-06, "loss": 0.74529469, "num_input_tokens_seen": 131332115, "step": 6112, "time_per_iteration": 2.80513858795166 }, { "auxiliary_loss_clip": 0.0112775, "auxiliary_loss_mlp": 0.01046215, "balance_loss_clip": 1.05025816, "balance_loss_mlp": 1.03102732, "epoch": 0.36753344355929657, "flos": 18257021193600.0, "grad_norm": 1.6179233760027578, "language_loss": 0.8539387, "learning_rate": 2.919257954049892e-06, "loss": 0.8756783, "num_input_tokens_seen": 131351885, "step": 6113, "time_per_iteration": 2.6997315883636475 }, { "auxiliary_loss_clip": 0.01128342, "auxiliary_loss_mlp": 0.01041644, "balance_loss_clip": 1.04813516, "balance_loss_mlp": 1.02512193, "epoch": 0.36759356681196453, "flos": 25301150490240.0, "grad_norm": 2.2420277636185872, "language_loss": 0.78542709, "learning_rate": 2.918912051407413e-06, "loss": 0.807127, "num_input_tokens_seen": 131370245, "step": 6114, "time_per_iteration": 2.694831609725952 }, { "auxiliary_loss_clip": 0.01133627, "auxiliary_loss_mlp": 0.01044455, "balance_loss_clip": 1.05145383, "balance_loss_mlp": 1.02612031, "epoch": 0.3676536900646325, "flos": 21032952065280.0, "grad_norm": 1.6750895304816946, "language_loss": 0.67368686, "learning_rate": 2.918566113919698e-06, "loss": 0.69546771, "num_input_tokens_seen": 131388115, "step": 6115, "time_per_iteration": 2.6966724395751953 }, { "auxiliary_loss_clip": 0.01104674, "auxiliary_loss_mlp": 0.01037383, "balance_loss_clip": 1.04332471, "balance_loss_mlp": 1.02229142, "epoch": 0.36771381331730046, "flos": 16288506190080.0, "grad_norm": 3.500949938115168, "language_loss": 0.76685899, "learning_rate": 2.9182201415998636e-06, "loss": 0.78827953, "num_input_tokens_seen": 131404595, "step": 6116, "time_per_iteration": 2.6796109676361084 }, { "auxiliary_loss_clip": 0.01088778, "auxiliary_loss_mlp": 0.01043047, "balance_loss_clip": 1.04433835, "balance_loss_mlp": 1.02729988, "epoch": 0.36777393656996843, "flos": 22309971367680.0, "grad_norm": 1.7533988300226562, "language_loss": 0.62997502, "learning_rate": 2.9178741344610286e-06, "loss": 0.65129328, "num_input_tokens_seen": 131423760, "step": 6117, "time_per_iteration": 2.7784011363983154 }, { "auxiliary_loss_clip": 0.01103848, "auxiliary_loss_mlp": 0.01037351, "balance_loss_clip": 1.04275632, "balance_loss_mlp": 1.0210557, "epoch": 0.3678340598226364, "flos": 26834069260800.0, "grad_norm": 1.9867834860772036, "language_loss": 0.73087811, "learning_rate": 2.9175280925163156e-06, "loss": 0.75229007, "num_input_tokens_seen": 131444955, "step": 6118, "time_per_iteration": 2.734731674194336 }, { "auxiliary_loss_clip": 0.01132746, "auxiliary_loss_mlp": 0.01043898, "balance_loss_clip": 1.05198336, "balance_loss_mlp": 1.0266242, "epoch": 0.36789418307530436, "flos": 21761723105280.0, "grad_norm": 2.319960114880422, "language_loss": 0.72638988, "learning_rate": 2.9171820157788445e-06, "loss": 0.74815631, "num_input_tokens_seen": 131465720, "step": 6119, "time_per_iteration": 2.7073371410369873 }, { "auxiliary_loss_clip": 0.0111183, "auxiliary_loss_mlp": 0.01037904, "balance_loss_clip": 1.04830384, "balance_loss_mlp": 1.02101171, "epoch": 0.3679543063279723, "flos": 15924192497280.0, "grad_norm": 1.9587818101138383, "language_loss": 0.80524689, "learning_rate": 2.9168359042617404e-06, "loss": 0.8267442, "num_input_tokens_seen": 131483080, "step": 6120, "time_per_iteration": 2.679933547973633 }, { "auxiliary_loss_clip": 0.01093981, "auxiliary_loss_mlp": 0.0104441, "balance_loss_clip": 1.04785204, "balance_loss_mlp": 1.02894819, "epoch": 0.3680144295806403, "flos": 24275541456000.0, "grad_norm": 2.4092121945194496, "language_loss": 0.64745319, "learning_rate": 2.916489757978126e-06, "loss": 0.66883707, "num_input_tokens_seen": 131502545, "step": 6121, "time_per_iteration": 2.7067880630493164 }, { "auxiliary_loss_clip": 0.01126101, "auxiliary_loss_mlp": 0.01043212, "balance_loss_clip": 1.05021691, "balance_loss_mlp": 1.02735114, "epoch": 0.36807455283330826, "flos": 26104148985600.0, "grad_norm": 1.774708172393826, "language_loss": 0.71686751, "learning_rate": 2.9161435769411286e-06, "loss": 0.73856068, "num_input_tokens_seen": 131522155, "step": 6122, "time_per_iteration": 4.026647329330444 }, { "auxiliary_loss_clip": 0.01106964, "auxiliary_loss_mlp": 0.01043545, "balance_loss_clip": 1.04859734, "balance_loss_mlp": 1.0265938, "epoch": 0.3681346760859763, "flos": 24644990793600.0, "grad_norm": 5.6855406070233245, "language_loss": 0.69653022, "learning_rate": 2.915797361163875e-06, "loss": 0.71803534, "num_input_tokens_seen": 131543865, "step": 6123, "time_per_iteration": 2.7548627853393555 }, { "auxiliary_loss_clip": 0.01128204, "auxiliary_loss_mlp": 0.01040578, "balance_loss_clip": 1.04822993, "balance_loss_mlp": 1.02251744, "epoch": 0.36819479933864424, "flos": 23878369797120.0, "grad_norm": 7.022932421262019, "language_loss": 0.73640841, "learning_rate": 2.9154511106594933e-06, "loss": 0.75809622, "num_input_tokens_seen": 131562155, "step": 6124, "time_per_iteration": 2.6710870265960693 }, { "auxiliary_loss_clip": 0.01116833, "auxiliary_loss_mlp": 0.01045789, "balance_loss_clip": 1.04977059, "balance_loss_mlp": 1.02809882, "epoch": 0.3682549225913122, "flos": 25553997302400.0, "grad_norm": 1.931714997280456, "language_loss": 0.74334198, "learning_rate": 2.915104825441114e-06, "loss": 0.76496822, "num_input_tokens_seen": 131581695, "step": 6125, "time_per_iteration": 4.175686359405518 }, { "auxiliary_loss_clip": 0.01132649, "auxiliary_loss_mlp": 0.01053205, "balance_loss_clip": 1.05193818, "balance_loss_mlp": 1.03514445, "epoch": 0.36831504584398017, "flos": 16946605221120.0, "grad_norm": 1.8318884745506827, "language_loss": 0.78127813, "learning_rate": 2.9147585055218686e-06, "loss": 0.80313659, "num_input_tokens_seen": 131599465, "step": 6126, "time_per_iteration": 2.6783266067504883 }, { "auxiliary_loss_clip": 0.01128437, "auxiliary_loss_mlp": 0.01045021, "balance_loss_clip": 1.0490706, "balance_loss_mlp": 1.02659082, "epoch": 0.36837516909664814, "flos": 19865065259520.0, "grad_norm": 2.7490159956422575, "language_loss": 0.66118228, "learning_rate": 2.914412150914888e-06, "loss": 0.68291688, "num_input_tokens_seen": 131618330, "step": 6127, "time_per_iteration": 4.20530891418457 }, { "auxiliary_loss_clip": 0.01120142, "auxiliary_loss_mlp": 0.01046706, "balance_loss_clip": 1.05205703, "balance_loss_mlp": 1.02980185, "epoch": 0.3684352923493161, "flos": 37626984362880.0, "grad_norm": 1.8515813315176315, "language_loss": 0.70152593, "learning_rate": 2.9140657616333074e-06, "loss": 0.72319436, "num_input_tokens_seen": 131638960, "step": 6128, "time_per_iteration": 4.498606204986572 }, { "auxiliary_loss_clip": 0.0112131, "auxiliary_loss_mlp": 0.01046424, "balance_loss_clip": 1.05264103, "balance_loss_mlp": 1.02957964, "epoch": 0.36849541560198407, "flos": 14465501182080.0, "grad_norm": 2.3245894967836698, "language_loss": 0.75067866, "learning_rate": 2.9137193376902614e-06, "loss": 0.77235603, "num_input_tokens_seen": 131657440, "step": 6129, "time_per_iteration": 2.6874284744262695 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.04533887, "balance_loss_mlp": 1.02403355, "epoch": 0.36855553885465203, "flos": 25770753924480.0, "grad_norm": 1.6533761140504426, "language_loss": 0.84758681, "learning_rate": 2.9133728790988868e-06, "loss": 0.86918116, "num_input_tokens_seen": 131678035, "step": 6130, "time_per_iteration": 2.729963541030884 }, { "auxiliary_loss_clip": 0.0102639, "auxiliary_loss_mlp": 0.01017875, "balance_loss_clip": 1.02295637, "balance_loss_mlp": 1.01620567, "epoch": 0.36861566210732, "flos": 65049417377280.0, "grad_norm": 0.8481176099425293, "language_loss": 0.60254776, "learning_rate": 2.913026385872321e-06, "loss": 0.62299049, "num_input_tokens_seen": 131742470, "step": 6131, "time_per_iteration": 3.2806124687194824 }, { "auxiliary_loss_clip": 0.01097122, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.04542315, "balance_loss_mlp": 1.01914179, "epoch": 0.36867578535998796, "flos": 30954495133440.0, "grad_norm": 1.5587449528822306, "language_loss": 0.73085582, "learning_rate": 2.9126798580237034e-06, "loss": 0.75218356, "num_input_tokens_seen": 131764570, "step": 6132, "time_per_iteration": 2.781385898590088 }, { "auxiliary_loss_clip": 0.01127214, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.04795551, "balance_loss_mlp": 1.02187514, "epoch": 0.3687359086126559, "flos": 28837956182400.0, "grad_norm": 1.9292425463255205, "language_loss": 0.74192035, "learning_rate": 2.9123332955661736e-06, "loss": 0.76358628, "num_input_tokens_seen": 131785720, "step": 6133, "time_per_iteration": 2.718660831451416 }, { "auxiliary_loss_clip": 0.01072831, "auxiliary_loss_mlp": 0.01049093, "balance_loss_clip": 1.041502, "balance_loss_mlp": 1.03042495, "epoch": 0.3687960318653239, "flos": 21396798881280.0, "grad_norm": 1.8863128538280483, "language_loss": 0.71522588, "learning_rate": 2.911986698512874e-06, "loss": 0.73644507, "num_input_tokens_seen": 131804430, "step": 6134, "time_per_iteration": 2.8003294467926025 }, { "auxiliary_loss_clip": 0.01102901, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.0472008, "balance_loss_mlp": 1.01838863, "epoch": 0.36885615511799186, "flos": 20266043760000.0, "grad_norm": 1.6906065874809195, "language_loss": 0.75386798, "learning_rate": 2.9116400668769477e-06, "loss": 0.77524465, "num_input_tokens_seen": 131822060, "step": 6135, "time_per_iteration": 2.7916624546051025 }, { "auxiliary_loss_clip": 0.01030435, "auxiliary_loss_mlp": 0.01019879, "balance_loss_clip": 1.0281316, "balance_loss_mlp": 1.01760185, "epoch": 0.3689162783706599, "flos": 63088836301440.0, "grad_norm": 0.8159837123545765, "language_loss": 0.58766222, "learning_rate": 2.9112934006715376e-06, "loss": 0.60816532, "num_input_tokens_seen": 131880715, "step": 6136, "time_per_iteration": 3.2766408920288086 }, { "auxiliary_loss_clip": 0.01106354, "auxiliary_loss_mlp": 0.01043903, "balance_loss_clip": 1.04497695, "balance_loss_mlp": 1.02723718, "epoch": 0.36897640162332784, "flos": 10961984419200.0, "grad_norm": 2.3780452593473393, "language_loss": 0.79126394, "learning_rate": 2.9109466999097918e-06, "loss": 0.81276655, "num_input_tokens_seen": 131895850, "step": 6137, "time_per_iteration": 2.8411052227020264 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.04803205, "balance_loss_mlp": 1.02645159, "epoch": 0.3690365248759958, "flos": 20704297599360.0, "grad_norm": 2.0312275113078337, "language_loss": 0.7454071, "learning_rate": 2.9105999646048552e-06, "loss": 0.76710081, "num_input_tokens_seen": 131915775, "step": 6138, "time_per_iteration": 2.7210230827331543 }, { "auxiliary_loss_clip": 0.01090918, "auxiliary_loss_mlp": 0.01042472, "balance_loss_clip": 1.04320955, "balance_loss_mlp": 1.0259856, "epoch": 0.3690966481286638, "flos": 31826369957760.0, "grad_norm": 2.0947758027881767, "language_loss": 0.64676917, "learning_rate": 2.9102531947698764e-06, "loss": 0.66810304, "num_input_tokens_seen": 131935715, "step": 6139, "time_per_iteration": 2.8667304515838623 }, { "auxiliary_loss_clip": 0.01095075, "auxiliary_loss_mlp": 0.01042873, "balance_loss_clip": 1.04443955, "balance_loss_mlp": 1.02646971, "epoch": 0.36915677138133174, "flos": 13114936782720.0, "grad_norm": 2.1146776737326998, "language_loss": 0.71764016, "learning_rate": 2.909906390418006e-06, "loss": 0.73901963, "num_input_tokens_seen": 131954120, "step": 6140, "time_per_iteration": 2.718100070953369 }, { "auxiliary_loss_clip": 0.01017799, "auxiliary_loss_mlp": 0.01004631, "balance_loss_clip": 1.02079976, "balance_loss_mlp": 1.00281894, "epoch": 0.3692168946339997, "flos": 68686879956480.0, "grad_norm": 0.7503567012350645, "language_loss": 0.59252203, "learning_rate": 2.9095595515623934e-06, "loss": 0.61274636, "num_input_tokens_seen": 132017485, "step": 6141, "time_per_iteration": 3.3003833293914795 }, { "auxiliary_loss_clip": 0.01122088, "auxiliary_loss_mlp": 0.01040836, "balance_loss_clip": 1.04716861, "balance_loss_mlp": 1.02458787, "epoch": 0.36927701788666767, "flos": 22017873968640.0, "grad_norm": 1.900744005055956, "language_loss": 0.75374687, "learning_rate": 2.909212678216192e-06, "loss": 0.77537608, "num_input_tokens_seen": 132036760, "step": 6142, "time_per_iteration": 2.707676410675049 }, { "auxiliary_loss_clip": 0.01122008, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.04708242, "balance_loss_mlp": 1.02276349, "epoch": 0.36933714113933563, "flos": 21835591424640.0, "grad_norm": 2.0868371024046346, "language_loss": 0.77474618, "learning_rate": 2.908865770392555e-06, "loss": 0.79634303, "num_input_tokens_seen": 132056935, "step": 6143, "time_per_iteration": 2.6308929920196533 }, { "auxiliary_loss_clip": 0.01122961, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.04840302, "balance_loss_mlp": 1.01860011, "epoch": 0.3693972643920036, "flos": 23691705793920.0, "grad_norm": 2.7754530777388555, "language_loss": 0.82127941, "learning_rate": 2.9085188281046364e-06, "loss": 0.84284127, "num_input_tokens_seen": 132077285, "step": 6144, "time_per_iteration": 2.7094409465789795 }, { "auxiliary_loss_clip": 0.01126238, "auxiliary_loss_mlp": 0.01040495, "balance_loss_clip": 1.0479883, "balance_loss_mlp": 1.02547419, "epoch": 0.36945738764467156, "flos": 22856747172480.0, "grad_norm": 2.260022101229928, "language_loss": 0.774791, "learning_rate": 2.908171851365593e-06, "loss": 0.79645836, "num_input_tokens_seen": 132095520, "step": 6145, "time_per_iteration": 2.6951241493225098 }, { "auxiliary_loss_clip": 0.01120499, "auxiliary_loss_mlp": 0.01030806, "balance_loss_clip": 1.04903388, "balance_loss_mlp": 1.01503491, "epoch": 0.36951751089733953, "flos": 16615939593600.0, "grad_norm": 2.2611713814894423, "language_loss": 0.76861286, "learning_rate": 2.9078248401885815e-06, "loss": 0.79012597, "num_input_tokens_seen": 132112810, "step": 6146, "time_per_iteration": 2.6205246448516846 }, { "auxiliary_loss_clip": 0.0110988, "auxiliary_loss_mlp": 0.01042802, "balance_loss_clip": 1.04717457, "balance_loss_mlp": 1.02518249, "epoch": 0.3695776341500075, "flos": 18914545607040.0, "grad_norm": 3.3549376840260394, "language_loss": 0.80945081, "learning_rate": 2.907477794586761e-06, "loss": 0.83097762, "num_input_tokens_seen": 132131615, "step": 6147, "time_per_iteration": 2.7176942825317383 }, { "auxiliary_loss_clip": 0.01108097, "auxiliary_loss_mlp": 0.00773519, "balance_loss_clip": 1.05041718, "balance_loss_mlp": 1.00029731, "epoch": 0.36963775740267546, "flos": 20808474019200.0, "grad_norm": 1.8104892137163535, "language_loss": 0.83325249, "learning_rate": 2.9071307145732926e-06, "loss": 0.85206866, "num_input_tokens_seen": 132149585, "step": 6148, "time_per_iteration": 2.7764229774475098 }, { "auxiliary_loss_clip": 0.01121751, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.04946411, "balance_loss_mlp": 1.01843238, "epoch": 0.3696978806553435, "flos": 26061881656320.0, "grad_norm": 2.472295207741171, "language_loss": 0.74167144, "learning_rate": 2.9067836001613357e-06, "loss": 0.76322597, "num_input_tokens_seen": 132165555, "step": 6149, "time_per_iteration": 2.729785680770874 }, { "auxiliary_loss_clip": 0.01141043, "auxiliary_loss_mlp": 0.01040796, "balance_loss_clip": 1.0524776, "balance_loss_mlp": 1.02347541, "epoch": 0.36975800390801145, "flos": 26833925606400.0, "grad_norm": 2.18045381202803, "language_loss": 0.71229833, "learning_rate": 2.906436451364054e-06, "loss": 0.73411667, "num_input_tokens_seen": 132185100, "step": 6150, "time_per_iteration": 2.6558914184570312 }, { "auxiliary_loss_clip": 0.01112432, "auxiliary_loss_mlp": 0.0104236, "balance_loss_clip": 1.04834723, "balance_loss_mlp": 1.02634454, "epoch": 0.3698181271606794, "flos": 21142623265920.0, "grad_norm": 2.1283605732632487, "language_loss": 0.82001126, "learning_rate": 2.906089268194611e-06, "loss": 0.84155917, "num_input_tokens_seen": 132203930, "step": 6151, "time_per_iteration": 2.811908483505249 }, { "auxiliary_loss_clip": 0.0104085, "auxiliary_loss_mlp": 0.01012111, "balance_loss_clip": 1.02895284, "balance_loss_mlp": 1.01035905, "epoch": 0.3698782504133474, "flos": 66742639568640.0, "grad_norm": 0.8434423047890295, "language_loss": 0.63103437, "learning_rate": 2.9057420506661726e-06, "loss": 0.651564, "num_input_tokens_seen": 132263845, "step": 6152, "time_per_iteration": 3.283348798751831 }, { "auxiliary_loss_clip": 0.01083912, "auxiliary_loss_mlp": 0.01046371, "balance_loss_clip": 1.04603028, "balance_loss_mlp": 1.02939606, "epoch": 0.36993837366601534, "flos": 24311523905280.0, "grad_norm": 2.101714417244525, "language_loss": 0.70249707, "learning_rate": 2.9053947987919044e-06, "loss": 0.72379988, "num_input_tokens_seen": 132282350, "step": 6153, "time_per_iteration": 2.776003837585449 }, { "auxiliary_loss_clip": 0.01126735, "auxiliary_loss_mlp": 0.01038393, "balance_loss_clip": 1.04984677, "balance_loss_mlp": 1.02176309, "epoch": 0.3699984969186833, "flos": 24349194293760.0, "grad_norm": 1.5983560512083512, "language_loss": 0.72364891, "learning_rate": 2.9050475125849755e-06, "loss": 0.74530017, "num_input_tokens_seen": 132301930, "step": 6154, "time_per_iteration": 2.7031455039978027 }, { "auxiliary_loss_clip": 0.01108862, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.04792106, "balance_loss_mlp": 1.02376008, "epoch": 0.37005862017135127, "flos": 19829154637440.0, "grad_norm": 1.6579101756116525, "language_loss": 0.67716074, "learning_rate": 2.9047001920585534e-06, "loss": 0.6986388, "num_input_tokens_seen": 132320915, "step": 6155, "time_per_iteration": 2.7716591358184814 }, { "auxiliary_loss_clip": 0.01124062, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.04789114, "balance_loss_mlp": 1.0171442, "epoch": 0.37011874342401924, "flos": 19573793873280.0, "grad_norm": 1.797024775246088, "language_loss": 0.68048114, "learning_rate": 2.9043528372258097e-06, "loss": 0.70204842, "num_input_tokens_seen": 132340415, "step": 6156, "time_per_iteration": 2.7830615043640137 }, { "auxiliary_loss_clip": 0.01109781, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.04603815, "balance_loss_mlp": 1.02202225, "epoch": 0.3701788666766872, "flos": 20374350243840.0, "grad_norm": 1.8485807917443284, "language_loss": 0.82232833, "learning_rate": 2.904005448099916e-06, "loss": 0.84379458, "num_input_tokens_seen": 132358600, "step": 6157, "time_per_iteration": 2.676429033279419 }, { "auxiliary_loss_clip": 0.01087924, "auxiliary_loss_mlp": 0.01042208, "balance_loss_clip": 1.04360199, "balance_loss_mlp": 1.02474344, "epoch": 0.37023898992935517, "flos": 15340931452800.0, "grad_norm": 2.2992188770836175, "language_loss": 0.76899838, "learning_rate": 2.9036580246940444e-06, "loss": 0.79029977, "num_input_tokens_seen": 132373160, "step": 6158, "time_per_iteration": 2.7764365673065186 }, { "auxiliary_loss_clip": 0.01138492, "auxiliary_loss_mlp": 0.01037057, "balance_loss_clip": 1.0489651, "balance_loss_mlp": 1.01997483, "epoch": 0.37029911318202313, "flos": 19573937527680.0, "grad_norm": 2.8360595009252196, "language_loss": 0.68930852, "learning_rate": 2.9033105670213708e-06, "loss": 0.71106398, "num_input_tokens_seen": 132392345, "step": 6159, "time_per_iteration": 2.664858818054199 }, { "auxiliary_loss_clip": 0.01110756, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.02067792, "epoch": 0.3703592364346911, "flos": 26213353309440.0, "grad_norm": 2.9956624327703523, "language_loss": 0.71067882, "learning_rate": 2.9029630750950697e-06, "loss": 0.73213673, "num_input_tokens_seen": 132412620, "step": 6160, "time_per_iteration": 2.757081985473633 }, { "auxiliary_loss_clip": 0.01106906, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.04698467, "balance_loss_mlp": 1.01918936, "epoch": 0.37041935968735906, "flos": 20048317470720.0, "grad_norm": 2.0439504076987403, "language_loss": 0.79205775, "learning_rate": 2.9026155489283176e-06, "loss": 0.81345737, "num_input_tokens_seen": 132431570, "step": 6161, "time_per_iteration": 2.8008711338043213 }, { "auxiliary_loss_clip": 0.01136197, "auxiliary_loss_mlp": 0.01038947, "balance_loss_clip": 1.04960537, "balance_loss_mlp": 1.02284193, "epoch": 0.3704794829400271, "flos": 24133802388480.0, "grad_norm": 2.0425786778899058, "language_loss": 0.79665029, "learning_rate": 2.902267988534295e-06, "loss": 0.81840169, "num_input_tokens_seen": 132451525, "step": 6162, "time_per_iteration": 4.2554450035095215 }, { "auxiliary_loss_clip": 0.01107039, "auxiliary_loss_mlp": 0.00773743, "balance_loss_clip": 1.0442729, "balance_loss_mlp": 1.00038123, "epoch": 0.37053960619269505, "flos": 14866874732160.0, "grad_norm": 2.0272159369395193, "language_loss": 0.79314882, "learning_rate": 2.9019203939261783e-06, "loss": 0.81195664, "num_input_tokens_seen": 132469875, "step": 6163, "time_per_iteration": 2.753324508666992 }, { "auxiliary_loss_clip": 0.0112147, "auxiliary_loss_mlp": 0.01039825, "balance_loss_clip": 1.04676855, "balance_loss_mlp": 1.02351689, "epoch": 0.370599729445363, "flos": 21361498790400.0, "grad_norm": 1.847799951808159, "language_loss": 0.67843366, "learning_rate": 2.9015727651171507e-06, "loss": 0.7000466, "num_input_tokens_seen": 132488360, "step": 6164, "time_per_iteration": 2.7885541915893555 }, { "auxiliary_loss_clip": 0.01109766, "auxiliary_loss_mlp": 0.01045808, "balance_loss_clip": 1.04918885, "balance_loss_mlp": 1.02877307, "epoch": 0.370659852698031, "flos": 26829041356800.0, "grad_norm": 2.0007288653084334, "language_loss": 0.83441198, "learning_rate": 2.9012251021203935e-06, "loss": 0.85596776, "num_input_tokens_seen": 132508630, "step": 6165, "time_per_iteration": 4.3637871742248535 }, { "auxiliary_loss_clip": 0.01115767, "auxiliary_loss_mlp": 0.01037848, "balance_loss_clip": 1.0473845, "balance_loss_mlp": 1.02026439, "epoch": 0.37071997595069894, "flos": 19099018880640.0, "grad_norm": 1.7502292049636352, "language_loss": 0.69057518, "learning_rate": 2.9008774049490896e-06, "loss": 0.71211129, "num_input_tokens_seen": 132527465, "step": 6166, "time_per_iteration": 2.6754019260406494 }, { "auxiliary_loss_clip": 0.01032616, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.03081024, "balance_loss_mlp": 1.02362847, "epoch": 0.3707800992033669, "flos": 52178384920320.0, "grad_norm": 0.8028866408552083, "language_loss": 0.5688796, "learning_rate": 2.9005296736164244e-06, "loss": 0.58946037, "num_input_tokens_seen": 132579940, "step": 6167, "time_per_iteration": 6.357440233230591 }, { "auxiliary_loss_clip": 0.01110244, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.04592001, "balance_loss_mlp": 1.02284551, "epoch": 0.3708402224560349, "flos": 19901837808000.0, "grad_norm": 2.0812394742982203, "language_loss": 0.75159574, "learning_rate": 2.900181908135584e-06, "loss": 0.77307719, "num_input_tokens_seen": 132598390, "step": 6168, "time_per_iteration": 2.7107198238372803 }, { "auxiliary_loss_clip": 0.01117658, "auxiliary_loss_mlp": 0.00773774, "balance_loss_clip": 1.04381216, "balance_loss_mlp": 1.00029826, "epoch": 0.37090034570870284, "flos": 20007630339840.0, "grad_norm": 2.166706099657804, "language_loss": 0.73690271, "learning_rate": 2.899834108519755e-06, "loss": 0.755817, "num_input_tokens_seen": 132616920, "step": 6169, "time_per_iteration": 2.743741035461426 }, { "auxiliary_loss_clip": 0.0113208, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.0476737, "balance_loss_mlp": 1.02352989, "epoch": 0.3709604689613708, "flos": 24134700228480.0, "grad_norm": 1.6724632615545945, "language_loss": 0.79498589, "learning_rate": 2.899486274782127e-06, "loss": 0.81669056, "num_input_tokens_seen": 132637660, "step": 6170, "time_per_iteration": 2.738492727279663 }, { "auxiliary_loss_clip": 0.01122253, "auxiliary_loss_mlp": 0.01045679, "balance_loss_clip": 1.04780805, "balance_loss_mlp": 1.02913237, "epoch": 0.37102059221403877, "flos": 23876071326720.0, "grad_norm": 1.739457755704792, "language_loss": 0.76506341, "learning_rate": 2.8991384069358885e-06, "loss": 0.78674281, "num_input_tokens_seen": 132657635, "step": 6171, "time_per_iteration": 2.6531472206115723 }, { "auxiliary_loss_clip": 0.01112543, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.05081654, "balance_loss_mlp": 1.02546144, "epoch": 0.37108071546670673, "flos": 14501268149760.0, "grad_norm": 2.0084032146250608, "language_loss": 0.80705774, "learning_rate": 2.898790504994232e-06, "loss": 0.82860184, "num_input_tokens_seen": 132674455, "step": 6172, "time_per_iteration": 2.6587960720062256 }, { "auxiliary_loss_clip": 0.01125694, "auxiliary_loss_mlp": 0.01044257, "balance_loss_clip": 1.0475564, "balance_loss_mlp": 1.02747262, "epoch": 0.3711408387193747, "flos": 34562619279360.0, "grad_norm": 2.410153405618026, "language_loss": 0.59260982, "learning_rate": 2.89844256897035e-06, "loss": 0.61430931, "num_input_tokens_seen": 132695140, "step": 6173, "time_per_iteration": 2.738430976867676 }, { "auxiliary_loss_clip": 0.01110933, "auxiliary_loss_mlp": 0.01044385, "balance_loss_clip": 1.04549873, "balance_loss_mlp": 1.02885222, "epoch": 0.37120096197204266, "flos": 17310703432320.0, "grad_norm": 1.954423749693878, "language_loss": 0.80869365, "learning_rate": 2.898094598877435e-06, "loss": 0.83024681, "num_input_tokens_seen": 132712470, "step": 6174, "time_per_iteration": 2.7166690826416016 }, { "auxiliary_loss_clip": 0.01129522, "auxiliary_loss_mlp": 0.01045042, "balance_loss_clip": 1.04628158, "balance_loss_mlp": 1.03025961, "epoch": 0.37126108522471063, "flos": 30664049760000.0, "grad_norm": 2.1592050046005, "language_loss": 0.79910219, "learning_rate": 2.8977465947286826e-06, "loss": 0.82084787, "num_input_tokens_seen": 132732945, "step": 6175, "time_per_iteration": 2.6746280193328857 }, { "auxiliary_loss_clip": 0.011267, "auxiliary_loss_mlp": 0.01053826, "balance_loss_clip": 1.05173898, "balance_loss_mlp": 1.0380547, "epoch": 0.37132120847737865, "flos": 25155640494720.0, "grad_norm": 2.2578092376668315, "language_loss": 0.88735723, "learning_rate": 2.89739855653729e-06, "loss": 0.90916252, "num_input_tokens_seen": 132752470, "step": 6176, "time_per_iteration": 2.6791093349456787 }, { "auxiliary_loss_clip": 0.01124216, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.04811859, "balance_loss_mlp": 1.02713037, "epoch": 0.3713813317300466, "flos": 21213474842880.0, "grad_norm": 1.5716198978013565, "language_loss": 0.73431349, "learning_rate": 2.8970504843164546e-06, "loss": 0.75598538, "num_input_tokens_seen": 132771485, "step": 6177, "time_per_iteration": 2.6808605194091797 }, { "auxiliary_loss_clip": 0.01102086, "auxiliary_loss_mlp": 0.01051929, "balance_loss_clip": 1.04524541, "balance_loss_mlp": 1.03575838, "epoch": 0.3714414549827146, "flos": 21616644072960.0, "grad_norm": 2.0030850547718915, "language_loss": 0.75349051, "learning_rate": 2.896702378079374e-06, "loss": 0.77503073, "num_input_tokens_seen": 132791465, "step": 6178, "time_per_iteration": 2.7112066745758057 }, { "auxiliary_loss_clip": 0.0107122, "auxiliary_loss_mlp": 0.01050415, "balance_loss_clip": 1.04323864, "balance_loss_mlp": 1.03208089, "epoch": 0.37150157823538255, "flos": 19972294335360.0, "grad_norm": 2.0305314414463136, "language_loss": 0.72141892, "learning_rate": 2.8963542378392502e-06, "loss": 0.74263525, "num_input_tokens_seen": 132810160, "step": 6179, "time_per_iteration": 2.7965877056121826 }, { "auxiliary_loss_clip": 0.01137504, "auxiliary_loss_mlp": 0.01046799, "balance_loss_clip": 1.05008841, "balance_loss_mlp": 1.03018165, "epoch": 0.3715617014880505, "flos": 24860562266880.0, "grad_norm": 2.387630814732786, "language_loss": 0.6993162, "learning_rate": 2.896006063609283e-06, "loss": 0.72115916, "num_input_tokens_seen": 132831265, "step": 6180, "time_per_iteration": 2.695232391357422 }, { "auxiliary_loss_clip": 0.01113448, "auxiliary_loss_mlp": 0.01037109, "balance_loss_clip": 1.04914021, "balance_loss_mlp": 1.02208257, "epoch": 0.3716218247407185, "flos": 20449080489600.0, "grad_norm": 2.1080005695464243, "language_loss": 0.77920252, "learning_rate": 2.8956578554026767e-06, "loss": 0.80070812, "num_input_tokens_seen": 132850005, "step": 6181, "time_per_iteration": 2.7087795734405518 }, { "auxiliary_loss_clip": 0.01123157, "auxiliary_loss_mlp": 0.01041815, "balance_loss_clip": 1.05016994, "balance_loss_mlp": 1.02525139, "epoch": 0.37168194799338644, "flos": 24133479166080.0, "grad_norm": 2.570629027716188, "language_loss": 0.79222846, "learning_rate": 2.8953096132326343e-06, "loss": 0.81387818, "num_input_tokens_seen": 132865790, "step": 6182, "time_per_iteration": 2.6541473865509033 }, { "auxiliary_loss_clip": 0.01041849, "auxiliary_loss_mlp": 0.01016945, "balance_loss_clip": 1.03053021, "balance_loss_mlp": 1.01533604, "epoch": 0.3717420712460544, "flos": 67408926900480.0, "grad_norm": 0.7830434308203498, "language_loss": 0.57445002, "learning_rate": 2.894961337112362e-06, "loss": 0.59503794, "num_input_tokens_seen": 132921775, "step": 6183, "time_per_iteration": 3.191969633102417 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.00775242, "balance_loss_clip": 1.04496169, "balance_loss_mlp": 1.00043631, "epoch": 0.37180219449872237, "flos": 22376908362240.0, "grad_norm": 1.9647478507461604, "language_loss": 0.76617277, "learning_rate": 2.894613027055066e-06, "loss": 0.78519297, "num_input_tokens_seen": 132941060, "step": 6184, "time_per_iteration": 2.7096588611602783 }, { "auxiliary_loss_clip": 0.01090654, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.04084587, "balance_loss_mlp": 1.02344596, "epoch": 0.37186231775139034, "flos": 21869885934720.0, "grad_norm": 2.1021072738728717, "language_loss": 0.7217713, "learning_rate": 2.894264683073954e-06, "loss": 0.74306846, "num_input_tokens_seen": 132961850, "step": 6185, "time_per_iteration": 2.739130735397339 }, { "auxiliary_loss_clip": 0.01081138, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.04156423, "balance_loss_mlp": 1.01805878, "epoch": 0.3719224410040583, "flos": 22415225195520.0, "grad_norm": 2.1871647895832496, "language_loss": 0.76805776, "learning_rate": 2.8939163051822363e-06, "loss": 0.78921413, "num_input_tokens_seen": 132981625, "step": 6186, "time_per_iteration": 2.779510259628296 }, { "auxiliary_loss_clip": 0.01131414, "auxiliary_loss_mlp": 0.01042221, "balance_loss_clip": 1.05090106, "balance_loss_mlp": 1.02491212, "epoch": 0.37198256425672627, "flos": 25151223121920.0, "grad_norm": 1.8929067887672733, "language_loss": 0.84037393, "learning_rate": 2.8935678933931224e-06, "loss": 0.86211032, "num_input_tokens_seen": 133001225, "step": 6187, "time_per_iteration": 2.67541241645813 }, { "auxiliary_loss_clip": 0.01120953, "auxiliary_loss_mlp": 0.01040882, "balance_loss_clip": 1.04474545, "balance_loss_mlp": 1.02553999, "epoch": 0.37204268750939423, "flos": 21138313633920.0, "grad_norm": 1.7194664317181616, "language_loss": 0.84831274, "learning_rate": 2.893219447719824e-06, "loss": 0.86993104, "num_input_tokens_seen": 133018820, "step": 6188, "time_per_iteration": 2.6241226196289062 }, { "auxiliary_loss_clip": 0.01108827, "auxiliary_loss_mlp": 0.01040814, "balance_loss_clip": 1.04934168, "balance_loss_mlp": 1.02501917, "epoch": 0.37210281076206225, "flos": 21506829217920.0, "grad_norm": 2.498329305558477, "language_loss": 0.65702367, "learning_rate": 2.8928709681755548e-06, "loss": 0.67852014, "num_input_tokens_seen": 133040205, "step": 6189, "time_per_iteration": 2.724707841873169 }, { "auxiliary_loss_clip": 0.01112219, "auxiliary_loss_mlp": 0.0104713, "balance_loss_clip": 1.0451889, "balance_loss_mlp": 1.03045225, "epoch": 0.3721629340147302, "flos": 17347835116800.0, "grad_norm": 1.9571366893805608, "language_loss": 0.84120989, "learning_rate": 2.8925224547735293e-06, "loss": 0.86280334, "num_input_tokens_seen": 133058095, "step": 6190, "time_per_iteration": 2.719454050064087 }, { "auxiliary_loss_clip": 0.01109992, "auxiliary_loss_mlp": 0.01041587, "balance_loss_clip": 1.0465343, "balance_loss_mlp": 1.02571416, "epoch": 0.3722230572673982, "flos": 16432400073600.0, "grad_norm": 4.021000090429005, "language_loss": 0.87807733, "learning_rate": 2.8921739075269633e-06, "loss": 0.89959311, "num_input_tokens_seen": 133071530, "step": 6191, "time_per_iteration": 2.7081027030944824 }, { "auxiliary_loss_clip": 0.0108777, "auxiliary_loss_mlp": 0.01037991, "balance_loss_clip": 1.04300189, "balance_loss_mlp": 1.01962125, "epoch": 0.37228318052006615, "flos": 22674716023680.0, "grad_norm": 3.7199150853096508, "language_loss": 0.74228656, "learning_rate": 2.891825326449073e-06, "loss": 0.7635442, "num_input_tokens_seen": 133091410, "step": 6192, "time_per_iteration": 2.8161356449127197 }, { "auxiliary_loss_clip": 0.01134777, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.04818201, "balance_loss_mlp": 1.02497888, "epoch": 0.3723433037727341, "flos": 25265491263360.0, "grad_norm": 2.31871347399746, "language_loss": 0.80621845, "learning_rate": 2.8914767115530766e-06, "loss": 0.82796752, "num_input_tokens_seen": 133110365, "step": 6193, "time_per_iteration": 2.661550760269165 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01041083, "balance_loss_clip": 1.04354334, "balance_loss_mlp": 1.02522826, "epoch": 0.3724034270254021, "flos": 10524664333440.0, "grad_norm": 2.475173523724827, "language_loss": 0.84729886, "learning_rate": 2.891128062852194e-06, "loss": 0.86872447, "num_input_tokens_seen": 133128255, "step": 6194, "time_per_iteration": 2.711531400680542 }, { "auxiliary_loss_clip": 0.0111161, "auxiliary_loss_mlp": 0.010372, "balance_loss_clip": 1.04650784, "balance_loss_mlp": 1.02142286, "epoch": 0.37246355027807004, "flos": 20266223328000.0, "grad_norm": 9.44838101604173, "language_loss": 0.77016377, "learning_rate": 2.890779380359646e-06, "loss": 0.79165184, "num_input_tokens_seen": 133143975, "step": 6195, "time_per_iteration": 2.6527512073516846 }, { "auxiliary_loss_clip": 0.01112195, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.0468967, "balance_loss_mlp": 1.02030444, "epoch": 0.372523673530738, "flos": 19500571998720.0, "grad_norm": 1.7021548935758455, "language_loss": 0.79216856, "learning_rate": 2.890430664088655e-06, "loss": 0.81364441, "num_input_tokens_seen": 133162935, "step": 6196, "time_per_iteration": 2.6642892360687256 }, { "auxiliary_loss_clip": 0.01124648, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.04975688, "balance_loss_mlp": 1.0240953, "epoch": 0.372583796783406, "flos": 16764250849920.0, "grad_norm": 2.570886031241156, "language_loss": 0.83998835, "learning_rate": 2.890081914052443e-06, "loss": 0.8616184, "num_input_tokens_seen": 133181180, "step": 6197, "time_per_iteration": 2.627305030822754 }, { "auxiliary_loss_clip": 0.01131102, "auxiliary_loss_mlp": 0.01040963, "balance_loss_clip": 1.04697967, "balance_loss_mlp": 1.02488184, "epoch": 0.37264392003607394, "flos": 22637979388800.0, "grad_norm": 1.697216275583005, "language_loss": 0.64450538, "learning_rate": 2.889733130264237e-06, "loss": 0.66622603, "num_input_tokens_seen": 133199615, "step": 6198, "time_per_iteration": 2.606621503829956 }, { "auxiliary_loss_clip": 0.01120059, "auxiliary_loss_mlp": 0.01044451, "balance_loss_clip": 1.04676938, "balance_loss_mlp": 1.02959776, "epoch": 0.3727040432887419, "flos": 19973120348160.0, "grad_norm": 1.4273324893736263, "language_loss": 0.737185, "learning_rate": 2.889384312737261e-06, "loss": 0.75883007, "num_input_tokens_seen": 133219650, "step": 6199, "time_per_iteration": 2.78157901763916 }, { "auxiliary_loss_clip": 0.01105963, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.04564095, "balance_loss_mlp": 1.02154374, "epoch": 0.37276416654140987, "flos": 63899122279680.0, "grad_norm": 2.2948998309451905, "language_loss": 0.80481982, "learning_rate": 2.889035461484742e-06, "loss": 0.82624996, "num_input_tokens_seen": 133245675, "step": 6200, "time_per_iteration": 3.0623533725738525 }, { "auxiliary_loss_clip": 0.0109608, "auxiliary_loss_mlp": 0.01045798, "balance_loss_clip": 1.04552174, "balance_loss_mlp": 1.03016961, "epoch": 0.37282428979407783, "flos": 39785970211200.0, "grad_norm": 2.0774746879263746, "language_loss": 0.60494614, "learning_rate": 2.88868657651991e-06, "loss": 0.62636495, "num_input_tokens_seen": 133266905, "step": 6201, "time_per_iteration": 2.8960700035095215 }, { "auxiliary_loss_clip": 0.01125447, "auxiliary_loss_mlp": 0.01039384, "balance_loss_clip": 1.0489639, "balance_loss_mlp": 1.02346373, "epoch": 0.37288441304674586, "flos": 22709046447360.0, "grad_norm": 1.870117482164085, "language_loss": 0.72692698, "learning_rate": 2.8883376578559934e-06, "loss": 0.74857527, "num_input_tokens_seen": 133286865, "step": 6202, "time_per_iteration": 4.202298402786255 }, { "auxiliary_loss_clip": 0.01110741, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.04642594, "balance_loss_mlp": 1.01800799, "epoch": 0.3729445362994138, "flos": 18770292587520.0, "grad_norm": 2.0679450432005666, "language_loss": 0.74148834, "learning_rate": 2.8879887055062243e-06, "loss": 0.76292896, "num_input_tokens_seen": 133305295, "step": 6203, "time_per_iteration": 2.7268033027648926 }, { "auxiliary_loss_clip": 0.01106859, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.04595554, "balance_loss_mlp": 1.02524805, "epoch": 0.3730046595520818, "flos": 22456199635200.0, "grad_norm": 1.649450499506288, "language_loss": 0.81921744, "learning_rate": 2.8876397194838353e-06, "loss": 0.84067428, "num_input_tokens_seen": 133324625, "step": 6204, "time_per_iteration": 4.347074747085571 }, { "auxiliary_loss_clip": 0.01123916, "auxiliary_loss_mlp": 0.01044159, "balance_loss_clip": 1.04827762, "balance_loss_mlp": 1.02794707, "epoch": 0.37306478280474975, "flos": 24316372241280.0, "grad_norm": 1.675399556922802, "language_loss": 0.74961317, "learning_rate": 2.8872906998020577e-06, "loss": 0.77129394, "num_input_tokens_seen": 133344625, "step": 6205, "time_per_iteration": 2.66701602935791 }, { "auxiliary_loss_clip": 0.01117233, "auxiliary_loss_mlp": 0.01045323, "balance_loss_clip": 1.04337549, "balance_loss_mlp": 1.02857447, "epoch": 0.3731249060574177, "flos": 15815167741440.0, "grad_norm": 1.8318607259579, "language_loss": 0.7815854, "learning_rate": 2.886941646474128e-06, "loss": 0.80321097, "num_input_tokens_seen": 133363605, "step": 6206, "time_per_iteration": 4.202580451965332 }, { "auxiliary_loss_clip": 0.01134488, "auxiliary_loss_mlp": 0.01039926, "balance_loss_clip": 1.04804325, "balance_loss_mlp": 1.02317739, "epoch": 0.3731850293100857, "flos": 19828077229440.0, "grad_norm": 2.3232535418166256, "language_loss": 0.93322426, "learning_rate": 2.886592559513283e-06, "loss": 0.95496845, "num_input_tokens_seen": 133379405, "step": 6207, "time_per_iteration": 4.318574666976929 }, { "auxiliary_loss_clip": 0.01105421, "auxiliary_loss_mlp": 0.0103386, "balance_loss_clip": 1.0478878, "balance_loss_mlp": 1.01876843, "epoch": 0.37324515256275365, "flos": 19062354072960.0, "grad_norm": 3.0736568130228363, "language_loss": 0.82651198, "learning_rate": 2.886243438932759e-06, "loss": 0.8479048, "num_input_tokens_seen": 133397585, "step": 6208, "time_per_iteration": 2.749662160873413 }, { "auxiliary_loss_clip": 0.01122225, "auxiliary_loss_mlp": 0.0103968, "balance_loss_clip": 1.04488516, "balance_loss_mlp": 1.0223707, "epoch": 0.3733052758154216, "flos": 20704333512960.0, "grad_norm": 2.0157740087962845, "language_loss": 0.73122764, "learning_rate": 2.8858942847457953e-06, "loss": 0.75284666, "num_input_tokens_seen": 133415365, "step": 6209, "time_per_iteration": 2.6315791606903076 }, { "auxiliary_loss_clip": 0.01095649, "auxiliary_loss_mlp": 0.01037134, "balance_loss_clip": 1.04820108, "balance_loss_mlp": 1.02065969, "epoch": 0.3733653990680896, "flos": 20193504243840.0, "grad_norm": 1.9650719997143145, "language_loss": 0.70413053, "learning_rate": 2.8855450969656305e-06, "loss": 0.72545838, "num_input_tokens_seen": 133435700, "step": 6210, "time_per_iteration": 2.7484405040740967 }, { "auxiliary_loss_clip": 0.01072484, "auxiliary_loss_mlp": 0.01045611, "balance_loss_clip": 1.03769457, "balance_loss_mlp": 1.02674007, "epoch": 0.37342552232075754, "flos": 20339660684160.0, "grad_norm": 2.0510282142916427, "language_loss": 0.77773547, "learning_rate": 2.8851958756055073e-06, "loss": 0.79891646, "num_input_tokens_seen": 133455180, "step": 6211, "time_per_iteration": 2.706294536590576 }, { "auxiliary_loss_clip": 0.01122999, "auxiliary_loss_mlp": 0.01042393, "balance_loss_clip": 1.04602683, "balance_loss_mlp": 1.02645469, "epoch": 0.3734856455734255, "flos": 35517879527040.0, "grad_norm": 1.675173432335243, "language_loss": 0.73258781, "learning_rate": 2.884846620678668e-06, "loss": 0.7542417, "num_input_tokens_seen": 133476715, "step": 6212, "time_per_iteration": 2.788787841796875 }, { "auxiliary_loss_clip": 0.01131124, "auxiliary_loss_mlp": 0.01047595, "balance_loss_clip": 1.05055571, "balance_loss_mlp": 1.03106034, "epoch": 0.37354576882609347, "flos": 21142300043520.0, "grad_norm": 1.9808770110660865, "language_loss": 0.81656909, "learning_rate": 2.884497332198356e-06, "loss": 0.83835626, "num_input_tokens_seen": 133494550, "step": 6213, "time_per_iteration": 2.6829304695129395 }, { "auxiliary_loss_clip": 0.01089374, "auxiliary_loss_mlp": 0.01046172, "balance_loss_clip": 1.0412662, "balance_loss_mlp": 1.02843404, "epoch": 0.37360589207876144, "flos": 21506793304320.0, "grad_norm": 2.223600899112558, "language_loss": 0.78999674, "learning_rate": 2.8841480101778167e-06, "loss": 0.81135225, "num_input_tokens_seen": 133512640, "step": 6214, "time_per_iteration": 2.674373149871826 }, { "auxiliary_loss_clip": 0.01109052, "auxiliary_loss_mlp": 0.01044175, "balance_loss_clip": 1.04420567, "balance_loss_mlp": 1.02827835, "epoch": 0.37366601533142946, "flos": 38435800861440.0, "grad_norm": 1.9266500277332215, "language_loss": 0.84611148, "learning_rate": 2.883798654630296e-06, "loss": 0.86764371, "num_input_tokens_seen": 133535540, "step": 6215, "time_per_iteration": 2.8276026248931885 }, { "auxiliary_loss_clip": 0.01100197, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.04435837, "balance_loss_mlp": 1.02298141, "epoch": 0.3737261385840974, "flos": 18441171244800.0, "grad_norm": 1.8731663372997254, "language_loss": 0.67690969, "learning_rate": 2.8834492655690423e-06, "loss": 0.69830984, "num_input_tokens_seen": 133555795, "step": 6216, "time_per_iteration": 2.724090576171875 }, { "auxiliary_loss_clip": 0.01111654, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.045977, "balance_loss_mlp": 1.02578092, "epoch": 0.3737862618367654, "flos": 22929861306240.0, "grad_norm": 2.3172976096058853, "language_loss": 0.65993899, "learning_rate": 2.883099843007303e-06, "loss": 0.68148154, "num_input_tokens_seen": 133575905, "step": 6217, "time_per_iteration": 2.7126269340515137 }, { "auxiliary_loss_clip": 0.01115905, "auxiliary_loss_mlp": 0.01039702, "balance_loss_clip": 1.0483315, "balance_loss_mlp": 1.02264857, "epoch": 0.37384638508943335, "flos": 15409664127360.0, "grad_norm": 2.0273109551694777, "language_loss": 0.80449212, "learning_rate": 2.88275038695833e-06, "loss": 0.82604814, "num_input_tokens_seen": 133592585, "step": 6218, "time_per_iteration": 2.680894374847412 }, { "auxiliary_loss_clip": 0.01115539, "auxiliary_loss_mlp": 0.0103289, "balance_loss_clip": 1.04488862, "balance_loss_mlp": 1.01760781, "epoch": 0.3739065083421013, "flos": 24280820755200.0, "grad_norm": 1.5960804840892617, "language_loss": 0.78692639, "learning_rate": 2.8824008974353736e-06, "loss": 0.80841064, "num_input_tokens_seen": 133615070, "step": 6219, "time_per_iteration": 2.6683976650238037 }, { "auxiliary_loss_clip": 0.01107805, "auxiliary_loss_mlp": 0.01040758, "balance_loss_clip": 1.04602623, "balance_loss_mlp": 1.0247364, "epoch": 0.3739666315947693, "flos": 23002831785600.0, "grad_norm": 1.8103875982928064, "language_loss": 0.77023458, "learning_rate": 2.8820513744516866e-06, "loss": 0.79172027, "num_input_tokens_seen": 133633490, "step": 6220, "time_per_iteration": 2.670686960220337 }, { "auxiliary_loss_clip": 0.01105245, "auxiliary_loss_mlp": 0.01041158, "balance_loss_clip": 1.04717016, "balance_loss_mlp": 1.02473164, "epoch": 0.37402675484743725, "flos": 19391116279680.0, "grad_norm": 3.4153989861378204, "language_loss": 0.8298834, "learning_rate": 2.8817018180205235e-06, "loss": 0.85134745, "num_input_tokens_seen": 133653425, "step": 6221, "time_per_iteration": 2.730738401412964 }, { "auxiliary_loss_clip": 0.01108391, "auxiliary_loss_mlp": 0.01043965, "balance_loss_clip": 1.04499435, "balance_loss_mlp": 1.02825367, "epoch": 0.3740868781001052, "flos": 17126158331520.0, "grad_norm": 1.9668982067313725, "language_loss": 0.75944567, "learning_rate": 2.8813522281551387e-06, "loss": 0.78096926, "num_input_tokens_seen": 133670220, "step": 6222, "time_per_iteration": 2.62052321434021 }, { "auxiliary_loss_clip": 0.01103117, "auxiliary_loss_mlp": 0.00772891, "balance_loss_clip": 1.04785156, "balance_loss_mlp": 1.00029564, "epoch": 0.3741470013527732, "flos": 20043505048320.0, "grad_norm": 1.8881600065301847, "language_loss": 0.70621789, "learning_rate": 2.881002604868789e-06, "loss": 0.72497797, "num_input_tokens_seen": 133688910, "step": 6223, "time_per_iteration": 2.7686285972595215 }, { "auxiliary_loss_clip": 0.01104752, "auxiliary_loss_mlp": 0.01035203, "balance_loss_clip": 1.05155015, "balance_loss_mlp": 1.02057576, "epoch": 0.37420712460544114, "flos": 36897279569280.0, "grad_norm": 2.1852519558340644, "language_loss": 0.6875304, "learning_rate": 2.8806529481747325e-06, "loss": 0.7089299, "num_input_tokens_seen": 133708690, "step": 6224, "time_per_iteration": 2.817263126373291 }, { "auxiliary_loss_clip": 0.01091747, "auxiliary_loss_mlp": 0.01036393, "balance_loss_clip": 1.04859614, "balance_loss_mlp": 1.02059817, "epoch": 0.3742672478581091, "flos": 22201198007040.0, "grad_norm": 2.246642459489035, "language_loss": 0.70192593, "learning_rate": 2.880303258086228e-06, "loss": 0.72320735, "num_input_tokens_seen": 133728095, "step": 6225, "time_per_iteration": 2.785083532333374 }, { "auxiliary_loss_clip": 0.01088757, "auxiliary_loss_mlp": 0.01048544, "balance_loss_clip": 1.04366183, "balance_loss_mlp": 1.03175974, "epoch": 0.3743273711107771, "flos": 24681547860480.0, "grad_norm": 2.1768682992812236, "language_loss": 0.7896018, "learning_rate": 2.879953534616536e-06, "loss": 0.81097472, "num_input_tokens_seen": 133745590, "step": 6226, "time_per_iteration": 2.7403974533081055 }, { "auxiliary_loss_clip": 0.01105293, "auxiliary_loss_mlp": 0.01039029, "balance_loss_clip": 1.04631484, "balance_loss_mlp": 1.02303696, "epoch": 0.37438749436344504, "flos": 24459619680000.0, "grad_norm": 1.7825799805329443, "language_loss": 0.67965841, "learning_rate": 2.879603777778917e-06, "loss": 0.70110166, "num_input_tokens_seen": 133766155, "step": 6227, "time_per_iteration": 2.6975693702697754 }, { "auxiliary_loss_clip": 0.01099252, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.04493213, "balance_loss_mlp": 1.01890039, "epoch": 0.374447617616113, "flos": 21798747048960.0, "grad_norm": 1.9005486801766094, "language_loss": 0.829476, "learning_rate": 2.879253987586635e-06, "loss": 0.85081351, "num_input_tokens_seen": 133783185, "step": 6228, "time_per_iteration": 2.7754271030426025 }, { "auxiliary_loss_clip": 0.01090082, "auxiliary_loss_mlp": 0.01048677, "balance_loss_clip": 1.04396605, "balance_loss_mlp": 1.03159404, "epoch": 0.374507740868781, "flos": 17968191932160.0, "grad_norm": 1.6406992237121778, "language_loss": 0.74450547, "learning_rate": 2.8789041640529535e-06, "loss": 0.76589304, "num_input_tokens_seen": 133800975, "step": 6229, "time_per_iteration": 2.6378824710845947 }, { "auxiliary_loss_clip": 0.0109707, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.0470053, "balance_loss_mlp": 1.01971197, "epoch": 0.374567864121449, "flos": 16105828596480.0, "grad_norm": 2.127994694324029, "language_loss": 0.83782691, "learning_rate": 2.8785543071911383e-06, "loss": 0.85915756, "num_input_tokens_seen": 133818020, "step": 6230, "time_per_iteration": 2.6857657432556152 }, { "auxiliary_loss_clip": 0.0112393, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.04905128, "balance_loss_mlp": 1.02556968, "epoch": 0.37462798737411696, "flos": 25773160135680.0, "grad_norm": 2.8382818326589145, "language_loss": 0.735865, "learning_rate": 2.878204417014456e-06, "loss": 0.75752056, "num_input_tokens_seen": 133840690, "step": 6231, "time_per_iteration": 2.7082016468048096 }, { "auxiliary_loss_clip": 0.0112579, "auxiliary_loss_mlp": 0.01046917, "balance_loss_clip": 1.05376148, "balance_loss_mlp": 1.03075266, "epoch": 0.3746881106267849, "flos": 16654507822080.0, "grad_norm": 2.9683381932525665, "language_loss": 0.7412858, "learning_rate": 2.8778544935361735e-06, "loss": 0.76301289, "num_input_tokens_seen": 133858350, "step": 6232, "time_per_iteration": 2.5764057636260986 }, { "auxiliary_loss_clip": 0.01106131, "auxiliary_loss_mlp": 0.01039245, "balance_loss_clip": 1.04461622, "balance_loss_mlp": 1.02237701, "epoch": 0.3747482338794529, "flos": 26177981391360.0, "grad_norm": 2.121427790242168, "language_loss": 0.77296579, "learning_rate": 2.877504536769561e-06, "loss": 0.79441959, "num_input_tokens_seen": 133879775, "step": 6233, "time_per_iteration": 2.692286252975464 }, { "auxiliary_loss_clip": 0.01118513, "auxiliary_loss_mlp": 0.01040639, "balance_loss_clip": 1.05093503, "balance_loss_mlp": 1.024593, "epoch": 0.37480835713212085, "flos": 12021061950720.0, "grad_norm": 1.8446337373318833, "language_loss": 0.69493848, "learning_rate": 2.8771545467278883e-06, "loss": 0.71652997, "num_input_tokens_seen": 133898295, "step": 6234, "time_per_iteration": 2.658332586288452 }, { "auxiliary_loss_clip": 0.01123531, "auxiliary_loss_mlp": 0.01042963, "balance_loss_clip": 1.04885483, "balance_loss_mlp": 1.02833033, "epoch": 0.3748684803847888, "flos": 19679263182720.0, "grad_norm": 1.9015387878630694, "language_loss": 0.82462788, "learning_rate": 2.8768045234244276e-06, "loss": 0.84629285, "num_input_tokens_seen": 133915230, "step": 6235, "time_per_iteration": 2.591198682785034 }, { "auxiliary_loss_clip": 0.01140927, "auxiliary_loss_mlp": 0.0103602, "balance_loss_clip": 1.05301189, "balance_loss_mlp": 1.02021289, "epoch": 0.3749286036374568, "flos": 20521189042560.0, "grad_norm": 1.8869628373328378, "language_loss": 0.78439927, "learning_rate": 2.8764544668724517e-06, "loss": 0.80616879, "num_input_tokens_seen": 133934110, "step": 6236, "time_per_iteration": 2.6754372119903564 }, { "auxiliary_loss_clip": 0.01118225, "auxiliary_loss_mlp": 0.01050242, "balance_loss_clip": 1.04519606, "balance_loss_mlp": 1.03202713, "epoch": 0.37498872689012475, "flos": 20704620821760.0, "grad_norm": 2.0770406770017242, "language_loss": 0.74357057, "learning_rate": 2.876104377085234e-06, "loss": 0.76525521, "num_input_tokens_seen": 133952395, "step": 6237, "time_per_iteration": 2.6760342121124268 }, { "auxiliary_loss_clip": 0.01114513, "auxiliary_loss_mlp": 0.00773766, "balance_loss_clip": 1.04626942, "balance_loss_mlp": 1.00036037, "epoch": 0.3750488501427927, "flos": 21574843620480.0, "grad_norm": 2.0699756536584633, "language_loss": 0.93258965, "learning_rate": 2.8757542540760508e-06, "loss": 0.95147252, "num_input_tokens_seen": 133969635, "step": 6238, "time_per_iteration": 2.6805243492126465 }, { "auxiliary_loss_clip": 0.01137619, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.04995167, "balance_loss_mlp": 1.02081275, "epoch": 0.3751089733954607, "flos": 15923869274880.0, "grad_norm": 2.3841921025147284, "language_loss": 0.70885909, "learning_rate": 2.8754040978581777e-06, "loss": 0.73060858, "num_input_tokens_seen": 133987215, "step": 6239, "time_per_iteration": 2.548285961151123 }, { "auxiliary_loss_clip": 0.01068531, "auxiliary_loss_mlp": 0.01040031, "balance_loss_clip": 1.04656243, "balance_loss_mlp": 1.02303219, "epoch": 0.37516909664812864, "flos": 36284644177920.0, "grad_norm": 1.601808094344726, "language_loss": 0.65752542, "learning_rate": 2.875053908444895e-06, "loss": 0.67861104, "num_input_tokens_seen": 134009250, "step": 6240, "time_per_iteration": 3.016897201538086 }, { "auxiliary_loss_clip": 0.01101858, "auxiliary_loss_mlp": 0.00773445, "balance_loss_clip": 1.04618907, "balance_loss_mlp": 1.00033951, "epoch": 0.3752292199007966, "flos": 13515915283200.0, "grad_norm": 2.721418670308367, "language_loss": 0.75816065, "learning_rate": 2.8747036858494795e-06, "loss": 0.7769137, "num_input_tokens_seen": 134026875, "step": 6241, "time_per_iteration": 4.402552843093872 }, { "auxiliary_loss_clip": 0.01103844, "auxiliary_loss_mlp": 0.01044119, "balance_loss_clip": 1.04654765, "balance_loss_mlp": 1.0276264, "epoch": 0.3752893431534646, "flos": 27198095644800.0, "grad_norm": 2.108703330368865, "language_loss": 0.83791685, "learning_rate": 2.874353430085213e-06, "loss": 0.85939646, "num_input_tokens_seen": 134047185, "step": 6242, "time_per_iteration": 2.7508704662323 }, { "auxiliary_loss_clip": 0.01110348, "auxiliary_loss_mlp": 0.01048171, "balance_loss_clip": 1.04799628, "balance_loss_mlp": 1.03319848, "epoch": 0.3753494664061326, "flos": 30007674581760.0, "grad_norm": 2.4924519814208774, "language_loss": 0.68438506, "learning_rate": 2.8740031411653766e-06, "loss": 0.70597029, "num_input_tokens_seen": 134067330, "step": 6243, "time_per_iteration": 2.7814478874206543 }, { "auxiliary_loss_clip": 0.01056696, "auxiliary_loss_mlp": 0.00776554, "balance_loss_clip": 1.04175019, "balance_loss_mlp": 1.00038528, "epoch": 0.37540958965880056, "flos": 24461954064000.0, "grad_norm": 1.7699519682943652, "language_loss": 0.84165168, "learning_rate": 2.8736528191032535e-06, "loss": 0.85998416, "num_input_tokens_seen": 134085525, "step": 6244, "time_per_iteration": 4.510041952133179 }, { "auxiliary_loss_clip": 0.01074238, "auxiliary_loss_mlp": 0.01042872, "balance_loss_clip": 1.03981614, "balance_loss_mlp": 1.02712417, "epoch": 0.3754697129114685, "flos": 16508387295360.0, "grad_norm": 2.7453088605805616, "language_loss": 0.82679987, "learning_rate": 2.8733024639121277e-06, "loss": 0.84797096, "num_input_tokens_seen": 134101855, "step": 6245, "time_per_iteration": 4.745215654373169 }, { "auxiliary_loss_clip": 0.01096909, "auxiliary_loss_mlp": 0.0104658, "balance_loss_clip": 1.04049206, "balance_loss_mlp": 1.0296756, "epoch": 0.3755298361641365, "flos": 19390900798080.0, "grad_norm": 8.46557879021872, "language_loss": 0.63902843, "learning_rate": 2.8729520756052853e-06, "loss": 0.66046333, "num_input_tokens_seen": 134119360, "step": 6246, "time_per_iteration": 4.33053731918335 }, { "auxiliary_loss_clip": 0.01112093, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.04961443, "balance_loss_mlp": 1.0264082, "epoch": 0.37558995941680445, "flos": 14720395069440.0, "grad_norm": 2.0508038288587183, "language_loss": 0.74467009, "learning_rate": 2.8726016541960124e-06, "loss": 0.76622653, "num_input_tokens_seen": 134137475, "step": 6247, "time_per_iteration": 2.688081979751587 }, { "auxiliary_loss_clip": 0.01126872, "auxiliary_loss_mlp": 0.01037368, "balance_loss_clip": 1.05022037, "balance_loss_mlp": 1.02133489, "epoch": 0.3756500826694724, "flos": 21689901861120.0, "grad_norm": 2.703785960910372, "language_loss": 0.5497098, "learning_rate": 2.872251199697598e-06, "loss": 0.57135224, "num_input_tokens_seen": 134154580, "step": 6248, "time_per_iteration": 2.6308822631835938 }, { "auxiliary_loss_clip": 0.01117073, "auxiliary_loss_mlp": 0.01036379, "balance_loss_clip": 1.04465234, "balance_loss_mlp": 1.0200597, "epoch": 0.3757102059221404, "flos": 26505666190080.0, "grad_norm": 4.209721572066423, "language_loss": 0.84492457, "learning_rate": 2.8719007121233297e-06, "loss": 0.86645913, "num_input_tokens_seen": 134174285, "step": 6249, "time_per_iteration": 2.6539809703826904 }, { "auxiliary_loss_clip": 0.01107733, "auxiliary_loss_mlp": 0.01035495, "balance_loss_clip": 1.04784632, "balance_loss_mlp": 1.01956248, "epoch": 0.37577032917480835, "flos": 37338083274240.0, "grad_norm": 1.546160982958922, "language_loss": 0.67701882, "learning_rate": 2.8715501914864993e-06, "loss": 0.69845104, "num_input_tokens_seen": 134195940, "step": 6250, "time_per_iteration": 2.787398338317871 }, { "auxiliary_loss_clip": 0.01117019, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.04946029, "balance_loss_mlp": 1.0293386, "epoch": 0.3758304524274763, "flos": 21908597817600.0, "grad_norm": 1.960309683567346, "language_loss": 0.77824795, "learning_rate": 2.8711996378003987e-06, "loss": 0.79986179, "num_input_tokens_seen": 134212235, "step": 6251, "time_per_iteration": 2.7143123149871826 }, { "auxiliary_loss_clip": 0.01121024, "auxiliary_loss_mlp": 0.01039102, "balance_loss_clip": 1.04994178, "balance_loss_mlp": 1.0236522, "epoch": 0.3758905756801443, "flos": 36569343375360.0, "grad_norm": 2.527016245081176, "language_loss": 0.58002663, "learning_rate": 2.8708490510783203e-06, "loss": 0.60162789, "num_input_tokens_seen": 134233810, "step": 6252, "time_per_iteration": 2.716597557067871 }, { "auxiliary_loss_clip": 0.01116459, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.05007291, "balance_loss_mlp": 1.0260098, "epoch": 0.37595069893281224, "flos": 24528783317760.0, "grad_norm": 4.856643583290163, "language_loss": 0.89482141, "learning_rate": 2.8704984313335584e-06, "loss": 0.91641152, "num_input_tokens_seen": 134252020, "step": 6253, "time_per_iteration": 2.701361894607544 }, { "auxiliary_loss_clip": 0.01098154, "auxiliary_loss_mlp": 0.01040398, "balance_loss_clip": 1.04815936, "balance_loss_mlp": 1.02562761, "epoch": 0.3760108221854802, "flos": 16435021766400.0, "grad_norm": 2.218099502464204, "language_loss": 0.76568806, "learning_rate": 2.8701477785794097e-06, "loss": 0.78707361, "num_input_tokens_seen": 134269495, "step": 6254, "time_per_iteration": 2.6995303630828857 }, { "auxiliary_loss_clip": 0.01096995, "auxiliary_loss_mlp": 0.01043484, "balance_loss_clip": 1.04379475, "balance_loss_mlp": 1.02628207, "epoch": 0.37607094543814823, "flos": 13771742924160.0, "grad_norm": 2.131769376763656, "language_loss": 0.6180023, "learning_rate": 2.869797092829169e-06, "loss": 0.6394071, "num_input_tokens_seen": 134287035, "step": 6255, "time_per_iteration": 2.7164864540100098 }, { "auxiliary_loss_clip": 0.01127282, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.04883361, "balance_loss_mlp": 1.02017426, "epoch": 0.3761310686908162, "flos": 19857918453120.0, "grad_norm": 2.6629341180561545, "language_loss": 0.74404681, "learning_rate": 2.869446374096135e-06, "loss": 0.76568639, "num_input_tokens_seen": 134304840, "step": 6256, "time_per_iteration": 2.588169574737549 }, { "auxiliary_loss_clip": 0.01127124, "auxiliary_loss_mlp": 0.01046358, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 1.02977645, "epoch": 0.37619119194348416, "flos": 12750802657920.0, "grad_norm": 2.3087979716808937, "language_loss": 0.702447, "learning_rate": 2.8690956223936088e-06, "loss": 0.72418177, "num_input_tokens_seen": 134323180, "step": 6257, "time_per_iteration": 2.701555013656616 }, { "auxiliary_loss_clip": 0.01110787, "auxiliary_loss_mlp": 0.01033343, "balance_loss_clip": 1.04812109, "balance_loss_mlp": 1.01796508, "epoch": 0.3762513151961521, "flos": 17530548624000.0, "grad_norm": 1.673769537318751, "language_loss": 0.84842372, "learning_rate": 2.868744837734889e-06, "loss": 0.86986494, "num_input_tokens_seen": 134341390, "step": 6258, "time_per_iteration": 2.6336703300476074 }, { "auxiliary_loss_clip": 0.01091689, "auxiliary_loss_mlp": 0.01041654, "balance_loss_clip": 1.04571128, "balance_loss_mlp": 1.0271697, "epoch": 0.3763114384488201, "flos": 23617406511360.0, "grad_norm": 1.4940028515654036, "language_loss": 0.80920124, "learning_rate": 2.868394020133277e-06, "loss": 0.83053464, "num_input_tokens_seen": 134360425, "step": 6259, "time_per_iteration": 2.752392053604126 }, { "auxiliary_loss_clip": 0.01093234, "auxiliary_loss_mlp": 0.01046443, "balance_loss_clip": 1.04547083, "balance_loss_mlp": 1.02969444, "epoch": 0.37637156170148806, "flos": 25406978935680.0, "grad_norm": 2.4951694968605627, "language_loss": 0.71285564, "learning_rate": 2.8680431696020783e-06, "loss": 0.73425239, "num_input_tokens_seen": 134379775, "step": 6260, "time_per_iteration": 2.782561779022217 }, { "auxiliary_loss_clip": 0.01107136, "auxiliary_loss_mlp": 0.01039319, "balance_loss_clip": 1.04386747, "balance_loss_mlp": 1.02305889, "epoch": 0.376431684954156, "flos": 23440906056960.0, "grad_norm": 1.627422352949978, "language_loss": 0.78342533, "learning_rate": 2.867692286154594e-06, "loss": 0.80488986, "num_input_tokens_seen": 134400315, "step": 6261, "time_per_iteration": 2.6978867053985596 }, { "auxiliary_loss_clip": 0.01112259, "auxiliary_loss_mlp": 0.01048861, "balance_loss_clip": 1.04744315, "balance_loss_mlp": 1.0312773, "epoch": 0.376491808206824, "flos": 34204482725760.0, "grad_norm": 2.418447947297228, "language_loss": 0.80871278, "learning_rate": 2.867341369804132e-06, "loss": 0.83032399, "num_input_tokens_seen": 134422875, "step": 6262, "time_per_iteration": 2.852675437927246 }, { "auxiliary_loss_clip": 0.01115101, "auxiliary_loss_mlp": 0.01038136, "balance_loss_clip": 1.04584765, "balance_loss_mlp": 1.02277565, "epoch": 0.37655193145949195, "flos": 35185669614720.0, "grad_norm": 2.9875520790285774, "language_loss": 0.80295742, "learning_rate": 2.866990420563998e-06, "loss": 0.82448983, "num_input_tokens_seen": 134443025, "step": 6263, "time_per_iteration": 2.785395622253418 }, { "auxiliary_loss_clip": 0.01140252, "auxiliary_loss_mlp": 0.01045838, "balance_loss_clip": 1.05247605, "balance_loss_mlp": 1.0300312, "epoch": 0.3766120547121599, "flos": 16761844638720.0, "grad_norm": 2.896352989954936, "language_loss": 0.79601765, "learning_rate": 2.866639438447501e-06, "loss": 0.81787854, "num_input_tokens_seen": 134460945, "step": 6264, "time_per_iteration": 2.581125497817993 }, { "auxiliary_loss_clip": 0.01133548, "auxiliary_loss_mlp": 0.0105155, "balance_loss_clip": 1.04770851, "balance_loss_mlp": 1.03557551, "epoch": 0.3766721779648279, "flos": 23550361776000.0, "grad_norm": 2.0921625870578913, "language_loss": 0.73808366, "learning_rate": 2.8662884234679497e-06, "loss": 0.75993466, "num_input_tokens_seen": 134480440, "step": 6265, "time_per_iteration": 2.6998226642608643 }, { "auxiliary_loss_clip": 0.01123221, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.05005145, "balance_loss_mlp": 1.02543402, "epoch": 0.37673230121749585, "flos": 29129191655040.0, "grad_norm": 1.9744000825782282, "language_loss": 0.68550873, "learning_rate": 2.865937375638654e-06, "loss": 0.70713472, "num_input_tokens_seen": 134501110, "step": 6266, "time_per_iteration": 2.6934731006622314 }, { "auxiliary_loss_clip": 0.01128105, "auxiliary_loss_mlp": 0.01041187, "balance_loss_clip": 1.04846668, "balance_loss_mlp": 1.02536833, "epoch": 0.3767924244701638, "flos": 28146783703680.0, "grad_norm": 3.437883319374573, "language_loss": 0.63078731, "learning_rate": 2.8655862949729264e-06, "loss": 0.65248024, "num_input_tokens_seen": 134522460, "step": 6267, "time_per_iteration": 2.7006735801696777 }, { "auxiliary_loss_clip": 0.01050407, "auxiliary_loss_mlp": 0.01011452, "balance_loss_clip": 1.02822745, "balance_loss_mlp": 1.00960469, "epoch": 0.37685254772283183, "flos": 60797197526400.0, "grad_norm": 0.7198108741876666, "language_loss": 0.58852816, "learning_rate": 2.8652351814840795e-06, "loss": 0.60914677, "num_input_tokens_seen": 134589545, "step": 6268, "time_per_iteration": 3.355120897293091 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01043603, "balance_loss_clip": 1.05033755, "balance_loss_mlp": 1.02698505, "epoch": 0.3769126709754998, "flos": 26032543223040.0, "grad_norm": 2.34128493463531, "language_loss": 0.65263468, "learning_rate": 2.8648840351854283e-06, "loss": 0.67444575, "num_input_tokens_seen": 134610550, "step": 6269, "time_per_iteration": 2.656585931777954 }, { "auxiliary_loss_clip": 0.01099912, "auxiliary_loss_mlp": 0.01041008, "balance_loss_clip": 1.04970932, "balance_loss_mlp": 1.02536798, "epoch": 0.37697279422816776, "flos": 23579879777280.0, "grad_norm": 1.5250715006737088, "language_loss": 0.7069717, "learning_rate": 2.8645328560902874e-06, "loss": 0.72838092, "num_input_tokens_seen": 134630485, "step": 6270, "time_per_iteration": 2.7498419284820557 }, { "auxiliary_loss_clip": 0.01059818, "auxiliary_loss_mlp": 0.01007405, "balance_loss_clip": 1.02900875, "balance_loss_mlp": 1.00581956, "epoch": 0.3770329174808357, "flos": 64745935367040.0, "grad_norm": 0.7193704591933474, "language_loss": 0.56122422, "learning_rate": 2.8641816442119746e-06, "loss": 0.58189648, "num_input_tokens_seen": 134693510, "step": 6271, "time_per_iteration": 3.1569089889526367 }, { "auxiliary_loss_clip": 0.01121208, "auxiliary_loss_mlp": 0.01042721, "balance_loss_clip": 1.04645681, "balance_loss_mlp": 1.02609181, "epoch": 0.3770930407335037, "flos": 21835304115840.0, "grad_norm": 2.1611051517344246, "language_loss": 0.79855239, "learning_rate": 2.8638303995638066e-06, "loss": 0.82019162, "num_input_tokens_seen": 134713115, "step": 6272, "time_per_iteration": 2.628180742263794 }, { "auxiliary_loss_clip": 0.01118748, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.0451988, "balance_loss_mlp": 1.01934206, "epoch": 0.37715316398617166, "flos": 22747901984640.0, "grad_norm": 2.0954681641544304, "language_loss": 0.73789483, "learning_rate": 2.863479122159103e-06, "loss": 0.75941932, "num_input_tokens_seen": 134732635, "step": 6273, "time_per_iteration": 2.7064390182495117 }, { "auxiliary_loss_clip": 0.01117899, "auxiliary_loss_mlp": 0.01044408, "balance_loss_clip": 1.04745209, "balance_loss_mlp": 1.02905381, "epoch": 0.3772132872388396, "flos": 18914581520640.0, "grad_norm": 1.6440580648783938, "language_loss": 0.71867502, "learning_rate": 2.8631278120111858e-06, "loss": 0.74029803, "num_input_tokens_seen": 134750695, "step": 6274, "time_per_iteration": 2.650559186935425 }, { "auxiliary_loss_clip": 0.01105418, "auxiliary_loss_mlp": 0.01040714, "balance_loss_clip": 1.04509926, "balance_loss_mlp": 1.02567029, "epoch": 0.3772734104915076, "flos": 17346219004800.0, "grad_norm": 1.9251108643001593, "language_loss": 0.83620244, "learning_rate": 2.8627764691333742e-06, "loss": 0.85766381, "num_input_tokens_seen": 134768935, "step": 6275, "time_per_iteration": 2.662346839904785 }, { "auxiliary_loss_clip": 0.01077547, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.04383206, "balance_loss_mlp": 1.02238655, "epoch": 0.37733353374417555, "flos": 32342370785280.0, "grad_norm": 1.4850375213112275, "language_loss": 0.75779188, "learning_rate": 2.8624250935389935e-06, "loss": 0.77892679, "num_input_tokens_seen": 134791260, "step": 6276, "time_per_iteration": 2.824374198913574 }, { "auxiliary_loss_clip": 0.01109985, "auxiliary_loss_mlp": 0.01039728, "balance_loss_clip": 1.04301822, "balance_loss_mlp": 1.02318192, "epoch": 0.3773936569968435, "flos": 23360681030400.0, "grad_norm": 1.996464283971086, "language_loss": 0.85758084, "learning_rate": 2.862073685241366e-06, "loss": 0.87907803, "num_input_tokens_seen": 134808350, "step": 6277, "time_per_iteration": 2.6880812644958496 }, { "auxiliary_loss_clip": 0.01123239, "auxiliary_loss_mlp": 0.01035838, "balance_loss_clip": 1.04981339, "balance_loss_mlp": 1.02147365, "epoch": 0.3774537802495115, "flos": 21466788531840.0, "grad_norm": 2.8692620956149613, "language_loss": 0.78788501, "learning_rate": 2.861722244253818e-06, "loss": 0.80947578, "num_input_tokens_seen": 134826005, "step": 6278, "time_per_iteration": 2.6566152572631836 }, { "auxiliary_loss_clip": 0.01104603, "auxiliary_loss_mlp": 0.01044359, "balance_loss_clip": 1.04592609, "balance_loss_mlp": 1.02740717, "epoch": 0.37751390350217945, "flos": 24973717086720.0, "grad_norm": 2.420687530183356, "language_loss": 0.8289634, "learning_rate": 2.8613707705896767e-06, "loss": 0.85045302, "num_input_tokens_seen": 134844995, "step": 6279, "time_per_iteration": 2.732966899871826 }, { "auxiliary_loss_clip": 0.01110227, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.04498839, "balance_loss_mlp": 1.02520263, "epoch": 0.3775740267548474, "flos": 27819098904960.0, "grad_norm": 5.36242068768128, "language_loss": 0.74968797, "learning_rate": 2.861019264262269e-06, "loss": 0.77118295, "num_input_tokens_seen": 134865285, "step": 6280, "time_per_iteration": 4.266780376434326 }, { "auxiliary_loss_clip": 0.01130932, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.04845715, "balance_loss_mlp": 1.02235854, "epoch": 0.3776341500075154, "flos": 22565224391040.0, "grad_norm": 1.4530407212668277, "language_loss": 0.76169163, "learning_rate": 2.8606677252849242e-06, "loss": 0.7833612, "num_input_tokens_seen": 134886535, "step": 6281, "time_per_iteration": 2.649930477142334 }, { "auxiliary_loss_clip": 0.01101629, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.04291892, "balance_loss_mlp": 1.02471018, "epoch": 0.3776942732601834, "flos": 23077238808960.0, "grad_norm": 2.430303484367767, "language_loss": 0.83814883, "learning_rate": 2.860316153670974e-06, "loss": 0.85956836, "num_input_tokens_seen": 134907435, "step": 6282, "time_per_iteration": 2.6882312297821045 }, { "auxiliary_loss_clip": 0.0111945, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04452085, "balance_loss_mlp": 1.02134025, "epoch": 0.37775439651285136, "flos": 21724411852800.0, "grad_norm": 2.5787880774083725, "language_loss": 0.698241, "learning_rate": 2.8599645494337484e-06, "loss": 0.71980345, "num_input_tokens_seen": 134925360, "step": 6283, "time_per_iteration": 4.2020978927612305 }, { "auxiliary_loss_clip": 0.01072442, "auxiliary_loss_mlp": 0.01052062, "balance_loss_clip": 1.04226279, "balance_loss_mlp": 1.03394175, "epoch": 0.37781451976551933, "flos": 23987753688960.0, "grad_norm": 2.007181392308561, "language_loss": 0.76503819, "learning_rate": 2.859612912586581e-06, "loss": 0.78628325, "num_input_tokens_seen": 134944205, "step": 6284, "time_per_iteration": 4.349794387817383 }, { "auxiliary_loss_clip": 0.01142581, "auxiliary_loss_mlp": 0.01033355, "balance_loss_clip": 1.05249381, "balance_loss_mlp": 1.01713097, "epoch": 0.3778746430181873, "flos": 13727967223680.0, "grad_norm": 2.7318562260547554, "language_loss": 0.85677552, "learning_rate": 2.8592612431428055e-06, "loss": 0.87853491, "num_input_tokens_seen": 134960255, "step": 6285, "time_per_iteration": 2.6949870586395264 }, { "auxiliary_loss_clip": 0.01111269, "auxiliary_loss_mlp": 0.01042933, "balance_loss_clip": 1.04731882, "balance_loss_mlp": 1.02694702, "epoch": 0.37793476627085526, "flos": 19460495399040.0, "grad_norm": 1.8544385642750592, "language_loss": 0.84419537, "learning_rate": 2.858909541115758e-06, "loss": 0.86573738, "num_input_tokens_seen": 134978605, "step": 6286, "time_per_iteration": 4.541024684906006 }, { "auxiliary_loss_clip": 0.01120151, "auxiliary_loss_mlp": 0.01043503, "balance_loss_clip": 1.05024576, "balance_loss_mlp": 1.0280652, "epoch": 0.3779948895235232, "flos": 10707018704640.0, "grad_norm": 2.400905995704231, "language_loss": 0.81738019, "learning_rate": 2.858557806518775e-06, "loss": 0.83901674, "num_input_tokens_seen": 134995020, "step": 6287, "time_per_iteration": 2.6611125469207764 }, { "auxiliary_loss_clip": 0.01118978, "auxiliary_loss_mlp": 0.01041796, "balance_loss_clip": 1.04537022, "balance_loss_mlp": 1.02645934, "epoch": 0.3780550127761912, "flos": 22310007281280.0, "grad_norm": 3.0671932020533133, "language_loss": 0.73071134, "learning_rate": 2.8582060393651927e-06, "loss": 0.7523191, "num_input_tokens_seen": 135012620, "step": 6288, "time_per_iteration": 2.6759073734283447 }, { "auxiliary_loss_clip": 0.01124666, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.05113983, "balance_loss_mlp": 1.02115071, "epoch": 0.37811513602885916, "flos": 28950644125440.0, "grad_norm": 1.9644960153972613, "language_loss": 0.75616127, "learning_rate": 2.857854239668352e-06, "loss": 0.77777576, "num_input_tokens_seen": 135033365, "step": 6289, "time_per_iteration": 2.656367778778076 }, { "auxiliary_loss_clip": 0.0112159, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.04737473, "balance_loss_mlp": 1.02025056, "epoch": 0.3781752592815271, "flos": 23112933949440.0, "grad_norm": 1.7941331023092641, "language_loss": 0.73271513, "learning_rate": 2.857502407441593e-06, "loss": 0.75428718, "num_input_tokens_seen": 135052185, "step": 6290, "time_per_iteration": 2.740370512008667 }, { "auxiliary_loss_clip": 0.01098389, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.04425681, "balance_loss_mlp": 1.023193, "epoch": 0.3782353825341951, "flos": 19755932762880.0, "grad_norm": 8.943174604406142, "language_loss": 0.79843229, "learning_rate": 2.8571505426982566e-06, "loss": 0.81982636, "num_input_tokens_seen": 135070425, "step": 6291, "time_per_iteration": 2.729116916656494 }, { "auxiliary_loss_clip": 0.01101536, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.04736066, "balance_loss_mlp": 1.01611638, "epoch": 0.37829550578686305, "flos": 22050839675520.0, "grad_norm": 2.1381581001103203, "language_loss": 0.76017123, "learning_rate": 2.8567986454516854e-06, "loss": 0.78151298, "num_input_tokens_seen": 135090525, "step": 6292, "time_per_iteration": 2.7115557193756104 }, { "auxiliary_loss_clip": 0.0111659, "auxiliary_loss_mlp": 0.01045333, "balance_loss_clip": 1.04599166, "balance_loss_mlp": 1.02922773, "epoch": 0.378355629039531, "flos": 16470357770880.0, "grad_norm": 2.0329947363530616, "language_loss": 0.69857049, "learning_rate": 2.856446715715224e-06, "loss": 0.72018969, "num_input_tokens_seen": 135109575, "step": 6293, "time_per_iteration": 2.6687965393066406 }, { "auxiliary_loss_clip": 0.01133204, "auxiliary_loss_mlp": 0.01039264, "balance_loss_clip": 1.04852223, "balance_loss_mlp": 1.02307534, "epoch": 0.378415752292199, "flos": 19974844200960.0, "grad_norm": 2.030259976194038, "language_loss": 0.70870757, "learning_rate": 2.8560947535022173e-06, "loss": 0.73043227, "num_input_tokens_seen": 135127000, "step": 6294, "time_per_iteration": 2.600249767303467 }, { "auxiliary_loss_clip": 0.01115678, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.04706097, "balance_loss_mlp": 1.02365303, "epoch": 0.378475875544867, "flos": 14647388676480.0, "grad_norm": 4.788626069957177, "language_loss": 0.82803214, "learning_rate": 2.855742758826011e-06, "loss": 0.84959471, "num_input_tokens_seen": 135145285, "step": 6295, "time_per_iteration": 2.656090497970581 }, { "auxiliary_loss_clip": 0.0111937, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.04782999, "balance_loss_mlp": 1.02058005, "epoch": 0.37853599879753497, "flos": 26650996617600.0, "grad_norm": 9.577751233202987, "language_loss": 0.71744889, "learning_rate": 2.8553907316999547e-06, "loss": 0.73900783, "num_input_tokens_seen": 135165240, "step": 6296, "time_per_iteration": 2.6698925495147705 }, { "auxiliary_loss_clip": 0.01134516, "auxiliary_loss_mlp": 0.01043376, "balance_loss_clip": 1.05133939, "balance_loss_mlp": 1.02771211, "epoch": 0.37859612205020293, "flos": 17311960408320.0, "grad_norm": 3.288847845161644, "language_loss": 0.76889098, "learning_rate": 2.855038672137396e-06, "loss": 0.79066986, "num_input_tokens_seen": 135184045, "step": 6297, "time_per_iteration": 2.629037380218506 }, { "auxiliary_loss_clip": 0.01109354, "auxiliary_loss_mlp": 0.01038115, "balance_loss_clip": 1.04526067, "balance_loss_mlp": 1.02226055, "epoch": 0.3786562453028709, "flos": 18220392299520.0, "grad_norm": 1.9191527971099975, "language_loss": 0.79743183, "learning_rate": 2.854686580151684e-06, "loss": 0.81890655, "num_input_tokens_seen": 135202365, "step": 6298, "time_per_iteration": 2.673081874847412 }, { "auxiliary_loss_clip": 0.01075918, "auxiliary_loss_mlp": 0.01051187, "balance_loss_clip": 1.04113722, "balance_loss_mlp": 1.03267384, "epoch": 0.37871636855553886, "flos": 21214875473280.0, "grad_norm": 1.8248163373816215, "language_loss": 0.84369445, "learning_rate": 2.8543344557561722e-06, "loss": 0.86496556, "num_input_tokens_seen": 135220955, "step": 6299, "time_per_iteration": 2.748072862625122 }, { "auxiliary_loss_clip": 0.01104171, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.0473597, "balance_loss_mlp": 1.02021194, "epoch": 0.3787764918082068, "flos": 20952727038720.0, "grad_norm": 2.2683019346862587, "language_loss": 0.76286763, "learning_rate": 2.8539822989642116e-06, "loss": 0.78427088, "num_input_tokens_seen": 135239715, "step": 6300, "time_per_iteration": 2.742335796356201 }, { "auxiliary_loss_clip": 0.01118244, "auxiliary_loss_mlp": 0.01037884, "balance_loss_clip": 1.04743147, "balance_loss_mlp": 1.01999068, "epoch": 0.3788366150608748, "flos": 17308009912320.0, "grad_norm": 2.2544575031135863, "language_loss": 0.82409781, "learning_rate": 2.8536301097891577e-06, "loss": 0.84565908, "num_input_tokens_seen": 135257035, "step": 6301, "time_per_iteration": 2.6785736083984375 }, { "auxiliary_loss_clip": 0.01120863, "auxiliary_loss_mlp": 0.01039969, "balance_loss_clip": 1.04765666, "balance_loss_mlp": 1.02410781, "epoch": 0.37889673831354276, "flos": 24311092942080.0, "grad_norm": 2.7886341766039466, "language_loss": 0.67584914, "learning_rate": 2.8532778882443636e-06, "loss": 0.69745743, "num_input_tokens_seen": 135275720, "step": 6302, "time_per_iteration": 2.677690029144287 }, { "auxiliary_loss_clip": 0.01090953, "auxiliary_loss_mlp": 0.01043064, "balance_loss_clip": 1.04460323, "balance_loss_mlp": 1.02736425, "epoch": 0.3789568615662107, "flos": 26683603188480.0, "grad_norm": 1.752291551629032, "language_loss": 0.68745166, "learning_rate": 2.8529256343431867e-06, "loss": 0.70879185, "num_input_tokens_seen": 135294140, "step": 6303, "time_per_iteration": 2.8387813568115234 }, { "auxiliary_loss_clip": 0.01133092, "auxiliary_loss_mlp": 0.01039166, "balance_loss_clip": 1.04745388, "balance_loss_mlp": 1.02412772, "epoch": 0.3790169848188787, "flos": 23585194990080.0, "grad_norm": 1.8875159078783896, "language_loss": 0.77695227, "learning_rate": 2.8525733480989846e-06, "loss": 0.79867482, "num_input_tokens_seen": 135314845, "step": 6304, "time_per_iteration": 2.673499584197998 }, { "auxiliary_loss_clip": 0.01145067, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.05417812, "balance_loss_mlp": 1.02412987, "epoch": 0.37907710807154665, "flos": 18437436230400.0, "grad_norm": 2.779181085633227, "language_loss": 0.79659361, "learning_rate": 2.8522210295251146e-06, "loss": 0.81845009, "num_input_tokens_seen": 135333055, "step": 6305, "time_per_iteration": 2.5770838260650635 }, { "auxiliary_loss_clip": 0.01046795, "auxiliary_loss_mlp": 0.01001141, "balance_loss_clip": 1.02554131, "balance_loss_mlp": 0.99954396, "epoch": 0.3791372313242146, "flos": 50107165954560.0, "grad_norm": 0.9814261912828969, "language_loss": 0.64473259, "learning_rate": 2.8518686786349387e-06, "loss": 0.66521198, "num_input_tokens_seen": 135387865, "step": 6306, "time_per_iteration": 3.0782721042633057 }, { "auxiliary_loss_clip": 0.01111605, "auxiliary_loss_mlp": 0.01058558, "balance_loss_clip": 1.04987538, "balance_loss_mlp": 1.03932941, "epoch": 0.3791973545768826, "flos": 24316551809280.0, "grad_norm": 3.4757923579383343, "language_loss": 0.73271245, "learning_rate": 2.851516295441817e-06, "loss": 0.75441408, "num_input_tokens_seen": 135409095, "step": 6307, "time_per_iteration": 2.756335973739624 }, { "auxiliary_loss_clip": 0.01112868, "auxiliary_loss_mlp": 0.01041837, "balance_loss_clip": 1.04757965, "balance_loss_mlp": 1.02545738, "epoch": 0.3792574778295506, "flos": 21579907438080.0, "grad_norm": 1.5984922637838355, "language_loss": 0.78426826, "learning_rate": 2.851163879959112e-06, "loss": 0.80581522, "num_input_tokens_seen": 135429585, "step": 6308, "time_per_iteration": 2.7782399654388428 }, { "auxiliary_loss_clip": 0.01099815, "auxiliary_loss_mlp": 0.01047567, "balance_loss_clip": 1.04646075, "balance_loss_mlp": 1.03061557, "epoch": 0.37931760108221857, "flos": 22272731942400.0, "grad_norm": 30.20771720098995, "language_loss": 0.72349942, "learning_rate": 2.8508114322001876e-06, "loss": 0.74497324, "num_input_tokens_seen": 135446320, "step": 6309, "time_per_iteration": 2.779332399368286 }, { "auxiliary_loss_clip": 0.0107726, "auxiliary_loss_mlp": 0.01047463, "balance_loss_clip": 1.04217935, "balance_loss_mlp": 1.03061867, "epoch": 0.37937772433488653, "flos": 19682998197120.0, "grad_norm": 1.3823910789919382, "language_loss": 0.78832853, "learning_rate": 2.8504589521784083e-06, "loss": 0.8095758, "num_input_tokens_seen": 135465720, "step": 6310, "time_per_iteration": 2.771423101425171 }, { "auxiliary_loss_clip": 0.01125039, "auxiliary_loss_mlp": 0.0077385, "balance_loss_clip": 1.04667282, "balance_loss_mlp": 1.00038886, "epoch": 0.3794378475875545, "flos": 19099378016640.0, "grad_norm": 2.0276391959107687, "language_loss": 0.76350379, "learning_rate": 2.8501064399071403e-06, "loss": 0.78249264, "num_input_tokens_seen": 135485155, "step": 6311, "time_per_iteration": 2.6458020210266113 }, { "auxiliary_loss_clip": 0.01111162, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.04782593, "balance_loss_mlp": 1.02345526, "epoch": 0.37949797084022246, "flos": 20339660684160.0, "grad_norm": 1.662830094695082, "language_loss": 0.7082535, "learning_rate": 2.8497538953997504e-06, "loss": 0.72975308, "num_input_tokens_seen": 135502675, "step": 6312, "time_per_iteration": 2.719555377960205 }, { "auxiliary_loss_clip": 0.01023104, "auxiliary_loss_mlp": 0.01013837, "balance_loss_clip": 1.02154779, "balance_loss_mlp": 1.0123291, "epoch": 0.37955809409289043, "flos": 63972203477760.0, "grad_norm": 0.7865225154891, "language_loss": 0.56087357, "learning_rate": 2.849401318669608e-06, "loss": 0.58124298, "num_input_tokens_seen": 135562005, "step": 6313, "time_per_iteration": 3.2287843227386475 }, { "auxiliary_loss_clip": 0.01096229, "auxiliary_loss_mlp": 0.01051812, "balance_loss_clip": 1.04299724, "balance_loss_mlp": 1.03592694, "epoch": 0.3796182173455584, "flos": 31540665179520.0, "grad_norm": 1.6673731637282567, "language_loss": 0.71260917, "learning_rate": 2.849048709730083e-06, "loss": 0.73408955, "num_input_tokens_seen": 135582600, "step": 6314, "time_per_iteration": 2.7842931747436523 }, { "auxiliary_loss_clip": 0.01129376, "auxiliary_loss_mlp": 0.01048605, "balance_loss_clip": 1.04880047, "balance_loss_mlp": 1.03201127, "epoch": 0.37967834059822636, "flos": 12130804978560.0, "grad_norm": 2.0299747539506408, "language_loss": 0.73270208, "learning_rate": 2.848696068594545e-06, "loss": 0.75448191, "num_input_tokens_seen": 135600280, "step": 6315, "time_per_iteration": 2.6785545349121094 }, { "auxiliary_loss_clip": 0.01122054, "auxiliary_loss_mlp": 0.01048691, "balance_loss_clip": 1.0479691, "balance_loss_mlp": 1.03326535, "epoch": 0.3797384638508943, "flos": 39348578298240.0, "grad_norm": 2.0273248392275645, "language_loss": 0.71108794, "learning_rate": 2.8483433952763677e-06, "loss": 0.73279542, "num_input_tokens_seen": 135621560, "step": 6316, "time_per_iteration": 2.7634074687957764 }, { "auxiliary_loss_clip": 0.01099766, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.04686475, "balance_loss_mlp": 1.02733219, "epoch": 0.3797985871035623, "flos": 34054016653440.0, "grad_norm": 6.091183487708486, "language_loss": 0.6551193, "learning_rate": 2.847990689788923e-06, "loss": 0.67653567, "num_input_tokens_seen": 135641745, "step": 6317, "time_per_iteration": 2.8334715366363525 }, { "auxiliary_loss_clip": 0.01119227, "auxiliary_loss_mlp": 0.01036315, "balance_loss_clip": 1.04556906, "balance_loss_mlp": 1.02204525, "epoch": 0.37985871035623026, "flos": 23222174186880.0, "grad_norm": 2.5588148844770364, "language_loss": 0.85254991, "learning_rate": 2.8476379521455877e-06, "loss": 0.87410533, "num_input_tokens_seen": 135660650, "step": 6318, "time_per_iteration": 2.6611499786376953 }, { "auxiliary_loss_clip": 0.01113843, "auxiliary_loss_mlp": 0.01046062, "balance_loss_clip": 1.04669976, "balance_loss_mlp": 1.02933645, "epoch": 0.3799188336088982, "flos": 18114958903680.0, "grad_norm": 2.5013130494780254, "language_loss": 0.75813186, "learning_rate": 2.8472851823597354e-06, "loss": 0.77973092, "num_input_tokens_seen": 135679980, "step": 6319, "time_per_iteration": 2.643206834793091 }, { "auxiliary_loss_clip": 0.01136645, "auxiliary_loss_mlp": 0.01043703, "balance_loss_clip": 1.04961717, "balance_loss_mlp": 1.02813435, "epoch": 0.3799789568615662, "flos": 21871897096320.0, "grad_norm": 1.6614251909537696, "language_loss": 0.64298296, "learning_rate": 2.846932380444744e-06, "loss": 0.66478646, "num_input_tokens_seen": 135699400, "step": 6320, "time_per_iteration": 4.031519174575806 }, { "auxiliary_loss_clip": 0.01102323, "auxiliary_loss_mlp": 0.01046665, "balance_loss_clip": 1.05175698, "balance_loss_mlp": 1.03132319, "epoch": 0.3800390801142342, "flos": 32962943082240.0, "grad_norm": 2.289587921641626, "language_loss": 0.713642, "learning_rate": 2.846579546413992e-06, "loss": 0.73513186, "num_input_tokens_seen": 135723455, "step": 6321, "time_per_iteration": 2.8465514183044434 }, { "auxiliary_loss_clip": 0.01096183, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.04067016, "balance_loss_mlp": 1.02673435, "epoch": 0.38009920336690217, "flos": 26907075653760.0, "grad_norm": 1.7413772853733611, "language_loss": 0.74461544, "learning_rate": 2.846226680280859e-06, "loss": 0.76599777, "num_input_tokens_seen": 135744335, "step": 6322, "time_per_iteration": 4.407487630844116 }, { "auxiliary_loss_clip": 0.01122719, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.0462966, "balance_loss_mlp": 1.02587986, "epoch": 0.38015932661957014, "flos": 22488913946880.0, "grad_norm": 3.5770930684707527, "language_loss": 0.84908414, "learning_rate": 2.845873782058725e-06, "loss": 0.87071967, "num_input_tokens_seen": 135761440, "step": 6323, "time_per_iteration": 2.6349892616271973 }, { "auxiliary_loss_clip": 0.01111414, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.04454303, "balance_loss_mlp": 1.02075982, "epoch": 0.3802194498722381, "flos": 21980993679360.0, "grad_norm": 5.3693824839272954, "language_loss": 0.73171353, "learning_rate": 2.845520851760973e-06, "loss": 0.75320327, "num_input_tokens_seen": 135779955, "step": 6324, "time_per_iteration": 4.240839958190918 }, { "auxiliary_loss_clip": 0.01105568, "auxiliary_loss_mlp": 0.01038696, "balance_loss_clip": 1.04704404, "balance_loss_mlp": 1.02263856, "epoch": 0.38027957312490607, "flos": 21324869896320.0, "grad_norm": 1.716026134262254, "language_loss": 0.83859229, "learning_rate": 2.8451678894009847e-06, "loss": 0.86003488, "num_input_tokens_seen": 135799840, "step": 6325, "time_per_iteration": 2.72074818611145 }, { "auxiliary_loss_clip": 0.01110489, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.04811895, "balance_loss_mlp": 1.02094209, "epoch": 0.38033969637757403, "flos": 16691244456960.0, "grad_norm": 2.0321742163093264, "language_loss": 0.80093408, "learning_rate": 2.8448148949921465e-06, "loss": 0.82239556, "num_input_tokens_seen": 135817880, "step": 6326, "time_per_iteration": 4.313997030258179 }, { "auxiliary_loss_clip": 0.01119893, "auxiliary_loss_mlp": 0.01038876, "balance_loss_clip": 1.04593146, "balance_loss_mlp": 1.02497053, "epoch": 0.380399819630242, "flos": 36210847685760.0, "grad_norm": 1.80559395505396, "language_loss": 0.72578084, "learning_rate": 2.844461868547842e-06, "loss": 0.74736857, "num_input_tokens_seen": 135838940, "step": 6327, "time_per_iteration": 2.7500593662261963 }, { "auxiliary_loss_clip": 0.01134332, "auxiliary_loss_mlp": 0.00772576, "balance_loss_clip": 1.04898763, "balance_loss_mlp": 1.00039506, "epoch": 0.38045994288290996, "flos": 21288851533440.0, "grad_norm": 1.9791898832174752, "language_loss": 0.83074433, "learning_rate": 2.844108810081459e-06, "loss": 0.84981334, "num_input_tokens_seen": 135858325, "step": 6328, "time_per_iteration": 2.7503418922424316 }, { "auxiliary_loss_clip": 0.01119735, "auxiliary_loss_mlp": 0.01029986, "balance_loss_clip": 1.04522514, "balance_loss_mlp": 1.01522779, "epoch": 0.38052006613557793, "flos": 20922885815040.0, "grad_norm": 1.5313878449465446, "language_loss": 0.61713332, "learning_rate": 2.843755719606385e-06, "loss": 0.63863051, "num_input_tokens_seen": 135878430, "step": 6329, "time_per_iteration": 2.682016134262085 }, { "auxiliary_loss_clip": 0.01103557, "auxiliary_loss_mlp": 0.01040275, "balance_loss_clip": 1.04332185, "balance_loss_mlp": 1.02436066, "epoch": 0.3805801893882459, "flos": 20990720649600.0, "grad_norm": 1.9096594726999414, "language_loss": 0.56007183, "learning_rate": 2.8434025971360104e-06, "loss": 0.58151013, "num_input_tokens_seen": 135894755, "step": 6330, "time_per_iteration": 2.6704044342041016 }, { "auxiliary_loss_clip": 0.01088801, "auxiliary_loss_mlp": 0.01035148, "balance_loss_clip": 1.04801345, "balance_loss_mlp": 1.02142704, "epoch": 0.38064031264091386, "flos": 25558594243200.0, "grad_norm": 3.9882905607247046, "language_loss": 0.65945244, "learning_rate": 2.8430494426837243e-06, "loss": 0.6806919, "num_input_tokens_seen": 135918275, "step": 6331, "time_per_iteration": 2.750293731689453 }, { "auxiliary_loss_clip": 0.01120934, "auxiliary_loss_mlp": 0.01042908, "balance_loss_clip": 1.05122471, "balance_loss_mlp": 1.02723169, "epoch": 0.3807004358935818, "flos": 15085857997440.0, "grad_norm": 2.769340057272882, "language_loss": 0.7601527, "learning_rate": 2.842696256262919e-06, "loss": 0.78179109, "num_input_tokens_seen": 135937430, "step": 6332, "time_per_iteration": 2.64774227142334 }, { "auxiliary_loss_clip": 0.01073508, "auxiliary_loss_mlp": 0.00772959, "balance_loss_clip": 1.04594767, "balance_loss_mlp": 1.00029111, "epoch": 0.3807605591462498, "flos": 16399398453120.0, "grad_norm": 2.059894273755589, "language_loss": 0.8224051, "learning_rate": 2.842343037886987e-06, "loss": 0.84086972, "num_input_tokens_seen": 135954210, "step": 6333, "time_per_iteration": 2.7650275230407715 }, { "auxiliary_loss_clip": 0.01121534, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.04730785, "balance_loss_mlp": 1.01878643, "epoch": 0.3808206823989178, "flos": 29057083102080.0, "grad_norm": 1.5368445040683132, "language_loss": 0.8620519, "learning_rate": 2.8419897875693226e-06, "loss": 0.88359934, "num_input_tokens_seen": 135974425, "step": 6334, "time_per_iteration": 2.7348363399505615 }, { "auxiliary_loss_clip": 0.01123412, "auxiliary_loss_mlp": 0.01038067, "balance_loss_clip": 1.04626036, "balance_loss_mlp": 1.02280819, "epoch": 0.3808808056515858, "flos": 15705855676800.0, "grad_norm": 1.7714454860846107, "language_loss": 0.79359698, "learning_rate": 2.841636505323321e-06, "loss": 0.81521177, "num_input_tokens_seen": 135991985, "step": 6335, "time_per_iteration": 2.7020695209503174 }, { "auxiliary_loss_clip": 0.01121693, "auxiliary_loss_mlp": 0.01033758, "balance_loss_clip": 1.04490542, "balance_loss_mlp": 1.01847494, "epoch": 0.38094092890425374, "flos": 20704584908160.0, "grad_norm": 1.872444579903983, "language_loss": 0.72939491, "learning_rate": 2.8412831911623795e-06, "loss": 0.75094938, "num_input_tokens_seen": 136010015, "step": 6336, "time_per_iteration": 2.7088463306427 }, { "auxiliary_loss_clip": 0.01117324, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.04605365, "balance_loss_mlp": 1.01930285, "epoch": 0.3810010521569217, "flos": 20667956014080.0, "grad_norm": 2.014308937626889, "language_loss": 0.69164217, "learning_rate": 2.840929845099894e-06, "loss": 0.71314949, "num_input_tokens_seen": 136028440, "step": 6337, "time_per_iteration": 2.6832611560821533 }, { "auxiliary_loss_clip": 0.01111033, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.04483473, "balance_loss_mlp": 1.02133763, "epoch": 0.38106117540958967, "flos": 31827626933760.0, "grad_norm": 1.9800177042646252, "language_loss": 0.63416338, "learning_rate": 2.8405764671492652e-06, "loss": 0.65563887, "num_input_tokens_seen": 136048360, "step": 6338, "time_per_iteration": 2.8045074939727783 }, { "auxiliary_loss_clip": 0.01112594, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.04514265, "balance_loss_mlp": 1.02520001, "epoch": 0.38112129866225763, "flos": 16902757693440.0, "grad_norm": 2.42049576026076, "language_loss": 0.69146717, "learning_rate": 2.8402230573238923e-06, "loss": 0.713009, "num_input_tokens_seen": 136065500, "step": 6339, "time_per_iteration": 2.6873764991760254 }, { "auxiliary_loss_clip": 0.01107753, "auxiliary_loss_mlp": 0.01047128, "balance_loss_clip": 1.04493856, "balance_loss_mlp": 1.03165436, "epoch": 0.3811814219149256, "flos": 20887226588160.0, "grad_norm": 2.484915003961603, "language_loss": 0.68283296, "learning_rate": 2.839869615637177e-06, "loss": 0.70438182, "num_input_tokens_seen": 136084060, "step": 6340, "time_per_iteration": 2.730966567993164 }, { "auxiliary_loss_clip": 0.01098909, "auxiliary_loss_mlp": 0.01040765, "balance_loss_clip": 1.0444243, "balance_loss_mlp": 1.02449322, "epoch": 0.38124154516759357, "flos": 16690813493760.0, "grad_norm": 2.645956512625022, "language_loss": 0.89689833, "learning_rate": 2.839516142102522e-06, "loss": 0.91829509, "num_input_tokens_seen": 136102310, "step": 6341, "time_per_iteration": 2.7552878856658936 }, { "auxiliary_loss_clip": 0.01127861, "auxiliary_loss_mlp": 0.01042909, "balance_loss_clip": 1.04863834, "balance_loss_mlp": 1.02668464, "epoch": 0.38130166842026153, "flos": 19681956702720.0, "grad_norm": 2.1539523414578103, "language_loss": 0.75359344, "learning_rate": 2.83916263673333e-06, "loss": 0.7753011, "num_input_tokens_seen": 136120725, "step": 6342, "time_per_iteration": 2.6937670707702637 }, { "auxiliary_loss_clip": 0.01109868, "auxiliary_loss_mlp": 0.01035797, "balance_loss_clip": 1.04506934, "balance_loss_mlp": 1.02071738, "epoch": 0.3813617916729295, "flos": 22198432659840.0, "grad_norm": 1.797512240627555, "language_loss": 0.8348105, "learning_rate": 2.838809099543007e-06, "loss": 0.85626709, "num_input_tokens_seen": 136139105, "step": 6343, "time_per_iteration": 2.6647467613220215 }, { "auxiliary_loss_clip": 0.01073856, "auxiliary_loss_mlp": 0.01047466, "balance_loss_clip": 1.04339314, "balance_loss_mlp": 1.03099144, "epoch": 0.38142191492559746, "flos": 19096899978240.0, "grad_norm": 1.8507846773973766, "language_loss": 0.76930642, "learning_rate": 2.838455530544959e-06, "loss": 0.7905196, "num_input_tokens_seen": 136158265, "step": 6344, "time_per_iteration": 2.807464838027954 }, { "auxiliary_loss_clip": 0.01099031, "auxiliary_loss_mlp": 0.01049913, "balance_loss_clip": 1.04580665, "balance_loss_mlp": 1.03225255, "epoch": 0.3814820381782654, "flos": 24097748112000.0, "grad_norm": 2.0591822661314847, "language_loss": 0.73010087, "learning_rate": 2.838101929752593e-06, "loss": 0.75159037, "num_input_tokens_seen": 136176100, "step": 6345, "time_per_iteration": 2.756462574005127 }, { "auxiliary_loss_clip": 0.01094565, "auxiliary_loss_mlp": 0.00771987, "balance_loss_clip": 1.04568338, "balance_loss_mlp": 1.00028944, "epoch": 0.3815421614309334, "flos": 15778502933760.0, "grad_norm": 1.8320535118847152, "language_loss": 0.69709373, "learning_rate": 2.8377482971793187e-06, "loss": 0.71575922, "num_input_tokens_seen": 136195125, "step": 6346, "time_per_iteration": 2.7221782207489014 }, { "auxiliary_loss_clip": 0.01124746, "auxiliary_loss_mlp": 0.01038046, "balance_loss_clip": 1.04819, "balance_loss_mlp": 1.02297819, "epoch": 0.38160228468360136, "flos": 19899754819200.0, "grad_norm": 1.9952986193352877, "language_loss": 0.75480664, "learning_rate": 2.8373946328385437e-06, "loss": 0.77643454, "num_input_tokens_seen": 136213885, "step": 6347, "time_per_iteration": 2.646730422973633 }, { "auxiliary_loss_clip": 0.0112204, "auxiliary_loss_mlp": 0.01039786, "balance_loss_clip": 1.04638994, "balance_loss_mlp": 1.0253861, "epoch": 0.3816624079362694, "flos": 19281050029440.0, "grad_norm": 3.670871038619067, "language_loss": 0.74398822, "learning_rate": 2.8370409367436813e-06, "loss": 0.76560652, "num_input_tokens_seen": 136232700, "step": 6348, "time_per_iteration": 2.651153802871704 }, { "auxiliary_loss_clip": 0.01109969, "auxiliary_loss_mlp": 0.01037685, "balance_loss_clip": 1.04792547, "balance_loss_mlp": 1.0233444, "epoch": 0.38172253118893734, "flos": 21177564220800.0, "grad_norm": 2.7978232906816665, "language_loss": 0.87172502, "learning_rate": 2.836687208908142e-06, "loss": 0.89320159, "num_input_tokens_seen": 136248975, "step": 6349, "time_per_iteration": 2.693459987640381 }, { "auxiliary_loss_clip": 0.0112098, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.04788637, "balance_loss_mlp": 1.02244771, "epoch": 0.3817826544416053, "flos": 17529219820800.0, "grad_norm": 1.7341599494512197, "language_loss": 0.76554048, "learning_rate": 2.836333449345341e-06, "loss": 0.78712171, "num_input_tokens_seen": 136266710, "step": 6350, "time_per_iteration": 2.6194076538085938 }, { "auxiliary_loss_clip": 0.01104228, "auxiliary_loss_mlp": 0.01032221, "balance_loss_clip": 1.04922175, "balance_loss_mlp": 1.01640153, "epoch": 0.38184277769427327, "flos": 16326535714560.0, "grad_norm": 2.525722230514251, "language_loss": 0.75608248, "learning_rate": 2.8359796580686907e-06, "loss": 0.77744693, "num_input_tokens_seen": 136284445, "step": 6351, "time_per_iteration": 2.723487138748169 }, { "auxiliary_loss_clip": 0.01122109, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.04607773, "balance_loss_mlp": 1.02048135, "epoch": 0.38190290094694124, "flos": 30443450382720.0, "grad_norm": 2.201358799690427, "language_loss": 0.74001205, "learning_rate": 2.8356258350916085e-06, "loss": 0.76160336, "num_input_tokens_seen": 136305730, "step": 6352, "time_per_iteration": 2.6779909133911133 }, { "auxiliary_loss_clip": 0.01093469, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.04185915, "balance_loss_mlp": 1.02093625, "epoch": 0.3819630241996092, "flos": 14209924936320.0, "grad_norm": 1.7014377772216425, "language_loss": 0.64249897, "learning_rate": 2.8352719804275104e-06, "loss": 0.66378438, "num_input_tokens_seen": 136323850, "step": 6353, "time_per_iteration": 2.731860399246216 }, { "auxiliary_loss_clip": 0.01133265, "auxiliary_loss_mlp": 0.01039549, "balance_loss_clip": 1.04809213, "balance_loss_mlp": 1.02529204, "epoch": 0.38202314745227717, "flos": 25009699536000.0, "grad_norm": 2.7523604394748644, "language_loss": 0.83447051, "learning_rate": 2.834918094089816e-06, "loss": 0.85619861, "num_input_tokens_seen": 136344880, "step": 6354, "time_per_iteration": 2.665891170501709 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.04866302, "balance_loss_mlp": 1.02162409, "epoch": 0.38208327070494513, "flos": 20814507504000.0, "grad_norm": 16.091226432139102, "language_loss": 0.80633152, "learning_rate": 2.834564176091943e-06, "loss": 0.82800299, "num_input_tokens_seen": 136366060, "step": 6355, "time_per_iteration": 2.6580965518951416 }, { "auxiliary_loss_clip": 0.01092469, "auxiliary_loss_mlp": 0.01037027, "balance_loss_clip": 1.04551625, "balance_loss_mlp": 1.02263832, "epoch": 0.3821433939576131, "flos": 22637727993600.0, "grad_norm": 1.8508447811900344, "language_loss": 0.75970227, "learning_rate": 2.8342102264473125e-06, "loss": 0.78099722, "num_input_tokens_seen": 136385625, "step": 6356, "time_per_iteration": 2.7381057739257812 }, { "auxiliary_loss_clip": 0.01123851, "auxiliary_loss_mlp": 0.00772749, "balance_loss_clip": 1.04802036, "balance_loss_mlp": 1.00034022, "epoch": 0.38220351721028106, "flos": 26869872142080.0, "grad_norm": 2.3854964939919188, "language_loss": 0.81208009, "learning_rate": 2.833856245169348e-06, "loss": 0.8310461, "num_input_tokens_seen": 136405750, "step": 6357, "time_per_iteration": 2.8209376335144043 }, { "auxiliary_loss_clip": 0.01118527, "auxiliary_loss_mlp": 0.01044748, "balance_loss_clip": 1.05246222, "balance_loss_mlp": 1.02842796, "epoch": 0.38226364046294903, "flos": 23367468700800.0, "grad_norm": 2.215929075758269, "language_loss": 0.77378345, "learning_rate": 2.8335022322714695e-06, "loss": 0.79541618, "num_input_tokens_seen": 136426085, "step": 6358, "time_per_iteration": 2.7004640102386475 }, { "auxiliary_loss_clip": 0.01115504, "auxiliary_loss_mlp": 0.01047061, "balance_loss_clip": 1.0469476, "balance_loss_mlp": 1.03118849, "epoch": 0.382323763715617, "flos": 19646225648640.0, "grad_norm": 3.6635579737055837, "language_loss": 0.78477705, "learning_rate": 2.8331481877671036e-06, "loss": 0.80640268, "num_input_tokens_seen": 136442670, "step": 6359, "time_per_iteration": 4.184551954269409 }, { "auxiliary_loss_clip": 0.01065181, "auxiliary_loss_mlp": 0.01052018, "balance_loss_clip": 1.03820515, "balance_loss_mlp": 1.03462481, "epoch": 0.38238388696828496, "flos": 54124741232640.0, "grad_norm": 1.6779400536158158, "language_loss": 0.69735414, "learning_rate": 2.8327941116696754e-06, "loss": 0.71852612, "num_input_tokens_seen": 136465730, "step": 6360, "time_per_iteration": 3.1072845458984375 }, { "auxiliary_loss_clip": 0.01102455, "auxiliary_loss_mlp": 0.01037366, "balance_loss_clip": 1.04502857, "balance_loss_mlp": 1.02189279, "epoch": 0.382444010220953, "flos": 24936190352640.0, "grad_norm": 1.5790785802582266, "language_loss": 0.79362941, "learning_rate": 2.83244000399261e-06, "loss": 0.81502759, "num_input_tokens_seen": 136487215, "step": 6361, "time_per_iteration": 4.285314559936523 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01043827, "balance_loss_clip": 1.04649949, "balance_loss_mlp": 1.02906859, "epoch": 0.38250413347362094, "flos": 42337351209600.0, "grad_norm": 1.9067122847602551, "language_loss": 0.65606177, "learning_rate": 2.832085864749337e-06, "loss": 0.67761117, "num_input_tokens_seen": 136510365, "step": 6362, "time_per_iteration": 2.8447117805480957 }, { "auxiliary_loss_clip": 0.0113439, "auxiliary_loss_mlp": 0.01035947, "balance_loss_clip": 1.0483737, "balance_loss_mlp": 1.01978207, "epoch": 0.3825642567262889, "flos": 16289224462080.0, "grad_norm": 2.3383155012254284, "language_loss": 0.82138497, "learning_rate": 2.8317316939532848e-06, "loss": 0.84308833, "num_input_tokens_seen": 136527100, "step": 6363, "time_per_iteration": 4.166736602783203 }, { "auxiliary_loss_clip": 0.01075728, "auxiliary_loss_mlp": 0.01042552, "balance_loss_clip": 1.04349709, "balance_loss_mlp": 1.02707291, "epoch": 0.3826243799789569, "flos": 45654778586880.0, "grad_norm": 2.1311203141010835, "language_loss": 0.59044886, "learning_rate": 2.8313774916178825e-06, "loss": 0.61163169, "num_input_tokens_seen": 136550870, "step": 6364, "time_per_iteration": 3.006801128387451 }, { "auxiliary_loss_clip": 0.01122076, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.05097353, "balance_loss_mlp": 1.02542353, "epoch": 0.38268450323162484, "flos": 25301581453440.0, "grad_norm": 1.9239689491626994, "language_loss": 0.68903065, "learning_rate": 2.8310232577565635e-06, "loss": 0.7106635, "num_input_tokens_seen": 136569895, "step": 6365, "time_per_iteration": 2.695068597793579 }, { "auxiliary_loss_clip": 0.01123716, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.04955769, "balance_loss_mlp": 1.02366817, "epoch": 0.3827446264842928, "flos": 21836022387840.0, "grad_norm": 2.0334034116186137, "language_loss": 0.73193848, "learning_rate": 2.830668992382758e-06, "loss": 0.75357372, "num_input_tokens_seen": 136588585, "step": 6366, "time_per_iteration": 4.418980598449707 }, { "auxiliary_loss_clip": 0.01115964, "auxiliary_loss_mlp": 0.01038347, "balance_loss_clip": 1.04846239, "balance_loss_mlp": 1.02265882, "epoch": 0.38280474973696077, "flos": 25734591907200.0, "grad_norm": 2.4539991484931645, "language_loss": 0.68623614, "learning_rate": 2.830314695509902e-06, "loss": 0.70777929, "num_input_tokens_seen": 136606640, "step": 6367, "time_per_iteration": 2.6878082752227783 }, { "auxiliary_loss_clip": 0.01125961, "auxiliary_loss_mlp": 0.01037618, "balance_loss_clip": 1.05120409, "balance_loss_mlp": 1.02256823, "epoch": 0.38286487298962874, "flos": 24895934184960.0, "grad_norm": 2.196344444241347, "language_loss": 0.64423102, "learning_rate": 2.82996036715143e-06, "loss": 0.66586685, "num_input_tokens_seen": 136624940, "step": 6368, "time_per_iteration": 2.6698646545410156 }, { "auxiliary_loss_clip": 0.01139795, "auxiliary_loss_mlp": 0.01040116, "balance_loss_clip": 1.05269098, "balance_loss_mlp": 1.02390361, "epoch": 0.3829249962422967, "flos": 28543703967360.0, "grad_norm": 1.346024597035963, "language_loss": 0.684017, "learning_rate": 2.8296060073207763e-06, "loss": 0.70581615, "num_input_tokens_seen": 136645540, "step": 6369, "time_per_iteration": 2.7156169414520264 }, { "auxiliary_loss_clip": 0.01084469, "auxiliary_loss_mlp": 0.01039929, "balance_loss_clip": 1.04267466, "balance_loss_mlp": 1.02391946, "epoch": 0.38298511949496467, "flos": 21471205904640.0, "grad_norm": 1.7824237306329542, "language_loss": 0.78701794, "learning_rate": 2.8292516160313804e-06, "loss": 0.80826187, "num_input_tokens_seen": 136664530, "step": 6370, "time_per_iteration": 2.7351901531219482 }, { "auxiliary_loss_clip": 0.01121027, "auxiliary_loss_mlp": 0.01050163, "balance_loss_clip": 1.04909503, "balance_loss_mlp": 1.03279376, "epoch": 0.38304524274763263, "flos": 31679998035840.0, "grad_norm": 2.5095706519371794, "language_loss": 0.65098304, "learning_rate": 2.8288971932966805e-06, "loss": 0.67269492, "num_input_tokens_seen": 136682315, "step": 6371, "time_per_iteration": 2.739689350128174 }, { "auxiliary_loss_clip": 0.01110581, "auxiliary_loss_mlp": 0.01041968, "balance_loss_clip": 1.04938042, "balance_loss_mlp": 1.02471852, "epoch": 0.3831053660003006, "flos": 25076816098560.0, "grad_norm": 3.269308088463154, "language_loss": 0.7304002, "learning_rate": 2.8285427391301155e-06, "loss": 0.75192571, "num_input_tokens_seen": 136701185, "step": 6372, "time_per_iteration": 2.7497966289520264 }, { "auxiliary_loss_clip": 0.01127864, "auxiliary_loss_mlp": 0.01034223, "balance_loss_clip": 1.05050421, "balance_loss_mlp": 1.01848698, "epoch": 0.38316548925296856, "flos": 23259018562560.0, "grad_norm": 1.83316702621751, "language_loss": 0.8491025, "learning_rate": 2.8281882535451266e-06, "loss": 0.87072337, "num_input_tokens_seen": 136721265, "step": 6373, "time_per_iteration": 2.6510777473449707 }, { "auxiliary_loss_clip": 0.01084717, "auxiliary_loss_mlp": 0.01048262, "balance_loss_clip": 1.0416218, "balance_loss_mlp": 1.0316565, "epoch": 0.3832256125056366, "flos": 34423465991040.0, "grad_norm": 2.287485479433922, "language_loss": 0.74893212, "learning_rate": 2.8278337365551567e-06, "loss": 0.770262, "num_input_tokens_seen": 136741885, "step": 6374, "time_per_iteration": 2.8658056259155273 }, { "auxiliary_loss_clip": 0.01130215, "auxiliary_loss_mlp": 0.01042427, "balance_loss_clip": 1.05264366, "balance_loss_mlp": 1.02613068, "epoch": 0.38328573575830455, "flos": 21762764599680.0, "grad_norm": 7.5426595342284735, "language_loss": 0.75737238, "learning_rate": 2.8274791881736485e-06, "loss": 0.77909875, "num_input_tokens_seen": 136760905, "step": 6375, "time_per_iteration": 2.6622958183288574 }, { "auxiliary_loss_clip": 0.01126708, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.05043924, "balance_loss_mlp": 1.0244453, "epoch": 0.3833458590109725, "flos": 17380010724480.0, "grad_norm": 2.1246389624552435, "language_loss": 0.72777182, "learning_rate": 2.8271246084140457e-06, "loss": 0.74943662, "num_input_tokens_seen": 136777240, "step": 6376, "time_per_iteration": 2.6562421321868896 }, { "auxiliary_loss_clip": 0.01122147, "auxiliary_loss_mlp": 0.01039822, "balance_loss_clip": 1.04791379, "balance_loss_mlp": 1.02381194, "epoch": 0.3834059822636405, "flos": 29424557191680.0, "grad_norm": 1.7414598633373413, "language_loss": 0.67441249, "learning_rate": 2.826769997289796e-06, "loss": 0.69603217, "num_input_tokens_seen": 136801040, "step": 6377, "time_per_iteration": 2.779766798019409 }, { "auxiliary_loss_clip": 0.01110002, "auxiliary_loss_mlp": 0.01041228, "balance_loss_clip": 1.05152845, "balance_loss_mlp": 1.02421689, "epoch": 0.38346610551630844, "flos": 21470739027840.0, "grad_norm": 2.377659826482013, "language_loss": 0.73287642, "learning_rate": 2.826415354814344e-06, "loss": 0.75438869, "num_input_tokens_seen": 136819495, "step": 6378, "time_per_iteration": 2.7345829010009766 }, { "auxiliary_loss_clip": 0.01085335, "auxiliary_loss_mlp": 0.01042694, "balance_loss_clip": 1.0479784, "balance_loss_mlp": 1.02707767, "epoch": 0.3835262287689764, "flos": 27561224188800.0, "grad_norm": 2.283576437082984, "language_loss": 0.69473612, "learning_rate": 2.8260606810011396e-06, "loss": 0.71601641, "num_input_tokens_seen": 136838840, "step": 6379, "time_per_iteration": 2.7592358589172363 }, { "auxiliary_loss_clip": 0.01124706, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.0516969, "balance_loss_mlp": 1.02094209, "epoch": 0.3835863520216444, "flos": 15523716787200.0, "grad_norm": 1.8393672130560537, "language_loss": 0.83356249, "learning_rate": 2.8257059758636315e-06, "loss": 0.85517132, "num_input_tokens_seen": 136854425, "step": 6380, "time_per_iteration": 2.6572370529174805 }, { "auxiliary_loss_clip": 0.01135434, "auxiliary_loss_mlp": 0.01035321, "balance_loss_clip": 1.05187774, "balance_loss_mlp": 1.02010989, "epoch": 0.38364647527431234, "flos": 21904934630400.0, "grad_norm": 1.5891747666862521, "language_loss": 0.8141042, "learning_rate": 2.8253512394152697e-06, "loss": 0.83581179, "num_input_tokens_seen": 136874355, "step": 6381, "time_per_iteration": 2.7251663208007812 }, { "auxiliary_loss_clip": 0.01057344, "auxiliary_loss_mlp": 0.01005901, "balance_loss_clip": 1.02759361, "balance_loss_mlp": 1.00418437, "epoch": 0.3837065985269803, "flos": 65534927558400.0, "grad_norm": 0.7954141143291842, "language_loss": 0.60376751, "learning_rate": 2.8249964716695068e-06, "loss": 0.62440002, "num_input_tokens_seen": 136937475, "step": 6382, "time_per_iteration": 3.1750948429107666 }, { "auxiliary_loss_clip": 0.01139607, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.05060625, "balance_loss_mlp": 1.02099442, "epoch": 0.38376672177964827, "flos": 28256598558720.0, "grad_norm": 3.8324285625149925, "language_loss": 0.66432369, "learning_rate": 2.824641672639794e-06, "loss": 0.68608773, "num_input_tokens_seen": 136955805, "step": 6383, "time_per_iteration": 2.7543957233428955 }, { "auxiliary_loss_clip": 0.01103794, "auxiliary_loss_mlp": 0.01039577, "balance_loss_clip": 1.04783142, "balance_loss_mlp": 1.02375221, "epoch": 0.38382684503231623, "flos": 20631363033600.0, "grad_norm": 2.110615575498957, "language_loss": 0.75144917, "learning_rate": 2.824286842339587e-06, "loss": 0.77288288, "num_input_tokens_seen": 136975240, "step": 6384, "time_per_iteration": 2.7796735763549805 }, { "auxiliary_loss_clip": 0.01122869, "auxiliary_loss_mlp": 0.01040365, "balance_loss_clip": 1.05156231, "balance_loss_mlp": 1.02510643, "epoch": 0.3838869682849842, "flos": 19605825826560.0, "grad_norm": 1.5394774946197278, "language_loss": 0.76096714, "learning_rate": 2.823931980782341e-06, "loss": 0.78259945, "num_input_tokens_seen": 136994985, "step": 6385, "time_per_iteration": 2.6831300258636475 }, { "auxiliary_loss_clip": 0.01046831, "auxiliary_loss_mlp": 0.01001133, "balance_loss_clip": 1.02648735, "balance_loss_mlp": 0.99943984, "epoch": 0.38394709153765216, "flos": 56556110891520.0, "grad_norm": 0.9063295744618779, "language_loss": 0.66955769, "learning_rate": 2.82357708798151e-06, "loss": 0.69003725, "num_input_tokens_seen": 137046290, "step": 6386, "time_per_iteration": 3.0693411827087402 }, { "auxiliary_loss_clip": 0.0109652, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.04551756, "balance_loss_mlp": 1.02686286, "epoch": 0.3840072147903202, "flos": 15888748752000.0, "grad_norm": 1.7986188221191803, "language_loss": 0.7215755, "learning_rate": 2.8232221639505547e-06, "loss": 0.74295932, "num_input_tokens_seen": 137064725, "step": 6387, "time_per_iteration": 2.736774206161499 }, { "auxiliary_loss_clip": 0.01134624, "auxiliary_loss_mlp": 0.01044946, "balance_loss_clip": 1.05156994, "balance_loss_mlp": 1.03039086, "epoch": 0.38406733804298815, "flos": 28218030330240.0, "grad_norm": 1.6374516085838389, "language_loss": 0.8088249, "learning_rate": 2.822867208702932e-06, "loss": 0.83062065, "num_input_tokens_seen": 137086030, "step": 6388, "time_per_iteration": 2.782958507537842 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.01047592, "balance_loss_clip": 1.04727554, "balance_loss_mlp": 1.03298843, "epoch": 0.3841274612956561, "flos": 18223588609920.0, "grad_norm": 1.7872750649564642, "language_loss": 0.76085746, "learning_rate": 2.8225122222521026e-06, "loss": 0.78236812, "num_input_tokens_seen": 137105400, "step": 6389, "time_per_iteration": 2.6644833087921143 }, { "auxiliary_loss_clip": 0.01119906, "auxiliary_loss_mlp": 0.0104877, "balance_loss_clip": 1.05389404, "balance_loss_mlp": 1.03203344, "epoch": 0.3841875845483241, "flos": 19792884879360.0, "grad_norm": 4.9507505317589775, "language_loss": 0.76550084, "learning_rate": 2.8221572046115273e-06, "loss": 0.78718758, "num_input_tokens_seen": 137124985, "step": 6390, "time_per_iteration": 2.825714588165283 }, { "auxiliary_loss_clip": 0.01090482, "auxiliary_loss_mlp": 0.01048203, "balance_loss_clip": 1.04517913, "balance_loss_mlp": 1.03196096, "epoch": 0.38424770780099204, "flos": 29898829393920.0, "grad_norm": 1.7614871223783444, "language_loss": 0.70377523, "learning_rate": 2.821802155794668e-06, "loss": 0.72516215, "num_input_tokens_seen": 137146745, "step": 6391, "time_per_iteration": 2.918065309524536 }, { "auxiliary_loss_clip": 0.01125443, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.04874265, "balance_loss_mlp": 1.02158153, "epoch": 0.38430783105366, "flos": 20813717404800.0, "grad_norm": 1.7948670510085722, "language_loss": 0.84005457, "learning_rate": 2.8214470758149884e-06, "loss": 0.86167878, "num_input_tokens_seen": 137163195, "step": 6392, "time_per_iteration": 2.679427146911621 }, { "auxiliary_loss_clip": 0.01122701, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.04846168, "balance_loss_mlp": 1.0227809, "epoch": 0.384367954306328, "flos": 10998577399680.0, "grad_norm": 2.3141685884805145, "language_loss": 0.6062203, "learning_rate": 2.8210919646859536e-06, "loss": 0.62781858, "num_input_tokens_seen": 137179330, "step": 6393, "time_per_iteration": 2.6622374057769775 }, { "auxiliary_loss_clip": 0.01110672, "auxiliary_loss_mlp": 0.01036894, "balance_loss_clip": 1.04954767, "balance_loss_mlp": 1.02025223, "epoch": 0.38442807755899594, "flos": 25338030779520.0, "grad_norm": 1.7908313499382054, "language_loss": 0.70639426, "learning_rate": 2.820736822421029e-06, "loss": 0.72786993, "num_input_tokens_seen": 137198655, "step": 6394, "time_per_iteration": 2.7460365295410156 }, { "auxiliary_loss_clip": 0.01123613, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.04763663, "balance_loss_mlp": 1.01871169, "epoch": 0.3844882008116639, "flos": 21069760527360.0, "grad_norm": 2.646318489707099, "language_loss": 0.81774974, "learning_rate": 2.8203816490336822e-06, "loss": 0.83933747, "num_input_tokens_seen": 137217120, "step": 6395, "time_per_iteration": 2.676023006439209 }, { "auxiliary_loss_clip": 0.01129196, "auxiliary_loss_mlp": 0.01046949, "balance_loss_clip": 1.05485177, "balance_loss_mlp": 1.03209007, "epoch": 0.38454832406433187, "flos": 17963235855360.0, "grad_norm": 1.9755185808990787, "language_loss": 0.71031433, "learning_rate": 2.8200264445373813e-06, "loss": 0.73207581, "num_input_tokens_seen": 137234410, "step": 6396, "time_per_iteration": 2.7082455158233643 }, { "auxiliary_loss_clip": 0.01044031, "auxiliary_loss_mlp": 0.0100801, "balance_loss_clip": 1.02689695, "balance_loss_mlp": 1.00657308, "epoch": 0.38460844731699984, "flos": 67924999555200.0, "grad_norm": 0.8839433118134116, "language_loss": 0.59671199, "learning_rate": 2.8196712089455954e-06, "loss": 0.61723238, "num_input_tokens_seen": 137294940, "step": 6397, "time_per_iteration": 3.2412428855895996 }, { "auxiliary_loss_clip": 0.01137376, "auxiliary_loss_mlp": 0.01035554, "balance_loss_clip": 1.05209756, "balance_loss_mlp": 1.02044976, "epoch": 0.3846685705696678, "flos": 25849075530240.0, "grad_norm": 2.648974669995796, "language_loss": 0.85017276, "learning_rate": 2.819315942271794e-06, "loss": 0.87190199, "num_input_tokens_seen": 137315035, "step": 6398, "time_per_iteration": 2.7374656200408936 }, { "auxiliary_loss_clip": 0.01136492, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.0517211, "balance_loss_mlp": 1.0165, "epoch": 0.38472869382233577, "flos": 16290194129280.0, "grad_norm": 2.1032431430060075, "language_loss": 0.79989493, "learning_rate": 2.8189606445294515e-06, "loss": 0.82156688, "num_input_tokens_seen": 137333155, "step": 6399, "time_per_iteration": 4.446218729019165 }, { "auxiliary_loss_clip": 0.0113807, "auxiliary_loss_mlp": 0.00773562, "balance_loss_clip": 1.05109119, "balance_loss_mlp": 1.00025833, "epoch": 0.38478881707500373, "flos": 19353122668800.0, "grad_norm": 3.0376300513317416, "language_loss": 0.67328328, "learning_rate": 2.818605315732038e-06, "loss": 0.69239962, "num_input_tokens_seen": 137351515, "step": 6400, "time_per_iteration": 2.6920905113220215 }, { "auxiliary_loss_clip": 0.01122811, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.05546772, "balance_loss_mlp": 1.0264008, "epoch": 0.38484894032767175, "flos": 24860849575680.0, "grad_norm": 11.158483612058907, "language_loss": 0.73623443, "learning_rate": 2.81824995589303e-06, "loss": 0.75788283, "num_input_tokens_seen": 137371255, "step": 6401, "time_per_iteration": 4.2371673583984375 }, { "auxiliary_loss_clip": 0.01102005, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 1.04852486, "balance_loss_mlp": 1.02387738, "epoch": 0.3849090635803397, "flos": 14501806853760.0, "grad_norm": 2.0006804524577233, "language_loss": 0.72059876, "learning_rate": 2.8178945650259012e-06, "loss": 0.74201727, "num_input_tokens_seen": 137388980, "step": 6402, "time_per_iteration": 2.686413288116455 }, { "auxiliary_loss_clip": 0.0113478, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.05094552, "balance_loss_mlp": 1.02016854, "epoch": 0.3849691868330077, "flos": 18515865576960.0, "grad_norm": 2.094788133183166, "language_loss": 0.82884681, "learning_rate": 2.817539143144128e-06, "loss": 0.85054541, "num_input_tokens_seen": 137406885, "step": 6403, "time_per_iteration": 4.234680891036987 }, { "auxiliary_loss_clip": 0.01078109, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.04205656, "balance_loss_mlp": 1.02466702, "epoch": 0.38502931008567565, "flos": 21616392677760.0, "grad_norm": 4.587008789601206, "language_loss": 0.82845348, "learning_rate": 2.817183690261189e-06, "loss": 0.84964037, "num_input_tokens_seen": 137425535, "step": 6404, "time_per_iteration": 2.777756452560425 }, { "auxiliary_loss_clip": 0.0111195, "auxiliary_loss_mlp": 0.01034861, "balance_loss_clip": 1.04970074, "balance_loss_mlp": 1.02046084, "epoch": 0.3850894333383436, "flos": 25415346804480.0, "grad_norm": 2.6287869212560646, "language_loss": 0.69417107, "learning_rate": 2.816828206390563e-06, "loss": 0.71563923, "num_input_tokens_seen": 137447700, "step": 6405, "time_per_iteration": 4.478301286697388 }, { "auxiliary_loss_clip": 0.01102381, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.0438571, "balance_loss_mlp": 1.02414417, "epoch": 0.3851495565910116, "flos": 20227870581120.0, "grad_norm": 1.9306681180439358, "language_loss": 0.79248095, "learning_rate": 2.816472691545729e-06, "loss": 0.81388557, "num_input_tokens_seen": 137462245, "step": 6406, "time_per_iteration": 2.7157816886901855 }, { "auxiliary_loss_clip": 0.01129296, "auxiliary_loss_mlp": 0.01040841, "balance_loss_clip": 1.05465746, "balance_loss_mlp": 1.02483082, "epoch": 0.38520967984367954, "flos": 16508459122560.0, "grad_norm": 5.929375109580111, "language_loss": 0.84107637, "learning_rate": 2.8161171457401694e-06, "loss": 0.86277771, "num_input_tokens_seen": 137476455, "step": 6407, "time_per_iteration": 2.6058037281036377 }, { "auxiliary_loss_clip": 0.01049614, "auxiliary_loss_mlp": 0.00999678, "balance_loss_clip": 1.03001904, "balance_loss_mlp": 0.99828893, "epoch": 0.3852698030963475, "flos": 61313772971520.0, "grad_norm": 0.845548946049954, "language_loss": 0.64919412, "learning_rate": 2.815761568987365e-06, "loss": 0.66968703, "num_input_tokens_seen": 137539845, "step": 6408, "time_per_iteration": 3.2015879154205322 }, { "auxiliary_loss_clip": 0.01110915, "auxiliary_loss_mlp": 0.01042045, "balance_loss_clip": 1.05201948, "balance_loss_mlp": 1.02547526, "epoch": 0.3853299263490155, "flos": 22893016930560.0, "grad_norm": 1.5734517214124462, "language_loss": 0.73444313, "learning_rate": 2.8154059613008e-06, "loss": 0.75597274, "num_input_tokens_seen": 137559880, "step": 6409, "time_per_iteration": 2.683310031890869 }, { "auxiliary_loss_clip": 0.01099042, "auxiliary_loss_mlp": 0.01052587, "balance_loss_clip": 1.05162942, "balance_loss_mlp": 1.03458679, "epoch": 0.38539004960168344, "flos": 20047491457920.0, "grad_norm": 3.095928763270071, "language_loss": 0.70505756, "learning_rate": 2.81505032269396e-06, "loss": 0.72657388, "num_input_tokens_seen": 137578225, "step": 6410, "time_per_iteration": 2.7694053649902344 }, { "auxiliary_loss_clip": 0.01018797, "auxiliary_loss_mlp": 0.00754046, "balance_loss_clip": 1.02754462, "balance_loss_mlp": 1.00070059, "epoch": 0.3854501728543514, "flos": 68730691570560.0, "grad_norm": 0.6824056349925876, "language_loss": 0.6019417, "learning_rate": 2.81469465318033e-06, "loss": 0.61967015, "num_input_tokens_seen": 137645770, "step": 6411, "time_per_iteration": 3.3692543506622314 }, { "auxiliary_loss_clip": 0.01091571, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.04337883, "balance_loss_mlp": 1.01451063, "epoch": 0.38551029610701937, "flos": 20485027025280.0, "grad_norm": 2.4386958956664344, "language_loss": 0.78219938, "learning_rate": 2.814338952773397e-06, "loss": 0.80340695, "num_input_tokens_seen": 137664090, "step": 6412, "time_per_iteration": 2.7462196350097656 }, { "auxiliary_loss_clip": 0.01097982, "auxiliary_loss_mlp": 0.01037754, "balance_loss_clip": 1.04309821, "balance_loss_mlp": 1.01995587, "epoch": 0.38557041935968733, "flos": 23471788775040.0, "grad_norm": 2.0249224045322802, "language_loss": 0.78112727, "learning_rate": 2.8139832214866493e-06, "loss": 0.80248463, "num_input_tokens_seen": 137683190, "step": 6413, "time_per_iteration": 2.768624782562256 }, { "auxiliary_loss_clip": 0.01056912, "auxiliary_loss_mlp": 0.01003998, "balance_loss_clip": 1.02733278, "balance_loss_mlp": 1.00254369, "epoch": 0.38563054261235535, "flos": 63966636869760.0, "grad_norm": 0.8082958368118873, "language_loss": 0.61342072, "learning_rate": 2.813627459333576e-06, "loss": 0.63402981, "num_input_tokens_seen": 137737315, "step": 6414, "time_per_iteration": 2.983466625213623 }, { "auxiliary_loss_clip": 0.01103716, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.05065155, "balance_loss_mlp": 1.02302015, "epoch": 0.3856906658650233, "flos": 23987789602560.0, "grad_norm": 2.2111312580879106, "language_loss": 0.77225536, "learning_rate": 2.8132716663276685e-06, "loss": 0.79367828, "num_input_tokens_seen": 137753535, "step": 6415, "time_per_iteration": 2.7486205101013184 }, { "auxiliary_loss_clip": 0.01109368, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.04894936, "balance_loss_mlp": 1.01676726, "epoch": 0.3857507891176913, "flos": 25007436979200.0, "grad_norm": 1.644505635534703, "language_loss": 0.80036473, "learning_rate": 2.8129158424824173e-06, "loss": 0.82176626, "num_input_tokens_seen": 137773405, "step": 6416, "time_per_iteration": 2.709200859069824 }, { "auxiliary_loss_clip": 0.0112133, "auxiliary_loss_mlp": 0.00771665, "balance_loss_clip": 1.04777813, "balance_loss_mlp": 1.00020468, "epoch": 0.38581091237035925, "flos": 21536778182400.0, "grad_norm": 1.8974153334913886, "language_loss": 0.78746861, "learning_rate": 2.8125599878113155e-06, "loss": 0.80639857, "num_input_tokens_seen": 137790810, "step": 6417, "time_per_iteration": 2.6839869022369385 }, { "auxiliary_loss_clip": 0.01106617, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.04771507, "balance_loss_mlp": 1.02424121, "epoch": 0.3858710356230272, "flos": 17383889393280.0, "grad_norm": 1.8492847143532247, "language_loss": 0.80066824, "learning_rate": 2.8122041023278583e-06, "loss": 0.82211387, "num_input_tokens_seen": 137810265, "step": 6418, "time_per_iteration": 2.709463119506836 }, { "auxiliary_loss_clip": 0.01106426, "auxiliary_loss_mlp": 0.01035927, "balance_loss_clip": 1.04606509, "balance_loss_mlp": 1.02115691, "epoch": 0.3859311588756952, "flos": 20339588856960.0, "grad_norm": 2.0121704661475524, "language_loss": 0.79591382, "learning_rate": 2.8118481860455407e-06, "loss": 0.81733727, "num_input_tokens_seen": 137828580, "step": 6419, "time_per_iteration": 2.687030553817749 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.04662013, "balance_loss_mlp": 1.0194031, "epoch": 0.38599128212836314, "flos": 26321157002880.0, "grad_norm": 2.202509680177809, "language_loss": 0.67581224, "learning_rate": 2.8114922389778573e-06, "loss": 0.69726223, "num_input_tokens_seen": 137846145, "step": 6420, "time_per_iteration": 2.7517049312591553 }, { "auxiliary_loss_clip": 0.01089731, "auxiliary_loss_mlp": 0.01053637, "balance_loss_clip": 1.04479241, "balance_loss_mlp": 1.03771043, "epoch": 0.3860514053810311, "flos": 13553837066880.0, "grad_norm": 2.406147976104497, "language_loss": 0.81137526, "learning_rate": 2.8111362611383076e-06, "loss": 0.83280897, "num_input_tokens_seen": 137863705, "step": 6421, "time_per_iteration": 2.970040798187256 }, { "auxiliary_loss_clip": 0.01108309, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.04625583, "balance_loss_mlp": 1.02510345, "epoch": 0.3861115286336991, "flos": 20954271323520.0, "grad_norm": 2.6092074943148797, "language_loss": 0.71989834, "learning_rate": 2.8107802525403886e-06, "loss": 0.74138188, "num_input_tokens_seen": 137880285, "step": 6422, "time_per_iteration": 2.690490961074829 }, { "auxiliary_loss_clip": 0.01104575, "auxiliary_loss_mlp": 0.0104152, "balance_loss_clip": 1.04663455, "balance_loss_mlp": 1.02759588, "epoch": 0.38617165188636704, "flos": 16362697731840.0, "grad_norm": 1.6942063430957965, "language_loss": 0.66644311, "learning_rate": 2.8104242131976025e-06, "loss": 0.687904, "num_input_tokens_seen": 137898335, "step": 6423, "time_per_iteration": 2.6189329624176025 }, { "auxiliary_loss_clip": 0.01128312, "auxiliary_loss_mlp": 0.01042786, "balance_loss_clip": 1.05139875, "balance_loss_mlp": 1.02860618, "epoch": 0.386231775139035, "flos": 34787276893440.0, "grad_norm": 2.1536039728580394, "language_loss": 0.68359423, "learning_rate": 2.810068143123449e-06, "loss": 0.70530522, "num_input_tokens_seen": 137918605, "step": 6424, "time_per_iteration": 2.7609992027282715 }, { "auxiliary_loss_clip": 0.01098796, "auxiliary_loss_mlp": 0.01038309, "balance_loss_clip": 1.04750848, "balance_loss_mlp": 1.02387285, "epoch": 0.38629189839170297, "flos": 21726171619200.0, "grad_norm": 1.4481478329406698, "language_loss": 0.72367114, "learning_rate": 2.809712042331429e-06, "loss": 0.7450422, "num_input_tokens_seen": 137938245, "step": 6425, "time_per_iteration": 2.7069387435913086 }, { "auxiliary_loss_clip": 0.01099551, "auxiliary_loss_mlp": 0.00773141, "balance_loss_clip": 1.0428803, "balance_loss_mlp": 1.00013173, "epoch": 0.38635202164437094, "flos": 27923634460800.0, "grad_norm": 2.52438881915832, "language_loss": 0.80258477, "learning_rate": 2.8093559108350484e-06, "loss": 0.82131171, "num_input_tokens_seen": 137956770, "step": 6426, "time_per_iteration": 2.8976056575775146 }, { "auxiliary_loss_clip": 0.01125602, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.04929447, "balance_loss_mlp": 1.02013016, "epoch": 0.38641214489703896, "flos": 23586631534080.0, "grad_norm": 2.2578291383073825, "language_loss": 0.7536087, "learning_rate": 2.80899974864781e-06, "loss": 0.77521622, "num_input_tokens_seen": 137977040, "step": 6427, "time_per_iteration": 2.7281436920166016 }, { "auxiliary_loss_clip": 0.01075932, "auxiliary_loss_mlp": 0.01057335, "balance_loss_clip": 1.04142189, "balance_loss_mlp": 1.04013276, "epoch": 0.3864722681497069, "flos": 12641239198080.0, "grad_norm": 2.0875975256988055, "language_loss": 0.69435054, "learning_rate": 2.8086435557832203e-06, "loss": 0.71568322, "num_input_tokens_seen": 137993545, "step": 6428, "time_per_iteration": 2.7289116382598877 }, { "auxiliary_loss_clip": 0.01113154, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.04947257, "balance_loss_mlp": 1.02729535, "epoch": 0.3865323914023749, "flos": 17598922162560.0, "grad_norm": 2.847119477349317, "language_loss": 0.8444519, "learning_rate": 2.8082873322547863e-06, "loss": 0.86600363, "num_input_tokens_seen": 138010140, "step": 6429, "time_per_iteration": 2.7385170459747314 }, { "auxiliary_loss_clip": 0.01110797, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.04555535, "balance_loss_mlp": 1.02423429, "epoch": 0.38659251465504285, "flos": 18478949374080.0, "grad_norm": 2.174010980525696, "language_loss": 0.80673695, "learning_rate": 2.807931078076015e-06, "loss": 0.82823092, "num_input_tokens_seen": 138028880, "step": 6430, "time_per_iteration": 2.660228967666626 }, { "auxiliary_loss_clip": 0.0102628, "auxiliary_loss_mlp": 0.01015101, "balance_loss_clip": 1.02508974, "balance_loss_mlp": 1.01382565, "epoch": 0.3866526379077108, "flos": 64165726978560.0, "grad_norm": 0.719429045650031, "language_loss": 0.58803207, "learning_rate": 2.807574793260416e-06, "loss": 0.60844588, "num_input_tokens_seen": 138098090, "step": 6431, "time_per_iteration": 3.2772469520568848 }, { "auxiliary_loss_clip": 0.01086398, "auxiliary_loss_mlp": 0.01039293, "balance_loss_clip": 1.04541588, "balance_loss_mlp": 1.02296114, "epoch": 0.3867127611603788, "flos": 14388292897920.0, "grad_norm": 2.1660589654497424, "language_loss": 0.79041815, "learning_rate": 2.8072184778215004e-06, "loss": 0.81167507, "num_input_tokens_seen": 138114735, "step": 6432, "time_per_iteration": 2.7949061393737793 }, { "auxiliary_loss_clip": 0.01125593, "auxiliary_loss_mlp": 0.01048624, "balance_loss_clip": 1.04708362, "balance_loss_mlp": 1.03231645, "epoch": 0.38677288441304675, "flos": 20010754823040.0, "grad_norm": 2.0695366497364294, "language_loss": 0.80186564, "learning_rate": 2.806862131772779e-06, "loss": 0.82360786, "num_input_tokens_seen": 138130480, "step": 6433, "time_per_iteration": 2.6526312828063965 }, { "auxiliary_loss_clip": 0.01111087, "auxiliary_loss_mlp": 0.01037922, "balance_loss_clip": 1.04934025, "balance_loss_mlp": 1.02162611, "epoch": 0.3868330076657147, "flos": 22236893147520.0, "grad_norm": 1.6267030007711512, "language_loss": 0.70441496, "learning_rate": 2.806505755127765e-06, "loss": 0.72590506, "num_input_tokens_seen": 138150640, "step": 6434, "time_per_iteration": 2.6985394954681396 }, { "auxiliary_loss_clip": 0.01097728, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.04536152, "balance_loss_mlp": 1.03008235, "epoch": 0.3868931309183827, "flos": 16727442387840.0, "grad_norm": 1.7348790517482282, "language_loss": 0.77462173, "learning_rate": 2.806149347899972e-06, "loss": 0.79607308, "num_input_tokens_seen": 138169700, "step": 6435, "time_per_iteration": 2.7326719760894775 }, { "auxiliary_loss_clip": 0.01119609, "auxiliary_loss_mlp": 0.01035834, "balance_loss_clip": 1.04651809, "balance_loss_mlp": 1.0208497, "epoch": 0.38695325417105064, "flos": 22674716023680.0, "grad_norm": 2.3575842278582813, "language_loss": 0.79599082, "learning_rate": 2.805792910102915e-06, "loss": 0.81754529, "num_input_tokens_seen": 138185835, "step": 6436, "time_per_iteration": 2.6643154621124268 }, { "auxiliary_loss_clip": 0.01107099, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.04809546, "balance_loss_mlp": 1.0215621, "epoch": 0.3870133774237186, "flos": 23112036109440.0, "grad_norm": 1.9038851888933561, "language_loss": 0.76043606, "learning_rate": 2.8054364417501093e-06, "loss": 0.78187203, "num_input_tokens_seen": 138204080, "step": 6437, "time_per_iteration": 2.701834201812744 }, { "auxiliary_loss_clip": 0.01110073, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.04696321, "balance_loss_mlp": 1.02374589, "epoch": 0.3870735006763866, "flos": 17675699483520.0, "grad_norm": 2.022501790448194, "language_loss": 0.81817484, "learning_rate": 2.805079942855074e-06, "loss": 0.8396467, "num_input_tokens_seen": 138220710, "step": 6438, "time_per_iteration": 4.327820539474487 }, { "auxiliary_loss_clip": 0.01111326, "auxiliary_loss_mlp": 0.0077319, "balance_loss_clip": 1.04504764, "balance_loss_mlp": 1.00027561, "epoch": 0.38713362392905454, "flos": 23295791111040.0, "grad_norm": 1.7517226143139228, "language_loss": 0.75388491, "learning_rate": 2.804723413431326e-06, "loss": 0.77273011, "num_input_tokens_seen": 138241720, "step": 6439, "time_per_iteration": 2.797830104827881 }, { "auxiliary_loss_clip": 0.01131277, "auxiliary_loss_mlp": 0.01037901, "balance_loss_clip": 1.04915833, "balance_loss_mlp": 1.0235002, "epoch": 0.38719374718172256, "flos": 21031192298880.0, "grad_norm": 1.7565856090832077, "language_loss": 0.74071443, "learning_rate": 2.8043668534923855e-06, "loss": 0.76240611, "num_input_tokens_seen": 138261885, "step": 6440, "time_per_iteration": 4.2160422801971436 }, { "auxiliary_loss_clip": 0.01125111, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.04927301, "balance_loss_mlp": 1.01949763, "epoch": 0.3872538704343905, "flos": 19609776322560.0, "grad_norm": 2.101028456947384, "language_loss": 0.82017142, "learning_rate": 2.804010263051774e-06, "loss": 0.84177244, "num_input_tokens_seen": 138280255, "step": 6441, "time_per_iteration": 4.199851036071777 }, { "auxiliary_loss_clip": 0.0113476, "auxiliary_loss_mlp": 0.01039285, "balance_loss_clip": 1.05011272, "balance_loss_mlp": 1.02490842, "epoch": 0.3873139936870585, "flos": 17530045833600.0, "grad_norm": 2.8802239922493147, "language_loss": 0.80824792, "learning_rate": 2.8036536421230118e-06, "loss": 0.82998842, "num_input_tokens_seen": 138296675, "step": 6442, "time_per_iteration": 2.6942524909973145 }, { "auxiliary_loss_clip": 0.01090073, "auxiliary_loss_mlp": 0.01032275, "balance_loss_clip": 1.04431343, "balance_loss_mlp": 1.01747537, "epoch": 0.38737411693972645, "flos": 17786555832960.0, "grad_norm": 2.1593394156288044, "language_loss": 0.84054118, "learning_rate": 2.803296990719624e-06, "loss": 0.86176467, "num_input_tokens_seen": 138314985, "step": 6443, "time_per_iteration": 2.6660094261169434 }, { "auxiliary_loss_clip": 0.01033878, "auxiliary_loss_mlp": 0.01000185, "balance_loss_clip": 1.02513885, "balance_loss_mlp": 0.99879646, "epoch": 0.3874342401923944, "flos": 58304637048960.0, "grad_norm": 0.7605185654135588, "language_loss": 0.50208193, "learning_rate": 2.8029403088551327e-06, "loss": 0.52242255, "num_input_tokens_seen": 138373275, "step": 6444, "time_per_iteration": 4.807433128356934 }, { "auxiliary_loss_clip": 0.01086333, "auxiliary_loss_mlp": 0.00773648, "balance_loss_clip": 1.04187298, "balance_loss_mlp": 1.00033963, "epoch": 0.3874943634450624, "flos": 17711933328000.0, "grad_norm": 1.4666177781563792, "language_loss": 0.78874767, "learning_rate": 2.802583596543065e-06, "loss": 0.80734754, "num_input_tokens_seen": 138391145, "step": 6445, "time_per_iteration": 2.689142942428589 }, { "auxiliary_loss_clip": 0.0111426, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.04754841, "balance_loss_mlp": 1.02445602, "epoch": 0.38755448669773035, "flos": 19244852098560.0, "grad_norm": 2.4274750437973958, "language_loss": 0.81207073, "learning_rate": 2.8022268537969474e-06, "loss": 0.83360916, "num_input_tokens_seen": 138409875, "step": 6446, "time_per_iteration": 2.6582860946655273 }, { "auxiliary_loss_clip": 0.01107394, "auxiliary_loss_mlp": 0.01037275, "balance_loss_clip": 1.04530001, "balance_loss_mlp": 1.02277923, "epoch": 0.3876146099503983, "flos": 20594267262720.0, "grad_norm": 3.0137556994939887, "language_loss": 0.77366996, "learning_rate": 2.801870080630306e-06, "loss": 0.79511666, "num_input_tokens_seen": 138428965, "step": 6447, "time_per_iteration": 2.727285146713257 }, { "auxiliary_loss_clip": 0.01108854, "auxiliary_loss_mlp": 0.01037807, "balance_loss_clip": 1.04590762, "balance_loss_mlp": 1.02378821, "epoch": 0.3876747332030663, "flos": 19281121856640.0, "grad_norm": 2.4450461903172562, "language_loss": 0.76364803, "learning_rate": 2.801513277056671e-06, "loss": 0.78511459, "num_input_tokens_seen": 138448090, "step": 6448, "time_per_iteration": 2.663989543914795 }, { "auxiliary_loss_clip": 0.01102873, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.04449654, "balance_loss_mlp": 1.02322626, "epoch": 0.38773485645573424, "flos": 18945895201920.0, "grad_norm": 1.6490971101368535, "language_loss": 0.76146352, "learning_rate": 2.8011564430895725e-06, "loss": 0.7828809, "num_input_tokens_seen": 138466105, "step": 6449, "time_per_iteration": 2.806537628173828 }, { "auxiliary_loss_clip": 0.01098531, "auxiliary_loss_mlp": 0.00772575, "balance_loss_clip": 1.04406381, "balance_loss_mlp": 1.00027394, "epoch": 0.3877949797084022, "flos": 23071348978560.0, "grad_norm": 2.0995234377866985, "language_loss": 0.78572172, "learning_rate": 2.800799578742542e-06, "loss": 0.80443275, "num_input_tokens_seen": 138485160, "step": 6450, "time_per_iteration": 2.7541351318359375 }, { "auxiliary_loss_clip": 0.01137663, "auxiliary_loss_mlp": 0.01039948, "balance_loss_clip": 1.04827702, "balance_loss_mlp": 1.02452803, "epoch": 0.3878551029610702, "flos": 29095543589760.0, "grad_norm": 2.5655640440870946, "language_loss": 0.78046334, "learning_rate": 2.8004426840291106e-06, "loss": 0.80223942, "num_input_tokens_seen": 138504135, "step": 6451, "time_per_iteration": 2.6868700981140137 }, { "auxiliary_loss_clip": 0.01126689, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.04576159, "balance_loss_mlp": 1.01696229, "epoch": 0.38791522621373814, "flos": 20996394998400.0, "grad_norm": 2.633183178462793, "language_loss": 0.76404589, "learning_rate": 2.800085758962812e-06, "loss": 0.78562915, "num_input_tokens_seen": 138523955, "step": 6452, "time_per_iteration": 2.708750009536743 }, { "auxiliary_loss_clip": 0.01103834, "auxiliary_loss_mlp": 0.01042785, "balance_loss_clip": 1.04665875, "balance_loss_mlp": 1.0285815, "epoch": 0.3879753494664061, "flos": 15486836497920.0, "grad_norm": 1.5878969811553463, "language_loss": 0.79534453, "learning_rate": 2.799728803557182e-06, "loss": 0.81681073, "num_input_tokens_seen": 138541655, "step": 6453, "time_per_iteration": 2.7226593494415283 }, { "auxiliary_loss_clip": 0.0112782, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.04889584, "balance_loss_mlp": 1.02560616, "epoch": 0.3880354727190741, "flos": 22053964158720.0, "grad_norm": 19.957823861770734, "language_loss": 0.71643323, "learning_rate": 2.7993718178257555e-06, "loss": 0.73812103, "num_input_tokens_seen": 138560860, "step": 6454, "time_per_iteration": 2.7265548706054688 }, { "auxiliary_loss_clip": 0.01137183, "auxiliary_loss_mlp": 0.01043076, "balance_loss_clip": 1.04976404, "balance_loss_mlp": 1.02693522, "epoch": 0.3880955959717421, "flos": 20340307128960.0, "grad_norm": 2.029110970619929, "language_loss": 0.77489239, "learning_rate": 2.7990148017820694e-06, "loss": 0.79669499, "num_input_tokens_seen": 138580200, "step": 6455, "time_per_iteration": 2.7688205242156982 }, { "auxiliary_loss_clip": 0.01131496, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.04975748, "balance_loss_mlp": 1.02897501, "epoch": 0.38815571922441006, "flos": 23075407215360.0, "grad_norm": 1.8133016626985128, "language_loss": 0.76193333, "learning_rate": 2.798657755439662e-06, "loss": 0.78368604, "num_input_tokens_seen": 138598315, "step": 6456, "time_per_iteration": 2.6894283294677734 }, { "auxiliary_loss_clip": 0.01059894, "auxiliary_loss_mlp": 0.01038862, "balance_loss_clip": 1.04251969, "balance_loss_mlp": 1.02365136, "epoch": 0.388215842477078, "flos": 20776944856320.0, "grad_norm": 9.416859416659493, "language_loss": 0.59422505, "learning_rate": 2.7983006788120726e-06, "loss": 0.61521268, "num_input_tokens_seen": 138615695, "step": 6457, "time_per_iteration": 2.8189444541931152 }, { "auxiliary_loss_clip": 0.01136561, "auxiliary_loss_mlp": 0.01039144, "balance_loss_clip": 1.04989612, "balance_loss_mlp": 1.02262187, "epoch": 0.388275965729746, "flos": 20448182649600.0, "grad_norm": 2.336997181985419, "language_loss": 0.79927063, "learning_rate": 2.797943571912841e-06, "loss": 0.82102776, "num_input_tokens_seen": 138633180, "step": 6458, "time_per_iteration": 2.66198992729187 }, { "auxiliary_loss_clip": 0.01081764, "auxiliary_loss_mlp": 0.0104529, "balance_loss_clip": 1.04428816, "balance_loss_mlp": 1.02855277, "epoch": 0.38833608898241395, "flos": 27892392606720.0, "grad_norm": 2.218973373394608, "language_loss": 0.81497735, "learning_rate": 2.797586434755509e-06, "loss": 0.83624792, "num_input_tokens_seen": 138654785, "step": 6459, "time_per_iteration": 2.780120611190796 }, { "auxiliary_loss_clip": 0.01105714, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.04633725, "balance_loss_mlp": 1.0236907, "epoch": 0.3883962122350819, "flos": 18076390675200.0, "grad_norm": 1.942341955564712, "language_loss": 0.62001127, "learning_rate": 2.7972292673536202e-06, "loss": 0.64144087, "num_input_tokens_seen": 138673330, "step": 6460, "time_per_iteration": 2.625399112701416 }, { "auxiliary_loss_clip": 0.01120569, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.04955411, "balance_loss_mlp": 1.01920033, "epoch": 0.3884563354877499, "flos": 23622254847360.0, "grad_norm": 1.928823237011181, "language_loss": 0.86226058, "learning_rate": 2.796872069720717e-06, "loss": 0.88379019, "num_input_tokens_seen": 138694185, "step": 6461, "time_per_iteration": 2.6901583671569824 }, { "auxiliary_loss_clip": 0.0111976, "auxiliary_loss_mlp": 0.01038779, "balance_loss_clip": 1.04810238, "balance_loss_mlp": 1.0244205, "epoch": 0.38851645874041785, "flos": 27453528236160.0, "grad_norm": 4.963760229824091, "language_loss": 0.70659202, "learning_rate": 2.7965148418703456e-06, "loss": 0.72817743, "num_input_tokens_seen": 138714625, "step": 6462, "time_per_iteration": 2.7463371753692627 }, { "auxiliary_loss_clip": 0.01086013, "auxiliary_loss_mlp": 0.01043745, "balance_loss_clip": 1.04045033, "balance_loss_mlp": 1.02786636, "epoch": 0.3885765819930858, "flos": 25228072270080.0, "grad_norm": 2.747031306466439, "language_loss": 0.76228201, "learning_rate": 2.796157583816052e-06, "loss": 0.78357965, "num_input_tokens_seen": 138733585, "step": 6463, "time_per_iteration": 2.7231578826904297 }, { "auxiliary_loss_clip": 0.01103201, "auxiliary_loss_mlp": 0.0104459, "balance_loss_clip": 1.05013013, "balance_loss_mlp": 1.02841353, "epoch": 0.3886367052457538, "flos": 16946605221120.0, "grad_norm": 3.605418601568306, "language_loss": 0.70244539, "learning_rate": 2.795800295571382e-06, "loss": 0.72392333, "num_input_tokens_seen": 138752335, "step": 6464, "time_per_iteration": 2.773066759109497 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.04950786, "balance_loss_mlp": 1.02211452, "epoch": 0.38869682849842174, "flos": 27154140376320.0, "grad_norm": 2.8184770761764777, "language_loss": 0.69632983, "learning_rate": 2.7954429771498858e-06, "loss": 0.71780872, "num_input_tokens_seen": 138768450, "step": 6465, "time_per_iteration": 2.7013487815856934 }, { "auxiliary_loss_clip": 0.01097351, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.04837847, "balance_loss_mlp": 1.02645373, "epoch": 0.3887569517510897, "flos": 21063619301760.0, "grad_norm": 2.665243237814177, "language_loss": 0.78489739, "learning_rate": 2.7950856285651117e-06, "loss": 0.80628836, "num_input_tokens_seen": 138786775, "step": 6466, "time_per_iteration": 2.736819267272949 }, { "auxiliary_loss_clip": 0.01095374, "auxiliary_loss_mlp": 0.01037568, "balance_loss_clip": 1.0463171, "balance_loss_mlp": 1.02242851, "epoch": 0.38881707500375773, "flos": 29497384016640.0, "grad_norm": 1.6522613533538497, "language_loss": 0.69341898, "learning_rate": 2.794728249830611e-06, "loss": 0.71474838, "num_input_tokens_seen": 138810100, "step": 6467, "time_per_iteration": 2.778083324432373 }, { "auxiliary_loss_clip": 0.01098114, "auxiliary_loss_mlp": 0.01048152, "balance_loss_clip": 1.04706931, "balance_loss_mlp": 1.0326246, "epoch": 0.3888771982564257, "flos": 17488281294720.0, "grad_norm": 3.2276382920067817, "language_loss": 0.84199375, "learning_rate": 2.794370840959936e-06, "loss": 0.86345637, "num_input_tokens_seen": 138825140, "step": 6468, "time_per_iteration": 2.6842098236083984 }, { "auxiliary_loss_clip": 0.01108569, "auxiliary_loss_mlp": 0.01036235, "balance_loss_clip": 1.048172, "balance_loss_mlp": 1.0227766, "epoch": 0.38893732150909366, "flos": 21942425450880.0, "grad_norm": 1.8219377355536144, "language_loss": 0.84232908, "learning_rate": 2.7940134019666383e-06, "loss": 0.86377716, "num_input_tokens_seen": 138844115, "step": 6469, "time_per_iteration": 2.7538135051727295 }, { "auxiliary_loss_clip": 0.0109067, "auxiliary_loss_mlp": 0.01048288, "balance_loss_clip": 1.04416847, "balance_loss_mlp": 1.03205132, "epoch": 0.3889974447617616, "flos": 24276367468800.0, "grad_norm": 2.339210402911935, "language_loss": 0.75173676, "learning_rate": 2.793655932864273e-06, "loss": 0.7731263, "num_input_tokens_seen": 138860860, "step": 6470, "time_per_iteration": 2.7425949573516846 }, { "auxiliary_loss_clip": 0.01095528, "auxiliary_loss_mlp": 0.00772188, "balance_loss_clip": 1.0480423, "balance_loss_mlp": 1.00016475, "epoch": 0.3890575680144296, "flos": 25667116208640.0, "grad_norm": 1.5943716760052937, "language_loss": 0.74977577, "learning_rate": 2.7932984336663953e-06, "loss": 0.76845288, "num_input_tokens_seen": 138881910, "step": 6471, "time_per_iteration": 2.8880369663238525 }, { "auxiliary_loss_clip": 0.01077518, "auxiliary_loss_mlp": 0.01049277, "balance_loss_clip": 1.03879571, "balance_loss_mlp": 1.03336215, "epoch": 0.38911769126709755, "flos": 22855274714880.0, "grad_norm": 2.421548050110463, "language_loss": 0.67984551, "learning_rate": 2.792940904386562e-06, "loss": 0.70111346, "num_input_tokens_seen": 138900975, "step": 6472, "time_per_iteration": 2.7776875495910645 }, { "auxiliary_loss_clip": 0.01103596, "auxiliary_loss_mlp": 0.01043152, "balance_loss_clip": 1.04819107, "balance_loss_mlp": 1.02974129, "epoch": 0.3891778145197655, "flos": 25447522412160.0, "grad_norm": 1.8102352941433608, "language_loss": 0.76068687, "learning_rate": 2.7925833450383293e-06, "loss": 0.78215432, "num_input_tokens_seen": 138920795, "step": 6473, "time_per_iteration": 2.7568469047546387 }, { "auxiliary_loss_clip": 0.01113975, "auxiliary_loss_mlp": 0.01046096, "balance_loss_clip": 1.05217087, "balance_loss_mlp": 1.03031242, "epoch": 0.3892379377724335, "flos": 14027965614720.0, "grad_norm": 2.045216735434868, "language_loss": 0.70959115, "learning_rate": 2.792225755635257e-06, "loss": 0.73119187, "num_input_tokens_seen": 138938770, "step": 6474, "time_per_iteration": 2.6930696964263916 }, { "auxiliary_loss_clip": 0.01135028, "auxiliary_loss_mlp": 0.01042055, "balance_loss_clip": 1.05145836, "balance_loss_mlp": 1.02861369, "epoch": 0.38929806102510145, "flos": 20157449967360.0, "grad_norm": 1.5519949793695216, "language_loss": 0.69049072, "learning_rate": 2.7918681361909046e-06, "loss": 0.71226156, "num_input_tokens_seen": 138958880, "step": 6475, "time_per_iteration": 2.670830011367798 }, { "auxiliary_loss_clip": 0.01110637, "auxiliary_loss_mlp": 0.01057592, "balance_loss_clip": 1.04578567, "balance_loss_mlp": 1.03981757, "epoch": 0.3893581842777694, "flos": 22163958581760.0, "grad_norm": 1.9596553320764234, "language_loss": 0.75820196, "learning_rate": 2.7915104867188332e-06, "loss": 0.77988434, "num_input_tokens_seen": 138977240, "step": 6476, "time_per_iteration": 2.683980941772461 }, { "auxiliary_loss_clip": 0.01039888, "auxiliary_loss_mlp": 0.01002183, "balance_loss_clip": 1.02862918, "balance_loss_mlp": 1.00084782, "epoch": 0.3894183075304374, "flos": 67301877392640.0, "grad_norm": 0.7759740468574157, "language_loss": 0.58146399, "learning_rate": 2.7911528072326055e-06, "loss": 0.60188472, "num_input_tokens_seen": 139039035, "step": 6477, "time_per_iteration": 3.2430496215820312 }, { "auxiliary_loss_clip": 0.01092497, "auxiliary_loss_mlp": 0.01040603, "balance_loss_clip": 1.04780793, "balance_loss_mlp": 1.02428961, "epoch": 0.38947843078310534, "flos": 18547502480640.0, "grad_norm": 1.9073891309950948, "language_loss": 0.78554142, "learning_rate": 2.7907950977457832e-06, "loss": 0.80687243, "num_input_tokens_seen": 139055560, "step": 6478, "time_per_iteration": 4.241156339645386 }, { "auxiliary_loss_clip": 0.01116081, "auxiliary_loss_mlp": 0.0103975, "balance_loss_clip": 1.04505491, "balance_loss_mlp": 1.02545047, "epoch": 0.3895385540357733, "flos": 14605875532800.0, "grad_norm": 2.6992371438810783, "language_loss": 0.82647753, "learning_rate": 2.7904373582719317e-06, "loss": 0.84803581, "num_input_tokens_seen": 139071865, "step": 6479, "time_per_iteration": 4.1569294929504395 }, { "auxiliary_loss_clip": 0.01131381, "auxiliary_loss_mlp": 0.01036344, "balance_loss_clip": 1.04886651, "balance_loss_mlp": 1.02161551, "epoch": 0.38959867728844133, "flos": 19975203336960.0, "grad_norm": 2.334048099077096, "language_loss": 0.79657412, "learning_rate": 2.790079588824617e-06, "loss": 0.81825137, "num_input_tokens_seen": 139089640, "step": 6480, "time_per_iteration": 4.170635938644409 }, { "auxiliary_loss_clip": 0.0110471, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.04561472, "balance_loss_mlp": 1.01822066, "epoch": 0.3896588005411093, "flos": 22672130244480.0, "grad_norm": 6.364109786330533, "language_loss": 0.83021134, "learning_rate": 2.7897217894174038e-06, "loss": 0.85158312, "num_input_tokens_seen": 139109365, "step": 6481, "time_per_iteration": 2.638821840286255 }, { "auxiliary_loss_clip": 0.01102815, "auxiliary_loss_mlp": 0.01038843, "balance_loss_clip": 1.04740214, "balance_loss_mlp": 1.02503228, "epoch": 0.38971892379377726, "flos": 20996035862400.0, "grad_norm": 1.7002276765936415, "language_loss": 0.75389051, "learning_rate": 2.789363960063863e-06, "loss": 0.77530706, "num_input_tokens_seen": 139128260, "step": 6482, "time_per_iteration": 2.5737624168395996 }, { "auxiliary_loss_clip": 0.01100553, "auxiliary_loss_mlp": 0.01035815, "balance_loss_clip": 1.04781246, "balance_loss_mlp": 1.02164662, "epoch": 0.3897790470464452, "flos": 22528487756160.0, "grad_norm": 2.0703094316503554, "language_loss": 0.78786725, "learning_rate": 2.78900610077756e-06, "loss": 0.80923092, "num_input_tokens_seen": 139147315, "step": 6483, "time_per_iteration": 2.6177117824554443 }, { "auxiliary_loss_clip": 0.01121516, "auxiliary_loss_mlp": 0.01030702, "balance_loss_clip": 1.04790664, "balance_loss_mlp": 1.01487088, "epoch": 0.3898391702991132, "flos": 26209905603840.0, "grad_norm": 1.6677367088018817, "language_loss": 0.79871929, "learning_rate": 2.788648211572067e-06, "loss": 0.82024151, "num_input_tokens_seen": 139167270, "step": 6484, "time_per_iteration": 4.221461534500122 }, { "auxiliary_loss_clip": 0.01119394, "auxiliary_loss_mlp": 0.01051487, "balance_loss_clip": 1.05063844, "balance_loss_mlp": 1.03472662, "epoch": 0.38989929355178116, "flos": 21065558636160.0, "grad_norm": 2.1008000508061104, "language_loss": 0.77901775, "learning_rate": 2.7882902924609557e-06, "loss": 0.80072653, "num_input_tokens_seen": 139185970, "step": 6485, "time_per_iteration": 2.664097785949707 }, { "auxiliary_loss_clip": 0.01085813, "auxiliary_loss_mlp": 0.01036912, "balance_loss_clip": 1.0427084, "balance_loss_mlp": 1.02207613, "epoch": 0.3899594168044491, "flos": 25484115392640.0, "grad_norm": 6.223818029706007, "language_loss": 0.85190272, "learning_rate": 2.7879323434577965e-06, "loss": 0.87312996, "num_input_tokens_seen": 139203730, "step": 6486, "time_per_iteration": 2.8325467109680176 }, { "auxiliary_loss_clip": 0.01111569, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.04786611, "balance_loss_mlp": 1.01883638, "epoch": 0.3900195400571171, "flos": 31139363456640.0, "grad_norm": 2.4250185390770618, "language_loss": 0.85333234, "learning_rate": 2.7875743645761645e-06, "loss": 0.87478197, "num_input_tokens_seen": 139222560, "step": 6487, "time_per_iteration": 2.8390486240386963 }, { "auxiliary_loss_clip": 0.01103222, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.04449213, "balance_loss_mlp": 1.01793766, "epoch": 0.39007966330978505, "flos": 20229917656320.0, "grad_norm": 1.5390409302603854, "language_loss": 0.72954559, "learning_rate": 2.787216355829633e-06, "loss": 0.75090778, "num_input_tokens_seen": 139242165, "step": 6488, "time_per_iteration": 2.7613236904144287 }, { "auxiliary_loss_clip": 0.01096805, "auxiliary_loss_mlp": 0.01044873, "balance_loss_clip": 1.04673266, "balance_loss_mlp": 1.02771914, "epoch": 0.390139786562453, "flos": 22528739151360.0, "grad_norm": 2.6420160637986383, "language_loss": 0.68467176, "learning_rate": 2.786858317231779e-06, "loss": 0.70608854, "num_input_tokens_seen": 139262525, "step": 6489, "time_per_iteration": 2.746307849884033 }, { "auxiliary_loss_clip": 0.01108111, "auxiliary_loss_mlp": 0.01041602, "balance_loss_clip": 1.04793715, "balance_loss_mlp": 1.02673674, "epoch": 0.390199909815121, "flos": 26432911192320.0, "grad_norm": 1.6912118236512272, "language_loss": 0.80629271, "learning_rate": 2.7865002487961788e-06, "loss": 0.82778984, "num_input_tokens_seen": 139282835, "step": 6490, "time_per_iteration": 2.7116847038269043 }, { "auxiliary_loss_clip": 0.01124963, "auxiliary_loss_mlp": 0.01033045, "balance_loss_clip": 1.04856181, "balance_loss_mlp": 1.0187161, "epoch": 0.39026003306778895, "flos": 17274577328640.0, "grad_norm": 3.073568327903315, "language_loss": 0.89115125, "learning_rate": 2.7861421505364104e-06, "loss": 0.91273135, "num_input_tokens_seen": 139299490, "step": 6491, "time_per_iteration": 2.6211190223693848 }, { "auxiliary_loss_clip": 0.01092029, "auxiliary_loss_mlp": 0.01045074, "balance_loss_clip": 1.04406416, "balance_loss_mlp": 1.02952874, "epoch": 0.3903201563204569, "flos": 24532841554560.0, "grad_norm": 1.8064559635296407, "language_loss": 0.78637981, "learning_rate": 2.7857840224660523e-06, "loss": 0.80775088, "num_input_tokens_seen": 139317865, "step": 6492, "time_per_iteration": 2.7505667209625244 }, { "auxiliary_loss_clip": 0.01108778, "auxiliary_loss_mlp": 0.01041967, "balance_loss_clip": 1.04486537, "balance_loss_mlp": 1.02735257, "epoch": 0.39038027957312493, "flos": 23767944410880.0, "grad_norm": 1.7227367696506604, "language_loss": 0.74431908, "learning_rate": 2.7854258645986857e-06, "loss": 0.76582652, "num_input_tokens_seen": 139339840, "step": 6493, "time_per_iteration": 2.7200233936309814 }, { "auxiliary_loss_clip": 0.01091358, "auxiliary_loss_mlp": 0.01040258, "balance_loss_clip": 1.04613161, "balance_loss_mlp": 1.02549398, "epoch": 0.3904404028257929, "flos": 14100612871680.0, "grad_norm": 2.9656676182999395, "language_loss": 0.7637316, "learning_rate": 2.7850676769478916e-06, "loss": 0.78504777, "num_input_tokens_seen": 139357555, "step": 6494, "time_per_iteration": 2.6818442344665527 }, { "auxiliary_loss_clip": 0.01131498, "auxiliary_loss_mlp": 0.01048378, "balance_loss_clip": 1.0500524, "balance_loss_mlp": 1.03182006, "epoch": 0.39050052607846086, "flos": 16910048154240.0, "grad_norm": 2.1152980497113782, "language_loss": 0.74208486, "learning_rate": 2.7847094595272525e-06, "loss": 0.76388359, "num_input_tokens_seen": 139374455, "step": 6495, "time_per_iteration": 2.6432337760925293 }, { "auxiliary_loss_clip": 0.01137243, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.05153751, "balance_loss_mlp": 1.02913451, "epoch": 0.39056064933112883, "flos": 25915761129600.0, "grad_norm": 2.402575660392066, "language_loss": 0.67757058, "learning_rate": 2.784351212350352e-06, "loss": 0.69939756, "num_input_tokens_seen": 139394770, "step": 6496, "time_per_iteration": 2.762009859085083 }, { "auxiliary_loss_clip": 0.01023856, "auxiliary_loss_mlp": 0.01010625, "balance_loss_clip": 1.02393842, "balance_loss_mlp": 1.00925446, "epoch": 0.3906207725837968, "flos": 60028421713920.0, "grad_norm": 0.6655460592599327, "language_loss": 0.53920811, "learning_rate": 2.783992935430775e-06, "loss": 0.55955297, "num_input_tokens_seen": 139454760, "step": 6497, "time_per_iteration": 3.351006507873535 }, { "auxiliary_loss_clip": 0.01094838, "auxiliary_loss_mlp": 0.00772151, "balance_loss_clip": 1.0476501, "balance_loss_mlp": 1.00038421, "epoch": 0.39068089583646476, "flos": 21068683119360.0, "grad_norm": 2.7558428999232847, "language_loss": 0.6865977, "learning_rate": 2.7836346287821068e-06, "loss": 0.70526755, "num_input_tokens_seen": 139472645, "step": 6498, "time_per_iteration": 2.7838692665100098 }, { "auxiliary_loss_clip": 0.01022021, "auxiliary_loss_mlp": 0.01009741, "balance_loss_clip": 1.02064919, "balance_loss_mlp": 1.00839996, "epoch": 0.3907410190891327, "flos": 70445677403520.0, "grad_norm": 0.7248596102007157, "language_loss": 0.51767612, "learning_rate": 2.783276292417936e-06, "loss": 0.53799379, "num_input_tokens_seen": 139536730, "step": 6499, "time_per_iteration": 3.2980377674102783 }, { "auxiliary_loss_clip": 0.01122618, "auxiliary_loss_mlp": 0.01044387, "balance_loss_clip": 1.04676056, "balance_loss_mlp": 1.02793658, "epoch": 0.3908011423418007, "flos": 27962454084480.0, "grad_norm": 1.973185164339423, "language_loss": 0.73842579, "learning_rate": 2.7829179263518487e-06, "loss": 0.76009583, "num_input_tokens_seen": 139557540, "step": 6500, "time_per_iteration": 2.7239198684692383 }, { "auxiliary_loss_clip": 0.01125366, "auxiliary_loss_mlp": 0.01037256, "balance_loss_clip": 1.05035591, "balance_loss_mlp": 1.02246249, "epoch": 0.39086126559446865, "flos": 24462097718400.0, "grad_norm": 2.6021512056662814, "language_loss": 0.68837166, "learning_rate": 2.7825595305974354e-06, "loss": 0.70999795, "num_input_tokens_seen": 139576875, "step": 6501, "time_per_iteration": 2.6926429271698 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.04832482, "balance_loss_mlp": 1.02442181, "epoch": 0.3909213888471366, "flos": 16941541403520.0, "grad_norm": 2.1384909443348246, "language_loss": 0.78875881, "learning_rate": 2.782201105168287e-06, "loss": 0.8103711, "num_input_tokens_seen": 139594295, "step": 6502, "time_per_iteration": 2.647021770477295 }, { "auxiliary_loss_clip": 0.01109811, "auxiliary_loss_mlp": 0.01035328, "balance_loss_clip": 1.04876852, "balance_loss_mlp": 1.02171457, "epoch": 0.3909815120998046, "flos": 29278400751360.0, "grad_norm": 3.671996146003432, "language_loss": 0.80537987, "learning_rate": 2.7818426500779932e-06, "loss": 0.82683128, "num_input_tokens_seen": 139614080, "step": 6503, "time_per_iteration": 2.7318384647369385 }, { "auxiliary_loss_clip": 0.0110371, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.04387689, "balance_loss_mlp": 1.01760423, "epoch": 0.39104163535247255, "flos": 18951246328320.0, "grad_norm": 1.848076786389183, "language_loss": 0.71439689, "learning_rate": 2.7814841653401485e-06, "loss": 0.7357465, "num_input_tokens_seen": 139632755, "step": 6504, "time_per_iteration": 2.6983554363250732 }, { "auxiliary_loss_clip": 0.01130195, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.0459981, "balance_loss_mlp": 1.0199374, "epoch": 0.3911017586051405, "flos": 26323347732480.0, "grad_norm": 1.4848516480735832, "language_loss": 0.83245611, "learning_rate": 2.7811256509683454e-06, "loss": 0.8541038, "num_input_tokens_seen": 139654205, "step": 6505, "time_per_iteration": 2.6663267612457275 }, { "auxiliary_loss_clip": 0.01131259, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 1.04880178, "balance_loss_mlp": 1.02123427, "epoch": 0.3911618818578085, "flos": 21835770992640.0, "grad_norm": 1.9330872564568533, "language_loss": 0.71352887, "learning_rate": 2.7807671069761797e-06, "loss": 0.73521107, "num_input_tokens_seen": 139673595, "step": 6506, "time_per_iteration": 2.6168534755706787 }, { "auxiliary_loss_clip": 0.01105925, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.04536867, "balance_loss_mlp": 1.02267289, "epoch": 0.3912220051104765, "flos": 16359680989440.0, "grad_norm": 2.106647299507305, "language_loss": 0.75086504, "learning_rate": 2.7804085333772477e-06, "loss": 0.77228636, "num_input_tokens_seen": 139690565, "step": 6507, "time_per_iteration": 2.8207101821899414 }, { "auxiliary_loss_clip": 0.01053146, "auxiliary_loss_mlp": 0.01002126, "balance_loss_clip": 1.02403712, "balance_loss_mlp": 1.00068331, "epoch": 0.39128212836314447, "flos": 71050986420480.0, "grad_norm": 0.9386901837185221, "language_loss": 0.56488812, "learning_rate": 2.7800499301851446e-06, "loss": 0.58544087, "num_input_tokens_seen": 139749420, "step": 6508, "time_per_iteration": 3.3985793590545654 }, { "auxiliary_loss_clip": 0.01121659, "auxiliary_loss_mlp": 0.01038464, "balance_loss_clip": 1.05045915, "balance_loss_mlp": 1.02476096, "epoch": 0.39134225161581243, "flos": 20331975173760.0, "grad_norm": 2.0207920703954936, "language_loss": 0.76855135, "learning_rate": 2.779691297413471e-06, "loss": 0.79015261, "num_input_tokens_seen": 139766265, "step": 6509, "time_per_iteration": 2.6667048931121826 }, { "auxiliary_loss_clip": 0.01101334, "auxiliary_loss_mlp": 0.01043985, "balance_loss_clip": 1.04298568, "balance_loss_mlp": 1.02731967, "epoch": 0.3914023748684804, "flos": 17018390551680.0, "grad_norm": 5.905968065437354, "language_loss": 0.82739937, "learning_rate": 2.779332635075825e-06, "loss": 0.84885252, "num_input_tokens_seen": 139782400, "step": 6510, "time_per_iteration": 2.933931589126587 }, { "auxiliary_loss_clip": 0.0112259, "auxiliary_loss_mlp": 0.01038677, "balance_loss_clip": 1.04712081, "balance_loss_mlp": 1.02406788, "epoch": 0.39146249812114836, "flos": 18405224709120.0, "grad_norm": 5.781106582003857, "language_loss": 0.76999253, "learning_rate": 2.7789739431858073e-06, "loss": 0.79160517, "num_input_tokens_seen": 139801435, "step": 6511, "time_per_iteration": 2.6926233768463135 }, { "auxiliary_loss_clip": 0.01035867, "auxiliary_loss_mlp": 0.01006458, "balance_loss_clip": 1.02583003, "balance_loss_mlp": 1.00515223, "epoch": 0.3915226213738163, "flos": 67637355442560.0, "grad_norm": 0.716551875912138, "language_loss": 0.57749176, "learning_rate": 2.7786152217570196e-06, "loss": 0.59791505, "num_input_tokens_seen": 139869700, "step": 6512, "time_per_iteration": 3.3695731163024902 }, { "auxiliary_loss_clip": 0.01135844, "auxiliary_loss_mlp": 0.01035397, "balance_loss_clip": 1.05013657, "balance_loss_mlp": 1.02001858, "epoch": 0.3915827446264843, "flos": 26359330181760.0, "grad_norm": 1.8014676974175234, "language_loss": 0.69625974, "learning_rate": 2.7782564708030647e-06, "loss": 0.71797216, "num_input_tokens_seen": 139890140, "step": 6513, "time_per_iteration": 2.8037526607513428 }, { "auxiliary_loss_clip": 0.01095461, "auxiliary_loss_mlp": 0.01038913, "balance_loss_clip": 1.04791474, "balance_loss_mlp": 1.02376771, "epoch": 0.39164286787915226, "flos": 21943897908480.0, "grad_norm": 8.577901504868834, "language_loss": 0.75566119, "learning_rate": 2.7778976903375464e-06, "loss": 0.77700496, "num_input_tokens_seen": 139908020, "step": 6514, "time_per_iteration": 2.8419485092163086 }, { "auxiliary_loss_clip": 0.01094835, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.04639578, "balance_loss_mlp": 1.02636766, "epoch": 0.3917029911318202, "flos": 16399829416320.0, "grad_norm": 2.188170768945522, "language_loss": 0.77334291, "learning_rate": 2.7775388803740693e-06, "loss": 0.79469454, "num_input_tokens_seen": 139926180, "step": 6515, "time_per_iteration": 2.7894155979156494 }, { "auxiliary_loss_clip": 0.01087017, "auxiliary_loss_mlp": 0.0105158, "balance_loss_clip": 1.03979194, "balance_loss_mlp": 1.03763223, "epoch": 0.3917631143844882, "flos": 26211701283840.0, "grad_norm": 1.5088395946363757, "language_loss": 0.79678488, "learning_rate": 2.7771800409262406e-06, "loss": 0.81817091, "num_input_tokens_seen": 139947420, "step": 6516, "time_per_iteration": 2.902660608291626 }, { "auxiliary_loss_clip": 0.01092649, "auxiliary_loss_mlp": 0.01042434, "balance_loss_clip": 1.04691982, "balance_loss_mlp": 1.02799749, "epoch": 0.39182323763715615, "flos": 18548364407040.0, "grad_norm": 1.9539461980584907, "language_loss": 0.70539331, "learning_rate": 2.7768211720076665e-06, "loss": 0.72674412, "num_input_tokens_seen": 139965800, "step": 6517, "time_per_iteration": 4.275412082672119 }, { "auxiliary_loss_clip": 0.0108795, "auxiliary_loss_mlp": 0.01045392, "balance_loss_clip": 1.04107618, "balance_loss_mlp": 1.03034759, "epoch": 0.3918833608898241, "flos": 34313543395200.0, "grad_norm": 1.7270068216094292, "language_loss": 0.72215492, "learning_rate": 2.776462273631956e-06, "loss": 0.74348831, "num_input_tokens_seen": 139988140, "step": 6518, "time_per_iteration": 4.390907287597656 }, { "auxiliary_loss_clip": 0.01124647, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.05179489, "balance_loss_mlp": 1.02679503, "epoch": 0.3919434841424921, "flos": 36939582812160.0, "grad_norm": 1.8265438315676477, "language_loss": 0.61835045, "learning_rate": 2.7761033458127177e-06, "loss": 0.64001071, "num_input_tokens_seen": 140010060, "step": 6519, "time_per_iteration": 4.281017780303955 }, { "auxiliary_loss_clip": 0.01142133, "auxiliary_loss_mlp": 0.01043415, "balance_loss_clip": 1.05199361, "balance_loss_mlp": 1.02807307, "epoch": 0.3920036073951601, "flos": 23508956373120.0, "grad_norm": 2.723028929016538, "language_loss": 0.67084813, "learning_rate": 2.775744388563563e-06, "loss": 0.6927036, "num_input_tokens_seen": 140029400, "step": 6520, "time_per_iteration": 2.6971800327301025 }, { "auxiliary_loss_clip": 0.01130641, "auxiliary_loss_mlp": 0.01040483, "balance_loss_clip": 1.04749501, "balance_loss_mlp": 1.02648759, "epoch": 0.39206373064782807, "flos": 18406086635520.0, "grad_norm": 1.8214273138880266, "language_loss": 0.78716481, "learning_rate": 2.775385401898104e-06, "loss": 0.80887604, "num_input_tokens_seen": 140048940, "step": 6521, "time_per_iteration": 2.69966459274292 }, { "auxiliary_loss_clip": 0.01128458, "auxiliary_loss_mlp": 0.01040156, "balance_loss_clip": 1.05050826, "balance_loss_mlp": 1.02289462, "epoch": 0.39212385390049603, "flos": 12313051608960.0, "grad_norm": 2.9673341059873897, "language_loss": 0.70119011, "learning_rate": 2.775026385829952e-06, "loss": 0.72287625, "num_input_tokens_seen": 140066380, "step": 6522, "time_per_iteration": 2.7100417613983154 }, { "auxiliary_loss_clip": 0.0110971, "auxiliary_loss_mlp": 0.01035612, "balance_loss_clip": 1.0467701, "balance_loss_mlp": 1.02100325, "epoch": 0.392183977153164, "flos": 19719160214400.0, "grad_norm": 2.0481488550445595, "language_loss": 0.76847959, "learning_rate": 2.774667340372722e-06, "loss": 0.78993279, "num_input_tokens_seen": 140085275, "step": 6523, "time_per_iteration": 4.336375713348389 }, { "auxiliary_loss_clip": 0.01111577, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.04617906, "balance_loss_mlp": 1.02597904, "epoch": 0.39224410040583196, "flos": 33144902403840.0, "grad_norm": 2.4064695780458254, "language_loss": 0.62052447, "learning_rate": 2.7743082655400293e-06, "loss": 0.64204991, "num_input_tokens_seen": 140105105, "step": 6524, "time_per_iteration": 2.861999750137329 }, { "auxiliary_loss_clip": 0.0113421, "auxiliary_loss_mlp": 0.01041444, "balance_loss_clip": 1.04792655, "balance_loss_mlp": 1.02591681, "epoch": 0.39230422365849993, "flos": 27782434097280.0, "grad_norm": 3.311294983146634, "language_loss": 0.74027938, "learning_rate": 2.773949161345489e-06, "loss": 0.76203597, "num_input_tokens_seen": 140125645, "step": 6525, "time_per_iteration": 2.6660265922546387 }, { "auxiliary_loss_clip": 0.01111123, "auxiliary_loss_mlp": 0.01038937, "balance_loss_clip": 1.04621911, "balance_loss_mlp": 1.02488267, "epoch": 0.3923643469111679, "flos": 17931634865280.0, "grad_norm": 1.9378599466790423, "language_loss": 0.81101322, "learning_rate": 2.773590027802719e-06, "loss": 0.83251387, "num_input_tokens_seen": 140141925, "step": 6526, "time_per_iteration": 2.6949198246002197 }, { "auxiliary_loss_clip": 0.01122115, "auxiliary_loss_mlp": 0.01043128, "balance_loss_clip": 1.04750228, "balance_loss_mlp": 1.02844119, "epoch": 0.39242447016383586, "flos": 24059539019520.0, "grad_norm": 2.21390394508072, "language_loss": 0.69860446, "learning_rate": 2.7732308649253383e-06, "loss": 0.72025692, "num_input_tokens_seen": 140160965, "step": 6527, "time_per_iteration": 2.648738384246826 }, { "auxiliary_loss_clip": 0.01093845, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.04563034, "balance_loss_mlp": 1.01990485, "epoch": 0.3924845934165038, "flos": 10664069016960.0, "grad_norm": 2.870547931880311, "language_loss": 0.82659566, "learning_rate": 2.772871672726965e-06, "loss": 0.84787941, "num_input_tokens_seen": 140177780, "step": 6528, "time_per_iteration": 2.7436537742614746 }, { "auxiliary_loss_clip": 0.01105744, "auxiliary_loss_mlp": 0.01032675, "balance_loss_clip": 1.04709864, "balance_loss_mlp": 1.01909113, "epoch": 0.3925447166691718, "flos": 31245910174080.0, "grad_norm": 1.7012894335018593, "language_loss": 0.68846285, "learning_rate": 2.7725124512212205e-06, "loss": 0.70984709, "num_input_tokens_seen": 140201660, "step": 6529, "time_per_iteration": 2.7932794094085693 }, { "auxiliary_loss_clip": 0.01112194, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.04500198, "balance_loss_mlp": 1.02043366, "epoch": 0.39260483992183975, "flos": 29415040087680.0, "grad_norm": 2.4176127237752145, "language_loss": 0.80461496, "learning_rate": 2.7721532004217267e-06, "loss": 0.82609558, "num_input_tokens_seen": 140218585, "step": 6530, "time_per_iteration": 2.7094242572784424 }, { "auxiliary_loss_clip": 0.01119536, "auxiliary_loss_mlp": 0.0104093, "balance_loss_clip": 1.04586959, "balance_loss_mlp": 1.0264107, "epoch": 0.3926649631745077, "flos": 22857788666880.0, "grad_norm": 1.6828565274400475, "language_loss": 0.75680822, "learning_rate": 2.7717939203421063e-06, "loss": 0.77841288, "num_input_tokens_seen": 140239905, "step": 6531, "time_per_iteration": 2.7238411903381348 }, { "auxiliary_loss_clip": 0.01058847, "auxiliary_loss_mlp": 0.01008064, "balance_loss_clip": 1.03009987, "balance_loss_mlp": 1.00663972, "epoch": 0.3927250864271757, "flos": 63893881872000.0, "grad_norm": 0.8211432271778524, "language_loss": 0.60317427, "learning_rate": 2.7714346109959822e-06, "loss": 0.62384337, "num_input_tokens_seen": 140293820, "step": 6532, "time_per_iteration": 3.047954797744751 }, { "auxiliary_loss_clip": 0.01037233, "auxiliary_loss_mlp": 0.01004719, "balance_loss_clip": 1.02873898, "balance_loss_mlp": 1.00334251, "epoch": 0.3927852096798437, "flos": 68909741890560.0, "grad_norm": 0.7803139799058858, "language_loss": 0.55459583, "learning_rate": 2.771075272396981e-06, "loss": 0.57501537, "num_input_tokens_seen": 140360420, "step": 6533, "time_per_iteration": 3.306561231613159 }, { "auxiliary_loss_clip": 0.01112553, "auxiliary_loss_mlp": 0.01040733, "balance_loss_clip": 1.04983759, "balance_loss_mlp": 1.02614141, "epoch": 0.39284533293251167, "flos": 29715972232320.0, "grad_norm": 2.2467248181922232, "language_loss": 0.75955313, "learning_rate": 2.7707159045587284e-06, "loss": 0.78108597, "num_input_tokens_seen": 140381950, "step": 6534, "time_per_iteration": 2.7788329124450684 }, { "auxiliary_loss_clip": 0.0112134, "auxiliary_loss_mlp": 0.01045716, "balance_loss_clip": 1.04698312, "balance_loss_mlp": 1.02866912, "epoch": 0.39290545618517964, "flos": 18552027594240.0, "grad_norm": 2.2080736338994944, "language_loss": 0.78123498, "learning_rate": 2.770356507494851e-06, "loss": 0.80290556, "num_input_tokens_seen": 140399410, "step": 6535, "time_per_iteration": 2.6949005126953125 }, { "auxiliary_loss_clip": 0.0109337, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.04779291, "balance_loss_mlp": 1.01950169, "epoch": 0.3929655794378476, "flos": 26249479413120.0, "grad_norm": 1.9769476518607105, "language_loss": 0.686719, "learning_rate": 2.769997081218978e-06, "loss": 0.7079792, "num_input_tokens_seen": 140419055, "step": 6536, "time_per_iteration": 2.7684245109558105 }, { "auxiliary_loss_clip": 0.01104946, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.04767156, "balance_loss_mlp": 1.02469027, "epoch": 0.39302570269051557, "flos": 29277933874560.0, "grad_norm": 1.8012856746153256, "language_loss": 0.69048655, "learning_rate": 2.769637625744738e-06, "loss": 0.71191454, "num_input_tokens_seen": 140438800, "step": 6537, "time_per_iteration": 2.7638440132141113 }, { "auxiliary_loss_clip": 0.01122897, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.05155134, "balance_loss_mlp": 1.02624357, "epoch": 0.39308582594318353, "flos": 17347440067200.0, "grad_norm": 1.7514361880438423, "language_loss": 0.78990901, "learning_rate": 2.769278141085763e-06, "loss": 0.81154549, "num_input_tokens_seen": 140456880, "step": 6538, "time_per_iteration": 2.635075807571411 }, { "auxiliary_loss_clip": 0.01003397, "auxiliary_loss_mlp": 0.01017351, "balance_loss_clip": 1.02259159, "balance_loss_mlp": 1.01596797, "epoch": 0.3931459491958515, "flos": 61007094650880.0, "grad_norm": 0.8098068956453415, "language_loss": 0.6190061, "learning_rate": 2.768918627255683e-06, "loss": 0.63921356, "num_input_tokens_seen": 140507510, "step": 6539, "time_per_iteration": 3.0673203468322754 }, { "auxiliary_loss_clip": 0.01104217, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.04730296, "balance_loss_mlp": 1.0206002, "epoch": 0.39320607244851946, "flos": 39016009249920.0, "grad_norm": 3.0347619755245248, "language_loss": 0.68405032, "learning_rate": 2.7685590842681315e-06, "loss": 0.70544618, "num_input_tokens_seen": 140528740, "step": 6540, "time_per_iteration": 2.7993643283843994 }, { "auxiliary_loss_clip": 0.01105128, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.04439306, "balance_loss_mlp": 1.01638293, "epoch": 0.3932661957011874, "flos": 24679752180480.0, "grad_norm": 1.8325322608278536, "language_loss": 0.7276125, "learning_rate": 2.7681995121367433e-06, "loss": 0.74897116, "num_input_tokens_seen": 140547560, "step": 6541, "time_per_iteration": 2.659224510192871 }, { "auxiliary_loss_clip": 0.01054751, "auxiliary_loss_mlp": 0.01009472, "balance_loss_clip": 1.02648139, "balance_loss_mlp": 1.0080775, "epoch": 0.3933263189538554, "flos": 70096552185600.0, "grad_norm": 0.8313029932067456, "language_loss": 0.60319722, "learning_rate": 2.7678399108751516e-06, "loss": 0.6238395, "num_input_tokens_seen": 140601175, "step": 6542, "time_per_iteration": 2.968062400817871 }, { "auxiliary_loss_clip": 0.01121623, "auxiliary_loss_mlp": 0.01038302, "balance_loss_clip": 1.04764903, "balance_loss_mlp": 1.0243547, "epoch": 0.39338644220652336, "flos": 22929071207040.0, "grad_norm": 1.6209695943494522, "language_loss": 0.82034504, "learning_rate": 2.7674802804969947e-06, "loss": 0.84194422, "num_input_tokens_seen": 140622200, "step": 6543, "time_per_iteration": 2.638796806335449 }, { "auxiliary_loss_clip": 0.01103923, "auxiliary_loss_mlp": 0.01034902, "balance_loss_clip": 1.04355097, "balance_loss_mlp": 1.02045417, "epoch": 0.3934465654591913, "flos": 30848163897600.0, "grad_norm": 3.743075543188527, "language_loss": 0.69100285, "learning_rate": 2.767120621015908e-06, "loss": 0.71239114, "num_input_tokens_seen": 140643125, "step": 6544, "time_per_iteration": 2.7180936336517334 }, { "auxiliary_loss_clip": 0.01112442, "auxiliary_loss_mlp": 0.01047198, "balance_loss_clip": 1.04659534, "balance_loss_mlp": 1.0316174, "epoch": 0.3935066887118593, "flos": 29236528471680.0, "grad_norm": 2.0996268311737976, "language_loss": 0.76072371, "learning_rate": 2.76676093244553e-06, "loss": 0.78232014, "num_input_tokens_seen": 140662500, "step": 6545, "time_per_iteration": 2.7429869174957275 }, { "auxiliary_loss_clip": 0.01091051, "auxiliary_loss_mlp": 0.01033724, "balance_loss_clip": 1.04633403, "balance_loss_mlp": 1.02104044, "epoch": 0.3935668119645273, "flos": 19135288638720.0, "grad_norm": 1.7673371756448844, "language_loss": 0.74672133, "learning_rate": 2.7664012147995015e-06, "loss": 0.76796907, "num_input_tokens_seen": 140681960, "step": 6546, "time_per_iteration": 2.6785295009613037 }, { "auxiliary_loss_clip": 0.01109428, "auxiliary_loss_mlp": 0.0103425, "balance_loss_clip": 1.04903293, "balance_loss_mlp": 1.01946843, "epoch": 0.3936269352171953, "flos": 18516116972160.0, "grad_norm": 1.9230817449169166, "language_loss": 0.81627518, "learning_rate": 2.7660414680914617e-06, "loss": 0.83771199, "num_input_tokens_seen": 140699170, "step": 6547, "time_per_iteration": 2.638214588165283 }, { "auxiliary_loss_clip": 0.01114598, "auxiliary_loss_mlp": 0.00772919, "balance_loss_clip": 1.04404151, "balance_loss_mlp": 1.00032711, "epoch": 0.39368705846986324, "flos": 15632813370240.0, "grad_norm": 1.9821442562566327, "language_loss": 0.84406352, "learning_rate": 2.7656816923350525e-06, "loss": 0.86293864, "num_input_tokens_seen": 140714920, "step": 6548, "time_per_iteration": 2.6490747928619385 }, { "auxiliary_loss_clip": 0.01118074, "auxiliary_loss_mlp": 0.00771091, "balance_loss_clip": 1.04686236, "balance_loss_mlp": 1.00034189, "epoch": 0.3937471817225312, "flos": 21325839563520.0, "grad_norm": 1.7617733187332765, "language_loss": 0.7311933, "learning_rate": 2.7653218875439174e-06, "loss": 0.75008494, "num_input_tokens_seen": 140734595, "step": 6549, "time_per_iteration": 2.635380983352661 }, { "auxiliary_loss_clip": 0.01071621, "auxiliary_loss_mlp": 0.01042928, "balance_loss_clip": 1.0444963, "balance_loss_mlp": 1.0259527, "epoch": 0.39380730497519917, "flos": 20776693461120.0, "grad_norm": 2.774519883144605, "language_loss": 0.77592897, "learning_rate": 2.764962053731699e-06, "loss": 0.7970745, "num_input_tokens_seen": 140754050, "step": 6550, "time_per_iteration": 2.733921527862549 }, { "auxiliary_loss_clip": 0.01095205, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.04455531, "balance_loss_mlp": 1.01674485, "epoch": 0.39386742822786713, "flos": 21609784575360.0, "grad_norm": 3.1837220930493517, "language_loss": 0.81144142, "learning_rate": 2.7646021909120434e-06, "loss": 0.83270073, "num_input_tokens_seen": 140771440, "step": 6551, "time_per_iteration": 2.851475238800049 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01036299, "balance_loss_clip": 1.0443331, "balance_loss_mlp": 1.02188659, "epoch": 0.3939275514805351, "flos": 12414642249600.0, "grad_norm": 12.177431380433415, "language_loss": 0.80449802, "learning_rate": 2.764242299098596e-06, "loss": 0.82603723, "num_input_tokens_seen": 140786715, "step": 6552, "time_per_iteration": 2.667344570159912 }, { "auxiliary_loss_clip": 0.01133223, "auxiliary_loss_mlp": 0.01043273, "balance_loss_clip": 1.04791522, "balance_loss_mlp": 1.02883697, "epoch": 0.39398767473320306, "flos": 18552027594240.0, "grad_norm": 2.002210962432939, "language_loss": 0.71199149, "learning_rate": 2.763882378305003e-06, "loss": 0.73375642, "num_input_tokens_seen": 140804950, "step": 6553, "time_per_iteration": 2.6329705715179443 }, { "auxiliary_loss_clip": 0.0111827, "auxiliary_loss_mlp": 0.0077145, "balance_loss_clip": 1.04818738, "balance_loss_mlp": 1.00036502, "epoch": 0.39404779798587103, "flos": 29308888419840.0, "grad_norm": 4.200797737547303, "language_loss": 0.64058566, "learning_rate": 2.7635224285449144e-06, "loss": 0.65948284, "num_input_tokens_seen": 140822800, "step": 6554, "time_per_iteration": 2.7190303802490234 }, { "auxiliary_loss_clip": 0.01109713, "auxiliary_loss_mlp": 0.01041117, "balance_loss_clip": 1.04655266, "balance_loss_mlp": 1.02747416, "epoch": 0.394107921238539, "flos": 34897055834880.0, "grad_norm": 2.186636266316066, "language_loss": 0.78957009, "learning_rate": 2.7631624498319796e-06, "loss": 0.81107843, "num_input_tokens_seen": 140842940, "step": 6555, "time_per_iteration": 2.7675819396972656 }, { "auxiliary_loss_clip": 0.01102424, "auxiliary_loss_mlp": 0.0104302, "balance_loss_clip": 1.04469514, "balance_loss_mlp": 1.02758873, "epoch": 0.39416804449120696, "flos": 25081413039360.0, "grad_norm": 1.7945119387028163, "language_loss": 0.71689165, "learning_rate": 2.7628024421798473e-06, "loss": 0.7383461, "num_input_tokens_seen": 140863060, "step": 6556, "time_per_iteration": 4.261122703552246 }, { "auxiliary_loss_clip": 0.01129248, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.0445503, "balance_loss_mlp": 1.01749015, "epoch": 0.3942281677438749, "flos": 32306639731200.0, "grad_norm": 1.7970895618407805, "language_loss": 0.84080362, "learning_rate": 2.7624424056021705e-06, "loss": 0.86241317, "num_input_tokens_seen": 140883795, "step": 6557, "time_per_iteration": 2.7031610012054443 }, { "auxiliary_loss_clip": 0.01116561, "auxiliary_loss_mlp": 0.01032116, "balance_loss_clip": 1.04790783, "balance_loss_mlp": 1.01810956, "epoch": 0.3942882909965429, "flos": 24936621315840.0, "grad_norm": 3.8501140650976238, "language_loss": 0.806759, "learning_rate": 2.7620823401126004e-06, "loss": 0.82824582, "num_input_tokens_seen": 140903055, "step": 6558, "time_per_iteration": 5.6523637771606445 }, { "auxiliary_loss_clip": 0.01130051, "auxiliary_loss_mlp": 0.01035884, "balance_loss_clip": 1.04807055, "balance_loss_mlp": 1.02238965, "epoch": 0.39434841424921085, "flos": 11874797769600.0, "grad_norm": 1.8974962376031472, "language_loss": 0.70930403, "learning_rate": 2.761722245724792e-06, "loss": 0.73096335, "num_input_tokens_seen": 140920685, "step": 6559, "time_per_iteration": 2.6645302772521973 }, { "auxiliary_loss_clip": 0.01113668, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.04660964, "balance_loss_mlp": 1.02452326, "epoch": 0.3944085375018789, "flos": 16361620323840.0, "grad_norm": 2.3002241644217865, "language_loss": 0.80355662, "learning_rate": 2.7613621224524003e-06, "loss": 0.82510054, "num_input_tokens_seen": 140937320, "step": 6560, "time_per_iteration": 2.8372745513916016 }, { "auxiliary_loss_clip": 0.01109469, "auxiliary_loss_mlp": 0.0103941, "balance_loss_clip": 1.04681468, "balance_loss_mlp": 1.02334619, "epoch": 0.39446866075454684, "flos": 10633365866880.0, "grad_norm": 2.2192317359233034, "language_loss": 0.828062, "learning_rate": 2.7610019703090803e-06, "loss": 0.84955078, "num_input_tokens_seen": 140954855, "step": 6561, "time_per_iteration": 2.6724014282226562 }, { "auxiliary_loss_clip": 0.01119263, "auxiliary_loss_mlp": 0.01043889, "balance_loss_clip": 1.04620779, "balance_loss_mlp": 1.02972126, "epoch": 0.3945287840072148, "flos": 18187498419840.0, "grad_norm": 2.478683034492453, "language_loss": 0.80985552, "learning_rate": 2.7606417893084887e-06, "loss": 0.83148706, "num_input_tokens_seen": 140973250, "step": 6562, "time_per_iteration": 4.211291074752808 }, { "auxiliary_loss_clip": 0.01100981, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.04367661, "balance_loss_mlp": 1.02568245, "epoch": 0.39458890725988277, "flos": 23039891642880.0, "grad_norm": 1.8396668644534004, "language_loss": 0.81574059, "learning_rate": 2.7602815794642853e-06, "loss": 0.83715415, "num_input_tokens_seen": 140993050, "step": 6563, "time_per_iteration": 2.6933205127716064 }, { "auxiliary_loss_clip": 0.01078578, "auxiliary_loss_mlp": 0.01052866, "balance_loss_clip": 1.03979552, "balance_loss_mlp": 1.03385234, "epoch": 0.39464903051255074, "flos": 17159052211200.0, "grad_norm": 2.4284687703059276, "language_loss": 0.69678622, "learning_rate": 2.759921340790127e-06, "loss": 0.71810067, "num_input_tokens_seen": 141010815, "step": 6564, "time_per_iteration": 2.7754619121551514 }, { "auxiliary_loss_clip": 0.01119553, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.04547322, "balance_loss_mlp": 1.02260029, "epoch": 0.3947091537652187, "flos": 15889000147200.0, "grad_norm": 2.342409184231709, "language_loss": 0.82842124, "learning_rate": 2.759561073299676e-06, "loss": 0.84999526, "num_input_tokens_seen": 141028720, "step": 6565, "time_per_iteration": 2.652029037475586 }, { "auxiliary_loss_clip": 0.01091527, "auxiliary_loss_mlp": 0.01044097, "balance_loss_clip": 1.04201448, "balance_loss_mlp": 1.02794445, "epoch": 0.39476927701788667, "flos": 18545491319040.0, "grad_norm": 1.8313066364371182, "language_loss": 0.83458865, "learning_rate": 2.7592007770065937e-06, "loss": 0.85594487, "num_input_tokens_seen": 141046025, "step": 6566, "time_per_iteration": 2.6853299140930176 }, { "auxiliary_loss_clip": 0.01137834, "auxiliary_loss_mlp": 0.01036947, "balance_loss_clip": 1.04882693, "balance_loss_mlp": 1.02146816, "epoch": 0.39482940027055463, "flos": 22275712771200.0, "grad_norm": 2.7953182854439973, "language_loss": 0.77462149, "learning_rate": 2.7588404519245403e-06, "loss": 0.79636931, "num_input_tokens_seen": 141066865, "step": 6567, "time_per_iteration": 2.6695878505706787 }, { "auxiliary_loss_clip": 0.01114738, "auxiliary_loss_mlp": 0.01037774, "balance_loss_clip": 1.04457474, "balance_loss_mlp": 1.0235877, "epoch": 0.3948895235232226, "flos": 14757634494720.0, "grad_norm": 3.2000391748281065, "language_loss": 0.80752146, "learning_rate": 2.758480098067182e-06, "loss": 0.82904655, "num_input_tokens_seen": 141084210, "step": 6568, "time_per_iteration": 2.6126980781555176 }, { "auxiliary_loss_clip": 0.01100656, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.04693437, "balance_loss_mlp": 1.02142143, "epoch": 0.39494964677589056, "flos": 22565763095040.0, "grad_norm": 3.155903507973262, "language_loss": 0.846977, "learning_rate": 2.7581197154481816e-06, "loss": 0.868343, "num_input_tokens_seen": 141103895, "step": 6569, "time_per_iteration": 2.731241464614868 }, { "auxiliary_loss_clip": 0.01076285, "auxiliary_loss_mlp": 0.01045444, "balance_loss_clip": 1.046417, "balance_loss_mlp": 1.03076911, "epoch": 0.3950097700285585, "flos": 22963186149120.0, "grad_norm": 2.966787083651573, "language_loss": 0.74931526, "learning_rate": 2.7577593040812066e-06, "loss": 0.77053261, "num_input_tokens_seen": 141124000, "step": 6570, "time_per_iteration": 2.816168785095215 }, { "auxiliary_loss_clip": 0.01093382, "auxiliary_loss_mlp": 0.01037489, "balance_loss_clip": 1.04271865, "balance_loss_mlp": 1.02224803, "epoch": 0.3950698932812265, "flos": 20595236929920.0, "grad_norm": 3.807643490882315, "language_loss": 0.80009687, "learning_rate": 2.757398863979922e-06, "loss": 0.82140559, "num_input_tokens_seen": 141142535, "step": 6571, "time_per_iteration": 2.7444143295288086 }, { "auxiliary_loss_clip": 0.0110309, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.046592, "balance_loss_mlp": 1.02792382, "epoch": 0.39513001653389446, "flos": 20375786787840.0, "grad_norm": 2.0513494110156105, "language_loss": 0.77667749, "learning_rate": 2.757038395157997e-06, "loss": 0.79813272, "num_input_tokens_seen": 141161575, "step": 6572, "time_per_iteration": 2.787951946258545 }, { "auxiliary_loss_clip": 0.01096298, "auxiliary_loss_mlp": 0.01039178, "balance_loss_clip": 1.04524946, "balance_loss_mlp": 1.02422285, "epoch": 0.3951901397865625, "flos": 26463650256000.0, "grad_norm": 2.2233910711840092, "language_loss": 0.74710405, "learning_rate": 2.7566778976291002e-06, "loss": 0.76845872, "num_input_tokens_seen": 141181150, "step": 6573, "time_per_iteration": 2.8065271377563477 }, { "auxiliary_loss_clip": 0.01119667, "auxiliary_loss_mlp": 0.01033875, "balance_loss_clip": 1.04583275, "balance_loss_mlp": 1.02073228, "epoch": 0.39525026303923044, "flos": 43838345767680.0, "grad_norm": 1.5702623893020402, "language_loss": 0.681665, "learning_rate": 2.7563173714069017e-06, "loss": 0.7032004, "num_input_tokens_seen": 141206310, "step": 6574, "time_per_iteration": 2.917938470840454 }, { "auxiliary_loss_clip": 0.01066027, "auxiliary_loss_mlp": 0.01046829, "balance_loss_clip": 1.03601551, "balance_loss_mlp": 1.02941298, "epoch": 0.3953103862918984, "flos": 18040803275520.0, "grad_norm": 11.51359836007049, "language_loss": 0.71934754, "learning_rate": 2.755956816505072e-06, "loss": 0.74047613, "num_input_tokens_seen": 141223925, "step": 6575, "time_per_iteration": 2.8125574588775635 }, { "auxiliary_loss_clip": 0.01106625, "auxiliary_loss_mlp": 0.01044084, "balance_loss_clip": 1.04328454, "balance_loss_mlp": 1.02871156, "epoch": 0.3953705095445664, "flos": 16976015481600.0, "grad_norm": 2.3082458130711276, "language_loss": 0.73497486, "learning_rate": 2.7555962329372845e-06, "loss": 0.75648189, "num_input_tokens_seen": 141239010, "step": 6576, "time_per_iteration": 2.7072994709014893 }, { "auxiliary_loss_clip": 0.01131853, "auxiliary_loss_mlp": 0.01038072, "balance_loss_clip": 1.04721868, "balance_loss_mlp": 1.02482772, "epoch": 0.39543063279723434, "flos": 17411144837760.0, "grad_norm": 2.584581612312142, "language_loss": 0.83806884, "learning_rate": 2.7552356207172124e-06, "loss": 0.85976809, "num_input_tokens_seen": 141252255, "step": 6577, "time_per_iteration": 2.673980236053467 }, { "auxiliary_loss_clip": 0.01108115, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.04473734, "balance_loss_mlp": 1.02394366, "epoch": 0.3954907560499023, "flos": 22784207656320.0, "grad_norm": 3.282604232183532, "language_loss": 0.90597945, "learning_rate": 2.75487497985853e-06, "loss": 0.92744309, "num_input_tokens_seen": 141269325, "step": 6578, "time_per_iteration": 2.8357715606689453 }, { "auxiliary_loss_clip": 0.01113431, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.04971015, "balance_loss_mlp": 1.0215559, "epoch": 0.39555087930257027, "flos": 21944400698880.0, "grad_norm": 1.9328811386040925, "language_loss": 0.77836883, "learning_rate": 2.7545143103749117e-06, "loss": 0.7998836, "num_input_tokens_seen": 141288505, "step": 6579, "time_per_iteration": 2.78900146484375 }, { "auxiliary_loss_clip": 0.01080071, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.04296517, "balance_loss_mlp": 1.02181292, "epoch": 0.39561100255523823, "flos": 20404622430720.0, "grad_norm": 2.0515288557813705, "language_loss": 0.68375254, "learning_rate": 2.754153612280037e-06, "loss": 0.70492923, "num_input_tokens_seen": 141303680, "step": 6580, "time_per_iteration": 2.796602249145508 }, { "auxiliary_loss_clip": 0.01119101, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.04687381, "balance_loss_mlp": 1.01770234, "epoch": 0.3956711258079062, "flos": 27964572986880.0, "grad_norm": 5.6192422063497425, "language_loss": 0.58592093, "learning_rate": 2.7537928855875797e-06, "loss": 0.60742974, "num_input_tokens_seen": 141324090, "step": 6581, "time_per_iteration": 2.738732099533081 }, { "auxiliary_loss_clip": 0.0110807, "auxiliary_loss_mlp": 0.01047889, "balance_loss_clip": 1.04554892, "balance_loss_mlp": 1.03111625, "epoch": 0.39573124906057416, "flos": 14428297670400.0, "grad_norm": 1.840388254222325, "language_loss": 0.69687581, "learning_rate": 2.7534321303112224e-06, "loss": 0.71843535, "num_input_tokens_seen": 141342235, "step": 6582, "time_per_iteration": 2.74564790725708 }, { "auxiliary_loss_clip": 0.01132763, "auxiliary_loss_mlp": 0.0077198, "balance_loss_clip": 1.04670966, "balance_loss_mlp": 1.00066948, "epoch": 0.39579137231324213, "flos": 18733699607040.0, "grad_norm": 2.093309053458098, "language_loss": 0.76838243, "learning_rate": 2.753071346464642e-06, "loss": 0.78742981, "num_input_tokens_seen": 141361195, "step": 6583, "time_per_iteration": 2.6127665042877197 }, { "auxiliary_loss_clip": 0.01084294, "auxiliary_loss_mlp": 0.00772199, "balance_loss_clip": 1.04135418, "balance_loss_mlp": 1.00058353, "epoch": 0.3958514955659101, "flos": 17676417755520.0, "grad_norm": 2.422087879109688, "language_loss": 0.66005278, "learning_rate": 2.7527105340615207e-06, "loss": 0.67861772, "num_input_tokens_seen": 141378275, "step": 6584, "time_per_iteration": 2.8412790298461914 }, { "auxiliary_loss_clip": 0.0109769, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.04634333, "balance_loss_mlp": 1.02687716, "epoch": 0.39591161881857806, "flos": 29309103901440.0, "grad_norm": 7.452692779947077, "language_loss": 0.72775561, "learning_rate": 2.7523496931155413e-06, "loss": 0.74916053, "num_input_tokens_seen": 141396960, "step": 6585, "time_per_iteration": 2.8504436016082764 }, { "auxiliary_loss_clip": 0.0109915, "auxiliary_loss_mlp": 0.01041099, "balance_loss_clip": 1.04335117, "balance_loss_mlp": 1.02628136, "epoch": 0.3959717420712461, "flos": 25771831332480.0, "grad_norm": 1.8603715362450812, "language_loss": 0.73381901, "learning_rate": 2.7519888236403856e-06, "loss": 0.75522149, "num_input_tokens_seen": 141417320, "step": 6586, "time_per_iteration": 2.8426311016082764 }, { "auxiliary_loss_clip": 0.01101854, "auxiliary_loss_mlp": 0.0103792, "balance_loss_clip": 1.04255629, "balance_loss_mlp": 1.02266693, "epoch": 0.39603186532391405, "flos": 20923783655040.0, "grad_norm": 2.174728382433504, "language_loss": 0.71447468, "learning_rate": 2.7516279256497382e-06, "loss": 0.73587245, "num_input_tokens_seen": 141435985, "step": 6587, "time_per_iteration": 2.7798478603363037 }, { "auxiliary_loss_clip": 0.01007869, "auxiliary_loss_mlp": 0.01003214, "balance_loss_clip": 1.02249742, "balance_loss_mlp": 1.00195026, "epoch": 0.396091988576582, "flos": 54880986176640.0, "grad_norm": 0.9478406102040471, "language_loss": 0.61186492, "learning_rate": 2.751266999157285e-06, "loss": 0.63197577, "num_input_tokens_seen": 141486075, "step": 6588, "time_per_iteration": 3.1663742065429688 }, { "auxiliary_loss_clip": 0.0110963, "auxiliary_loss_mlp": 0.00772247, "balance_loss_clip": 1.04547548, "balance_loss_mlp": 1.0006907, "epoch": 0.39615211182925, "flos": 20702896968960.0, "grad_norm": 3.1004380492305206, "language_loss": 0.81686854, "learning_rate": 2.7509060441767115e-06, "loss": 0.8356874, "num_input_tokens_seen": 141505280, "step": 6589, "time_per_iteration": 2.7711055278778076 }, { "auxiliary_loss_clip": 0.01106228, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.04562962, "balance_loss_mlp": 1.02241325, "epoch": 0.39621223508191794, "flos": 20994312009600.0, "grad_norm": 2.2429889858322802, "language_loss": 0.69913912, "learning_rate": 2.7505450607217057e-06, "loss": 0.72058284, "num_input_tokens_seen": 141523930, "step": 6590, "time_per_iteration": 2.793330669403076 }, { "auxiliary_loss_clip": 0.01117633, "auxiliary_loss_mlp": 0.01056421, "balance_loss_clip": 1.04669666, "balance_loss_mlp": 1.03980339, "epoch": 0.3962723583345859, "flos": 23368833417600.0, "grad_norm": 1.772211549409949, "language_loss": 0.75809395, "learning_rate": 2.750184048805956e-06, "loss": 0.77983451, "num_input_tokens_seen": 141541320, "step": 6591, "time_per_iteration": 2.7317981719970703 }, { "auxiliary_loss_clip": 0.01043506, "auxiliary_loss_mlp": 0.01049181, "balance_loss_clip": 1.03802264, "balance_loss_mlp": 1.03364813, "epoch": 0.39633248158725387, "flos": 25115599808640.0, "grad_norm": 2.064980952243903, "language_loss": 0.78466719, "learning_rate": 2.749823008443152e-06, "loss": 0.80559409, "num_input_tokens_seen": 141561880, "step": 6592, "time_per_iteration": 3.192194700241089 }, { "auxiliary_loss_clip": 0.01059924, "auxiliary_loss_mlp": 0.01033868, "balance_loss_clip": 1.03984666, "balance_loss_mlp": 1.01872826, "epoch": 0.39639260483992184, "flos": 39787622236800.0, "grad_norm": 1.9568402514544967, "language_loss": 0.69690341, "learning_rate": 2.7494619396469843e-06, "loss": 0.71784127, "num_input_tokens_seen": 141586460, "step": 6593, "time_per_iteration": 3.365752696990967 }, { "auxiliary_loss_clip": 0.01059564, "auxiliary_loss_mlp": 0.01046377, "balance_loss_clip": 1.03668404, "balance_loss_mlp": 1.03035569, "epoch": 0.3964527280925898, "flos": 17347045017600.0, "grad_norm": 1.624713370756075, "language_loss": 0.77905881, "learning_rate": 2.7491008424311452e-06, "loss": 0.80011821, "num_input_tokens_seen": 141605955, "step": 6594, "time_per_iteration": 2.890626907348633 }, { "auxiliary_loss_clip": 0.01025812, "auxiliary_loss_mlp": 0.01003509, "balance_loss_clip": 1.02550435, "balance_loss_mlp": 1.00200129, "epoch": 0.39651285134525777, "flos": 71717848369920.0, "grad_norm": 0.9363100872746896, "language_loss": 0.6304667, "learning_rate": 2.7487397168093265e-06, "loss": 0.65075988, "num_input_tokens_seen": 141673140, "step": 6595, "time_per_iteration": 3.3955094814300537 }, { "auxiliary_loss_clip": 0.01096586, "auxiliary_loss_mlp": 0.01055368, "balance_loss_clip": 1.0442034, "balance_loss_mlp": 1.03774858, "epoch": 0.39657297459792573, "flos": 25775710001280.0, "grad_norm": 2.5609780352809732, "language_loss": 0.63787287, "learning_rate": 2.748378562795223e-06, "loss": 0.65939242, "num_input_tokens_seen": 141692955, "step": 6596, "time_per_iteration": 4.60092568397522 }, { "auxiliary_loss_clip": 0.01120147, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04657853, "balance_loss_mlp": 1.02747798, "epoch": 0.3966330978505937, "flos": 20266115587200.0, "grad_norm": 2.0315739566024567, "language_loss": 0.79006839, "learning_rate": 2.7480173804025293e-06, "loss": 0.81169188, "num_input_tokens_seen": 141710680, "step": 6597, "time_per_iteration": 5.807824373245239 }, { "auxiliary_loss_clip": 0.01099639, "auxiliary_loss_mlp": 0.00773402, "balance_loss_clip": 1.04352951, "balance_loss_mlp": 1.00076032, "epoch": 0.39669322110326166, "flos": 20631183465600.0, "grad_norm": 2.966898609781474, "language_loss": 0.6772182, "learning_rate": 2.747656169644941e-06, "loss": 0.69594866, "num_input_tokens_seen": 141729860, "step": 6598, "time_per_iteration": 2.786884307861328 }, { "auxiliary_loss_clip": 0.01129462, "auxiliary_loss_mlp": 0.01041455, "balance_loss_clip": 1.04473436, "balance_loss_mlp": 1.02785325, "epoch": 0.3967533443559297, "flos": 21726063878400.0, "grad_norm": 2.1804433985902247, "language_loss": 0.79342777, "learning_rate": 2.747294930536157e-06, "loss": 0.81513697, "num_input_tokens_seen": 141749060, "step": 6599, "time_per_iteration": 2.6758370399475098 }, { "auxiliary_loss_clip": 0.01091573, "auxiliary_loss_mlp": 0.01041619, "balance_loss_clip": 1.04314208, "balance_loss_mlp": 1.02487051, "epoch": 0.39681346760859765, "flos": 25484151306240.0, "grad_norm": 2.279505844275463, "language_loss": 0.72878486, "learning_rate": 2.7469336630898737e-06, "loss": 0.75011677, "num_input_tokens_seen": 141769860, "step": 6600, "time_per_iteration": 2.7616889476776123 }, { "auxiliary_loss_clip": 0.01083152, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.03626251, "balance_loss_mlp": 1.0220201, "epoch": 0.3968735908612656, "flos": 20959586536320.0, "grad_norm": 2.0515710245603938, "language_loss": 0.85973942, "learning_rate": 2.746572367319791e-06, "loss": 0.88094509, "num_input_tokens_seen": 141788465, "step": 6601, "time_per_iteration": 2.755791664123535 }, { "auxiliary_loss_clip": 0.01095713, "auxiliary_loss_mlp": 0.01041964, "balance_loss_clip": 1.0429877, "balance_loss_mlp": 1.02468467, "epoch": 0.3969337141139336, "flos": 10707090531840.0, "grad_norm": 2.240549855963289, "language_loss": 0.70372766, "learning_rate": 2.7462110432396095e-06, "loss": 0.72510445, "num_input_tokens_seen": 141804955, "step": 6602, "time_per_iteration": 4.643726348876953 }, { "auxiliary_loss_clip": 0.01133428, "auxiliary_loss_mlp": 0.01047809, "balance_loss_clip": 1.04658508, "balance_loss_mlp": 1.03230548, "epoch": 0.39699383736660154, "flos": 17593714690560.0, "grad_norm": 3.7711392584572896, "language_loss": 0.83248609, "learning_rate": 2.7458496908630305e-06, "loss": 0.85429847, "num_input_tokens_seen": 141820025, "step": 6603, "time_per_iteration": 2.8909716606140137 }, { "auxiliary_loss_clip": 0.01112282, "auxiliary_loss_mlp": 0.01034339, "balance_loss_clip": 1.04651403, "balance_loss_mlp": 1.02003431, "epoch": 0.3970539606192695, "flos": 17785945301760.0, "grad_norm": 1.9508498227264648, "language_loss": 0.73302728, "learning_rate": 2.7454883102037563e-06, "loss": 0.75449347, "num_input_tokens_seen": 141838735, "step": 6604, "time_per_iteration": 2.828908920288086 }, { "auxiliary_loss_clip": 0.01105132, "auxiliary_loss_mlp": 0.01038476, "balance_loss_clip": 1.04384422, "balance_loss_mlp": 1.02364659, "epoch": 0.3971140838719375, "flos": 24789495208320.0, "grad_norm": 1.769953580879433, "language_loss": 0.82582277, "learning_rate": 2.745126901275491e-06, "loss": 0.84725887, "num_input_tokens_seen": 141858090, "step": 6605, "time_per_iteration": 2.6773502826690674 }, { "auxiliary_loss_clip": 0.01128613, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.04549098, "balance_loss_mlp": 1.01968801, "epoch": 0.39717420712460544, "flos": 24243581329920.0, "grad_norm": 1.4871941413504006, "language_loss": 0.73511499, "learning_rate": 2.7447654640919383e-06, "loss": 0.75673246, "num_input_tokens_seen": 141877540, "step": 6606, "time_per_iteration": 2.632805347442627 }, { "auxiliary_loss_clip": 0.01089285, "auxiliary_loss_mlp": 0.01048599, "balance_loss_clip": 1.0436089, "balance_loss_mlp": 1.03198171, "epoch": 0.3972343303772734, "flos": 25884698843520.0, "grad_norm": 2.092571399644939, "language_loss": 0.74296981, "learning_rate": 2.744403998666805e-06, "loss": 0.76434863, "num_input_tokens_seen": 141897315, "step": 6607, "time_per_iteration": 2.7277770042419434 }, { "auxiliary_loss_clip": 0.01124169, "auxiliary_loss_mlp": 0.01037393, "balance_loss_clip": 1.04697132, "balance_loss_mlp": 1.02267027, "epoch": 0.39729445362994137, "flos": 45623716300800.0, "grad_norm": 1.5196847129379933, "language_loss": 0.6787042, "learning_rate": 2.744042505013797e-06, "loss": 0.70031989, "num_input_tokens_seen": 141919580, "step": 6608, "time_per_iteration": 2.8229119777679443 }, { "auxiliary_loss_clip": 0.01094928, "auxiliary_loss_mlp": 0.01054175, "balance_loss_clip": 1.04091311, "balance_loss_mlp": 1.03580451, "epoch": 0.39735457688260933, "flos": 20193971120640.0, "grad_norm": 7.314681050409252, "language_loss": 0.74670005, "learning_rate": 2.7436809831466233e-06, "loss": 0.7681911, "num_input_tokens_seen": 141937045, "step": 6609, "time_per_iteration": 2.7502245903015137 }, { "auxiliary_loss_clip": 0.01107217, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.04354, "balance_loss_mlp": 1.02056026, "epoch": 0.3974147001352773, "flos": 23331163029120.0, "grad_norm": 1.742454501323656, "language_loss": 0.713238, "learning_rate": 2.7433194330789927e-06, "loss": 0.73467076, "num_input_tokens_seen": 141956695, "step": 6610, "time_per_iteration": 2.7225286960601807 }, { "auxiliary_loss_clip": 0.01105851, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.03818822, "balance_loss_mlp": 1.01509547, "epoch": 0.39747482338794526, "flos": 21688644885120.0, "grad_norm": 1.7960063460415152, "language_loss": 0.78151029, "learning_rate": 2.7429578548246133e-06, "loss": 0.8028695, "num_input_tokens_seen": 141975935, "step": 6611, "time_per_iteration": 2.6464622020721436 }, { "auxiliary_loss_clip": 0.01121213, "auxiliary_loss_mlp": 0.01038273, "balance_loss_clip": 1.04614162, "balance_loss_mlp": 1.0235095, "epoch": 0.3975349466406133, "flos": 30988717816320.0, "grad_norm": 1.7937788130001704, "language_loss": 0.7921629, "learning_rate": 2.7425962483971985e-06, "loss": 0.81375778, "num_input_tokens_seen": 141995750, "step": 6612, "time_per_iteration": 2.734950304031372 }, { "auxiliary_loss_clip": 0.01018209, "auxiliary_loss_mlp": 0.0100828, "balance_loss_clip": 1.02113628, "balance_loss_mlp": 1.00702214, "epoch": 0.39759506989328125, "flos": 63683948833920.0, "grad_norm": 0.8423760762856193, "language_loss": 0.64935088, "learning_rate": 2.742234613810459e-06, "loss": 0.66961575, "num_input_tokens_seen": 142057655, "step": 6613, "time_per_iteration": 3.1294105052948 }, { "auxiliary_loss_clip": 0.01097901, "auxiliary_loss_mlp": 0.01042526, "balance_loss_clip": 1.03916883, "balance_loss_mlp": 1.02507401, "epoch": 0.3976551931459492, "flos": 23695835857920.0, "grad_norm": 3.0444472336295636, "language_loss": 0.71508956, "learning_rate": 2.741872951078109e-06, "loss": 0.73649383, "num_input_tokens_seen": 142076020, "step": 6614, "time_per_iteration": 2.6479976177215576 }, { "auxiliary_loss_clip": 0.01116106, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.04503131, "balance_loss_mlp": 1.02034712, "epoch": 0.3977153163986172, "flos": 15669657745920.0, "grad_norm": 2.2927333729520885, "language_loss": 0.81362098, "learning_rate": 2.741511260213862e-06, "loss": 0.83513486, "num_input_tokens_seen": 142093790, "step": 6615, "time_per_iteration": 2.6567723751068115 }, { "auxiliary_loss_clip": 0.01094954, "auxiliary_loss_mlp": 0.01034601, "balance_loss_clip": 1.04491544, "balance_loss_mlp": 1.02023649, "epoch": 0.39777543965128515, "flos": 14064702249600.0, "grad_norm": 2.01024859105405, "language_loss": 0.67510247, "learning_rate": 2.741149541231434e-06, "loss": 0.69639802, "num_input_tokens_seen": 142110545, "step": 6616, "time_per_iteration": 2.6675400733947754 }, { "auxiliary_loss_clip": 0.01133654, "auxiliary_loss_mlp": 0.01043633, "balance_loss_clip": 1.04658771, "balance_loss_mlp": 1.02765918, "epoch": 0.3978355629039531, "flos": 23367468700800.0, "grad_norm": 2.3086733420735785, "language_loss": 0.83678514, "learning_rate": 2.740787794144541e-06, "loss": 0.85855806, "num_input_tokens_seen": 142128695, "step": 6617, "time_per_iteration": 2.5879552364349365 }, { "auxiliary_loss_clip": 0.01126085, "auxiliary_loss_mlp": 0.01039432, "balance_loss_clip": 1.04570735, "balance_loss_mlp": 1.02563334, "epoch": 0.3978956861566211, "flos": 19062785036160.0, "grad_norm": 1.7795732635152253, "language_loss": 0.72766519, "learning_rate": 2.7404260189669e-06, "loss": 0.74932027, "num_input_tokens_seen": 142148375, "step": 6618, "time_per_iteration": 2.613162040710449 }, { "auxiliary_loss_clip": 0.01111951, "auxiliary_loss_mlp": 0.01041983, "balance_loss_clip": 1.04827428, "balance_loss_mlp": 1.02544832, "epoch": 0.39795580940928904, "flos": 30227699341440.0, "grad_norm": 1.6960793445061386, "language_loss": 0.65858316, "learning_rate": 2.740064215712231e-06, "loss": 0.68012249, "num_input_tokens_seen": 142169735, "step": 6619, "time_per_iteration": 2.7474000453948975 }, { "auxiliary_loss_clip": 0.01052495, "auxiliary_loss_mlp": 0.01004058, "balance_loss_clip": 1.0230546, "balance_loss_mlp": 1.00270545, "epoch": 0.398015932661957, "flos": 69847224906240.0, "grad_norm": 0.7704475067145287, "language_loss": 0.58246851, "learning_rate": 2.7397023843942527e-06, "loss": 0.60303402, "num_input_tokens_seen": 142229520, "step": 6620, "time_per_iteration": 3.1400091648101807 }, { "auxiliary_loss_clip": 0.01113547, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.04998422, "balance_loss_mlp": 1.02314794, "epoch": 0.39807605591462497, "flos": 20157773189760.0, "grad_norm": 1.821199996328267, "language_loss": 0.7925806, "learning_rate": 2.739340525026686e-06, "loss": 0.81408358, "num_input_tokens_seen": 142247660, "step": 6621, "time_per_iteration": 2.7389161586761475 }, { "auxiliary_loss_clip": 0.0110802, "auxiliary_loss_mlp": 0.01034956, "balance_loss_clip": 1.04590595, "balance_loss_mlp": 1.02088952, "epoch": 0.39813617916729294, "flos": 21141761339520.0, "grad_norm": 1.899291394170355, "language_loss": 0.77800381, "learning_rate": 2.738978637623252e-06, "loss": 0.79943347, "num_input_tokens_seen": 142266990, "step": 6622, "time_per_iteration": 2.7175779342651367 }, { "auxiliary_loss_clip": 0.01101638, "auxiliary_loss_mlp": 0.01038721, "balance_loss_clip": 1.04108417, "balance_loss_mlp": 1.02377844, "epoch": 0.3981963024199609, "flos": 18988485753600.0, "grad_norm": 1.6278701941081761, "language_loss": 0.7497921, "learning_rate": 2.738616722197674e-06, "loss": 0.77119565, "num_input_tokens_seen": 142287170, "step": 6623, "time_per_iteration": 2.682567596435547 }, { "auxiliary_loss_clip": 0.01088304, "auxiliary_loss_mlp": 0.01040759, "balance_loss_clip": 1.04280734, "balance_loss_mlp": 1.02590537, "epoch": 0.39825642567262887, "flos": 16575108808320.0, "grad_norm": 2.4968757127264465, "language_loss": 0.79733497, "learning_rate": 2.7382547787636766e-06, "loss": 0.81862563, "num_input_tokens_seen": 142305405, "step": 6624, "time_per_iteration": 2.6878697872161865 }, { "auxiliary_loss_clip": 0.01135858, "auxiliary_loss_mlp": 0.01043783, "balance_loss_clip": 1.04792297, "balance_loss_mlp": 1.0270462, "epoch": 0.39831654892529683, "flos": 22199833290240.0, "grad_norm": 2.0557211895564884, "language_loss": 0.83616954, "learning_rate": 2.7378928073349832e-06, "loss": 0.85796595, "num_input_tokens_seen": 142322710, "step": 6625, "time_per_iteration": 2.5847036838531494 }, { "auxiliary_loss_clip": 0.011152, "auxiliary_loss_mlp": 0.01044436, "balance_loss_clip": 1.04585958, "balance_loss_mlp": 1.02948713, "epoch": 0.39837667217796485, "flos": 10487963612160.0, "grad_norm": 2.4120237094780377, "language_loss": 0.87324822, "learning_rate": 2.737530807925321e-06, "loss": 0.89484465, "num_input_tokens_seen": 142338535, "step": 6626, "time_per_iteration": 2.5845320224761963 }, { "auxiliary_loss_clip": 0.01067442, "auxiliary_loss_mlp": 0.00775778, "balance_loss_clip": 1.03995085, "balance_loss_mlp": 1.00066137, "epoch": 0.3984367954306328, "flos": 17965282930560.0, "grad_norm": 2.3324132294494797, "language_loss": 0.83462882, "learning_rate": 2.737168780548417e-06, "loss": 0.85306096, "num_input_tokens_seen": 142354570, "step": 6627, "time_per_iteration": 2.854428291320801 }, { "auxiliary_loss_clip": 0.01087071, "auxiliary_loss_mlp": 0.00771611, "balance_loss_clip": 1.04081798, "balance_loss_mlp": 1.00056684, "epoch": 0.3984969186833008, "flos": 22711057608960.0, "grad_norm": 1.4575889504047923, "language_loss": 0.82904339, "learning_rate": 2.736806725217998e-06, "loss": 0.84763026, "num_input_tokens_seen": 142374395, "step": 6628, "time_per_iteration": 2.772620916366577 }, { "auxiliary_loss_clip": 0.01092039, "auxiliary_loss_mlp": 0.01062711, "balance_loss_clip": 1.04402328, "balance_loss_mlp": 1.04652882, "epoch": 0.39855704193596875, "flos": 23405785534080.0, "grad_norm": 1.6631347026103094, "language_loss": 0.71145642, "learning_rate": 2.7364446419477945e-06, "loss": 0.73300385, "num_input_tokens_seen": 142396040, "step": 6629, "time_per_iteration": 2.681969165802002 }, { "auxiliary_loss_clip": 0.01097676, "auxiliary_loss_mlp": 0.01035809, "balance_loss_clip": 1.04695797, "balance_loss_mlp": 1.02136111, "epoch": 0.3986171651886367, "flos": 21251935330560.0, "grad_norm": 1.757569665448266, "language_loss": 0.80513418, "learning_rate": 2.7360825307515366e-06, "loss": 0.82646906, "num_input_tokens_seen": 142415495, "step": 6630, "time_per_iteration": 2.7747275829315186 }, { "auxiliary_loss_clip": 0.01072778, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.04495096, "balance_loss_mlp": 1.01805389, "epoch": 0.3986772884413047, "flos": 12458705258880.0, "grad_norm": 2.3833222910170857, "language_loss": 0.74846494, "learning_rate": 2.7357203916429555e-06, "loss": 0.76951796, "num_input_tokens_seen": 142431865, "step": 6631, "time_per_iteration": 2.8098866939544678 }, { "auxiliary_loss_clip": 0.01095184, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.04248333, "balance_loss_mlp": 1.02500248, "epoch": 0.39873741169397264, "flos": 19646117907840.0, "grad_norm": 2.096728163981437, "language_loss": 0.7160908, "learning_rate": 2.735358224635783e-06, "loss": 0.73744667, "num_input_tokens_seen": 142450595, "step": 6632, "time_per_iteration": 2.81479811668396 }, { "auxiliary_loss_clip": 0.01063774, "auxiliary_loss_mlp": 0.00771132, "balance_loss_clip": 1.04164338, "balance_loss_mlp": 1.00057721, "epoch": 0.3987975349466406, "flos": 21684766216320.0, "grad_norm": 2.0680050346702945, "language_loss": 0.7479074, "learning_rate": 2.7349960297437533e-06, "loss": 0.76625645, "num_input_tokens_seen": 142466650, "step": 6633, "time_per_iteration": 2.9533073902130127 }, { "auxiliary_loss_clip": 0.01105798, "auxiliary_loss_mlp": 0.01028668, "balance_loss_clip": 1.0465138, "balance_loss_mlp": 1.01509583, "epoch": 0.3988576581993086, "flos": 23914064937600.0, "grad_norm": 1.7626671777587215, "language_loss": 0.81420207, "learning_rate": 2.7346338069806e-06, "loss": 0.83554673, "num_input_tokens_seen": 142486165, "step": 6634, "time_per_iteration": 2.760012626647949 }, { "auxiliary_loss_clip": 0.0110458, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.04739153, "balance_loss_mlp": 1.01618731, "epoch": 0.39891778145197654, "flos": 18149899858560.0, "grad_norm": 2.495702621722643, "language_loss": 0.74914795, "learning_rate": 2.7342715563600597e-06, "loss": 0.77050287, "num_input_tokens_seen": 142505035, "step": 6635, "time_per_iteration": 4.225152015686035 }, { "auxiliary_loss_clip": 0.01101511, "auxiliary_loss_mlp": 0.01039121, "balance_loss_clip": 1.04791617, "balance_loss_mlp": 1.02265239, "epoch": 0.3989779047046445, "flos": 22595281096320.0, "grad_norm": 28.19463582214486, "language_loss": 0.66373086, "learning_rate": 2.733909277895868e-06, "loss": 0.68513715, "num_input_tokens_seen": 142521870, "step": 6636, "time_per_iteration": 4.455794811248779 }, { "auxiliary_loss_clip": 0.01118899, "auxiliary_loss_mlp": 0.01041724, "balance_loss_clip": 1.04681683, "balance_loss_mlp": 1.02687669, "epoch": 0.39903802795731247, "flos": 18077216688000.0, "grad_norm": 2.0422591411720723, "language_loss": 0.81318372, "learning_rate": 2.733546971601763e-06, "loss": 0.83478993, "num_input_tokens_seen": 142540455, "step": 6637, "time_per_iteration": 4.3843090534210205 }, { "auxiliary_loss_clip": 0.0102804, "auxiliary_loss_mlp": 0.01018728, "balance_loss_clip": 1.02743387, "balance_loss_mlp": 1.01694012, "epoch": 0.39909815120998043, "flos": 70441367771520.0, "grad_norm": 0.719892771757815, "language_loss": 0.53119934, "learning_rate": 2.733184637491484e-06, "loss": 0.55166698, "num_input_tokens_seen": 142599665, "step": 6638, "time_per_iteration": 3.2910361289978027 }, { "auxiliary_loss_clip": 0.01112783, "auxiliary_loss_mlp": 0.00772668, "balance_loss_clip": 1.04786587, "balance_loss_mlp": 1.00065207, "epoch": 0.39915827446264845, "flos": 18549262247040.0, "grad_norm": 1.6115719033099838, "language_loss": 0.75487578, "learning_rate": 2.732822275578769e-06, "loss": 0.77373028, "num_input_tokens_seen": 142618845, "step": 6639, "time_per_iteration": 2.7083969116210938 }, { "auxiliary_loss_clip": 0.0105821, "auxiliary_loss_mlp": 0.01036909, "balance_loss_clip": 1.03856301, "balance_loss_mlp": 1.022264, "epoch": 0.3992183977153164, "flos": 29897249195520.0, "grad_norm": 2.505539025941121, "language_loss": 0.76163709, "learning_rate": 2.7324598858773603e-06, "loss": 0.78258824, "num_input_tokens_seen": 142640885, "step": 6640, "time_per_iteration": 2.8801841735839844 }, { "auxiliary_loss_clip": 0.01102565, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.04663992, "balance_loss_mlp": 1.02430892, "epoch": 0.3992785209679844, "flos": 22565080736640.0, "grad_norm": 2.779199402703341, "language_loss": 0.81995392, "learning_rate": 2.7320974684009996e-06, "loss": 0.84136951, "num_input_tokens_seen": 142659340, "step": 6641, "time_per_iteration": 4.346608638763428 }, { "auxiliary_loss_clip": 0.01136449, "auxiliary_loss_mlp": 0.01038781, "balance_loss_clip": 1.05189252, "balance_loss_mlp": 1.02393353, "epoch": 0.39933864422065235, "flos": 19682674974720.0, "grad_norm": 2.1545130527280985, "language_loss": 0.76744998, "learning_rate": 2.7317350231634288e-06, "loss": 0.78920233, "num_input_tokens_seen": 142677085, "step": 6642, "time_per_iteration": 2.656057596206665 }, { "auxiliary_loss_clip": 0.01106418, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.04871511, "balance_loss_mlp": 1.0196898, "epoch": 0.3993987674733203, "flos": 23038491012480.0, "grad_norm": 2.1744041926742788, "language_loss": 0.72387367, "learning_rate": 2.731372550178393e-06, "loss": 0.7452786, "num_input_tokens_seen": 142694595, "step": 6643, "time_per_iteration": 2.680995225906372 }, { "auxiliary_loss_clip": 0.01123145, "auxiliary_loss_mlp": 0.01040337, "balance_loss_clip": 1.04840899, "balance_loss_mlp": 1.02565074, "epoch": 0.3994588907259883, "flos": 19390828970880.0, "grad_norm": 1.7059817149479597, "language_loss": 0.6665355, "learning_rate": 2.7310100494596375e-06, "loss": 0.68817025, "num_input_tokens_seen": 142714175, "step": 6644, "time_per_iteration": 2.6378324031829834 }, { "auxiliary_loss_clip": 0.01130779, "auxiliary_loss_mlp": 0.0103839, "balance_loss_clip": 1.04629064, "balance_loss_mlp": 1.02349472, "epoch": 0.39951901397865625, "flos": 13734395758080.0, "grad_norm": 2.1425296608964937, "language_loss": 0.78164649, "learning_rate": 2.730647521020907e-06, "loss": 0.80333817, "num_input_tokens_seen": 142730955, "step": 6645, "time_per_iteration": 2.6268746852874756 }, { "auxiliary_loss_clip": 0.0112116, "auxiliary_loss_mlp": 0.01037104, "balance_loss_clip": 1.04624033, "balance_loss_mlp": 1.02252507, "epoch": 0.3995791372313242, "flos": 23586451966080.0, "grad_norm": 1.7492924628724136, "language_loss": 0.69861412, "learning_rate": 2.73028496487595e-06, "loss": 0.72019678, "num_input_tokens_seen": 142751200, "step": 6646, "time_per_iteration": 2.7350409030914307 }, { "auxiliary_loss_clip": 0.0107684, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.03799927, "balance_loss_mlp": 1.02223825, "epoch": 0.3996392604839922, "flos": 21355896268800.0, "grad_norm": 1.7623657715359762, "language_loss": 0.72017872, "learning_rate": 2.729922381038513e-06, "loss": 0.74132061, "num_input_tokens_seen": 142770170, "step": 6647, "time_per_iteration": 2.7607529163360596 }, { "auxiliary_loss_clip": 0.01093143, "auxiliary_loss_mlp": 0.01043089, "balance_loss_clip": 1.04529011, "balance_loss_mlp": 1.02973795, "epoch": 0.39969938373666014, "flos": 26032255914240.0, "grad_norm": 1.4563496549616326, "language_loss": 0.74217343, "learning_rate": 2.7295597695223463e-06, "loss": 0.7635358, "num_input_tokens_seen": 142792680, "step": 6648, "time_per_iteration": 2.8048219680786133 }, { "auxiliary_loss_clip": 0.01133606, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.04912674, "balance_loss_mlp": 1.02281022, "epoch": 0.3997595069893281, "flos": 20116367786880.0, "grad_norm": 2.0433040752683578, "language_loss": 0.6589973, "learning_rate": 2.7291971303412006e-06, "loss": 0.6807096, "num_input_tokens_seen": 142810510, "step": 6649, "time_per_iteration": 2.6976583003997803 }, { "auxiliary_loss_clip": 0.01103049, "auxiliary_loss_mlp": 0.01042133, "balance_loss_clip": 1.04713392, "balance_loss_mlp": 1.02803016, "epoch": 0.39981963024199607, "flos": 27783403764480.0, "grad_norm": 1.7319771659085785, "language_loss": 0.75106388, "learning_rate": 2.728834463508826e-06, "loss": 0.77251565, "num_input_tokens_seen": 142832455, "step": 6650, "time_per_iteration": 2.7441325187683105 }, { "auxiliary_loss_clip": 0.01132922, "auxiliary_loss_mlp": 0.01042591, "balance_loss_clip": 1.04873252, "balance_loss_mlp": 1.02803564, "epoch": 0.39987975349466404, "flos": 21944436612480.0, "grad_norm": 1.5673208577322473, "language_loss": 0.72102094, "learning_rate": 2.728471769038975e-06, "loss": 0.74277604, "num_input_tokens_seen": 142852590, "step": 6651, "time_per_iteration": 2.6027066707611084 }, { "auxiliary_loss_clip": 0.01132958, "auxiliary_loss_mlp": 0.01045235, "balance_loss_clip": 1.04850328, "balance_loss_mlp": 1.03093004, "epoch": 0.39993987674733206, "flos": 20704405340160.0, "grad_norm": 1.8492457158027382, "language_loss": 0.73126423, "learning_rate": 2.728109046945403e-06, "loss": 0.75304615, "num_input_tokens_seen": 142870595, "step": 6652, "time_per_iteration": 2.5880327224731445 }, { "auxiliary_loss_clip": 0.01029168, "auxiliary_loss_mlp": 0.01002764, "balance_loss_clip": 1.02822125, "balance_loss_mlp": 1.00134552, "epoch": 0.4, "flos": 61525429862400.0, "grad_norm": 0.8458278780382239, "language_loss": 0.60614997, "learning_rate": 2.727746297241862e-06, "loss": 0.62646931, "num_input_tokens_seen": 142925805, "step": 6653, "time_per_iteration": 3.1626622676849365 }, { "auxiliary_loss_clip": 0.01093219, "auxiliary_loss_mlp": 0.01039197, "balance_loss_clip": 1.04810715, "balance_loss_mlp": 1.02577376, "epoch": 0.400060123252668, "flos": 14502309644160.0, "grad_norm": 3.0617453661279788, "language_loss": 0.66701174, "learning_rate": 2.7273835199421085e-06, "loss": 0.6883359, "num_input_tokens_seen": 142943145, "step": 6654, "time_per_iteration": 2.696179151535034 }, { "auxiliary_loss_clip": 0.01119303, "auxiliary_loss_mlp": 0.01043738, "balance_loss_clip": 1.04738593, "balance_loss_mlp": 1.03145993, "epoch": 0.40012024650533595, "flos": 19093308618240.0, "grad_norm": 2.461149956206156, "language_loss": 0.89818919, "learning_rate": 2.7270207150599e-06, "loss": 0.91981959, "num_input_tokens_seen": 142956925, "step": 6655, "time_per_iteration": 2.601891279220581 }, { "auxiliary_loss_clip": 0.01100614, "auxiliary_loss_mlp": 0.01040322, "balance_loss_clip": 1.04367936, "balance_loss_mlp": 1.02693462, "epoch": 0.4001803697580039, "flos": 29351012094720.0, "grad_norm": 1.7913118709828861, "language_loss": 0.73551166, "learning_rate": 2.7266578826089917e-06, "loss": 0.75692105, "num_input_tokens_seen": 142978040, "step": 6656, "time_per_iteration": 2.705662727355957 }, { "auxiliary_loss_clip": 0.01131953, "auxiliary_loss_mlp": 0.01046856, "balance_loss_clip": 1.04838896, "balance_loss_mlp": 1.03224063, "epoch": 0.4002404930106719, "flos": 20920048640640.0, "grad_norm": 1.6512050463613386, "language_loss": 0.73344004, "learning_rate": 2.726295022603144e-06, "loss": 0.75522816, "num_input_tokens_seen": 142998390, "step": 6657, "time_per_iteration": 2.7595558166503906 }, { "auxiliary_loss_clip": 0.0113267, "auxiliary_loss_mlp": 0.01046679, "balance_loss_clip": 1.04887247, "balance_loss_mlp": 1.03145635, "epoch": 0.40030061626333985, "flos": 28405735827840.0, "grad_norm": 1.7318374723338787, "language_loss": 0.79715288, "learning_rate": 2.725932135056117e-06, "loss": 0.81894636, "num_input_tokens_seen": 143021505, "step": 6658, "time_per_iteration": 2.6718270778656006 }, { "auxiliary_loss_clip": 0.01115521, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.04249525, "balance_loss_mlp": 1.02865553, "epoch": 0.4003607395160078, "flos": 25921615046400.0, "grad_norm": 2.0999446343296317, "language_loss": 0.77464151, "learning_rate": 2.72556921998167e-06, "loss": 0.79621947, "num_input_tokens_seen": 143041375, "step": 6659, "time_per_iteration": 2.7160539627075195 }, { "auxiliary_loss_clip": 0.01118822, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.04276848, "balance_loss_mlp": 1.01649117, "epoch": 0.4004208627686758, "flos": 20768648814720.0, "grad_norm": 1.6781351315554156, "language_loss": 0.72410327, "learning_rate": 2.7252062773935662e-06, "loss": 0.74557567, "num_input_tokens_seen": 143058725, "step": 6660, "time_per_iteration": 2.636833429336548 }, { "auxiliary_loss_clip": 0.01101229, "auxiliary_loss_mlp": 0.01041317, "balance_loss_clip": 1.04196119, "balance_loss_mlp": 1.02828765, "epoch": 0.40048098602134374, "flos": 24681224638080.0, "grad_norm": 1.813091564393644, "language_loss": 0.71008015, "learning_rate": 2.7248433073055674e-06, "loss": 0.73150557, "num_input_tokens_seen": 143076995, "step": 6661, "time_per_iteration": 2.6956517696380615 }, { "auxiliary_loss_clip": 0.0113437, "auxiliary_loss_mlp": 0.01042051, "balance_loss_clip": 1.0506804, "balance_loss_mlp": 1.02825832, "epoch": 0.4005411092740117, "flos": 23185688947200.0, "grad_norm": 1.8086148623568068, "language_loss": 0.75526643, "learning_rate": 2.724480309731437e-06, "loss": 0.77703071, "num_input_tokens_seen": 143096780, "step": 6662, "time_per_iteration": 2.6232621669769287 }, { "auxiliary_loss_clip": 0.01115634, "auxiliary_loss_mlp": 0.01036997, "balance_loss_clip": 1.04385805, "balance_loss_mlp": 1.02194118, "epoch": 0.4006012325266797, "flos": 17522324409600.0, "grad_norm": 2.00646115239694, "language_loss": 0.66450548, "learning_rate": 2.7241172846849417e-06, "loss": 0.68603182, "num_input_tokens_seen": 143112590, "step": 6663, "time_per_iteration": 2.622520923614502 }, { "auxiliary_loss_clip": 0.01112804, "auxiliary_loss_mlp": 0.01042686, "balance_loss_clip": 1.04327071, "balance_loss_mlp": 1.02767718, "epoch": 0.40066135577934764, "flos": 19857200181120.0, "grad_norm": 2.069962140682172, "language_loss": 0.86383915, "learning_rate": 2.7237542321798455e-06, "loss": 0.88539398, "num_input_tokens_seen": 143130220, "step": 6664, "time_per_iteration": 2.575124979019165 }, { "auxiliary_loss_clip": 0.01119355, "auxiliary_loss_mlp": 0.01036584, "balance_loss_clip": 1.04696763, "balance_loss_mlp": 1.0227679, "epoch": 0.40072147903201566, "flos": 18150007599360.0, "grad_norm": 16.441358853078547, "language_loss": 0.84723455, "learning_rate": 2.723391152229917e-06, "loss": 0.86879396, "num_input_tokens_seen": 143147160, "step": 6665, "time_per_iteration": 2.671715259552002 }, { "auxiliary_loss_clip": 0.01119739, "auxiliary_loss_mlp": 0.01037355, "balance_loss_clip": 1.04976356, "balance_loss_mlp": 1.02249575, "epoch": 0.4007816022846836, "flos": 18661267831680.0, "grad_norm": 1.8896907519127706, "language_loss": 0.78118432, "learning_rate": 2.7230280448489236e-06, "loss": 0.80275524, "num_input_tokens_seen": 143164605, "step": 6666, "time_per_iteration": 2.606566905975342 }, { "auxiliary_loss_clip": 0.01120664, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.0485028, "balance_loss_mlp": 1.02380657, "epoch": 0.4008417255373516, "flos": 25703170485120.0, "grad_norm": 1.7955817814438895, "language_loss": 0.73301423, "learning_rate": 2.7226649100506333e-06, "loss": 0.75460339, "num_input_tokens_seen": 143183965, "step": 6667, "time_per_iteration": 2.652503490447998 }, { "auxiliary_loss_clip": 0.0111465, "auxiliary_loss_mlp": 0.01054818, "balance_loss_clip": 1.04516435, "balance_loss_mlp": 1.03899896, "epoch": 0.40090184879001955, "flos": 22858614679680.0, "grad_norm": 1.708550182183753, "language_loss": 0.76022822, "learning_rate": 2.7223017478488183e-06, "loss": 0.78192288, "num_input_tokens_seen": 143204965, "step": 6668, "time_per_iteration": 2.6797566413879395 }, { "auxiliary_loss_clip": 0.01096645, "auxiliary_loss_mlp": 0.01046849, "balance_loss_clip": 1.04792619, "balance_loss_mlp": 1.0321629, "epoch": 0.4009619720426875, "flos": 29059848449280.0, "grad_norm": 2.335244314112793, "language_loss": 0.8221435, "learning_rate": 2.721938558257248e-06, "loss": 0.84357846, "num_input_tokens_seen": 143225015, "step": 6669, "time_per_iteration": 2.7661361694335938 }, { "auxiliary_loss_clip": 0.010311, "auxiliary_loss_mlp": 0.01009516, "balance_loss_clip": 1.02684975, "balance_loss_mlp": 1.00805604, "epoch": 0.4010220952953555, "flos": 66059763131520.0, "grad_norm": 0.69994773813092, "language_loss": 0.53312683, "learning_rate": 2.721575341289695e-06, "loss": 0.55353302, "num_input_tokens_seen": 143294925, "step": 6670, "time_per_iteration": 3.5547046661376953 }, { "auxiliary_loss_clip": 0.01083638, "auxiliary_loss_mlp": 0.01041448, "balance_loss_clip": 1.04546833, "balance_loss_mlp": 1.02720881, "epoch": 0.40108221854802345, "flos": 29642822184960.0, "grad_norm": 1.626307597556219, "language_loss": 0.88544351, "learning_rate": 2.7212120969599333e-06, "loss": 0.90669441, "num_input_tokens_seen": 143314170, "step": 6671, "time_per_iteration": 2.9112329483032227 }, { "auxiliary_loss_clip": 0.01119533, "auxiliary_loss_mlp": 0.01036462, "balance_loss_clip": 1.04568124, "balance_loss_mlp": 1.02137589, "epoch": 0.4011423418006914, "flos": 19929560129280.0, "grad_norm": 3.0264857347014993, "language_loss": 0.79105932, "learning_rate": 2.720848825281736e-06, "loss": 0.81261927, "num_input_tokens_seen": 143330050, "step": 6672, "time_per_iteration": 2.789889335632324 }, { "auxiliary_loss_clip": 0.01096186, "auxiliary_loss_mlp": 0.01045513, "balance_loss_clip": 1.04610085, "balance_loss_mlp": 1.03012288, "epoch": 0.4012024650533594, "flos": 20084299920000.0, "grad_norm": 4.192283777131793, "language_loss": 0.6293034, "learning_rate": 2.72048552626888e-06, "loss": 0.65072036, "num_input_tokens_seen": 143348650, "step": 6673, "time_per_iteration": 2.796834945678711 }, { "auxiliary_loss_clip": 0.011055, "auxiliary_loss_mlp": 0.00771502, "balance_loss_clip": 1.04474831, "balance_loss_mlp": 1.00076985, "epoch": 0.40126258830602735, "flos": 21695719864320.0, "grad_norm": 1.5776272245666931, "language_loss": 0.79948354, "learning_rate": 2.7201221999351402e-06, "loss": 0.81825352, "num_input_tokens_seen": 143370275, "step": 6674, "time_per_iteration": 4.298279523849487 }, { "auxiliary_loss_clip": 0.0108893, "auxiliary_loss_mlp": 0.01040876, "balance_loss_clip": 1.04919565, "balance_loss_mlp": 1.02610552, "epoch": 0.4013227115586953, "flos": 12020379592320.0, "grad_norm": 6.494329221896898, "language_loss": 0.82218468, "learning_rate": 2.719758846294294e-06, "loss": 0.84348273, "num_input_tokens_seen": 143385390, "step": 6675, "time_per_iteration": 2.7607553005218506 }, { "auxiliary_loss_clip": 0.01116053, "auxiliary_loss_mlp": 0.01038994, "balance_loss_clip": 1.04261947, "balance_loss_mlp": 1.02364039, "epoch": 0.4013828348113633, "flos": 25447522412160.0, "grad_norm": 2.205024073964141, "language_loss": 0.93500578, "learning_rate": 2.71939546536012e-06, "loss": 0.95655626, "num_input_tokens_seen": 143404215, "step": 6676, "time_per_iteration": 5.81420373916626 }, { "auxiliary_loss_clip": 0.01126662, "auxiliary_loss_mlp": 0.01041717, "balance_loss_clip": 1.04832482, "balance_loss_mlp": 1.02589226, "epoch": 0.40144295806403124, "flos": 18582946225920.0, "grad_norm": 2.1287377468959727, "language_loss": 0.79300511, "learning_rate": 2.719032057146399e-06, "loss": 0.81468892, "num_input_tokens_seen": 143422245, "step": 6677, "time_per_iteration": 2.6485939025878906 }, { "auxiliary_loss_clip": 0.01107812, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.04743207, "balance_loss_mlp": 1.02122426, "epoch": 0.4015030813166992, "flos": 22930220442240.0, "grad_norm": 2.404301700652251, "language_loss": 0.83507645, "learning_rate": 2.71866862166691e-06, "loss": 0.85651207, "num_input_tokens_seen": 143443130, "step": 6678, "time_per_iteration": 2.749229907989502 }, { "auxiliary_loss_clip": 0.01127798, "auxiliary_loss_mlp": 0.01039278, "balance_loss_clip": 1.04660463, "balance_loss_mlp": 1.02481759, "epoch": 0.4015632045693672, "flos": 20595057361920.0, "grad_norm": 2.137342142944676, "language_loss": 0.63547456, "learning_rate": 2.718305158935434e-06, "loss": 0.65714526, "num_input_tokens_seen": 143461385, "step": 6679, "time_per_iteration": 4.272741794586182 }, { "auxiliary_loss_clip": 0.01100371, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.04277802, "balance_loss_mlp": 1.01852596, "epoch": 0.4016233278220352, "flos": 23438930808960.0, "grad_norm": 2.2420809209281582, "language_loss": 0.78955674, "learning_rate": 2.7179416689657554e-06, "loss": 0.81088501, "num_input_tokens_seen": 143481750, "step": 6680, "time_per_iteration": 2.6541543006896973 }, { "auxiliary_loss_clip": 0.01099744, "auxiliary_loss_mlp": 0.00773185, "balance_loss_clip": 1.04565692, "balance_loss_mlp": 1.0009259, "epoch": 0.40168345107470316, "flos": 21431057477760.0, "grad_norm": 1.5474671150398438, "language_loss": 0.75901389, "learning_rate": 2.7175781517716556e-06, "loss": 0.77774316, "num_input_tokens_seen": 143501540, "step": 6681, "time_per_iteration": 2.747549295425415 }, { "auxiliary_loss_clip": 0.01092334, "auxiliary_loss_mlp": 0.01031295, "balance_loss_clip": 1.04743123, "balance_loss_mlp": 1.01728785, "epoch": 0.4017435743273711, "flos": 22857214049280.0, "grad_norm": 1.9537198932922564, "language_loss": 0.64593118, "learning_rate": 2.7172146073669213e-06, "loss": 0.66716748, "num_input_tokens_seen": 143520530, "step": 6682, "time_per_iteration": 2.764676094055176 }, { "auxiliary_loss_clip": 0.01084656, "auxiliary_loss_mlp": 0.01040039, "balance_loss_clip": 1.04031992, "balance_loss_mlp": 1.025424, "epoch": 0.4018036975800391, "flos": 28622312881920.0, "grad_norm": 8.033606907615594, "language_loss": 0.72794902, "learning_rate": 2.716851035765337e-06, "loss": 0.74919599, "num_input_tokens_seen": 143540210, "step": 6683, "time_per_iteration": 2.9106507301330566 }, { "auxiliary_loss_clip": 0.01116481, "auxiliary_loss_mlp": 0.01043307, "balance_loss_clip": 1.04472065, "balance_loss_mlp": 1.02844119, "epoch": 0.40186382083270705, "flos": 26651212099200.0, "grad_norm": 1.6079104273266733, "language_loss": 0.73560667, "learning_rate": 2.7164874369806896e-06, "loss": 0.75720453, "num_input_tokens_seen": 143560940, "step": 6684, "time_per_iteration": 2.814746141433716 }, { "auxiliary_loss_clip": 0.01038178, "auxiliary_loss_mlp": 0.01003165, "balance_loss_clip": 1.02248073, "balance_loss_mlp": 1.00177026, "epoch": 0.401923944085375, "flos": 59259969123840.0, "grad_norm": 0.8040960642815781, "language_loss": 0.6037817, "learning_rate": 2.716123811026767e-06, "loss": 0.6241951, "num_input_tokens_seen": 143624015, "step": 6685, "time_per_iteration": 3.3159523010253906 }, { "auxiliary_loss_clip": 0.01121727, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.04626095, "balance_loss_mlp": 1.01806533, "epoch": 0.401984067338043, "flos": 16982803152000.0, "grad_norm": 2.1640557725493563, "language_loss": 0.69947135, "learning_rate": 2.715760157917357e-06, "loss": 0.7210151, "num_input_tokens_seen": 143642750, "step": 6686, "time_per_iteration": 2.7339890003204346 }, { "auxiliary_loss_clip": 0.01109024, "auxiliary_loss_mlp": 0.01036336, "balance_loss_clip": 1.04641056, "balance_loss_mlp": 1.02213836, "epoch": 0.40204419059071095, "flos": 24972496024320.0, "grad_norm": 1.482832144271372, "language_loss": 0.74904519, "learning_rate": 2.7153964776662504e-06, "loss": 0.77049881, "num_input_tokens_seen": 143664515, "step": 6687, "time_per_iteration": 2.7403111457824707 }, { "auxiliary_loss_clip": 0.01110823, "auxiliary_loss_mlp": 0.01036282, "balance_loss_clip": 1.04890549, "balance_loss_mlp": 1.02179182, "epoch": 0.4021043138433789, "flos": 23477463123840.0, "grad_norm": 1.9109413621033529, "language_loss": 0.71165651, "learning_rate": 2.7150327702872385e-06, "loss": 0.73312759, "num_input_tokens_seen": 143683135, "step": 6688, "time_per_iteration": 2.7349321842193604 }, { "auxiliary_loss_clip": 0.01105847, "auxiliary_loss_mlp": 0.01043979, "balance_loss_clip": 1.0426929, "balance_loss_mlp": 1.02785039, "epoch": 0.4021644370960469, "flos": 25995806588160.0, "grad_norm": 2.0144045301965248, "language_loss": 0.64289308, "learning_rate": 2.7146690357941112e-06, "loss": 0.66439128, "num_input_tokens_seen": 143703985, "step": 6689, "time_per_iteration": 2.740938186645508 }, { "auxiliary_loss_clip": 0.0112261, "auxiliary_loss_mlp": 0.01032805, "balance_loss_clip": 1.04519129, "balance_loss_mlp": 1.01838636, "epoch": 0.40222456034871484, "flos": 13587987922560.0, "grad_norm": 2.8658666003554147, "language_loss": 0.7358911, "learning_rate": 2.7143052742006632e-06, "loss": 0.75744528, "num_input_tokens_seen": 143719245, "step": 6690, "time_per_iteration": 2.622920513153076 }, { "auxiliary_loss_clip": 0.01099316, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.04444623, "balance_loss_mlp": 1.0230422, "epoch": 0.4022846836013828, "flos": 24278019494400.0, "grad_norm": 1.7112869735009542, "language_loss": 0.74805617, "learning_rate": 2.7139414855206872e-06, "loss": 0.76942438, "num_input_tokens_seen": 143739575, "step": 6691, "time_per_iteration": 2.704138994216919 }, { "auxiliary_loss_clip": 0.0111344, "auxiliary_loss_mlp": 0.01040266, "balance_loss_clip": 1.0485332, "balance_loss_mlp": 1.02509689, "epoch": 0.40234480685405083, "flos": 20151596050560.0, "grad_norm": 1.5633314974955987, "language_loss": 0.7267946, "learning_rate": 2.7135776697679785e-06, "loss": 0.74833167, "num_input_tokens_seen": 143758515, "step": 6692, "time_per_iteration": 2.6782071590423584 }, { "auxiliary_loss_clip": 0.01081716, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.04122448, "balance_loss_mlp": 1.02274227, "epoch": 0.4024049301067188, "flos": 22930220442240.0, "grad_norm": 2.743543242099247, "language_loss": 0.84403068, "learning_rate": 2.7132138269563333e-06, "loss": 0.8652209, "num_input_tokens_seen": 143776770, "step": 6693, "time_per_iteration": 2.746689558029175 }, { "auxiliary_loss_clip": 0.01092043, "auxiliary_loss_mlp": 0.0104876, "balance_loss_clip": 1.04803836, "balance_loss_mlp": 1.03265464, "epoch": 0.40246505335938676, "flos": 36028421487360.0, "grad_norm": 2.4363636021200716, "language_loss": 0.70996636, "learning_rate": 2.7128499570995483e-06, "loss": 0.73137438, "num_input_tokens_seen": 143798450, "step": 6694, "time_per_iteration": 2.8071961402893066 }, { "auxiliary_loss_clip": 0.01104186, "auxiliary_loss_mlp": 0.01044295, "balance_loss_clip": 1.04619551, "balance_loss_mlp": 1.0292511, "epoch": 0.4025251766120547, "flos": 20594303176320.0, "grad_norm": 2.4336892369471976, "language_loss": 0.67823637, "learning_rate": 2.7124860602114212e-06, "loss": 0.6997211, "num_input_tokens_seen": 143816995, "step": 6695, "time_per_iteration": 2.628509283065796 }, { "auxiliary_loss_clip": 0.01100807, "auxiliary_loss_mlp": 0.01043105, "balance_loss_clip": 1.04269171, "balance_loss_mlp": 1.0272975, "epoch": 0.4025852998647227, "flos": 64523932381440.0, "grad_norm": 2.090135381279502, "language_loss": 0.79316044, "learning_rate": 2.7121221363057515e-06, "loss": 0.81459951, "num_input_tokens_seen": 143842090, "step": 6696, "time_per_iteration": 3.065619707107544 }, { "auxiliary_loss_clip": 0.01107424, "auxiliary_loss_mlp": 0.0105453, "balance_loss_clip": 1.04772997, "balance_loss_mlp": 1.03700638, "epoch": 0.40264542311739066, "flos": 20886292834560.0, "grad_norm": 2.0469796766510164, "language_loss": 0.71048194, "learning_rate": 2.7117581853963393e-06, "loss": 0.73210156, "num_input_tokens_seen": 143860800, "step": 6697, "time_per_iteration": 2.732112169265747 }, { "auxiliary_loss_clip": 0.01119834, "auxiliary_loss_mlp": 0.01045848, "balance_loss_clip": 1.04644823, "balance_loss_mlp": 1.03167999, "epoch": 0.4027055463700586, "flos": 26250197685120.0, "grad_norm": 2.1595912992700725, "language_loss": 0.6184175, "learning_rate": 2.711394207496984e-06, "loss": 0.64007437, "num_input_tokens_seen": 143878950, "step": 6698, "time_per_iteration": 2.6853909492492676 }, { "auxiliary_loss_clip": 0.01122685, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.04787982, "balance_loss_mlp": 1.02309155, "epoch": 0.4027656696227266, "flos": 20631398947200.0, "grad_norm": 2.043260848719272, "language_loss": 0.76455128, "learning_rate": 2.711030202621491e-06, "loss": 0.78616071, "num_input_tokens_seen": 143898385, "step": 6699, "time_per_iteration": 2.6033456325531006 }, { "auxiliary_loss_clip": 0.01093615, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.04446507, "balance_loss_mlp": 1.01700354, "epoch": 0.40282579287539455, "flos": 22346277039360.0, "grad_norm": 1.6890007857677205, "language_loss": 0.80442715, "learning_rate": 2.7106661707836605e-06, "loss": 0.82567334, "num_input_tokens_seen": 143918795, "step": 6700, "time_per_iteration": 2.777510404586792 }, { "auxiliary_loss_clip": 0.01112643, "auxiliary_loss_mlp": 0.01045016, "balance_loss_clip": 1.04943717, "balance_loss_mlp": 1.02808821, "epoch": 0.4028859161280625, "flos": 29274988959360.0, "grad_norm": 2.176323872107602, "language_loss": 0.74529326, "learning_rate": 2.7103021119972977e-06, "loss": 0.7668699, "num_input_tokens_seen": 143938245, "step": 6701, "time_per_iteration": 2.7424893379211426 }, { "auxiliary_loss_clip": 0.01099003, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.04379773, "balance_loss_mlp": 1.02355886, "epoch": 0.4029460393807305, "flos": 28622312881920.0, "grad_norm": 1.8130604516939894, "language_loss": 0.66064012, "learning_rate": 2.709938026276208e-06, "loss": 0.68200922, "num_input_tokens_seen": 143960995, "step": 6702, "time_per_iteration": 2.7448410987854004 }, { "auxiliary_loss_clip": 0.01105222, "auxiliary_loss_mlp": 0.01045108, "balance_loss_clip": 1.0470736, "balance_loss_mlp": 1.02900267, "epoch": 0.40300616263339845, "flos": 22601925112320.0, "grad_norm": 1.86356350955038, "language_loss": 0.66031915, "learning_rate": 2.7095739136341964e-06, "loss": 0.68182242, "num_input_tokens_seen": 143979910, "step": 6703, "time_per_iteration": 2.679979085922241 }, { "auxiliary_loss_clip": 0.01060539, "auxiliary_loss_mlp": 0.01041946, "balance_loss_clip": 1.04386449, "balance_loss_mlp": 1.02445817, "epoch": 0.4030662858860664, "flos": 25520313323520.0, "grad_norm": 2.0398618821746304, "language_loss": 0.82689512, "learning_rate": 2.709209774085071e-06, "loss": 0.84792, "num_input_tokens_seen": 144000095, "step": 6704, "time_per_iteration": 2.9296765327453613 }, { "auxiliary_loss_clip": 0.01112771, "auxiliary_loss_mlp": 0.01040131, "balance_loss_clip": 1.04960763, "balance_loss_mlp": 1.02517009, "epoch": 0.40312640913873443, "flos": 23586703361280.0, "grad_norm": 1.6638111373196858, "language_loss": 0.73759186, "learning_rate": 2.7088456076426407e-06, "loss": 0.75912088, "num_input_tokens_seen": 144019695, "step": 6705, "time_per_iteration": 3.0039970874786377 }, { "auxiliary_loss_clip": 0.0111798, "auxiliary_loss_mlp": 0.01039471, "balance_loss_clip": 1.04735386, "balance_loss_mlp": 1.02541077, "epoch": 0.4031865323914024, "flos": 20011042131840.0, "grad_norm": 1.7718881662691552, "language_loss": 0.65816283, "learning_rate": 2.708481414320713e-06, "loss": 0.67973745, "num_input_tokens_seen": 144038525, "step": 6706, "time_per_iteration": 2.6920299530029297 }, { "auxiliary_loss_clip": 0.01123098, "auxiliary_loss_mlp": 0.01039977, "balance_loss_clip": 1.05084229, "balance_loss_mlp": 1.02508759, "epoch": 0.40324665564407036, "flos": 21871430219520.0, "grad_norm": 1.5916886093016338, "language_loss": 0.71493578, "learning_rate": 2.7081171941330992e-06, "loss": 0.73656654, "num_input_tokens_seen": 144059485, "step": 6707, "time_per_iteration": 2.6424286365509033 }, { "auxiliary_loss_clip": 0.01104664, "auxiliary_loss_mlp": 0.0103554, "balance_loss_clip": 1.04652226, "balance_loss_mlp": 1.0201261, "epoch": 0.4033067788967383, "flos": 23878728933120.0, "grad_norm": 1.6049010195706548, "language_loss": 0.79860801, "learning_rate": 2.707752947093611e-06, "loss": 0.82001007, "num_input_tokens_seen": 144080265, "step": 6708, "time_per_iteration": 2.7476210594177246 }, { "auxiliary_loss_clip": 0.01081311, "auxiliary_loss_mlp": 0.01041497, "balance_loss_clip": 1.04192591, "balance_loss_mlp": 1.0254873, "epoch": 0.4033669021494063, "flos": 17419907756160.0, "grad_norm": 2.2092970812397823, "language_loss": 0.82527256, "learning_rate": 2.70738867321606e-06, "loss": 0.84650064, "num_input_tokens_seen": 144098040, "step": 6709, "time_per_iteration": 2.6981422901153564 }, { "auxiliary_loss_clip": 0.01126319, "auxiliary_loss_mlp": 0.01037264, "balance_loss_clip": 1.052701, "balance_loss_mlp": 1.02168322, "epoch": 0.40342702540207426, "flos": 29600554855680.0, "grad_norm": 3.462855853005799, "language_loss": 0.71349508, "learning_rate": 2.70702437251426e-06, "loss": 0.73513091, "num_input_tokens_seen": 144118265, "step": 6710, "time_per_iteration": 2.745234727859497 }, { "auxiliary_loss_clip": 0.01100277, "auxiliary_loss_mlp": 0.01040518, "balance_loss_clip": 1.0461812, "balance_loss_mlp": 1.02506852, "epoch": 0.4034871486547422, "flos": 11284605400320.0, "grad_norm": 2.0008015592173285, "language_loss": 0.8497777, "learning_rate": 2.7066600450020236e-06, "loss": 0.8711856, "num_input_tokens_seen": 144133865, "step": 6711, "time_per_iteration": 2.6388518810272217 }, { "auxiliary_loss_clip": 0.01124865, "auxiliary_loss_mlp": 0.01037377, "balance_loss_clip": 1.04873466, "balance_loss_mlp": 1.02192783, "epoch": 0.4035472719074102, "flos": 15552839738880.0, "grad_norm": 1.9288958482484087, "language_loss": 0.76210845, "learning_rate": 2.706295690693168e-06, "loss": 0.78373086, "num_input_tokens_seen": 144150125, "step": 6712, "time_per_iteration": 2.617612838745117 }, { "auxiliary_loss_clip": 0.0110296, "auxiliary_loss_mlp": 0.01042297, "balance_loss_clip": 1.0465771, "balance_loss_mlp": 1.02682328, "epoch": 0.40360739516007815, "flos": 24674365140480.0, "grad_norm": 2.8401310029686284, "language_loss": 0.79334903, "learning_rate": 2.7059313096015096e-06, "loss": 0.81480157, "num_input_tokens_seen": 144169295, "step": 6713, "time_per_iteration": 4.2229533195495605 }, { "auxiliary_loss_clip": 0.01096327, "auxiliary_loss_mlp": 0.01040909, "balance_loss_clip": 1.04259837, "balance_loss_mlp": 1.02437484, "epoch": 0.4036675184127461, "flos": 17304095329920.0, "grad_norm": 2.4269881355691867, "language_loss": 0.88230258, "learning_rate": 2.705566901740865e-06, "loss": 0.90367496, "num_input_tokens_seen": 144185790, "step": 6714, "time_per_iteration": 2.6861040592193604 }, { "auxiliary_loss_clip": 0.0112277, "auxiliary_loss_mlp": 0.01042461, "balance_loss_clip": 1.04913116, "balance_loss_mlp": 1.02755439, "epoch": 0.4037276416654141, "flos": 19864023765120.0, "grad_norm": 1.685218394347131, "language_loss": 0.69355965, "learning_rate": 2.7052024671250527e-06, "loss": 0.71521199, "num_input_tokens_seen": 144205190, "step": 6715, "time_per_iteration": 6.05805778503418 }, { "auxiliary_loss_clip": 0.01085368, "auxiliary_loss_mlp": 0.01039127, "balance_loss_clip": 1.03982067, "balance_loss_mlp": 1.02422547, "epoch": 0.40378776491808205, "flos": 18296271780480.0, "grad_norm": 2.4042590138214543, "language_loss": 0.7738142, "learning_rate": 2.704838005767892e-06, "loss": 0.7950592, "num_input_tokens_seen": 144222705, "step": 6716, "time_per_iteration": 2.874701738357544 }, { "auxiliary_loss_clip": 0.01084201, "auxiliary_loss_mlp": 0.01039901, "balance_loss_clip": 1.04515779, "balance_loss_mlp": 1.02554834, "epoch": 0.40384788817075, "flos": 15049372757760.0, "grad_norm": 1.8822370621315767, "language_loss": 0.7590825, "learning_rate": 2.7044735176832037e-06, "loss": 0.78032351, "num_input_tokens_seen": 144239545, "step": 6717, "time_per_iteration": 2.806605339050293 }, { "auxiliary_loss_clip": 0.01034573, "auxiliary_loss_mlp": 0.01006348, "balance_loss_clip": 1.03120637, "balance_loss_mlp": 1.00481057, "epoch": 0.40390801142341803, "flos": 61929927895680.0, "grad_norm": 0.9365934623644069, "language_loss": 0.60732949, "learning_rate": 2.7041090028848084e-06, "loss": 0.62773865, "num_input_tokens_seen": 144288145, "step": 6718, "time_per_iteration": 4.683047771453857 }, { "auxiliary_loss_clip": 0.01137275, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.04942691, "balance_loss_mlp": 1.02322555, "epoch": 0.403968134676086, "flos": 22738779930240.0, "grad_norm": 2.360676977629441, "language_loss": 0.74748445, "learning_rate": 2.7037444613865306e-06, "loss": 0.76925087, "num_input_tokens_seen": 144302315, "step": 6719, "time_per_iteration": 2.6020865440368652 }, { "auxiliary_loss_clip": 0.01122679, "auxiliary_loss_mlp": 0.01042794, "balance_loss_clip": 1.04766619, "balance_loss_mlp": 1.02643895, "epoch": 0.40402825792875396, "flos": 19784409269760.0, "grad_norm": 2.123342604077105, "language_loss": 0.81516802, "learning_rate": 2.7033798932021906e-06, "loss": 0.83682275, "num_input_tokens_seen": 144318990, "step": 6720, "time_per_iteration": 2.6707048416137695 }, { "auxiliary_loss_clip": 0.01106407, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.04365981, "balance_loss_mlp": 1.01866555, "epoch": 0.40408838118142193, "flos": 19609273532160.0, "grad_norm": 2.786601864332057, "language_loss": 0.77150661, "learning_rate": 2.7030152983456153e-06, "loss": 0.79290426, "num_input_tokens_seen": 144335765, "step": 6721, "time_per_iteration": 2.648050546646118 }, { "auxiliary_loss_clip": 0.01091711, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.04391122, "balance_loss_mlp": 1.01643503, "epoch": 0.4041485044340899, "flos": 24426043441920.0, "grad_norm": 2.012609049395132, "language_loss": 0.72214961, "learning_rate": 2.7026506768306304e-06, "loss": 0.74336231, "num_input_tokens_seen": 144355825, "step": 6722, "time_per_iteration": 2.7598764896392822 }, { "auxiliary_loss_clip": 0.01117849, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.04649532, "balance_loss_mlp": 1.02137017, "epoch": 0.40420862768675786, "flos": 16760192613120.0, "grad_norm": 2.003025152561758, "language_loss": 0.66099858, "learning_rate": 2.7022860286710602e-06, "loss": 0.68252993, "num_input_tokens_seen": 144374320, "step": 6723, "time_per_iteration": 2.6525375843048096 }, { "auxiliary_loss_clip": 0.0111764, "auxiliary_loss_mlp": 0.01047962, "balance_loss_clip": 1.04678059, "balance_loss_mlp": 1.03247619, "epoch": 0.4042687509394258, "flos": 22491571553280.0, "grad_norm": 1.6479262490520643, "language_loss": 0.73566139, "learning_rate": 2.701921353880734e-06, "loss": 0.75731742, "num_input_tokens_seen": 144394325, "step": 6724, "time_per_iteration": 2.6602234840393066 }, { "auxiliary_loss_clip": 0.01096943, "auxiliary_loss_mlp": 0.01034012, "balance_loss_clip": 1.04471684, "balance_loss_mlp": 1.02009475, "epoch": 0.4043288741920938, "flos": 30336149479680.0, "grad_norm": 1.8514955948130458, "language_loss": 0.74733102, "learning_rate": 2.7015566524734787e-06, "loss": 0.76864064, "num_input_tokens_seen": 144412765, "step": 6725, "time_per_iteration": 2.7086737155914307 }, { "auxiliary_loss_clip": 0.01116531, "auxiliary_loss_mlp": 0.01035939, "balance_loss_clip": 1.04757476, "balance_loss_mlp": 1.02062047, "epoch": 0.40438899744476176, "flos": 46348321363200.0, "grad_norm": 2.3229573968410766, "language_loss": 0.76987183, "learning_rate": 2.701191924463126e-06, "loss": 0.7913965, "num_input_tokens_seen": 144435400, "step": 6726, "time_per_iteration": 2.880244493484497 }, { "auxiliary_loss_clip": 0.01102844, "auxiliary_loss_mlp": 0.00775301, "balance_loss_clip": 1.04148483, "balance_loss_mlp": 1.00105536, "epoch": 0.4044491206974297, "flos": 13333524998400.0, "grad_norm": 2.125548317574291, "language_loss": 0.8180182, "learning_rate": 2.7008271698635054e-06, "loss": 0.83679968, "num_input_tokens_seen": 144452925, "step": 6727, "time_per_iteration": 2.6953587532043457 }, { "auxiliary_loss_clip": 0.01128783, "auxiliary_loss_mlp": 0.01036901, "balance_loss_clip": 1.04577255, "balance_loss_mlp": 1.02264905, "epoch": 0.4045092439500977, "flos": 12093745121280.0, "grad_norm": 2.0701087852414504, "language_loss": 0.85462439, "learning_rate": 2.700462388688447e-06, "loss": 0.87628114, "num_input_tokens_seen": 144470195, "step": 6728, "time_per_iteration": 2.5963056087493896 }, { "auxiliary_loss_clip": 0.01095663, "auxiliary_loss_mlp": 0.01043865, "balance_loss_clip": 1.04611719, "balance_loss_mlp": 1.029351, "epoch": 0.40456936720276565, "flos": 21179683123200.0, "grad_norm": 1.739738235535384, "language_loss": 0.81606215, "learning_rate": 2.700097580951786e-06, "loss": 0.83745748, "num_input_tokens_seen": 144490320, "step": 6729, "time_per_iteration": 2.8157620429992676 }, { "auxiliary_loss_clip": 0.01105665, "auxiliary_loss_mlp": 0.01043945, "balance_loss_clip": 1.0443244, "balance_loss_mlp": 1.02993762, "epoch": 0.4046294904554336, "flos": 23915286000000.0, "grad_norm": 1.917482865643355, "language_loss": 0.73375344, "learning_rate": 2.6997327466673533e-06, "loss": 0.75524956, "num_input_tokens_seen": 144508990, "step": 6730, "time_per_iteration": 2.67053484916687 }, { "auxiliary_loss_clip": 0.01113781, "auxiliary_loss_mlp": 0.01041271, "balance_loss_clip": 1.04319108, "balance_loss_mlp": 1.02674532, "epoch": 0.4046896137081016, "flos": 38071235773440.0, "grad_norm": 2.5953767613834673, "language_loss": 0.67485142, "learning_rate": 2.699367885848985e-06, "loss": 0.69640195, "num_input_tokens_seen": 144529550, "step": 6731, "time_per_iteration": 2.8106632232666016 }, { "auxiliary_loss_clip": 0.01128909, "auxiliary_loss_mlp": 0.01038653, "balance_loss_clip": 1.04689097, "balance_loss_mlp": 1.02531338, "epoch": 0.4047497369607696, "flos": 23617262856960.0, "grad_norm": 1.5691591770044138, "language_loss": 0.74245793, "learning_rate": 2.699002998510517e-06, "loss": 0.76413357, "num_input_tokens_seen": 144549310, "step": 6732, "time_per_iteration": 2.6608641147613525 }, { "auxiliary_loss_clip": 0.0110044, "auxiliary_loss_mlp": 0.00770096, "balance_loss_clip": 1.04635525, "balance_loss_mlp": 1.00099349, "epoch": 0.40480986021343757, "flos": 12823593569280.0, "grad_norm": 1.738611378800115, "language_loss": 0.77579916, "learning_rate": 2.6986380846657852e-06, "loss": 0.79450446, "num_input_tokens_seen": 144567430, "step": 6733, "time_per_iteration": 2.648707151412964 }, { "auxiliary_loss_clip": 0.01102753, "auxiliary_loss_mlp": 0.01043236, "balance_loss_clip": 1.04195142, "balance_loss_mlp": 1.0276798, "epoch": 0.40486998346610553, "flos": 23768770423680.0, "grad_norm": 1.875618790304424, "language_loss": 0.76887047, "learning_rate": 2.698273144328627e-06, "loss": 0.79033035, "num_input_tokens_seen": 144585975, "step": 6734, "time_per_iteration": 2.7222812175750732 }, { "auxiliary_loss_clip": 0.01110956, "auxiliary_loss_mlp": 0.01032993, "balance_loss_clip": 1.04893517, "balance_loss_mlp": 1.01923609, "epoch": 0.4049301067187735, "flos": 22856818999680.0, "grad_norm": 2.463703641644531, "language_loss": 0.64536786, "learning_rate": 2.6979081775128805e-06, "loss": 0.66680741, "num_input_tokens_seen": 144605225, "step": 6735, "time_per_iteration": 2.682111978530884 }, { "auxiliary_loss_clip": 0.01088904, "auxiliary_loss_mlp": 0.01039113, "balance_loss_clip": 1.04142201, "balance_loss_mlp": 1.0247122, "epoch": 0.40499022997144146, "flos": 22783992174720.0, "grad_norm": 1.9621030422141739, "language_loss": 0.83120507, "learning_rate": 2.697543184232387e-06, "loss": 0.85248524, "num_input_tokens_seen": 144624145, "step": 6736, "time_per_iteration": 2.737946033477783 }, { "auxiliary_loss_clip": 0.01103471, "auxiliary_loss_mlp": 0.00773133, "balance_loss_clip": 1.04903114, "balance_loss_mlp": 1.00089931, "epoch": 0.4050503532241094, "flos": 23039352938880.0, "grad_norm": 1.950757015883091, "language_loss": 0.75173002, "learning_rate": 2.6971781645009863e-06, "loss": 0.77049613, "num_input_tokens_seen": 144644470, "step": 6737, "time_per_iteration": 2.7009494304656982 }, { "auxiliary_loss_clip": 0.01119674, "auxiliary_loss_mlp": 0.01042697, "balance_loss_clip": 1.04876637, "balance_loss_mlp": 1.02858806, "epoch": 0.4051104764767774, "flos": 16647756065280.0, "grad_norm": 3.18955375846042, "language_loss": 0.72142565, "learning_rate": 2.696813118332519e-06, "loss": 0.74304938, "num_input_tokens_seen": 144661055, "step": 6738, "time_per_iteration": 2.63269305229187 }, { "auxiliary_loss_clip": 0.01094776, "auxiliary_loss_mlp": 0.01033432, "balance_loss_clip": 1.04453516, "balance_loss_mlp": 1.02065849, "epoch": 0.40517059972944536, "flos": 16358962717440.0, "grad_norm": 1.9585661201522753, "language_loss": 0.75113159, "learning_rate": 2.696448045740828e-06, "loss": 0.77241367, "num_input_tokens_seen": 144677935, "step": 6739, "time_per_iteration": 2.678330421447754 }, { "auxiliary_loss_clip": 0.01092708, "auxiliary_loss_mlp": 0.01036695, "balance_loss_clip": 1.04475963, "balance_loss_mlp": 1.02244925, "epoch": 0.4052307229821133, "flos": 28803374363520.0, "grad_norm": 2.0151914408481066, "language_loss": 0.73516095, "learning_rate": 2.6960829467397576e-06, "loss": 0.75645494, "num_input_tokens_seen": 144697725, "step": 6740, "time_per_iteration": 2.821165084838867 }, { "auxiliary_loss_clip": 0.01111182, "auxiliary_loss_mlp": 0.01032908, "balance_loss_clip": 1.04380143, "balance_loss_mlp": 1.01927674, "epoch": 0.4052908462347813, "flos": 21397876289280.0, "grad_norm": 1.5447802431592257, "language_loss": 0.77149022, "learning_rate": 2.695717821343153e-06, "loss": 0.79293114, "num_input_tokens_seen": 144718805, "step": 6741, "time_per_iteration": 2.639744758605957 }, { "auxiliary_loss_clip": 0.01132474, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.04797888, "balance_loss_mlp": 1.02415919, "epoch": 0.40535096948744925, "flos": 22419067950720.0, "grad_norm": 2.3470472177782584, "language_loss": 0.71132898, "learning_rate": 2.6953526695648577e-06, "loss": 0.73304784, "num_input_tokens_seen": 144737105, "step": 6742, "time_per_iteration": 2.566246509552002 }, { "auxiliary_loss_clip": 0.01132445, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.04941666, "balance_loss_mlp": 1.01739824, "epoch": 0.4054110927401172, "flos": 17010776868480.0, "grad_norm": 2.3285032794047966, "language_loss": 0.71915448, "learning_rate": 2.6949874914187202e-06, "loss": 0.74079311, "num_input_tokens_seen": 144751350, "step": 6743, "time_per_iteration": 2.7150700092315674 }, { "auxiliary_loss_clip": 0.01109405, "auxiliary_loss_mlp": 0.01036045, "balance_loss_clip": 1.04626715, "balance_loss_mlp": 1.0209291, "epoch": 0.4054712159927852, "flos": 21614848392960.0, "grad_norm": 2.2533363543989053, "language_loss": 0.70529258, "learning_rate": 2.694622286918588e-06, "loss": 0.72674704, "num_input_tokens_seen": 144770030, "step": 6744, "time_per_iteration": 2.715900421142578 }, { "auxiliary_loss_clip": 0.01118115, "auxiliary_loss_mlp": 0.01036188, "balance_loss_clip": 1.04826701, "balance_loss_mlp": 1.02316439, "epoch": 0.4055313392454532, "flos": 25812554376960.0, "grad_norm": 1.8071994834567642, "language_loss": 0.80102956, "learning_rate": 2.6942570560783076e-06, "loss": 0.82257259, "num_input_tokens_seen": 144790965, "step": 6745, "time_per_iteration": 2.6989259719848633 }, { "auxiliary_loss_clip": 0.01108583, "auxiliary_loss_mlp": 0.01034972, "balance_loss_clip": 1.04861784, "balance_loss_mlp": 1.02049959, "epoch": 0.40559146249812117, "flos": 14137098111360.0, "grad_norm": 1.8906308851954157, "language_loss": 0.66942173, "learning_rate": 2.693891798911731e-06, "loss": 0.69085735, "num_input_tokens_seen": 144807755, "step": 6746, "time_per_iteration": 2.7211005687713623 }, { "auxiliary_loss_clip": 0.01092509, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.04508781, "balance_loss_mlp": 1.02044201, "epoch": 0.40565158575078913, "flos": 41355481962240.0, "grad_norm": 1.4960206584486848, "language_loss": 0.57240731, "learning_rate": 2.6935265154327075e-06, "loss": 0.59367168, "num_input_tokens_seen": 144832405, "step": 6747, "time_per_iteration": 2.8735926151275635 }, { "auxiliary_loss_clip": 0.0109681, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.04770565, "balance_loss_mlp": 1.03084731, "epoch": 0.4057117090034571, "flos": 28544529980160.0, "grad_norm": 1.7545120295248704, "language_loss": 0.8468259, "learning_rate": 2.693161205655089e-06, "loss": 0.86822933, "num_input_tokens_seen": 144853890, "step": 6748, "time_per_iteration": 2.7470786571502686 }, { "auxiliary_loss_clip": 0.01107762, "auxiliary_loss_mlp": 0.0104113, "balance_loss_clip": 1.05110598, "balance_loss_mlp": 1.02695, "epoch": 0.40577183225612506, "flos": 18004066640640.0, "grad_norm": 2.5881063547984398, "language_loss": 0.81445849, "learning_rate": 2.6927958695927287e-06, "loss": 0.83594739, "num_input_tokens_seen": 144871395, "step": 6749, "time_per_iteration": 2.677762746810913 }, { "auxiliary_loss_clip": 0.01119763, "auxiliary_loss_mlp": 0.00771508, "balance_loss_clip": 1.04914761, "balance_loss_mlp": 1.00084698, "epoch": 0.40583195550879303, "flos": 19536734016000.0, "grad_norm": 1.7422987888005266, "language_loss": 0.75235945, "learning_rate": 2.6924305072594784e-06, "loss": 0.77127212, "num_input_tokens_seen": 144890975, "step": 6750, "time_per_iteration": 2.6956052780151367 }, { "auxiliary_loss_clip": 0.0111553, "auxiliary_loss_mlp": 0.01041156, "balance_loss_clip": 1.04812646, "balance_loss_mlp": 1.02654123, "epoch": 0.405892078761461, "flos": 22309468577280.0, "grad_norm": 2.479262216129207, "language_loss": 0.73942888, "learning_rate": 2.692065118669195e-06, "loss": 0.76099575, "num_input_tokens_seen": 144908170, "step": 6751, "time_per_iteration": 2.6845548152923584 }, { "auxiliary_loss_clip": 0.01086462, "auxiliary_loss_mlp": 0.01042521, "balance_loss_clip": 1.04832053, "balance_loss_mlp": 1.02627254, "epoch": 0.40595220201412896, "flos": 25484402701440.0, "grad_norm": 1.7042707146701068, "language_loss": 0.66690767, "learning_rate": 2.6916997038357326e-06, "loss": 0.68819749, "num_input_tokens_seen": 144928020, "step": 6752, "time_per_iteration": 4.372137784957886 }, { "auxiliary_loss_clip": 0.01086822, "auxiliary_loss_mlp": 0.0104486, "balance_loss_clip": 1.04698646, "balance_loss_mlp": 1.02896988, "epoch": 0.4060123252667969, "flos": 49856004103680.0, "grad_norm": 2.0675680438490374, "language_loss": 0.7062583, "learning_rate": 2.691334262772948e-06, "loss": 0.72757506, "num_input_tokens_seen": 144951240, "step": 6753, "time_per_iteration": 2.954685688018799 }, { "auxiliary_loss_clip": 0.0110904, "auxiliary_loss_mlp": 0.01037036, "balance_loss_clip": 1.04630709, "balance_loss_mlp": 1.02162218, "epoch": 0.4060724485194649, "flos": 21135476459520.0, "grad_norm": 1.6674578897026393, "language_loss": 0.72053552, "learning_rate": 2.690968795494699e-06, "loss": 0.74199629, "num_input_tokens_seen": 144969100, "step": 6754, "time_per_iteration": 5.758596420288086 }, { "auxiliary_loss_clip": 0.01097183, "auxiliary_loss_mlp": 0.01040377, "balance_loss_clip": 1.04531932, "balance_loss_mlp": 1.02634573, "epoch": 0.40613257177213286, "flos": 21758059918080.0, "grad_norm": 1.655202233640458, "language_loss": 0.8301084, "learning_rate": 2.690603302014844e-06, "loss": 0.851484, "num_input_tokens_seen": 144987065, "step": 6755, "time_per_iteration": 2.7983388900756836 }, { "auxiliary_loss_clip": 0.01086578, "auxiliary_loss_mlp": 0.01041496, "balance_loss_clip": 1.04638743, "balance_loss_mlp": 1.02645206, "epoch": 0.4061926950248008, "flos": 25555074710400.0, "grad_norm": 1.5597680276021608, "language_loss": 0.71212381, "learning_rate": 2.6902377823472426e-06, "loss": 0.73340452, "num_input_tokens_seen": 145007310, "step": 6756, "time_per_iteration": 2.8140816688537598 }, { "auxiliary_loss_clip": 0.01071802, "auxiliary_loss_mlp": 0.00773633, "balance_loss_clip": 1.04193711, "balance_loss_mlp": 1.00074661, "epoch": 0.4062528182774688, "flos": 23695799944320.0, "grad_norm": 2.0528550033278075, "language_loss": 0.79103237, "learning_rate": 2.689872236505755e-06, "loss": 0.80948675, "num_input_tokens_seen": 145026210, "step": 6757, "time_per_iteration": 4.472316741943359 }, { "auxiliary_loss_clip": 0.01112634, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.05197811, "balance_loss_mlp": 1.01777542, "epoch": 0.4063129415301368, "flos": 21726027964800.0, "grad_norm": 1.8573345394429819, "language_loss": 0.78500074, "learning_rate": 2.6895066645042437e-06, "loss": 0.80644321, "num_input_tokens_seen": 145045475, "step": 6758, "time_per_iteration": 2.732006072998047 }, { "auxiliary_loss_clip": 0.01096195, "auxiliary_loss_mlp": 0.0103788, "balance_loss_clip": 1.05092061, "balance_loss_mlp": 1.02355731, "epoch": 0.40637306478280477, "flos": 12787575206400.0, "grad_norm": 2.1068114153090254, "language_loss": 0.89142424, "learning_rate": 2.6891410663565703e-06, "loss": 0.91276503, "num_input_tokens_seen": 145062260, "step": 6759, "time_per_iteration": 2.768120288848877 }, { "auxiliary_loss_clip": 0.0109872, "auxiliary_loss_mlp": 0.01036325, "balance_loss_clip": 1.04916096, "balance_loss_mlp": 1.02241302, "epoch": 0.40643318803547274, "flos": 24024490323840.0, "grad_norm": 1.8143975866028277, "language_loss": 0.64272439, "learning_rate": 2.688775442076598e-06, "loss": 0.66407484, "num_input_tokens_seen": 145082470, "step": 6760, "time_per_iteration": 2.724278211593628 }, { "auxiliary_loss_clip": 0.01120642, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.04679084, "balance_loss_mlp": 1.02100921, "epoch": 0.4064933112881407, "flos": 25592421876480.0, "grad_norm": 1.9958038926303674, "language_loss": 0.75134486, "learning_rate": 2.688409791678193e-06, "loss": 0.77290988, "num_input_tokens_seen": 145105685, "step": 6761, "time_per_iteration": 2.81839919090271 }, { "auxiliary_loss_clip": 0.01097139, "auxiliary_loss_mlp": 0.0103932, "balance_loss_clip": 1.04636633, "balance_loss_mlp": 1.02598023, "epoch": 0.40655343454080867, "flos": 22054323294720.0, "grad_norm": 1.6270794268543942, "language_loss": 0.70070893, "learning_rate": 2.6880441151752185e-06, "loss": 0.72207355, "num_input_tokens_seen": 145125590, "step": 6762, "time_per_iteration": 2.6583070755004883 }, { "auxiliary_loss_clip": 0.0111912, "auxiliary_loss_mlp": 0.01032723, "balance_loss_clip": 1.0519619, "balance_loss_mlp": 1.01906157, "epoch": 0.40661355779347663, "flos": 26468893641600.0, "grad_norm": 1.6183981098694702, "language_loss": 0.73523986, "learning_rate": 2.6876784125815433e-06, "loss": 0.75675833, "num_input_tokens_seen": 145146810, "step": 6763, "time_per_iteration": 2.674830198287964 }, { "auxiliary_loss_clip": 0.01090413, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.04031014, "balance_loss_mlp": 1.0199244, "epoch": 0.4066736810461446, "flos": 13261129136640.0, "grad_norm": 2.065371393903723, "language_loss": 0.68689919, "learning_rate": 2.687312683911033e-06, "loss": 0.70815611, "num_input_tokens_seen": 145163130, "step": 6764, "time_per_iteration": 2.7424631118774414 }, { "auxiliary_loss_clip": 0.01104645, "auxiliary_loss_mlp": 0.01045832, "balance_loss_clip": 1.0461781, "balance_loss_mlp": 1.02930999, "epoch": 0.40673380429881256, "flos": 28803625758720.0, "grad_norm": 2.4553121190783, "language_loss": 0.91144872, "learning_rate": 2.686946929177557e-06, "loss": 0.93295348, "num_input_tokens_seen": 145181420, "step": 6765, "time_per_iteration": 2.705754280090332 }, { "auxiliary_loss_clip": 0.01121713, "auxiliary_loss_mlp": 0.01044564, "balance_loss_clip": 1.04742265, "balance_loss_mlp": 1.02876294, "epoch": 0.4067939275514805, "flos": 12495334152960.0, "grad_norm": 3.5832481358362673, "language_loss": 0.78673786, "learning_rate": 2.6865811483949855e-06, "loss": 0.80840063, "num_input_tokens_seen": 145198545, "step": 6766, "time_per_iteration": 2.6291732788085938 }, { "auxiliary_loss_clip": 0.01137462, "auxiliary_loss_mlp": 0.01042218, "balance_loss_clip": 1.0502665, "balance_loss_mlp": 1.02767396, "epoch": 0.4068540508041485, "flos": 18770508069120.0, "grad_norm": 2.203846422574217, "language_loss": 0.763403, "learning_rate": 2.6862153415771867e-06, "loss": 0.78519982, "num_input_tokens_seen": 145215835, "step": 6767, "time_per_iteration": 2.583494186401367 }, { "auxiliary_loss_clip": 0.01124058, "auxiliary_loss_mlp": 0.01038086, "balance_loss_clip": 1.0510633, "balance_loss_mlp": 1.02363229, "epoch": 0.40691417405681646, "flos": 28512821249280.0, "grad_norm": 2.5264206630573827, "language_loss": 0.77474844, "learning_rate": 2.685849508738034e-06, "loss": 0.79636991, "num_input_tokens_seen": 145236555, "step": 6768, "time_per_iteration": 2.6851589679718018 }, { "auxiliary_loss_clip": 0.01134023, "auxiliary_loss_mlp": 0.01032915, "balance_loss_clip": 1.05076826, "balance_loss_mlp": 1.01887226, "epoch": 0.4069742973094844, "flos": 20814040627200.0, "grad_norm": 1.8984102670150322, "language_loss": 0.87523651, "learning_rate": 2.6854836498913995e-06, "loss": 0.8969059, "num_input_tokens_seen": 145254595, "step": 6769, "time_per_iteration": 2.7267651557922363 }, { "auxiliary_loss_clip": 0.01105045, "auxiliary_loss_mlp": 0.0104448, "balance_loss_clip": 1.04947972, "balance_loss_mlp": 1.03028178, "epoch": 0.4070344205621524, "flos": 21470272151040.0, "grad_norm": 3.1498546640216234, "language_loss": 0.80951393, "learning_rate": 2.685117765051156e-06, "loss": 0.83100921, "num_input_tokens_seen": 145274005, "step": 6770, "time_per_iteration": 2.7272839546203613 }, { "auxiliary_loss_clip": 0.01136551, "auxiliary_loss_mlp": 0.0103334, "balance_loss_clip": 1.05021751, "balance_loss_mlp": 1.01781273, "epoch": 0.4070945438148204, "flos": 26830046937600.0, "grad_norm": 1.9062828764414554, "language_loss": 0.80237663, "learning_rate": 2.6847518542311783e-06, "loss": 0.82407558, "num_input_tokens_seen": 145294850, "step": 6771, "time_per_iteration": 2.5958163738250732 }, { "auxiliary_loss_clip": 0.01097968, "auxiliary_loss_mlp": 0.01044728, "balance_loss_clip": 1.04523098, "balance_loss_mlp": 1.02995801, "epoch": 0.4071546670674884, "flos": 26354158623360.0, "grad_norm": 1.4305431390081056, "language_loss": 0.76077241, "learning_rate": 2.6843859174453417e-06, "loss": 0.78219938, "num_input_tokens_seen": 145317050, "step": 6772, "time_per_iteration": 2.79603910446167 }, { "auxiliary_loss_clip": 0.01110195, "auxiliary_loss_mlp": 0.01043051, "balance_loss_clip": 1.04724109, "balance_loss_mlp": 1.0283401, "epoch": 0.40721479032015634, "flos": 17895401020800.0, "grad_norm": 1.8845179488175254, "language_loss": 0.81205189, "learning_rate": 2.6840199547075218e-06, "loss": 0.83358431, "num_input_tokens_seen": 145334480, "step": 6773, "time_per_iteration": 2.699221611022949 }, { "auxiliary_loss_clip": 0.01044722, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 1.03283918, "balance_loss_mlp": 1.02369332, "epoch": 0.4072749135728243, "flos": 49854570537600.0, "grad_norm": 0.9856620885651128, "language_loss": 0.64339805, "learning_rate": 2.683653966031597e-06, "loss": 0.6641022, "num_input_tokens_seen": 145388695, "step": 6774, "time_per_iteration": 3.147400140762329 }, { "auxiliary_loss_clip": 0.01089769, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.04686499, "balance_loss_mlp": 1.02041602, "epoch": 0.40733503682549227, "flos": 27563630400000.0, "grad_norm": 2.273542425652267, "language_loss": 0.72560251, "learning_rate": 2.683287951431446e-06, "loss": 0.74684727, "num_input_tokens_seen": 145408240, "step": 6775, "time_per_iteration": 2.787423849105835 }, { "auxiliary_loss_clip": 0.01105468, "auxiliary_loss_mlp": 0.00773431, "balance_loss_clip": 1.04828203, "balance_loss_mlp": 1.00090027, "epoch": 0.40739516007816023, "flos": 22126970551680.0, "grad_norm": 1.407391884450963, "language_loss": 0.77802348, "learning_rate": 2.6829219109209474e-06, "loss": 0.79681242, "num_input_tokens_seen": 145428395, "step": 6776, "time_per_iteration": 2.682548761367798 }, { "auxiliary_loss_clip": 0.01126451, "auxiliary_loss_mlp": 0.0104142, "balance_loss_clip": 1.05063748, "balance_loss_mlp": 1.02654302, "epoch": 0.4074552833308282, "flos": 23842243693440.0, "grad_norm": 2.817997105966, "language_loss": 0.79558617, "learning_rate": 2.682555844513981e-06, "loss": 0.81726491, "num_input_tokens_seen": 145448290, "step": 6777, "time_per_iteration": 2.7163336277008057 }, { "auxiliary_loss_clip": 0.01058602, "auxiliary_loss_mlp": 0.01001315, "balance_loss_clip": 1.02913916, "balance_loss_mlp": 0.99987298, "epoch": 0.40751540658349616, "flos": 58000008781440.0, "grad_norm": 0.6823534121540719, "language_loss": 0.5315339, "learning_rate": 2.6821897522244286e-06, "loss": 0.55213308, "num_input_tokens_seen": 145509785, "step": 6778, "time_per_iteration": 3.1687095165252686 }, { "auxiliary_loss_clip": 0.01135647, "auxiliary_loss_mlp": 0.00772948, "balance_loss_clip": 1.05136371, "balance_loss_mlp": 1.0008893, "epoch": 0.40757552983616413, "flos": 21214659991680.0, "grad_norm": 2.33935347330558, "language_loss": 0.82312328, "learning_rate": 2.6818236340661718e-06, "loss": 0.84220922, "num_input_tokens_seen": 145528620, "step": 6779, "time_per_iteration": 2.584343194961548 }, { "auxiliary_loss_clip": 0.0112113, "auxiliary_loss_mlp": 0.01036902, "balance_loss_clip": 1.04708278, "balance_loss_mlp": 1.02178645, "epoch": 0.4076356530888321, "flos": 26833530556800.0, "grad_norm": 1.5589663074171618, "language_loss": 0.76523471, "learning_rate": 2.6814574900530957e-06, "loss": 0.78681505, "num_input_tokens_seen": 145547775, "step": 6780, "time_per_iteration": 2.6672446727752686 }, { "auxiliary_loss_clip": 0.01117549, "auxiliary_loss_mlp": 0.01034773, "balance_loss_clip": 1.04889798, "balance_loss_mlp": 1.0212667, "epoch": 0.40769577634150006, "flos": 12203021272320.0, "grad_norm": 2.1749592638145123, "language_loss": 0.65482175, "learning_rate": 2.6810913201990827e-06, "loss": 0.67634493, "num_input_tokens_seen": 145564465, "step": 6781, "time_per_iteration": 2.612326145172119 }, { "auxiliary_loss_clip": 0.01107362, "auxiliary_loss_mlp": 0.01034378, "balance_loss_clip": 1.04472542, "balance_loss_mlp": 1.01922643, "epoch": 0.407755899594168, "flos": 33655264796160.0, "grad_norm": 1.5476514756078803, "language_loss": 0.71028459, "learning_rate": 2.6807251245180183e-06, "loss": 0.73170209, "num_input_tokens_seen": 145585965, "step": 6782, "time_per_iteration": 2.7483837604522705 }, { "auxiliary_loss_clip": 0.01124897, "auxiliary_loss_mlp": 0.01032941, "balance_loss_clip": 1.04813361, "balance_loss_mlp": 1.01833797, "epoch": 0.407816022846836, "flos": 20157342226560.0, "grad_norm": 1.9402515659282311, "language_loss": 0.82272756, "learning_rate": 2.6803589030237897e-06, "loss": 0.84430599, "num_input_tokens_seen": 145605000, "step": 6783, "time_per_iteration": 2.6157009601593018 }, { "auxiliary_loss_clip": 0.01117034, "auxiliary_loss_mlp": 0.01038578, "balance_loss_clip": 1.04744446, "balance_loss_mlp": 1.0235455, "epoch": 0.40787614609950396, "flos": 21178821196800.0, "grad_norm": 1.6713842677384587, "language_loss": 0.81044209, "learning_rate": 2.679992655730283e-06, "loss": 0.83199817, "num_input_tokens_seen": 145623740, "step": 6784, "time_per_iteration": 2.6054811477661133 }, { "auxiliary_loss_clip": 0.01107175, "auxiliary_loss_mlp": 0.01044009, "balance_loss_clip": 1.05123401, "balance_loss_mlp": 1.02725959, "epoch": 0.407936269352172, "flos": 20520650338560.0, "grad_norm": 2.1708514595694655, "language_loss": 0.65653902, "learning_rate": 2.679626382651386e-06, "loss": 0.67805088, "num_input_tokens_seen": 145643515, "step": 6785, "time_per_iteration": 2.816330671310425 }, { "auxiliary_loss_clip": 0.01115764, "auxiliary_loss_mlp": 0.01038413, "balance_loss_clip": 1.04758108, "balance_loss_mlp": 1.02347052, "epoch": 0.40799639260483994, "flos": 20118809911680.0, "grad_norm": 1.8523263348252557, "language_loss": 0.79567587, "learning_rate": 2.679260083800989e-06, "loss": 0.81721765, "num_input_tokens_seen": 145660890, "step": 6786, "time_per_iteration": 2.629009962081909 }, { "auxiliary_loss_clip": 0.01132323, "auxiliary_loss_mlp": 0.01042176, "balance_loss_clip": 1.04911721, "balance_loss_mlp": 1.02866411, "epoch": 0.4080565158575079, "flos": 20997328752000.0, "grad_norm": 1.716981063220771, "language_loss": 0.81870878, "learning_rate": 2.678893759192982e-06, "loss": 0.84045374, "num_input_tokens_seen": 145680070, "step": 6787, "time_per_iteration": 2.6304709911346436 }, { "auxiliary_loss_clip": 0.01117339, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.04705477, "balance_loss_mlp": 1.02019691, "epoch": 0.40811663911017587, "flos": 19317714837120.0, "grad_norm": 1.8408166150848957, "language_loss": 0.67954206, "learning_rate": 2.678527408841255e-06, "loss": 0.70105749, "num_input_tokens_seen": 145698010, "step": 6788, "time_per_iteration": 2.6314821243286133 }, { "auxiliary_loss_clip": 0.01102044, "auxiliary_loss_mlp": 0.01047553, "balance_loss_clip": 1.04318452, "balance_loss_mlp": 1.03095889, "epoch": 0.40817676236284384, "flos": 40625382119040.0, "grad_norm": 2.0355882471601014, "language_loss": 0.66265976, "learning_rate": 2.678161032759701e-06, "loss": 0.6841557, "num_input_tokens_seen": 145722215, "step": 6789, "time_per_iteration": 2.8329808712005615 }, { "auxiliary_loss_clip": 0.01084234, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.04236126, "balance_loss_mlp": 1.021101, "epoch": 0.4082368856155118, "flos": 20522086882560.0, "grad_norm": 1.7612282198179636, "language_loss": 0.60220939, "learning_rate": 2.6777946309622123e-06, "loss": 0.62341583, "num_input_tokens_seen": 145741090, "step": 6790, "time_per_iteration": 2.705007791519165 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.0104035, "balance_loss_clip": 1.04814339, "balance_loss_mlp": 1.02482939, "epoch": 0.40829700886817977, "flos": 11427745098240.0, "grad_norm": 2.946877052856992, "language_loss": 0.69406867, "learning_rate": 2.677428203462683e-06, "loss": 0.71562433, "num_input_tokens_seen": 145754985, "step": 6791, "time_per_iteration": 2.629746675491333 }, { "auxiliary_loss_clip": 0.01047663, "auxiliary_loss_mlp": 0.01005727, "balance_loss_clip": 1.02732182, "balance_loss_mlp": 1.00409365, "epoch": 0.40835713212084773, "flos": 67330677121920.0, "grad_norm": 0.7512190297569652, "language_loss": 0.59569383, "learning_rate": 2.6770617502750093e-06, "loss": 0.61622775, "num_input_tokens_seen": 145815260, "step": 6792, "time_per_iteration": 4.680825710296631 }, { "auxiliary_loss_clip": 0.0113903, "auxiliary_loss_mlp": 0.01043884, "balance_loss_clip": 1.05271673, "balance_loss_mlp": 1.02787423, "epoch": 0.4084172553735157, "flos": 21762010414080.0, "grad_norm": 1.9475859316217028, "language_loss": 0.80324817, "learning_rate": 2.6766952714130857e-06, "loss": 0.8250773, "num_input_tokens_seen": 145832665, "step": 6793, "time_per_iteration": 4.095003128051758 }, { "auxiliary_loss_clip": 0.01124776, "auxiliary_loss_mlp": 0.01044052, "balance_loss_clip": 1.04916334, "balance_loss_mlp": 1.02811408, "epoch": 0.40847737862618366, "flos": 27417258478080.0, "grad_norm": 1.8631596367030567, "language_loss": 0.84994531, "learning_rate": 2.6763287668908094e-06, "loss": 0.87163359, "num_input_tokens_seen": 145850240, "step": 6794, "time_per_iteration": 4.198231935501099 }, { "auxiliary_loss_clip": 0.01100105, "auxiliary_loss_mlp": 0.01040677, "balance_loss_clip": 1.0469923, "balance_loss_mlp": 1.02570391, "epoch": 0.4085375018788516, "flos": 18587255857920.0, "grad_norm": 2.862264995792616, "language_loss": 0.7989887, "learning_rate": 2.6759622367220788e-06, "loss": 0.82039654, "num_input_tokens_seen": 145869545, "step": 6795, "time_per_iteration": 2.7477807998657227 }, { "auxiliary_loss_clip": 0.01121705, "auxiliary_loss_mlp": 0.01039831, "balance_loss_clip": 1.04831719, "balance_loss_mlp": 1.02385116, "epoch": 0.4085976251315196, "flos": 15411783029760.0, "grad_norm": 2.7561951150254633, "language_loss": 0.70605052, "learning_rate": 2.675595680920792e-06, "loss": 0.7276659, "num_input_tokens_seen": 145884025, "step": 6796, "time_per_iteration": 4.261413335800171 }, { "auxiliary_loss_clip": 0.01116135, "auxiliary_loss_mlp": 0.0077634, "balance_loss_clip": 1.04606998, "balance_loss_mlp": 1.00082135, "epoch": 0.40865774838418756, "flos": 21252222639360.0, "grad_norm": 1.6356766399676357, "language_loss": 0.78218019, "learning_rate": 2.6752290995008498e-06, "loss": 0.80110496, "num_input_tokens_seen": 145903210, "step": 6797, "time_per_iteration": 2.6453776359558105 }, { "auxiliary_loss_clip": 0.01121906, "auxiliary_loss_mlp": 0.0105076, "balance_loss_clip": 1.04562223, "balance_loss_mlp": 1.03619301, "epoch": 0.4087178716368556, "flos": 13772245714560.0, "grad_norm": 2.2166943768421534, "language_loss": 0.86117017, "learning_rate": 2.6748624924761523e-06, "loss": 0.8828969, "num_input_tokens_seen": 145920985, "step": 6798, "time_per_iteration": 2.67480731010437 }, { "auxiliary_loss_clip": 0.01130307, "auxiliary_loss_mlp": 0.01042728, "balance_loss_clip": 1.04780984, "balance_loss_mlp": 1.02931094, "epoch": 0.40877799488952354, "flos": 23621752056960.0, "grad_norm": 1.473518761352831, "language_loss": 0.84252232, "learning_rate": 2.674495859860601e-06, "loss": 0.86425269, "num_input_tokens_seen": 145940350, "step": 6799, "time_per_iteration": 2.6273906230926514 }, { "auxiliary_loss_clip": 0.01093085, "auxiliary_loss_mlp": 0.01052249, "balance_loss_clip": 1.04557848, "balance_loss_mlp": 1.03427255, "epoch": 0.4088381181421915, "flos": 20918791664640.0, "grad_norm": 2.1256660898165913, "language_loss": 0.83567548, "learning_rate": 2.6741292016681e-06, "loss": 0.85712886, "num_input_tokens_seen": 145957460, "step": 6800, "time_per_iteration": 2.7064268589019775 }, { "auxiliary_loss_clip": 0.01119062, "auxiliary_loss_mlp": 0.01043239, "balance_loss_clip": 1.04534221, "balance_loss_mlp": 1.02778912, "epoch": 0.4088982413948595, "flos": 13297578462720.0, "grad_norm": 2.1612690472856353, "language_loss": 0.74336559, "learning_rate": 2.6737625179125514e-06, "loss": 0.76498854, "num_input_tokens_seen": 145975285, "step": 6801, "time_per_iteration": 2.631030321121216 }, { "auxiliary_loss_clip": 0.01122834, "auxiliary_loss_mlp": 0.0104231, "balance_loss_clip": 1.04511952, "balance_loss_mlp": 1.02699137, "epoch": 0.40895836464752744, "flos": 15267673664640.0, "grad_norm": 2.1715684147319907, "language_loss": 0.80430126, "learning_rate": 2.673395808607861e-06, "loss": 0.82595277, "num_input_tokens_seen": 145989150, "step": 6802, "time_per_iteration": 2.5802509784698486 }, { "auxiliary_loss_clip": 0.0112096, "auxiliary_loss_mlp": 0.01044934, "balance_loss_clip": 1.04893684, "balance_loss_mlp": 1.02843595, "epoch": 0.4090184879001954, "flos": 14501411804160.0, "grad_norm": 2.2436343912353283, "language_loss": 0.75734484, "learning_rate": 2.673029073767934e-06, "loss": 0.77900374, "num_input_tokens_seen": 146006980, "step": 6803, "time_per_iteration": 2.609602689743042 }, { "auxiliary_loss_clip": 0.0106898, "auxiliary_loss_mlp": 0.00773774, "balance_loss_clip": 1.04085743, "balance_loss_mlp": 1.00086641, "epoch": 0.40907861115286337, "flos": 13881593692800.0, "grad_norm": 1.8843395194203503, "language_loss": 0.78824151, "learning_rate": 2.6726623134066764e-06, "loss": 0.806669, "num_input_tokens_seen": 146025125, "step": 6804, "time_per_iteration": 2.7654101848602295 }, { "auxiliary_loss_clip": 0.01137979, "auxiliary_loss_mlp": 0.01045985, "balance_loss_clip": 1.04858065, "balance_loss_mlp": 1.03147769, "epoch": 0.40913873440553133, "flos": 28037615293440.0, "grad_norm": 2.2298994676504225, "language_loss": 0.75672269, "learning_rate": 2.672295527537998e-06, "loss": 0.77856231, "num_input_tokens_seen": 146044990, "step": 6805, "time_per_iteration": 2.680368185043335 }, { "auxiliary_loss_clip": 0.01089569, "auxiliary_loss_mlp": 0.01047964, "balance_loss_clip": 1.04342198, "balance_loss_mlp": 1.03309822, "epoch": 0.4091988576581993, "flos": 21618188357760.0, "grad_norm": 1.8743994628433338, "language_loss": 0.79440027, "learning_rate": 2.671928716175804e-06, "loss": 0.81577563, "num_input_tokens_seen": 146066045, "step": 6806, "time_per_iteration": 2.8212954998016357 }, { "auxiliary_loss_clip": 0.01126847, "auxiliary_loss_mlp": 0.01038318, "balance_loss_clip": 1.04977083, "balance_loss_mlp": 1.02272499, "epoch": 0.40925898091086726, "flos": 25224085860480.0, "grad_norm": 1.915245819215902, "language_loss": 0.71779263, "learning_rate": 2.671561879334007e-06, "loss": 0.73944426, "num_input_tokens_seen": 146086280, "step": 6807, "time_per_iteration": 2.7223496437072754 }, { "auxiliary_loss_clip": 0.01034248, "auxiliary_loss_mlp": 0.01005874, "balance_loss_clip": 1.0356338, "balance_loss_mlp": 1.00364494, "epoch": 0.40931910416353523, "flos": 68930568800640.0, "grad_norm": 0.8232207365722912, "language_loss": 0.58807027, "learning_rate": 2.6711950170265155e-06, "loss": 0.60847151, "num_input_tokens_seen": 146148840, "step": 6808, "time_per_iteration": 3.2951159477233887 }, { "auxiliary_loss_clip": 0.01113663, "auxiliary_loss_mlp": 0.01048693, "balance_loss_clip": 1.04732299, "balance_loss_mlp": 1.03419733, "epoch": 0.4093792274162032, "flos": 20189553747840.0, "grad_norm": 1.705790136999867, "language_loss": 0.54954052, "learning_rate": 2.670828129267242e-06, "loss": 0.57116413, "num_input_tokens_seen": 146166195, "step": 6809, "time_per_iteration": 2.663210868835449 }, { "auxiliary_loss_clip": 0.01108384, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.0446471, "balance_loss_mlp": 1.01682281, "epoch": 0.40943935066887116, "flos": 25228754628480.0, "grad_norm": 1.7788203343455933, "language_loss": 0.83185786, "learning_rate": 2.6704612160700983e-06, "loss": 0.85325718, "num_input_tokens_seen": 146185045, "step": 6810, "time_per_iteration": 2.683969020843506 }, { "auxiliary_loss_clip": 0.01105454, "auxiliary_loss_mlp": 0.01053382, "balance_loss_clip": 1.0451473, "balance_loss_mlp": 1.03608489, "epoch": 0.4094994739215392, "flos": 23255319461760.0, "grad_norm": 2.954085357706404, "language_loss": 0.77419919, "learning_rate": 2.670094277448999e-06, "loss": 0.79578757, "num_input_tokens_seen": 146204655, "step": 6811, "time_per_iteration": 2.6727347373962402 }, { "auxiliary_loss_clip": 0.01135893, "auxiliary_loss_mlp": 0.01036603, "balance_loss_clip": 1.04917455, "balance_loss_mlp": 1.02042687, "epoch": 0.40955959717420715, "flos": 17382165540480.0, "grad_norm": 1.6058461501005727, "language_loss": 0.70272696, "learning_rate": 2.669727313417857e-06, "loss": 0.72445196, "num_input_tokens_seen": 146222000, "step": 6812, "time_per_iteration": 2.6267693042755127 }, { "auxiliary_loss_clip": 0.01132783, "auxiliary_loss_mlp": 0.01048088, "balance_loss_clip": 1.04780114, "balance_loss_mlp": 1.03210163, "epoch": 0.4096197204268751, "flos": 25082418620160.0, "grad_norm": 1.9378136524882912, "language_loss": 0.66298044, "learning_rate": 2.6693603239905872e-06, "loss": 0.68478918, "num_input_tokens_seen": 146242630, "step": 6813, "time_per_iteration": 2.6447062492370605 }, { "auxiliary_loss_clip": 0.01117463, "auxiliary_loss_mlp": 0.00774455, "balance_loss_clip": 1.04784274, "balance_loss_mlp": 1.0009681, "epoch": 0.4096798436795431, "flos": 30586769648640.0, "grad_norm": 1.8922051995482987, "language_loss": 0.73949504, "learning_rate": 2.6689933091811087e-06, "loss": 0.75841421, "num_input_tokens_seen": 146263070, "step": 6814, "time_per_iteration": 2.7325870990753174 }, { "auxiliary_loss_clip": 0.0108334, "auxiliary_loss_mlp": 0.01038435, "balance_loss_clip": 1.04231858, "balance_loss_mlp": 1.02281821, "epoch": 0.40973996693221104, "flos": 24133622820480.0, "grad_norm": 2.0095509453801728, "language_loss": 0.65957761, "learning_rate": 2.6686262690033357e-06, "loss": 0.68079543, "num_input_tokens_seen": 146282890, "step": 6815, "time_per_iteration": 2.780668258666992 }, { "auxiliary_loss_clip": 0.01122383, "auxiliary_loss_mlp": 0.01045791, "balance_loss_clip": 1.05130887, "balance_loss_mlp": 1.03100336, "epoch": 0.409800090184879, "flos": 23988974751360.0, "grad_norm": 1.5903260932613887, "language_loss": 0.76872814, "learning_rate": 2.668259203471188e-06, "loss": 0.79040992, "num_input_tokens_seen": 146301755, "step": 6816, "time_per_iteration": 2.6901748180389404 }, { "auxiliary_loss_clip": 0.01118517, "auxiliary_loss_mlp": 0.0104269, "balance_loss_clip": 1.05008173, "balance_loss_mlp": 1.02716875, "epoch": 0.40986021343754697, "flos": 16143678552960.0, "grad_norm": 2.2788575244766966, "language_loss": 0.81621635, "learning_rate": 2.6678921125985843e-06, "loss": 0.8378284, "num_input_tokens_seen": 146316835, "step": 6817, "time_per_iteration": 2.6194167137145996 }, { "auxiliary_loss_clip": 0.01114033, "auxiliary_loss_mlp": 0.01046853, "balance_loss_clip": 1.04633307, "balance_loss_mlp": 1.02987719, "epoch": 0.40992033669021494, "flos": 24790824011520.0, "grad_norm": 2.698849637369061, "language_loss": 0.8016938, "learning_rate": 2.667524996399444e-06, "loss": 0.82330263, "num_input_tokens_seen": 146336650, "step": 6818, "time_per_iteration": 2.8449223041534424 }, { "auxiliary_loss_clip": 0.0111157, "auxiliary_loss_mlp": 0.01039221, "balance_loss_clip": 1.05212271, "balance_loss_mlp": 1.02459419, "epoch": 0.4099804599428829, "flos": 29641888431360.0, "grad_norm": 1.781955605236185, "language_loss": 0.66531783, "learning_rate": 2.66715785488769e-06, "loss": 0.68682575, "num_input_tokens_seen": 146357640, "step": 6819, "time_per_iteration": 2.8016393184661865 }, { "auxiliary_loss_clip": 0.01118061, "auxiliary_loss_mlp": 0.01052321, "balance_loss_clip": 1.05068922, "balance_loss_mlp": 1.03429687, "epoch": 0.41004058319555087, "flos": 24826590979200.0, "grad_norm": 1.7017427969889725, "language_loss": 0.85438228, "learning_rate": 2.6667906880772428e-06, "loss": 0.87608612, "num_input_tokens_seen": 146379325, "step": 6820, "time_per_iteration": 2.7182726860046387 }, { "auxiliary_loss_clip": 0.01127803, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.05361152, "balance_loss_mlp": 1.02019835, "epoch": 0.41010070644821883, "flos": 25737464995200.0, "grad_norm": 1.8388824613750698, "language_loss": 0.71235943, "learning_rate": 2.6664234959820256e-06, "loss": 0.73399413, "num_input_tokens_seen": 146398635, "step": 6821, "time_per_iteration": 2.6716413497924805 }, { "auxiliary_loss_clip": 0.01123531, "auxiliary_loss_mlp": 0.01036959, "balance_loss_clip": 1.05253363, "balance_loss_mlp": 1.02228427, "epoch": 0.4101608297008868, "flos": 22346061557760.0, "grad_norm": 1.9657765704612085, "language_loss": 0.74500406, "learning_rate": 2.6660562786159634e-06, "loss": 0.76660895, "num_input_tokens_seen": 146417585, "step": 6822, "time_per_iteration": 2.652270793914795 }, { "auxiliary_loss_clip": 0.01118135, "auxiliary_loss_mlp": 0.01038075, "balance_loss_clip": 1.05201709, "balance_loss_mlp": 1.02313757, "epoch": 0.41022095295355476, "flos": 21945083057280.0, "grad_norm": 2.1947910409652116, "language_loss": 0.75539672, "learning_rate": 2.6656890359929796e-06, "loss": 0.77695882, "num_input_tokens_seen": 146437035, "step": 6823, "time_per_iteration": 2.767306327819824 }, { "auxiliary_loss_clip": 0.01095631, "auxiliary_loss_mlp": 0.01044283, "balance_loss_clip": 1.05394316, "balance_loss_mlp": 1.02697372, "epoch": 0.4102810762062228, "flos": 27450511493760.0, "grad_norm": 2.0691169068872086, "language_loss": 0.73186851, "learning_rate": 2.665321768127001e-06, "loss": 0.75326765, "num_input_tokens_seen": 146457370, "step": 6824, "time_per_iteration": 2.793712615966797 }, { "auxiliary_loss_clip": 0.01110429, "auxiliary_loss_mlp": 0.0103962, "balance_loss_clip": 1.05025351, "balance_loss_mlp": 1.02316904, "epoch": 0.41034119945889075, "flos": 24499265316480.0, "grad_norm": 2.036284375586757, "language_loss": 0.72426587, "learning_rate": 2.6649544750319548e-06, "loss": 0.7457664, "num_input_tokens_seen": 146478105, "step": 6825, "time_per_iteration": 2.764977216720581 }, { "auxiliary_loss_clip": 0.01097265, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.04605746, "balance_loss_mlp": 1.03027654, "epoch": 0.4104013227115587, "flos": 24352641999360.0, "grad_norm": 1.8249289811640228, "language_loss": 0.85226274, "learning_rate": 2.664587156721768e-06, "loss": 0.87369001, "num_input_tokens_seen": 146497835, "step": 6826, "time_per_iteration": 2.7680137157440186 }, { "auxiliary_loss_clip": 0.01115829, "auxiliary_loss_mlp": 0.00775051, "balance_loss_clip": 1.05372024, "balance_loss_mlp": 1.00099707, "epoch": 0.4104614459642267, "flos": 23729340268800.0, "grad_norm": 1.8772466232345664, "language_loss": 0.66074443, "learning_rate": 2.6642198132103696e-06, "loss": 0.67965323, "num_input_tokens_seen": 146517735, "step": 6827, "time_per_iteration": 2.791212797164917 }, { "auxiliary_loss_clip": 0.01113343, "auxiliary_loss_mlp": 0.01033945, "balance_loss_clip": 1.04942787, "balance_loss_mlp": 1.01910365, "epoch": 0.41052156921689464, "flos": 22127976132480.0, "grad_norm": 2.0535618692070914, "language_loss": 0.72474444, "learning_rate": 2.663852444511689e-06, "loss": 0.74621731, "num_input_tokens_seen": 146537640, "step": 6828, "time_per_iteration": 2.6675491333007812 }, { "auxiliary_loss_clip": 0.01111113, "auxiliary_loss_mlp": 0.01048054, "balance_loss_clip": 1.04920423, "balance_loss_mlp": 1.03068542, "epoch": 0.4105816924695626, "flos": 20084371747200.0, "grad_norm": 2.67524304617312, "language_loss": 0.83464897, "learning_rate": 2.6634850506396574e-06, "loss": 0.85624069, "num_input_tokens_seen": 146554695, "step": 6829, "time_per_iteration": 2.762298107147217 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01039003, "balance_loss_clip": 1.05062759, "balance_loss_mlp": 1.02405417, "epoch": 0.4106418157222306, "flos": 18076785724800.0, "grad_norm": 1.5363498208464375, "language_loss": 0.89878875, "learning_rate": 2.663117631608206e-06, "loss": 0.92042506, "num_input_tokens_seen": 146573740, "step": 6830, "time_per_iteration": 2.7726032733917236 }, { "auxiliary_loss_clip": 0.01098336, "auxiliary_loss_mlp": 0.01034169, "balance_loss_clip": 1.04938424, "balance_loss_mlp": 1.01833797, "epoch": 0.41070193897489854, "flos": 21647850013440.0, "grad_norm": 1.7853690904757185, "language_loss": 0.65810287, "learning_rate": 2.662750187431268e-06, "loss": 0.67942798, "num_input_tokens_seen": 146592885, "step": 6831, "time_per_iteration": 4.213804244995117 }, { "auxiliary_loss_clip": 0.01137663, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.05280805, "balance_loss_mlp": 1.02361393, "epoch": 0.4107620622275665, "flos": 26648195356800.0, "grad_norm": 1.7075421510763598, "language_loss": 0.69710165, "learning_rate": 2.662382718122776e-06, "loss": 0.71886885, "num_input_tokens_seen": 146611995, "step": 6832, "time_per_iteration": 4.146309852600098 }, { "auxiliary_loss_clip": 0.01089843, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.05080116, "balance_loss_mlp": 1.02703142, "epoch": 0.41082218548023447, "flos": 18734310138240.0, "grad_norm": 2.3374205466797537, "language_loss": 0.73910743, "learning_rate": 2.662015223696666e-06, "loss": 0.760427, "num_input_tokens_seen": 146628045, "step": 6833, "time_per_iteration": 4.23652195930481 }, { "auxiliary_loss_clip": 0.01083988, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.04393578, "balance_loss_mlp": 1.02754784, "epoch": 0.41088230873290243, "flos": 22893771116160.0, "grad_norm": 1.56012293193972, "language_loss": 0.7299009, "learning_rate": 2.6616477041668713e-06, "loss": 0.75119424, "num_input_tokens_seen": 146648355, "step": 6834, "time_per_iteration": 2.72806453704834 }, { "auxiliary_loss_clip": 0.0113018, "auxiliary_loss_mlp": 0.01049062, "balance_loss_clip": 1.05203891, "balance_loss_mlp": 1.03320765, "epoch": 0.4109424319855704, "flos": 24276978000000.0, "grad_norm": 1.7978087117059114, "language_loss": 0.71254998, "learning_rate": 2.661280159547329e-06, "loss": 0.73434246, "num_input_tokens_seen": 146668370, "step": 6835, "time_per_iteration": 4.406278133392334 }, { "auxiliary_loss_clip": 0.01130021, "auxiliary_loss_mlp": 0.01043294, "balance_loss_clip": 1.05188155, "balance_loss_mlp": 1.02630687, "epoch": 0.41100255523823837, "flos": 12969139478400.0, "grad_norm": 1.9060780079348063, "language_loss": 0.87366456, "learning_rate": 2.660912589851978e-06, "loss": 0.89539772, "num_input_tokens_seen": 146686665, "step": 6836, "time_per_iteration": 2.6482133865356445 }, { "auxiliary_loss_clip": 0.0112613, "auxiliary_loss_mlp": 0.01040074, "balance_loss_clip": 1.05334806, "balance_loss_mlp": 1.02461267, "epoch": 0.4110626784909064, "flos": 23145648261120.0, "grad_norm": 6.565804686602276, "language_loss": 0.69167227, "learning_rate": 2.6605449950947547e-06, "loss": 0.71333432, "num_input_tokens_seen": 146706570, "step": 6837, "time_per_iteration": 2.682241916656494 }, { "auxiliary_loss_clip": 0.0114114, "auxiliary_loss_mlp": 0.01041377, "balance_loss_clip": 1.0544312, "balance_loss_mlp": 1.02540302, "epoch": 0.41112280174357435, "flos": 22747399194240.0, "grad_norm": 1.8671169017141842, "language_loss": 0.75408459, "learning_rate": 2.660177375289599e-06, "loss": 0.77590978, "num_input_tokens_seen": 146723425, "step": 6838, "time_per_iteration": 2.625422239303589 }, { "auxiliary_loss_clip": 0.0110141, "auxiliary_loss_mlp": 0.01042257, "balance_loss_clip": 1.0521034, "balance_loss_mlp": 1.02617598, "epoch": 0.4111829249962423, "flos": 21102403011840.0, "grad_norm": 2.061873935528421, "language_loss": 0.82113552, "learning_rate": 2.659809730450451e-06, "loss": 0.84257221, "num_input_tokens_seen": 146741640, "step": 6839, "time_per_iteration": 2.7850279808044434 }, { "auxiliary_loss_clip": 0.01135439, "auxiliary_loss_mlp": 0.0103927, "balance_loss_clip": 1.05122948, "balance_loss_mlp": 1.02421379, "epoch": 0.4112430482489103, "flos": 21505787723520.0, "grad_norm": 5.701831641175022, "language_loss": 0.80077577, "learning_rate": 2.6594420605912523e-06, "loss": 0.82252288, "num_input_tokens_seen": 146759195, "step": 6840, "time_per_iteration": 2.656494140625 }, { "auxiliary_loss_clip": 0.01120054, "auxiliary_loss_mlp": 0.01035027, "balance_loss_clip": 1.0487783, "balance_loss_mlp": 1.02117467, "epoch": 0.41130317150157825, "flos": 19570022945280.0, "grad_norm": 1.862146821875906, "language_loss": 0.6778084, "learning_rate": 2.6590743657259442e-06, "loss": 0.69935924, "num_input_tokens_seen": 146774990, "step": 6841, "time_per_iteration": 2.6612377166748047 }, { "auxiliary_loss_clip": 0.01055489, "auxiliary_loss_mlp": 0.01004436, "balance_loss_clip": 1.03532803, "balance_loss_mlp": 1.00270772, "epoch": 0.4113632947542462, "flos": 62383157706240.0, "grad_norm": 0.8163554776107808, "language_loss": 0.59717554, "learning_rate": 2.65870664586847e-06, "loss": 0.61777478, "num_input_tokens_seen": 146839610, "step": 6842, "time_per_iteration": 3.2157862186431885 }, { "auxiliary_loss_clip": 0.01120166, "auxiliary_loss_mlp": 0.01038325, "balance_loss_clip": 1.05330658, "balance_loss_mlp": 1.02400184, "epoch": 0.4114234180069142, "flos": 13918617636480.0, "grad_norm": 2.3538351775584156, "language_loss": 0.70293331, "learning_rate": 2.6583389010327742e-06, "loss": 0.72451818, "num_input_tokens_seen": 146857360, "step": 6843, "time_per_iteration": 2.6172597408294678 }, { "auxiliary_loss_clip": 0.01014929, "auxiliary_loss_mlp": 0.01002572, "balance_loss_clip": 1.01983762, "balance_loss_mlp": 1.00047398, "epoch": 0.41148354125958214, "flos": 64928505219840.0, "grad_norm": 0.7263883634768764, "language_loss": 0.53593683, "learning_rate": 2.6579711312328013e-06, "loss": 0.55611187, "num_input_tokens_seen": 146917055, "step": 6844, "time_per_iteration": 3.21069598197937 }, { "auxiliary_loss_clip": 0.01124589, "auxiliary_loss_mlp": 0.01041114, "balance_loss_clip": 1.05226612, "balance_loss_mlp": 1.02679706, "epoch": 0.4115436645122501, "flos": 18728779443840.0, "grad_norm": 1.870188515464334, "language_loss": 0.66065252, "learning_rate": 2.6576033364824967e-06, "loss": 0.68230951, "num_input_tokens_seen": 146935215, "step": 6845, "time_per_iteration": 2.6289329528808594 }, { "auxiliary_loss_clip": 0.01134084, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.05250192, "balance_loss_mlp": 1.02355433, "epoch": 0.41160378776491807, "flos": 16252918790400.0, "grad_norm": 2.0932374873894655, "language_loss": 0.70088863, "learning_rate": 2.657235516795808e-06, "loss": 0.72261429, "num_input_tokens_seen": 146951970, "step": 6846, "time_per_iteration": 2.578780174255371 }, { "auxiliary_loss_clip": 0.01111001, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.04926157, "balance_loss_mlp": 1.0254035, "epoch": 0.41166391101758604, "flos": 27970031854080.0, "grad_norm": 1.8006459441278344, "language_loss": 0.65271175, "learning_rate": 2.6568676721866826e-06, "loss": 0.67423248, "num_input_tokens_seen": 146975615, "step": 6847, "time_per_iteration": 2.7504281997680664 }, { "auxiliary_loss_clip": 0.01111807, "auxiliary_loss_mlp": 0.01046607, "balance_loss_clip": 1.04943776, "balance_loss_mlp": 1.03167558, "epoch": 0.411724034270254, "flos": 34131296764800.0, "grad_norm": 1.371398558221349, "language_loss": 0.70655453, "learning_rate": 2.656499802669069e-06, "loss": 0.72813869, "num_input_tokens_seen": 146998855, "step": 6848, "time_per_iteration": 2.7842190265655518 }, { "auxiliary_loss_clip": 0.01032604, "auxiliary_loss_mlp": 0.00753743, "balance_loss_clip": 1.02356267, "balance_loss_mlp": 1.00076866, "epoch": 0.41178415752292197, "flos": 67923670752000.0, "grad_norm": 0.9037714041830832, "language_loss": 0.5627954, "learning_rate": 2.6561319082569174e-06, "loss": 0.58065879, "num_input_tokens_seen": 147062710, "step": 6849, "time_per_iteration": 3.3100218772888184 }, { "auxiliary_loss_clip": 0.01115279, "auxiliary_loss_mlp": 0.0104026, "balance_loss_clip": 1.05035055, "balance_loss_mlp": 1.0254786, "epoch": 0.41184428077558993, "flos": 34313938444800.0, "grad_norm": 2.6235370790375767, "language_loss": 0.76318872, "learning_rate": 2.6557639889641783e-06, "loss": 0.78474414, "num_input_tokens_seen": 147086075, "step": 6850, "time_per_iteration": 2.879258632659912 }, { "auxiliary_loss_clip": 0.010812, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.0412885, "balance_loss_mlp": 1.02356339, "epoch": 0.41190440402825795, "flos": 35444118948480.0, "grad_norm": 1.5473555335002718, "language_loss": 0.68093288, "learning_rate": 2.6553960448048025e-06, "loss": 0.70212466, "num_input_tokens_seen": 147107590, "step": 6851, "time_per_iteration": 2.931530237197876 }, { "auxiliary_loss_clip": 0.01101431, "auxiliary_loss_mlp": 0.01049233, "balance_loss_clip": 1.0504117, "balance_loss_mlp": 1.03207839, "epoch": 0.4119645272809259, "flos": 20849879422080.0, "grad_norm": 2.1361960755807634, "language_loss": 0.79698718, "learning_rate": 2.655028075792743e-06, "loss": 0.81849384, "num_input_tokens_seen": 147123715, "step": 6852, "time_per_iteration": 2.6807408332824707 }, { "auxiliary_loss_clip": 0.01141214, "auxiliary_loss_mlp": 0.01043074, "balance_loss_clip": 1.05327845, "balance_loss_mlp": 1.02688491, "epoch": 0.4120246505335939, "flos": 27562050201600.0, "grad_norm": 1.901908158264802, "language_loss": 0.77750659, "learning_rate": 2.6546600819419537e-06, "loss": 0.79934943, "num_input_tokens_seen": 147144290, "step": 6853, "time_per_iteration": 2.699430227279663 }, { "auxiliary_loss_clip": 0.01126437, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.04821801, "balance_loss_mlp": 1.0298574, "epoch": 0.41208477378626185, "flos": 37815444046080.0, "grad_norm": 1.8090743517086876, "language_loss": 0.65556479, "learning_rate": 2.6542920632663883e-06, "loss": 0.6772902, "num_input_tokens_seen": 147166340, "step": 6854, "time_per_iteration": 2.8111729621887207 }, { "auxiliary_loss_clip": 0.01104516, "auxiliary_loss_mlp": 0.01052436, "balance_loss_clip": 1.04534888, "balance_loss_mlp": 1.03615212, "epoch": 0.4121448970389298, "flos": 23440762402560.0, "grad_norm": 2.1224683572406917, "language_loss": 0.8348515, "learning_rate": 2.6539240197800023e-06, "loss": 0.85642099, "num_input_tokens_seen": 147184025, "step": 6855, "time_per_iteration": 2.6698896884918213 }, { "auxiliary_loss_clip": 0.01117307, "auxiliary_loss_mlp": 0.01044081, "balance_loss_clip": 1.04969764, "balance_loss_mlp": 1.02976418, "epoch": 0.4122050202915978, "flos": 21325300859520.0, "grad_norm": 2.1069107949142554, "language_loss": 0.7929827, "learning_rate": 2.6535559514967517e-06, "loss": 0.81459653, "num_input_tokens_seen": 147202730, "step": 6856, "time_per_iteration": 2.6754775047302246 }, { "auxiliary_loss_clip": 0.01098846, "auxiliary_loss_mlp": 0.01042601, "balance_loss_clip": 1.04761338, "balance_loss_mlp": 1.02777684, "epoch": 0.41226514354426574, "flos": 17306286059520.0, "grad_norm": 2.5035417030553018, "language_loss": 0.80352724, "learning_rate": 2.6531878584305935e-06, "loss": 0.82494175, "num_input_tokens_seen": 147215315, "step": 6857, "time_per_iteration": 2.7415785789489746 }, { "auxiliary_loss_clip": 0.01123756, "auxiliary_loss_mlp": 0.0077359, "balance_loss_clip": 1.04799688, "balance_loss_mlp": 1.00088441, "epoch": 0.4123252667969337, "flos": 17638855107840.0, "grad_norm": 2.1785137319374575, "language_loss": 0.70367694, "learning_rate": 2.6528197405954873e-06, "loss": 0.72265041, "num_input_tokens_seen": 147233330, "step": 6858, "time_per_iteration": 2.6482796669006348 }, { "auxiliary_loss_clip": 0.01123125, "auxiliary_loss_mlp": 0.01046787, "balance_loss_clip": 1.04916668, "balance_loss_mlp": 1.03116488, "epoch": 0.4123853900496017, "flos": 46424811375360.0, "grad_norm": 2.660424997773602, "language_loss": 0.59025121, "learning_rate": 2.652451598005391e-06, "loss": 0.61195034, "num_input_tokens_seen": 147257780, "step": 6859, "time_per_iteration": 2.8688454627990723 }, { "auxiliary_loss_clip": 0.01132817, "auxiliary_loss_mlp": 0.0104458, "balance_loss_clip": 1.04658365, "balance_loss_mlp": 1.0293684, "epoch": 0.41244551330226964, "flos": 17675160779520.0, "grad_norm": 2.4672414929748863, "language_loss": 0.73583943, "learning_rate": 2.652083430674264e-06, "loss": 0.75761342, "num_input_tokens_seen": 147276055, "step": 6860, "time_per_iteration": 2.552107572555542 }, { "auxiliary_loss_clip": 0.01058973, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.04514742, "balance_loss_mlp": 1.024279, "epoch": 0.4125056365549376, "flos": 18693730748160.0, "grad_norm": 1.7024014286117355, "language_loss": 0.7499401, "learning_rate": 2.651715238616068e-06, "loss": 0.7709192, "num_input_tokens_seen": 147293200, "step": 6861, "time_per_iteration": 2.8850560188293457 }, { "auxiliary_loss_clip": 0.01110545, "auxiliary_loss_mlp": 0.01044439, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.03024721, "epoch": 0.41256575980760557, "flos": 17895293280000.0, "grad_norm": 2.2415523494511467, "language_loss": 0.79298902, "learning_rate": 2.651347021844765e-06, "loss": 0.8145389, "num_input_tokens_seen": 147310640, "step": 6862, "time_per_iteration": 2.900341510772705 }, { "auxiliary_loss_clip": 0.01101386, "auxiliary_loss_mlp": 0.01041536, "balance_loss_clip": 1.04071999, "balance_loss_mlp": 1.02640843, "epoch": 0.41262588306027354, "flos": 21981316901760.0, "grad_norm": 1.8032442507418176, "language_loss": 0.7571404, "learning_rate": 2.650978780374318e-06, "loss": 0.77856958, "num_input_tokens_seen": 147329435, "step": 6863, "time_per_iteration": 2.653726100921631 }, { "auxiliary_loss_clip": 0.01042253, "auxiliary_loss_mlp": 0.0101594, "balance_loss_clip": 1.02186918, "balance_loss_mlp": 1.01400852, "epoch": 0.41268600631294156, "flos": 53350006740480.0, "grad_norm": 0.7071869047358454, "language_loss": 0.52727556, "learning_rate": 2.650610514218691e-06, "loss": 0.54785752, "num_input_tokens_seen": 147385805, "step": 6864, "time_per_iteration": 3.1097042560577393 }, { "auxiliary_loss_clip": 0.01138053, "auxiliary_loss_mlp": 0.01037208, "balance_loss_clip": 1.04946339, "balance_loss_mlp": 1.02124572, "epoch": 0.4127461295656095, "flos": 24385356311040.0, "grad_norm": 2.542549123445174, "language_loss": 0.72281235, "learning_rate": 2.6502422233918468e-06, "loss": 0.74456495, "num_input_tokens_seen": 147405160, "step": 6865, "time_per_iteration": 2.6489152908325195 }, { "auxiliary_loss_clip": 0.01052076, "auxiliary_loss_mlp": 0.01005202, "balance_loss_clip": 1.02275848, "balance_loss_mlp": 1.0035094, "epoch": 0.4128062528182775, "flos": 71705242696320.0, "grad_norm": 0.9209058739863084, "language_loss": 0.66585267, "learning_rate": 2.649873907907753e-06, "loss": 0.68642545, "num_input_tokens_seen": 147460245, "step": 6866, "time_per_iteration": 3.062208890914917 }, { "auxiliary_loss_clip": 0.01129627, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.04632759, "balance_loss_mlp": 1.02420402, "epoch": 0.41286637607094545, "flos": 17849111368320.0, "grad_norm": 2.3224691577841905, "language_loss": 0.8131212, "learning_rate": 2.649505567780375e-06, "loss": 0.83480746, "num_input_tokens_seen": 147476200, "step": 6867, "time_per_iteration": 2.6058406829833984 }, { "auxiliary_loss_clip": 0.01114316, "auxiliary_loss_mlp": 0.01036267, "balance_loss_clip": 1.04773378, "balance_loss_mlp": 1.02069843, "epoch": 0.4129264993236134, "flos": 25549544016000.0, "grad_norm": 2.2632029728217913, "language_loss": 0.78249037, "learning_rate": 2.6491372030236815e-06, "loss": 0.80399621, "num_input_tokens_seen": 147494315, "step": 6868, "time_per_iteration": 2.7882273197174072 }, { "auxiliary_loss_clip": 0.0104195, "auxiliary_loss_mlp": 0.01002347, "balance_loss_clip": 1.02322721, "balance_loss_mlp": 1.00078535, "epoch": 0.4129866225762814, "flos": 65414446364160.0, "grad_norm": 0.8559261941349585, "language_loss": 0.57746547, "learning_rate": 2.64876881365164e-06, "loss": 0.59790844, "num_input_tokens_seen": 147543665, "step": 6869, "time_per_iteration": 2.9020984172821045 }, { "auxiliary_loss_clip": 0.01116756, "auxiliary_loss_mlp": 0.01037209, "balance_loss_clip": 1.04666448, "balance_loss_mlp": 1.02235568, "epoch": 0.41304674582894935, "flos": 28876991287680.0, "grad_norm": 2.064989454661501, "language_loss": 0.74957705, "learning_rate": 2.64840039967822e-06, "loss": 0.77111673, "num_input_tokens_seen": 147564870, "step": 6870, "time_per_iteration": 4.271910667419434 }, { "auxiliary_loss_clip": 0.01102765, "auxiliary_loss_mlp": 0.01045795, "balance_loss_clip": 1.04849434, "balance_loss_mlp": 1.0301609, "epoch": 0.4131068690816173, "flos": 22891975436160.0, "grad_norm": 1.7132239618858751, "language_loss": 0.83188486, "learning_rate": 2.6480319611173912e-06, "loss": 0.85337055, "num_input_tokens_seen": 147584840, "step": 6871, "time_per_iteration": 2.7382373809814453 }, { "auxiliary_loss_clip": 0.01102249, "auxiliary_loss_mlp": 0.01042486, "balance_loss_clip": 1.04694879, "balance_loss_mlp": 1.02648854, "epoch": 0.4131669923342853, "flos": 26065185707520.0, "grad_norm": 1.8588331523997874, "language_loss": 0.68419731, "learning_rate": 2.6476634979831263e-06, "loss": 0.70564461, "num_input_tokens_seen": 147604635, "step": 6872, "time_per_iteration": 2.731513738632202 }, { "auxiliary_loss_clip": 0.01116452, "auxiliary_loss_mlp": 0.0103393, "balance_loss_clip": 1.0480907, "balance_loss_mlp": 1.01936865, "epoch": 0.41322711558695324, "flos": 19244564789760.0, "grad_norm": 2.0600406966329468, "language_loss": 0.75857317, "learning_rate": 2.6472950102893964e-06, "loss": 0.78007692, "num_input_tokens_seen": 147620700, "step": 6873, "time_per_iteration": 4.200350999832153 }, { "auxiliary_loss_clip": 0.0110667, "auxiliary_loss_mlp": 0.01041498, "balance_loss_clip": 1.04465103, "balance_loss_mlp": 1.02552366, "epoch": 0.4132872388396212, "flos": 22674464628480.0, "grad_norm": 2.335780539187462, "language_loss": 0.83409697, "learning_rate": 2.6469264980501746e-06, "loss": 0.85557866, "num_input_tokens_seen": 147639490, "step": 6874, "time_per_iteration": 2.677481174468994 }, { "auxiliary_loss_clip": 0.01095645, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.04236686, "balance_loss_mlp": 1.02203512, "epoch": 0.4133473620922892, "flos": 20150195420160.0, "grad_norm": 2.13686316676373, "language_loss": 0.71832943, "learning_rate": 2.646557961279436e-06, "loss": 0.73966241, "num_input_tokens_seen": 147657205, "step": 6875, "time_per_iteration": 4.490081548690796 }, { "auxiliary_loss_clip": 0.01099487, "auxiliary_loss_mlp": 0.0104606, "balance_loss_clip": 1.0442456, "balance_loss_mlp": 1.03144503, "epoch": 0.41340748534495714, "flos": 24242755317120.0, "grad_norm": 2.0421788997824164, "language_loss": 0.82396001, "learning_rate": 2.646189399991154e-06, "loss": 0.84541547, "num_input_tokens_seen": 147677005, "step": 6876, "time_per_iteration": 2.7446470260620117 }, { "auxiliary_loss_clip": 0.01120566, "auxiliary_loss_mlp": 0.01041258, "balance_loss_clip": 1.04677415, "balance_loss_mlp": 1.02511716, "epoch": 0.41346760859762516, "flos": 14392171566720.0, "grad_norm": 2.56742905987435, "language_loss": 0.64847958, "learning_rate": 2.6458208141993048e-06, "loss": 0.67009783, "num_input_tokens_seen": 147693435, "step": 6877, "time_per_iteration": 2.5988993644714355 }, { "auxiliary_loss_clip": 0.01117576, "auxiliary_loss_mlp": 0.01038622, "balance_loss_clip": 1.04535675, "balance_loss_mlp": 1.02366138, "epoch": 0.4135277318502931, "flos": 22492002516480.0, "grad_norm": 1.9690610536683542, "language_loss": 0.76823169, "learning_rate": 2.6454522039178668e-06, "loss": 0.78979367, "num_input_tokens_seen": 147714000, "step": 6878, "time_per_iteration": 2.6289098262786865 }, { "auxiliary_loss_clip": 0.01120186, "auxiliary_loss_mlp": 0.0077293, "balance_loss_clip": 1.04670906, "balance_loss_mlp": 1.00107956, "epoch": 0.4135878551029611, "flos": 22418744728320.0, "grad_norm": 1.7550266496384528, "language_loss": 0.80281323, "learning_rate": 2.6450835691608154e-06, "loss": 0.82174444, "num_input_tokens_seen": 147731010, "step": 6879, "time_per_iteration": 2.661945343017578 }, { "auxiliary_loss_clip": 0.01130865, "auxiliary_loss_mlp": 0.01039257, "balance_loss_clip": 1.04709899, "balance_loss_mlp": 1.02471972, "epoch": 0.41364797835562905, "flos": 27053232094080.0, "grad_norm": 2.4786614895541312, "language_loss": 0.84795272, "learning_rate": 2.6447149099421315e-06, "loss": 0.869654, "num_input_tokens_seen": 147750880, "step": 6880, "time_per_iteration": 2.6188430786132812 }, { "auxiliary_loss_clip": 0.01111764, "auxiliary_loss_mlp": 0.0102976, "balance_loss_clip": 1.04788852, "balance_loss_mlp": 1.01497793, "epoch": 0.413708101608297, "flos": 22967603521920.0, "grad_norm": 3.387576232567814, "language_loss": 0.70222247, "learning_rate": 2.6443462262757927e-06, "loss": 0.72363776, "num_input_tokens_seen": 147771360, "step": 6881, "time_per_iteration": 2.733462333679199 }, { "auxiliary_loss_clip": 0.0112877, "auxiliary_loss_mlp": 0.01037286, "balance_loss_clip": 1.04717231, "balance_loss_mlp": 1.02352309, "epoch": 0.413768224860965, "flos": 13333991875200.0, "grad_norm": 2.043279627081185, "language_loss": 0.81609744, "learning_rate": 2.6439775181757805e-06, "loss": 0.837758, "num_input_tokens_seen": 147787440, "step": 6882, "time_per_iteration": 2.6478219032287598 }, { "auxiliary_loss_clip": 0.01107335, "auxiliary_loss_mlp": 0.0104742, "balance_loss_clip": 1.04388988, "balance_loss_mlp": 1.02958596, "epoch": 0.41382834811363295, "flos": 20813968800000.0, "grad_norm": 2.1226762712951195, "language_loss": 0.69825858, "learning_rate": 2.643608785656077e-06, "loss": 0.71980608, "num_input_tokens_seen": 147805720, "step": 6883, "time_per_iteration": 2.7219526767730713 }, { "auxiliary_loss_clip": 0.01117809, "auxiliary_loss_mlp": 0.01042891, "balance_loss_clip": 1.04390156, "balance_loss_mlp": 1.02804899, "epoch": 0.4138884713663009, "flos": 20667130001280.0, "grad_norm": 1.778769139531053, "language_loss": 0.76219916, "learning_rate": 2.643240028730663e-06, "loss": 0.7838062, "num_input_tokens_seen": 147824605, "step": 6884, "time_per_iteration": 2.7255208492279053 }, { "auxiliary_loss_clip": 0.01095169, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.04337394, "balance_loss_mlp": 1.02405715, "epoch": 0.4139485946189689, "flos": 29056616225280.0, "grad_norm": 1.442860134230448, "language_loss": 0.75787425, "learning_rate": 2.642871247413523e-06, "loss": 0.77921343, "num_input_tokens_seen": 147845445, "step": 6885, "time_per_iteration": 2.759103775024414 }, { "auxiliary_loss_clip": 0.0113157, "auxiliary_loss_mlp": 0.01040383, "balance_loss_clip": 1.04593658, "balance_loss_mlp": 1.0249809, "epoch": 0.41400871787163684, "flos": 24425720219520.0, "grad_norm": 2.975461049679227, "language_loss": 0.70157146, "learning_rate": 2.6425024417186414e-06, "loss": 0.72329092, "num_input_tokens_seen": 147865580, "step": 6886, "time_per_iteration": 2.5969202518463135 }, { "auxiliary_loss_clip": 0.01130858, "auxiliary_loss_mlp": 0.00772578, "balance_loss_clip": 1.04714894, "balance_loss_mlp": 1.00082159, "epoch": 0.4140688411243048, "flos": 19464050845440.0, "grad_norm": 4.863732808232375, "language_loss": 0.75765413, "learning_rate": 2.642133611660002e-06, "loss": 0.77668852, "num_input_tokens_seen": 147885230, "step": 6887, "time_per_iteration": 2.6130294799804688 }, { "auxiliary_loss_clip": 0.01115226, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.04343033, "balance_loss_mlp": 1.01858318, "epoch": 0.4141289643769728, "flos": 19313656600320.0, "grad_norm": 1.960325409954457, "language_loss": 0.70337266, "learning_rate": 2.641764757251592e-06, "loss": 0.72486007, "num_input_tokens_seen": 147903035, "step": 6888, "time_per_iteration": 2.616093635559082 }, { "auxiliary_loss_clip": 0.01125875, "auxiliary_loss_mlp": 0.01041471, "balance_loss_clip": 1.04317069, "balance_loss_mlp": 1.02698743, "epoch": 0.41418908762964074, "flos": 16726903683840.0, "grad_norm": 2.06267801428711, "language_loss": 0.76650596, "learning_rate": 2.6413958785073976e-06, "loss": 0.7881794, "num_input_tokens_seen": 147918745, "step": 6889, "time_per_iteration": 2.5624022483825684 }, { "auxiliary_loss_clip": 0.01098507, "auxiliary_loss_mlp": 0.00771883, "balance_loss_clip": 1.05070317, "balance_loss_mlp": 1.00089312, "epoch": 0.41424921088230876, "flos": 25296840858240.0, "grad_norm": 2.7156921824995224, "language_loss": 0.80554968, "learning_rate": 2.6410269754414074e-06, "loss": 0.82425356, "num_input_tokens_seen": 147938265, "step": 6890, "time_per_iteration": 2.796128273010254 }, { "auxiliary_loss_clip": 0.0112736, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.04589438, "balance_loss_mlp": 1.0235126, "epoch": 0.4143093341349767, "flos": 20960520289920.0, "grad_norm": 1.7630713030967287, "language_loss": 0.74180973, "learning_rate": 2.6406580480676113e-06, "loss": 0.76347136, "num_input_tokens_seen": 147957320, "step": 6891, "time_per_iteration": 2.6974401473999023 }, { "auxiliary_loss_clip": 0.01092037, "auxiliary_loss_mlp": 0.01043425, "balance_loss_clip": 1.0482198, "balance_loss_mlp": 1.02647936, "epoch": 0.4143694573876447, "flos": 22017694400640.0, "grad_norm": 1.8611116210645706, "language_loss": 0.84570521, "learning_rate": 2.6402890963999963e-06, "loss": 0.86705983, "num_input_tokens_seen": 147977045, "step": 6892, "time_per_iteration": 2.8065037727355957 }, { "auxiliary_loss_clip": 0.01081139, "auxiliary_loss_mlp": 0.00774401, "balance_loss_clip": 1.04017556, "balance_loss_mlp": 1.00088513, "epoch": 0.41442958064031266, "flos": 35697396723840.0, "grad_norm": 1.7475313827364956, "language_loss": 0.70824122, "learning_rate": 2.6399201204525554e-06, "loss": 0.72679669, "num_input_tokens_seen": 147996905, "step": 6893, "time_per_iteration": 2.865112543106079 }, { "auxiliary_loss_clip": 0.01126872, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.04508913, "balance_loss_mlp": 1.01873493, "epoch": 0.4144897038929806, "flos": 28293766156800.0, "grad_norm": 1.5118367219903406, "language_loss": 0.72955495, "learning_rate": 2.639551120239279e-06, "loss": 0.75115383, "num_input_tokens_seen": 148017875, "step": 6894, "time_per_iteration": 2.6412105560302734 }, { "auxiliary_loss_clip": 0.0111867, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 1.0444473, "balance_loss_mlp": 1.01803279, "epoch": 0.4145498271456486, "flos": 11648093080320.0, "grad_norm": 2.8699191887217697, "language_loss": 0.63006961, "learning_rate": 2.63918209577416e-06, "loss": 0.65158045, "num_input_tokens_seen": 148032300, "step": 6895, "time_per_iteration": 2.6429762840270996 }, { "auxiliary_loss_clip": 0.01084496, "auxiliary_loss_mlp": 0.01047641, "balance_loss_clip": 1.04230917, "balance_loss_mlp": 1.03178644, "epoch": 0.41460995039831655, "flos": 27235622378880.0, "grad_norm": 1.395247516884051, "language_loss": 0.7072767, "learning_rate": 2.638813047071192e-06, "loss": 0.728598, "num_input_tokens_seen": 148053260, "step": 6896, "time_per_iteration": 2.754567861557007 }, { "auxiliary_loss_clip": 0.01125613, "auxiliary_loss_mlp": 0.0104596, "balance_loss_clip": 1.04233313, "balance_loss_mlp": 1.03083241, "epoch": 0.4146700736509845, "flos": 25922369232000.0, "grad_norm": 1.6183082189069362, "language_loss": 0.73234701, "learning_rate": 2.6384439741443696e-06, "loss": 0.75406271, "num_input_tokens_seen": 148072965, "step": 6897, "time_per_iteration": 2.737884759902954 }, { "auxiliary_loss_clip": 0.01114786, "auxiliary_loss_mlp": 0.01041831, "balance_loss_clip": 1.04562593, "balance_loss_mlp": 1.02713859, "epoch": 0.4147301969036525, "flos": 26833243248000.0, "grad_norm": 1.834097351521641, "language_loss": 0.84865111, "learning_rate": 2.6380748770076873e-06, "loss": 0.87021732, "num_input_tokens_seen": 148093240, "step": 6898, "time_per_iteration": 2.689467430114746 }, { "auxiliary_loss_clip": 0.01079261, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.03853178, "balance_loss_mlp": 1.02030301, "epoch": 0.41479032015632045, "flos": 20298291194880.0, "grad_norm": 1.6538444757930724, "language_loss": 0.74696559, "learning_rate": 2.6377057556751416e-06, "loss": 0.76810819, "num_input_tokens_seen": 148110925, "step": 6899, "time_per_iteration": 2.73575758934021 }, { "auxiliary_loss_clip": 0.0109529, "auxiliary_loss_mlp": 0.0104143, "balance_loss_clip": 1.04097557, "balance_loss_mlp": 1.02549219, "epoch": 0.4148504434089884, "flos": 25264988472960.0, "grad_norm": 2.0028183144746254, "language_loss": 0.75739181, "learning_rate": 2.6373366101607306e-06, "loss": 0.778759, "num_input_tokens_seen": 148130670, "step": 6900, "time_per_iteration": 2.7304093837738037 }, { "auxiliary_loss_clip": 0.01112354, "auxiliary_loss_mlp": 0.01038142, "balance_loss_clip": 1.04515111, "balance_loss_mlp": 1.02218616, "epoch": 0.4149105666616564, "flos": 12822300679680.0, "grad_norm": 37.61175094058464, "language_loss": 0.79667652, "learning_rate": 2.6369674404784503e-06, "loss": 0.81818151, "num_input_tokens_seen": 148148350, "step": 6901, "time_per_iteration": 2.6238512992858887 }, { "auxiliary_loss_clip": 0.01085977, "auxiliary_loss_mlp": 0.01046173, "balance_loss_clip": 1.03959978, "balance_loss_mlp": 1.0302825, "epoch": 0.41497068991432434, "flos": 16763891713920.0, "grad_norm": 1.6395274695924928, "language_loss": 0.69640017, "learning_rate": 2.6365982466423014e-06, "loss": 0.7177217, "num_input_tokens_seen": 148167550, "step": 6902, "time_per_iteration": 2.6854305267333984 }, { "auxiliary_loss_clip": 0.01097592, "auxiliary_loss_mlp": 0.00770925, "balance_loss_clip": 1.04278207, "balance_loss_mlp": 1.00099885, "epoch": 0.4150308131669923, "flos": 18000906243840.0, "grad_norm": 2.384025861502229, "language_loss": 0.83949161, "learning_rate": 2.6362290286662834e-06, "loss": 0.85817683, "num_input_tokens_seen": 148184740, "step": 6903, "time_per_iteration": 2.6454520225524902 }, { "auxiliary_loss_clip": 0.01133263, "auxiliary_loss_mlp": 0.01042035, "balance_loss_clip": 1.04633808, "balance_loss_mlp": 1.02569163, "epoch": 0.41509093641966033, "flos": 30044770352640.0, "grad_norm": 1.9553359330266324, "language_loss": 0.67639846, "learning_rate": 2.6358597865643968e-06, "loss": 0.69815147, "num_input_tokens_seen": 148204605, "step": 6904, "time_per_iteration": 2.7322065830230713 }, { "auxiliary_loss_clip": 0.01130567, "auxiliary_loss_mlp": 0.0077237, "balance_loss_clip": 1.04620719, "balance_loss_mlp": 1.00097251, "epoch": 0.4151510596723283, "flos": 24279994742400.0, "grad_norm": 1.8757192691258513, "language_loss": 0.77572656, "learning_rate": 2.635490520350643e-06, "loss": 0.79475594, "num_input_tokens_seen": 148224675, "step": 6905, "time_per_iteration": 2.648400068283081 }, { "auxiliary_loss_clip": 0.0113062, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.04648256, "balance_loss_mlp": 1.01869583, "epoch": 0.41521118292499626, "flos": 23476206147840.0, "grad_norm": 1.5608092182069806, "language_loss": 0.68316001, "learning_rate": 2.635121230039025e-06, "loss": 0.7047962, "num_input_tokens_seen": 148243375, "step": 6906, "time_per_iteration": 2.6084086894989014 }, { "auxiliary_loss_clip": 0.01104219, "auxiliary_loss_mlp": 0.0103582, "balance_loss_clip": 1.04238176, "balance_loss_mlp": 1.02167583, "epoch": 0.4152713061776642, "flos": 22125498094080.0, "grad_norm": 2.313429051291415, "language_loss": 0.67982537, "learning_rate": 2.6347519156435467e-06, "loss": 0.70122576, "num_input_tokens_seen": 148261140, "step": 6907, "time_per_iteration": 2.715506076812744 }, { "auxiliary_loss_clip": 0.01100263, "auxiliary_loss_mlp": 0.01038198, "balance_loss_clip": 1.0479455, "balance_loss_mlp": 1.02419686, "epoch": 0.4153314294303322, "flos": 21251396626560.0, "grad_norm": 2.133321939860832, "language_loss": 0.77338696, "learning_rate": 2.6343825771782123e-06, "loss": 0.79477155, "num_input_tokens_seen": 148279655, "step": 6908, "time_per_iteration": 2.699028253555298 }, { "auxiliary_loss_clip": 0.01035537, "auxiliary_loss_mlp": 0.01050035, "balance_loss_clip": 1.02502179, "balance_loss_mlp": 1.04800892, "epoch": 0.41539155268300015, "flos": 57920681594880.0, "grad_norm": 0.8023457423545532, "language_loss": 0.64889216, "learning_rate": 2.634013214657026e-06, "loss": 0.66974789, "num_input_tokens_seen": 148339005, "step": 6909, "time_per_iteration": 3.174577474594116 }, { "auxiliary_loss_clip": 0.01096348, "auxiliary_loss_mlp": 0.0103783, "balance_loss_clip": 1.04794037, "balance_loss_mlp": 1.02368009, "epoch": 0.4154516759356681, "flos": 21903677654400.0, "grad_norm": 3.1710005220016293, "language_loss": 0.8712942, "learning_rate": 2.633643828093996e-06, "loss": 0.89263594, "num_input_tokens_seen": 148358715, "step": 6910, "time_per_iteration": 4.24171257019043 }, { "auxiliary_loss_clip": 0.01040831, "auxiliary_loss_mlp": 0.01008541, "balance_loss_clip": 1.02141929, "balance_loss_mlp": 1.00702703, "epoch": 0.4155117991883361, "flos": 67833677226240.0, "grad_norm": 0.8180681021689019, "language_loss": 0.62115103, "learning_rate": 2.633274417503128e-06, "loss": 0.64164472, "num_input_tokens_seen": 148417280, "step": 6911, "time_per_iteration": 3.171510696411133 }, { "auxiliary_loss_clip": 0.01138851, "auxiliary_loss_mlp": 0.01037606, "balance_loss_clip": 1.05016613, "balance_loss_mlp": 1.0219059, "epoch": 0.41557192244100405, "flos": 14282679934080.0, "grad_norm": 2.4116200088670845, "language_loss": 0.87474132, "learning_rate": 2.6329049828984312e-06, "loss": 0.89650595, "num_input_tokens_seen": 148432610, "step": 6912, "time_per_iteration": 5.576058864593506 }, { "auxiliary_loss_clip": 0.01117561, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.04753387, "balance_loss_mlp": 1.02098989, "epoch": 0.415632045693672, "flos": 24461954064000.0, "grad_norm": 22.77173838310247, "language_loss": 0.63224173, "learning_rate": 2.632535524293914e-06, "loss": 0.65376365, "num_input_tokens_seen": 148451510, "step": 6913, "time_per_iteration": 2.702631711959839 }, { "auxiliary_loss_clip": 0.01102511, "auxiliary_loss_mlp": 0.00771597, "balance_loss_clip": 1.04298615, "balance_loss_mlp": 1.00093937, "epoch": 0.41569216894634, "flos": 20115290378880.0, "grad_norm": 1.7272855093915238, "language_loss": 0.74980754, "learning_rate": 2.632166041703586e-06, "loss": 0.76854861, "num_input_tokens_seen": 148469945, "step": 6914, "time_per_iteration": 4.340964078903198 }, { "auxiliary_loss_clip": 0.01077278, "auxiliary_loss_mlp": 0.01044004, "balance_loss_clip": 1.04201877, "balance_loss_mlp": 1.02906704, "epoch": 0.41575229219900794, "flos": 23798827128960.0, "grad_norm": 1.8325905436461942, "language_loss": 0.87653631, "learning_rate": 2.631796535141458e-06, "loss": 0.89774919, "num_input_tokens_seen": 148486655, "step": 6915, "time_per_iteration": 2.757596731185913 }, { "auxiliary_loss_clip": 0.0109973, "auxiliary_loss_mlp": 0.01041371, "balance_loss_clip": 1.04447317, "balance_loss_mlp": 1.02728081, "epoch": 0.4158124154516759, "flos": 23108229267840.0, "grad_norm": 3.0600667343253214, "language_loss": 0.70990372, "learning_rate": 2.6314270046215426e-06, "loss": 0.73131478, "num_input_tokens_seen": 148505035, "step": 6916, "time_per_iteration": 2.6894583702087402 }, { "auxiliary_loss_clip": 0.01135969, "auxiliary_loss_mlp": 0.01038621, "balance_loss_clip": 1.04934418, "balance_loss_mlp": 1.02361822, "epoch": 0.41587253870434393, "flos": 24242970798720.0, "grad_norm": 1.53910679789622, "language_loss": 0.71859491, "learning_rate": 2.631057450157852e-06, "loss": 0.74034083, "num_input_tokens_seen": 148525575, "step": 6917, "time_per_iteration": 2.560401439666748 }, { "auxiliary_loss_clip": 0.01104226, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.04427075, "balance_loss_mlp": 1.01856291, "epoch": 0.4159326619570119, "flos": 23881602021120.0, "grad_norm": 1.8609084037764254, "language_loss": 0.80841225, "learning_rate": 2.6306878717643988e-06, "loss": 0.82977629, "num_input_tokens_seen": 148547270, "step": 6918, "time_per_iteration": 2.71455979347229 }, { "auxiliary_loss_clip": 0.01122968, "auxiliary_loss_mlp": 0.01038479, "balance_loss_clip": 1.05033052, "balance_loss_mlp": 1.02306533, "epoch": 0.41599278520967986, "flos": 40626531354240.0, "grad_norm": 1.460873312199365, "language_loss": 0.70399261, "learning_rate": 2.6303182694551995e-06, "loss": 0.72560704, "num_input_tokens_seen": 148572100, "step": 6919, "time_per_iteration": 2.784090518951416 }, { "auxiliary_loss_clip": 0.01108371, "auxiliary_loss_mlp": 0.0104095, "balance_loss_clip": 1.04570937, "balance_loss_mlp": 1.0255723, "epoch": 0.4160529084623478, "flos": 18222942165120.0, "grad_norm": 1.8818708282287906, "language_loss": 0.81701922, "learning_rate": 2.6299486432442677e-06, "loss": 0.83851242, "num_input_tokens_seen": 148591245, "step": 6920, "time_per_iteration": 2.644867181777954 }, { "auxiliary_loss_clip": 0.01113217, "auxiliary_loss_mlp": 0.01042119, "balance_loss_clip": 1.04909408, "balance_loss_mlp": 1.02627623, "epoch": 0.4161130317150158, "flos": 13661963982720.0, "grad_norm": 2.168550443744471, "language_loss": 0.65408564, "learning_rate": 2.6295789931456195e-06, "loss": 0.67563891, "num_input_tokens_seen": 148607980, "step": 6921, "time_per_iteration": 2.647270441055298 }, { "auxiliary_loss_clip": 0.01108151, "auxiliary_loss_mlp": 0.01042421, "balance_loss_clip": 1.04479325, "balance_loss_mlp": 1.02768648, "epoch": 0.41617315496768376, "flos": 16178511767040.0, "grad_norm": 2.3873319200859004, "language_loss": 0.80806041, "learning_rate": 2.629209319173274e-06, "loss": 0.82956612, "num_input_tokens_seen": 148624490, "step": 6922, "time_per_iteration": 2.6521530151367188 }, { "auxiliary_loss_clip": 0.01107722, "auxiliary_loss_mlp": 0.01037357, "balance_loss_clip": 1.04645085, "balance_loss_mlp": 1.02304578, "epoch": 0.4162332782203517, "flos": 26213317395840.0, "grad_norm": 1.6600188367705673, "language_loss": 0.67455506, "learning_rate": 2.628839621341247e-06, "loss": 0.69600594, "num_input_tokens_seen": 148646490, "step": 6923, "time_per_iteration": 2.6982760429382324 }, { "auxiliary_loss_clip": 0.01100761, "auxiliary_loss_mlp": 0.01052569, "balance_loss_clip": 1.04614723, "balance_loss_mlp": 1.03649926, "epoch": 0.4162934014730197, "flos": 28183987215360.0, "grad_norm": 2.1905305361602676, "language_loss": 0.75802875, "learning_rate": 2.6284698996635593e-06, "loss": 0.77956206, "num_input_tokens_seen": 148668580, "step": 6924, "time_per_iteration": 2.746675491333008 }, { "auxiliary_loss_clip": 0.01134317, "auxiliary_loss_mlp": 0.01042613, "balance_loss_clip": 1.04869533, "balance_loss_mlp": 1.02842665, "epoch": 0.41635352472568765, "flos": 19865316654720.0, "grad_norm": 2.7384378444587774, "language_loss": 0.73572767, "learning_rate": 2.62810015415423e-06, "loss": 0.75749695, "num_input_tokens_seen": 148688410, "step": 6925, "time_per_iteration": 2.6443655490875244 }, { "auxiliary_loss_clip": 0.01107096, "auxiliary_loss_mlp": 0.01035039, "balance_loss_clip": 1.04328012, "balance_loss_mlp": 1.02092457, "epoch": 0.4164136479783556, "flos": 14935356011520.0, "grad_norm": 2.2965796841293487, "language_loss": 0.83732742, "learning_rate": 2.6277303848272792e-06, "loss": 0.85874879, "num_input_tokens_seen": 148704855, "step": 6926, "time_per_iteration": 2.688778877258301 }, { "auxiliary_loss_clip": 0.01101563, "auxiliary_loss_mlp": 0.0104323, "balance_loss_clip": 1.04851913, "balance_loss_mlp": 1.03019416, "epoch": 0.4164737712310236, "flos": 21757593041280.0, "grad_norm": 1.7122304152619183, "language_loss": 0.86459213, "learning_rate": 2.6273605916967302e-06, "loss": 0.88604003, "num_input_tokens_seen": 148723065, "step": 6927, "time_per_iteration": 2.6891677379608154 }, { "auxiliary_loss_clip": 0.01123007, "auxiliary_loss_mlp": 0.01048103, "balance_loss_clip": 1.04902172, "balance_loss_mlp": 1.03252852, "epoch": 0.41653389448369155, "flos": 20740136394240.0, "grad_norm": 2.2496180093698555, "language_loss": 0.72619522, "learning_rate": 2.626990774776604e-06, "loss": 0.74790633, "num_input_tokens_seen": 148741780, "step": 6928, "time_per_iteration": 2.6853785514831543 }, { "auxiliary_loss_clip": 0.01103421, "auxiliary_loss_mlp": 0.01037571, "balance_loss_clip": 1.04516923, "balance_loss_mlp": 1.02305102, "epoch": 0.4165940177363595, "flos": 24972891073920.0, "grad_norm": 2.3320684503004667, "language_loss": 0.781192, "learning_rate": 2.6266209340809254e-06, "loss": 0.80260193, "num_input_tokens_seen": 148759795, "step": 6929, "time_per_iteration": 2.675412893295288 }, { "auxiliary_loss_clip": 0.01130228, "auxiliary_loss_mlp": 0.01034459, "balance_loss_clip": 1.04634309, "balance_loss_mlp": 1.02042162, "epoch": 0.41665414098902753, "flos": 20521727746560.0, "grad_norm": 2.2076337971053897, "language_loss": 0.70941442, "learning_rate": 2.6262510696237182e-06, "loss": 0.73106134, "num_input_tokens_seen": 148778680, "step": 6930, "time_per_iteration": 2.5896191596984863 }, { "auxiliary_loss_clip": 0.0110378, "auxiliary_loss_mlp": 0.01040113, "balance_loss_clip": 1.04316616, "balance_loss_mlp": 1.02566469, "epoch": 0.4167142642416955, "flos": 19682926369920.0, "grad_norm": 1.7468000498396183, "language_loss": 0.81265134, "learning_rate": 2.625881181419007e-06, "loss": 0.83409023, "num_input_tokens_seen": 148796470, "step": 6931, "time_per_iteration": 2.693753719329834 }, { "auxiliary_loss_clip": 0.01073611, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.03671885, "balance_loss_mlp": 1.0253247, "epoch": 0.41677438749436346, "flos": 23763742519680.0, "grad_norm": 1.7136797301427433, "language_loss": 0.78969777, "learning_rate": 2.6255112694808193e-06, "loss": 0.81083435, "num_input_tokens_seen": 148815300, "step": 6932, "time_per_iteration": 2.900186061859131 }, { "auxiliary_loss_clip": 0.01110051, "auxiliary_loss_mlp": 0.00772641, "balance_loss_clip": 1.04659891, "balance_loss_mlp": 1.00109386, "epoch": 0.41683451074703143, "flos": 30410053712640.0, "grad_norm": 1.8812444225834188, "language_loss": 0.81995165, "learning_rate": 2.6251413338231813e-06, "loss": 0.83877861, "num_input_tokens_seen": 148834315, "step": 6933, "time_per_iteration": 2.815415143966675 }, { "auxiliary_loss_clip": 0.01135077, "auxiliary_loss_mlp": 0.01036525, "balance_loss_clip": 1.04731107, "balance_loss_mlp": 1.02077699, "epoch": 0.4168946339996994, "flos": 21506757390720.0, "grad_norm": 2.9283724451949236, "language_loss": 0.76852083, "learning_rate": 2.624771374460121e-06, "loss": 0.79023689, "num_input_tokens_seen": 148852420, "step": 6934, "time_per_iteration": 2.7175137996673584 }, { "auxiliary_loss_clip": 0.01122637, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.048594, "balance_loss_mlp": 1.02038264, "epoch": 0.41695475725236736, "flos": 17638675539840.0, "grad_norm": 1.7602525666099749, "language_loss": 0.67555362, "learning_rate": 2.624401391405668e-06, "loss": 0.6971271, "num_input_tokens_seen": 148869305, "step": 6935, "time_per_iteration": 2.740238666534424 }, { "auxiliary_loss_clip": 0.01106934, "auxiliary_loss_mlp": 0.01041015, "balance_loss_clip": 1.04740202, "balance_loss_mlp": 1.02606606, "epoch": 0.4170148805050353, "flos": 15668903560320.0, "grad_norm": 2.0770148597671834, "language_loss": 0.73310643, "learning_rate": 2.6240313846738513e-06, "loss": 0.75458586, "num_input_tokens_seen": 148886395, "step": 6936, "time_per_iteration": 2.71653413772583 }, { "auxiliary_loss_clip": 0.01115958, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.04845905, "balance_loss_mlp": 1.02274418, "epoch": 0.4170750037577033, "flos": 15159151699200.0, "grad_norm": 2.3408521316198794, "language_loss": 0.74009961, "learning_rate": 2.6236613542787024e-06, "loss": 0.76162577, "num_input_tokens_seen": 148905235, "step": 6937, "time_per_iteration": 2.627197265625 }, { "auxiliary_loss_clip": 0.01105318, "auxiliary_loss_mlp": 0.01038451, "balance_loss_clip": 1.04543686, "balance_loss_mlp": 1.02422357, "epoch": 0.41713512701037125, "flos": 28768289754240.0, "grad_norm": 2.1407867738666977, "language_loss": 0.84349155, "learning_rate": 2.6232913002342518e-06, "loss": 0.8649292, "num_input_tokens_seen": 148928130, "step": 6938, "time_per_iteration": 2.7512307167053223 }, { "auxiliary_loss_clip": 0.01107641, "auxiliary_loss_mlp": 0.01037692, "balance_loss_clip": 1.04718804, "balance_loss_mlp": 1.02217638, "epoch": 0.4171952502630392, "flos": 28256993608320.0, "grad_norm": 1.985550471698889, "language_loss": 0.7437641, "learning_rate": 2.6229212225545334e-06, "loss": 0.76521742, "num_input_tokens_seen": 148948790, "step": 6939, "time_per_iteration": 2.8480472564697266 }, { "auxiliary_loss_clip": 0.01121822, "auxiliary_loss_mlp": 0.01033365, "balance_loss_clip": 1.0470984, "balance_loss_mlp": 1.01803446, "epoch": 0.4172553735157072, "flos": 24571697091840.0, "grad_norm": 2.560264252806934, "language_loss": 0.74981248, "learning_rate": 2.622551121253579e-06, "loss": 0.77136433, "num_input_tokens_seen": 148967690, "step": 6940, "time_per_iteration": 2.707803249359131 }, { "auxiliary_loss_clip": 0.01132435, "auxiliary_loss_mlp": 0.01040605, "balance_loss_clip": 1.04839242, "balance_loss_mlp": 1.0266397, "epoch": 0.41731549676837515, "flos": 27045797978880.0, "grad_norm": 2.248952291582723, "language_loss": 0.71683985, "learning_rate": 2.622180996345424e-06, "loss": 0.73857027, "num_input_tokens_seen": 148987150, "step": 6941, "time_per_iteration": 2.6406352519989014 }, { "auxiliary_loss_clip": 0.01119657, "auxiliary_loss_mlp": 0.0103964, "balance_loss_clip": 1.04871619, "balance_loss_mlp": 1.02461994, "epoch": 0.4173756200210431, "flos": 28394063907840.0, "grad_norm": 2.929963903641068, "language_loss": 0.74062824, "learning_rate": 2.621810847844104e-06, "loss": 0.76222122, "num_input_tokens_seen": 149004895, "step": 6942, "time_per_iteration": 2.7269139289855957 }, { "auxiliary_loss_clip": 0.01096497, "auxiliary_loss_mlp": 0.01046649, "balance_loss_clip": 1.04605746, "balance_loss_mlp": 1.03079462, "epoch": 0.41743574327371114, "flos": 22521556431360.0, "grad_norm": 2.258418581580233, "language_loss": 0.72607493, "learning_rate": 2.6214406757636534e-06, "loss": 0.74750638, "num_input_tokens_seen": 149020970, "step": 6943, "time_per_iteration": 2.8146276473999023 }, { "auxiliary_loss_clip": 0.01100254, "auxiliary_loss_mlp": 0.00772502, "balance_loss_clip": 1.04520488, "balance_loss_mlp": 1.00081825, "epoch": 0.4174958665263791, "flos": 30113431200000.0, "grad_norm": 1.7970886758223585, "language_loss": 0.63763773, "learning_rate": 2.621070480118111e-06, "loss": 0.65636539, "num_input_tokens_seen": 149041795, "step": 6944, "time_per_iteration": 2.7709715366363525 }, { "auxiliary_loss_clip": 0.0109928, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.03980803, "balance_loss_mlp": 1.02262771, "epoch": 0.41755598977904707, "flos": 25263444188160.0, "grad_norm": 1.5620596317333308, "language_loss": 0.70201832, "learning_rate": 2.620700260921513e-06, "loss": 0.72338641, "num_input_tokens_seen": 149063700, "step": 6945, "time_per_iteration": 2.7668464183807373 }, { "auxiliary_loss_clip": 0.01086028, "auxiliary_loss_mlp": 0.01052164, "balance_loss_clip": 1.03888953, "balance_loss_mlp": 1.03434181, "epoch": 0.41761611303171503, "flos": 19828580019840.0, "grad_norm": 3.903492543127265, "language_loss": 0.81313473, "learning_rate": 2.620330018187899e-06, "loss": 0.8345167, "num_input_tokens_seen": 149082410, "step": 6946, "time_per_iteration": 2.7656164169311523 }, { "auxiliary_loss_clip": 0.0111906, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.04820168, "balance_loss_mlp": 1.01947689, "epoch": 0.417676236284383, "flos": 15523249910400.0, "grad_norm": 3.3237502950686997, "language_loss": 0.77819085, "learning_rate": 2.6199597519313086e-06, "loss": 0.79971987, "num_input_tokens_seen": 149098745, "step": 6947, "time_per_iteration": 2.6658904552459717 }, { "auxiliary_loss_clip": 0.01131014, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.04678917, "balance_loss_mlp": 1.020262, "epoch": 0.41773635953705096, "flos": 32524473761280.0, "grad_norm": 4.535535573323162, "language_loss": 0.72142154, "learning_rate": 2.6195894621657825e-06, "loss": 0.7430864, "num_input_tokens_seen": 149122255, "step": 6948, "time_per_iteration": 2.728604316711426 }, { "auxiliary_loss_clip": 0.0111373, "auxiliary_loss_mlp": 0.01035464, "balance_loss_clip": 1.04416013, "balance_loss_mlp": 1.02127814, "epoch": 0.4177964827897189, "flos": 23440941970560.0, "grad_norm": 1.752796472610303, "language_loss": 0.77020466, "learning_rate": 2.619219148905362e-06, "loss": 0.79169655, "num_input_tokens_seen": 149142845, "step": 6949, "time_per_iteration": 4.2494752407073975 }, { "auxiliary_loss_clip": 0.011131, "auxiliary_loss_mlp": 0.01040025, "balance_loss_clip": 1.05060196, "balance_loss_mlp": 1.02523708, "epoch": 0.4178566060423869, "flos": 22748907565440.0, "grad_norm": 1.637174584956538, "language_loss": 0.8214075, "learning_rate": 2.6188488121640888e-06, "loss": 0.84293878, "num_input_tokens_seen": 149163375, "step": 6950, "time_per_iteration": 2.7383689880371094 }, { "auxiliary_loss_clip": 0.01099413, "auxiliary_loss_mlp": 0.00770849, "balance_loss_clip": 1.04511857, "balance_loss_mlp": 1.00090635, "epoch": 0.41791672929505486, "flos": 26032794618240.0, "grad_norm": 1.501775844018401, "language_loss": 0.7649653, "learning_rate": 2.618478451956007e-06, "loss": 0.78366792, "num_input_tokens_seen": 149185610, "step": 6951, "time_per_iteration": 5.789496660232544 }, { "auxiliary_loss_clip": 0.01088001, "auxiliary_loss_mlp": 0.01034314, "balance_loss_clip": 1.04565978, "balance_loss_mlp": 1.01929939, "epoch": 0.4179768525477228, "flos": 19568694142080.0, "grad_norm": 1.8438034417752391, "language_loss": 0.73442549, "learning_rate": 2.61810806829516e-06, "loss": 0.75564867, "num_input_tokens_seen": 149203990, "step": 6952, "time_per_iteration": 2.762404680252075 }, { "auxiliary_loss_clip": 0.01116339, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04836369, "balance_loss_mlp": 1.0251013, "epoch": 0.4180369758003908, "flos": 17783826399360.0, "grad_norm": 2.8847563198217667, "language_loss": 0.7161783, "learning_rate": 2.617737661195593e-06, "loss": 0.73773146, "num_input_tokens_seen": 149221385, "step": 6953, "time_per_iteration": 2.6514034271240234 }, { "auxiliary_loss_clip": 0.01118442, "auxiliary_loss_mlp": 0.01038634, "balance_loss_clip": 1.04711723, "balance_loss_mlp": 1.02363181, "epoch": 0.41809709905305875, "flos": 20960663944320.0, "grad_norm": 1.7834717110535325, "language_loss": 0.75982141, "learning_rate": 2.617367230671353e-06, "loss": 0.78139216, "num_input_tokens_seen": 149241175, "step": 6954, "time_per_iteration": 4.3135082721710205 }, { "auxiliary_loss_clip": 0.01092319, "auxiliary_loss_mlp": 0.01046188, "balance_loss_clip": 1.04647863, "balance_loss_mlp": 1.02979708, "epoch": 0.4181572223057267, "flos": 22017622573440.0, "grad_norm": 2.907950037168039, "language_loss": 0.84492826, "learning_rate": 2.616996776736485e-06, "loss": 0.86631334, "num_input_tokens_seen": 149259115, "step": 6955, "time_per_iteration": 2.7724356651306152 }, { "auxiliary_loss_clip": 0.01121525, "auxiliary_loss_mlp": 0.01040437, "balance_loss_clip": 1.04870594, "balance_loss_mlp": 1.02604234, "epoch": 0.4182173455583947, "flos": 26245528917120.0, "grad_norm": 1.6794559400644542, "language_loss": 0.83262718, "learning_rate": 2.616626299405037e-06, "loss": 0.8542468, "num_input_tokens_seen": 149278705, "step": 6956, "time_per_iteration": 2.7260353565216064 }, { "auxiliary_loss_clip": 0.01093652, "auxiliary_loss_mlp": 0.01039325, "balance_loss_clip": 1.04491091, "balance_loss_mlp": 1.02423358, "epoch": 0.4182774688110627, "flos": 14791605782400.0, "grad_norm": 2.3946498969788634, "language_loss": 0.71788859, "learning_rate": 2.616255798691059e-06, "loss": 0.73921835, "num_input_tokens_seen": 149294040, "step": 6957, "time_per_iteration": 2.6826114654541016 }, { "auxiliary_loss_clip": 0.01099548, "auxiliary_loss_mlp": 0.01043781, "balance_loss_clip": 1.0462482, "balance_loss_mlp": 1.02966106, "epoch": 0.41833759206373067, "flos": 20412020632320.0, "grad_norm": 2.4781797095716276, "language_loss": 0.75947559, "learning_rate": 2.6158852746085982e-06, "loss": 0.78090888, "num_input_tokens_seen": 149310385, "step": 6958, "time_per_iteration": 2.7528226375579834 }, { "auxiliary_loss_clip": 0.01083285, "auxiliary_loss_mlp": 0.00772338, "balance_loss_clip": 1.04087532, "balance_loss_mlp": 1.0007602, "epoch": 0.41839771531639863, "flos": 23656333875840.0, "grad_norm": 1.8764496083097535, "language_loss": 0.7693305, "learning_rate": 2.6155147271717066e-06, "loss": 0.78788674, "num_input_tokens_seen": 149328235, "step": 6959, "time_per_iteration": 2.7859151363372803 }, { "auxiliary_loss_clip": 0.01089374, "auxiliary_loss_mlp": 0.00772565, "balance_loss_clip": 1.04304624, "balance_loss_mlp": 1.00090861, "epoch": 0.4184578385690666, "flos": 19754137082880.0, "grad_norm": 2.1131068778060498, "language_loss": 0.77339065, "learning_rate": 2.6151441563944347e-06, "loss": 0.79201001, "num_input_tokens_seen": 149347465, "step": 6960, "time_per_iteration": 2.7497265338897705 }, { "auxiliary_loss_clip": 0.01098942, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.04735017, "balance_loss_mlp": 1.02385998, "epoch": 0.41851796182173456, "flos": 20193396503040.0, "grad_norm": 1.8404962312042226, "language_loss": 0.75842559, "learning_rate": 2.614773562290835e-06, "loss": 0.7797904, "num_input_tokens_seen": 149366685, "step": 6961, "time_per_iteration": 2.6800267696380615 }, { "auxiliary_loss_clip": 0.01038031, "auxiliary_loss_mlp": 0.01001682, "balance_loss_clip": 1.03925419, "balance_loss_mlp": 0.99970287, "epoch": 0.41857808507440253, "flos": 59018794231680.0, "grad_norm": 0.7827663866056928, "language_loss": 0.54655838, "learning_rate": 2.61440294487496e-06, "loss": 0.56695551, "num_input_tokens_seen": 149422925, "step": 6962, "time_per_iteration": 3.1537134647369385 }, { "auxiliary_loss_clip": 0.01120288, "auxiliary_loss_mlp": 0.0104634, "balance_loss_clip": 1.04961705, "balance_loss_mlp": 1.0318327, "epoch": 0.4186382083270705, "flos": 18478805719680.0, "grad_norm": 1.960507757786237, "language_loss": 0.85535777, "learning_rate": 2.614032304160864e-06, "loss": 0.87702405, "num_input_tokens_seen": 149440820, "step": 6963, "time_per_iteration": 2.5925374031066895 }, { "auxiliary_loss_clip": 0.01106535, "auxiliary_loss_mlp": 0.01041093, "balance_loss_clip": 1.04856253, "balance_loss_mlp": 1.02657938, "epoch": 0.41869833157973846, "flos": 21578758202880.0, "grad_norm": 1.6555227491445992, "language_loss": 0.70422602, "learning_rate": 2.6136616401626014e-06, "loss": 0.72570229, "num_input_tokens_seen": 149461060, "step": 6964, "time_per_iteration": 2.675595760345459 }, { "auxiliary_loss_clip": 0.01131013, "auxiliary_loss_mlp": 0.01048168, "balance_loss_clip": 1.04926276, "balance_loss_mlp": 1.03433418, "epoch": 0.4187584548324064, "flos": 35517412650240.0, "grad_norm": 2.107779734715906, "language_loss": 0.71486962, "learning_rate": 2.6132909528942273e-06, "loss": 0.73666137, "num_input_tokens_seen": 149483115, "step": 6965, "time_per_iteration": 2.728795289993286 }, { "auxiliary_loss_clip": 0.01081273, "auxiliary_loss_mlp": 0.01038276, "balance_loss_clip": 1.04315698, "balance_loss_mlp": 1.02465594, "epoch": 0.4188185780850744, "flos": 18655880791680.0, "grad_norm": 1.546256806673652, "language_loss": 0.71920437, "learning_rate": 2.6129202423697997e-06, "loss": 0.74039984, "num_input_tokens_seen": 149501495, "step": 6966, "time_per_iteration": 2.9000282287597656 }, { "auxiliary_loss_clip": 0.01127558, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.04965436, "balance_loss_mlp": 1.02194023, "epoch": 0.41887870133774235, "flos": 40333428374400.0, "grad_norm": 2.0539481091161664, "language_loss": 0.71188843, "learning_rate": 2.612549508603375e-06, "loss": 0.73353529, "num_input_tokens_seen": 149523170, "step": 6967, "time_per_iteration": 2.8494174480438232 }, { "auxiliary_loss_clip": 0.01059483, "auxiliary_loss_mlp": 0.01001432, "balance_loss_clip": 1.039819, "balance_loss_mlp": 0.99973947, "epoch": 0.4189388245904103, "flos": 61371336516480.0, "grad_norm": 0.6719582962825281, "language_loss": 0.46191829, "learning_rate": 2.612178751609011e-06, "loss": 0.48252743, "num_input_tokens_seen": 149583955, "step": 6968, "time_per_iteration": 3.2362303733825684 }, { "auxiliary_loss_clip": 0.01123461, "auxiliary_loss_mlp": 0.01043061, "balance_loss_clip": 1.04708195, "balance_loss_mlp": 1.02722979, "epoch": 0.4189989478430783, "flos": 28215624119040.0, "grad_norm": 2.2684151977061386, "language_loss": 0.75044996, "learning_rate": 2.6118079714007685e-06, "loss": 0.77211517, "num_input_tokens_seen": 149604440, "step": 6969, "time_per_iteration": 2.836956739425659 }, { "auxiliary_loss_clip": 0.01108551, "auxiliary_loss_mlp": 0.01045091, "balance_loss_clip": 1.0470643, "balance_loss_mlp": 1.03178096, "epoch": 0.4190590710957463, "flos": 24565879088640.0, "grad_norm": 1.9985372976124152, "language_loss": 0.8083396, "learning_rate": 2.611437167992705e-06, "loss": 0.82987607, "num_input_tokens_seen": 149623745, "step": 6970, "time_per_iteration": 2.7209956645965576 }, { "auxiliary_loss_clip": 0.01119916, "auxiliary_loss_mlp": 0.0104141, "balance_loss_clip": 1.04898238, "balance_loss_mlp": 1.02689075, "epoch": 0.41911919434841427, "flos": 21726027964800.0, "grad_norm": 2.2489196165322713, "language_loss": 0.82699662, "learning_rate": 2.6110663413988835e-06, "loss": 0.84860986, "num_input_tokens_seen": 149643025, "step": 6971, "time_per_iteration": 2.6844992637634277 }, { "auxiliary_loss_clip": 0.01105807, "auxiliary_loss_mlp": 0.01047014, "balance_loss_clip": 1.0493474, "balance_loss_mlp": 1.03207135, "epoch": 0.41917931760108224, "flos": 17601543855360.0, "grad_norm": 1.6553402405348427, "language_loss": 0.74262661, "learning_rate": 2.6106954916333648e-06, "loss": 0.76415479, "num_input_tokens_seen": 149660695, "step": 6972, "time_per_iteration": 2.6240105628967285 }, { "auxiliary_loss_clip": 0.01102199, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.0421176, "balance_loss_mlp": 1.02589083, "epoch": 0.4192394408537502, "flos": 37816701022080.0, "grad_norm": 1.5708676830874608, "language_loss": 0.72811258, "learning_rate": 2.610324618710212e-06, "loss": 0.74953938, "num_input_tokens_seen": 149682040, "step": 6973, "time_per_iteration": 2.8109309673309326 }, { "auxiliary_loss_clip": 0.01101478, "auxiliary_loss_mlp": 0.01038786, "balance_loss_clip": 1.05107093, "balance_loss_mlp": 1.02461183, "epoch": 0.41929956410641817, "flos": 23107726477440.0, "grad_norm": 1.8294609220169469, "language_loss": 0.74864107, "learning_rate": 2.609953722643489e-06, "loss": 0.77004373, "num_input_tokens_seen": 149700855, "step": 6974, "time_per_iteration": 2.7036855220794678 }, { "auxiliary_loss_clip": 0.01117361, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.04402697, "balance_loss_mlp": 1.02359784, "epoch": 0.41935968735908613, "flos": 22524537260160.0, "grad_norm": 1.843462386151443, "language_loss": 0.7271533, "learning_rate": 2.609582803447259e-06, "loss": 0.748703, "num_input_tokens_seen": 149717360, "step": 6975, "time_per_iteration": 2.632661819458008 }, { "auxiliary_loss_clip": 0.01113766, "auxiliary_loss_mlp": 0.01042513, "balance_loss_clip": 1.04679942, "balance_loss_mlp": 1.02849412, "epoch": 0.4194198106117541, "flos": 26870446759680.0, "grad_norm": 1.580698900699299, "language_loss": 0.80874467, "learning_rate": 2.6092118611355885e-06, "loss": 0.83030754, "num_input_tokens_seen": 149738975, "step": 6976, "time_per_iteration": 2.68833327293396 }, { "auxiliary_loss_clip": 0.01098184, "auxiliary_loss_mlp": 0.01042179, "balance_loss_clip": 1.04087496, "balance_loss_mlp": 1.02671123, "epoch": 0.41947993386442206, "flos": 19902412425600.0, "grad_norm": 4.6015574144833264, "language_loss": 0.6767152, "learning_rate": 2.6088408957225425e-06, "loss": 0.69811881, "num_input_tokens_seen": 149757055, "step": 6977, "time_per_iteration": 2.6453959941864014 }, { "auxiliary_loss_clip": 0.01122702, "auxiliary_loss_mlp": 0.0104277, "balance_loss_clip": 1.04980922, "balance_loss_mlp": 1.02926338, "epoch": 0.41954005711709, "flos": 17383889393280.0, "grad_norm": 2.3946463459425966, "language_loss": 0.80506754, "learning_rate": 2.6084699072221898e-06, "loss": 0.82672226, "num_input_tokens_seen": 149772885, "step": 6978, "time_per_iteration": 2.596269369125366 }, { "auxiliary_loss_clip": 0.01133146, "auxiliary_loss_mlp": 0.0103908, "balance_loss_clip": 1.04677558, "balance_loss_mlp": 1.02459598, "epoch": 0.419600180369758, "flos": 25003306915200.0, "grad_norm": 1.7226002389356767, "language_loss": 0.82708085, "learning_rate": 2.6080988956485964e-06, "loss": 0.84880304, "num_input_tokens_seen": 149791515, "step": 6979, "time_per_iteration": 2.588383197784424 }, { "auxiliary_loss_clip": 0.01129014, "auxiliary_loss_mlp": 0.01037351, "balance_loss_clip": 1.04659355, "balance_loss_mlp": 1.02302253, "epoch": 0.41966030362242596, "flos": 17383781652480.0, "grad_norm": 2.4214608222579206, "language_loss": 0.83723533, "learning_rate": 2.6077278610158325e-06, "loss": 0.85889894, "num_input_tokens_seen": 149807250, "step": 6980, "time_per_iteration": 2.5890002250671387 }, { "auxiliary_loss_clip": 0.01132913, "auxiliary_loss_mlp": 0.01043925, "balance_loss_clip": 1.04753232, "balance_loss_mlp": 1.02994215, "epoch": 0.4197204268750939, "flos": 22156165330560.0, "grad_norm": 2.919161771051539, "language_loss": 0.7951659, "learning_rate": 2.6073568033379665e-06, "loss": 0.81693423, "num_input_tokens_seen": 149821640, "step": 6981, "time_per_iteration": 2.6015915870666504 }, { "auxiliary_loss_clip": 0.01096505, "auxiliary_loss_mlp": 0.01037263, "balance_loss_clip": 1.04636097, "balance_loss_mlp": 1.02382243, "epoch": 0.4197805501277619, "flos": 22084128604800.0, "grad_norm": 2.285836698514787, "language_loss": 0.84386683, "learning_rate": 2.6069857226290696e-06, "loss": 0.86520445, "num_input_tokens_seen": 149840545, "step": 6982, "time_per_iteration": 2.755657434463501 }, { "auxiliary_loss_clip": 0.01120032, "auxiliary_loss_mlp": 0.01038775, "balance_loss_clip": 1.04708028, "balance_loss_mlp": 1.02419019, "epoch": 0.4198406733804299, "flos": 26432192920320.0, "grad_norm": 2.941579449236281, "language_loss": 0.57212174, "learning_rate": 2.606614618903214e-06, "loss": 0.59370977, "num_input_tokens_seen": 149860375, "step": 6983, "time_per_iteration": 2.699927568435669 }, { "auxiliary_loss_clip": 0.01120799, "auxiliary_loss_mlp": 0.01037958, "balance_loss_clip": 1.05017662, "balance_loss_mlp": 1.02513719, "epoch": 0.4199007966330979, "flos": 12531029293440.0, "grad_norm": 1.788715678149628, "language_loss": 0.82569104, "learning_rate": 2.606243492174471e-06, "loss": 0.84727859, "num_input_tokens_seen": 149877850, "step": 6984, "time_per_iteration": 2.6608574390411377 }, { "auxiliary_loss_clip": 0.01110821, "auxiliary_loss_mlp": 0.01031336, "balance_loss_clip": 1.04403567, "balance_loss_mlp": 1.01740074, "epoch": 0.41996091988576584, "flos": 21762944167680.0, "grad_norm": 1.8578510762238896, "language_loss": 0.79251826, "learning_rate": 2.605872342456914e-06, "loss": 0.81393987, "num_input_tokens_seen": 149896110, "step": 6985, "time_per_iteration": 2.6915009021759033 }, { "auxiliary_loss_clip": 0.01134356, "auxiliary_loss_mlp": 0.01037444, "balance_loss_clip": 1.04694271, "balance_loss_mlp": 1.02278078, "epoch": 0.4200210431384338, "flos": 26541935948160.0, "grad_norm": 1.6735394330256788, "language_loss": 0.78439772, "learning_rate": 2.6055011697646173e-06, "loss": 0.80611569, "num_input_tokens_seen": 149916495, "step": 6986, "time_per_iteration": 2.6553595066070557 }, { "auxiliary_loss_clip": 0.01108367, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.04705167, "balance_loss_mlp": 1.01957011, "epoch": 0.42008116639110177, "flos": 26795824254720.0, "grad_norm": 1.6966099884396408, "language_loss": 0.72624969, "learning_rate": 2.605129974111655e-06, "loss": 0.7476564, "num_input_tokens_seen": 149936445, "step": 6987, "time_per_iteration": 2.7428104877471924 }, { "auxiliary_loss_clip": 0.01105896, "auxiliary_loss_mlp": 0.00774749, "balance_loss_clip": 1.04440594, "balance_loss_mlp": 1.00098395, "epoch": 0.42014128964376973, "flos": 32087333243520.0, "grad_norm": 1.4394465087417463, "language_loss": 0.74992245, "learning_rate": 2.604758755512104e-06, "loss": 0.76872891, "num_input_tokens_seen": 149959430, "step": 6988, "time_per_iteration": 4.499454975128174 }, { "auxiliary_loss_clip": 0.01124153, "auxiliary_loss_mlp": 0.01040193, "balance_loss_clip": 1.04908502, "balance_loss_mlp": 1.02585781, "epoch": 0.4202014128964377, "flos": 26467133875200.0, "grad_norm": 1.6029470393888554, "language_loss": 0.73995304, "learning_rate": 2.60438751398004e-06, "loss": 0.76159656, "num_input_tokens_seen": 149980365, "step": 6989, "time_per_iteration": 2.6979968547821045 }, { "auxiliary_loss_clip": 0.01109104, "auxiliary_loss_mlp": 0.01037728, "balance_loss_clip": 1.04531431, "balance_loss_mlp": 1.02353013, "epoch": 0.42026153614910566, "flos": 13401216178560.0, "grad_norm": 2.8939358842188043, "language_loss": 0.70562875, "learning_rate": 2.6040162495295404e-06, "loss": 0.72709703, "num_input_tokens_seen": 149997375, "step": 6990, "time_per_iteration": 4.2269814014434814 }, { "auxiliary_loss_clip": 0.01052428, "auxiliary_loss_mlp": 0.00753318, "balance_loss_clip": 1.03888559, "balance_loss_mlp": 1.00109041, "epoch": 0.42032165940177363, "flos": 60250457635200.0, "grad_norm": 1.417771869116233, "language_loss": 0.60470819, "learning_rate": 2.603644962174685e-06, "loss": 0.62276566, "num_input_tokens_seen": 150051230, "step": 6991, "time_per_iteration": 4.600361585617065 }, { "auxiliary_loss_clip": 0.01135512, "auxiliary_loss_mlp": 0.01038632, "balance_loss_clip": 1.05044973, "balance_loss_mlp": 1.02417135, "epoch": 0.4203817826544416, "flos": 24535211852160.0, "grad_norm": 1.4766426515770763, "language_loss": 0.832901, "learning_rate": 2.6032736519295517e-06, "loss": 0.85464245, "num_input_tokens_seen": 150071135, "step": 6992, "time_per_iteration": 2.688693046569824 }, { "auxiliary_loss_clip": 0.01058225, "auxiliary_loss_mlp": 0.01016781, "balance_loss_clip": 1.02967906, "balance_loss_mlp": 1.01523161, "epoch": 0.42044190590710956, "flos": 58820781530880.0, "grad_norm": 0.8077151468791776, "language_loss": 0.65494478, "learning_rate": 2.6029023188082217e-06, "loss": 0.67569482, "num_input_tokens_seen": 150125220, "step": 6993, "time_per_iteration": 4.7132039070129395 }, { "auxiliary_loss_clip": 0.011371, "auxiliary_loss_mlp": 0.010386, "balance_loss_clip": 1.04959965, "balance_loss_mlp": 1.02267361, "epoch": 0.4205020291597775, "flos": 16436063260800.0, "grad_norm": 1.948890763784571, "language_loss": 0.83380342, "learning_rate": 2.6025309628247746e-06, "loss": 0.85556042, "num_input_tokens_seen": 150142300, "step": 6994, "time_per_iteration": 2.5883679389953613 }, { "auxiliary_loss_clip": 0.01120964, "auxiliary_loss_mlp": 0.00771063, "balance_loss_clip": 1.04939461, "balance_loss_mlp": 1.00095451, "epoch": 0.4205621524124455, "flos": 18405655672320.0, "grad_norm": 1.5483229522184627, "language_loss": 0.78529471, "learning_rate": 2.6021595839932934e-06, "loss": 0.80421495, "num_input_tokens_seen": 150161345, "step": 6995, "time_per_iteration": 2.716649055480957 }, { "auxiliary_loss_clip": 0.0109323, "auxiliary_loss_mlp": 0.01032227, "balance_loss_clip": 1.04375339, "balance_loss_mlp": 1.01855421, "epoch": 0.4206222756651135, "flos": 25520097841920.0, "grad_norm": 1.4060831947988737, "language_loss": 0.80397403, "learning_rate": 2.60178818232786e-06, "loss": 0.82522857, "num_input_tokens_seen": 150182420, "step": 6996, "time_per_iteration": 2.773655891418457 }, { "auxiliary_loss_clip": 0.01111456, "auxiliary_loss_mlp": 0.00771084, "balance_loss_clip": 1.0477984, "balance_loss_mlp": 1.00100029, "epoch": 0.4206823989177815, "flos": 15304338472320.0, "grad_norm": 1.9934224916744, "language_loss": 0.7558648, "learning_rate": 2.601416757842559e-06, "loss": 0.77469015, "num_input_tokens_seen": 150200175, "step": 6997, "time_per_iteration": 2.6486191749572754 }, { "auxiliary_loss_clip": 0.01130573, "auxiliary_loss_mlp": 0.01042531, "balance_loss_clip": 1.04606771, "balance_loss_mlp": 1.02835727, "epoch": 0.42074252217044944, "flos": 15554096714880.0, "grad_norm": 3.451993658012451, "language_loss": 0.75860173, "learning_rate": 2.6010453105514743e-06, "loss": 0.78033274, "num_input_tokens_seen": 150217100, "step": 6998, "time_per_iteration": 2.548783540725708 }, { "auxiliary_loss_clip": 0.01136566, "auxiliary_loss_mlp": 0.01042996, "balance_loss_clip": 1.05027032, "balance_loss_mlp": 1.02827394, "epoch": 0.4208026454231174, "flos": 26145877610880.0, "grad_norm": 1.6802908884651202, "language_loss": 0.76294345, "learning_rate": 2.60067384046869e-06, "loss": 0.78473908, "num_input_tokens_seen": 150239830, "step": 6999, "time_per_iteration": 2.6605780124664307 }, { "auxiliary_loss_clip": 0.01082307, "auxiliary_loss_mlp": 0.01039523, "balance_loss_clip": 1.04213417, "balance_loss_mlp": 1.02420449, "epoch": 0.42086276867578537, "flos": 23550110380800.0, "grad_norm": 2.828142255503796, "language_loss": 0.64361006, "learning_rate": 2.600302347608295e-06, "loss": 0.66482836, "num_input_tokens_seen": 150260690, "step": 7000, "time_per_iteration": 2.7295126914978027 }, { "auxiliary_loss_clip": 0.01089826, "auxiliary_loss_mlp": 0.01039051, "balance_loss_clip": 1.04259682, "balance_loss_mlp": 1.02433491, "epoch": 0.42092289192845334, "flos": 18113414618880.0, "grad_norm": 2.276209987309232, "language_loss": 0.76550955, "learning_rate": 2.5999308319843743e-06, "loss": 0.78679836, "num_input_tokens_seen": 150279885, "step": 7001, "time_per_iteration": 2.793407917022705 }, { "auxiliary_loss_clip": 0.01091534, "auxiliary_loss_mlp": 0.00771163, "balance_loss_clip": 1.04483819, "balance_loss_mlp": 1.00107491, "epoch": 0.4209830151811213, "flos": 20006588845440.0, "grad_norm": 1.4928891465725471, "language_loss": 0.86682802, "learning_rate": 2.5995592936110154e-06, "loss": 0.88545501, "num_input_tokens_seen": 150297390, "step": 7002, "time_per_iteration": 2.719127655029297 }, { "auxiliary_loss_clip": 0.0109333, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.04801917, "balance_loss_mlp": 1.02297568, "epoch": 0.42104313843378927, "flos": 21978946604160.0, "grad_norm": 1.8843999139097827, "language_loss": 0.67807466, "learning_rate": 2.5991877325023096e-06, "loss": 0.6993705, "num_input_tokens_seen": 150317390, "step": 7003, "time_per_iteration": 2.732848882675171 }, { "auxiliary_loss_clip": 0.01132341, "auxiliary_loss_mlp": 0.01035322, "balance_loss_clip": 1.04725492, "balance_loss_mlp": 1.02031398, "epoch": 0.42110326168645723, "flos": 25443966965760.0, "grad_norm": 1.9778982096910334, "language_loss": 0.77774739, "learning_rate": 2.598816148672344e-06, "loss": 0.79942405, "num_input_tokens_seen": 150337455, "step": 7004, "time_per_iteration": 2.630838394165039 }, { "auxiliary_loss_clip": 0.01129987, "auxiliary_loss_mlp": 0.0103854, "balance_loss_clip": 1.04988933, "balance_loss_mlp": 1.02351916, "epoch": 0.4211633849391252, "flos": 17822574195840.0, "grad_norm": 2.0674356984544557, "language_loss": 0.67855948, "learning_rate": 2.59844454213521e-06, "loss": 0.70024478, "num_input_tokens_seen": 150355385, "step": 7005, "time_per_iteration": 2.588533401489258 }, { "auxiliary_loss_clip": 0.01121703, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.0483923, "balance_loss_mlp": 1.01941752, "epoch": 0.42122350819179316, "flos": 16282436791680.0, "grad_norm": 1.9633544911967673, "language_loss": 0.72481513, "learning_rate": 2.5980729129049994e-06, "loss": 0.74636805, "num_input_tokens_seen": 150371750, "step": 7006, "time_per_iteration": 2.5879828929901123 }, { "auxiliary_loss_clip": 0.01133912, "auxiliary_loss_mlp": 0.01032205, "balance_loss_clip": 1.04963207, "balance_loss_mlp": 1.01787031, "epoch": 0.4212836314444611, "flos": 19645866512640.0, "grad_norm": 1.722108681435548, "language_loss": 0.70495522, "learning_rate": 2.5977012609958033e-06, "loss": 0.72661638, "num_input_tokens_seen": 150389955, "step": 7007, "time_per_iteration": 2.5199153423309326 }, { "auxiliary_loss_clip": 0.0110564, "auxiliary_loss_mlp": 0.00771949, "balance_loss_clip": 1.04377306, "balance_loss_mlp": 1.00098372, "epoch": 0.4213437546971291, "flos": 18369026778240.0, "grad_norm": 1.772679877033185, "language_loss": 0.82893503, "learning_rate": 2.5973295864217166e-06, "loss": 0.84771085, "num_input_tokens_seen": 150405780, "step": 7008, "time_per_iteration": 2.6636033058166504 }, { "auxiliary_loss_clip": 0.01089865, "auxiliary_loss_mlp": 0.01039598, "balance_loss_clip": 1.04483509, "balance_loss_mlp": 1.02535856, "epoch": 0.42140387794979706, "flos": 27704507541120.0, "grad_norm": 1.895033591472922, "language_loss": 0.72206765, "learning_rate": 2.596957889196831e-06, "loss": 0.74336231, "num_input_tokens_seen": 150425615, "step": 7009, "time_per_iteration": 2.738678216934204 }, { "auxiliary_loss_clip": 0.01132456, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.04812074, "balance_loss_mlp": 1.01674712, "epoch": 0.4214640012024651, "flos": 28147071012480.0, "grad_norm": 2.558025018080716, "language_loss": 0.66191494, "learning_rate": 2.596586169335243e-06, "loss": 0.68354768, "num_input_tokens_seen": 150445765, "step": 7010, "time_per_iteration": 2.6812071800231934 }, { "auxiliary_loss_clip": 0.01092262, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.0424943, "balance_loss_mlp": 1.01774001, "epoch": 0.42152412445513304, "flos": 22997265177600.0, "grad_norm": 2.024875050938184, "language_loss": 0.72456133, "learning_rate": 2.5962144268510477e-06, "loss": 0.74580765, "num_input_tokens_seen": 150464405, "step": 7011, "time_per_iteration": 2.741454601287842 }, { "auxiliary_loss_clip": 0.01046137, "auxiliary_loss_mlp": 0.01001201, "balance_loss_clip": 1.02718639, "balance_loss_mlp": 0.99971068, "epoch": 0.421584247707801, "flos": 63749592938880.0, "grad_norm": 0.7906258228604641, "language_loss": 0.54322207, "learning_rate": 2.5958426617583417e-06, "loss": 0.56369549, "num_input_tokens_seen": 150520430, "step": 7012, "time_per_iteration": 3.1284689903259277 }, { "auxiliary_loss_clip": 0.01123004, "auxiliary_loss_mlp": 0.01031037, "balance_loss_clip": 1.05000162, "balance_loss_mlp": 1.01663089, "epoch": 0.421644370960469, "flos": 24314612474880.0, "grad_norm": 1.3828895368097467, "language_loss": 0.78401852, "learning_rate": 2.5954708740712215e-06, "loss": 0.80555892, "num_input_tokens_seen": 150542610, "step": 7013, "time_per_iteration": 2.6729819774627686 }, { "auxiliary_loss_clip": 0.01133162, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.04858398, "balance_loss_mlp": 1.01826453, "epoch": 0.42170449421313694, "flos": 23440690575360.0, "grad_norm": 1.8094728177732207, "language_loss": 0.81603825, "learning_rate": 2.595099063803787e-06, "loss": 0.83770084, "num_input_tokens_seen": 150560970, "step": 7014, "time_per_iteration": 2.662652015686035 }, { "auxiliary_loss_clip": 0.01117627, "auxiliary_loss_mlp": 0.0103256, "balance_loss_clip": 1.04452634, "balance_loss_mlp": 1.01831448, "epoch": 0.4217646174658049, "flos": 23695476721920.0, "grad_norm": 1.7861369926261594, "language_loss": 0.77908784, "learning_rate": 2.5947272309701354e-06, "loss": 0.80058968, "num_input_tokens_seen": 150582615, "step": 7015, "time_per_iteration": 2.763761043548584 }, { "auxiliary_loss_clip": 0.01132815, "auxiliary_loss_mlp": 0.01036697, "balance_loss_clip": 1.04966104, "balance_loss_mlp": 1.02183151, "epoch": 0.42182474071847287, "flos": 24971562270720.0, "grad_norm": 1.3268186837565954, "language_loss": 0.82412994, "learning_rate": 2.594355375584368e-06, "loss": 0.84582508, "num_input_tokens_seen": 150603640, "step": 7016, "time_per_iteration": 2.771812677383423 }, { "auxiliary_loss_clip": 0.01091213, "auxiliary_loss_mlp": 0.0103466, "balance_loss_clip": 1.04072332, "balance_loss_mlp": 1.01999736, "epoch": 0.42188486397114083, "flos": 22856639431680.0, "grad_norm": 1.813350419138722, "language_loss": 0.68270308, "learning_rate": 2.593983497660586e-06, "loss": 0.70396179, "num_input_tokens_seen": 150622490, "step": 7017, "time_per_iteration": 2.703078508377075 }, { "auxiliary_loss_clip": 0.01045206, "auxiliary_loss_mlp": 0.01012048, "balance_loss_clip": 1.02663231, "balance_loss_mlp": 1.01053989, "epoch": 0.4219449872238088, "flos": 66975700965120.0, "grad_norm": 0.7659311952437052, "language_loss": 0.59381223, "learning_rate": 2.5936115972128895e-06, "loss": 0.61438477, "num_input_tokens_seen": 150689545, "step": 7018, "time_per_iteration": 3.2514843940734863 }, { "auxiliary_loss_clip": 0.01113322, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.04147184, "balance_loss_mlp": 1.01840591, "epoch": 0.42200511047647676, "flos": 13115367745920.0, "grad_norm": 2.3056993234384957, "language_loss": 0.75083554, "learning_rate": 2.593239674255382e-06, "loss": 0.77229911, "num_input_tokens_seen": 150707610, "step": 7019, "time_per_iteration": 2.6845014095306396 }, { "auxiliary_loss_clip": 0.01106969, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.04650903, "balance_loss_mlp": 1.02023685, "epoch": 0.42206523372914473, "flos": 13991193066240.0, "grad_norm": 1.8835929197669175, "language_loss": 0.69198954, "learning_rate": 2.592867728802166e-06, "loss": 0.71341467, "num_input_tokens_seen": 150724530, "step": 7020, "time_per_iteration": 2.635646343231201 }, { "auxiliary_loss_clip": 0.01107351, "auxiliary_loss_mlp": 0.00771638, "balance_loss_clip": 1.04847479, "balance_loss_mlp": 1.00088549, "epoch": 0.4221253569818127, "flos": 21942317710080.0, "grad_norm": 3.182010152232146, "language_loss": 0.81085485, "learning_rate": 2.592495760867347e-06, "loss": 0.82964474, "num_input_tokens_seen": 150742870, "step": 7021, "time_per_iteration": 2.712358236312866 }, { "auxiliary_loss_clip": 0.0105744, "auxiliary_loss_mlp": 0.01040763, "balance_loss_clip": 1.03628528, "balance_loss_mlp": 1.02439523, "epoch": 0.42218548023448066, "flos": 32192587071360.0, "grad_norm": 1.7516152237568758, "language_loss": 0.70298421, "learning_rate": 2.5921237704650293e-06, "loss": 0.72396624, "num_input_tokens_seen": 150765500, "step": 7022, "time_per_iteration": 2.9338343143463135 }, { "auxiliary_loss_clip": 0.01114774, "auxiliary_loss_mlp": 0.01028964, "balance_loss_clip": 1.0467478, "balance_loss_mlp": 1.01637506, "epoch": 0.4222456034871487, "flos": 30118961894400.0, "grad_norm": 1.5162864908148717, "language_loss": 0.67418218, "learning_rate": 2.5917517576093188e-06, "loss": 0.69561946, "num_input_tokens_seen": 150784945, "step": 7023, "time_per_iteration": 2.7014782428741455 }, { "auxiliary_loss_clip": 0.01101297, "auxiliary_loss_mlp": 0.01042754, "balance_loss_clip": 1.0460372, "balance_loss_mlp": 1.0259577, "epoch": 0.42230572673981664, "flos": 22127904305280.0, "grad_norm": 1.6579428625462107, "language_loss": 0.69768953, "learning_rate": 2.591379722314322e-06, "loss": 0.71913004, "num_input_tokens_seen": 150803120, "step": 7024, "time_per_iteration": 2.8669025897979736 }, { "auxiliary_loss_clip": 0.011321, "auxiliary_loss_mlp": 0.01035188, "balance_loss_clip": 1.04982734, "balance_loss_mlp": 1.02107334, "epoch": 0.4223658499924846, "flos": 22055077480320.0, "grad_norm": 1.7199232023790467, "language_loss": 0.76781225, "learning_rate": 2.591007664594147e-06, "loss": 0.7894851, "num_input_tokens_seen": 150823135, "step": 7025, "time_per_iteration": 2.696200132369995 }, { "auxiliary_loss_clip": 0.01097355, "auxiliary_loss_mlp": 0.01036622, "balance_loss_clip": 1.04367328, "balance_loss_mlp": 1.02268052, "epoch": 0.4224259732451526, "flos": 20410727742720.0, "grad_norm": 1.6766870979897237, "language_loss": 0.79664457, "learning_rate": 2.5906355844629024e-06, "loss": 0.81798434, "num_input_tokens_seen": 150842070, "step": 7026, "time_per_iteration": 2.7131056785583496 }, { "auxiliary_loss_clip": 0.01053, "auxiliary_loss_mlp": 0.00999983, "balance_loss_clip": 1.02519512, "balance_loss_mlp": 0.9985466, "epoch": 0.42248609649782054, "flos": 62846655828480.0, "grad_norm": 0.7210787168966012, "language_loss": 0.61874068, "learning_rate": 2.5902634819346966e-06, "loss": 0.63927048, "num_input_tokens_seen": 150907450, "step": 7027, "time_per_iteration": 3.2111167907714844 }, { "auxiliary_loss_clip": 0.01131577, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.05022967, "balance_loss_mlp": 1.02400613, "epoch": 0.4225462197504885, "flos": 26249946289920.0, "grad_norm": 1.8872379728212205, "language_loss": 0.71137869, "learning_rate": 2.5898913570236414e-06, "loss": 0.7330761, "num_input_tokens_seen": 150928040, "step": 7028, "time_per_iteration": 4.185323476791382 }, { "auxiliary_loss_clip": 0.01109127, "auxiliary_loss_mlp": 0.01041278, "balance_loss_clip": 1.04935491, "balance_loss_mlp": 1.02702022, "epoch": 0.42260634300315647, "flos": 20521943228160.0, "grad_norm": 3.7456767842675136, "language_loss": 0.82652044, "learning_rate": 2.589519209743846e-06, "loss": 0.84802449, "num_input_tokens_seen": 150945760, "step": 7029, "time_per_iteration": 2.617464542388916 }, { "auxiliary_loss_clip": 0.01086316, "auxiliary_loss_mlp": 0.01043345, "balance_loss_clip": 1.04393244, "balance_loss_mlp": 1.02826512, "epoch": 0.42266646625582444, "flos": 24316731377280.0, "grad_norm": 1.852504104659585, "language_loss": 0.75125468, "learning_rate": 2.589147040109424e-06, "loss": 0.7725513, "num_input_tokens_seen": 150965665, "step": 7030, "time_per_iteration": 5.787954807281494 }, { "auxiliary_loss_clip": 0.01129772, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.04772067, "balance_loss_mlp": 1.02368367, "epoch": 0.4227265895084924, "flos": 24204151175040.0, "grad_norm": 1.9107182577124318, "language_loss": 0.86337131, "learning_rate": 2.588774848134486e-06, "loss": 0.88506097, "num_input_tokens_seen": 150982260, "step": 7031, "time_per_iteration": 2.622174024581909 }, { "auxiliary_loss_clip": 0.01120469, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.04873753, "balance_loss_mlp": 1.0234381, "epoch": 0.42278671276116037, "flos": 16909760845440.0, "grad_norm": 1.9974648735142886, "language_loss": 0.73489487, "learning_rate": 2.5884026338331473e-06, "loss": 0.75648719, "num_input_tokens_seen": 150999990, "step": 7032, "time_per_iteration": 2.681155204772949 }, { "auxiliary_loss_clip": 0.01100841, "auxiliary_loss_mlp": 0.01044575, "balance_loss_clip": 1.04449272, "balance_loss_mlp": 1.029531, "epoch": 0.42284683601382833, "flos": 25411073086080.0, "grad_norm": 1.657781585480679, "language_loss": 0.70232797, "learning_rate": 2.5880303972195222e-06, "loss": 0.72378218, "num_input_tokens_seen": 151021105, "step": 7033, "time_per_iteration": 4.264399290084839 }, { "auxiliary_loss_clip": 0.01105188, "auxiliary_loss_mlp": 0.00773118, "balance_loss_clip": 1.04417682, "balance_loss_mlp": 1.00101566, "epoch": 0.4229069592664963, "flos": 23040322606080.0, "grad_norm": 2.084860036541982, "language_loss": 0.90209413, "learning_rate": 2.5876581383077256e-06, "loss": 0.92087722, "num_input_tokens_seen": 151040665, "step": 7034, "time_per_iteration": 2.6903390884399414 }, { "auxiliary_loss_clip": 0.01107447, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.04703283, "balance_loss_mlp": 1.02456498, "epoch": 0.42296708251916426, "flos": 26067448264320.0, "grad_norm": 1.854470548564886, "language_loss": 0.77645576, "learning_rate": 2.5872858571118723e-06, "loss": 0.79791045, "num_input_tokens_seen": 151061240, "step": 7035, "time_per_iteration": 2.839463233947754 }, { "auxiliary_loss_clip": 0.01118463, "auxiliary_loss_mlp": 0.01043438, "balance_loss_clip": 1.04904413, "balance_loss_mlp": 1.02879918, "epoch": 0.4230272057718323, "flos": 19458376496640.0, "grad_norm": 1.8047665428966375, "language_loss": 0.82544887, "learning_rate": 2.5869135536460817e-06, "loss": 0.84706789, "num_input_tokens_seen": 151076870, "step": 7036, "time_per_iteration": 2.7344322204589844 }, { "auxiliary_loss_clip": 0.01105244, "auxiliary_loss_mlp": 0.01037982, "balance_loss_clip": 1.04819334, "balance_loss_mlp": 1.02430892, "epoch": 0.42308732902450025, "flos": 22383300983040.0, "grad_norm": 1.7884357315749977, "language_loss": 0.70379841, "learning_rate": 2.58654122792447e-06, "loss": 0.72523069, "num_input_tokens_seen": 151095110, "step": 7037, "time_per_iteration": 2.7701706886291504 }, { "auxiliary_loss_clip": 0.01088589, "auxiliary_loss_mlp": 0.00773432, "balance_loss_clip": 1.04192328, "balance_loss_mlp": 1.00089622, "epoch": 0.4231474522771682, "flos": 20995425331200.0, "grad_norm": 1.6174527275157642, "language_loss": 0.78031301, "learning_rate": 2.586168879961155e-06, "loss": 0.79893327, "num_input_tokens_seen": 151114355, "step": 7038, "time_per_iteration": 2.7142980098724365 }, { "auxiliary_loss_clip": 0.01093843, "auxiliary_loss_mlp": 0.01045553, "balance_loss_clip": 1.04870033, "balance_loss_mlp": 1.02938843, "epoch": 0.4232075755298362, "flos": 14975863574400.0, "grad_norm": 2.472987059089125, "language_loss": 0.67238259, "learning_rate": 2.585796509770259e-06, "loss": 0.69377655, "num_input_tokens_seen": 151131505, "step": 7039, "time_per_iteration": 2.723700761795044 }, { "auxiliary_loss_clip": 0.01126742, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.04828668, "balance_loss_mlp": 1.02421153, "epoch": 0.42326769878250414, "flos": 24532661986560.0, "grad_norm": 2.3861719735257627, "language_loss": 0.75643921, "learning_rate": 2.5854241173658996e-06, "loss": 0.77810442, "num_input_tokens_seen": 151151555, "step": 7040, "time_per_iteration": 2.6909239292144775 }, { "auxiliary_loss_clip": 0.01120351, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.04682565, "balance_loss_mlp": 1.01907206, "epoch": 0.4233278220351721, "flos": 26870303105280.0, "grad_norm": 1.612614450493485, "language_loss": 0.6520682, "learning_rate": 2.5850517027621996e-06, "loss": 0.67360854, "num_input_tokens_seen": 151172385, "step": 7041, "time_per_iteration": 2.705819845199585 }, { "auxiliary_loss_clip": 0.01105037, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.04526758, "balance_loss_mlp": 1.02961886, "epoch": 0.4233879452878401, "flos": 42814927463040.0, "grad_norm": 1.8077043446733942, "language_loss": 0.74725586, "learning_rate": 2.5846792659732803e-06, "loss": 0.76875484, "num_input_tokens_seen": 151194930, "step": 7042, "time_per_iteration": 2.8701279163360596 }, { "auxiliary_loss_clip": 0.01118432, "auxiliary_loss_mlp": 0.01041709, "balance_loss_clip": 1.04900146, "balance_loss_mlp": 1.02783322, "epoch": 0.42344806854050804, "flos": 25229006023680.0, "grad_norm": 1.5999390710673906, "language_loss": 0.82543206, "learning_rate": 2.5843068070132643e-06, "loss": 0.84703344, "num_input_tokens_seen": 151217905, "step": 7043, "time_per_iteration": 2.7351741790771484 }, { "auxiliary_loss_clip": 0.01110906, "auxiliary_loss_mlp": 0.01054459, "balance_loss_clip": 1.04981089, "balance_loss_mlp": 1.0383476, "epoch": 0.423508191793176, "flos": 22778820616320.0, "grad_norm": 4.941461848597107, "language_loss": 0.64840907, "learning_rate": 2.5839343258962763e-06, "loss": 0.67006272, "num_input_tokens_seen": 151234580, "step": 7044, "time_per_iteration": 2.729717969894409 }, { "auxiliary_loss_clip": 0.01118394, "auxiliary_loss_mlp": 0.01056481, "balance_loss_clip": 1.04780793, "balance_loss_mlp": 1.04023242, "epoch": 0.42356831504584397, "flos": 34637493179520.0, "grad_norm": 4.901784512002612, "language_loss": 0.75249708, "learning_rate": 2.5835618226364393e-06, "loss": 0.77424586, "num_input_tokens_seen": 151254765, "step": 7045, "time_per_iteration": 2.768423557281494 }, { "auxiliary_loss_clip": 0.0109684, "auxiliary_loss_mlp": 0.0105935, "balance_loss_clip": 1.04820228, "balance_loss_mlp": 1.04277968, "epoch": 0.42362843829851193, "flos": 17596767346560.0, "grad_norm": 2.3365752409002027, "language_loss": 0.80862033, "learning_rate": 2.5831892972478797e-06, "loss": 0.83018219, "num_input_tokens_seen": 151269045, "step": 7046, "time_per_iteration": 2.778648614883423 }, { "auxiliary_loss_clip": 0.01050075, "auxiliary_loss_mlp": 0.01043729, "balance_loss_clip": 1.04536414, "balance_loss_mlp": 1.02847028, "epoch": 0.4236885615511799, "flos": 22565691267840.0, "grad_norm": 1.629581050390514, "language_loss": 0.76806176, "learning_rate": 2.5828167497447242e-06, "loss": 0.78899974, "num_input_tokens_seen": 151287530, "step": 7047, "time_per_iteration": 2.957385301589966 }, { "auxiliary_loss_clip": 0.01132762, "auxiliary_loss_mlp": 0.01044489, "balance_loss_clip": 1.05149937, "balance_loss_mlp": 1.03061271, "epoch": 0.42374868480384786, "flos": 26469216864000.0, "grad_norm": 2.0123660706562294, "language_loss": 0.68135488, "learning_rate": 2.582444180141098e-06, "loss": 0.70312738, "num_input_tokens_seen": 151308905, "step": 7048, "time_per_iteration": 2.976609468460083 }, { "auxiliary_loss_clip": 0.01119986, "auxiliary_loss_mlp": 0.0104419, "balance_loss_clip": 1.04684722, "balance_loss_mlp": 1.02822733, "epoch": 0.4238088080565159, "flos": 20370220179840.0, "grad_norm": 1.9442365727521234, "language_loss": 0.78292572, "learning_rate": 2.5820715884511307e-06, "loss": 0.80456746, "num_input_tokens_seen": 151326525, "step": 7049, "time_per_iteration": 2.7592408657073975 }, { "auxiliary_loss_clip": 0.01128638, "auxiliary_loss_mlp": 0.0105084, "balance_loss_clip": 1.05336547, "balance_loss_mlp": 1.03632045, "epoch": 0.42386893130918385, "flos": 21172105353600.0, "grad_norm": 1.9473547987347861, "language_loss": 0.82839847, "learning_rate": 2.5816989746889504e-06, "loss": 0.85019326, "num_input_tokens_seen": 151344675, "step": 7050, "time_per_iteration": 2.70487117767334 }, { "auxiliary_loss_clip": 0.01132896, "auxiliary_loss_mlp": 0.01042146, "balance_loss_clip": 1.04812455, "balance_loss_mlp": 1.02791238, "epoch": 0.4239290545618518, "flos": 17675627656320.0, "grad_norm": 2.6140682586064754, "language_loss": 0.73742986, "learning_rate": 2.581326338868687e-06, "loss": 0.75918031, "num_input_tokens_seen": 151360730, "step": 7051, "time_per_iteration": 2.6406943798065186 }, { "auxiliary_loss_clip": 0.01103657, "auxiliary_loss_mlp": 0.0104179, "balance_loss_clip": 1.05070043, "balance_loss_mlp": 1.02773547, "epoch": 0.4239891778145198, "flos": 24314504734080.0, "grad_norm": 1.6610077810318091, "language_loss": 0.86273873, "learning_rate": 2.5809536810044706e-06, "loss": 0.88419318, "num_input_tokens_seen": 151380445, "step": 7052, "time_per_iteration": 2.7759416103363037 }, { "auxiliary_loss_clip": 0.01106373, "auxiliary_loss_mlp": 0.01058935, "balance_loss_clip": 1.04475808, "balance_loss_mlp": 1.04325902, "epoch": 0.42404930106718774, "flos": 20558428467840.0, "grad_norm": 2.094212061505075, "language_loss": 0.72460884, "learning_rate": 2.5805810011104323e-06, "loss": 0.74626195, "num_input_tokens_seen": 151399325, "step": 7053, "time_per_iteration": 2.6969964504241943 }, { "auxiliary_loss_clip": 0.0110264, "auxiliary_loss_mlp": 0.00773448, "balance_loss_clip": 1.05001807, "balance_loss_mlp": 1.00098944, "epoch": 0.4241094243198557, "flos": 22308067946880.0, "grad_norm": 7.333766574531878, "language_loss": 0.82380986, "learning_rate": 2.580208299200704e-06, "loss": 0.84257072, "num_input_tokens_seen": 151417240, "step": 7054, "time_per_iteration": 2.71956205368042 }, { "auxiliary_loss_clip": 0.01052303, "auxiliary_loss_mlp": 0.01036407, "balance_loss_clip": 1.03336191, "balance_loss_mlp": 1.03490484, "epoch": 0.4241695475725237, "flos": 70612445272320.0, "grad_norm": 0.7897337987883358, "language_loss": 0.60378659, "learning_rate": 2.5798355752894183e-06, "loss": 0.62467366, "num_input_tokens_seen": 151476015, "step": 7055, "time_per_iteration": 3.155177116394043 }, { "auxiliary_loss_clip": 0.01136773, "auxiliary_loss_mlp": 0.01045155, "balance_loss_clip": 1.05100691, "balance_loss_mlp": 1.0298965, "epoch": 0.42422967082519164, "flos": 14027462824320.0, "grad_norm": 2.6219010938669998, "language_loss": 0.7752226, "learning_rate": 2.5794628293907107e-06, "loss": 0.79704189, "num_input_tokens_seen": 151492035, "step": 7056, "time_per_iteration": 2.5975699424743652 }, { "auxiliary_loss_clip": 0.01129986, "auxiliary_loss_mlp": 0.01042696, "balance_loss_clip": 1.05187988, "balance_loss_mlp": 1.02583957, "epoch": 0.4242897940778596, "flos": 22345522853760.0, "grad_norm": 2.481094371553488, "language_loss": 0.8406778, "learning_rate": 2.579090061518714e-06, "loss": 0.86240464, "num_input_tokens_seen": 151508970, "step": 7057, "time_per_iteration": 2.690188407897949 }, { "auxiliary_loss_clip": 0.01095967, "auxiliary_loss_mlp": 0.01043613, "balance_loss_clip": 1.04596114, "balance_loss_mlp": 1.02778184, "epoch": 0.42434991733052757, "flos": 22595855713920.0, "grad_norm": 2.565187046091263, "language_loss": 0.83179426, "learning_rate": 2.5787172716875642e-06, "loss": 0.85319012, "num_input_tokens_seen": 151525295, "step": 7058, "time_per_iteration": 2.9978904724121094 }, { "auxiliary_loss_clip": 0.01107732, "auxiliary_loss_mlp": 0.0077171, "balance_loss_clip": 1.04935992, "balance_loss_mlp": 1.000875, "epoch": 0.42441004058319554, "flos": 20011437181440.0, "grad_norm": 1.910708490679684, "language_loss": 0.80493343, "learning_rate": 2.5783444599113973e-06, "loss": 0.82372791, "num_input_tokens_seen": 151544435, "step": 7059, "time_per_iteration": 2.7227041721343994 }, { "auxiliary_loss_clip": 0.01137284, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.05036783, "balance_loss_mlp": 1.02469015, "epoch": 0.4244701638358635, "flos": 11144985235200.0, "grad_norm": 2.371195034517477, "language_loss": 0.70500332, "learning_rate": 2.57797162620435e-06, "loss": 0.726789, "num_input_tokens_seen": 151559520, "step": 7060, "time_per_iteration": 2.6058552265167236 }, { "auxiliary_loss_clip": 0.01128623, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.05295658, "balance_loss_mlp": 1.02370787, "epoch": 0.42453028708853147, "flos": 23987753688960.0, "grad_norm": 1.575928079295092, "language_loss": 0.7634182, "learning_rate": 2.577598770580562e-06, "loss": 0.78509057, "num_input_tokens_seen": 151579790, "step": 7061, "time_per_iteration": 2.6592459678649902 }, { "auxiliary_loss_clip": 0.01127164, "auxiliary_loss_mlp": 0.01039243, "balance_loss_clip": 1.05133295, "balance_loss_mlp": 1.02308464, "epoch": 0.42459041034119943, "flos": 18406338030720.0, "grad_norm": 2.3470563522902195, "language_loss": 0.73278493, "learning_rate": 2.5772258930541693e-06, "loss": 0.75444901, "num_input_tokens_seen": 151598285, "step": 7062, "time_per_iteration": 2.5925838947296143 }, { "auxiliary_loss_clip": 0.01110528, "auxiliary_loss_mlp": 0.01044189, "balance_loss_clip": 1.05038309, "balance_loss_mlp": 1.02934098, "epoch": 0.42465053359386745, "flos": 20958006337920.0, "grad_norm": 1.735369540351847, "language_loss": 0.66238403, "learning_rate": 2.5768529936393137e-06, "loss": 0.68393123, "num_input_tokens_seen": 151615430, "step": 7063, "time_per_iteration": 2.618459939956665 }, { "auxiliary_loss_clip": 0.0109746, "auxiliary_loss_mlp": 0.00773106, "balance_loss_clip": 1.04320812, "balance_loss_mlp": 1.0009284, "epoch": 0.4247106568465354, "flos": 33106190520960.0, "grad_norm": 1.673900676033667, "language_loss": 0.78570068, "learning_rate": 2.5764800723501354e-06, "loss": 0.80440634, "num_input_tokens_seen": 151637030, "step": 7064, "time_per_iteration": 2.7396399974823 }, { "auxiliary_loss_clip": 0.0113726, "auxiliary_loss_mlp": 0.01038466, "balance_loss_clip": 1.05053115, "balance_loss_mlp": 1.02317119, "epoch": 0.4247707800992034, "flos": 20046916840320.0, "grad_norm": 1.9847642008914126, "language_loss": 0.75471151, "learning_rate": 2.5761071292007736e-06, "loss": 0.77646875, "num_input_tokens_seen": 151655745, "step": 7065, "time_per_iteration": 2.532046318054199 }, { "auxiliary_loss_clip": 0.01124888, "auxiliary_loss_mlp": 0.01038463, "balance_loss_clip": 1.05094182, "balance_loss_mlp": 1.02257848, "epoch": 0.42483090335187135, "flos": 22385132576640.0, "grad_norm": 1.3355357629490912, "language_loss": 0.72402596, "learning_rate": 2.5757341642053725e-06, "loss": 0.74565947, "num_input_tokens_seen": 151678040, "step": 7066, "time_per_iteration": 2.5829319953918457 }, { "auxiliary_loss_clip": 0.01101493, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.04836977, "balance_loss_mlp": 1.02044368, "epoch": 0.4248910266045393, "flos": 21356830022400.0, "grad_norm": 2.4907013500628166, "language_loss": 0.80009657, "learning_rate": 2.5753611773780745e-06, "loss": 0.82148039, "num_input_tokens_seen": 151696410, "step": 7067, "time_per_iteration": 2.6051836013793945 }, { "auxiliary_loss_clip": 0.01053553, "auxiliary_loss_mlp": 0.01005501, "balance_loss_clip": 1.02524805, "balance_loss_mlp": 1.00387979, "epoch": 0.4249511498572073, "flos": 64008114099840.0, "grad_norm": 0.9135939410418532, "language_loss": 0.6341064, "learning_rate": 2.574988168733022e-06, "loss": 0.65469694, "num_input_tokens_seen": 151756365, "step": 7068, "time_per_iteration": 4.699309825897217 }, { "auxiliary_loss_clip": 0.0113454, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04911804, "balance_loss_mlp": 1.02070904, "epoch": 0.42501127310987524, "flos": 19607046888960.0, "grad_norm": 1.9072894618048717, "language_loss": 0.72502887, "learning_rate": 2.574615138284361e-06, "loss": 0.74674189, "num_input_tokens_seen": 151775165, "step": 7069, "time_per_iteration": 5.814046382904053 }, { "auxiliary_loss_clip": 0.01136556, "auxiliary_loss_mlp": 0.01039486, "balance_loss_clip": 1.05074239, "balance_loss_mlp": 1.02286839, "epoch": 0.4250713963625432, "flos": 19462326992640.0, "grad_norm": 2.348420544652142, "language_loss": 0.79105788, "learning_rate": 2.5742420860462364e-06, "loss": 0.81281829, "num_input_tokens_seen": 151792620, "step": 7070, "time_per_iteration": 2.6242294311523438 }, { "auxiliary_loss_clip": 0.0112233, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.04764843, "balance_loss_mlp": 1.01816082, "epoch": 0.4251315196152112, "flos": 25337707557120.0, "grad_norm": 1.7541837021075046, "language_loss": 0.70184052, "learning_rate": 2.573869012032795e-06, "loss": 0.72339666, "num_input_tokens_seen": 151812850, "step": 7071, "time_per_iteration": 2.6695022583007812 }, { "auxiliary_loss_clip": 0.01134965, "auxiliary_loss_mlp": 0.01034152, "balance_loss_clip": 1.05002129, "balance_loss_mlp": 1.0191201, "epoch": 0.42519164286787914, "flos": 26359186527360.0, "grad_norm": 2.353956848857114, "language_loss": 0.71210682, "learning_rate": 2.5734959162581824e-06, "loss": 0.73379803, "num_input_tokens_seen": 151831785, "step": 7072, "time_per_iteration": 2.654045581817627 }, { "auxiliary_loss_clip": 0.01090703, "auxiliary_loss_mlp": 0.01042672, "balance_loss_clip": 1.04456139, "balance_loss_mlp": 1.02779484, "epoch": 0.4252517661205471, "flos": 26031070765440.0, "grad_norm": 1.5509538260814284, "language_loss": 0.81704801, "learning_rate": 2.5731227987365475e-06, "loss": 0.83838177, "num_input_tokens_seen": 151853885, "step": 7073, "time_per_iteration": 4.4267754554748535 }, { "auxiliary_loss_clip": 0.01117821, "auxiliary_loss_mlp": 0.01035489, "balance_loss_clip": 1.04660416, "balance_loss_mlp": 1.02130294, "epoch": 0.42531188937321507, "flos": 12713635059840.0, "grad_norm": 2.6569023186466914, "language_loss": 0.91360795, "learning_rate": 2.5727496594820386e-06, "loss": 0.93514109, "num_input_tokens_seen": 151871780, "step": 7074, "time_per_iteration": 2.655850887298584 }, { "auxiliary_loss_clip": 0.01128859, "auxiliary_loss_mlp": 0.00774468, "balance_loss_clip": 1.05061221, "balance_loss_mlp": 1.0009917, "epoch": 0.42537201262588303, "flos": 22091670460800.0, "grad_norm": 1.6066127617392931, "language_loss": 0.64610291, "learning_rate": 2.572376498508805e-06, "loss": 0.66513622, "num_input_tokens_seen": 151891600, "step": 7075, "time_per_iteration": 2.7072041034698486 }, { "auxiliary_loss_clip": 0.01097292, "auxiliary_loss_mlp": 0.01030165, "balance_loss_clip": 1.04872322, "balance_loss_mlp": 1.01664686, "epoch": 0.42543213587855105, "flos": 23003119094400.0, "grad_norm": 1.6801281915446873, "language_loss": 0.736256, "learning_rate": 2.5720033158309973e-06, "loss": 0.75753057, "num_input_tokens_seen": 151911330, "step": 7076, "time_per_iteration": 2.7376084327697754 }, { "auxiliary_loss_clip": 0.01107519, "auxiliary_loss_mlp": 0.01042827, "balance_loss_clip": 1.0442965, "balance_loss_mlp": 1.02684128, "epoch": 0.425492259131219, "flos": 25082454533760.0, "grad_norm": 2.293658429237098, "language_loss": 0.78658164, "learning_rate": 2.571630111462766e-06, "loss": 0.80808508, "num_input_tokens_seen": 151930355, "step": 7077, "time_per_iteration": 2.9069621562957764 }, { "auxiliary_loss_clip": 0.01105315, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.04497409, "balance_loss_mlp": 1.01881242, "epoch": 0.425552382383887, "flos": 22816850140800.0, "grad_norm": 1.6369769525688158, "language_loss": 0.73094088, "learning_rate": 2.571256885418265e-06, "loss": 0.75231481, "num_input_tokens_seen": 151949695, "step": 7078, "time_per_iteration": 2.728288173675537 }, { "auxiliary_loss_clip": 0.01104463, "auxiliary_loss_mlp": 0.01040077, "balance_loss_clip": 1.04849982, "balance_loss_mlp": 1.02651131, "epoch": 0.42561250563655495, "flos": 13553585671680.0, "grad_norm": 1.8849915988224846, "language_loss": 0.79555357, "learning_rate": 2.5708836377116445e-06, "loss": 0.81699896, "num_input_tokens_seen": 151967640, "step": 7079, "time_per_iteration": 2.6294121742248535 }, { "auxiliary_loss_clip": 0.01125077, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.05348229, "balance_loss_mlp": 1.02171898, "epoch": 0.4256726288892229, "flos": 46978303023360.0, "grad_norm": 1.3719098160070018, "language_loss": 0.71853465, "learning_rate": 2.5705103683570592e-06, "loss": 0.7401371, "num_input_tokens_seen": 151994020, "step": 7080, "time_per_iteration": 2.8506548404693604 }, { "auxiliary_loss_clip": 0.01130776, "auxiliary_loss_mlp": 0.01033872, "balance_loss_clip": 1.04765022, "balance_loss_mlp": 1.02025867, "epoch": 0.4257327521418909, "flos": 23586451966080.0, "grad_norm": 2.0309872529354283, "language_loss": 0.80102706, "learning_rate": 2.5701370773686646e-06, "loss": 0.82267356, "num_input_tokens_seen": 152013415, "step": 7081, "time_per_iteration": 2.698814868927002 }, { "auxiliary_loss_clip": 0.01100197, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.04303122, "balance_loss_mlp": 1.02065063, "epoch": 0.42579287539455885, "flos": 18989994124800.0, "grad_norm": 1.6770375884870488, "language_loss": 0.81524366, "learning_rate": 2.5697637647606138e-06, "loss": 0.83659089, "num_input_tokens_seen": 152030860, "step": 7082, "time_per_iteration": 2.6388967037200928 }, { "auxiliary_loss_clip": 0.01122609, "auxiliary_loss_mlp": 0.01038264, "balance_loss_clip": 1.05003822, "balance_loss_mlp": 1.02411938, "epoch": 0.4258529986472268, "flos": 25191910252800.0, "grad_norm": 2.777460036178925, "language_loss": 0.70476681, "learning_rate": 2.569390430547065e-06, "loss": 0.72637558, "num_input_tokens_seen": 152050395, "step": 7083, "time_per_iteration": 2.666609048843384 }, { "auxiliary_loss_clip": 0.01045638, "auxiliary_loss_mlp": 0.0101356, "balance_loss_clip": 1.02604496, "balance_loss_mlp": 1.01191545, "epoch": 0.4259131218998948, "flos": 69968280718080.0, "grad_norm": 0.8664420799088798, "language_loss": 0.6701948, "learning_rate": 2.569017074742173e-06, "loss": 0.69078678, "num_input_tokens_seen": 152113555, "step": 7084, "time_per_iteration": 3.25407075881958 }, { "auxiliary_loss_clip": 0.01120239, "auxiliary_loss_mlp": 0.01042728, "balance_loss_clip": 1.04841447, "balance_loss_mlp": 1.02757668, "epoch": 0.42597324515256274, "flos": 18004964480640.0, "grad_norm": 2.05020327260517, "language_loss": 0.78917986, "learning_rate": 2.5686436973600964e-06, "loss": 0.81080949, "num_input_tokens_seen": 152131575, "step": 7085, "time_per_iteration": 2.6294076442718506 }, { "auxiliary_loss_clip": 0.01123765, "auxiliary_loss_mlp": 0.01045859, "balance_loss_clip": 1.05045295, "balance_loss_mlp": 1.03036761, "epoch": 0.4260333684052307, "flos": 15158792563200.0, "grad_norm": 2.015450242409387, "language_loss": 0.76097858, "learning_rate": 2.568270298414995e-06, "loss": 0.78267479, "num_input_tokens_seen": 152149435, "step": 7086, "time_per_iteration": 2.606201648712158 }, { "auxiliary_loss_clip": 0.01107732, "auxiliary_loss_mlp": 0.01040875, "balance_loss_clip": 1.04528451, "balance_loss_mlp": 1.02682662, "epoch": 0.42609349165789867, "flos": 14939342421120.0, "grad_norm": 4.435400492712099, "language_loss": 0.80159658, "learning_rate": 2.5678968779210255e-06, "loss": 0.82308263, "num_input_tokens_seen": 152166860, "step": 7087, "time_per_iteration": 2.6517395973205566 }, { "auxiliary_loss_clip": 0.01113938, "auxiliary_loss_mlp": 0.0103375, "balance_loss_clip": 1.04980528, "balance_loss_mlp": 1.01878285, "epoch": 0.42615361491056664, "flos": 23731961961600.0, "grad_norm": 1.6700745034234148, "language_loss": 0.65982199, "learning_rate": 2.5675234358923505e-06, "loss": 0.68129885, "num_input_tokens_seen": 152187475, "step": 7088, "time_per_iteration": 2.6658740043640137 }, { "auxiliary_loss_clip": 0.01079891, "auxiliary_loss_mlp": 0.01038449, "balance_loss_clip": 1.04348373, "balance_loss_mlp": 1.02308249, "epoch": 0.42621373816323466, "flos": 24936441747840.0, "grad_norm": 2.4696048983575376, "language_loss": 0.68491185, "learning_rate": 2.56714997234313e-06, "loss": 0.70609522, "num_input_tokens_seen": 152207235, "step": 7089, "time_per_iteration": 2.816352128982544 }, { "auxiliary_loss_clip": 0.01083453, "auxiliary_loss_mlp": 0.01038038, "balance_loss_clip": 1.04270887, "balance_loss_mlp": 1.02359009, "epoch": 0.4262738614159026, "flos": 13552975140480.0, "grad_norm": 2.0671888191777623, "language_loss": 0.73030579, "learning_rate": 2.566776487287525e-06, "loss": 0.75152063, "num_input_tokens_seen": 152224240, "step": 7090, "time_per_iteration": 2.801116704940796 }, { "auxiliary_loss_clip": 0.01114766, "auxiliary_loss_mlp": 0.0104358, "balance_loss_clip": 1.0483079, "balance_loss_mlp": 1.02875018, "epoch": 0.4263339846685706, "flos": 29748794284800.0, "grad_norm": 1.7852421559677654, "language_loss": 0.75632602, "learning_rate": 2.5664029807396994e-06, "loss": 0.77790952, "num_input_tokens_seen": 152242595, "step": 7091, "time_per_iteration": 2.779731273651123 }, { "auxiliary_loss_clip": 0.01081578, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.04725623, "balance_loss_mlp": 1.01879716, "epoch": 0.42639410792123855, "flos": 16834204586880.0, "grad_norm": 2.1009795194853567, "language_loss": 0.82635152, "learning_rate": 2.5660294527138156e-06, "loss": 0.84748316, "num_input_tokens_seen": 152260840, "step": 7092, "time_per_iteration": 2.7296979427337646 }, { "auxiliary_loss_clip": 0.01113469, "auxiliary_loss_mlp": 0.0104261, "balance_loss_clip": 1.04653692, "balance_loss_mlp": 1.02812648, "epoch": 0.4264542311739065, "flos": 28763118195840.0, "grad_norm": 1.6936837646094385, "language_loss": 0.73936713, "learning_rate": 2.565655903224038e-06, "loss": 0.76092792, "num_input_tokens_seen": 152280580, "step": 7093, "time_per_iteration": 2.738494634628296 }, { "auxiliary_loss_clip": 0.01124772, "auxiliary_loss_mlp": 0.01037897, "balance_loss_clip": 1.05013132, "balance_loss_mlp": 1.02285314, "epoch": 0.4265143544265745, "flos": 24713615727360.0, "grad_norm": 2.248863473367437, "language_loss": 0.69831914, "learning_rate": 2.565282332284532e-06, "loss": 0.71994585, "num_input_tokens_seen": 152298455, "step": 7094, "time_per_iteration": 2.696377754211426 }, { "auxiliary_loss_clip": 0.01102522, "auxiliary_loss_mlp": 0.01035266, "balance_loss_clip": 1.05082488, "balance_loss_mlp": 1.02069819, "epoch": 0.42657447767924245, "flos": 21865971352320.0, "grad_norm": 1.593904094988334, "language_loss": 0.8160966, "learning_rate": 2.564908739909464e-06, "loss": 0.83747452, "num_input_tokens_seen": 152316995, "step": 7095, "time_per_iteration": 2.7906196117401123 }, { "auxiliary_loss_clip": 0.01135526, "auxiliary_loss_mlp": 0.01039866, "balance_loss_clip": 1.05080557, "balance_loss_mlp": 1.02575183, "epoch": 0.4266346009319104, "flos": 21470236237440.0, "grad_norm": 1.8045956329002426, "language_loss": 0.80642307, "learning_rate": 2.5645351261129996e-06, "loss": 0.82817698, "num_input_tokens_seen": 152334800, "step": 7096, "time_per_iteration": 2.7473361492156982 }, { "auxiliary_loss_clip": 0.01130201, "auxiliary_loss_mlp": 0.01033007, "balance_loss_clip": 1.05325663, "balance_loss_mlp": 1.0182128, "epoch": 0.4266947241845784, "flos": 25519379569920.0, "grad_norm": 2.602963129491376, "language_loss": 0.64982784, "learning_rate": 2.5641614909093066e-06, "loss": 0.67145991, "num_input_tokens_seen": 152355175, "step": 7097, "time_per_iteration": 2.683868408203125 }, { "auxiliary_loss_clip": 0.01103674, "auxiliary_loss_mlp": 0.01032153, "balance_loss_clip": 1.04987097, "balance_loss_mlp": 1.01799679, "epoch": 0.42675484743724634, "flos": 26541217676160.0, "grad_norm": 1.7913732947115202, "language_loss": 0.74682045, "learning_rate": 2.5637878343125535e-06, "loss": 0.76817876, "num_input_tokens_seen": 152377245, "step": 7098, "time_per_iteration": 2.7669501304626465 }, { "auxiliary_loss_clip": 0.0112361, "auxiliary_loss_mlp": 0.01029914, "balance_loss_clip": 1.05006361, "balance_loss_mlp": 1.0165925, "epoch": 0.4268149706899143, "flos": 23112718467840.0, "grad_norm": 1.7242280164199693, "language_loss": 0.75574845, "learning_rate": 2.5634141563369086e-06, "loss": 0.77728367, "num_input_tokens_seen": 152396985, "step": 7099, "time_per_iteration": 2.652024507522583 }, { "auxiliary_loss_clip": 0.01113615, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.04767907, "balance_loss_mlp": 1.02964246, "epoch": 0.4268750939425823, "flos": 22706532495360.0, "grad_norm": 2.4499059435945956, "language_loss": 0.82854998, "learning_rate": 2.5630404569965432e-06, "loss": 0.85013109, "num_input_tokens_seen": 152415590, "step": 7100, "time_per_iteration": 2.66955304145813 }, { "auxiliary_loss_clip": 0.01114994, "auxiliary_loss_mlp": 0.01038973, "balance_loss_clip": 1.05028403, "balance_loss_mlp": 1.0246973, "epoch": 0.42693521719525024, "flos": 25374875155200.0, "grad_norm": 1.3265740257801202, "language_loss": 0.81932402, "learning_rate": 2.562666736305627e-06, "loss": 0.8408637, "num_input_tokens_seen": 152436735, "step": 7101, "time_per_iteration": 2.734703540802002 }, { "auxiliary_loss_clip": 0.01139197, "auxiliary_loss_mlp": 0.01033271, "balance_loss_clip": 1.0521878, "balance_loss_mlp": 1.01856041, "epoch": 0.42699534044791826, "flos": 18150689957760.0, "grad_norm": 6.39201802797086, "language_loss": 0.72548246, "learning_rate": 2.5622929942783314e-06, "loss": 0.74720716, "num_input_tokens_seen": 152455685, "step": 7102, "time_per_iteration": 2.6193687915802 }, { "auxiliary_loss_clip": 0.01123058, "auxiliary_loss_mlp": 0.01031478, "balance_loss_clip": 1.05015755, "balance_loss_mlp": 1.01770973, "epoch": 0.4270554637005862, "flos": 13698413308800.0, "grad_norm": 2.0187490499372243, "language_loss": 0.83425319, "learning_rate": 2.5619192309288297e-06, "loss": 0.8557986, "num_input_tokens_seen": 152473500, "step": 7103, "time_per_iteration": 2.6151843070983887 }, { "auxiliary_loss_clip": 0.01108466, "auxiliary_loss_mlp": 0.01042825, "balance_loss_clip": 1.04559612, "balance_loss_mlp": 1.02617157, "epoch": 0.4271155869532542, "flos": 17493596507520.0, "grad_norm": 4.588723714988328, "language_loss": 0.74312592, "learning_rate": 2.561545446271294e-06, "loss": 0.76463884, "num_input_tokens_seen": 152491320, "step": 7104, "time_per_iteration": 2.686087131500244 }, { "auxiliary_loss_clip": 0.01118632, "auxiliary_loss_mlp": 0.01030826, "balance_loss_clip": 1.04769945, "balance_loss_mlp": 1.01652098, "epoch": 0.42717571020592215, "flos": 32452293381120.0, "grad_norm": 3.9751824788265226, "language_loss": 0.7515536, "learning_rate": 2.5611716403198987e-06, "loss": 0.77304816, "num_input_tokens_seen": 152511970, "step": 7105, "time_per_iteration": 2.69466495513916 }, { "auxiliary_loss_clip": 0.01138696, "auxiliary_loss_mlp": 0.01032922, "balance_loss_clip": 1.05365109, "balance_loss_mlp": 1.01949859, "epoch": 0.4272358334585901, "flos": 16253062444800.0, "grad_norm": 1.828100931914864, "language_loss": 0.77001148, "learning_rate": 2.560797813088819e-06, "loss": 0.79172766, "num_input_tokens_seen": 152530515, "step": 7106, "time_per_iteration": 2.7470526695251465 }, { "auxiliary_loss_clip": 0.01113386, "auxiliary_loss_mlp": 0.01032071, "balance_loss_clip": 1.05155849, "balance_loss_mlp": 1.01898193, "epoch": 0.4272959567112581, "flos": 24200092938240.0, "grad_norm": 2.105539726439896, "language_loss": 0.79606462, "learning_rate": 2.560423964592229e-06, "loss": 0.81751919, "num_input_tokens_seen": 152549295, "step": 7107, "time_per_iteration": 4.302187919616699 }, { "auxiliary_loss_clip": 0.01084956, "auxiliary_loss_mlp": 0.01035225, "balance_loss_clip": 1.04738021, "balance_loss_mlp": 1.02138472, "epoch": 0.42735607996392605, "flos": 27963495578880.0, "grad_norm": 1.6344878343023064, "language_loss": 0.67924458, "learning_rate": 2.5600500948443075e-06, "loss": 0.70044637, "num_input_tokens_seen": 152570725, "step": 7108, "time_per_iteration": 6.044403314590454 }, { "auxiliary_loss_clip": 0.01110243, "auxiliary_loss_mlp": 0.01038292, "balance_loss_clip": 1.05136764, "balance_loss_mlp": 1.02539325, "epoch": 0.427416203216594, "flos": 20295597674880.0, "grad_norm": 1.7692691179194058, "language_loss": 0.71223509, "learning_rate": 2.5596762038592294e-06, "loss": 0.73372042, "num_input_tokens_seen": 152588950, "step": 7109, "time_per_iteration": 2.6695122718811035 }, { "auxiliary_loss_clip": 0.01120979, "auxiliary_loss_mlp": 0.01033902, "balance_loss_clip": 1.048154, "balance_loss_mlp": 1.01738, "epoch": 0.427476326469262, "flos": 26943955943040.0, "grad_norm": 2.0357298685431595, "language_loss": 0.64665484, "learning_rate": 2.559302291651174e-06, "loss": 0.66820359, "num_input_tokens_seen": 152608965, "step": 7110, "time_per_iteration": 2.6609907150268555 }, { "auxiliary_loss_clip": 0.01132801, "auxiliary_loss_mlp": 0.00771481, "balance_loss_clip": 1.04796886, "balance_loss_mlp": 1.00075054, "epoch": 0.42753644972192995, "flos": 25702847262720.0, "grad_norm": 6.311104463147988, "language_loss": 0.76556361, "learning_rate": 2.5589283582343197e-06, "loss": 0.7846064, "num_input_tokens_seen": 152630220, "step": 7111, "time_per_iteration": 2.704688310623169 }, { "auxiliary_loss_clip": 0.01111143, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.05706656, "balance_loss_mlp": 1.01936615, "epoch": 0.4275965729745979, "flos": 18767419499520.0, "grad_norm": 2.0174435424847084, "language_loss": 0.72800988, "learning_rate": 2.558554403622845e-06, "loss": 0.74945462, "num_input_tokens_seen": 152648835, "step": 7112, "time_per_iteration": 4.39399790763855 }, { "auxiliary_loss_clip": 0.01107213, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.04838848, "balance_loss_mlp": 1.02366805, "epoch": 0.4276566962272659, "flos": 23764424878080.0, "grad_norm": 1.714295461522007, "language_loss": 0.71427524, "learning_rate": 2.5581804278309323e-06, "loss": 0.73572093, "num_input_tokens_seen": 152668375, "step": 7113, "time_per_iteration": 2.6834428310394287 }, { "auxiliary_loss_clip": 0.01126637, "auxiliary_loss_mlp": 0.01040655, "balance_loss_clip": 1.05207372, "balance_loss_mlp": 1.02700508, "epoch": 0.42771681947993384, "flos": 22492505306880.0, "grad_norm": 1.6108261365545002, "language_loss": 0.61758566, "learning_rate": 2.5578064308727617e-06, "loss": 0.63925862, "num_input_tokens_seen": 152689725, "step": 7114, "time_per_iteration": 2.7341814041137695 }, { "auxiliary_loss_clip": 0.01131369, "auxiliary_loss_mlp": 0.01042209, "balance_loss_clip": 1.05489218, "balance_loss_mlp": 1.02556777, "epoch": 0.42777694273260186, "flos": 25044712318080.0, "grad_norm": 1.6215320240925026, "language_loss": 0.649822, "learning_rate": 2.5574324127625153e-06, "loss": 0.67155778, "num_input_tokens_seen": 152709375, "step": 7115, "time_per_iteration": 2.6360361576080322 }, { "auxiliary_loss_clip": 0.01110467, "auxiliary_loss_mlp": 0.01037107, "balance_loss_clip": 1.04954565, "balance_loss_mlp": 1.02359438, "epoch": 0.4278370659852698, "flos": 18661519226880.0, "grad_norm": 1.8869093124336491, "language_loss": 0.74057275, "learning_rate": 2.5570583735143753e-06, "loss": 0.76204848, "num_input_tokens_seen": 152727510, "step": 7116, "time_per_iteration": 2.701413869857788 }, { "auxiliary_loss_clip": 0.01105537, "auxiliary_loss_mlp": 0.01041231, "balance_loss_clip": 1.04539752, "balance_loss_mlp": 1.02783155, "epoch": 0.4278971892379378, "flos": 27308269635840.0, "grad_norm": 1.8577367375008744, "language_loss": 0.69426787, "learning_rate": 2.5566843131425275e-06, "loss": 0.71573555, "num_input_tokens_seen": 152746670, "step": 7117, "time_per_iteration": 2.740729570388794 }, { "auxiliary_loss_clip": 0.01110879, "auxiliary_loss_mlp": 0.0103835, "balance_loss_clip": 1.05176735, "balance_loss_mlp": 1.02402163, "epoch": 0.42795731249060576, "flos": 12888698970240.0, "grad_norm": 2.8863290375892148, "language_loss": 0.69564569, "learning_rate": 2.5563102316611536e-06, "loss": 0.71713799, "num_input_tokens_seen": 152760545, "step": 7118, "time_per_iteration": 2.7086899280548096 }, { "auxiliary_loss_clip": 0.01092131, "auxiliary_loss_mlp": 0.0104544, "balance_loss_clip": 1.04521, "balance_loss_mlp": 1.03076482, "epoch": 0.4280174357432737, "flos": 33401448316800.0, "grad_norm": 2.453050871280299, "language_loss": 0.74826419, "learning_rate": 2.55593612908444e-06, "loss": 0.76963991, "num_input_tokens_seen": 152780970, "step": 7119, "time_per_iteration": 2.805619239807129 }, { "auxiliary_loss_clip": 0.01069167, "auxiliary_loss_mlp": 0.01038035, "balance_loss_clip": 1.0436008, "balance_loss_mlp": 1.02377188, "epoch": 0.4280775589959417, "flos": 18259104182400.0, "grad_norm": 1.842272720773601, "language_loss": 0.75238574, "learning_rate": 2.555562005426573e-06, "loss": 0.77345783, "num_input_tokens_seen": 152798475, "step": 7120, "time_per_iteration": 2.8678669929504395 }, { "auxiliary_loss_clip": 0.01112615, "auxiliary_loss_mlp": 0.00770364, "balance_loss_clip": 1.05290043, "balance_loss_mlp": 1.00063229, "epoch": 0.42813768224860965, "flos": 21471277731840.0, "grad_norm": 1.7037705311845839, "language_loss": 0.76884449, "learning_rate": 2.5551878607017385e-06, "loss": 0.78767425, "num_input_tokens_seen": 152817555, "step": 7121, "time_per_iteration": 2.776524305343628 }, { "auxiliary_loss_clip": 0.01114442, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.05325198, "balance_loss_mlp": 1.02162266, "epoch": 0.4281978055012776, "flos": 15669262696320.0, "grad_norm": 1.9187062544957278, "language_loss": 0.85698652, "learning_rate": 2.554813694924126e-06, "loss": 0.87847555, "num_input_tokens_seen": 152836295, "step": 7122, "time_per_iteration": 2.7109732627868652 }, { "auxiliary_loss_clip": 0.01083707, "auxiliary_loss_mlp": 0.01035803, "balance_loss_clip": 1.04868889, "balance_loss_mlp": 1.02191544, "epoch": 0.4282579287539456, "flos": 17712005155200.0, "grad_norm": 2.4146794334180632, "language_loss": 0.81251013, "learning_rate": 2.554439508107921e-06, "loss": 0.83370531, "num_input_tokens_seen": 152854950, "step": 7123, "time_per_iteration": 2.7866828441619873 }, { "auxiliary_loss_clip": 0.01090954, "auxiliary_loss_mlp": 0.01034043, "balance_loss_clip": 1.04922438, "balance_loss_mlp": 1.02011371, "epoch": 0.42831805200661355, "flos": 19281157770240.0, "grad_norm": 1.7481094896376608, "language_loss": 0.81089389, "learning_rate": 2.5540653002673153e-06, "loss": 0.8321439, "num_input_tokens_seen": 152873995, "step": 7124, "time_per_iteration": 2.733530044555664 }, { "auxiliary_loss_clip": 0.01125145, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.05205929, "balance_loss_mlp": 1.02334404, "epoch": 0.4283781752592815, "flos": 19792633484160.0, "grad_norm": 1.8145132685178345, "language_loss": 0.80230892, "learning_rate": 2.553691071416498e-06, "loss": 0.82393849, "num_input_tokens_seen": 152892925, "step": 7125, "time_per_iteration": 2.635104179382324 }, { "auxiliary_loss_clip": 0.01132021, "auxiliary_loss_mlp": 0.0076966, "balance_loss_clip": 1.05282855, "balance_loss_mlp": 1.00061083, "epoch": 0.4284382985119495, "flos": 16508064072960.0, "grad_norm": 1.8752935538071442, "language_loss": 0.74911773, "learning_rate": 2.553316821569659e-06, "loss": 0.76813453, "num_input_tokens_seen": 152910935, "step": 7126, "time_per_iteration": 2.605344772338867 }, { "auxiliary_loss_clip": 0.01124108, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.05336213, "balance_loss_mlp": 1.01742435, "epoch": 0.42849842176461744, "flos": 23330767979520.0, "grad_norm": 4.135943969267594, "language_loss": 0.80782413, "learning_rate": 2.5529425507409913e-06, "loss": 0.82937926, "num_input_tokens_seen": 152931030, "step": 7127, "time_per_iteration": 2.662910223007202 }, { "auxiliary_loss_clip": 0.01088729, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.04972112, "balance_loss_mlp": 1.02753484, "epoch": 0.4285585450172854, "flos": 17274433674240.0, "grad_norm": 2.1393882563291773, "language_loss": 0.76243544, "learning_rate": 2.5525682589446867e-06, "loss": 0.78373742, "num_input_tokens_seen": 152948085, "step": 7128, "time_per_iteration": 2.7230868339538574 }, { "auxiliary_loss_clip": 0.01089264, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.04796708, "balance_loss_mlp": 1.02163041, "epoch": 0.42861866826995343, "flos": 24279599692800.0, "grad_norm": 1.945213992632333, "language_loss": 0.74079603, "learning_rate": 2.552193946194937e-06, "loss": 0.76204789, "num_input_tokens_seen": 152966265, "step": 7129, "time_per_iteration": 2.775891065597534 }, { "auxiliary_loss_clip": 0.01127944, "auxiliary_loss_mlp": 0.00770117, "balance_loss_clip": 1.05684757, "balance_loss_mlp": 1.0005461, "epoch": 0.4286787915226214, "flos": 24353108876160.0, "grad_norm": 1.5710338967277158, "language_loss": 0.77974319, "learning_rate": 2.5518196125059394e-06, "loss": 0.79872382, "num_input_tokens_seen": 152986775, "step": 7130, "time_per_iteration": 2.6977498531341553 }, { "auxiliary_loss_clip": 0.01119463, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.05768883, "balance_loss_mlp": 1.02184367, "epoch": 0.42873891477528936, "flos": 15449992122240.0, "grad_norm": 2.320631391566952, "language_loss": 0.73168224, "learning_rate": 2.551445257891886e-06, "loss": 0.75323212, "num_input_tokens_seen": 153003595, "step": 7131, "time_per_iteration": 2.6973114013671875 }, { "auxiliary_loss_clip": 0.01116554, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.05293584, "balance_loss_mlp": 1.02260923, "epoch": 0.4287990380279573, "flos": 17639573379840.0, "grad_norm": 5.223518802520722, "language_loss": 0.77257997, "learning_rate": 2.551070882366973e-06, "loss": 0.79411221, "num_input_tokens_seen": 153021960, "step": 7132, "time_per_iteration": 2.644556999206543 }, { "auxiliary_loss_clip": 0.01097397, "auxiliary_loss_mlp": 0.00771143, "balance_loss_clip": 1.05195022, "balance_loss_mlp": 1.00064743, "epoch": 0.4288591612806253, "flos": 27162328677120.0, "grad_norm": 2.003525879431933, "language_loss": 0.78719372, "learning_rate": 2.550696485945397e-06, "loss": 0.80587912, "num_input_tokens_seen": 153042110, "step": 7133, "time_per_iteration": 2.7668325901031494 }, { "auxiliary_loss_clip": 0.01111172, "auxiliary_loss_mlp": 0.01034109, "balance_loss_clip": 1.05302238, "balance_loss_mlp": 1.02091813, "epoch": 0.42891928453329325, "flos": 17163182275200.0, "grad_norm": 1.850568768068126, "language_loss": 0.7449469, "learning_rate": 2.550322068641355e-06, "loss": 0.76639962, "num_input_tokens_seen": 153058925, "step": 7134, "time_per_iteration": 2.714893341064453 }, { "auxiliary_loss_clip": 0.01112422, "auxiliary_loss_mlp": 0.01035997, "balance_loss_clip": 1.04541016, "balance_loss_mlp": 1.02241349, "epoch": 0.4289794077859612, "flos": 18187031543040.0, "grad_norm": 1.9214467858451951, "language_loss": 0.84098607, "learning_rate": 2.5499476304690455e-06, "loss": 0.86247027, "num_input_tokens_seen": 153078070, "step": 7135, "time_per_iteration": 2.646799325942993 }, { "auxiliary_loss_clip": 0.01060089, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.04197621, "balance_loss_mlp": 1.02555561, "epoch": 0.4290395310386292, "flos": 28256885867520.0, "grad_norm": 2.1625216270915493, "language_loss": 0.75274026, "learning_rate": 2.549573171442666e-06, "loss": 0.77375078, "num_input_tokens_seen": 153096680, "step": 7136, "time_per_iteration": 2.809598207473755 }, { "auxiliary_loss_clip": 0.0112086, "auxiliary_loss_mlp": 0.0103731, "balance_loss_clip": 1.04999709, "balance_loss_mlp": 1.02323103, "epoch": 0.42909965429129715, "flos": 16216074414720.0, "grad_norm": 2.3663507288699743, "language_loss": 0.79031229, "learning_rate": 2.5491986915764175e-06, "loss": 0.81189406, "num_input_tokens_seen": 153113305, "step": 7137, "time_per_iteration": 2.5979957580566406 }, { "auxiliary_loss_clip": 0.01139951, "auxiliary_loss_mlp": 0.01034457, "balance_loss_clip": 1.05516219, "balance_loss_mlp": 1.02047372, "epoch": 0.4291597775439651, "flos": 23112862122240.0, "grad_norm": 2.7024255480814166, "language_loss": 0.76951313, "learning_rate": 2.548824190884499e-06, "loss": 0.7912572, "num_input_tokens_seen": 153132735, "step": 7138, "time_per_iteration": 2.659080982208252 }, { "auxiliary_loss_clip": 0.01053167, "auxiliary_loss_mlp": 0.01001874, "balance_loss_clip": 1.04265583, "balance_loss_mlp": 1.000193, "epoch": 0.4292199007966331, "flos": 67546212681600.0, "grad_norm": 0.770527259841848, "language_loss": 0.56189907, "learning_rate": 2.548449669381113e-06, "loss": 0.58244956, "num_input_tokens_seen": 153187925, "step": 7139, "time_per_iteration": 3.10082745552063 }, { "auxiliary_loss_clip": 0.0113097, "auxiliary_loss_mlp": 0.00769947, "balance_loss_clip": 1.05131912, "balance_loss_mlp": 1.00071657, "epoch": 0.42928002404930105, "flos": 22999850956800.0, "grad_norm": 2.111862554587806, "language_loss": 0.80871445, "learning_rate": 2.5480751270804595e-06, "loss": 0.82772362, "num_input_tokens_seen": 153206990, "step": 7140, "time_per_iteration": 2.795779228210449 }, { "auxiliary_loss_clip": 0.01122496, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.05028069, "balance_loss_mlp": 1.01853812, "epoch": 0.429340147301969, "flos": 11544922241280.0, "grad_norm": 1.8811141343222446, "language_loss": 0.82105601, "learning_rate": 2.5477005639967424e-06, "loss": 0.84260583, "num_input_tokens_seen": 153222345, "step": 7141, "time_per_iteration": 2.7634544372558594 }, { "auxiliary_loss_clip": 0.0112355, "auxiliary_loss_mlp": 0.0103971, "balance_loss_clip": 1.05177212, "balance_loss_mlp": 1.02569723, "epoch": 0.42940027055463703, "flos": 25264988472960.0, "grad_norm": 3.1732751781519566, "language_loss": 0.86466211, "learning_rate": 2.547325980144166e-06, "loss": 0.88629478, "num_input_tokens_seen": 153240570, "step": 7142, "time_per_iteration": 2.73675537109375 }, { "auxiliary_loss_clip": 0.01107323, "auxiliary_loss_mlp": 0.0103324, "balance_loss_clip": 1.05093384, "balance_loss_mlp": 1.02018034, "epoch": 0.429460393807305, "flos": 23805004268160.0, "grad_norm": 2.0666274749088704, "language_loss": 0.78651458, "learning_rate": 2.5469513755369323e-06, "loss": 0.80792016, "num_input_tokens_seen": 153259575, "step": 7143, "time_per_iteration": 2.704951047897339 }, { "auxiliary_loss_clip": 0.01085856, "auxiliary_loss_mlp": 0.01042533, "balance_loss_clip": 1.04870784, "balance_loss_mlp": 1.02862692, "epoch": 0.42952051705997296, "flos": 13918294414080.0, "grad_norm": 1.8720341937391007, "language_loss": 0.77237451, "learning_rate": 2.5465767501892484e-06, "loss": 0.79365838, "num_input_tokens_seen": 153276650, "step": 7144, "time_per_iteration": 2.8080482482910156 }, { "auxiliary_loss_clip": 0.01111048, "auxiliary_loss_mlp": 0.01029176, "balance_loss_clip": 1.05582607, "balance_loss_mlp": 1.01565719, "epoch": 0.4295806403126409, "flos": 26760380509440.0, "grad_norm": 2.7559580952375335, "language_loss": 0.73788631, "learning_rate": 2.54620210411532e-06, "loss": 0.75928855, "num_input_tokens_seen": 153298025, "step": 7145, "time_per_iteration": 2.876610040664673 }, { "auxiliary_loss_clip": 0.01124065, "auxiliary_loss_mlp": 0.01036309, "balance_loss_clip": 1.05205083, "balance_loss_mlp": 1.02291536, "epoch": 0.4296407635653089, "flos": 20952619297920.0, "grad_norm": 2.2535739124191623, "language_loss": 0.78997326, "learning_rate": 2.545827437329352e-06, "loss": 0.81157696, "num_input_tokens_seen": 153315775, "step": 7146, "time_per_iteration": 4.237323999404907 }, { "auxiliary_loss_clip": 0.01118325, "auxiliary_loss_mlp": 0.01033233, "balance_loss_clip": 1.04862475, "balance_loss_mlp": 1.02041841, "epoch": 0.42970088681797686, "flos": 15852335339520.0, "grad_norm": 2.134935554118882, "language_loss": 0.83125973, "learning_rate": 2.5454527498455532e-06, "loss": 0.85277522, "num_input_tokens_seen": 153332765, "step": 7147, "time_per_iteration": 4.170353412628174 }, { "auxiliary_loss_clip": 0.01120236, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.05321455, "balance_loss_mlp": 1.02217066, "epoch": 0.4297610100706448, "flos": 22382618624640.0, "grad_norm": 2.0255914888463837, "language_loss": 0.87308717, "learning_rate": 2.545078041678131e-06, "loss": 0.89465714, "num_input_tokens_seen": 153350760, "step": 7148, "time_per_iteration": 4.25404167175293 }, { "auxiliary_loss_clip": 0.01106949, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.0480504, "balance_loss_mlp": 1.02031255, "epoch": 0.4298211333233128, "flos": 27925681536000.0, "grad_norm": 1.5866853205406048, "language_loss": 0.77782673, "learning_rate": 2.5447033128412957e-06, "loss": 0.79923236, "num_input_tokens_seen": 153370765, "step": 7149, "time_per_iteration": 2.7506890296936035 }, { "auxiliary_loss_clip": 0.01089941, "auxiliary_loss_mlp": 0.01034354, "balance_loss_clip": 1.04399276, "balance_loss_mlp": 1.02023959, "epoch": 0.42988125657598075, "flos": 24425612478720.0, "grad_norm": 1.8521512589115583, "language_loss": 0.80214548, "learning_rate": 2.544328563349256e-06, "loss": 0.8233884, "num_input_tokens_seen": 153390725, "step": 7150, "time_per_iteration": 2.7500832080841064 }, { "auxiliary_loss_clip": 0.01129377, "auxiliary_loss_mlp": 0.01039727, "balance_loss_clip": 1.05486202, "balance_loss_mlp": 1.02441442, "epoch": 0.4299413798286487, "flos": 15850180523520.0, "grad_norm": 1.9985895227285218, "language_loss": 0.75273871, "learning_rate": 2.5439537932162222e-06, "loss": 0.7744298, "num_input_tokens_seen": 153408010, "step": 7151, "time_per_iteration": 5.016021251678467 }, { "auxiliary_loss_clip": 0.01085345, "auxiliary_loss_mlp": 0.01034438, "balance_loss_clip": 1.0429914, "balance_loss_mlp": 1.02001333, "epoch": 0.4300015030813167, "flos": 22309504490880.0, "grad_norm": 2.1817188720110954, "language_loss": 0.70050609, "learning_rate": 2.543579002456406e-06, "loss": 0.72170389, "num_input_tokens_seen": 153426865, "step": 7152, "time_per_iteration": 2.7800815105438232 }, { "auxiliary_loss_clip": 0.01111211, "auxiliary_loss_mlp": 0.01037662, "balance_loss_clip": 1.04997575, "balance_loss_mlp": 1.02443016, "epoch": 0.43006162633398465, "flos": 34897666366080.0, "grad_norm": 1.6446083910432685, "language_loss": 0.71179092, "learning_rate": 2.54320419108402e-06, "loss": 0.73327965, "num_input_tokens_seen": 153449410, "step": 7153, "time_per_iteration": 2.829648017883301 }, { "auxiliary_loss_clip": 0.01119902, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.0488553, "balance_loss_mlp": 1.01928604, "epoch": 0.4301217495866526, "flos": 15961575576960.0, "grad_norm": 1.892610527455045, "language_loss": 0.78175116, "learning_rate": 2.542829359113276e-06, "loss": 0.80328226, "num_input_tokens_seen": 153467910, "step": 7154, "time_per_iteration": 2.723484516143799 }, { "auxiliary_loss_clip": 0.01099683, "auxiliary_loss_mlp": 0.01040214, "balance_loss_clip": 1.04681695, "balance_loss_mlp": 1.02599812, "epoch": 0.43018187283932063, "flos": 18770364414720.0, "grad_norm": 1.5463056134535458, "language_loss": 0.78802991, "learning_rate": 2.542454506558389e-06, "loss": 0.80942887, "num_input_tokens_seen": 153487100, "step": 7155, "time_per_iteration": 2.7014451026916504 }, { "auxiliary_loss_clip": 0.01105109, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.04913473, "balance_loss_mlp": 1.01963592, "epoch": 0.4302419960919886, "flos": 20151703791360.0, "grad_norm": 1.7272401238355637, "language_loss": 0.88303947, "learning_rate": 2.5420796334335723e-06, "loss": 0.90441763, "num_input_tokens_seen": 153505565, "step": 7156, "time_per_iteration": 2.696967363357544 }, { "auxiliary_loss_clip": 0.01135167, "auxiliary_loss_mlp": 0.01033618, "balance_loss_clip": 1.05029023, "balance_loss_mlp": 1.01970661, "epoch": 0.43030211934465656, "flos": 26432731624320.0, "grad_norm": 1.8553568722970555, "language_loss": 0.82653069, "learning_rate": 2.541704739753042e-06, "loss": 0.84821856, "num_input_tokens_seen": 153526130, "step": 7157, "time_per_iteration": 2.706956148147583 }, { "auxiliary_loss_clip": 0.01138655, "auxiliary_loss_mlp": 0.01033446, "balance_loss_clip": 1.05253196, "balance_loss_mlp": 1.0191586, "epoch": 0.43036224259732453, "flos": 24389234979840.0, "grad_norm": 1.8412394525159426, "language_loss": 0.71535289, "learning_rate": 2.5413298255310132e-06, "loss": 0.73707396, "num_input_tokens_seen": 153546370, "step": 7158, "time_per_iteration": 2.717587471008301 }, { "auxiliary_loss_clip": 0.01122952, "auxiliary_loss_mlp": 0.01034381, "balance_loss_clip": 1.05053186, "balance_loss_mlp": 1.02094615, "epoch": 0.4304223658499925, "flos": 17201714590080.0, "grad_norm": 2.4063235591116, "language_loss": 0.82592964, "learning_rate": 2.5409548907817034e-06, "loss": 0.84750295, "num_input_tokens_seen": 153562800, "step": 7159, "time_per_iteration": 2.657625436782837 }, { "auxiliary_loss_clip": 0.01105982, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.04629135, "balance_loss_mlp": 1.02073002, "epoch": 0.43048248910266046, "flos": 14903000835840.0, "grad_norm": 2.253245664419059, "language_loss": 0.83222294, "learning_rate": 2.54057993551933e-06, "loss": 0.85362625, "num_input_tokens_seen": 153578395, "step": 7160, "time_per_iteration": 2.6994106769561768 }, { "auxiliary_loss_clip": 0.0112897, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05215347, "balance_loss_mlp": 1.02446771, "epoch": 0.4305426123553284, "flos": 21579835610880.0, "grad_norm": 2.2814219127337236, "language_loss": 0.77506208, "learning_rate": 2.5402049597581116e-06, "loss": 0.79675758, "num_input_tokens_seen": 153596880, "step": 7161, "time_per_iteration": 2.819274425506592 }, { "auxiliary_loss_clip": 0.01120227, "auxiliary_loss_mlp": 0.0103714, "balance_loss_clip": 1.04739952, "balance_loss_mlp": 1.02265632, "epoch": 0.4306027356079964, "flos": 22601278667520.0, "grad_norm": 2.279224529598255, "language_loss": 0.73028505, "learning_rate": 2.5398299635122662e-06, "loss": 0.75185871, "num_input_tokens_seen": 153616570, "step": 7162, "time_per_iteration": 2.62280011177063 }, { "auxiliary_loss_clip": 0.01016488, "auxiliary_loss_mlp": 0.00753107, "balance_loss_clip": 1.02147388, "balance_loss_mlp": 1.00100327, "epoch": 0.43066285886066435, "flos": 70672091806080.0, "grad_norm": 0.7910606346239517, "language_loss": 0.58986276, "learning_rate": 2.5394549467960147e-06, "loss": 0.60755867, "num_input_tokens_seen": 153671450, "step": 7163, "time_per_iteration": 3.1325736045837402 }, { "auxiliary_loss_clip": 0.01104143, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.04593122, "balance_loss_mlp": 1.02948582, "epoch": 0.4307229821133323, "flos": 26720591218560.0, "grad_norm": 1.8311930089659938, "language_loss": 0.79205155, "learning_rate": 2.5390799096235783e-06, "loss": 0.81353945, "num_input_tokens_seen": 153691405, "step": 7164, "time_per_iteration": 2.753256320953369 }, { "auxiliary_loss_clip": 0.01138029, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.0510416, "balance_loss_mlp": 1.02608645, "epoch": 0.4307831053660003, "flos": 26177119464960.0, "grad_norm": 2.032413289263653, "language_loss": 0.67551947, "learning_rate": 2.538704852009177e-06, "loss": 0.69730175, "num_input_tokens_seen": 153711555, "step": 7165, "time_per_iteration": 2.719172477722168 }, { "auxiliary_loss_clip": 0.01106688, "auxiliary_loss_mlp": 0.00771886, "balance_loss_clip": 1.05042744, "balance_loss_mlp": 1.00068462, "epoch": 0.43084322861866825, "flos": 18910343715840.0, "grad_norm": 2.1027726489364436, "language_loss": 0.75451279, "learning_rate": 2.538329773967034e-06, "loss": 0.77329856, "num_input_tokens_seen": 153730095, "step": 7166, "time_per_iteration": 2.710304021835327 }, { "auxiliary_loss_clip": 0.01126475, "auxiliary_loss_mlp": 0.01036095, "balance_loss_clip": 1.05613852, "balance_loss_mlp": 1.02310109, "epoch": 0.4309033518713362, "flos": 26432911192320.0, "grad_norm": 1.6200122801673495, "language_loss": 0.71809006, "learning_rate": 2.537954675511372e-06, "loss": 0.7397157, "num_input_tokens_seen": 153749320, "step": 7167, "time_per_iteration": 2.676224946975708 }, { "auxiliary_loss_clip": 0.01104337, "auxiliary_loss_mlp": 0.00771035, "balance_loss_clip": 1.04866242, "balance_loss_mlp": 1.00059962, "epoch": 0.43096347512400424, "flos": 21213295274880.0, "grad_norm": 1.6573858575043368, "language_loss": 0.78183687, "learning_rate": 2.537579556656414e-06, "loss": 0.80059052, "num_input_tokens_seen": 153767825, "step": 7168, "time_per_iteration": 2.8030035495758057 }, { "auxiliary_loss_clip": 0.01111425, "auxiliary_loss_mlp": 0.0104262, "balance_loss_clip": 1.05006397, "balance_loss_mlp": 1.02867889, "epoch": 0.4310235983766722, "flos": 16540131939840.0, "grad_norm": 1.8701517899109106, "language_loss": 0.82348084, "learning_rate": 2.537204417416387e-06, "loss": 0.84502125, "num_input_tokens_seen": 153785350, "step": 7169, "time_per_iteration": 2.683119773864746 }, { "auxiliary_loss_clip": 0.01047083, "auxiliary_loss_mlp": 0.01001288, "balance_loss_clip": 1.03727269, "balance_loss_mlp": 0.99934483, "epoch": 0.43108372162934017, "flos": 64775704763520.0, "grad_norm": 0.7280845280825856, "language_loss": 0.60741472, "learning_rate": 2.5368292578055132e-06, "loss": 0.6278984, "num_input_tokens_seen": 153856400, "step": 7170, "time_per_iteration": 3.345574140548706 }, { "auxiliary_loss_clip": 0.01135698, "auxiliary_loss_mlp": 0.01037021, "balance_loss_clip": 1.05163968, "balance_loss_mlp": 1.02352667, "epoch": 0.43114384488200813, "flos": 13444094039040.0, "grad_norm": 1.7903297890514136, "language_loss": 0.75776696, "learning_rate": 2.536454077838021e-06, "loss": 0.77949417, "num_input_tokens_seen": 153875230, "step": 7171, "time_per_iteration": 2.612459897994995 }, { "auxiliary_loss_clip": 0.01120974, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.05036652, "balance_loss_mlp": 1.02106678, "epoch": 0.4312039681346761, "flos": 26286682924800.0, "grad_norm": 3.289099345654009, "language_loss": 0.77644551, "learning_rate": 2.5360788775281357e-06, "loss": 0.79800093, "num_input_tokens_seen": 153894740, "step": 7172, "time_per_iteration": 2.69909930229187 }, { "auxiliary_loss_clip": 0.01105721, "auxiliary_loss_mlp": 0.010481, "balance_loss_clip": 1.04574609, "balance_loss_mlp": 1.03119648, "epoch": 0.43126409138734406, "flos": 20376684627840.0, "grad_norm": 2.89880180493229, "language_loss": 0.76759243, "learning_rate": 2.535703656890086e-06, "loss": 0.78913063, "num_input_tokens_seen": 153913230, "step": 7173, "time_per_iteration": 2.6338369846343994 }, { "auxiliary_loss_clip": 0.01130423, "auxiliary_loss_mlp": 0.00772103, "balance_loss_clip": 1.04817533, "balance_loss_mlp": 1.00070202, "epoch": 0.431324214640012, "flos": 22123091882880.0, "grad_norm": 1.4474212501027515, "language_loss": 0.76933503, "learning_rate": 2.5353284159381e-06, "loss": 0.78836024, "num_input_tokens_seen": 153933250, "step": 7174, "time_per_iteration": 2.809385061264038 }, { "auxiliary_loss_clip": 0.01135393, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.0494926, "balance_loss_mlp": 1.02004063, "epoch": 0.43138433789268, "flos": 15231008856960.0, "grad_norm": 1.5868683627972313, "language_loss": 0.8226738, "learning_rate": 2.534953154686407e-06, "loss": 0.84438419, "num_input_tokens_seen": 153951325, "step": 7175, "time_per_iteration": 2.609368324279785 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.01052008, "balance_loss_clip": 1.0459013, "balance_loss_mlp": 1.03422189, "epoch": 0.43144446114534796, "flos": 18150294908160.0, "grad_norm": 2.243705003900615, "language_loss": 0.74261117, "learning_rate": 2.5345778731492366e-06, "loss": 0.76405489, "num_input_tokens_seen": 153966975, "step": 7176, "time_per_iteration": 2.680771827697754 }, { "auxiliary_loss_clip": 0.01122908, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.04637945, "balance_loss_mlp": 1.0215838, "epoch": 0.4315045843980159, "flos": 22929861306240.0, "grad_norm": 1.6403527990581428, "language_loss": 0.73309958, "learning_rate": 2.534202571340819e-06, "loss": 0.754686, "num_input_tokens_seen": 153986695, "step": 7177, "time_per_iteration": 2.760601758956909 }, { "auxiliary_loss_clip": 0.011222, "auxiliary_loss_mlp": 0.01043971, "balance_loss_clip": 1.05072641, "balance_loss_mlp": 1.02720976, "epoch": 0.4315647076506839, "flos": 22126862810880.0, "grad_norm": 1.7813773885441684, "language_loss": 0.81519645, "learning_rate": 2.533827249275387e-06, "loss": 0.83685815, "num_input_tokens_seen": 154004710, "step": 7178, "time_per_iteration": 2.6687469482421875 }, { "auxiliary_loss_clip": 0.01109607, "auxiliary_loss_mlp": 0.01033774, "balance_loss_clip": 1.04922378, "balance_loss_mlp": 1.02013087, "epoch": 0.43162483090335185, "flos": 26871129118080.0, "grad_norm": 32.445562208198496, "language_loss": 0.84143358, "learning_rate": 2.5334519069671725e-06, "loss": 0.86286741, "num_input_tokens_seen": 154024320, "step": 7179, "time_per_iteration": 2.696716547012329 }, { "auxiliary_loss_clip": 0.01108857, "auxiliary_loss_mlp": 0.010342, "balance_loss_clip": 1.04713559, "balance_loss_mlp": 1.0200026, "epoch": 0.4316849541560198, "flos": 13913122855680.0, "grad_norm": 1.7762155940538253, "language_loss": 0.75679082, "learning_rate": 2.5330765444304075e-06, "loss": 0.77822137, "num_input_tokens_seen": 154041755, "step": 7180, "time_per_iteration": 2.6832194328308105 }, { "auxiliary_loss_clip": 0.01104614, "auxiliary_loss_mlp": 0.00776174, "balance_loss_clip": 1.0417347, "balance_loss_mlp": 1.00057638, "epoch": 0.4317450774086878, "flos": 16435165420800.0, "grad_norm": 1.9971445999801452, "language_loss": 0.81773126, "learning_rate": 2.5327011616793274e-06, "loss": 0.83653915, "num_input_tokens_seen": 154056775, "step": 7181, "time_per_iteration": 2.6499931812286377 }, { "auxiliary_loss_clip": 0.01110303, "auxiliary_loss_mlp": 0.01040472, "balance_loss_clip": 1.04747176, "balance_loss_mlp": 1.02473664, "epoch": 0.4318052006613558, "flos": 20554980762240.0, "grad_norm": 1.7092925782952597, "language_loss": 0.89020073, "learning_rate": 2.532325758728165e-06, "loss": 0.91170847, "num_input_tokens_seen": 154075015, "step": 7182, "time_per_iteration": 2.6567654609680176 }, { "auxiliary_loss_clip": 0.01121856, "auxiliary_loss_mlp": 0.00772189, "balance_loss_clip": 1.05025744, "balance_loss_mlp": 1.00049865, "epoch": 0.43186532391402377, "flos": 22820046451200.0, "grad_norm": 1.602704996145881, "language_loss": 0.75739694, "learning_rate": 2.5319503355911566e-06, "loss": 0.77633733, "num_input_tokens_seen": 154095170, "step": 7183, "time_per_iteration": 2.6784613132476807 }, { "auxiliary_loss_clip": 0.01123979, "auxiliary_loss_mlp": 0.01032993, "balance_loss_clip": 1.05125499, "balance_loss_mlp": 1.01853919, "epoch": 0.43192544716669173, "flos": 25556583081600.0, "grad_norm": 1.538308227417617, "language_loss": 0.77589077, "learning_rate": 2.5315748922825393e-06, "loss": 0.7974605, "num_input_tokens_seen": 154116895, "step": 7184, "time_per_iteration": 2.6501550674438477 }, { "auxiliary_loss_clip": 0.01103086, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.04594743, "balance_loss_mlp": 1.02377832, "epoch": 0.4319855704193597, "flos": 30954674701440.0, "grad_norm": 1.7848849500644928, "language_loss": 0.73435313, "learning_rate": 2.5311994288165474e-06, "loss": 0.75576103, "num_input_tokens_seen": 154138395, "step": 7185, "time_per_iteration": 2.766298770904541 }, { "auxiliary_loss_clip": 0.01122479, "auxiliary_loss_mlp": 0.01042205, "balance_loss_clip": 1.05223203, "balance_loss_mlp": 1.02754247, "epoch": 0.43204569367202766, "flos": 24238732993920.0, "grad_norm": 3.4842964823639515, "language_loss": 0.75962853, "learning_rate": 2.530823945207421e-06, "loss": 0.78127533, "num_input_tokens_seen": 154156775, "step": 7186, "time_per_iteration": 4.334157705307007 }, { "auxiliary_loss_clip": 0.01099566, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.04762721, "balance_loss_mlp": 1.02477932, "epoch": 0.43210581692469563, "flos": 18406948561920.0, "grad_norm": 3.9729453010836218, "language_loss": 0.76471615, "learning_rate": 2.5304484414693962e-06, "loss": 0.78610301, "num_input_tokens_seen": 154177500, "step": 7187, "time_per_iteration": 5.956019401550293 }, { "auxiliary_loss_clip": 0.01025499, "auxiliary_loss_mlp": 0.01034135, "balance_loss_clip": 1.03011787, "balance_loss_mlp": 1.03272867, "epoch": 0.4321659401773636, "flos": 49832378910720.0, "grad_norm": 0.8609493763660439, "language_loss": 0.68115592, "learning_rate": 2.530072917616714e-06, "loss": 0.70175231, "num_input_tokens_seen": 154237110, "step": 7188, "time_per_iteration": 3.246208667755127 }, { "auxiliary_loss_clip": 0.01100014, "auxiliary_loss_mlp": 0.01038065, "balance_loss_clip": 1.0437665, "balance_loss_mlp": 1.02437973, "epoch": 0.43222606343003156, "flos": 17128564542720.0, "grad_norm": 1.9766532511253156, "language_loss": 0.77875316, "learning_rate": 2.529697373663614e-06, "loss": 0.80013394, "num_input_tokens_seen": 154253910, "step": 7189, "time_per_iteration": 2.681076765060425 }, { "auxiliary_loss_clip": 0.01083825, "auxiliary_loss_mlp": 0.01046889, "balance_loss_clip": 1.04553795, "balance_loss_mlp": 1.0314517, "epoch": 0.4322861866826995, "flos": 22749949059840.0, "grad_norm": 1.8062049350419371, "language_loss": 0.71379328, "learning_rate": 2.5293218096243364e-06, "loss": 0.73510039, "num_input_tokens_seen": 154274770, "step": 7190, "time_per_iteration": 2.785278081893921 }, { "auxiliary_loss_clip": 0.01109749, "auxiliary_loss_mlp": 0.01039244, "balance_loss_clip": 1.04681444, "balance_loss_mlp": 1.02500999, "epoch": 0.4323463099353675, "flos": 27891925729920.0, "grad_norm": 1.4390067860166444, "language_loss": 0.79639554, "learning_rate": 2.5289462255131223e-06, "loss": 0.81788546, "num_input_tokens_seen": 154295035, "step": 7191, "time_per_iteration": 4.571990728378296 }, { "auxiliary_loss_clip": 0.0108611, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.04733062, "balance_loss_mlp": 1.01954126, "epoch": 0.43240643318803546, "flos": 21614740652160.0, "grad_norm": 1.5570148329267672, "language_loss": 0.74904197, "learning_rate": 2.5285706213442146e-06, "loss": 0.77023631, "num_input_tokens_seen": 154314905, "step": 7192, "time_per_iteration": 2.7427282333374023 }, { "auxiliary_loss_clip": 0.01090847, "auxiliary_loss_mlp": 0.01047049, "balance_loss_clip": 1.04693365, "balance_loss_mlp": 1.03140879, "epoch": 0.4324665564407034, "flos": 17558378686080.0, "grad_norm": 2.028484656266998, "language_loss": 0.7934891, "learning_rate": 2.5281949971318557e-06, "loss": 0.81486803, "num_input_tokens_seen": 154331740, "step": 7193, "time_per_iteration": 2.708481550216675 }, { "auxiliary_loss_clip": 0.01114828, "auxiliary_loss_mlp": 0.0104506, "balance_loss_clip": 1.04726183, "balance_loss_mlp": 1.02971745, "epoch": 0.4325266796933714, "flos": 18402423448320.0, "grad_norm": 1.769737496980083, "language_loss": 0.75720823, "learning_rate": 2.5278193528902897e-06, "loss": 0.77880704, "num_input_tokens_seen": 154348740, "step": 7194, "time_per_iteration": 2.685701608657837 }, { "auxiliary_loss_clip": 0.01135356, "auxiliary_loss_mlp": 0.01041388, "balance_loss_clip": 1.05137146, "balance_loss_mlp": 1.02693963, "epoch": 0.4325868029460394, "flos": 22564793427840.0, "grad_norm": 3.855960133728433, "language_loss": 0.59479225, "learning_rate": 2.5274436886337613e-06, "loss": 0.61655968, "num_input_tokens_seen": 154368835, "step": 7195, "time_per_iteration": 2.634310483932495 }, { "auxiliary_loss_clip": 0.01112701, "auxiliary_loss_mlp": 0.01040238, "balance_loss_clip": 1.04618812, "balance_loss_mlp": 1.02434754, "epoch": 0.43264692619870737, "flos": 14605516396800.0, "grad_norm": 2.711649843090413, "language_loss": 0.65653574, "learning_rate": 2.527068004376515e-06, "loss": 0.67806506, "num_input_tokens_seen": 154384620, "step": 7196, "time_per_iteration": 2.608530044555664 }, { "auxiliary_loss_clip": 0.01141945, "auxiliary_loss_mlp": 0.01038972, "balance_loss_clip": 1.05338526, "balance_loss_mlp": 1.02316523, "epoch": 0.43270704945137534, "flos": 21501657659520.0, "grad_norm": 1.8654403969935065, "language_loss": 0.72525519, "learning_rate": 2.526692300132797e-06, "loss": 0.74706435, "num_input_tokens_seen": 154402865, "step": 7197, "time_per_iteration": 2.644087791442871 }, { "auxiliary_loss_clip": 0.01124491, "auxiliary_loss_mlp": 0.01040937, "balance_loss_clip": 1.05245936, "balance_loss_mlp": 1.02619135, "epoch": 0.4327671727040433, "flos": 25155891889920.0, "grad_norm": 1.511486884186769, "language_loss": 0.73146015, "learning_rate": 2.5263165759168547e-06, "loss": 0.75311446, "num_input_tokens_seen": 154423625, "step": 7198, "time_per_iteration": 2.7317864894866943 }, { "auxiliary_loss_clip": 0.0109556, "auxiliary_loss_mlp": 0.01034886, "balance_loss_clip": 1.04451466, "balance_loss_mlp": 1.02034283, "epoch": 0.43282729595671127, "flos": 25447163276160.0, "grad_norm": 1.539323937310933, "language_loss": 0.80887341, "learning_rate": 2.525940831742934e-06, "loss": 0.8301779, "num_input_tokens_seen": 154444775, "step": 7199, "time_per_iteration": 2.736016035079956 }, { "auxiliary_loss_clip": 0.01121231, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.05255413, "balance_loss_mlp": 1.0201118, "epoch": 0.43288741920937923, "flos": 24126116878080.0, "grad_norm": 2.6908376787400186, "language_loss": 0.68332666, "learning_rate": 2.525565067625286e-06, "loss": 0.70488322, "num_input_tokens_seen": 154460815, "step": 7200, "time_per_iteration": 2.688460350036621 }, { "auxiliary_loss_clip": 0.01114262, "auxiliary_loss_mlp": 0.00772856, "balance_loss_clip": 1.05025625, "balance_loss_mlp": 1.00067294, "epoch": 0.4329475424620472, "flos": 19204955066880.0, "grad_norm": 1.9560728888597885, "language_loss": 0.87379515, "learning_rate": 2.525189283578157e-06, "loss": 0.89266634, "num_input_tokens_seen": 154479145, "step": 7201, "time_per_iteration": 2.7547309398651123 }, { "auxiliary_loss_clip": 0.01086041, "auxiliary_loss_mlp": 0.01040787, "balance_loss_clip": 1.04952443, "balance_loss_mlp": 1.02395487, "epoch": 0.43300766571471516, "flos": 22638374438400.0, "grad_norm": 2.3345355752276706, "language_loss": 0.64547086, "learning_rate": 2.5248134796157974e-06, "loss": 0.66673917, "num_input_tokens_seen": 154498905, "step": 7202, "time_per_iteration": 2.878486156463623 }, { "auxiliary_loss_clip": 0.01082437, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.04730773, "balance_loss_mlp": 1.01676202, "epoch": 0.4330677889673831, "flos": 22121080721280.0, "grad_norm": 2.291722240352509, "language_loss": 0.81795621, "learning_rate": 2.5244376557524586e-06, "loss": 0.83908355, "num_input_tokens_seen": 154517270, "step": 7203, "time_per_iteration": 2.7338409423828125 }, { "auxiliary_loss_clip": 0.01102737, "auxiliary_loss_mlp": 0.01051208, "balance_loss_clip": 1.04656279, "balance_loss_mlp": 1.0357945, "epoch": 0.4331279122200511, "flos": 23221527742080.0, "grad_norm": 1.8864588919547398, "language_loss": 0.81453216, "learning_rate": 2.5240618120023912e-06, "loss": 0.83607161, "num_input_tokens_seen": 154535945, "step": 7204, "time_per_iteration": 2.7719802856445312 }, { "auxiliary_loss_clip": 0.01111895, "auxiliary_loss_mlp": 0.01038561, "balance_loss_clip": 1.04900229, "balance_loss_mlp": 1.02450609, "epoch": 0.43318803547271906, "flos": 18259750627200.0, "grad_norm": 2.1348551022614077, "language_loss": 0.73979616, "learning_rate": 2.5236859483798468e-06, "loss": 0.76130074, "num_input_tokens_seen": 154554935, "step": 7205, "time_per_iteration": 2.73463773727417 }, { "auxiliary_loss_clip": 0.01139834, "auxiliary_loss_mlp": 0.00772219, "balance_loss_clip": 1.05782342, "balance_loss_mlp": 1.00075722, "epoch": 0.433248158725387, "flos": 27418407713280.0, "grad_norm": 1.7497294767683989, "language_loss": 0.75183374, "learning_rate": 2.5233100648990803e-06, "loss": 0.77095425, "num_input_tokens_seen": 154576065, "step": 7206, "time_per_iteration": 2.712897300720215 }, { "auxiliary_loss_clip": 0.01082016, "auxiliary_loss_mlp": 0.01036402, "balance_loss_clip": 1.04904056, "balance_loss_mlp": 1.02218044, "epoch": 0.433308281978055, "flos": 23218008209280.0, "grad_norm": 5.825458886470942, "language_loss": 0.79041201, "learning_rate": 2.522934161574342e-06, "loss": 0.81159621, "num_input_tokens_seen": 154595110, "step": 7207, "time_per_iteration": 2.7708940505981445 }, { "auxiliary_loss_clip": 0.01104721, "auxiliary_loss_mlp": 0.01039597, "balance_loss_clip": 1.04836667, "balance_loss_mlp": 1.02374804, "epoch": 0.433368405230723, "flos": 15852407166720.0, "grad_norm": 1.8464623058117935, "language_loss": 0.81316662, "learning_rate": 2.5225582384198888e-06, "loss": 0.83460987, "num_input_tokens_seen": 154612255, "step": 7208, "time_per_iteration": 2.869554281234741 }, { "auxiliary_loss_clip": 0.01114033, "auxiliary_loss_mlp": 0.01033629, "balance_loss_clip": 1.04989004, "balance_loss_mlp": 1.01924682, "epoch": 0.433428528483391, "flos": 19026084314880.0, "grad_norm": 2.1101386955173154, "language_loss": 0.70337081, "learning_rate": 2.5221822954499744e-06, "loss": 0.72484744, "num_input_tokens_seen": 154630440, "step": 7209, "time_per_iteration": 2.692166805267334 }, { "auxiliary_loss_clip": 0.01122508, "auxiliary_loss_mlp": 0.0103772, "balance_loss_clip": 1.04924512, "balance_loss_mlp": 1.02234209, "epoch": 0.43348865173605894, "flos": 24718248581760.0, "grad_norm": 1.435580666015418, "language_loss": 0.81432891, "learning_rate": 2.5218063326788557e-06, "loss": 0.83593118, "num_input_tokens_seen": 154652515, "step": 7210, "time_per_iteration": 2.7368991374969482 }, { "auxiliary_loss_clip": 0.01111056, "auxiliary_loss_mlp": 0.01040693, "balance_loss_clip": 1.05043674, "balance_loss_mlp": 1.02690065, "epoch": 0.4335487749887269, "flos": 22090664880000.0, "grad_norm": 2.4268266327689005, "language_loss": 0.82382917, "learning_rate": 2.5214303501207885e-06, "loss": 0.84534657, "num_input_tokens_seen": 154670965, "step": 7211, "time_per_iteration": 2.6840522289276123 }, { "auxiliary_loss_clip": 0.01124683, "auxiliary_loss_mlp": 0.01036766, "balance_loss_clip": 1.04992187, "balance_loss_mlp": 1.02354002, "epoch": 0.43360889824139487, "flos": 22382941847040.0, "grad_norm": 1.7229238689988244, "language_loss": 0.74880648, "learning_rate": 2.521054347790029e-06, "loss": 0.77042103, "num_input_tokens_seen": 154689980, "step": 7212, "time_per_iteration": 2.6535651683807373 }, { "auxiliary_loss_clip": 0.01111992, "auxiliary_loss_mlp": 0.01035698, "balance_loss_clip": 1.05274439, "balance_loss_mlp": 1.0224421, "epoch": 0.43366902149406283, "flos": 17528286067200.0, "grad_norm": 1.7659929391516203, "language_loss": 0.76887298, "learning_rate": 2.5206783257008375e-06, "loss": 0.7903499, "num_input_tokens_seen": 154706570, "step": 7213, "time_per_iteration": 2.7639784812927246 }, { "auxiliary_loss_clip": 0.01127555, "auxiliary_loss_mlp": 0.01037213, "balance_loss_clip": 1.05343771, "balance_loss_mlp": 1.0235039, "epoch": 0.4337291447467308, "flos": 19022672522880.0, "grad_norm": 2.352447655586991, "language_loss": 0.64672804, "learning_rate": 2.520302283867471e-06, "loss": 0.66837579, "num_input_tokens_seen": 154725210, "step": 7214, "time_per_iteration": 2.6546545028686523 }, { "auxiliary_loss_clip": 0.01107197, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.04624152, "balance_loss_mlp": 1.02401102, "epoch": 0.43378926799939876, "flos": 27234042180480.0, "grad_norm": 1.8015946289097802, "language_loss": 0.71728516, "learning_rate": 2.519926222304191e-06, "loss": 0.73873264, "num_input_tokens_seen": 154745945, "step": 7215, "time_per_iteration": 2.7694337368011475 }, { "auxiliary_loss_clip": 0.01105367, "auxiliary_loss_mlp": 0.01038295, "balance_loss_clip": 1.04855013, "balance_loss_mlp": 1.02280354, "epoch": 0.43384939125206673, "flos": 15961108700160.0, "grad_norm": 2.003102925000143, "language_loss": 0.75037885, "learning_rate": 2.519550141025255e-06, "loss": 0.77181542, "num_input_tokens_seen": 154763580, "step": 7216, "time_per_iteration": 2.725843667984009 }, { "auxiliary_loss_clip": 0.01116821, "auxiliary_loss_mlp": 0.01045067, "balance_loss_clip": 1.05096495, "balance_loss_mlp": 1.02885413, "epoch": 0.4339095145047347, "flos": 21793216354560.0, "grad_norm": 2.430460894289381, "language_loss": 0.75723612, "learning_rate": 2.519174040044927e-06, "loss": 0.77885503, "num_input_tokens_seen": 154776825, "step": 7217, "time_per_iteration": 2.7089385986328125 }, { "auxiliary_loss_clip": 0.01100856, "auxiliary_loss_mlp": 0.01039272, "balance_loss_clip": 1.04884839, "balance_loss_mlp": 1.02414465, "epoch": 0.43396963775740266, "flos": 14209853109120.0, "grad_norm": 1.9588734650761437, "language_loss": 0.74091554, "learning_rate": 2.5187979193774664e-06, "loss": 0.76231682, "num_input_tokens_seen": 154794025, "step": 7218, "time_per_iteration": 2.6733574867248535 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01033005, "balance_loss_clip": 1.05125904, "balance_loss_mlp": 1.01892698, "epoch": 0.4340297610100706, "flos": 19719052473600.0, "grad_norm": 1.867471044964119, "language_loss": 0.69258481, "learning_rate": 2.5184217790371367e-06, "loss": 0.71399873, "num_input_tokens_seen": 154813105, "step": 7219, "time_per_iteration": 2.6384527683258057 }, { "auxiliary_loss_clip": 0.01103251, "auxiliary_loss_mlp": 0.01039305, "balance_loss_clip": 1.04848611, "balance_loss_mlp": 1.02513123, "epoch": 0.4340898842627386, "flos": 18953508885120.0, "grad_norm": 2.2592610231798274, "language_loss": 0.77296734, "learning_rate": 2.518045619038202e-06, "loss": 0.79439294, "num_input_tokens_seen": 154833525, "step": 7220, "time_per_iteration": 2.693434476852417 }, { "auxiliary_loss_clip": 0.01068716, "auxiliary_loss_mlp": 0.01037568, "balance_loss_clip": 1.04492617, "balance_loss_mlp": 1.02248216, "epoch": 0.4341500075154066, "flos": 22018304931840.0, "grad_norm": 2.0152794755447183, "language_loss": 0.6924417, "learning_rate": 2.5176694393949243e-06, "loss": 0.71350455, "num_input_tokens_seen": 154853090, "step": 7221, "time_per_iteration": 2.8318276405334473 }, { "auxiliary_loss_clip": 0.01126059, "auxiliary_loss_mlp": 0.01040304, "balance_loss_clip": 1.04850173, "balance_loss_mlp": 1.02628446, "epoch": 0.4342101307680746, "flos": 23582465556480.0, "grad_norm": 2.7538415889200554, "language_loss": 0.65288424, "learning_rate": 2.51729324012157e-06, "loss": 0.67454779, "num_input_tokens_seen": 154872055, "step": 7222, "time_per_iteration": 2.6848082542419434 }, { "auxiliary_loss_clip": 0.01095727, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.04434943, "balance_loss_mlp": 1.01868868, "epoch": 0.43427025402074254, "flos": 17967976450560.0, "grad_norm": 2.2547341093747884, "language_loss": 0.72800291, "learning_rate": 2.5169170212324053e-06, "loss": 0.74928898, "num_input_tokens_seen": 154886645, "step": 7223, "time_per_iteration": 2.6691431999206543 }, { "auxiliary_loss_clip": 0.0113251, "auxiliary_loss_mlp": 0.01035818, "balance_loss_clip": 1.04656434, "balance_loss_mlp": 1.02130401, "epoch": 0.4343303772734105, "flos": 26286395616000.0, "grad_norm": 1.8756720282844566, "language_loss": 0.93602765, "learning_rate": 2.516540782741694e-06, "loss": 0.95771086, "num_input_tokens_seen": 154906775, "step": 7224, "time_per_iteration": 2.667450189590454 }, { "auxiliary_loss_clip": 0.01092783, "auxiliary_loss_mlp": 0.01039248, "balance_loss_clip": 1.04234195, "balance_loss_mlp": 1.02426362, "epoch": 0.43439050052607847, "flos": 26833961520000.0, "grad_norm": 1.4167248746748424, "language_loss": 0.61521256, "learning_rate": 2.5161645246637056e-06, "loss": 0.63653284, "num_input_tokens_seen": 154926990, "step": 7225, "time_per_iteration": 4.334634304046631 }, { "auxiliary_loss_clip": 0.01107023, "auxiliary_loss_mlp": 0.00773069, "balance_loss_clip": 1.04763186, "balance_loss_mlp": 1.00081611, "epoch": 0.43445062377874644, "flos": 21397660807680.0, "grad_norm": 1.859930915167877, "language_loss": 0.77928364, "learning_rate": 2.5157882470127054e-06, "loss": 0.79808456, "num_input_tokens_seen": 154946210, "step": 7226, "time_per_iteration": 5.937607765197754 }, { "auxiliary_loss_clip": 0.01118617, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.047508, "balance_loss_mlp": 1.02045417, "epoch": 0.4345107470314144, "flos": 19901945548800.0, "grad_norm": 1.6822192052663985, "language_loss": 0.84638822, "learning_rate": 2.515411949802964e-06, "loss": 0.86791462, "num_input_tokens_seen": 154964995, "step": 7227, "time_per_iteration": 2.6521942615509033 }, { "auxiliary_loss_clip": 0.01117348, "auxiliary_loss_mlp": 0.0103842, "balance_loss_clip": 1.04574108, "balance_loss_mlp": 1.02328634, "epoch": 0.43457087028408237, "flos": 26432623883520.0, "grad_norm": 1.9493500401331498, "language_loss": 0.76725572, "learning_rate": 2.5150356330487498e-06, "loss": 0.78881335, "num_input_tokens_seen": 154984775, "step": 7228, "time_per_iteration": 2.6870598793029785 }, { "auxiliary_loss_clip": 0.01089608, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.04927957, "balance_loss_mlp": 1.02599132, "epoch": 0.43463099353675033, "flos": 31868816855040.0, "grad_norm": 1.513481048537933, "language_loss": 0.80442667, "learning_rate": 2.5146592967643324e-06, "loss": 0.82572889, "num_input_tokens_seen": 155008125, "step": 7229, "time_per_iteration": 2.9437830448150635 }, { "auxiliary_loss_clip": 0.01121336, "auxiliary_loss_mlp": 0.01045576, "balance_loss_clip": 1.04673219, "balance_loss_mlp": 1.03047252, "epoch": 0.4346911167894183, "flos": 24571266128640.0, "grad_norm": 2.5474712737755016, "language_loss": 0.81467843, "learning_rate": 2.5142829409639834e-06, "loss": 0.83634758, "num_input_tokens_seen": 155027885, "step": 7230, "time_per_iteration": 4.6465747356414795 }, { "auxiliary_loss_clip": 0.0111898, "auxiliary_loss_mlp": 0.01049467, "balance_loss_clip": 1.04806113, "balance_loss_mlp": 1.03399396, "epoch": 0.43475124004208626, "flos": 17090678672640.0, "grad_norm": 2.126712012780947, "language_loss": 0.76608211, "learning_rate": 2.513906565661973e-06, "loss": 0.78776658, "num_input_tokens_seen": 155043375, "step": 7231, "time_per_iteration": 2.668262243270874 }, { "auxiliary_loss_clip": 0.01085236, "auxiliary_loss_mlp": 0.010365, "balance_loss_clip": 1.04462624, "balance_loss_mlp": 1.02319062, "epoch": 0.4348113632947542, "flos": 26104615862400.0, "grad_norm": 1.4622957052763208, "language_loss": 0.6875934, "learning_rate": 2.513530170872575e-06, "loss": 0.70881081, "num_input_tokens_seen": 155062930, "step": 7232, "time_per_iteration": 2.7392327785491943 }, { "auxiliary_loss_clip": 0.01098662, "auxiliary_loss_mlp": 0.01036589, "balance_loss_clip": 1.04562938, "balance_loss_mlp": 1.02119923, "epoch": 0.4348714865474222, "flos": 34200496316160.0, "grad_norm": 1.6380302947056737, "language_loss": 0.72123957, "learning_rate": 2.5131537566100605e-06, "loss": 0.74259216, "num_input_tokens_seen": 155084980, "step": 7233, "time_per_iteration": 2.8322300910949707 }, { "auxiliary_loss_clip": 0.01073793, "auxiliary_loss_mlp": 0.01045709, "balance_loss_clip": 1.04429805, "balance_loss_mlp": 1.02930558, "epoch": 0.43493160980009016, "flos": 31537468869120.0, "grad_norm": 1.5095585359817736, "language_loss": 0.74440682, "learning_rate": 2.5127773228887053e-06, "loss": 0.76560181, "num_input_tokens_seen": 155107260, "step": 7234, "time_per_iteration": 2.9071762561798096 }, { "auxiliary_loss_clip": 0.011103, "auxiliary_loss_mlp": 0.01043772, "balance_loss_clip": 1.04619622, "balance_loss_mlp": 1.02835774, "epoch": 0.4349917330527582, "flos": 24061334699520.0, "grad_norm": 2.005736270415063, "language_loss": 0.59333825, "learning_rate": 2.512400869722782e-06, "loss": 0.61487895, "num_input_tokens_seen": 155126720, "step": 7235, "time_per_iteration": 2.6738569736480713 }, { "auxiliary_loss_clip": 0.01064764, "auxiliary_loss_mlp": 0.01055431, "balance_loss_clip": 1.03919065, "balance_loss_mlp": 1.03892064, "epoch": 0.43505185630542614, "flos": 30519329863680.0, "grad_norm": 1.6349929491691664, "language_loss": 0.77779961, "learning_rate": 2.512024397126566e-06, "loss": 0.79900157, "num_input_tokens_seen": 155148640, "step": 7236, "time_per_iteration": 2.8045287132263184 }, { "auxiliary_loss_clip": 0.01129354, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.04843307, "balance_loss_mlp": 1.02221155, "epoch": 0.4351119795580941, "flos": 15735158196480.0, "grad_norm": 1.6962419767837338, "language_loss": 0.81330889, "learning_rate": 2.5116479051143345e-06, "loss": 0.83497024, "num_input_tokens_seen": 155165870, "step": 7237, "time_per_iteration": 2.648671865463257 }, { "auxiliary_loss_clip": 0.01115513, "auxiliary_loss_mlp": 0.0103661, "balance_loss_clip": 1.04350662, "balance_loss_mlp": 1.02228153, "epoch": 0.4351721028107621, "flos": 18731760272640.0, "grad_norm": 3.1516026268664485, "language_loss": 0.62781835, "learning_rate": 2.5112713937003623e-06, "loss": 0.64933956, "num_input_tokens_seen": 155185315, "step": 7238, "time_per_iteration": 2.708812713623047 }, { "auxiliary_loss_clip": 0.01093861, "auxiliary_loss_mlp": 0.00771839, "balance_loss_clip": 1.04551601, "balance_loss_mlp": 1.00081944, "epoch": 0.43523222606343004, "flos": 25226887121280.0, "grad_norm": 1.9011673436513334, "language_loss": 0.85935599, "learning_rate": 2.510894862898928e-06, "loss": 0.87801301, "num_input_tokens_seen": 155205790, "step": 7239, "time_per_iteration": 2.7664706707000732 }, { "auxiliary_loss_clip": 0.01108836, "auxiliary_loss_mlp": 0.01032451, "balance_loss_clip": 1.04520702, "balance_loss_mlp": 1.01814556, "epoch": 0.435292349316098, "flos": 22709190101760.0, "grad_norm": 1.536559176560054, "language_loss": 0.7257551, "learning_rate": 2.510518312724309e-06, "loss": 0.747168, "num_input_tokens_seen": 155226475, "step": 7240, "time_per_iteration": 2.7275354862213135 }, { "auxiliary_loss_clip": 0.01096929, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.04623103, "balance_loss_mlp": 1.01821971, "epoch": 0.43535247256876597, "flos": 25775889569280.0, "grad_norm": 2.0741794573690613, "language_loss": 0.8174212, "learning_rate": 2.5101417431907842e-06, "loss": 0.83872074, "num_input_tokens_seen": 155247110, "step": 7241, "time_per_iteration": 2.7412314414978027 }, { "auxiliary_loss_clip": 0.01104486, "auxiliary_loss_mlp": 0.00773075, "balance_loss_clip": 1.04755354, "balance_loss_mlp": 1.000664, "epoch": 0.43541259582143393, "flos": 17528142412800.0, "grad_norm": 2.5029472103375627, "language_loss": 0.7954601, "learning_rate": 2.5097651543126345e-06, "loss": 0.81423575, "num_input_tokens_seen": 155261335, "step": 7242, "time_per_iteration": 2.7832155227661133 }, { "auxiliary_loss_clip": 0.01105652, "auxiliary_loss_mlp": 0.01038715, "balance_loss_clip": 1.04170573, "balance_loss_mlp": 1.0224551, "epoch": 0.4354727190741019, "flos": 15195205975680.0, "grad_norm": 5.632863009629144, "language_loss": 0.68174016, "learning_rate": 2.509388546104138e-06, "loss": 0.70318383, "num_input_tokens_seen": 155278510, "step": 7243, "time_per_iteration": 2.731621742248535 }, { "auxiliary_loss_clip": 0.01070337, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.04518962, "balance_loss_mlp": 1.02096963, "epoch": 0.43553284232676986, "flos": 16649264436480.0, "grad_norm": 1.737599591064028, "language_loss": 0.81023276, "learning_rate": 2.5090119185795766e-06, "loss": 0.83128881, "num_input_tokens_seen": 155296450, "step": 7244, "time_per_iteration": 2.869999885559082 }, { "auxiliary_loss_clip": 0.0107405, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.04502463, "balance_loss_mlp": 1.01974106, "epoch": 0.43559296557943783, "flos": 23400865370880.0, "grad_norm": 1.7613354011100055, "language_loss": 0.73543227, "learning_rate": 2.508635271753234e-06, "loss": 0.75650311, "num_input_tokens_seen": 155316080, "step": 7245, "time_per_iteration": 2.8238213062286377 }, { "auxiliary_loss_clip": 0.01073655, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.042413, "balance_loss_mlp": 1.02626252, "epoch": 0.4356530888321058, "flos": 22419067950720.0, "grad_norm": 1.8556670419976653, "language_loss": 0.76651436, "learning_rate": 2.508258605639389e-06, "loss": 0.78765202, "num_input_tokens_seen": 155336765, "step": 7246, "time_per_iteration": 2.74566912651062 }, { "auxiliary_loss_clip": 0.01117733, "auxiliary_loss_mlp": 0.01046964, "balance_loss_clip": 1.04482377, "balance_loss_mlp": 1.03185987, "epoch": 0.43571321208477376, "flos": 21616141282560.0, "grad_norm": 1.8292531725431629, "language_loss": 0.85409153, "learning_rate": 2.5078819202523275e-06, "loss": 0.8757385, "num_input_tokens_seen": 155356440, "step": 7247, "time_per_iteration": 2.6183457374572754 }, { "auxiliary_loss_clip": 0.01130523, "auxiliary_loss_mlp": 0.01039193, "balance_loss_clip": 1.047526, "balance_loss_mlp": 1.02565122, "epoch": 0.4357733353374418, "flos": 23987358639360.0, "grad_norm": 1.611147300467871, "language_loss": 0.72544634, "learning_rate": 2.507505215606333e-06, "loss": 0.74714351, "num_input_tokens_seen": 155377070, "step": 7248, "time_per_iteration": 2.614370822906494 }, { "auxiliary_loss_clip": 0.01120332, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.0502224, "balance_loss_mlp": 1.0246768, "epoch": 0.43583345859010975, "flos": 25264737077760.0, "grad_norm": 1.6765876969892934, "language_loss": 0.87089729, "learning_rate": 2.5071284917156893e-06, "loss": 0.89248699, "num_input_tokens_seen": 155398415, "step": 7249, "time_per_iteration": 2.6826605796813965 }, { "auxiliary_loss_clip": 0.01113045, "auxiliary_loss_mlp": 0.01045156, "balance_loss_clip": 1.04740214, "balance_loss_mlp": 1.03150034, "epoch": 0.4358935818427777, "flos": 23696302734720.0, "grad_norm": 2.0541786270405495, "language_loss": 0.81998801, "learning_rate": 2.506751748594683e-06, "loss": 0.84157008, "num_input_tokens_seen": 155415625, "step": 7250, "time_per_iteration": 2.6470022201538086 }, { "auxiliary_loss_clip": 0.01124271, "auxiliary_loss_mlp": 0.01035047, "balance_loss_clip": 1.05197597, "balance_loss_mlp": 1.02089727, "epoch": 0.4359537050954457, "flos": 29532827761920.0, "grad_norm": 1.9267289360456135, "language_loss": 0.84933323, "learning_rate": 2.5063749862575988e-06, "loss": 0.87092638, "num_input_tokens_seen": 155435505, "step": 7251, "time_per_iteration": 2.665776014328003 }, { "auxiliary_loss_clip": 0.01108984, "auxiliary_loss_mlp": 0.01043132, "balance_loss_clip": 1.04255629, "balance_loss_mlp": 1.02783751, "epoch": 0.43601382834811364, "flos": 22711273090560.0, "grad_norm": 2.7582881981862335, "language_loss": 0.69538188, "learning_rate": 2.5059982047187245e-06, "loss": 0.71690303, "num_input_tokens_seen": 155455425, "step": 7252, "time_per_iteration": 2.644498825073242 }, { "auxiliary_loss_clip": 0.01102038, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.04452658, "balance_loss_mlp": 1.02410412, "epoch": 0.4360739516007816, "flos": 19098731571840.0, "grad_norm": 2.1859211403409717, "language_loss": 0.83621645, "learning_rate": 2.505621403992348e-06, "loss": 0.85763383, "num_input_tokens_seen": 155474250, "step": 7253, "time_per_iteration": 2.662623882293701 }, { "auxiliary_loss_clip": 0.01119158, "auxiliary_loss_mlp": 0.01041761, "balance_loss_clip": 1.04809666, "balance_loss_mlp": 1.0271399, "epoch": 0.43613407485344957, "flos": 23404420817280.0, "grad_norm": 1.5459938146205512, "language_loss": 0.70561367, "learning_rate": 2.505244584092757e-06, "loss": 0.7272228, "num_input_tokens_seen": 155494685, "step": 7254, "time_per_iteration": 2.677427053451538 }, { "auxiliary_loss_clip": 0.01106538, "auxiliary_loss_mlp": 0.01041179, "balance_loss_clip": 1.04567051, "balance_loss_mlp": 1.02734506, "epoch": 0.43619419810611754, "flos": 22637799820800.0, "grad_norm": 1.8056505398017555, "language_loss": 0.812729, "learning_rate": 2.5048677450342406e-06, "loss": 0.83420616, "num_input_tokens_seen": 155513040, "step": 7255, "time_per_iteration": 2.7150163650512695 }, { "auxiliary_loss_clip": 0.01132135, "auxiliary_loss_mlp": 0.01040806, "balance_loss_clip": 1.04807031, "balance_loss_mlp": 1.02626252, "epoch": 0.4362543213587855, "flos": 20047958334720.0, "grad_norm": 1.9676871720710198, "language_loss": 0.7780782, "learning_rate": 2.504490886831089e-06, "loss": 0.79980761, "num_input_tokens_seen": 155530100, "step": 7256, "time_per_iteration": 2.551403522491455 }, { "auxiliary_loss_clip": 0.0112974, "auxiliary_loss_mlp": 0.01041375, "balance_loss_clip": 1.04864502, "balance_loss_mlp": 1.02721334, "epoch": 0.43631444461145347, "flos": 21361319222400.0, "grad_norm": 1.9475980639851616, "language_loss": 0.76180404, "learning_rate": 2.5041140094975922e-06, "loss": 0.78351521, "num_input_tokens_seen": 155549375, "step": 7257, "time_per_iteration": 2.6217384338378906 }, { "auxiliary_loss_clip": 0.01120044, "auxiliary_loss_mlp": 0.01042182, "balance_loss_clip": 1.04656029, "balance_loss_mlp": 1.02711391, "epoch": 0.43637456786412143, "flos": 22418529246720.0, "grad_norm": 1.6554456872207661, "language_loss": 0.73254454, "learning_rate": 2.5037371130480417e-06, "loss": 0.75416678, "num_input_tokens_seen": 155569395, "step": 7258, "time_per_iteration": 2.7399442195892334 }, { "auxiliary_loss_clip": 0.01107425, "auxiliary_loss_mlp": 0.01034778, "balance_loss_clip": 1.0456903, "balance_loss_mlp": 1.02084827, "epoch": 0.4364346911167894, "flos": 28548839612160.0, "grad_norm": 2.059273423297749, "language_loss": 0.76950562, "learning_rate": 2.5033601974967297e-06, "loss": 0.79092765, "num_input_tokens_seen": 155589090, "step": 7259, "time_per_iteration": 2.814030647277832 }, { "auxiliary_loss_clip": 0.01025258, "auxiliary_loss_mlp": 0.01002872, "balance_loss_clip": 1.02231717, "balance_loss_mlp": 1.0011797, "epoch": 0.43649481436945736, "flos": 62659345380480.0, "grad_norm": 0.7406116287283647, "language_loss": 0.56990582, "learning_rate": 2.5029832628579483e-06, "loss": 0.59018713, "num_input_tokens_seen": 155648660, "step": 7260, "time_per_iteration": 3.184105396270752 }, { "auxiliary_loss_clip": 0.01114574, "auxiliary_loss_mlp": 0.01046133, "balance_loss_clip": 1.04780877, "balance_loss_mlp": 1.03077888, "epoch": 0.4365549376221254, "flos": 30592120775040.0, "grad_norm": 2.4789338774629024, "language_loss": 0.71279275, "learning_rate": 2.5026063091459907e-06, "loss": 0.73439986, "num_input_tokens_seen": 155669945, "step": 7261, "time_per_iteration": 2.781569242477417 }, { "auxiliary_loss_clip": 0.01084597, "auxiliary_loss_mlp": 0.01054365, "balance_loss_clip": 1.04558206, "balance_loss_mlp": 1.0377475, "epoch": 0.43661506087479335, "flos": 17165875795200.0, "grad_norm": 1.8767730803011844, "language_loss": 0.69520628, "learning_rate": 2.5022293363751522e-06, "loss": 0.71659589, "num_input_tokens_seen": 155688555, "step": 7262, "time_per_iteration": 2.73209810256958 }, { "auxiliary_loss_clip": 0.0106364, "auxiliary_loss_mlp": 0.01034084, "balance_loss_clip": 1.04300487, "balance_loss_mlp": 1.02154875, "epoch": 0.4366751841274613, "flos": 22047499710720.0, "grad_norm": 1.5954483681391127, "language_loss": 0.79909682, "learning_rate": 2.501852344559726e-06, "loss": 0.82007402, "num_input_tokens_seen": 155705370, "step": 7263, "time_per_iteration": 2.7780513763427734 }, { "auxiliary_loss_clip": 0.01093795, "auxiliary_loss_mlp": 0.01046831, "balance_loss_clip": 1.0481534, "balance_loss_mlp": 1.03220403, "epoch": 0.4367353073801293, "flos": 15997306631040.0, "grad_norm": 1.6219151151282696, "language_loss": 0.7545082, "learning_rate": 2.50147533371401e-06, "loss": 0.77591443, "num_input_tokens_seen": 155721890, "step": 7264, "time_per_iteration": 4.158029079437256 }, { "auxiliary_loss_clip": 0.01079604, "auxiliary_loss_mlp": 0.01037561, "balance_loss_clip": 1.04681587, "balance_loss_mlp": 1.02243328, "epoch": 0.43679543063279724, "flos": 38217535868160.0, "grad_norm": 2.5655359697781854, "language_loss": 0.61799812, "learning_rate": 2.501098303852298e-06, "loss": 0.63916975, "num_input_tokens_seen": 155743970, "step": 7265, "time_per_iteration": 4.454209804534912 }, { "auxiliary_loss_clip": 0.01105521, "auxiliary_loss_mlp": 0.01031338, "balance_loss_clip": 1.04521823, "balance_loss_mlp": 1.01762891, "epoch": 0.4368555538854652, "flos": 15193230727680.0, "grad_norm": 2.0447032285328004, "language_loss": 0.72610664, "learning_rate": 2.5007212549888884e-06, "loss": 0.74747527, "num_input_tokens_seen": 155761830, "step": 7266, "time_per_iteration": 4.213090181350708 }, { "auxiliary_loss_clip": 0.0110385, "auxiliary_loss_mlp": 0.01037912, "balance_loss_clip": 1.04488015, "balance_loss_mlp": 1.02356541, "epoch": 0.4369156771381332, "flos": 23069086421760.0, "grad_norm": 1.8602157597317315, "language_loss": 0.82307518, "learning_rate": 2.5003441871380794e-06, "loss": 0.84449285, "num_input_tokens_seen": 155779610, "step": 7267, "time_per_iteration": 2.6675074100494385 }, { "auxiliary_loss_clip": 0.01126927, "auxiliary_loss_mlp": 0.01028504, "balance_loss_clip": 1.04546976, "balance_loss_mlp": 1.01499796, "epoch": 0.43697580039080114, "flos": 23441085624960.0, "grad_norm": 2.021840044875845, "language_loss": 0.74740797, "learning_rate": 2.4999671003141674e-06, "loss": 0.76896226, "num_input_tokens_seen": 155798765, "step": 7268, "time_per_iteration": 2.6228766441345215 }, { "auxiliary_loss_clip": 0.01135364, "auxiliary_loss_mlp": 0.01041324, "balance_loss_clip": 1.04851401, "balance_loss_mlp": 1.02567148, "epoch": 0.4370359236434691, "flos": 18514680428160.0, "grad_norm": 2.5093195722714365, "language_loss": 0.80133688, "learning_rate": 2.499589994531454e-06, "loss": 0.82310379, "num_input_tokens_seen": 155817750, "step": 7269, "time_per_iteration": 4.289510726928711 }, { "auxiliary_loss_clip": 0.01110775, "auxiliary_loss_mlp": 0.01036898, "balance_loss_clip": 1.04814553, "balance_loss_mlp": 1.02253354, "epoch": 0.43709604689613707, "flos": 23222497409280.0, "grad_norm": 1.7772509501505356, "language_loss": 0.74977714, "learning_rate": 2.499212869804237e-06, "loss": 0.77125382, "num_input_tokens_seen": 155836490, "step": 7270, "time_per_iteration": 2.7519397735595703 }, { "auxiliary_loss_clip": 0.01068873, "auxiliary_loss_mlp": 0.01045139, "balance_loss_clip": 1.04005837, "balance_loss_mlp": 1.02886677, "epoch": 0.43715617014880503, "flos": 23803711378560.0, "grad_norm": 1.9652522706029574, "language_loss": 0.79716229, "learning_rate": 2.4988357261468182e-06, "loss": 0.81830239, "num_input_tokens_seen": 155856225, "step": 7271, "time_per_iteration": 2.8002872467041016 }, { "auxiliary_loss_clip": 0.01036128, "auxiliary_loss_mlp": 0.01021454, "balance_loss_clip": 1.01824927, "balance_loss_mlp": 1.01974964, "epoch": 0.437216293401473, "flos": 61941204766080.0, "grad_norm": 0.7022630698763936, "language_loss": 0.54855651, "learning_rate": 2.4984585635734993e-06, "loss": 0.56913233, "num_input_tokens_seen": 155916770, "step": 7272, "time_per_iteration": 3.1893959045410156 }, { "auxiliary_loss_clip": 0.0113475, "auxiliary_loss_mlp": 0.01041916, "balance_loss_clip": 1.0497241, "balance_loss_mlp": 1.02704489, "epoch": 0.43727641665414096, "flos": 21982250655360.0, "grad_norm": 1.6582852351426143, "language_loss": 0.69981074, "learning_rate": 2.498081382098581e-06, "loss": 0.72157741, "num_input_tokens_seen": 155936490, "step": 7273, "time_per_iteration": 2.622006893157959 }, { "auxiliary_loss_clip": 0.01109468, "auxiliary_loss_mlp": 0.01050566, "balance_loss_clip": 1.04725552, "balance_loss_mlp": 1.03434145, "epoch": 0.437336539906809, "flos": 39530860842240.0, "grad_norm": 7.356047522605187, "language_loss": 0.75699592, "learning_rate": 2.497704181736367e-06, "loss": 0.77859622, "num_input_tokens_seen": 155957595, "step": 7274, "time_per_iteration": 2.850834846496582 }, { "auxiliary_loss_clip": 0.0111429, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.04778564, "balance_loss_mlp": 1.01473844, "epoch": 0.43739666315947695, "flos": 17457147181440.0, "grad_norm": 1.6567651402589496, "language_loss": 0.80280751, "learning_rate": 2.49732696250116e-06, "loss": 0.82422453, "num_input_tokens_seen": 155975710, "step": 7275, "time_per_iteration": 2.638493776321411 }, { "auxiliary_loss_clip": 0.01107442, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.04763556, "balance_loss_mlp": 1.02628231, "epoch": 0.4374567864121449, "flos": 16358747235840.0, "grad_norm": 1.960961081760492, "language_loss": 0.81285107, "learning_rate": 2.496949724407266e-06, "loss": 0.83432496, "num_input_tokens_seen": 155993090, "step": 7276, "time_per_iteration": 2.665069341659546 }, { "auxiliary_loss_clip": 0.01119385, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.05310118, "balance_loss_mlp": 1.01923609, "epoch": 0.4375169096648129, "flos": 30587523834240.0, "grad_norm": 1.9041346547019917, "language_loss": 0.7327143, "learning_rate": 2.496572467468988e-06, "loss": 0.75424743, "num_input_tokens_seen": 156013685, "step": 7277, "time_per_iteration": 2.7329320907592773 }, { "auxiliary_loss_clip": 0.01109724, "auxiliary_loss_mlp": 0.0077177, "balance_loss_clip": 1.04805493, "balance_loss_mlp": 1.00070667, "epoch": 0.43757703291748085, "flos": 30555599621760.0, "grad_norm": 1.7992627956176412, "language_loss": 0.73366892, "learning_rate": 2.4961951917006317e-06, "loss": 0.7524839, "num_input_tokens_seen": 156034300, "step": 7278, "time_per_iteration": 2.7531094551086426 }, { "auxiliary_loss_clip": 0.01094743, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.0471983, "balance_loss_mlp": 1.02677512, "epoch": 0.4376371561701488, "flos": 21397373498880.0, "grad_norm": 1.4932293615412522, "language_loss": 0.66024888, "learning_rate": 2.4958178971165046e-06, "loss": 0.68159282, "num_input_tokens_seen": 156053805, "step": 7279, "time_per_iteration": 2.671842336654663 }, { "auxiliary_loss_clip": 0.01139939, "auxiliary_loss_mlp": 0.01037875, "balance_loss_clip": 1.05298817, "balance_loss_mlp": 1.02337885, "epoch": 0.4376972794228168, "flos": 23404384903680.0, "grad_norm": 1.7693107777348598, "language_loss": 0.81793606, "learning_rate": 2.4954405837309126e-06, "loss": 0.83971423, "num_input_tokens_seen": 156073295, "step": 7280, "time_per_iteration": 2.588303565979004 }, { "auxiliary_loss_clip": 0.01106326, "auxiliary_loss_mlp": 0.01031835, "balance_loss_clip": 1.04587424, "balance_loss_mlp": 1.01867414, "epoch": 0.43775740267548474, "flos": 22892945103360.0, "grad_norm": 1.5627499875085749, "language_loss": 0.77005875, "learning_rate": 2.4950632515581653e-06, "loss": 0.79144037, "num_input_tokens_seen": 156094540, "step": 7281, "time_per_iteration": 2.6939706802368164 }, { "auxiliary_loss_clip": 0.011079, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.04824066, "balance_loss_mlp": 1.02360058, "epoch": 0.4378175259281527, "flos": 23294390480640.0, "grad_norm": 1.8010941727109018, "language_loss": 0.75983417, "learning_rate": 2.494685900612569e-06, "loss": 0.78128588, "num_input_tokens_seen": 156114070, "step": 7282, "time_per_iteration": 2.6834237575531006 }, { "auxiliary_loss_clip": 0.01092611, "auxiliary_loss_mlp": 0.01040673, "balance_loss_clip": 1.04627228, "balance_loss_mlp": 1.02654076, "epoch": 0.43787764918082067, "flos": 23876897339520.0, "grad_norm": 2.1500126437968925, "language_loss": 0.85044593, "learning_rate": 2.4943085309084333e-06, "loss": 0.87177879, "num_input_tokens_seen": 156132130, "step": 7283, "time_per_iteration": 2.7042722702026367 }, { "auxiliary_loss_clip": 0.01111303, "auxiliary_loss_mlp": 0.01037633, "balance_loss_clip": 1.04814124, "balance_loss_mlp": 1.02266598, "epoch": 0.43793777243348864, "flos": 23988148738560.0, "grad_norm": 14.144168664775597, "language_loss": 0.80311596, "learning_rate": 2.49393114246007e-06, "loss": 0.82460535, "num_input_tokens_seen": 156150820, "step": 7284, "time_per_iteration": 2.676689863204956 }, { "auxiliary_loss_clip": 0.01123026, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.04910016, "balance_loss_mlp": 1.02514315, "epoch": 0.4379978956861566, "flos": 18624064320000.0, "grad_norm": 2.0075840095153925, "language_loss": 0.80086255, "learning_rate": 2.493553735281787e-06, "loss": 0.82247692, "num_input_tokens_seen": 156170125, "step": 7285, "time_per_iteration": 2.6446423530578613 }, { "auxiliary_loss_clip": 0.01121831, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.04847312, "balance_loss_mlp": 1.0175761, "epoch": 0.43805801893882457, "flos": 21981388728960.0, "grad_norm": 2.1352627983894545, "language_loss": 0.7498579, "learning_rate": 2.493176309387897e-06, "loss": 0.77138615, "num_input_tokens_seen": 156187320, "step": 7286, "time_per_iteration": 2.6779184341430664 }, { "auxiliary_loss_clip": 0.01095439, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.04372525, "balance_loss_mlp": 1.0179832, "epoch": 0.43811814219149253, "flos": 26393337383040.0, "grad_norm": 1.5473009908217328, "language_loss": 0.73641115, "learning_rate": 2.492798864792712e-06, "loss": 0.75768864, "num_input_tokens_seen": 156207455, "step": 7287, "time_per_iteration": 2.867501735687256 }, { "auxiliary_loss_clip": 0.0111224, "auxiliary_loss_mlp": 0.01045008, "balance_loss_clip": 1.05047917, "balance_loss_mlp": 1.03040457, "epoch": 0.43817826544416055, "flos": 17493309198720.0, "grad_norm": 1.6804566494971647, "language_loss": 0.8243767, "learning_rate": 2.492421401510545e-06, "loss": 0.84594917, "num_input_tokens_seen": 156226560, "step": 7288, "time_per_iteration": 2.677922010421753 }, { "auxiliary_loss_clip": 0.01094679, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.04326773, "balance_loss_mlp": 1.01793718, "epoch": 0.4382383886968285, "flos": 21581020759680.0, "grad_norm": 1.441403002582157, "language_loss": 0.84301102, "learning_rate": 2.4920439195557093e-06, "loss": 0.86427689, "num_input_tokens_seen": 156246740, "step": 7289, "time_per_iteration": 2.8586435317993164 }, { "auxiliary_loss_clip": 0.0109844, "auxiliary_loss_mlp": 0.01052991, "balance_loss_clip": 1.04162121, "balance_loss_mlp": 1.03685021, "epoch": 0.4382985119494965, "flos": 27923742201600.0, "grad_norm": 1.6202567248687665, "language_loss": 0.78218126, "learning_rate": 2.4916664189425183e-06, "loss": 0.80369556, "num_input_tokens_seen": 156266440, "step": 7290, "time_per_iteration": 2.7211575508117676 }, { "auxiliary_loss_clip": 0.01132305, "auxiliary_loss_mlp": 0.01039679, "balance_loss_clip": 1.05053866, "balance_loss_mlp": 1.02617884, "epoch": 0.43835863520216445, "flos": 24936836797440.0, "grad_norm": 1.8734686520238957, "language_loss": 0.78314757, "learning_rate": 2.491288899685288e-06, "loss": 0.80486739, "num_input_tokens_seen": 156286900, "step": 7291, "time_per_iteration": 2.629904270172119 }, { "auxiliary_loss_clip": 0.0109159, "auxiliary_loss_mlp": 0.01033172, "balance_loss_clip": 1.04265332, "balance_loss_mlp": 1.0194335, "epoch": 0.4384187584548324, "flos": 33510293504640.0, "grad_norm": 1.5839432646062752, "language_loss": 0.6487931, "learning_rate": 2.4909113617983325e-06, "loss": 0.67004073, "num_input_tokens_seen": 156307690, "step": 7292, "time_per_iteration": 2.7952499389648438 }, { "auxiliary_loss_clip": 0.01112801, "auxiliary_loss_mlp": 0.01036982, "balance_loss_clip": 1.04319155, "balance_loss_mlp": 1.0226171, "epoch": 0.4384788817075004, "flos": 23951052967680.0, "grad_norm": 1.6336411060838572, "language_loss": 0.74232095, "learning_rate": 2.49053380529597e-06, "loss": 0.7638188, "num_input_tokens_seen": 156326620, "step": 7293, "time_per_iteration": 2.636462688446045 }, { "auxiliary_loss_clip": 0.01098755, "auxiliary_loss_mlp": 0.01037795, "balance_loss_clip": 1.0494585, "balance_loss_mlp": 1.02318609, "epoch": 0.43853900496016834, "flos": 19098516090240.0, "grad_norm": 4.136423906080754, "language_loss": 0.78758669, "learning_rate": 2.490156230192516e-06, "loss": 0.80895221, "num_input_tokens_seen": 156345495, "step": 7294, "time_per_iteration": 2.670069456100464 }, { "auxiliary_loss_clip": 0.01089917, "auxiliary_loss_mlp": 0.01038854, "balance_loss_clip": 1.04422832, "balance_loss_mlp": 1.02485299, "epoch": 0.4385991282128363, "flos": 13225362168960.0, "grad_norm": 1.7954692393859477, "language_loss": 0.7296086, "learning_rate": 2.4897786365022883e-06, "loss": 0.75089628, "num_input_tokens_seen": 156363155, "step": 7295, "time_per_iteration": 2.7159199714660645 }, { "auxiliary_loss_clip": 0.01090098, "auxiliary_loss_mlp": 0.01044926, "balance_loss_clip": 1.04397202, "balance_loss_mlp": 1.02860653, "epoch": 0.4386592514655043, "flos": 14319883445760.0, "grad_norm": 1.6136170201094728, "language_loss": 0.75463378, "learning_rate": 2.4894010242396063e-06, "loss": 0.77598405, "num_input_tokens_seen": 156380940, "step": 7296, "time_per_iteration": 2.7475438117980957 }, { "auxiliary_loss_clip": 0.01118725, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.04859519, "balance_loss_mlp": 1.0183568, "epoch": 0.43871937471817224, "flos": 22784423137920.0, "grad_norm": 1.7142829102326689, "language_loss": 0.69474953, "learning_rate": 2.4890233934187873e-06, "loss": 0.71626163, "num_input_tokens_seen": 156400415, "step": 7297, "time_per_iteration": 2.6689095497131348 }, { "auxiliary_loss_clip": 0.01111936, "auxiliary_loss_mlp": 0.01033453, "balance_loss_clip": 1.04589987, "balance_loss_mlp": 1.02004242, "epoch": 0.4387794979708402, "flos": 28072304853120.0, "grad_norm": 2.137486700340973, "language_loss": 0.70327055, "learning_rate": 2.4886457440541535e-06, "loss": 0.72472441, "num_input_tokens_seen": 156421120, "step": 7298, "time_per_iteration": 2.7896294593811035 }, { "auxiliary_loss_clip": 0.01117974, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.0481534, "balance_loss_mlp": 1.01508379, "epoch": 0.43883962122350817, "flos": 26249551240320.0, "grad_norm": 1.5518132007083414, "language_loss": 0.72407347, "learning_rate": 2.4882680761600238e-06, "loss": 0.74553907, "num_input_tokens_seen": 156441535, "step": 7299, "time_per_iteration": 2.724134922027588 }, { "auxiliary_loss_clip": 0.01100992, "auxiliary_loss_mlp": 0.00773554, "balance_loss_clip": 1.04556322, "balance_loss_mlp": 1.00063753, "epoch": 0.43889974447617613, "flos": 25883765089920.0, "grad_norm": 1.9116194577137513, "language_loss": 0.7702527, "learning_rate": 2.487890389750719e-06, "loss": 0.78899813, "num_input_tokens_seen": 156462015, "step": 7300, "time_per_iteration": 2.754582166671753 }, { "auxiliary_loss_clip": 0.01105938, "auxiliary_loss_mlp": 0.01033126, "balance_loss_clip": 1.04505253, "balance_loss_mlp": 1.01922047, "epoch": 0.43895986772884416, "flos": 25046615738880.0, "grad_norm": 1.6899733258560021, "language_loss": 0.70417237, "learning_rate": 2.4875126848405626e-06, "loss": 0.72556305, "num_input_tokens_seen": 156482165, "step": 7301, "time_per_iteration": 2.8213343620300293 }, { "auxiliary_loss_clip": 0.01082543, "auxiliary_loss_mlp": 0.01042943, "balance_loss_clip": 1.04282618, "balance_loss_mlp": 1.0270884, "epoch": 0.4390199909815121, "flos": 25994585525760.0, "grad_norm": 1.824867215084726, "language_loss": 0.70808041, "learning_rate": 2.4871349614438757e-06, "loss": 0.72933531, "num_input_tokens_seen": 156503170, "step": 7302, "time_per_iteration": 2.7875969409942627 }, { "auxiliary_loss_clip": 0.01107602, "auxiliary_loss_mlp": 0.01039104, "balance_loss_clip": 1.04878247, "balance_loss_mlp": 1.02599669, "epoch": 0.4390801142341801, "flos": 29022249888000.0, "grad_norm": 1.5936626078522842, "language_loss": 0.82381457, "learning_rate": 2.486757219574983e-06, "loss": 0.8452816, "num_input_tokens_seen": 156523005, "step": 7303, "time_per_iteration": 2.838871717453003 }, { "auxiliary_loss_clip": 0.01116821, "auxiliary_loss_mlp": 0.01046972, "balance_loss_clip": 1.04648411, "balance_loss_mlp": 1.03164792, "epoch": 0.43914023748684805, "flos": 33438544087680.0, "grad_norm": 10.027739157490931, "language_loss": 0.69036293, "learning_rate": 2.4863794592482067e-06, "loss": 0.71200085, "num_input_tokens_seen": 156544440, "step": 7304, "time_per_iteration": 5.9847636222839355 }, { "auxiliary_loss_clip": 0.01105223, "auxiliary_loss_mlp": 0.00770446, "balance_loss_clip": 1.04475939, "balance_loss_mlp": 1.0005337, "epoch": 0.439200360739516, "flos": 34531844302080.0, "grad_norm": 1.5264108649470638, "language_loss": 0.78100759, "learning_rate": 2.486001680477873e-06, "loss": 0.79976428, "num_input_tokens_seen": 156565410, "step": 7305, "time_per_iteration": 4.283693313598633 }, { "auxiliary_loss_clip": 0.01102752, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 1.04440284, "balance_loss_mlp": 1.02097106, "epoch": 0.439260483992184, "flos": 21907843632000.0, "grad_norm": 1.7445713343884877, "language_loss": 0.68756545, "learning_rate": 2.485623883278308e-06, "loss": 0.70893979, "num_input_tokens_seen": 156584210, "step": 7306, "time_per_iteration": 2.7069246768951416 }, { "auxiliary_loss_clip": 0.01089881, "auxiliary_loss_mlp": 0.01031704, "balance_loss_clip": 1.0450325, "balance_loss_mlp": 1.01757789, "epoch": 0.43932060724485195, "flos": 20996430912000.0, "grad_norm": 2.2471251539247428, "language_loss": 0.62507868, "learning_rate": 2.4852460676638344e-06, "loss": 0.64629447, "num_input_tokens_seen": 156602730, "step": 7307, "time_per_iteration": 2.719836950302124 }, { "auxiliary_loss_clip": 0.01130769, "auxiliary_loss_mlp": 0.01032376, "balance_loss_clip": 1.04645061, "balance_loss_mlp": 1.0188818, "epoch": 0.4393807304975199, "flos": 17747053850880.0, "grad_norm": 1.9621539490577573, "language_loss": 0.71752089, "learning_rate": 2.4848682336487828e-06, "loss": 0.73915237, "num_input_tokens_seen": 156619405, "step": 7308, "time_per_iteration": 4.218705892562866 }, { "auxiliary_loss_clip": 0.0110959, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.0438807, "balance_loss_mlp": 1.020859, "epoch": 0.4394408537501879, "flos": 22528523669760.0, "grad_norm": 1.855171270613647, "language_loss": 0.76671213, "learning_rate": 2.4844903812474787e-06, "loss": 0.78814828, "num_input_tokens_seen": 156638165, "step": 7309, "time_per_iteration": 2.726790428161621 }, { "auxiliary_loss_clip": 0.01111334, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.04383993, "balance_loss_mlp": 1.01888466, "epoch": 0.43950097700285584, "flos": 23440654661760.0, "grad_norm": 1.9900388133775502, "language_loss": 0.7067014, "learning_rate": 2.484112510474251e-06, "loss": 0.72813171, "num_input_tokens_seen": 156658845, "step": 7310, "time_per_iteration": 2.644737958908081 }, { "auxiliary_loss_clip": 0.01099363, "auxiliary_loss_mlp": 0.00771301, "balance_loss_clip": 1.04282653, "balance_loss_mlp": 1.00065351, "epoch": 0.4395611002555238, "flos": 23180696956800.0, "grad_norm": 2.0308560550957813, "language_loss": 0.76245713, "learning_rate": 2.483734621343429e-06, "loss": 0.78116381, "num_input_tokens_seen": 156677275, "step": 7311, "time_per_iteration": 2.676393985748291 }, { "auxiliary_loss_clip": 0.01118807, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.04605961, "balance_loss_mlp": 1.02365649, "epoch": 0.43962122350819177, "flos": 22127365601280.0, "grad_norm": 1.941188934607737, "language_loss": 0.81554043, "learning_rate": 2.483356713869341e-06, "loss": 0.83709824, "num_input_tokens_seen": 156695815, "step": 7312, "time_per_iteration": 2.734691858291626 }, { "auxiliary_loss_clip": 0.01099053, "auxiliary_loss_mlp": 0.01030855, "balance_loss_clip": 1.04661798, "balance_loss_mlp": 1.01802182, "epoch": 0.43968134676085974, "flos": 17420554200960.0, "grad_norm": 4.309677618927981, "language_loss": 0.85387003, "learning_rate": 2.482978788066318e-06, "loss": 0.8751691, "num_input_tokens_seen": 156714385, "step": 7313, "time_per_iteration": 2.7130918502807617 }, { "auxiliary_loss_clip": 0.01101603, "auxiliary_loss_mlp": 0.01034502, "balance_loss_clip": 1.04015613, "balance_loss_mlp": 1.02131104, "epoch": 0.43974147001352776, "flos": 18952646958720.0, "grad_norm": 1.7624997398560822, "language_loss": 0.67982185, "learning_rate": 2.4826008439486904e-06, "loss": 0.70118284, "num_input_tokens_seen": 156732615, "step": 7314, "time_per_iteration": 2.660019636154175 }, { "auxiliary_loss_clip": 0.01107647, "auxiliary_loss_mlp": 0.01029957, "balance_loss_clip": 1.04436517, "balance_loss_mlp": 1.01645088, "epoch": 0.4398015932661957, "flos": 18953508885120.0, "grad_norm": 1.864599678602129, "language_loss": 0.76799178, "learning_rate": 2.4822228815307915e-06, "loss": 0.78936785, "num_input_tokens_seen": 156750920, "step": 7315, "time_per_iteration": 2.6958022117614746 }, { "auxiliary_loss_clip": 0.01103713, "auxiliary_loss_mlp": 0.01033338, "balance_loss_clip": 1.04664755, "balance_loss_mlp": 1.02002192, "epoch": 0.4398617165188637, "flos": 24199913370240.0, "grad_norm": 2.581770130909348, "language_loss": 0.74439812, "learning_rate": 2.4818449008269523e-06, "loss": 0.76576865, "num_input_tokens_seen": 156768520, "step": 7316, "time_per_iteration": 2.7142746448516846 }, { "auxiliary_loss_clip": 0.01091829, "auxiliary_loss_mlp": 0.01038409, "balance_loss_clip": 1.04720306, "balance_loss_mlp": 1.02546883, "epoch": 0.43992183977153165, "flos": 22236677665920.0, "grad_norm": 2.381148700310756, "language_loss": 0.64676511, "learning_rate": 2.481466901851506e-06, "loss": 0.66806751, "num_input_tokens_seen": 156788700, "step": 7317, "time_per_iteration": 2.6647984981536865 }, { "auxiliary_loss_clip": 0.01100358, "auxiliary_loss_mlp": 0.01036318, "balance_loss_clip": 1.04315925, "balance_loss_mlp": 1.02252579, "epoch": 0.4399819630241996, "flos": 18697465762560.0, "grad_norm": 2.00656387252293, "language_loss": 0.79769003, "learning_rate": 2.4810888846187865e-06, "loss": 0.81905675, "num_input_tokens_seen": 156806470, "step": 7318, "time_per_iteration": 2.6569128036499023 }, { "auxiliary_loss_clip": 0.01085209, "auxiliary_loss_mlp": 0.0104302, "balance_loss_clip": 1.03973842, "balance_loss_mlp": 1.02808332, "epoch": 0.4400420862768676, "flos": 23879375377920.0, "grad_norm": 1.4911827600564649, "language_loss": 0.79173744, "learning_rate": 2.4807108491431283e-06, "loss": 0.81301975, "num_input_tokens_seen": 156825895, "step": 7319, "time_per_iteration": 2.7476212978363037 }, { "auxiliary_loss_clip": 0.01110516, "auxiliary_loss_mlp": 0.01041368, "balance_loss_clip": 1.0416882, "balance_loss_mlp": 1.02647328, "epoch": 0.44010220952953555, "flos": 28037615293440.0, "grad_norm": 1.9147413156076512, "language_loss": 0.80129063, "learning_rate": 2.4803327954388667e-06, "loss": 0.82280946, "num_input_tokens_seen": 156845990, "step": 7320, "time_per_iteration": 2.716813802719116 }, { "auxiliary_loss_clip": 0.01088202, "auxiliary_loss_mlp": 0.01041527, "balance_loss_clip": 1.04271483, "balance_loss_mlp": 1.02871788, "epoch": 0.4401623327822035, "flos": 23768985905280.0, "grad_norm": 3.0980421986856777, "language_loss": 0.69580001, "learning_rate": 2.4799547235203376e-06, "loss": 0.71709728, "num_input_tokens_seen": 156866685, "step": 7321, "time_per_iteration": 2.753053903579712 }, { "auxiliary_loss_clip": 0.01016924, "auxiliary_loss_mlp": 0.01013574, "balance_loss_clip": 1.02610326, "balance_loss_mlp": 1.01153517, "epoch": 0.4402224560348715, "flos": 70774583264640.0, "grad_norm": 0.8888992176827548, "language_loss": 0.56922823, "learning_rate": 2.4795766334018763e-06, "loss": 0.58953327, "num_input_tokens_seen": 156923450, "step": 7322, "time_per_iteration": 3.3513524532318115 }, { "auxiliary_loss_clip": 0.01073209, "auxiliary_loss_mlp": 0.01039777, "balance_loss_clip": 1.03671217, "balance_loss_mlp": 1.02677715, "epoch": 0.44028257928753944, "flos": 22891795868160.0, "grad_norm": 1.5589182914821764, "language_loss": 0.76272774, "learning_rate": 2.479198525097822e-06, "loss": 0.78385758, "num_input_tokens_seen": 156944795, "step": 7323, "time_per_iteration": 2.7524306774139404 }, { "auxiliary_loss_clip": 0.01119465, "auxiliary_loss_mlp": 0.01043388, "balance_loss_clip": 1.04591155, "balance_loss_mlp": 1.0296607, "epoch": 0.4403427025402074, "flos": 17895760156800.0, "grad_norm": 1.5124862196762965, "language_loss": 0.80590653, "learning_rate": 2.478820398622511e-06, "loss": 0.82753503, "num_input_tokens_seen": 156962755, "step": 7324, "time_per_iteration": 2.6558468341827393 }, { "auxiliary_loss_clip": 0.01025531, "auxiliary_loss_mlp": 0.0100492, "balance_loss_clip": 1.02356136, "balance_loss_mlp": 1.00322747, "epoch": 0.4404028257928754, "flos": 69562525708800.0, "grad_norm": 0.6843753140185513, "language_loss": 0.54592586, "learning_rate": 2.478442253990283e-06, "loss": 0.5662303, "num_input_tokens_seen": 157028095, "step": 7325, "time_per_iteration": 3.228588819503784 }, { "auxiliary_loss_clip": 0.01128033, "auxiliary_loss_mlp": 0.01028317, "balance_loss_clip": 1.04957604, "balance_loss_mlp": 1.0163784, "epoch": 0.44046294904554334, "flos": 20923675914240.0, "grad_norm": 1.4618535572581854, "language_loss": 0.70052326, "learning_rate": 2.4780640912154766e-06, "loss": 0.72208667, "num_input_tokens_seen": 157048365, "step": 7326, "time_per_iteration": 2.643843650817871 }, { "auxiliary_loss_clip": 0.01081906, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.03812075, "balance_loss_mlp": 1.01949978, "epoch": 0.44052307229821136, "flos": 23623475909760.0, "grad_norm": 1.533904509031544, "language_loss": 0.76754719, "learning_rate": 2.477685910312432e-06, "loss": 0.78869128, "num_input_tokens_seen": 157069130, "step": 7327, "time_per_iteration": 2.7409613132476807 }, { "auxiliary_loss_clip": 0.01097799, "auxiliary_loss_mlp": 0.01038346, "balance_loss_clip": 1.04025364, "balance_loss_mlp": 1.0256505, "epoch": 0.4405831955508793, "flos": 17597665186560.0, "grad_norm": 1.9457575580966853, "language_loss": 0.8413341, "learning_rate": 2.4773077112954897e-06, "loss": 0.86269557, "num_input_tokens_seen": 157084940, "step": 7328, "time_per_iteration": 2.6578822135925293 }, { "auxiliary_loss_clip": 0.01102477, "auxiliary_loss_mlp": 0.01028668, "balance_loss_clip": 1.04432774, "balance_loss_mlp": 1.01576972, "epoch": 0.4406433188035473, "flos": 21463376739840.0, "grad_norm": 2.377465022226765, "language_loss": 0.77753079, "learning_rate": 2.4769294941789908e-06, "loss": 0.79884225, "num_input_tokens_seen": 157102770, "step": 7329, "time_per_iteration": 2.6732001304626465 }, { "auxiliary_loss_clip": 0.01114069, "auxiliary_loss_mlp": 0.01039308, "balance_loss_clip": 1.04399741, "balance_loss_mlp": 1.02568269, "epoch": 0.44070344205621526, "flos": 22673566788480.0, "grad_norm": 1.63533295854216, "language_loss": 0.73525596, "learning_rate": 2.476551258977278e-06, "loss": 0.75678968, "num_input_tokens_seen": 157122035, "step": 7330, "time_per_iteration": 2.6258528232574463 }, { "auxiliary_loss_clip": 0.01104463, "auxiliary_loss_mlp": 0.01039279, "balance_loss_clip": 1.04494476, "balance_loss_mlp": 1.02678585, "epoch": 0.4407635653088832, "flos": 23441193365760.0, "grad_norm": 1.852759340776506, "language_loss": 0.74862218, "learning_rate": 2.4761730057046936e-06, "loss": 0.77005959, "num_input_tokens_seen": 157142800, "step": 7331, "time_per_iteration": 2.767972469329834 }, { "auxiliary_loss_clip": 0.01075234, "auxiliary_loss_mlp": 0.01034744, "balance_loss_clip": 1.04043937, "balance_loss_mlp": 1.02114189, "epoch": 0.4408236885615512, "flos": 24021294013440.0, "grad_norm": 1.4106194210898035, "language_loss": 0.76326358, "learning_rate": 2.475794734375581e-06, "loss": 0.78436339, "num_input_tokens_seen": 157163295, "step": 7332, "time_per_iteration": 2.7810683250427246 }, { "auxiliary_loss_clip": 0.01099425, "auxiliary_loss_mlp": 0.01041411, "balance_loss_clip": 1.04447377, "balance_loss_mlp": 1.02958584, "epoch": 0.44088381181421915, "flos": 12676826597760.0, "grad_norm": 1.919719554260373, "language_loss": 0.73795688, "learning_rate": 2.475416445004285e-06, "loss": 0.75936526, "num_input_tokens_seen": 157180890, "step": 7333, "time_per_iteration": 2.661736488342285 }, { "auxiliary_loss_clip": 0.01086658, "auxiliary_loss_mlp": 0.01034222, "balance_loss_clip": 1.04458117, "balance_loss_mlp": 1.02134728, "epoch": 0.4409439350668871, "flos": 24569865498240.0, "grad_norm": 1.5776913121160454, "language_loss": 0.79113179, "learning_rate": 2.4750381376051493e-06, "loss": 0.81234062, "num_input_tokens_seen": 157200580, "step": 7334, "time_per_iteration": 2.8023018836975098 }, { "auxiliary_loss_clip": 0.01102091, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.04475522, "balance_loss_mlp": 1.02343714, "epoch": 0.4410040583195551, "flos": 22668574798080.0, "grad_norm": 2.426268589885391, "language_loss": 0.75184131, "learning_rate": 2.47465981219252e-06, "loss": 0.77325642, "num_input_tokens_seen": 157218345, "step": 7335, "time_per_iteration": 2.7240371704101562 }, { "auxiliary_loss_clip": 0.01101432, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.04350579, "balance_loss_mlp": 1.02189362, "epoch": 0.44106418157222305, "flos": 10852528700160.0, "grad_norm": 1.9825426915131346, "language_loss": 0.72498572, "learning_rate": 2.4742814687807423e-06, "loss": 0.74635154, "num_input_tokens_seen": 157234395, "step": 7336, "time_per_iteration": 2.6489880084991455 }, { "auxiliary_loss_clip": 0.01118861, "auxiliary_loss_mlp": 0.01040583, "balance_loss_clip": 1.04398608, "balance_loss_mlp": 1.02684367, "epoch": 0.441124304824891, "flos": 21726710323200.0, "grad_norm": 2.2630715311051617, "language_loss": 0.62847346, "learning_rate": 2.473903107384165e-06, "loss": 0.65006793, "num_input_tokens_seen": 157254805, "step": 7337, "time_per_iteration": 2.632335901260376 }, { "auxiliary_loss_clip": 0.01029242, "auxiliary_loss_mlp": 0.00753616, "balance_loss_clip": 1.0181427, "balance_loss_mlp": 1.00070596, "epoch": 0.441184428077559, "flos": 63220486625280.0, "grad_norm": 0.7364595311582042, "language_loss": 0.52639711, "learning_rate": 2.473524728017134e-06, "loss": 0.54422569, "num_input_tokens_seen": 157317870, "step": 7338, "time_per_iteration": 3.253746509552002 }, { "auxiliary_loss_clip": 0.01106453, "auxiliary_loss_mlp": 0.01046288, "balance_loss_clip": 1.04105973, "balance_loss_mlp": 1.03120804, "epoch": 0.44124455133022694, "flos": 21177959270400.0, "grad_norm": 2.22639682548465, "language_loss": 0.70776093, "learning_rate": 2.473146330693997e-06, "loss": 0.7292884, "num_input_tokens_seen": 157336505, "step": 7339, "time_per_iteration": 2.655733823776245 }, { "auxiliary_loss_clip": 0.01053755, "auxiliary_loss_mlp": 0.01042988, "balance_loss_clip": 1.03682137, "balance_loss_mlp": 1.02918971, "epoch": 0.4413046745828949, "flos": 17457865453440.0, "grad_norm": 1.5022359473102205, "language_loss": 0.70075929, "learning_rate": 2.472767915429105e-06, "loss": 0.72172678, "num_input_tokens_seen": 157354995, "step": 7340, "time_per_iteration": 2.767920970916748 }, { "auxiliary_loss_clip": 0.01030747, "auxiliary_loss_mlp": 0.01003789, "balance_loss_clip": 1.02245617, "balance_loss_mlp": 1.00190568, "epoch": 0.4413647978355629, "flos": 61586153804160.0, "grad_norm": 0.8827965218567749, "language_loss": 0.63983381, "learning_rate": 2.4723894822368054e-06, "loss": 0.66017926, "num_input_tokens_seen": 157404260, "step": 7341, "time_per_iteration": 3.049508810043335 }, { "auxiliary_loss_clip": 0.01091178, "auxiliary_loss_mlp": 0.01040152, "balance_loss_clip": 1.0418849, "balance_loss_mlp": 1.02682424, "epoch": 0.4414249210882309, "flos": 27527001505920.0, "grad_norm": 2.055823294856648, "language_loss": 0.73636287, "learning_rate": 2.47201103113145e-06, "loss": 0.75767612, "num_input_tokens_seen": 157423045, "step": 7342, "time_per_iteration": 2.795201063156128 }, { "auxiliary_loss_clip": 0.01125069, "auxiliary_loss_mlp": 0.01041127, "balance_loss_clip": 1.04345822, "balance_loss_mlp": 1.02709007, "epoch": 0.44148504434089886, "flos": 23513984277120.0, "grad_norm": 2.2044048255358515, "language_loss": 0.79979384, "learning_rate": 2.4716325621273886e-06, "loss": 0.82145584, "num_input_tokens_seen": 157441815, "step": 7343, "time_per_iteration": 5.804108142852783 }, { "auxiliary_loss_clip": 0.010937, "auxiliary_loss_mlp": 0.01034348, "balance_loss_clip": 1.04503846, "balance_loss_mlp": 1.02072287, "epoch": 0.4415451675935668, "flos": 21580589796480.0, "grad_norm": 2.707350721832692, "language_loss": 0.76721787, "learning_rate": 2.4712540752389725e-06, "loss": 0.78849834, "num_input_tokens_seen": 157460470, "step": 7344, "time_per_iteration": 2.7370471954345703 }, { "auxiliary_loss_clip": 0.01038191, "auxiliary_loss_mlp": 0.01020913, "balance_loss_clip": 1.0274384, "balance_loss_mlp": 1.01902914, "epoch": 0.4416052908462348, "flos": 59006368126080.0, "grad_norm": 0.7980536604903562, "language_loss": 0.63813043, "learning_rate": 2.470875570480556e-06, "loss": 0.65872145, "num_input_tokens_seen": 157512655, "step": 7345, "time_per_iteration": 4.502060890197754 }, { "auxiliary_loss_clip": 0.01130065, "auxiliary_loss_mlp": 0.01040621, "balance_loss_clip": 1.04656529, "balance_loss_mlp": 1.02670372, "epoch": 0.44166541409890275, "flos": 26357642242560.0, "grad_norm": 1.8234046338758734, "language_loss": 0.86094856, "learning_rate": 2.470497047866489e-06, "loss": 0.88265538, "num_input_tokens_seen": 157533700, "step": 7346, "time_per_iteration": 2.697648763656616 }, { "auxiliary_loss_clip": 0.01119294, "auxiliary_loss_mlp": 0.0104301, "balance_loss_clip": 1.04583025, "balance_loss_mlp": 1.02862179, "epoch": 0.4417255373515707, "flos": 20192678231040.0, "grad_norm": 1.7966519054380148, "language_loss": 0.80474353, "learning_rate": 2.470118507411128e-06, "loss": 0.8263666, "num_input_tokens_seen": 157551105, "step": 7347, "time_per_iteration": 4.3498101234436035 }, { "auxiliary_loss_clip": 0.01107859, "auxiliary_loss_mlp": 0.01035246, "balance_loss_clip": 1.04878783, "balance_loss_mlp": 1.02088118, "epoch": 0.4417856606042387, "flos": 17887895078400.0, "grad_norm": 1.7585337264872751, "language_loss": 0.83156574, "learning_rate": 2.4697399491288263e-06, "loss": 0.85299683, "num_input_tokens_seen": 157568285, "step": 7348, "time_per_iteration": 2.6866180896759033 }, { "auxiliary_loss_clip": 0.01119234, "auxiliary_loss_mlp": 0.01035311, "balance_loss_clip": 1.04732084, "balance_loss_mlp": 1.02139926, "epoch": 0.44184578385690665, "flos": 27964034282880.0, "grad_norm": 2.0657656881846505, "language_loss": 0.70507312, "learning_rate": 2.469361373033938e-06, "loss": 0.72661853, "num_input_tokens_seen": 157590405, "step": 7349, "time_per_iteration": 2.7241854667663574 }, { "auxiliary_loss_clip": 0.0109864, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.04184258, "balance_loss_mlp": 1.01935983, "epoch": 0.4419059071095746, "flos": 23367899664000.0, "grad_norm": 1.9069897602324009, "language_loss": 0.74060279, "learning_rate": 2.468982779140819e-06, "loss": 0.76192582, "num_input_tokens_seen": 157607420, "step": 7350, "time_per_iteration": 2.724295139312744 }, { "auxiliary_loss_clip": 0.01129716, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.04692149, "balance_loss_mlp": 1.02279782, "epoch": 0.4419660303622426, "flos": 15012169246080.0, "grad_norm": 4.28906993354027, "language_loss": 0.81133771, "learning_rate": 2.468604167463827e-06, "loss": 0.83299923, "num_input_tokens_seen": 157624990, "step": 7351, "time_per_iteration": 2.6151175498962402 }, { "auxiliary_loss_clip": 0.01077442, "auxiliary_loss_mlp": 0.00770493, "balance_loss_clip": 1.03664398, "balance_loss_mlp": 1.00027013, "epoch": 0.44202615361491054, "flos": 25371750672000.0, "grad_norm": 1.4842739809833707, "language_loss": 0.72872806, "learning_rate": 2.4682255380173176e-06, "loss": 0.7472074, "num_input_tokens_seen": 157645300, "step": 7352, "time_per_iteration": 2.822618007659912 }, { "auxiliary_loss_clip": 0.01105652, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.05031562, "balance_loss_mlp": 1.01625896, "epoch": 0.4420862768675785, "flos": 24681116897280.0, "grad_norm": 2.2734813659209316, "language_loss": 0.87014645, "learning_rate": 2.467846890815649e-06, "loss": 0.89150345, "num_input_tokens_seen": 157664060, "step": 7353, "time_per_iteration": 2.8141496181488037 }, { "auxiliary_loss_clip": 0.01131466, "auxiliary_loss_mlp": 0.01036857, "balance_loss_clip": 1.04851007, "balance_loss_mlp": 1.02385104, "epoch": 0.44214640012024653, "flos": 19528437974400.0, "grad_norm": 2.0005767830632464, "language_loss": 0.75907683, "learning_rate": 2.4674682258731795e-06, "loss": 0.78076005, "num_input_tokens_seen": 157680905, "step": 7354, "time_per_iteration": 2.6416475772857666 }, { "auxiliary_loss_clip": 0.01087376, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.04345286, "balance_loss_mlp": 1.02218962, "epoch": 0.4422065233729145, "flos": 47557434003840.0, "grad_norm": 1.702490286843937, "language_loss": 0.64954734, "learning_rate": 2.467089543204268e-06, "loss": 0.67077219, "num_input_tokens_seen": 157701980, "step": 7355, "time_per_iteration": 2.9349570274353027 }, { "auxiliary_loss_clip": 0.01133882, "auxiliary_loss_mlp": 0.01035511, "balance_loss_clip": 1.04775596, "balance_loss_mlp": 1.02121234, "epoch": 0.44226664662558246, "flos": 19281050029440.0, "grad_norm": 1.8300716428477437, "language_loss": 0.78527248, "learning_rate": 2.466710842823274e-06, "loss": 0.80696642, "num_input_tokens_seen": 157720555, "step": 7356, "time_per_iteration": 2.5932910442352295 }, { "auxiliary_loss_clip": 0.01109756, "auxiliary_loss_mlp": 0.00771729, "balance_loss_clip": 1.04629183, "balance_loss_mlp": 1.0004859, "epoch": 0.4423267698782504, "flos": 17821820010240.0, "grad_norm": 1.6708598029973696, "language_loss": 0.77472621, "learning_rate": 2.4663321247445577e-06, "loss": 0.79354107, "num_input_tokens_seen": 157739160, "step": 7357, "time_per_iteration": 2.7050111293792725 }, { "auxiliary_loss_clip": 0.01102733, "auxiliary_loss_mlp": 0.01037231, "balance_loss_clip": 1.04357672, "balance_loss_mlp": 1.02280128, "epoch": 0.4423868931309184, "flos": 29204424691200.0, "grad_norm": 1.492131344457668, "language_loss": 0.73277801, "learning_rate": 2.465953388982481e-06, "loss": 0.75417769, "num_input_tokens_seen": 157760020, "step": 7358, "time_per_iteration": 2.7339792251586914 }, { "auxiliary_loss_clip": 0.01108517, "auxiliary_loss_mlp": 0.01035507, "balance_loss_clip": 1.04953265, "balance_loss_mlp": 1.02198911, "epoch": 0.44244701638358636, "flos": 29713135057920.0, "grad_norm": 1.890703165597896, "language_loss": 0.75731266, "learning_rate": 2.465574635551405e-06, "loss": 0.77875292, "num_input_tokens_seen": 157780435, "step": 7359, "time_per_iteration": 2.7597005367279053 }, { "auxiliary_loss_clip": 0.01106411, "auxiliary_loss_mlp": 0.01037766, "balance_loss_clip": 1.04658461, "balance_loss_mlp": 1.02315068, "epoch": 0.4425071396362543, "flos": 22930040874240.0, "grad_norm": 1.6679218305876244, "language_loss": 0.69988406, "learning_rate": 2.4651958644656923e-06, "loss": 0.72132587, "num_input_tokens_seen": 157799420, "step": 7360, "time_per_iteration": 2.7118403911590576 }, { "auxiliary_loss_clip": 0.01104133, "auxiliary_loss_mlp": 0.01032941, "balance_loss_clip": 1.04686546, "balance_loss_mlp": 1.01859379, "epoch": 0.4425672628889223, "flos": 19792346175360.0, "grad_norm": 3.404305353939149, "language_loss": 0.69860107, "learning_rate": 2.4648170757397053e-06, "loss": 0.71997184, "num_input_tokens_seen": 157817025, "step": 7361, "time_per_iteration": 2.672388792037964 }, { "auxiliary_loss_clip": 0.01105237, "auxiliary_loss_mlp": 0.01040581, "balance_loss_clip": 1.04377937, "balance_loss_mlp": 1.02539372, "epoch": 0.44262738614159025, "flos": 13662215377920.0, "grad_norm": 2.0698565080434888, "language_loss": 0.82494795, "learning_rate": 2.464438269387809e-06, "loss": 0.84640616, "num_input_tokens_seen": 157834345, "step": 7362, "time_per_iteration": 2.6258609294891357 }, { "auxiliary_loss_clip": 0.01102915, "auxiliary_loss_mlp": 0.01040381, "balance_loss_clip": 1.04801464, "balance_loss_mlp": 1.02494311, "epoch": 0.4426875093942582, "flos": 14210212245120.0, "grad_norm": 1.7089384580193987, "language_loss": 0.74628377, "learning_rate": 2.464059445424366e-06, "loss": 0.76771677, "num_input_tokens_seen": 157852290, "step": 7363, "time_per_iteration": 2.7868857383728027 }, { "auxiliary_loss_clip": 0.01008645, "auxiliary_loss_mlp": 0.01003596, "balance_loss_clip": 1.02228582, "balance_loss_mlp": 1.0016526, "epoch": 0.4427476326469262, "flos": 70117525728000.0, "grad_norm": 0.6804595751696751, "language_loss": 0.55677116, "learning_rate": 2.463680603863743e-06, "loss": 0.57689351, "num_input_tokens_seen": 157923060, "step": 7364, "time_per_iteration": 3.3737823963165283 }, { "auxiliary_loss_clip": 0.01109131, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.04670477, "balance_loss_mlp": 1.02778566, "epoch": 0.44280775589959415, "flos": 25445080287360.0, "grad_norm": 1.640155581598939, "language_loss": 0.74618137, "learning_rate": 2.463301744720305e-06, "loss": 0.76768118, "num_input_tokens_seen": 157944110, "step": 7365, "time_per_iteration": 2.789905071258545 }, { "auxiliary_loss_clip": 0.01099825, "auxiliary_loss_mlp": 0.01043396, "balance_loss_clip": 1.04348397, "balance_loss_mlp": 1.0287931, "epoch": 0.4428678791522621, "flos": 22857214049280.0, "grad_norm": 1.5674103047703387, "language_loss": 0.74297303, "learning_rate": 2.4629228680084184e-06, "loss": 0.76440525, "num_input_tokens_seen": 157964295, "step": 7366, "time_per_iteration": 2.700286626815796 }, { "auxiliary_loss_clip": 0.01108412, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.04708481, "balance_loss_mlp": 1.02240598, "epoch": 0.44292800240493013, "flos": 25812446636160.0, "grad_norm": 3.271133633367276, "language_loss": 0.73245466, "learning_rate": 2.46254397374245e-06, "loss": 0.75390375, "num_input_tokens_seen": 157983970, "step": 7367, "time_per_iteration": 2.6946957111358643 }, { "auxiliary_loss_clip": 0.01130142, "auxiliary_loss_mlp": 0.01040167, "balance_loss_clip": 1.04803169, "balance_loss_mlp": 1.02645779, "epoch": 0.4429881256575981, "flos": 32416885549440.0, "grad_norm": 1.566124307945558, "language_loss": 0.73996794, "learning_rate": 2.4621650619367677e-06, "loss": 0.76167101, "num_input_tokens_seen": 158006515, "step": 7368, "time_per_iteration": 2.7544407844543457 }, { "auxiliary_loss_clip": 0.01100906, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.04302347, "balance_loss_mlp": 1.01735687, "epoch": 0.44304824891026606, "flos": 22163707186560.0, "grad_norm": 2.0120848529023334, "language_loss": 0.7961669, "learning_rate": 2.4617861326057403e-06, "loss": 0.81748605, "num_input_tokens_seen": 158025565, "step": 7369, "time_per_iteration": 2.697190046310425 }, { "auxiliary_loss_clip": 0.010901, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.04244113, "balance_loss_mlp": 1.02251637, "epoch": 0.443108372162934, "flos": 25338569483520.0, "grad_norm": 1.9393131166495303, "language_loss": 0.72057104, "learning_rate": 2.461407185763737e-06, "loss": 0.74182796, "num_input_tokens_seen": 158045620, "step": 7370, "time_per_iteration": 2.7959940433502197 }, { "auxiliary_loss_clip": 0.01129082, "auxiliary_loss_mlp": 0.0103749, "balance_loss_clip": 1.04668999, "balance_loss_mlp": 1.02349448, "epoch": 0.443168495415602, "flos": 23330947547520.0, "grad_norm": 1.8535232870502223, "language_loss": 0.70380038, "learning_rate": 2.461028221425126e-06, "loss": 0.72546607, "num_input_tokens_seen": 158063505, "step": 7371, "time_per_iteration": 2.677718162536621 }, { "auxiliary_loss_clip": 0.01119855, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.0492835, "balance_loss_mlp": 1.01867962, "epoch": 0.44322861866826996, "flos": 21871502046720.0, "grad_norm": 2.0883513439310577, "language_loss": 0.68410224, "learning_rate": 2.4606492396042786e-06, "loss": 0.70561314, "num_input_tokens_seen": 158080335, "step": 7372, "time_per_iteration": 2.6676101684570312 }, { "auxiliary_loss_clip": 0.01096245, "auxiliary_loss_mlp": 0.0103489, "balance_loss_clip": 1.04236257, "balance_loss_mlp": 1.0203104, "epoch": 0.4432887419209379, "flos": 20084407660800.0, "grad_norm": 1.830573306058503, "language_loss": 0.83560812, "learning_rate": 2.4602702403155664e-06, "loss": 0.85691947, "num_input_tokens_seen": 158098955, "step": 7373, "time_per_iteration": 2.706554651260376 }, { "auxiliary_loss_clip": 0.0103821, "auxiliary_loss_mlp": 0.0100315, "balance_loss_clip": 1.01858282, "balance_loss_mlp": 1.00125432, "epoch": 0.4433488651736059, "flos": 70035540935040.0, "grad_norm": 0.769882260063621, "language_loss": 0.55201387, "learning_rate": 2.4598912235733604e-06, "loss": 0.57242751, "num_input_tokens_seen": 158164110, "step": 7374, "time_per_iteration": 3.2373340129852295 }, { "auxiliary_loss_clip": 0.01078736, "auxiliary_loss_mlp": 0.01042384, "balance_loss_clip": 1.04519641, "balance_loss_mlp": 1.02773309, "epoch": 0.44340898842627385, "flos": 16282472705280.0, "grad_norm": 2.3490774090653592, "language_loss": 0.8289665, "learning_rate": 2.4595121893920327e-06, "loss": 0.85017765, "num_input_tokens_seen": 158179850, "step": 7375, "time_per_iteration": 2.7468464374542236 }, { "auxiliary_loss_clip": 0.01129641, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.04680073, "balance_loss_mlp": 1.02032566, "epoch": 0.4434691116789418, "flos": 16611989097600.0, "grad_norm": 1.9296092769688273, "language_loss": 0.84076023, "learning_rate": 2.4591331377859578e-06, "loss": 0.86239868, "num_input_tokens_seen": 158196590, "step": 7376, "time_per_iteration": 2.5597686767578125 }, { "auxiliary_loss_clip": 0.01105366, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.04541779, "balance_loss_mlp": 1.02299011, "epoch": 0.4435292349316098, "flos": 19063251912960.0, "grad_norm": 1.7983383352892115, "language_loss": 0.77172405, "learning_rate": 2.4587540687695077e-06, "loss": 0.79314244, "num_input_tokens_seen": 158216355, "step": 7377, "time_per_iteration": 2.7065727710723877 }, { "auxiliary_loss_clip": 0.01111732, "auxiliary_loss_mlp": 0.01032577, "balance_loss_clip": 1.04586828, "balance_loss_mlp": 1.01916027, "epoch": 0.44358935818427775, "flos": 21251324799360.0, "grad_norm": 2.2025516465061568, "language_loss": 0.76422131, "learning_rate": 2.458374982357057e-06, "loss": 0.78566432, "num_input_tokens_seen": 158235825, "step": 7378, "time_per_iteration": 2.6680550575256348 }, { "auxiliary_loss_clip": 0.01104625, "auxiliary_loss_mlp": 0.01055785, "balance_loss_clip": 1.04471672, "balance_loss_mlp": 1.0404191, "epoch": 0.4436494814369457, "flos": 12495298239360.0, "grad_norm": 1.9484405267541265, "language_loss": 0.69165838, "learning_rate": 2.457995878562982e-06, "loss": 0.7132625, "num_input_tokens_seen": 158254230, "step": 7379, "time_per_iteration": 2.6700775623321533 }, { "auxiliary_loss_clip": 0.01063579, "auxiliary_loss_mlp": 0.01045674, "balance_loss_clip": 1.03913927, "balance_loss_mlp": 1.0297358, "epoch": 0.44370960468961373, "flos": 23659853408640.0, "grad_norm": 2.073474855716146, "language_loss": 0.7288872, "learning_rate": 2.457616757401656e-06, "loss": 0.74997967, "num_input_tokens_seen": 158273400, "step": 7380, "time_per_iteration": 2.8017635345458984 }, { "auxiliary_loss_clip": 0.01110205, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.04831696, "balance_loss_mlp": 1.02124155, "epoch": 0.4437697279422817, "flos": 32416849635840.0, "grad_norm": 1.6338701103198854, "language_loss": 0.64961064, "learning_rate": 2.457237618887458e-06, "loss": 0.67106432, "num_input_tokens_seen": 158296840, "step": 7381, "time_per_iteration": 2.791595458984375 }, { "auxiliary_loss_clip": 0.01120176, "auxiliary_loss_mlp": 0.0104083, "balance_loss_clip": 1.04781485, "balance_loss_mlp": 1.02696049, "epoch": 0.44382985119494966, "flos": 18112875914880.0, "grad_norm": 5.151492667638541, "language_loss": 0.80450714, "learning_rate": 2.456858463034763e-06, "loss": 0.82611728, "num_input_tokens_seen": 158314935, "step": 7382, "time_per_iteration": 4.177164316177368 }, { "auxiliary_loss_clip": 0.0112542, "auxiliary_loss_mlp": 0.01039884, "balance_loss_clip": 1.05130458, "balance_loss_mlp": 1.02599657, "epoch": 0.44388997444761763, "flos": 30774151923840.0, "grad_norm": 1.842434773727105, "language_loss": 0.65955621, "learning_rate": 2.456479289857949e-06, "loss": 0.68120921, "num_input_tokens_seen": 158334620, "step": 7383, "time_per_iteration": 4.142000436782837 }, { "auxiliary_loss_clip": 0.01104406, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.04357898, "balance_loss_mlp": 1.02228832, "epoch": 0.4439500977002856, "flos": 20339157893760.0, "grad_norm": 2.431816949897044, "language_loss": 0.76046586, "learning_rate": 2.4561000993713953e-06, "loss": 0.78187954, "num_input_tokens_seen": 158350550, "step": 7384, "time_per_iteration": 4.309042453765869 }, { "auxiliary_loss_clip": 0.01132692, "auxiliary_loss_mlp": 0.01040021, "balance_loss_clip": 1.04878867, "balance_loss_mlp": 1.02595425, "epoch": 0.44401022095295356, "flos": 20371225760640.0, "grad_norm": 1.6001418974541146, "language_loss": 0.81145859, "learning_rate": 2.4557208915894796e-06, "loss": 0.83318579, "num_input_tokens_seen": 158369555, "step": 7385, "time_per_iteration": 2.6569409370422363 }, { "auxiliary_loss_clip": 0.01085589, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.04551208, "balance_loss_mlp": 1.02062619, "epoch": 0.4440703442056215, "flos": 20230635928320.0, "grad_norm": 1.8953258070837995, "language_loss": 0.81531972, "learning_rate": 2.455341666526582e-06, "loss": 0.8365339, "num_input_tokens_seen": 158388045, "step": 7386, "time_per_iteration": 2.757857084274292 }, { "auxiliary_loss_clip": 0.01092623, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.04583073, "balance_loss_mlp": 1.01829553, "epoch": 0.4441304674582895, "flos": 39494698824960.0, "grad_norm": 2.1898431457791827, "language_loss": 0.70026255, "learning_rate": 2.4549624241970832e-06, "loss": 0.72152579, "num_input_tokens_seen": 158410115, "step": 7387, "time_per_iteration": 4.4056620597839355 }, { "auxiliary_loss_clip": 0.01064296, "auxiliary_loss_mlp": 0.01040123, "balance_loss_clip": 1.04571772, "balance_loss_mlp": 1.02586579, "epoch": 0.44419059071095746, "flos": 14829671220480.0, "grad_norm": 1.9497255625781733, "language_loss": 0.71838999, "learning_rate": 2.4545831646153628e-06, "loss": 0.73943412, "num_input_tokens_seen": 158427765, "step": 7388, "time_per_iteration": 2.7504312992095947 }, { "auxiliary_loss_clip": 0.01120562, "auxiliary_loss_mlp": 0.01036757, "balance_loss_clip": 1.04769969, "balance_loss_mlp": 1.02277958, "epoch": 0.4442507139636254, "flos": 22637835734400.0, "grad_norm": 1.8353800507100826, "language_loss": 0.6930418, "learning_rate": 2.4542038877958044e-06, "loss": 0.71461499, "num_input_tokens_seen": 158446375, "step": 7389, "time_per_iteration": 2.620847702026367 }, { "auxiliary_loss_clip": 0.01119935, "auxiliary_loss_mlp": 0.01035558, "balance_loss_clip": 1.04713047, "balance_loss_mlp": 1.02149689, "epoch": 0.4443108372162934, "flos": 38290721829120.0, "grad_norm": 1.8033342781314554, "language_loss": 0.75145507, "learning_rate": 2.453824593752788e-06, "loss": 0.77301002, "num_input_tokens_seen": 158467260, "step": 7390, "time_per_iteration": 2.794739246368408 }, { "auxiliary_loss_clip": 0.01112569, "auxiliary_loss_mlp": 0.0104339, "balance_loss_clip": 1.04474115, "balance_loss_mlp": 1.0285244, "epoch": 0.44437096046896135, "flos": 17748993185280.0, "grad_norm": 2.757944013002859, "language_loss": 0.8139115, "learning_rate": 2.4534452825006988e-06, "loss": 0.83547109, "num_input_tokens_seen": 158486720, "step": 7391, "time_per_iteration": 2.62081241607666 }, { "auxiliary_loss_clip": 0.01100157, "auxiliary_loss_mlp": 0.01039531, "balance_loss_clip": 1.04489446, "balance_loss_mlp": 1.02436733, "epoch": 0.4444310837216293, "flos": 13732348682880.0, "grad_norm": 1.7057692393428199, "language_loss": 0.73885345, "learning_rate": 2.4530659540539185e-06, "loss": 0.76025033, "num_input_tokens_seen": 158502530, "step": 7392, "time_per_iteration": 2.619123935699463 }, { "auxiliary_loss_clip": 0.01116796, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.04451931, "balance_loss_mlp": 1.01976895, "epoch": 0.44449120697429734, "flos": 25010238240000.0, "grad_norm": 1.6244243517648933, "language_loss": 0.79316819, "learning_rate": 2.4526866084268313e-06, "loss": 0.81466603, "num_input_tokens_seen": 158522715, "step": 7393, "time_per_iteration": 2.761636257171631 }, { "auxiliary_loss_clip": 0.01123845, "auxiliary_loss_mlp": 0.01034263, "balance_loss_clip": 1.04784608, "balance_loss_mlp": 1.02036357, "epoch": 0.4445513302269653, "flos": 32671707609600.0, "grad_norm": 1.7936817608261026, "language_loss": 0.80767369, "learning_rate": 2.4523072456338226e-06, "loss": 0.82925481, "num_input_tokens_seen": 158543615, "step": 7394, "time_per_iteration": 2.731896162033081 }, { "auxiliary_loss_clip": 0.01101431, "auxiliary_loss_mlp": 0.010406, "balance_loss_clip": 1.04235363, "balance_loss_mlp": 1.02805972, "epoch": 0.44461145347963327, "flos": 11655814504320.0, "grad_norm": 2.5483522979722886, "language_loss": 0.79701138, "learning_rate": 2.4519278656892785e-06, "loss": 0.81843174, "num_input_tokens_seen": 158560330, "step": 7395, "time_per_iteration": 2.6799733638763428 }, { "auxiliary_loss_clip": 0.0110231, "auxiliary_loss_mlp": 0.01040027, "balance_loss_clip": 1.04210639, "balance_loss_mlp": 1.02630031, "epoch": 0.44467157673230123, "flos": 20886759711360.0, "grad_norm": 1.725775342310971, "language_loss": 0.68280721, "learning_rate": 2.451548468607584e-06, "loss": 0.70423067, "num_input_tokens_seen": 158579735, "step": 7396, "time_per_iteration": 2.7539262771606445 }, { "auxiliary_loss_clip": 0.01115853, "auxiliary_loss_mlp": 0.00771942, "balance_loss_clip": 1.04396296, "balance_loss_mlp": 1.00035286, "epoch": 0.4447316999849692, "flos": 18546137763840.0, "grad_norm": 1.749232481773879, "language_loss": 0.80780083, "learning_rate": 2.451169054403126e-06, "loss": 0.82667875, "num_input_tokens_seen": 158597075, "step": 7397, "time_per_iteration": 2.6620333194732666 }, { "auxiliary_loss_clip": 0.01119828, "auxiliary_loss_mlp": 0.01038203, "balance_loss_clip": 1.04740441, "balance_loss_mlp": 1.02525663, "epoch": 0.44479182323763716, "flos": 23769057732480.0, "grad_norm": 1.6626939297991263, "language_loss": 0.67383635, "learning_rate": 2.450789623090293e-06, "loss": 0.69541669, "num_input_tokens_seen": 158616650, "step": 7398, "time_per_iteration": 2.671193838119507 }, { "auxiliary_loss_clip": 0.01097104, "auxiliary_loss_mlp": 0.01040281, "balance_loss_clip": 1.04477727, "balance_loss_mlp": 1.0271976, "epoch": 0.44485194649030513, "flos": 16543831040640.0, "grad_norm": 1.7055478439146432, "language_loss": 0.69250667, "learning_rate": 2.450410174683472e-06, "loss": 0.71388054, "num_input_tokens_seen": 158634515, "step": 7399, "time_per_iteration": 2.6823384761810303 }, { "auxiliary_loss_clip": 0.01097596, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.04475021, "balance_loss_mlp": 1.0225575, "epoch": 0.4449120697429731, "flos": 22600955445120.0, "grad_norm": 1.8287170900617375, "language_loss": 0.72332168, "learning_rate": 2.4500307091970514e-06, "loss": 0.74465525, "num_input_tokens_seen": 158653760, "step": 7400, "time_per_iteration": 2.7227253913879395 }, { "auxiliary_loss_clip": 0.01076093, "auxiliary_loss_mlp": 0.00770024, "balance_loss_clip": 1.04184151, "balance_loss_mlp": 1.00039887, "epoch": 0.44497219299564106, "flos": 20004864992640.0, "grad_norm": 1.6814996958378423, "language_loss": 0.85252142, "learning_rate": 2.449651226645422e-06, "loss": 0.87098259, "num_input_tokens_seen": 158672190, "step": 7401, "time_per_iteration": 2.757293701171875 }, { "auxiliary_loss_clip": 0.01102171, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.04564703, "balance_loss_mlp": 1.02497375, "epoch": 0.445032316248309, "flos": 25594253470080.0, "grad_norm": 1.6805452055908299, "language_loss": 0.83201802, "learning_rate": 2.449271727042973e-06, "loss": 0.85341299, "num_input_tokens_seen": 158694115, "step": 7402, "time_per_iteration": 2.7132928371429443 }, { "auxiliary_loss_clip": 0.01107267, "auxiliary_loss_mlp": 0.01032822, "balance_loss_clip": 1.0461576, "balance_loss_mlp": 1.0188688, "epoch": 0.445092439500977, "flos": 21250426959360.0, "grad_norm": 1.9019306445781163, "language_loss": 0.7714172, "learning_rate": 2.4488922104040947e-06, "loss": 0.79281807, "num_input_tokens_seen": 158711000, "step": 7403, "time_per_iteration": 2.6282217502593994 }, { "auxiliary_loss_clip": 0.01023728, "auxiliary_loss_mlp": 0.01005808, "balance_loss_clip": 1.0202831, "balance_loss_mlp": 1.00413918, "epoch": 0.44515256275364495, "flos": 57764900309760.0, "grad_norm": 0.7456605721636542, "language_loss": 0.59988129, "learning_rate": 2.4485126767431793e-06, "loss": 0.62017667, "num_input_tokens_seen": 158769675, "step": 7404, "time_per_iteration": 3.173560619354248 }, { "auxiliary_loss_clip": 0.01105136, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.04419279, "balance_loss_mlp": 1.02934957, "epoch": 0.4452126860063129, "flos": 15596004908160.0, "grad_norm": 1.6768296122026118, "language_loss": 0.82246673, "learning_rate": 2.4481331260746177e-06, "loss": 0.8439644, "num_input_tokens_seen": 158788215, "step": 7405, "time_per_iteration": 2.6669278144836426 }, { "auxiliary_loss_clip": 0.01104648, "auxiliary_loss_mlp": 0.01029929, "balance_loss_clip": 1.04628932, "balance_loss_mlp": 1.01669657, "epoch": 0.4452728092589809, "flos": 21617398258560.0, "grad_norm": 4.56209401129754, "language_loss": 0.75126898, "learning_rate": 2.4477535584128036e-06, "loss": 0.77261472, "num_input_tokens_seen": 158809090, "step": 7406, "time_per_iteration": 2.6722404956817627 }, { "auxiliary_loss_clip": 0.01091029, "auxiliary_loss_mlp": 0.01030298, "balance_loss_clip": 1.0434047, "balance_loss_mlp": 1.01746488, "epoch": 0.4453329325116489, "flos": 29497491757440.0, "grad_norm": 1.6633570284980403, "language_loss": 0.6572476, "learning_rate": 2.447373973772129e-06, "loss": 0.67846084, "num_input_tokens_seen": 158828320, "step": 7407, "time_per_iteration": 2.819289207458496 }, { "auxiliary_loss_clip": 0.01102137, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 1.04499328, "balance_loss_mlp": 1.0179081, "epoch": 0.44539305576431687, "flos": 21361139654400.0, "grad_norm": 1.6186505097592758, "language_loss": 0.67861688, "learning_rate": 2.4469943721669887e-06, "loss": 0.69995308, "num_input_tokens_seen": 158847040, "step": 7408, "time_per_iteration": 2.6846649646759033 }, { "auxiliary_loss_clip": 0.01128678, "auxiliary_loss_mlp": 0.01035504, "balance_loss_clip": 1.04559541, "balance_loss_mlp": 1.02121043, "epoch": 0.44545317901698483, "flos": 41427626428800.0, "grad_norm": 1.4740715510068387, "language_loss": 0.72127414, "learning_rate": 2.4466147536117776e-06, "loss": 0.74291599, "num_input_tokens_seen": 158870490, "step": 7409, "time_per_iteration": 2.7701869010925293 }, { "auxiliary_loss_clip": 0.01107577, "auxiliary_loss_mlp": 0.010375, "balance_loss_clip": 1.04669523, "balance_loss_mlp": 1.02308798, "epoch": 0.4455133022696528, "flos": 22055005653120.0, "grad_norm": 1.9118661854704846, "language_loss": 0.65146017, "learning_rate": 2.4462351181208895e-06, "loss": 0.67291093, "num_input_tokens_seen": 158889920, "step": 7410, "time_per_iteration": 2.780905246734619 }, { "auxiliary_loss_clip": 0.01104956, "auxiliary_loss_mlp": 0.01038448, "balance_loss_clip": 1.04414868, "balance_loss_mlp": 1.02369618, "epoch": 0.44557342552232077, "flos": 23476960333440.0, "grad_norm": 2.076728084707015, "language_loss": 0.73772335, "learning_rate": 2.4458554657087217e-06, "loss": 0.75915742, "num_input_tokens_seen": 158909580, "step": 7411, "time_per_iteration": 2.745547294616699 }, { "auxiliary_loss_clip": 0.01061885, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.04457641, "balance_loss_mlp": 1.01967764, "epoch": 0.44563354877498873, "flos": 19134678107520.0, "grad_norm": 1.7330985507109689, "language_loss": 0.79373199, "learning_rate": 2.4454757963896695e-06, "loss": 0.81468445, "num_input_tokens_seen": 158924600, "step": 7412, "time_per_iteration": 2.76361346244812 }, { "auxiliary_loss_clip": 0.01108589, "auxiliary_loss_mlp": 0.01037974, "balance_loss_clip": 1.04357016, "balance_loss_mlp": 1.02453899, "epoch": 0.4456936720276567, "flos": 13621420506240.0, "grad_norm": 1.9356381581130233, "language_loss": 0.80161285, "learning_rate": 2.4450961101781304e-06, "loss": 0.82307845, "num_input_tokens_seen": 158939345, "step": 7413, "time_per_iteration": 2.619915008544922 }, { "auxiliary_loss_clip": 0.01113419, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.0433104, "balance_loss_mlp": 1.01962328, "epoch": 0.44575379528032466, "flos": 14713715139840.0, "grad_norm": 1.9889124982728665, "language_loss": 0.76648301, "learning_rate": 2.4447164070885026e-06, "loss": 0.78794879, "num_input_tokens_seen": 158955855, "step": 7414, "time_per_iteration": 2.5959794521331787 }, { "auxiliary_loss_clip": 0.01096052, "auxiliary_loss_mlp": 0.01040946, "balance_loss_clip": 1.0415467, "balance_loss_mlp": 1.02701616, "epoch": 0.4458139185329926, "flos": 24170682677760.0, "grad_norm": 1.6599120729875612, "language_loss": 0.83765483, "learning_rate": 2.4443366871351837e-06, "loss": 0.85902476, "num_input_tokens_seen": 158976315, "step": 7415, "time_per_iteration": 2.785512685775757 }, { "auxiliary_loss_clip": 0.01124247, "auxiliary_loss_mlp": 0.01043831, "balance_loss_clip": 1.04321933, "balance_loss_mlp": 1.03093266, "epoch": 0.4458740417856606, "flos": 21762225895680.0, "grad_norm": 2.1888037109264933, "language_loss": 0.84245199, "learning_rate": 2.4439569503325732e-06, "loss": 0.86413276, "num_input_tokens_seen": 158996725, "step": 7416, "time_per_iteration": 2.60307240486145 }, { "auxiliary_loss_clip": 0.01095417, "auxiliary_loss_mlp": 0.01034003, "balance_loss_clip": 1.04398692, "balance_loss_mlp": 1.01991272, "epoch": 0.44593416503832856, "flos": 21068790860160.0, "grad_norm": 1.494230693182331, "language_loss": 0.81091261, "learning_rate": 2.4435771966950706e-06, "loss": 0.83220685, "num_input_tokens_seen": 159017255, "step": 7417, "time_per_iteration": 2.7423362731933594 }, { "auxiliary_loss_clip": 0.01105133, "auxiliary_loss_mlp": 0.01040379, "balance_loss_clip": 1.04227042, "balance_loss_mlp": 1.02650881, "epoch": 0.4459942882909965, "flos": 22600488568320.0, "grad_norm": 2.47121292521638, "language_loss": 0.81035185, "learning_rate": 2.443197426237077e-06, "loss": 0.8318069, "num_input_tokens_seen": 159035010, "step": 7418, "time_per_iteration": 2.67476487159729 }, { "auxiliary_loss_clip": 0.01120234, "auxiliary_loss_mlp": 0.007712, "balance_loss_clip": 1.04618478, "balance_loss_mlp": 1.00049162, "epoch": 0.4460544115436645, "flos": 26505486622080.0, "grad_norm": 2.084312717643635, "language_loss": 0.77342117, "learning_rate": 2.442817638972991e-06, "loss": 0.79233551, "num_input_tokens_seen": 159055345, "step": 7419, "time_per_iteration": 2.760847806930542 }, { "auxiliary_loss_clip": 0.0108993, "auxiliary_loss_mlp": 0.0103388, "balance_loss_clip": 1.03954124, "balance_loss_mlp": 1.02063632, "epoch": 0.4461145347963325, "flos": 17604021893760.0, "grad_norm": 1.824664612180611, "language_loss": 0.72570968, "learning_rate": 2.4424378349172176e-06, "loss": 0.74694777, "num_input_tokens_seen": 159074225, "step": 7420, "time_per_iteration": 2.6990244388580322 }, { "auxiliary_loss_clip": 0.01104512, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.0432508, "balance_loss_mlp": 1.01793802, "epoch": 0.44617465804900047, "flos": 27268193036160.0, "grad_norm": 1.5590654083825235, "language_loss": 0.75280499, "learning_rate": 2.442058014084156e-06, "loss": 0.77417064, "num_input_tokens_seen": 159095415, "step": 7421, "time_per_iteration": 2.751757860183716 }, { "auxiliary_loss_clip": 0.01059239, "auxiliary_loss_mlp": 0.01037453, "balance_loss_clip": 1.03808808, "balance_loss_mlp": 1.02374959, "epoch": 0.44623478130166844, "flos": 17786412178560.0, "grad_norm": 1.7359325284030627, "language_loss": 0.75753498, "learning_rate": 2.44167817648821e-06, "loss": 0.77850193, "num_input_tokens_seen": 159114615, "step": 7422, "time_per_iteration": 4.3189520835876465 }, { "auxiliary_loss_clip": 0.01125756, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.04443765, "balance_loss_mlp": 1.02083755, "epoch": 0.4462949045543364, "flos": 23003011353600.0, "grad_norm": 1.436007196155178, "language_loss": 0.65393054, "learning_rate": 2.441298322143784e-06, "loss": 0.67552686, "num_input_tokens_seen": 159134370, "step": 7423, "time_per_iteration": 4.272382020950317 }, { "auxiliary_loss_clip": 0.01096555, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.04093194, "balance_loss_mlp": 1.02195287, "epoch": 0.44635502780700437, "flos": 17820096157440.0, "grad_norm": 1.6490570846190094, "language_loss": 0.79002917, "learning_rate": 2.4409184510652807e-06, "loss": 0.8113389, "num_input_tokens_seen": 159152540, "step": 7424, "time_per_iteration": 2.6641786098480225 }, { "auxiliary_loss_clip": 0.01109138, "auxiliary_loss_mlp": 0.01031872, "balance_loss_clip": 1.04272473, "balance_loss_mlp": 1.01960564, "epoch": 0.44641515105967233, "flos": 26688020561280.0, "grad_norm": 1.5476168372337398, "language_loss": 0.80515361, "learning_rate": 2.4405385632671063e-06, "loss": 0.82656378, "num_input_tokens_seen": 159173425, "step": 7425, "time_per_iteration": 2.677921772003174 }, { "auxiliary_loss_clip": 0.01111593, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.04249597, "balance_loss_mlp": 1.01805556, "epoch": 0.4464752743123403, "flos": 18913324544640.0, "grad_norm": 1.7505920916906397, "language_loss": 0.77314126, "learning_rate": 2.4401586587636655e-06, "loss": 0.79456341, "num_input_tokens_seen": 159191210, "step": 7426, "time_per_iteration": 4.264745712280273 }, { "auxiliary_loss_clip": 0.01098153, "auxiliary_loss_mlp": 0.00770786, "balance_loss_clip": 1.04180968, "balance_loss_mlp": 1.00042045, "epoch": 0.44653539756500826, "flos": 29570318582400.0, "grad_norm": 2.512425150903693, "language_loss": 0.64678168, "learning_rate": 2.4397787375693634e-06, "loss": 0.66547108, "num_input_tokens_seen": 159211755, "step": 7427, "time_per_iteration": 2.746807336807251 }, { "auxiliary_loss_clip": 0.01114285, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.04756093, "balance_loss_mlp": 1.01968026, "epoch": 0.44659552081767623, "flos": 21468979261440.0, "grad_norm": 1.6794687580888963, "language_loss": 0.7564522, "learning_rate": 2.439398799698608e-06, "loss": 0.77792168, "num_input_tokens_seen": 159230315, "step": 7428, "time_per_iteration": 2.675830364227295 }, { "auxiliary_loss_clip": 0.01089417, "auxiliary_loss_mlp": 0.0103803, "balance_loss_clip": 1.03992331, "balance_loss_mlp": 1.0244813, "epoch": 0.4466556440703442, "flos": 17931886260480.0, "grad_norm": 2.160723316992149, "language_loss": 0.77906388, "learning_rate": 2.439018845165806e-06, "loss": 0.80033839, "num_input_tokens_seen": 159249810, "step": 7429, "time_per_iteration": 2.6864819526672363 }, { "auxiliary_loss_clip": 0.01117759, "auxiliary_loss_mlp": 0.01036133, "balance_loss_clip": 1.04573584, "balance_loss_mlp": 1.02222157, "epoch": 0.44671576732301216, "flos": 21107430915840.0, "grad_norm": 1.6783165407459442, "language_loss": 0.91421354, "learning_rate": 2.438638873985366e-06, "loss": 0.93575251, "num_input_tokens_seen": 159271715, "step": 7430, "time_per_iteration": 2.6472880840301514 }, { "auxiliary_loss_clip": 0.01105427, "auxiliary_loss_mlp": 0.00772764, "balance_loss_clip": 1.04418826, "balance_loss_mlp": 1.000386, "epoch": 0.4467758905756801, "flos": 23508920459520.0, "grad_norm": 1.918378394995702, "language_loss": 0.79452366, "learning_rate": 2.4382588861716954e-06, "loss": 0.8133055, "num_input_tokens_seen": 159290690, "step": 7431, "time_per_iteration": 2.7096598148345947 }, { "auxiliary_loss_clip": 0.01108777, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.04568875, "balance_loss_mlp": 1.02245188, "epoch": 0.4468360138283481, "flos": 18734022829440.0, "grad_norm": 1.6794320575098944, "language_loss": 0.79817986, "learning_rate": 2.437878881739204e-06, "loss": 0.81963724, "num_input_tokens_seen": 159309400, "step": 7432, "time_per_iteration": 2.676522970199585 }, { "auxiliary_loss_clip": 0.01094927, "auxiliary_loss_mlp": 0.01040483, "balance_loss_clip": 1.04654121, "balance_loss_mlp": 1.02803755, "epoch": 0.4468961370810161, "flos": 23477139901440.0, "grad_norm": 1.8261946877850768, "language_loss": 0.76878047, "learning_rate": 2.437498860702301e-06, "loss": 0.79013455, "num_input_tokens_seen": 159327425, "step": 7433, "time_per_iteration": 2.6820082664489746 }, { "auxiliary_loss_clip": 0.01106089, "auxiliary_loss_mlp": 0.01034932, "balance_loss_clip": 1.04236984, "balance_loss_mlp": 1.02372587, "epoch": 0.4469562603336841, "flos": 30075042539520.0, "grad_norm": 1.6244691365264956, "language_loss": 0.77377415, "learning_rate": 2.437118823075398e-06, "loss": 0.79518431, "num_input_tokens_seen": 159345805, "step": 7434, "time_per_iteration": 2.7471024990081787 }, { "auxiliary_loss_clip": 0.01118898, "auxiliary_loss_mlp": 0.01031979, "balance_loss_clip": 1.04707336, "balance_loss_mlp": 1.01909828, "epoch": 0.44701638358635204, "flos": 22456415116800.0, "grad_norm": 1.6740796261727897, "language_loss": 0.64705265, "learning_rate": 2.436738768872905e-06, "loss": 0.6685614, "num_input_tokens_seen": 159364595, "step": 7435, "time_per_iteration": 2.649425983428955 }, { "auxiliary_loss_clip": 0.01112389, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.04875195, "balance_loss_mlp": 1.01587653, "epoch": 0.44707650683902, "flos": 24057851080320.0, "grad_norm": 1.6005542791240868, "language_loss": 0.83477545, "learning_rate": 2.4363586981092346e-06, "loss": 0.85619318, "num_input_tokens_seen": 159385265, "step": 7436, "time_per_iteration": 2.6727020740509033 }, { "auxiliary_loss_clip": 0.01073439, "auxiliary_loss_mlp": 0.01045352, "balance_loss_clip": 1.0402267, "balance_loss_mlp": 1.02884197, "epoch": 0.44713663009168797, "flos": 23766938830080.0, "grad_norm": 2.1717582772549995, "language_loss": 0.79815632, "learning_rate": 2.435978610798798e-06, "loss": 0.81934428, "num_input_tokens_seen": 159405080, "step": 7437, "time_per_iteration": 2.7589898109436035 }, { "auxiliary_loss_clip": 0.01079969, "auxiliary_loss_mlp": 0.01037183, "balance_loss_clip": 1.0433023, "balance_loss_mlp": 1.02375364, "epoch": 0.44719675334435594, "flos": 24499265316480.0, "grad_norm": 1.7231807337022225, "language_loss": 0.71860999, "learning_rate": 2.435598506956009e-06, "loss": 0.7397815, "num_input_tokens_seen": 159424595, "step": 7438, "time_per_iteration": 2.794978380203247 }, { "auxiliary_loss_clip": 0.01084835, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.04564655, "balance_loss_mlp": 1.02180314, "epoch": 0.4472568765970239, "flos": 29781759991680.0, "grad_norm": 1.556366888574876, "language_loss": 0.67619812, "learning_rate": 2.4352183865952808e-06, "loss": 0.69740379, "num_input_tokens_seen": 159443865, "step": 7439, "time_per_iteration": 2.9251644611358643 }, { "auxiliary_loss_clip": 0.01102346, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.0403614, "balance_loss_mlp": 1.02436376, "epoch": 0.44731699984969187, "flos": 24643123286400.0, "grad_norm": 1.714649831944237, "language_loss": 0.73915118, "learning_rate": 2.4348382497310285e-06, "loss": 0.760571, "num_input_tokens_seen": 159464525, "step": 7440, "time_per_iteration": 2.773106813430786 }, { "auxiliary_loss_clip": 0.01072825, "auxiliary_loss_mlp": 0.01042282, "balance_loss_clip": 1.03706956, "balance_loss_mlp": 1.02789354, "epoch": 0.44737712310235983, "flos": 29455691304960.0, "grad_norm": 1.740924989183362, "language_loss": 0.74161476, "learning_rate": 2.4344580963776655e-06, "loss": 0.76276582, "num_input_tokens_seen": 159486385, "step": 7441, "time_per_iteration": 2.9042701721191406 }, { "auxiliary_loss_clip": 0.01096694, "auxiliary_loss_mlp": 0.01036467, "balance_loss_clip": 1.04596698, "balance_loss_mlp": 1.0220542, "epoch": 0.4474372463550278, "flos": 24896832024960.0, "grad_norm": 1.9641422641471569, "language_loss": 0.75060695, "learning_rate": 2.4340779265496082e-06, "loss": 0.77193856, "num_input_tokens_seen": 159503880, "step": 7442, "time_per_iteration": 2.776219129562378 }, { "auxiliary_loss_clip": 0.01131095, "auxiliary_loss_mlp": 0.01033925, "balance_loss_clip": 1.04641354, "balance_loss_mlp": 1.01900017, "epoch": 0.44749736960769576, "flos": 33181603125120.0, "grad_norm": 1.741320347682455, "language_loss": 0.74572098, "learning_rate": 2.433697740261273e-06, "loss": 0.76737112, "num_input_tokens_seen": 159522980, "step": 7443, "time_per_iteration": 2.783189058303833 }, { "auxiliary_loss_clip": 0.01099877, "auxiliary_loss_mlp": 0.0103204, "balance_loss_clip": 1.03843653, "balance_loss_mlp": 1.01699591, "epoch": 0.4475574928603637, "flos": 21071807602560.0, "grad_norm": 1.581803518054495, "language_loss": 0.77928406, "learning_rate": 2.4333175375270748e-06, "loss": 0.80060327, "num_input_tokens_seen": 159543340, "step": 7444, "time_per_iteration": 2.750493049621582 }, { "auxiliary_loss_clip": 0.01108777, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.04501557, "balance_loss_mlp": 1.01988959, "epoch": 0.4476176161130317, "flos": 21862523646720.0, "grad_norm": 2.5006881318170917, "language_loss": 0.85238421, "learning_rate": 2.4329373183614333e-06, "loss": 0.87380457, "num_input_tokens_seen": 159558210, "step": 7445, "time_per_iteration": 2.6802477836608887 }, { "auxiliary_loss_clip": 0.01087309, "auxiliary_loss_mlp": 0.0104165, "balance_loss_clip": 1.04073787, "balance_loss_mlp": 1.02471042, "epoch": 0.4476777393656997, "flos": 22528667324160.0, "grad_norm": 3.110631371373827, "language_loss": 0.63355798, "learning_rate": 2.432557082778765e-06, "loss": 0.65484762, "num_input_tokens_seen": 159577920, "step": 7446, "time_per_iteration": 2.746697187423706 }, { "auxiliary_loss_clip": 0.01039011, "auxiliary_loss_mlp": 0.01002627, "balance_loss_clip": 1.02036047, "balance_loss_mlp": 1.00081527, "epoch": 0.4477378626183677, "flos": 49017133877760.0, "grad_norm": 0.738380684617154, "language_loss": 0.50261772, "learning_rate": 2.4321768307934884e-06, "loss": 0.5230341, "num_input_tokens_seen": 159632295, "step": 7447, "time_per_iteration": 3.0176138877868652 }, { "auxiliary_loss_clip": 0.01047805, "auxiliary_loss_mlp": 0.0099926, "balance_loss_clip": 1.0195471, "balance_loss_mlp": 0.9976145, "epoch": 0.44779798587103564, "flos": 56542179392640.0, "grad_norm": 0.7822716011451579, "language_loss": 0.59427667, "learning_rate": 2.4317965624200235e-06, "loss": 0.61474735, "num_input_tokens_seen": 159698435, "step": 7448, "time_per_iteration": 3.1922085285186768 }, { "auxiliary_loss_clip": 0.01093955, "auxiliary_loss_mlp": 0.01032649, "balance_loss_clip": 1.04417181, "balance_loss_mlp": 1.01983976, "epoch": 0.4478581091237036, "flos": 46498536040320.0, "grad_norm": 1.6983811072489297, "language_loss": 0.58952618, "learning_rate": 2.431416277672789e-06, "loss": 0.61079222, "num_input_tokens_seen": 159722150, "step": 7449, "time_per_iteration": 2.9170258045196533 }, { "auxiliary_loss_clip": 0.01096033, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.04244077, "balance_loss_mlp": 1.01851141, "epoch": 0.4479182323763716, "flos": 20814363849600.0, "grad_norm": 2.0305308033418497, "language_loss": 0.8022064, "learning_rate": 2.4310359765662065e-06, "loss": 0.82348317, "num_input_tokens_seen": 159740550, "step": 7450, "time_per_iteration": 2.640101671218872 }, { "auxiliary_loss_clip": 0.01128944, "auxiliary_loss_mlp": 0.0103919, "balance_loss_clip": 1.04747844, "balance_loss_mlp": 1.02609515, "epoch": 0.44797835562903954, "flos": 14245979212800.0, "grad_norm": 2.0706353062233878, "language_loss": 0.79404807, "learning_rate": 2.430655659114697e-06, "loss": 0.81572944, "num_input_tokens_seen": 159758245, "step": 7451, "time_per_iteration": 2.6094324588775635 }, { "auxiliary_loss_clip": 0.01008441, "auxiliary_loss_mlp": 0.01004662, "balance_loss_clip": 1.02162147, "balance_loss_mlp": 1.00313568, "epoch": 0.4480384788817075, "flos": 63534560169600.0, "grad_norm": 0.8263901394620045, "language_loss": 0.62780499, "learning_rate": 2.430275325332681e-06, "loss": 0.64793605, "num_input_tokens_seen": 159826790, "step": 7452, "time_per_iteration": 3.3816721439361572 }, { "auxiliary_loss_clip": 0.01128154, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.04587567, "balance_loss_mlp": 1.01958907, "epoch": 0.44809860213437547, "flos": 21652626522240.0, "grad_norm": 1.717773614702603, "language_loss": 0.62656605, "learning_rate": 2.429894975234582e-06, "loss": 0.64818835, "num_input_tokens_seen": 159845805, "step": 7453, "time_per_iteration": 2.6495423316955566 }, { "auxiliary_loss_clip": 0.0102644, "auxiliary_loss_mlp": 0.01007957, "balance_loss_clip": 1.01617622, "balance_loss_mlp": 1.00627661, "epoch": 0.44815872538704343, "flos": 69190634246400.0, "grad_norm": 0.7452851567935764, "language_loss": 0.57032764, "learning_rate": 2.4295146088348224e-06, "loss": 0.59067166, "num_input_tokens_seen": 159898860, "step": 7454, "time_per_iteration": 3.0483179092407227 }, { "auxiliary_loss_clip": 0.0110232, "auxiliary_loss_mlp": 0.0104097, "balance_loss_clip": 1.04301405, "balance_loss_mlp": 1.02651, "epoch": 0.4482188486397114, "flos": 12598289510400.0, "grad_norm": 2.1814246614415795, "language_loss": 0.75516129, "learning_rate": 2.4291342261478255e-06, "loss": 0.77659416, "num_input_tokens_seen": 159911555, "step": 7455, "time_per_iteration": 2.639425039291382 }, { "auxiliary_loss_clip": 0.01103634, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 1.0440948, "balance_loss_mlp": 1.02343822, "epoch": 0.44827897189237936, "flos": 34058182631040.0, "grad_norm": 1.8295063999245702, "language_loss": 0.75630772, "learning_rate": 2.428753827188016e-06, "loss": 0.7777077, "num_input_tokens_seen": 159931470, "step": 7456, "time_per_iteration": 2.809356451034546 }, { "auxiliary_loss_clip": 0.01130195, "auxiliary_loss_mlp": 0.01036439, "balance_loss_clip": 1.05033028, "balance_loss_mlp": 1.02355289, "epoch": 0.44833909514504733, "flos": 25147416280320.0, "grad_norm": 60.5899352460765, "language_loss": 0.76306677, "learning_rate": 2.428373411969818e-06, "loss": 0.78473306, "num_input_tokens_seen": 159946115, "step": 7457, "time_per_iteration": 2.632532835006714 }, { "auxiliary_loss_clip": 0.01111792, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.04215193, "balance_loss_mlp": 1.01695263, "epoch": 0.4483992183977153, "flos": 16179984224640.0, "grad_norm": 2.8627685619088203, "language_loss": 0.68479908, "learning_rate": 2.4279929805076576e-06, "loss": 0.70623147, "num_input_tokens_seen": 159963915, "step": 7458, "time_per_iteration": 2.6376359462738037 }, { "auxiliary_loss_clip": 0.01091284, "auxiliary_loss_mlp": 0.01033162, "balance_loss_clip": 1.04267764, "balance_loss_mlp": 1.018332, "epoch": 0.44845934165038326, "flos": 17746048270080.0, "grad_norm": 1.5800915665139277, "language_loss": 0.71851492, "learning_rate": 2.427612532815961e-06, "loss": 0.73975933, "num_input_tokens_seen": 159982140, "step": 7459, "time_per_iteration": 2.713164806365967 }, { "auxiliary_loss_clip": 0.01108578, "auxiliary_loss_mlp": 0.01036526, "balance_loss_clip": 1.04210949, "balance_loss_mlp": 1.02282834, "epoch": 0.4485194649030513, "flos": 21835914647040.0, "grad_norm": 1.672173614468041, "language_loss": 0.70216429, "learning_rate": 2.427232068909154e-06, "loss": 0.72361535, "num_input_tokens_seen": 160002280, "step": 7460, "time_per_iteration": 2.6243271827697754 }, { "auxiliary_loss_clip": 0.01129261, "auxiliary_loss_mlp": 0.01038736, "balance_loss_clip": 1.04698896, "balance_loss_mlp": 1.02463329, "epoch": 0.44857958815571924, "flos": 20084515401600.0, "grad_norm": 1.9532472719910148, "language_loss": 0.77566743, "learning_rate": 2.4268515888016635e-06, "loss": 0.79734743, "num_input_tokens_seen": 160020260, "step": 7461, "time_per_iteration": 4.114460468292236 }, { "auxiliary_loss_clip": 0.01128704, "auxiliary_loss_mlp": 0.01034261, "balance_loss_clip": 1.0455538, "balance_loss_mlp": 1.02091575, "epoch": 0.4486397114083872, "flos": 27053519402880.0, "grad_norm": 1.943200777150693, "language_loss": 0.67738903, "learning_rate": 2.4264710925079184e-06, "loss": 0.69901872, "num_input_tokens_seen": 160040240, "step": 7462, "time_per_iteration": 5.671550035476685 }, { "auxiliary_loss_clip": 0.01046056, "auxiliary_loss_mlp": 0.01002183, "balance_loss_clip": 1.0179913, "balance_loss_mlp": 1.0006094, "epoch": 0.4486998346610552, "flos": 67321195931520.0, "grad_norm": 0.7528637907126196, "language_loss": 0.5449208, "learning_rate": 2.4260905800423462e-06, "loss": 0.5654031, "num_input_tokens_seen": 160093865, "step": 7463, "time_per_iteration": 3.132819890975952 }, { "auxiliary_loss_clip": 0.01117188, "auxiliary_loss_mlp": 0.01031184, "balance_loss_clip": 1.04449058, "balance_loss_mlp": 1.01758814, "epoch": 0.44875995791372314, "flos": 27636816360960.0, "grad_norm": 2.3886431821168954, "language_loss": 0.7580359, "learning_rate": 2.4257100514193775e-06, "loss": 0.77951968, "num_input_tokens_seen": 160113590, "step": 7464, "time_per_iteration": 2.7005674839019775 }, { "auxiliary_loss_clip": 0.01116572, "auxiliary_loss_mlp": 0.01037604, "balance_loss_clip": 1.04709184, "balance_loss_mlp": 1.02484834, "epoch": 0.4488200811663911, "flos": 13005947940480.0, "grad_norm": 1.7787597626645963, "language_loss": 0.74147099, "learning_rate": 2.425329506653441e-06, "loss": 0.76301277, "num_input_tokens_seen": 160131795, "step": 7465, "time_per_iteration": 4.423643112182617 }, { "auxiliary_loss_clip": 0.01110783, "auxiliary_loss_mlp": 0.01040781, "balance_loss_clip": 1.04708648, "balance_loss_mlp": 1.02503395, "epoch": 0.44888020441905907, "flos": 27489977562240.0, "grad_norm": 2.0439366025173347, "language_loss": 0.7991035, "learning_rate": 2.424948945758966e-06, "loss": 0.82061917, "num_input_tokens_seen": 160150635, "step": 7466, "time_per_iteration": 2.7003092765808105 }, { "auxiliary_loss_clip": 0.01110719, "auxiliary_loss_mlp": 0.01035258, "balance_loss_clip": 1.04898739, "balance_loss_mlp": 1.02141774, "epoch": 0.44894032767172704, "flos": 18259678800000.0, "grad_norm": 2.4307522297147357, "language_loss": 0.81000906, "learning_rate": 2.4245683687503844e-06, "loss": 0.83146888, "num_input_tokens_seen": 160168615, "step": 7467, "time_per_iteration": 2.6656453609466553 }, { "auxiliary_loss_clip": 0.01074952, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.04580259, "balance_loss_mlp": 1.01924217, "epoch": 0.449000450924395, "flos": 21579835610880.0, "grad_norm": 2.1126461235100726, "language_loss": 0.74707794, "learning_rate": 2.424187775642129e-06, "loss": 0.76815045, "num_input_tokens_seen": 160187295, "step": 7468, "time_per_iteration": 2.7112534046173096 }, { "auxiliary_loss_clip": 0.01097239, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.04224133, "balance_loss_mlp": 1.01881611, "epoch": 0.44906057417706297, "flos": 17967904623360.0, "grad_norm": 1.845085412210932, "language_loss": 0.71481991, "learning_rate": 2.4238071664486297e-06, "loss": 0.7361052, "num_input_tokens_seen": 160205115, "step": 7469, "time_per_iteration": 2.680678606033325 }, { "auxiliary_loss_clip": 0.01115577, "auxiliary_loss_mlp": 0.01040939, "balance_loss_clip": 1.04739857, "balance_loss_mlp": 1.02700388, "epoch": 0.44912069742973093, "flos": 20047347803520.0, "grad_norm": 1.9353970520381958, "language_loss": 0.71990728, "learning_rate": 2.4234265411843203e-06, "loss": 0.74147248, "num_input_tokens_seen": 160222580, "step": 7470, "time_per_iteration": 2.6266865730285645 }, { "auxiliary_loss_clip": 0.01085169, "auxiliary_loss_mlp": 0.01037894, "balance_loss_clip": 1.04166925, "balance_loss_mlp": 1.02263546, "epoch": 0.4491808206823989, "flos": 21033526682880.0, "grad_norm": 1.7352200929350259, "language_loss": 0.76839507, "learning_rate": 2.423045899863634e-06, "loss": 0.78962576, "num_input_tokens_seen": 160241520, "step": 7471, "time_per_iteration": 2.692333698272705 }, { "auxiliary_loss_clip": 0.0112922, "auxiliary_loss_mlp": 0.010358, "balance_loss_clip": 1.04736388, "balance_loss_mlp": 1.02259803, "epoch": 0.44924094393506686, "flos": 22967136645120.0, "grad_norm": 1.6949435247941296, "language_loss": 0.70284784, "learning_rate": 2.4226652425010048e-06, "loss": 0.72449803, "num_input_tokens_seen": 160261815, "step": 7472, "time_per_iteration": 2.714059829711914 }, { "auxiliary_loss_clip": 0.01033495, "auxiliary_loss_mlp": 0.01004013, "balance_loss_clip": 1.01477528, "balance_loss_mlp": 1.00226104, "epoch": 0.4493010671877349, "flos": 59233467864960.0, "grad_norm": 0.7390973196636706, "language_loss": 0.6168009, "learning_rate": 2.4222845691108676e-06, "loss": 0.63717604, "num_input_tokens_seen": 160317070, "step": 7473, "time_per_iteration": 3.1489851474761963 }, { "auxiliary_loss_clip": 0.01131224, "auxiliary_loss_mlp": 0.00771593, "balance_loss_clip": 1.04812014, "balance_loss_mlp": 1.0004611, "epoch": 0.44936119044040285, "flos": 18004892653440.0, "grad_norm": 2.3114379148666817, "language_loss": 0.78279471, "learning_rate": 2.421903879707657e-06, "loss": 0.80182284, "num_input_tokens_seen": 160334980, "step": 7474, "time_per_iteration": 2.5561118125915527 }, { "auxiliary_loss_clip": 0.01074804, "auxiliary_loss_mlp": 0.01040047, "balance_loss_clip": 1.03983307, "balance_loss_mlp": 1.0254494, "epoch": 0.4494213136930708, "flos": 21251827589760.0, "grad_norm": 1.6204554836894525, "language_loss": 0.72024751, "learning_rate": 2.4215231743058086e-06, "loss": 0.74139607, "num_input_tokens_seen": 160354500, "step": 7475, "time_per_iteration": 2.7745461463928223 }, { "auxiliary_loss_clip": 0.01080301, "auxiliary_loss_mlp": 0.01041054, "balance_loss_clip": 1.04167461, "balance_loss_mlp": 1.02563405, "epoch": 0.4494814369457388, "flos": 27418695022080.0, "grad_norm": 2.241823557245511, "language_loss": 0.76592773, "learning_rate": 2.4211424529197594e-06, "loss": 0.78714132, "num_input_tokens_seen": 160373650, "step": 7476, "time_per_iteration": 2.7856860160827637 }, { "auxiliary_loss_clip": 0.01122132, "auxiliary_loss_mlp": 0.00773102, "balance_loss_clip": 1.04493368, "balance_loss_mlp": 1.00047529, "epoch": 0.44954156019840674, "flos": 22854053652480.0, "grad_norm": 4.385259299883037, "language_loss": 0.72134888, "learning_rate": 2.4207617155639464e-06, "loss": 0.74030131, "num_input_tokens_seen": 160393430, "step": 7477, "time_per_iteration": 2.641645669937134 }, { "auxiliary_loss_clip": 0.01103781, "auxiliary_loss_mlp": 0.01047956, "balance_loss_clip": 1.04083133, "balance_loss_mlp": 1.03148091, "epoch": 0.4496016834510747, "flos": 17201570935680.0, "grad_norm": 2.795464855062127, "language_loss": 0.67799896, "learning_rate": 2.4203809622528062e-06, "loss": 0.69951636, "num_input_tokens_seen": 160410545, "step": 7478, "time_per_iteration": 2.6307947635650635 }, { "auxiliary_loss_clip": 0.01102543, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.04405093, "balance_loss_mlp": 1.02537441, "epoch": 0.4496618067037427, "flos": 18916628595840.0, "grad_norm": 1.8532543047361745, "language_loss": 0.89243561, "learning_rate": 2.420000193000779e-06, "loss": 0.91385025, "num_input_tokens_seen": 160428105, "step": 7479, "time_per_iteration": 2.733828544616699 }, { "auxiliary_loss_clip": 0.01068922, "auxiliary_loss_mlp": 0.01043272, "balance_loss_clip": 1.04273605, "balance_loss_mlp": 1.02804279, "epoch": 0.44972192995641064, "flos": 21031659175680.0, "grad_norm": 2.916606412127397, "language_loss": 0.75539804, "learning_rate": 2.419619407822302e-06, "loss": 0.77652001, "num_input_tokens_seen": 160448815, "step": 7480, "time_per_iteration": 2.8518130779266357 }, { "auxiliary_loss_clip": 0.01095249, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.04253781, "balance_loss_mlp": 1.02012968, "epoch": 0.4497820532090786, "flos": 20777088510720.0, "grad_norm": 1.9829776726262367, "language_loss": 0.79885375, "learning_rate": 2.419238606731815e-06, "loss": 0.82015675, "num_input_tokens_seen": 160465940, "step": 7481, "time_per_iteration": 2.7299835681915283 }, { "auxiliary_loss_clip": 0.01102494, "auxiliary_loss_mlp": 0.01039566, "balance_loss_clip": 1.04328001, "balance_loss_mlp": 1.02454567, "epoch": 0.44984217646174657, "flos": 33802606385280.0, "grad_norm": 1.6381608125682177, "language_loss": 0.68340528, "learning_rate": 2.418857789743758e-06, "loss": 0.70482588, "num_input_tokens_seen": 160486710, "step": 7482, "time_per_iteration": 2.8123154640197754 }, { "auxiliary_loss_clip": 0.01122196, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.04835725, "balance_loss_mlp": 1.02638626, "epoch": 0.44990229971441453, "flos": 15518365660800.0, "grad_norm": 2.0379383366397232, "language_loss": 0.84707004, "learning_rate": 2.418476956872571e-06, "loss": 0.86869979, "num_input_tokens_seen": 160503405, "step": 7483, "time_per_iteration": 2.718548536300659 }, { "auxiliary_loss_clip": 0.01099077, "auxiliary_loss_mlp": 0.01046214, "balance_loss_clip": 1.04296637, "balance_loss_mlp": 1.03027594, "epoch": 0.4499624229670825, "flos": 29861913191040.0, "grad_norm": 1.8017494037756971, "language_loss": 0.80644262, "learning_rate": 2.4180961081326967e-06, "loss": 0.82789552, "num_input_tokens_seen": 160525080, "step": 7484, "time_per_iteration": 2.8435990810394287 }, { "auxiliary_loss_clip": 0.01075163, "auxiliary_loss_mlp": 0.01037509, "balance_loss_clip": 1.03809166, "balance_loss_mlp": 1.02145171, "epoch": 0.45002254621975046, "flos": 18513674847360.0, "grad_norm": 2.526248303429359, "language_loss": 0.75311351, "learning_rate": 2.4177152435385754e-06, "loss": 0.77424026, "num_input_tokens_seen": 160540895, "step": 7485, "time_per_iteration": 2.7453646659851074 }, { "auxiliary_loss_clip": 0.01027401, "auxiliary_loss_mlp": 0.0100295, "balance_loss_clip": 1.01817155, "balance_loss_mlp": 1.00125754, "epoch": 0.4500826694724185, "flos": 70420394229120.0, "grad_norm": 0.7859680562883086, "language_loss": 0.58644986, "learning_rate": 2.4173343631046504e-06, "loss": 0.60675335, "num_input_tokens_seen": 160598270, "step": 7486, "time_per_iteration": 3.2535924911499023 }, { "auxiliary_loss_clip": 0.0111614, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.04657292, "balance_loss_mlp": 1.02917325, "epoch": 0.45014279272508645, "flos": 15778897983360.0, "grad_norm": 2.484631064514228, "language_loss": 0.83677804, "learning_rate": 2.4169534668453654e-06, "loss": 0.85839128, "num_input_tokens_seen": 160614720, "step": 7487, "time_per_iteration": 2.7236413955688477 }, { "auxiliary_loss_clip": 0.01128709, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.04632056, "balance_loss_mlp": 1.02443182, "epoch": 0.4502029159777544, "flos": 21799573061760.0, "grad_norm": 1.5508029399024128, "language_loss": 0.77568138, "learning_rate": 2.4165725547751622e-06, "loss": 0.79735959, "num_input_tokens_seen": 160635170, "step": 7488, "time_per_iteration": 2.6660585403442383 }, { "auxiliary_loss_clip": 0.0112874, "auxiliary_loss_mlp": 0.01045145, "balance_loss_clip": 1.04882014, "balance_loss_mlp": 1.02954042, "epoch": 0.4502630392304224, "flos": 28767966531840.0, "grad_norm": 1.97851616048007, "language_loss": 0.72073781, "learning_rate": 2.4161916269084858e-06, "loss": 0.74247664, "num_input_tokens_seen": 160654490, "step": 7489, "time_per_iteration": 2.7274820804595947 }, { "auxiliary_loss_clip": 0.01109274, "auxiliary_loss_mlp": 0.01039798, "balance_loss_clip": 1.04584038, "balance_loss_mlp": 1.02314413, "epoch": 0.45032316248309034, "flos": 15844182952320.0, "grad_norm": 2.9737823054207926, "language_loss": 0.6968661, "learning_rate": 2.4158106832597817e-06, "loss": 0.71835679, "num_input_tokens_seen": 160669400, "step": 7490, "time_per_iteration": 2.650700569152832 }, { "auxiliary_loss_clip": 0.01026171, "auxiliary_loss_mlp": 0.01004705, "balance_loss_clip": 1.0231657, "balance_loss_mlp": 1.00323248, "epoch": 0.4503832857357583, "flos": 57853600945920.0, "grad_norm": 0.7292674820176653, "language_loss": 0.56675166, "learning_rate": 2.415429723843495e-06, "loss": 0.58706039, "num_input_tokens_seen": 160733820, "step": 7491, "time_per_iteration": 3.1893656253814697 }, { "auxiliary_loss_clip": 0.01116518, "auxiliary_loss_mlp": 0.01037403, "balance_loss_clip": 1.04746497, "balance_loss_mlp": 1.02327061, "epoch": 0.4504434089884263, "flos": 23878082488320.0, "grad_norm": 1.6154687272881363, "language_loss": 0.7939685, "learning_rate": 2.4150487486740713e-06, "loss": 0.81550771, "num_input_tokens_seen": 160753175, "step": 7492, "time_per_iteration": 2.7314138412475586 }, { "auxiliary_loss_clip": 0.010986, "auxiliary_loss_mlp": 0.00775969, "balance_loss_clip": 1.04425228, "balance_loss_mlp": 1.000494, "epoch": 0.45050353224109424, "flos": 17785083375360.0, "grad_norm": 2.875303360797025, "language_loss": 0.92825645, "learning_rate": 2.4146677577659573e-06, "loss": 0.94700211, "num_input_tokens_seen": 160768310, "step": 7493, "time_per_iteration": 2.7123935222625732 }, { "auxiliary_loss_clip": 0.01039208, "auxiliary_loss_mlp": 0.01001589, "balance_loss_clip": 1.02041435, "balance_loss_mlp": 0.99994355, "epoch": 0.4505636554937622, "flos": 65063420703360.0, "grad_norm": 0.8110713299155351, "language_loss": 0.62929082, "learning_rate": 2.4142867511336e-06, "loss": 0.64969873, "num_input_tokens_seen": 160827370, "step": 7494, "time_per_iteration": 3.289635181427002 }, { "auxiliary_loss_clip": 0.01129658, "auxiliary_loss_mlp": 0.01035034, "balance_loss_clip": 1.04754305, "balance_loss_mlp": 1.02150989, "epoch": 0.45062377874643017, "flos": 22200084685440.0, "grad_norm": 1.7474777674384385, "language_loss": 0.82263976, "learning_rate": 2.4139057287914484e-06, "loss": 0.84428668, "num_input_tokens_seen": 160849140, "step": 7495, "time_per_iteration": 2.659642219543457 }, { "auxiliary_loss_clip": 0.01115544, "auxiliary_loss_mlp": 0.01041634, "balance_loss_clip": 1.04483461, "balance_loss_mlp": 1.02449155, "epoch": 0.45068390199909814, "flos": 37670293186560.0, "grad_norm": 1.8332713503860085, "language_loss": 0.86039978, "learning_rate": 2.41352469075395e-06, "loss": 0.8819716, "num_input_tokens_seen": 160871280, "step": 7496, "time_per_iteration": 2.798741579055786 }, { "auxiliary_loss_clip": 0.01134499, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.04969478, "balance_loss_mlp": 1.02054274, "epoch": 0.4507440252517661, "flos": 22302501338880.0, "grad_norm": 2.0558646291387066, "language_loss": 0.76101983, "learning_rate": 2.4131436370355534e-06, "loss": 0.78272235, "num_input_tokens_seen": 160888625, "step": 7497, "time_per_iteration": 2.6553680896759033 }, { "auxiliary_loss_clip": 0.01098074, "auxiliary_loss_mlp": 0.01037956, "balance_loss_clip": 1.04377723, "balance_loss_mlp": 1.02352023, "epoch": 0.45080414850443407, "flos": 13188374138880.0, "grad_norm": 2.277785969464064, "language_loss": 0.75305939, "learning_rate": 2.4127625676507088e-06, "loss": 0.77441967, "num_input_tokens_seen": 160907040, "step": 7498, "time_per_iteration": 2.6950063705444336 }, { "auxiliary_loss_clip": 0.01133264, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.04848719, "balance_loss_mlp": 1.02897298, "epoch": 0.4508642717571021, "flos": 21944939402880.0, "grad_norm": 3.3346599205762826, "language_loss": 0.70080638, "learning_rate": 2.4123814826138663e-06, "loss": 0.72258794, "num_input_tokens_seen": 160927115, "step": 7499, "time_per_iteration": 2.6134774684906006 }, { "auxiliary_loss_clip": 0.01084574, "auxiliary_loss_mlp": 0.0103806, "balance_loss_clip": 1.04212165, "balance_loss_mlp": 1.02309906, "epoch": 0.45092439500977005, "flos": 23367468700800.0, "grad_norm": 1.9346658302408082, "language_loss": 0.77361268, "learning_rate": 2.412000381939477e-06, "loss": 0.79483902, "num_input_tokens_seen": 160944405, "step": 7500, "time_per_iteration": 4.306777000427246 }, { "auxiliary_loss_clip": 0.01084228, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.04249573, "balance_loss_mlp": 1.02007651, "epoch": 0.450984518262438, "flos": 20772958446720.0, "grad_norm": 1.9176241989159464, "language_loss": 0.63056326, "learning_rate": 2.411619265641992e-06, "loss": 0.65175211, "num_input_tokens_seen": 160961345, "step": 7501, "time_per_iteration": 5.803133487701416 }, { "auxiliary_loss_clip": 0.01135547, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.04915273, "balance_loss_mlp": 1.02445376, "epoch": 0.451044641515106, "flos": 17707372300800.0, "grad_norm": 1.9532762899000093, "language_loss": 0.84446234, "learning_rate": 2.411238133735863e-06, "loss": 0.86621827, "num_input_tokens_seen": 160977330, "step": 7502, "time_per_iteration": 2.604753017425537 }, { "auxiliary_loss_clip": 0.01105383, "auxiliary_loss_mlp": 0.01036548, "balance_loss_clip": 1.04670203, "balance_loss_mlp": 1.02238584, "epoch": 0.45110476476777395, "flos": 20594698225920.0, "grad_norm": 1.3813112457968315, "language_loss": 0.79642487, "learning_rate": 2.4108569862355418e-06, "loss": 0.81784415, "num_input_tokens_seen": 160997280, "step": 7503, "time_per_iteration": 2.666677236557007 }, { "auxiliary_loss_clip": 0.01104325, "auxiliary_loss_mlp": 0.01036781, "balance_loss_clip": 1.04764807, "balance_loss_mlp": 1.02240419, "epoch": 0.4511648880204419, "flos": 16034043265920.0, "grad_norm": 2.051596804130354, "language_loss": 0.81191939, "learning_rate": 2.410475823155484e-06, "loss": 0.83333045, "num_input_tokens_seen": 161014235, "step": 7504, "time_per_iteration": 4.276456117630005 }, { "auxiliary_loss_clip": 0.01087433, "auxiliary_loss_mlp": 0.01038305, "balance_loss_clip": 1.04069161, "balance_loss_mlp": 1.02469158, "epoch": 0.4512250112731099, "flos": 23978811202560.0, "grad_norm": 1.5834485358881918, "language_loss": 0.63315797, "learning_rate": 2.4100946445101405e-06, "loss": 0.65441537, "num_input_tokens_seen": 161032360, "step": 7505, "time_per_iteration": 2.947556734085083 }, { "auxiliary_loss_clip": 0.01014942, "auxiliary_loss_mlp": 0.01003244, "balance_loss_clip": 1.02198029, "balance_loss_mlp": 1.00188541, "epoch": 0.45128513452577784, "flos": 71462308037760.0, "grad_norm": 0.8317919198459461, "language_loss": 0.58857071, "learning_rate": 2.409713450313968e-06, "loss": 0.60875255, "num_input_tokens_seen": 161091360, "step": 7506, "time_per_iteration": 3.395158052444458 }, { "auxiliary_loss_clip": 0.01075605, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.04096067, "balance_loss_mlp": 1.02287173, "epoch": 0.4513452577784458, "flos": 22090844448000.0, "grad_norm": 1.7149339287343461, "language_loss": 0.79334831, "learning_rate": 2.40933224058142e-06, "loss": 0.81447887, "num_input_tokens_seen": 161110825, "step": 7507, "time_per_iteration": 2.8281381130218506 }, { "auxiliary_loss_clip": 0.01091142, "auxiliary_loss_mlp": 0.01036706, "balance_loss_clip": 1.0425905, "balance_loss_mlp": 1.02066064, "epoch": 0.4514053810311138, "flos": 24276403382400.0, "grad_norm": 1.5823194059388275, "language_loss": 0.73703611, "learning_rate": 2.4089510153269526e-06, "loss": 0.75831455, "num_input_tokens_seen": 161130685, "step": 7508, "time_per_iteration": 2.75742506980896 }, { "auxiliary_loss_clip": 0.01118642, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.04927611, "balance_loss_mlp": 1.02279091, "epoch": 0.45146550428378174, "flos": 17886781756800.0, "grad_norm": 2.075832981658432, "language_loss": 0.79118419, "learning_rate": 2.4085697745650217e-06, "loss": 0.81273252, "num_input_tokens_seen": 161147555, "step": 7509, "time_per_iteration": 2.6641790866851807 }, { "auxiliary_loss_clip": 0.01130929, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.05022097, "balance_loss_mlp": 1.02104306, "epoch": 0.4515256275364497, "flos": 24243437675520.0, "grad_norm": 1.9616298828862797, "language_loss": 0.73389792, "learning_rate": 2.4081885183100837e-06, "loss": 0.75555289, "num_input_tokens_seen": 161166255, "step": 7510, "time_per_iteration": 2.754516839981079 }, { "auxiliary_loss_clip": 0.01129503, "auxiliary_loss_mlp": 0.01032701, "balance_loss_clip": 1.04575419, "balance_loss_mlp": 1.01789534, "epoch": 0.45158575078911767, "flos": 20631039811200.0, "grad_norm": 1.8899584921112549, "language_loss": 0.77046561, "learning_rate": 2.4078072465765964e-06, "loss": 0.79208767, "num_input_tokens_seen": 161184720, "step": 7511, "time_per_iteration": 2.633896589279175 }, { "auxiliary_loss_clip": 0.01119455, "auxiliary_loss_mlp": 0.01033368, "balance_loss_clip": 1.04665303, "balance_loss_mlp": 1.01832986, "epoch": 0.45164587404178563, "flos": 23327751237120.0, "grad_norm": 1.8239087865443961, "language_loss": 0.78791374, "learning_rate": 2.4074259593790174e-06, "loss": 0.80944192, "num_input_tokens_seen": 161204360, "step": 7512, "time_per_iteration": 2.701643466949463 }, { "auxiliary_loss_clip": 0.01094327, "auxiliary_loss_mlp": 0.01039327, "balance_loss_clip": 1.04103267, "balance_loss_mlp": 1.02404392, "epoch": 0.45170599729445365, "flos": 23805973935360.0, "grad_norm": 2.0955290596831713, "language_loss": 0.87512183, "learning_rate": 2.4070446567318053e-06, "loss": 0.89645839, "num_input_tokens_seen": 161223575, "step": 7513, "time_per_iteration": 2.716236114501953 }, { "auxiliary_loss_clip": 0.01110578, "auxiliary_loss_mlp": 0.0103311, "balance_loss_clip": 1.0445292, "balance_loss_mlp": 1.02031827, "epoch": 0.4517661205471216, "flos": 23512942782720.0, "grad_norm": 2.109318524386585, "language_loss": 0.6707387, "learning_rate": 2.406663338649419e-06, "loss": 0.69217563, "num_input_tokens_seen": 161243805, "step": 7514, "time_per_iteration": 2.665377140045166 }, { "auxiliary_loss_clip": 0.01113013, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.04554498, "balance_loss_mlp": 1.01995873, "epoch": 0.4518262437997896, "flos": 23513948363520.0, "grad_norm": 2.2260653694398242, "language_loss": 0.69152886, "learning_rate": 2.406282005146318e-06, "loss": 0.71301687, "num_input_tokens_seen": 161261450, "step": 7515, "time_per_iteration": 2.6233787536621094 }, { "auxiliary_loss_clip": 0.01114597, "auxiliary_loss_mlp": 0.01038013, "balance_loss_clip": 1.04228842, "balance_loss_mlp": 1.02269435, "epoch": 0.45188636705245755, "flos": 14568061489920.0, "grad_norm": 6.104635540487547, "language_loss": 0.82568568, "learning_rate": 2.405900656236963e-06, "loss": 0.84721178, "num_input_tokens_seen": 161276965, "step": 7516, "time_per_iteration": 2.7125158309936523 }, { "auxiliary_loss_clip": 0.0112394, "auxiliary_loss_mlp": 0.0103396, "balance_loss_clip": 1.04487455, "balance_loss_mlp": 1.02003694, "epoch": 0.4519464903051255, "flos": 19901550499200.0, "grad_norm": 1.657947130481532, "language_loss": 0.65597039, "learning_rate": 2.4055192919358137e-06, "loss": 0.67754936, "num_input_tokens_seen": 161295375, "step": 7517, "time_per_iteration": 2.6732585430145264 }, { "auxiliary_loss_clip": 0.01091101, "auxiliary_loss_mlp": 0.01032878, "balance_loss_clip": 1.04268789, "balance_loss_mlp": 1.02015853, "epoch": 0.4520066135577935, "flos": 18844376388480.0, "grad_norm": 2.0502430920821904, "language_loss": 0.63127112, "learning_rate": 2.405137912257333e-06, "loss": 0.65251088, "num_input_tokens_seen": 161313010, "step": 7518, "time_per_iteration": 2.6873538494110107 }, { "auxiliary_loss_clip": 0.01116444, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.0465678, "balance_loss_mlp": 1.02015519, "epoch": 0.45206673681046144, "flos": 48214419713280.0, "grad_norm": 1.68859992173611, "language_loss": 0.59658802, "learning_rate": 2.404756517215982e-06, "loss": 0.61809057, "num_input_tokens_seen": 161336690, "step": 7519, "time_per_iteration": 2.8561198711395264 }, { "auxiliary_loss_clip": 0.01116298, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.0457139, "balance_loss_mlp": 1.02468395, "epoch": 0.4521268600631294, "flos": 23842171866240.0, "grad_norm": 1.5141513880128057, "language_loss": 0.72439361, "learning_rate": 2.404375106826223e-06, "loss": 0.74594009, "num_input_tokens_seen": 161357845, "step": 7520, "time_per_iteration": 2.709179162979126 }, { "auxiliary_loss_clip": 0.0110396, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.04404962, "balance_loss_mlp": 1.02297747, "epoch": 0.4521869833157974, "flos": 18843622202880.0, "grad_norm": 2.131399149186965, "language_loss": 0.75379634, "learning_rate": 2.4039936811025194e-06, "loss": 0.77519679, "num_input_tokens_seen": 161375160, "step": 7521, "time_per_iteration": 2.78236722946167 }, { "auxiliary_loss_clip": 0.01109339, "auxiliary_loss_mlp": 0.01039668, "balance_loss_clip": 1.04502964, "balance_loss_mlp": 1.02507663, "epoch": 0.45224710656846534, "flos": 19788072456960.0, "grad_norm": 2.2802922264962247, "language_loss": 0.68217206, "learning_rate": 2.4036122400593343e-06, "loss": 0.70366216, "num_input_tokens_seen": 161393690, "step": 7522, "time_per_iteration": 2.698141574859619 }, { "auxiliary_loss_clip": 0.01111702, "auxiliary_loss_mlp": 0.01036701, "balance_loss_clip": 1.04239058, "balance_loss_mlp": 1.02306962, "epoch": 0.4523072298211333, "flos": 28256131681920.0, "grad_norm": 1.6149288487041198, "language_loss": 0.6114409, "learning_rate": 2.403230783711134e-06, "loss": 0.63292497, "num_input_tokens_seen": 161415015, "step": 7523, "time_per_iteration": 2.765838623046875 }, { "auxiliary_loss_clip": 0.01122412, "auxiliary_loss_mlp": 0.01039402, "balance_loss_clip": 1.04672575, "balance_loss_mlp": 1.02425027, "epoch": 0.45236735307380127, "flos": 11181039511680.0, "grad_norm": 2.0249866031396837, "language_loss": 0.78044772, "learning_rate": 2.4028493120723813e-06, "loss": 0.80206585, "num_input_tokens_seen": 161432940, "step": 7524, "time_per_iteration": 2.6178715229034424 }, { "auxiliary_loss_clip": 0.01083067, "auxiliary_loss_mlp": 0.0103962, "balance_loss_clip": 1.04386139, "balance_loss_mlp": 1.02560115, "epoch": 0.45242747632646924, "flos": 22601386408320.0, "grad_norm": 2.4629173570449447, "language_loss": 0.63756073, "learning_rate": 2.4024678251575417e-06, "loss": 0.65878761, "num_input_tokens_seen": 161452215, "step": 7525, "time_per_iteration": 2.767791509628296 }, { "auxiliary_loss_clip": 0.01116902, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.04607654, "balance_loss_mlp": 1.02390599, "epoch": 0.45248759957913726, "flos": 18256267008000.0, "grad_norm": 1.8561008840058875, "language_loss": 0.78973663, "learning_rate": 2.402086322981083e-06, "loss": 0.81127673, "num_input_tokens_seen": 161469520, "step": 7526, "time_per_iteration": 2.6315999031066895 }, { "auxiliary_loss_clip": 0.01098614, "auxiliary_loss_mlp": 0.01030271, "balance_loss_clip": 1.04242575, "balance_loss_mlp": 1.01696694, "epoch": 0.4525477228318052, "flos": 22450094323200.0, "grad_norm": 1.8159616365895555, "language_loss": 0.80961096, "learning_rate": 2.40170480555747e-06, "loss": 0.83089983, "num_input_tokens_seen": 161487335, "step": 7527, "time_per_iteration": 2.6868715286254883 }, { "auxiliary_loss_clip": 0.01092415, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.04517341, "balance_loss_mlp": 1.01763892, "epoch": 0.4526078460844732, "flos": 29644869260160.0, "grad_norm": 11.448753069744305, "language_loss": 0.6562798, "learning_rate": 2.4013232729011706e-06, "loss": 0.67751861, "num_input_tokens_seen": 161510095, "step": 7528, "time_per_iteration": 2.816391944885254 }, { "auxiliary_loss_clip": 0.01100127, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.04077947, "balance_loss_mlp": 1.02030635, "epoch": 0.45266796933714115, "flos": 23039747988480.0, "grad_norm": 1.584867366654962, "language_loss": 0.75341809, "learning_rate": 2.4009417250266525e-06, "loss": 0.77475703, "num_input_tokens_seen": 161528725, "step": 7529, "time_per_iteration": 2.688854694366455 }, { "auxiliary_loss_clip": 0.01127981, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.04677176, "balance_loss_mlp": 1.02092457, "epoch": 0.4527280925898091, "flos": 14428405411200.0, "grad_norm": 2.148118662824089, "language_loss": 0.73154545, "learning_rate": 2.400560161948384e-06, "loss": 0.75316578, "num_input_tokens_seen": 161547195, "step": 7530, "time_per_iteration": 2.626149892807007 }, { "auxiliary_loss_clip": 0.01097205, "auxiliary_loss_mlp": 0.01036532, "balance_loss_clip": 1.04691768, "balance_loss_mlp": 1.0233357, "epoch": 0.4527882158424771, "flos": 22925515760640.0, "grad_norm": 1.600682021317837, "language_loss": 0.75962186, "learning_rate": 2.400178583680834e-06, "loss": 0.78095925, "num_input_tokens_seen": 161565565, "step": 7531, "time_per_iteration": 2.7901298999786377 }, { "auxiliary_loss_clip": 0.01122835, "auxiliary_loss_mlp": 0.01036019, "balance_loss_clip": 1.04418015, "balance_loss_mlp": 1.02203524, "epoch": 0.45284833909514505, "flos": 25555326105600.0, "grad_norm": 1.5467116056600763, "language_loss": 0.66987002, "learning_rate": 2.3997969902384717e-06, "loss": 0.69145852, "num_input_tokens_seen": 161586630, "step": 7532, "time_per_iteration": 2.693523645401001 }, { "auxiliary_loss_clip": 0.01115241, "auxiliary_loss_mlp": 0.0104024, "balance_loss_clip": 1.04580188, "balance_loss_mlp": 1.02715659, "epoch": 0.452908462347813, "flos": 18150007599360.0, "grad_norm": 3.168484665922808, "language_loss": 0.78721988, "learning_rate": 2.399415381635768e-06, "loss": 0.80877471, "num_input_tokens_seen": 161603815, "step": 7533, "time_per_iteration": 2.6418774127960205 }, { "auxiliary_loss_clip": 0.01101942, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04315686, "balance_loss_mlp": 1.0244813, "epoch": 0.452968585600481, "flos": 19062749122560.0, "grad_norm": 2.220433880382594, "language_loss": 0.83064616, "learning_rate": 2.3990337578871927e-06, "loss": 0.85206366, "num_input_tokens_seen": 161622900, "step": 7534, "time_per_iteration": 2.751016855239868 }, { "auxiliary_loss_clip": 0.01102917, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.04744101, "balance_loss_mlp": 1.02389479, "epoch": 0.45302870885314894, "flos": 22051737515520.0, "grad_norm": 1.8531826529396993, "language_loss": 0.76665461, "learning_rate": 2.3986521190072176e-06, "loss": 0.78806889, "num_input_tokens_seen": 161641700, "step": 7535, "time_per_iteration": 2.6611855030059814 }, { "auxiliary_loss_clip": 0.01083875, "auxiliary_loss_mlp": 0.01036335, "balance_loss_clip": 1.04374576, "balance_loss_mlp": 1.02368724, "epoch": 0.4530888321058169, "flos": 20376217751040.0, "grad_norm": 1.5302063461742579, "language_loss": 0.80437911, "learning_rate": 2.3982704650103138e-06, "loss": 0.82558113, "num_input_tokens_seen": 161661955, "step": 7536, "time_per_iteration": 2.7666051387786865 }, { "auxiliary_loss_clip": 0.01097222, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.04180908, "balance_loss_mlp": 1.02248287, "epoch": 0.4531489553584849, "flos": 14830425406080.0, "grad_norm": 2.016168707097938, "language_loss": 0.76173598, "learning_rate": 2.3978887959109544e-06, "loss": 0.78307086, "num_input_tokens_seen": 161679245, "step": 7537, "time_per_iteration": 2.690034866333008 }, { "auxiliary_loss_clip": 0.01118629, "auxiliary_loss_mlp": 0.01035481, "balance_loss_clip": 1.04544806, "balance_loss_mlp": 1.0222249, "epoch": 0.45320907861115284, "flos": 21944975316480.0, "grad_norm": 1.9502516921913984, "language_loss": 0.75985712, "learning_rate": 2.3975071117236118e-06, "loss": 0.78139818, "num_input_tokens_seen": 161698795, "step": 7538, "time_per_iteration": 2.692582130432129 }, { "auxiliary_loss_clip": 0.01037446, "auxiliary_loss_mlp": 0.01009452, "balance_loss_clip": 1.01847482, "balance_loss_mlp": 1.00774765, "epoch": 0.45326920186382086, "flos": 66251455038720.0, "grad_norm": 0.7823640203744525, "language_loss": 0.62291718, "learning_rate": 2.3971254124627593e-06, "loss": 0.64338624, "num_input_tokens_seen": 161761980, "step": 7539, "time_per_iteration": 6.417045593261719 }, { "auxiliary_loss_clip": 0.01129753, "auxiliary_loss_mlp": 0.01046019, "balance_loss_clip": 1.04852843, "balance_loss_mlp": 1.03270316, "epoch": 0.4533293251164888, "flos": 14684233052160.0, "grad_norm": 1.7334435648675772, "language_loss": 0.65637821, "learning_rate": 2.396743698142872e-06, "loss": 0.67813587, "num_input_tokens_seen": 161779455, "step": 7540, "time_per_iteration": 2.7546002864837646 }, { "auxiliary_loss_clip": 0.01106819, "auxiliary_loss_mlp": 0.01043222, "balance_loss_clip": 1.0439229, "balance_loss_mlp": 1.02768898, "epoch": 0.4533894483691568, "flos": 22601206840320.0, "grad_norm": 2.0843332238803587, "language_loss": 0.84594655, "learning_rate": 2.396361968778424e-06, "loss": 0.86744702, "num_input_tokens_seen": 161798980, "step": 7541, "time_per_iteration": 4.3779473304748535 }, { "auxiliary_loss_clip": 0.01103981, "auxiliary_loss_mlp": 0.01038274, "balance_loss_clip": 1.04346132, "balance_loss_mlp": 1.02451134, "epoch": 0.45344957162182475, "flos": 34751617666560.0, "grad_norm": 1.786741767322354, "language_loss": 0.76398253, "learning_rate": 2.395980224383889e-06, "loss": 0.78540504, "num_input_tokens_seen": 161819745, "step": 7542, "time_per_iteration": 2.8061442375183105 }, { "auxiliary_loss_clip": 0.01100521, "auxiliary_loss_mlp": 0.01030908, "balance_loss_clip": 1.04320002, "balance_loss_mlp": 1.01665092, "epoch": 0.4535096948744927, "flos": 23550218121600.0, "grad_norm": 4.384838077420028, "language_loss": 0.80294377, "learning_rate": 2.395598464973746e-06, "loss": 0.82425809, "num_input_tokens_seen": 161838575, "step": 7543, "time_per_iteration": 4.4142186641693115 }, { "auxiliary_loss_clip": 0.01116855, "auxiliary_loss_mlp": 0.00771625, "balance_loss_clip": 1.04452896, "balance_loss_mlp": 1.00043499, "epoch": 0.4535698181271607, "flos": 25557552748800.0, "grad_norm": 1.7946145717938884, "language_loss": 0.75708425, "learning_rate": 2.395216690562469e-06, "loss": 0.77596909, "num_input_tokens_seen": 161858590, "step": 7544, "time_per_iteration": 2.706681966781616 }, { "auxiliary_loss_clip": 0.01097765, "auxiliary_loss_mlp": 0.01037632, "balance_loss_clip": 1.04519629, "balance_loss_mlp": 1.02378595, "epoch": 0.45362994137982865, "flos": 24864117713280.0, "grad_norm": 1.7108154873098056, "language_loss": 0.75483274, "learning_rate": 2.3948349011645355e-06, "loss": 0.7761867, "num_input_tokens_seen": 161878390, "step": 7545, "time_per_iteration": 2.741312026977539 }, { "auxiliary_loss_clip": 0.01106771, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.04418731, "balance_loss_mlp": 1.02098417, "epoch": 0.4536900646324966, "flos": 30806794408320.0, "grad_norm": 2.2011621045210057, "language_loss": 0.72520149, "learning_rate": 2.394453096794423e-06, "loss": 0.74662089, "num_input_tokens_seen": 161898610, "step": 7546, "time_per_iteration": 2.7891902923583984 }, { "auxiliary_loss_clip": 0.01108307, "auxiliary_loss_mlp": 0.01035115, "balance_loss_clip": 1.04388261, "balance_loss_mlp": 1.02008224, "epoch": 0.4537501878851646, "flos": 23404313076480.0, "grad_norm": 1.593135285125141, "language_loss": 0.75609434, "learning_rate": 2.394071277466609e-06, "loss": 0.77752858, "num_input_tokens_seen": 161918210, "step": 7547, "time_per_iteration": 2.7260210514068604 }, { "auxiliary_loss_clip": 0.01120791, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.04588616, "balance_loss_mlp": 1.01945722, "epoch": 0.45381031113783254, "flos": 18149289327360.0, "grad_norm": 2.150959748604014, "language_loss": 0.70081824, "learning_rate": 2.393689443195573e-06, "loss": 0.72236335, "num_input_tokens_seen": 161936950, "step": 7548, "time_per_iteration": 2.652388095855713 }, { "auxiliary_loss_clip": 0.01129285, "auxiliary_loss_mlp": 0.01039378, "balance_loss_clip": 1.04662538, "balance_loss_mlp": 1.0256331, "epoch": 0.4538704343905005, "flos": 25336666062720.0, "grad_norm": 2.8840782688813293, "language_loss": 0.73135072, "learning_rate": 2.393307593995794e-06, "loss": 0.75303733, "num_input_tokens_seen": 161955550, "step": 7549, "time_per_iteration": 2.8452274799346924 }, { "auxiliary_loss_clip": 0.01091023, "auxiliary_loss_mlp": 0.01028579, "balance_loss_clip": 1.040573, "balance_loss_mlp": 1.01576996, "epoch": 0.4539305576431685, "flos": 28731445378560.0, "grad_norm": 1.9190169905093657, "language_loss": 0.65320408, "learning_rate": 2.392925729881751e-06, "loss": 0.67440009, "num_input_tokens_seen": 161976760, "step": 7550, "time_per_iteration": 2.783653497695923 }, { "auxiliary_loss_clip": 0.01113741, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.05046797, "balance_loss_mlp": 1.02172232, "epoch": 0.45399068089583644, "flos": 22492397566080.0, "grad_norm": 1.6128261499338563, "language_loss": 0.69028163, "learning_rate": 2.3925438508679263e-06, "loss": 0.71176994, "num_input_tokens_seen": 161996120, "step": 7551, "time_per_iteration": 2.6571664810180664 }, { "auxiliary_loss_clip": 0.01115638, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.04326105, "balance_loss_mlp": 1.01979804, "epoch": 0.45405080414850446, "flos": 12893403651840.0, "grad_norm": 1.789312830614556, "language_loss": 0.79496789, "learning_rate": 2.392161956968798e-06, "loss": 0.81646329, "num_input_tokens_seen": 162011125, "step": 7552, "time_per_iteration": 2.6482155323028564 }, { "auxiliary_loss_clip": 0.01042694, "auxiliary_loss_mlp": 0.0100358, "balance_loss_clip": 1.02483499, "balance_loss_mlp": 1.00200677, "epoch": 0.4541109274011724, "flos": 59766919724160.0, "grad_norm": 0.8270469682211425, "language_loss": 0.57826698, "learning_rate": 2.39178004819885e-06, "loss": 0.59872973, "num_input_tokens_seen": 162068705, "step": 7553, "time_per_iteration": 3.1456856727600098 }, { "auxiliary_loss_clip": 0.01064062, "auxiliary_loss_mlp": 0.01034097, "balance_loss_clip": 1.04350471, "balance_loss_mlp": 1.02177691, "epoch": 0.4541710506538404, "flos": 28511743841280.0, "grad_norm": 1.3658485385977341, "language_loss": 0.76709622, "learning_rate": 2.3913981245725626e-06, "loss": 0.78807783, "num_input_tokens_seen": 162089655, "step": 7554, "time_per_iteration": 2.8080356121063232 }, { "auxiliary_loss_clip": 0.01108851, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.0467329, "balance_loss_mlp": 1.02056265, "epoch": 0.45423117390650836, "flos": 17675591742720.0, "grad_norm": 3.0408177613289014, "language_loss": 0.7764836, "learning_rate": 2.3910161861044194e-06, "loss": 0.79792738, "num_input_tokens_seen": 162108465, "step": 7555, "time_per_iteration": 2.6776504516601562 }, { "auxiliary_loss_clip": 0.01059757, "auxiliary_loss_mlp": 0.01032208, "balance_loss_clip": 1.04157853, "balance_loss_mlp": 1.01914918, "epoch": 0.4542912971591763, "flos": 28072556248320.0, "grad_norm": 1.7035673731774164, "language_loss": 0.72646725, "learning_rate": 2.390634232808903e-06, "loss": 0.74738687, "num_input_tokens_seen": 162129910, "step": 7556, "time_per_iteration": 2.851022720336914 }, { "auxiliary_loss_clip": 0.01133495, "auxiliary_loss_mlp": 0.01038462, "balance_loss_clip": 1.04808855, "balance_loss_mlp": 1.02491426, "epoch": 0.4543514204118443, "flos": 22671771108480.0, "grad_norm": 2.040538066845486, "language_loss": 0.6298486, "learning_rate": 2.3902522647004982e-06, "loss": 0.65156817, "num_input_tokens_seen": 162148840, "step": 7557, "time_per_iteration": 2.7630646228790283 }, { "auxiliary_loss_clip": 0.01029784, "auxiliary_loss_mlp": 0.0100461, "balance_loss_clip": 1.02091062, "balance_loss_mlp": 1.00302434, "epoch": 0.45441154366451225, "flos": 58216549921920.0, "grad_norm": 0.683633086089208, "language_loss": 0.57569897, "learning_rate": 2.3898702817936875e-06, "loss": 0.59604287, "num_input_tokens_seen": 162208500, "step": 7558, "time_per_iteration": 3.1137866973876953 }, { "auxiliary_loss_clip": 0.01120146, "auxiliary_loss_mlp": 0.0104176, "balance_loss_clip": 1.04774594, "balance_loss_mlp": 1.02645946, "epoch": 0.4544716669171802, "flos": 16764286763520.0, "grad_norm": 4.36821938683546, "language_loss": 0.56214309, "learning_rate": 2.3894882841029573e-06, "loss": 0.58376217, "num_input_tokens_seen": 162224650, "step": 7559, "time_per_iteration": 2.6453661918640137 }, { "auxiliary_loss_clip": 0.01114034, "auxiliary_loss_mlp": 0.00771404, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.00053644, "epoch": 0.4545317901698482, "flos": 15925233991680.0, "grad_norm": 3.62707185125481, "language_loss": 0.72154331, "learning_rate": 2.389106271642792e-06, "loss": 0.74039769, "num_input_tokens_seen": 162242930, "step": 7560, "time_per_iteration": 2.734957456588745 }, { "auxiliary_loss_clip": 0.01047807, "auxiliary_loss_mlp": 0.01042508, "balance_loss_clip": 1.03757131, "balance_loss_mlp": 1.02745199, "epoch": 0.45459191342251615, "flos": 17639752947840.0, "grad_norm": 2.1379103724447517, "language_loss": 0.69509232, "learning_rate": 2.3887242444276775e-06, "loss": 0.71599543, "num_input_tokens_seen": 162261455, "step": 7561, "time_per_iteration": 2.8633503913879395 }, { "auxiliary_loss_clip": 0.01103836, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.04502749, "balance_loss_mlp": 1.02508128, "epoch": 0.4546520366751841, "flos": 16176608346240.0, "grad_norm": 1.7850356135584633, "language_loss": 0.85308814, "learning_rate": 2.3883422024721015e-06, "loss": 0.87450719, "num_input_tokens_seen": 162279725, "step": 7562, "time_per_iteration": 2.6936264038085938 }, { "auxiliary_loss_clip": 0.01113259, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.04309893, "balance_loss_mlp": 1.0244745, "epoch": 0.4547121599278521, "flos": 19751443562880.0, "grad_norm": 1.7930294917475702, "language_loss": 0.89894032, "learning_rate": 2.38796014579055e-06, "loss": 0.92045587, "num_input_tokens_seen": 162297865, "step": 7563, "time_per_iteration": 2.6632707118988037 }, { "auxiliary_loss_clip": 0.01128772, "auxiliary_loss_mlp": 0.00772113, "balance_loss_clip": 1.04633093, "balance_loss_mlp": 1.00060475, "epoch": 0.45477228318052004, "flos": 19937461121280.0, "grad_norm": 1.7120070486519374, "language_loss": 0.71349525, "learning_rate": 2.3875780743975097e-06, "loss": 0.73250407, "num_input_tokens_seen": 162316010, "step": 7564, "time_per_iteration": 2.6610071659088135 }, { "auxiliary_loss_clip": 0.01118776, "auxiliary_loss_mlp": 0.01037586, "balance_loss_clip": 1.04351079, "balance_loss_mlp": 1.02376413, "epoch": 0.454832406433188, "flos": 21288312829440.0, "grad_norm": 2.3273072225052998, "language_loss": 0.67977536, "learning_rate": 2.3871959883074713e-06, "loss": 0.70133895, "num_input_tokens_seen": 162336115, "step": 7565, "time_per_iteration": 2.645447015762329 }, { "auxiliary_loss_clip": 0.01084701, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.04171932, "balance_loss_mlp": 1.02002633, "epoch": 0.45489252968585603, "flos": 24498726612480.0, "grad_norm": 1.877770036567151, "language_loss": 0.80176723, "learning_rate": 2.386813887534922e-06, "loss": 0.82294714, "num_input_tokens_seen": 162355705, "step": 7566, "time_per_iteration": 2.7949163913726807 }, { "auxiliary_loss_clip": 0.01090452, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.04210711, "balance_loss_mlp": 1.01981235, "epoch": 0.454952652938524, "flos": 17092474352640.0, "grad_norm": 1.6100724605132029, "language_loss": 0.73702621, "learning_rate": 2.3864317720943508e-06, "loss": 0.75828493, "num_input_tokens_seen": 162374055, "step": 7567, "time_per_iteration": 2.8082687854766846 }, { "auxiliary_loss_clip": 0.01093893, "auxiliary_loss_mlp": 0.01039284, "balance_loss_clip": 1.04401243, "balance_loss_mlp": 1.02519345, "epoch": 0.45501277619119196, "flos": 27630387826560.0, "grad_norm": 1.3909583171669249, "language_loss": 0.81125635, "learning_rate": 2.386049642000249e-06, "loss": 0.83258814, "num_input_tokens_seen": 162393560, "step": 7568, "time_per_iteration": 2.7837767601013184 }, { "auxiliary_loss_clip": 0.01126615, "auxiliary_loss_mlp": 0.01047153, "balance_loss_clip": 1.04950857, "balance_loss_mlp": 1.03145313, "epoch": 0.4550728994438599, "flos": 19974664632960.0, "grad_norm": 2.2201304610210175, "language_loss": 0.79881442, "learning_rate": 2.3856674972671055e-06, "loss": 0.82055211, "num_input_tokens_seen": 162413170, "step": 7569, "time_per_iteration": 2.6318490505218506 }, { "auxiliary_loss_clip": 0.01121847, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.04655576, "balance_loss_mlp": 1.02286983, "epoch": 0.4551330226965279, "flos": 26066873646720.0, "grad_norm": 1.3612588382742794, "language_loss": 0.75316679, "learning_rate": 2.385285337909412e-06, "loss": 0.77476597, "num_input_tokens_seen": 162434080, "step": 7570, "time_per_iteration": 2.6693389415740967 }, { "auxiliary_loss_clip": 0.0110874, "auxiliary_loss_mlp": 0.01042662, "balance_loss_clip": 1.0496285, "balance_loss_mlp": 1.02787971, "epoch": 0.45519314594919585, "flos": 32781091501440.0, "grad_norm": 1.7331933441120846, "language_loss": 0.74851429, "learning_rate": 2.3849031639416596e-06, "loss": 0.77002835, "num_input_tokens_seen": 162455445, "step": 7571, "time_per_iteration": 2.8367550373077393 }, { "auxiliary_loss_clip": 0.01118243, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.04903221, "balance_loss_mlp": 1.02305007, "epoch": 0.4552532692018638, "flos": 19172671718400.0, "grad_norm": 1.8103885190184377, "language_loss": 0.81033444, "learning_rate": 2.3845209753783414e-06, "loss": 0.83188736, "num_input_tokens_seen": 162474940, "step": 7572, "time_per_iteration": 2.654205322265625 }, { "auxiliary_loss_clip": 0.01114723, "auxiliary_loss_mlp": 0.01041135, "balance_loss_clip": 1.04709005, "balance_loss_mlp": 1.02511287, "epoch": 0.4553133924545318, "flos": 26027156183040.0, "grad_norm": 1.7361541689984175, "language_loss": 0.7262516, "learning_rate": 2.3841387722339486e-06, "loss": 0.74781018, "num_input_tokens_seen": 162493340, "step": 7573, "time_per_iteration": 2.7468600273132324 }, { "auxiliary_loss_clip": 0.01124507, "auxiliary_loss_mlp": 0.01039816, "balance_loss_clip": 1.04916418, "balance_loss_mlp": 1.02327013, "epoch": 0.45537351570719975, "flos": 30661535808000.0, "grad_norm": 1.869301925708578, "language_loss": 0.74335551, "learning_rate": 2.3837565545229748e-06, "loss": 0.76499879, "num_input_tokens_seen": 162514360, "step": 7574, "time_per_iteration": 2.7575597763061523 }, { "auxiliary_loss_clip": 0.01121884, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.04758859, "balance_loss_mlp": 1.02184868, "epoch": 0.4554336389598677, "flos": 24353396184960.0, "grad_norm": 1.5603127476263212, "language_loss": 0.7161333, "learning_rate": 2.383374322259915e-06, "loss": 0.7377193, "num_input_tokens_seen": 162535240, "step": 7575, "time_per_iteration": 2.6638269424438477 }, { "auxiliary_loss_clip": 0.01106959, "auxiliary_loss_mlp": 0.01035456, "balance_loss_clip": 1.04536855, "balance_loss_mlp": 1.02120471, "epoch": 0.4554937622125357, "flos": 20557925677440.0, "grad_norm": 1.872589408642276, "language_loss": 0.73370463, "learning_rate": 2.3829920754592617e-06, "loss": 0.7551288, "num_input_tokens_seen": 162553880, "step": 7576, "time_per_iteration": 2.686311721801758 }, { "auxiliary_loss_clip": 0.01129005, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.04784572, "balance_loss_mlp": 1.02179956, "epoch": 0.45555388546520365, "flos": 22820764723200.0, "grad_norm": 1.7873556557153987, "language_loss": 0.66664052, "learning_rate": 2.382609814135511e-06, "loss": 0.68829584, "num_input_tokens_seen": 162574485, "step": 7577, "time_per_iteration": 2.6766581535339355 }, { "auxiliary_loss_clip": 0.01103092, "auxiliary_loss_mlp": 0.01046596, "balance_loss_clip": 1.04435253, "balance_loss_mlp": 1.0300076, "epoch": 0.4556140087178716, "flos": 21725992051200.0, "grad_norm": 1.9298557564452474, "language_loss": 0.74309111, "learning_rate": 2.382227538303157e-06, "loss": 0.76458794, "num_input_tokens_seen": 162595130, "step": 7578, "time_per_iteration": 4.310480356216431 }, { "auxiliary_loss_clip": 0.01079377, "auxiliary_loss_mlp": 0.00774819, "balance_loss_clip": 1.04437256, "balance_loss_mlp": 1.00061071, "epoch": 0.45567413197053963, "flos": 25994513698560.0, "grad_norm": 1.7583976894464832, "language_loss": 0.69843179, "learning_rate": 2.381845247976697e-06, "loss": 0.71697378, "num_input_tokens_seen": 162615720, "step": 7579, "time_per_iteration": 4.325899362564087 }, { "auxiliary_loss_clip": 0.01116252, "auxiliary_loss_mlp": 0.01033231, "balance_loss_clip": 1.0446142, "balance_loss_mlp": 1.0195992, "epoch": 0.4557342552232076, "flos": 21537604195200.0, "grad_norm": 1.7639178263730233, "language_loss": 0.78628397, "learning_rate": 2.381462943170627e-06, "loss": 0.80777884, "num_input_tokens_seen": 162635825, "step": 7580, "time_per_iteration": 2.6391446590423584 }, { "auxiliary_loss_clip": 0.0113405, "auxiliary_loss_mlp": 0.01031474, "balance_loss_clip": 1.05214024, "balance_loss_mlp": 1.01697779, "epoch": 0.45579437847587556, "flos": 40001972647680.0, "grad_norm": 1.99718885063772, "language_loss": 0.68943548, "learning_rate": 2.381080623899444e-06, "loss": 0.71109068, "num_input_tokens_seen": 162659130, "step": 7581, "time_per_iteration": 4.234206914901733 }, { "auxiliary_loss_clip": 0.01111938, "auxiliary_loss_mlp": 0.01032821, "balance_loss_clip": 1.04282808, "balance_loss_mlp": 1.01836669, "epoch": 0.4558545017285435, "flos": 31138501530240.0, "grad_norm": 1.6647606381596314, "language_loss": 0.73356318, "learning_rate": 2.3806982901776455e-06, "loss": 0.75501084, "num_input_tokens_seen": 162681665, "step": 7582, "time_per_iteration": 4.333024978637695 }, { "auxiliary_loss_clip": 0.0113626, "auxiliary_loss_mlp": 0.01043946, "balance_loss_clip": 1.05043411, "balance_loss_mlp": 1.02829337, "epoch": 0.4559146249812115, "flos": 21725776569600.0, "grad_norm": 1.9011112097623832, "language_loss": 0.72327513, "learning_rate": 2.380315942019729e-06, "loss": 0.74507719, "num_input_tokens_seen": 162702040, "step": 7583, "time_per_iteration": 2.633423089981079 }, { "auxiliary_loss_clip": 0.01122524, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.05119634, "balance_loss_mlp": 1.02291131, "epoch": 0.45597474823387946, "flos": 23805973935360.0, "grad_norm": 1.6028864846132196, "language_loss": 0.72692537, "learning_rate": 2.379933579440195e-06, "loss": 0.74852461, "num_input_tokens_seen": 162722375, "step": 7584, "time_per_iteration": 2.6895499229431152 }, { "auxiliary_loss_clip": 0.01089384, "auxiliary_loss_mlp": 0.01040718, "balance_loss_clip": 1.04311633, "balance_loss_mlp": 1.02606773, "epoch": 0.4560348714865474, "flos": 31905661230720.0, "grad_norm": 1.833639423310481, "language_loss": 0.68204761, "learning_rate": 2.379551202453541e-06, "loss": 0.70334864, "num_input_tokens_seen": 162746095, "step": 7585, "time_per_iteration": 2.7882261276245117 }, { "auxiliary_loss_clip": 0.01132515, "auxiliary_loss_mlp": 0.01030518, "balance_loss_clip": 1.05002046, "balance_loss_mlp": 1.01725006, "epoch": 0.4560949947392154, "flos": 22048828513920.0, "grad_norm": 1.65915998971852, "language_loss": 0.7634117, "learning_rate": 2.379168811074267e-06, "loss": 0.78504205, "num_input_tokens_seen": 162766330, "step": 7586, "time_per_iteration": 2.636626720428467 }, { "auxiliary_loss_clip": 0.01109504, "auxiliary_loss_mlp": 0.01029829, "balance_loss_clip": 1.04642403, "balance_loss_mlp": 1.01651323, "epoch": 0.45615511799188335, "flos": 24571804832640.0, "grad_norm": 44.63874812648689, "language_loss": 0.78151405, "learning_rate": 2.3787864053168747e-06, "loss": 0.80290735, "num_input_tokens_seen": 162784755, "step": 7587, "time_per_iteration": 2.7801096439361572 }, { "auxiliary_loss_clip": 0.01105539, "auxiliary_loss_mlp": 0.01044536, "balance_loss_clip": 1.04288149, "balance_loss_mlp": 1.02933669, "epoch": 0.4562152412445513, "flos": 18330709944960.0, "grad_norm": 2.252015566278715, "language_loss": 0.6950196, "learning_rate": 2.378403985195863e-06, "loss": 0.71652043, "num_input_tokens_seen": 162803850, "step": 7588, "time_per_iteration": 2.7108840942382812 }, { "auxiliary_loss_clip": 0.01118383, "auxiliary_loss_mlp": 0.01036327, "balance_loss_clip": 1.05038464, "balance_loss_mlp": 1.02234375, "epoch": 0.4562753644972193, "flos": 13516525814400.0, "grad_norm": 1.6983482750091652, "language_loss": 0.79372728, "learning_rate": 2.378021550725735e-06, "loss": 0.81527448, "num_input_tokens_seen": 162820775, "step": 7589, "time_per_iteration": 2.6967854499816895 }, { "auxiliary_loss_clip": 0.01121003, "auxiliary_loss_mlp": 0.01035976, "balance_loss_clip": 1.04755974, "balance_loss_mlp": 1.02120006, "epoch": 0.45633548774988725, "flos": 29639697701760.0, "grad_norm": 2.457585749278853, "language_loss": 0.62875861, "learning_rate": 2.377639101920992e-06, "loss": 0.6503284, "num_input_tokens_seen": 162839695, "step": 7590, "time_per_iteration": 2.6659393310546875 }, { "auxiliary_loss_clip": 0.01101858, "auxiliary_loss_mlp": 0.01045493, "balance_loss_clip": 1.04248881, "balance_loss_mlp": 1.03150392, "epoch": 0.4563956110025552, "flos": 22233409528320.0, "grad_norm": 1.8064400322650376, "language_loss": 0.73125023, "learning_rate": 2.377256638796135e-06, "loss": 0.75272369, "num_input_tokens_seen": 162856095, "step": 7591, "time_per_iteration": 2.7296926975250244 }, { "auxiliary_loss_clip": 0.01113505, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.04979515, "balance_loss_mlp": 1.02757883, "epoch": 0.45645573425522323, "flos": 17092043389440.0, "grad_norm": 2.6622201495184923, "language_loss": 0.76661623, "learning_rate": 2.3768741613656695e-06, "loss": 0.78818369, "num_input_tokens_seen": 162874070, "step": 7592, "time_per_iteration": 2.855787992477417 }, { "auxiliary_loss_clip": 0.01104851, "auxiliary_loss_mlp": 0.01042123, "balance_loss_clip": 1.04489005, "balance_loss_mlp": 1.026191, "epoch": 0.4565158575078912, "flos": 20332334309760.0, "grad_norm": 2.112667667080726, "language_loss": 0.6938538, "learning_rate": 2.376491669644098e-06, "loss": 0.71532357, "num_input_tokens_seen": 162891000, "step": 7593, "time_per_iteration": 2.7688679695129395 }, { "auxiliary_loss_clip": 0.01110049, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.04238796, "balance_loss_mlp": 1.02174079, "epoch": 0.45657598076055916, "flos": 23983013093760.0, "grad_norm": 2.174557271524546, "language_loss": 0.83913857, "learning_rate": 2.3761091636459248e-06, "loss": 0.86058539, "num_input_tokens_seen": 162910120, "step": 7594, "time_per_iteration": 2.807098865509033 }, { "auxiliary_loss_clip": 0.01036589, "auxiliary_loss_mlp": 0.00753626, "balance_loss_clip": 1.01769352, "balance_loss_mlp": 1.00077426, "epoch": 0.45663610401322713, "flos": 69364297526400.0, "grad_norm": 0.7884707903863047, "language_loss": 0.52737939, "learning_rate": 2.375726643385654e-06, "loss": 0.54528153, "num_input_tokens_seen": 162963720, "step": 7595, "time_per_iteration": 3.2812860012054443 }, { "auxiliary_loss_clip": 0.01096992, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.04297972, "balance_loss_mlp": 1.01864684, "epoch": 0.4566962272658951, "flos": 15149095891200.0, "grad_norm": 2.562717754165903, "language_loss": 0.87188721, "learning_rate": 2.3753441088777915e-06, "loss": 0.89319921, "num_input_tokens_seen": 162975760, "step": 7596, "time_per_iteration": 2.683833122253418 }, { "auxiliary_loss_clip": 0.01126007, "auxiliary_loss_mlp": 0.01046188, "balance_loss_clip": 1.05094647, "balance_loss_mlp": 1.03226399, "epoch": 0.45675635051856306, "flos": 18697465762560.0, "grad_norm": 8.947162495751469, "language_loss": 0.77418292, "learning_rate": 2.374961560136843e-06, "loss": 0.79590482, "num_input_tokens_seen": 162994865, "step": 7597, "time_per_iteration": 2.686328887939453 }, { "auxiliary_loss_clip": 0.01117589, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.04493558, "balance_loss_mlp": 1.02389073, "epoch": 0.456816473771231, "flos": 19098300608640.0, "grad_norm": 1.6036220935275767, "language_loss": 0.78581583, "learning_rate": 2.374578997177314e-06, "loss": 0.80737466, "num_input_tokens_seen": 163014730, "step": 7598, "time_per_iteration": 2.6856606006622314 }, { "auxiliary_loss_clip": 0.01128723, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.04699326, "balance_loss_mlp": 1.02080941, "epoch": 0.456876597023899, "flos": 28950069507840.0, "grad_norm": 3.021485745265107, "language_loss": 0.71589166, "learning_rate": 2.374196420013712e-06, "loss": 0.73752177, "num_input_tokens_seen": 163033405, "step": 7599, "time_per_iteration": 2.672055244445801 }, { "auxiliary_loss_clip": 0.0109465, "auxiliary_loss_mlp": 0.01038748, "balance_loss_clip": 1.04185176, "balance_loss_mlp": 1.02445507, "epoch": 0.45693672027656695, "flos": 23289470317440.0, "grad_norm": 2.0431074720876046, "language_loss": 0.70262265, "learning_rate": 2.373813828660544e-06, "loss": 0.72395658, "num_input_tokens_seen": 163051400, "step": 7600, "time_per_iteration": 2.8163371086120605 }, { "auxiliary_loss_clip": 0.01066248, "auxiliary_loss_mlp": 0.01041467, "balance_loss_clip": 1.04143667, "balance_loss_mlp": 1.02802658, "epoch": 0.4569968435292349, "flos": 20558212986240.0, "grad_norm": 6.700465706217943, "language_loss": 0.79066253, "learning_rate": 2.373431223132319e-06, "loss": 0.81173962, "num_input_tokens_seen": 163069250, "step": 7601, "time_per_iteration": 2.8098480701446533 }, { "auxiliary_loss_clip": 0.01100447, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.04293573, "balance_loss_mlp": 1.02730095, "epoch": 0.4570569667819029, "flos": 41282619223680.0, "grad_norm": 6.824528646616988, "language_loss": 0.71565419, "learning_rate": 2.3730486034435448e-06, "loss": 0.73707151, "num_input_tokens_seen": 163091755, "step": 7602, "time_per_iteration": 2.8971548080444336 }, { "auxiliary_loss_clip": 0.01115269, "auxiliary_loss_mlp": 0.01034582, "balance_loss_clip": 1.04276979, "balance_loss_mlp": 1.01859641, "epoch": 0.45711709003457085, "flos": 26031573555840.0, "grad_norm": 1.8661067599139867, "language_loss": 0.73023772, "learning_rate": 2.372665969608729e-06, "loss": 0.75173628, "num_input_tokens_seen": 163111600, "step": 7603, "time_per_iteration": 2.709261417388916 }, { "auxiliary_loss_clip": 0.01120961, "auxiliary_loss_mlp": 0.01043179, "balance_loss_clip": 1.04799032, "balance_loss_mlp": 1.02714539, "epoch": 0.4571772132872388, "flos": 22158068751360.0, "grad_norm": 1.901129043888336, "language_loss": 0.83068597, "learning_rate": 2.372283321642383e-06, "loss": 0.85232735, "num_input_tokens_seen": 163127350, "step": 7604, "time_per_iteration": 2.713744640350342 }, { "auxiliary_loss_clip": 0.01113838, "auxiliary_loss_mlp": 0.01045941, "balance_loss_clip": 1.05216503, "balance_loss_mlp": 1.02981162, "epoch": 0.45723733653990684, "flos": 23878872587520.0, "grad_norm": 2.0592585158299133, "language_loss": 0.85998154, "learning_rate": 2.371900659559016e-06, "loss": 0.88157928, "num_input_tokens_seen": 163145855, "step": 7605, "time_per_iteration": 2.6666319370269775 }, { "auxiliary_loss_clip": 0.010831, "auxiliary_loss_mlp": 0.01041844, "balance_loss_clip": 1.04206753, "balance_loss_mlp": 1.02670407, "epoch": 0.4572974597925748, "flos": 16871803148160.0, "grad_norm": 1.8551011968860212, "language_loss": 0.73551464, "learning_rate": 2.371517983373138e-06, "loss": 0.75676405, "num_input_tokens_seen": 163163830, "step": 7606, "time_per_iteration": 2.8618602752685547 }, { "auxiliary_loss_clip": 0.01100268, "auxiliary_loss_mlp": 0.01043762, "balance_loss_clip": 1.0450927, "balance_loss_mlp": 1.02790761, "epoch": 0.45735758304524277, "flos": 13771491528960.0, "grad_norm": 1.9296458941386103, "language_loss": 0.80260599, "learning_rate": 2.371135293099262e-06, "loss": 0.82404631, "num_input_tokens_seen": 163180700, "step": 7607, "time_per_iteration": 2.717987537384033 }, { "auxiliary_loss_clip": 0.01097097, "auxiliary_loss_mlp": 0.01046228, "balance_loss_clip": 1.05015063, "balance_loss_mlp": 1.03169668, "epoch": 0.45741770629791073, "flos": 21100750986240.0, "grad_norm": 1.7686881404445909, "language_loss": 0.81263912, "learning_rate": 2.3707525887518982e-06, "loss": 0.83407241, "num_input_tokens_seen": 163199450, "step": 7608, "time_per_iteration": 2.7047500610351562 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01043615, "balance_loss_clip": 1.04563498, "balance_loss_mlp": 1.02828515, "epoch": 0.4574778295505787, "flos": 23112898035840.0, "grad_norm": 3.284613619336592, "language_loss": 0.68429869, "learning_rate": 2.370369870345559e-06, "loss": 0.70584166, "num_input_tokens_seen": 163217875, "step": 7609, "time_per_iteration": 2.7123308181762695 }, { "auxiliary_loss_clip": 0.01105383, "auxiliary_loss_mlp": 0.01045291, "balance_loss_clip": 1.04979467, "balance_loss_mlp": 1.03011012, "epoch": 0.45753795280324666, "flos": 24352929308160.0, "grad_norm": 1.7858891409698046, "language_loss": 0.80873275, "learning_rate": 2.369987137894757e-06, "loss": 0.83023953, "num_input_tokens_seen": 163237430, "step": 7610, "time_per_iteration": 2.707108497619629 }, { "auxiliary_loss_clip": 0.01122367, "auxiliary_loss_mlp": 0.01042029, "balance_loss_clip": 1.04675138, "balance_loss_mlp": 1.02698421, "epoch": 0.4575980760559146, "flos": 16653789550080.0, "grad_norm": 2.2133206913732746, "language_loss": 0.82100248, "learning_rate": 2.3696043914140057e-06, "loss": 0.84264642, "num_input_tokens_seen": 163253905, "step": 7611, "time_per_iteration": 2.6911368370056152 }, { "auxiliary_loss_clip": 0.01127544, "auxiliary_loss_mlp": 0.01034771, "balance_loss_clip": 1.05061793, "balance_loss_mlp": 1.01889205, "epoch": 0.4576581993085826, "flos": 35911423912320.0, "grad_norm": 2.6253593942917677, "language_loss": 0.73971558, "learning_rate": 2.369221630917819e-06, "loss": 0.76133871, "num_input_tokens_seen": 163274285, "step": 7612, "time_per_iteration": 2.8162691593170166 }, { "auxiliary_loss_clip": 0.01103651, "auxiliary_loss_mlp": 0.01042157, "balance_loss_clip": 1.04241323, "balance_loss_mlp": 1.02680302, "epoch": 0.45771832256125056, "flos": 20080421251200.0, "grad_norm": 1.6042487302929564, "language_loss": 0.84652913, "learning_rate": 2.368838856420711e-06, "loss": 0.86798728, "num_input_tokens_seen": 163293150, "step": 7613, "time_per_iteration": 2.66471266746521 }, { "auxiliary_loss_clip": 0.01096161, "auxiliary_loss_mlp": 0.01038746, "balance_loss_clip": 1.04437852, "balance_loss_mlp": 1.02373135, "epoch": 0.4577784458139185, "flos": 10744329957120.0, "grad_norm": 2.314421678604919, "language_loss": 0.75271547, "learning_rate": 2.3684560679371965e-06, "loss": 0.77406454, "num_input_tokens_seen": 163310065, "step": 7614, "time_per_iteration": 2.740011215209961 }, { "auxiliary_loss_clip": 0.01132592, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.05067575, "balance_loss_mlp": 1.02378809, "epoch": 0.4578385690665865, "flos": 21907269014400.0, "grad_norm": 1.5980870069512307, "language_loss": 0.75026065, "learning_rate": 2.368073265481791e-06, "loss": 0.77196622, "num_input_tokens_seen": 163329415, "step": 7615, "time_per_iteration": 2.694354772567749 }, { "auxiliary_loss_clip": 0.01037366, "auxiliary_loss_mlp": 0.01005104, "balance_loss_clip": 1.02879357, "balance_loss_mlp": 1.00286281, "epoch": 0.45789869231925445, "flos": 64758286667520.0, "grad_norm": 0.785268606967784, "language_loss": 0.57671446, "learning_rate": 2.3676904490690105e-06, "loss": 0.59713912, "num_input_tokens_seen": 163385875, "step": 7616, "time_per_iteration": 3.2036197185516357 }, { "auxiliary_loss_clip": 0.010986, "auxiliary_loss_mlp": 0.00772301, "balance_loss_clip": 1.04307699, "balance_loss_mlp": 1.00081253, "epoch": 0.4579588155719224, "flos": 16144001775360.0, "grad_norm": 1.6020549029918738, "language_loss": 0.70836008, "learning_rate": 2.3673076187133704e-06, "loss": 0.72706908, "num_input_tokens_seen": 163405170, "step": 7617, "time_per_iteration": 2.7075886726379395 }, { "auxiliary_loss_clip": 0.01137127, "auxiliary_loss_mlp": 0.01037359, "balance_loss_clip": 1.05343175, "balance_loss_mlp": 1.02264261, "epoch": 0.45801893882459044, "flos": 21395541905280.0, "grad_norm": 1.8894449061399028, "language_loss": 0.76292491, "learning_rate": 2.36692477442939e-06, "loss": 0.78466976, "num_input_tokens_seen": 163423155, "step": 7618, "time_per_iteration": 5.8249146938323975 }, { "auxiliary_loss_clip": 0.01101544, "auxiliary_loss_mlp": 0.01045871, "balance_loss_clip": 1.05301738, "balance_loss_mlp": 1.03189957, "epoch": 0.4580790620772584, "flos": 19536554448000.0, "grad_norm": 1.7481433677396025, "language_loss": 0.77097881, "learning_rate": 2.366541916231585e-06, "loss": 0.79245299, "num_input_tokens_seen": 163442450, "step": 7619, "time_per_iteration": 2.766615629196167 }, { "auxiliary_loss_clip": 0.01134342, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.05348432, "balance_loss_mlp": 1.02757239, "epoch": 0.45813918532992637, "flos": 16581070465920.0, "grad_norm": 1.8920903156272437, "language_loss": 0.72002041, "learning_rate": 2.366159044134473e-06, "loss": 0.74176759, "num_input_tokens_seen": 163459810, "step": 7620, "time_per_iteration": 4.087975025177002 }, { "auxiliary_loss_clip": 0.01109227, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.04942107, "balance_loss_mlp": 1.01892948, "epoch": 0.45819930858259433, "flos": 42230301701760.0, "grad_norm": 1.5465249381842834, "language_loss": 0.77770388, "learning_rate": 2.3657761581525748e-06, "loss": 0.79912305, "num_input_tokens_seen": 163482970, "step": 7621, "time_per_iteration": 2.9124109745025635 }, { "auxiliary_loss_clip": 0.01044673, "auxiliary_loss_mlp": 0.01001257, "balance_loss_clip": 1.02584982, "balance_loss_mlp": 0.99903959, "epoch": 0.4582594318352623, "flos": 63714795638400.0, "grad_norm": 0.7823065471017115, "language_loss": 0.64958, "learning_rate": 2.3653932583004063e-06, "loss": 0.6700393, "num_input_tokens_seen": 163545330, "step": 7622, "time_per_iteration": 4.778898477554321 }, { "auxiliary_loss_clip": 0.01120212, "auxiliary_loss_mlp": 0.01034924, "balance_loss_clip": 1.05105555, "balance_loss_mlp": 1.02016604, "epoch": 0.45831955508793026, "flos": 26869979882880.0, "grad_norm": 3.654827974152138, "language_loss": 0.79468191, "learning_rate": 2.3650103445924903e-06, "loss": 0.81623328, "num_input_tokens_seen": 163564620, "step": 7623, "time_per_iteration": 2.7033259868621826 }, { "auxiliary_loss_clip": 0.01078844, "auxiliary_loss_mlp": 0.01041828, "balance_loss_clip": 1.04181957, "balance_loss_mlp": 1.02728403, "epoch": 0.45837967834059823, "flos": 18733951002240.0, "grad_norm": 1.8933831090876323, "language_loss": 0.70283759, "learning_rate": 2.3646274170433452e-06, "loss": 0.72404432, "num_input_tokens_seen": 163581010, "step": 7624, "time_per_iteration": 2.8526861667633057 }, { "auxiliary_loss_clip": 0.01100025, "auxiliary_loss_mlp": 0.01040188, "balance_loss_clip": 1.04250479, "balance_loss_mlp": 1.02558446, "epoch": 0.4584398015932662, "flos": 21178102924800.0, "grad_norm": 2.2295023596293273, "language_loss": 0.73171687, "learning_rate": 2.364244475667491e-06, "loss": 0.75311905, "num_input_tokens_seen": 163599955, "step": 7625, "time_per_iteration": 2.77284574508667 }, { "auxiliary_loss_clip": 0.01120178, "auxiliary_loss_mlp": 0.01036964, "balance_loss_clip": 1.05209434, "balance_loss_mlp": 1.02369022, "epoch": 0.45849992484593416, "flos": 19790047704960.0, "grad_norm": 2.499945379712242, "language_loss": 0.77924562, "learning_rate": 2.363861520479451e-06, "loss": 0.80081707, "num_input_tokens_seen": 163618545, "step": 7626, "time_per_iteration": 2.813945770263672 }, { "auxiliary_loss_clip": 0.01137615, "auxiliary_loss_mlp": 0.01040207, "balance_loss_clip": 1.05263078, "balance_loss_mlp": 1.02645612, "epoch": 0.4585600480986021, "flos": 18223265387520.0, "grad_norm": 1.5689934094814115, "language_loss": 0.84652817, "learning_rate": 2.3634785514937445e-06, "loss": 0.8683064, "num_input_tokens_seen": 163636055, "step": 7627, "time_per_iteration": 2.659053087234497 }, { "auxiliary_loss_clip": 0.01138145, "auxiliary_loss_mlp": 0.01040233, "balance_loss_clip": 1.05155802, "balance_loss_mlp": 1.02531946, "epoch": 0.4586201713512701, "flos": 29022213974400.0, "grad_norm": 1.5125222475387885, "language_loss": 0.6911087, "learning_rate": 2.3630955687248953e-06, "loss": 0.71289253, "num_input_tokens_seen": 163657485, "step": 7628, "time_per_iteration": 2.693678617477417 }, { "auxiliary_loss_clip": 0.01118783, "auxiliary_loss_mlp": 0.01034859, "balance_loss_clip": 1.04731619, "balance_loss_mlp": 1.02110827, "epoch": 0.45868029460393805, "flos": 23404600385280.0, "grad_norm": 1.4972122231294245, "language_loss": 0.78672099, "learning_rate": 2.3627125721874265e-06, "loss": 0.80825746, "num_input_tokens_seen": 163676030, "step": 7629, "time_per_iteration": 2.6437535285949707 }, { "auxiliary_loss_clip": 0.01113389, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.04590559, "balance_loss_mlp": 1.03034973, "epoch": 0.458740417856606, "flos": 18221972497920.0, "grad_norm": 2.2059444062956985, "language_loss": 0.79377991, "learning_rate": 2.3623295618958595e-06, "loss": 0.81536937, "num_input_tokens_seen": 163694490, "step": 7630, "time_per_iteration": 2.7565791606903076 }, { "auxiliary_loss_clip": 0.01111942, "auxiliary_loss_mlp": 0.01039415, "balance_loss_clip": 1.04838312, "balance_loss_mlp": 1.02481222, "epoch": 0.458800541109274, "flos": 34568760504960.0, "grad_norm": 2.1212994157581293, "language_loss": 0.72087741, "learning_rate": 2.3619465378647198e-06, "loss": 0.74239099, "num_input_tokens_seen": 163717035, "step": 7631, "time_per_iteration": 2.7880306243896484 }, { "auxiliary_loss_clip": 0.01094955, "auxiliary_loss_mlp": 0.01048432, "balance_loss_clip": 1.04605651, "balance_loss_mlp": 1.03280342, "epoch": 0.458860664361942, "flos": 17712112896000.0, "grad_norm": 2.4606182879569145, "language_loss": 0.71433818, "learning_rate": 2.361563500108531e-06, "loss": 0.73577201, "num_input_tokens_seen": 163734525, "step": 7632, "time_per_iteration": 2.7352800369262695 }, { "auxiliary_loss_clip": 0.01081835, "auxiliary_loss_mlp": 0.00774034, "balance_loss_clip": 1.04268694, "balance_loss_mlp": 1.00058782, "epoch": 0.45892078761460997, "flos": 18441889516800.0, "grad_norm": 2.5758659525876824, "language_loss": 0.68867576, "learning_rate": 2.3611804486418178e-06, "loss": 0.7072345, "num_input_tokens_seen": 163752860, "step": 7633, "time_per_iteration": 2.848534107208252 }, { "auxiliary_loss_clip": 0.01122955, "auxiliary_loss_mlp": 0.01043952, "balance_loss_clip": 1.05012798, "balance_loss_mlp": 1.02942061, "epoch": 0.45898091086727794, "flos": 22672956257280.0, "grad_norm": 1.690968390723207, "language_loss": 0.80858737, "learning_rate": 2.3607973834791062e-06, "loss": 0.83025646, "num_input_tokens_seen": 163772495, "step": 7634, "time_per_iteration": 2.6536448001861572 }, { "auxiliary_loss_clip": 0.01122911, "auxiliary_loss_mlp": 0.00773021, "balance_loss_clip": 1.04987049, "balance_loss_mlp": 1.00053596, "epoch": 0.4590410341199459, "flos": 21652949744640.0, "grad_norm": 1.6933583063541449, "language_loss": 0.81255853, "learning_rate": 2.3604143046349216e-06, "loss": 0.83151788, "num_input_tokens_seen": 163791475, "step": 7635, "time_per_iteration": 2.6140496730804443 }, { "auxiliary_loss_clip": 0.01110725, "auxiliary_loss_mlp": 0.01043522, "balance_loss_clip": 1.04990745, "balance_loss_mlp": 1.02941322, "epoch": 0.45910115737261387, "flos": 36535372087680.0, "grad_norm": 1.4938285014309638, "language_loss": 0.64786839, "learning_rate": 2.3600312121237905e-06, "loss": 0.66941082, "num_input_tokens_seen": 163812995, "step": 7636, "time_per_iteration": 2.9211695194244385 }, { "auxiliary_loss_clip": 0.01117391, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.05096126, "balance_loss_mlp": 1.0207361, "epoch": 0.45916128062528183, "flos": 24419866302720.0, "grad_norm": 1.5704675488980822, "language_loss": 0.8052876, "learning_rate": 2.3596481059602395e-06, "loss": 0.82680643, "num_input_tokens_seen": 163833945, "step": 7637, "time_per_iteration": 2.703902244567871 }, { "auxiliary_loss_clip": 0.0110221, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.04369295, "balance_loss_mlp": 1.02650893, "epoch": 0.4592214038779498, "flos": 23221958705280.0, "grad_norm": 1.340585421251073, "language_loss": 0.75339955, "learning_rate": 2.3592649861587965e-06, "loss": 0.7748512, "num_input_tokens_seen": 163853885, "step": 7638, "time_per_iteration": 2.8683316707611084 }, { "auxiliary_loss_clip": 0.01118666, "auxiliary_loss_mlp": 0.01037335, "balance_loss_clip": 1.04785442, "balance_loss_mlp": 1.02312553, "epoch": 0.45928152713061776, "flos": 19172133014400.0, "grad_norm": 1.8020175509044534, "language_loss": 0.74017608, "learning_rate": 2.358881852733989e-06, "loss": 0.76173615, "num_input_tokens_seen": 163871855, "step": 7639, "time_per_iteration": 2.6385724544525146 }, { "auxiliary_loss_clip": 0.01134704, "auxiliary_loss_mlp": 0.01038079, "balance_loss_clip": 1.05116391, "balance_loss_mlp": 1.02403021, "epoch": 0.4593416503832857, "flos": 22414686491520.0, "grad_norm": 1.704541952239469, "language_loss": 0.68183744, "learning_rate": 2.358498705700346e-06, "loss": 0.7035653, "num_input_tokens_seen": 163891450, "step": 7640, "time_per_iteration": 2.6786441802978516 }, { "auxiliary_loss_clip": 0.01104644, "auxiliary_loss_mlp": 0.01040873, "balance_loss_clip": 1.04305553, "balance_loss_mlp": 1.02640736, "epoch": 0.4594017736359537, "flos": 18880215183360.0, "grad_norm": 1.6440653073556697, "language_loss": 0.75610799, "learning_rate": 2.3581155450723958e-06, "loss": 0.77756315, "num_input_tokens_seen": 163909345, "step": 7641, "time_per_iteration": 2.6967337131500244 }, { "auxiliary_loss_clip": 0.01107468, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.04473758, "balance_loss_mlp": 1.01987791, "epoch": 0.45946189688862166, "flos": 20518567349760.0, "grad_norm": 1.7366807351650166, "language_loss": 0.7477932, "learning_rate": 2.357732370864668e-06, "loss": 0.76921976, "num_input_tokens_seen": 163926940, "step": 7642, "time_per_iteration": 2.7593836784362793 }, { "auxiliary_loss_clip": 0.01033439, "auxiliary_loss_mlp": 0.01015123, "balance_loss_clip": 1.02063584, "balance_loss_mlp": 1.01360917, "epoch": 0.4595220201412896, "flos": 61405990162560.0, "grad_norm": 0.8870453562304583, "language_loss": 0.58169055, "learning_rate": 2.357349183091694e-06, "loss": 0.60217613, "num_input_tokens_seen": 163977785, "step": 7643, "time_per_iteration": 3.008721351623535 }, { "auxiliary_loss_clip": 0.01126407, "auxiliary_loss_mlp": 0.01039184, "balance_loss_clip": 1.04902744, "balance_loss_mlp": 1.02468801, "epoch": 0.4595821433939576, "flos": 23330947547520.0, "grad_norm": 1.6727361984558426, "language_loss": 0.92977291, "learning_rate": 2.3569659817680016e-06, "loss": 0.95142883, "num_input_tokens_seen": 163996630, "step": 7644, "time_per_iteration": 2.6844348907470703 }, { "auxiliary_loss_clip": 0.01118806, "auxiliary_loss_mlp": 0.0103695, "balance_loss_clip": 1.04879534, "balance_loss_mlp": 1.02278805, "epoch": 0.4596422666466256, "flos": 14282356711680.0, "grad_norm": 2.49930104784668, "language_loss": 0.82485175, "learning_rate": 2.3565827669081243e-06, "loss": 0.84640932, "num_input_tokens_seen": 164013190, "step": 7645, "time_per_iteration": 2.649367332458496 }, { "auxiliary_loss_clip": 0.01010103, "auxiliary_loss_mlp": 0.00999811, "balance_loss_clip": 1.01816797, "balance_loss_mlp": 0.99795145, "epoch": 0.4597023898992936, "flos": 65727337737600.0, "grad_norm": 0.7581805782249401, "language_loss": 0.59857589, "learning_rate": 2.356199538526593e-06, "loss": 0.61867499, "num_input_tokens_seen": 164074030, "step": 7646, "time_per_iteration": 3.211512327194214 }, { "auxiliary_loss_clip": 0.01116258, "auxiliary_loss_mlp": 0.01035245, "balance_loss_clip": 1.04631102, "balance_loss_mlp": 1.02006984, "epoch": 0.45976251315196154, "flos": 26907075653760.0, "grad_norm": 1.794903772385352, "language_loss": 0.72503293, "learning_rate": 2.355816296637939e-06, "loss": 0.74654794, "num_input_tokens_seen": 164095515, "step": 7647, "time_per_iteration": 2.792795419692993 }, { "auxiliary_loss_clip": 0.01096575, "auxiliary_loss_mlp": 0.01041791, "balance_loss_clip": 1.04206514, "balance_loss_mlp": 1.02684855, "epoch": 0.4598226364046295, "flos": 26618066824320.0, "grad_norm": 1.7350588372730733, "language_loss": 0.66805142, "learning_rate": 2.3554330412566957e-06, "loss": 0.68943512, "num_input_tokens_seen": 164117270, "step": 7648, "time_per_iteration": 2.798882484436035 }, { "auxiliary_loss_clip": 0.01120443, "auxiliary_loss_mlp": 0.01037713, "balance_loss_clip": 1.04601169, "balance_loss_mlp": 1.0234313, "epoch": 0.45988275965729747, "flos": 24387762522240.0, "grad_norm": 1.4487791655991338, "language_loss": 0.78854847, "learning_rate": 2.3550497723973953e-06, "loss": 0.81013, "num_input_tokens_seen": 164137850, "step": 7649, "time_per_iteration": 2.710026979446411 }, { "auxiliary_loss_clip": 0.01071387, "auxiliary_loss_mlp": 0.01039161, "balance_loss_clip": 1.0469979, "balance_loss_mlp": 1.02459955, "epoch": 0.45994288290996543, "flos": 24535822383360.0, "grad_norm": 1.68877556398497, "language_loss": 0.69140404, "learning_rate": 2.3546664900745726e-06, "loss": 0.71250951, "num_input_tokens_seen": 164157960, "step": 7650, "time_per_iteration": 2.862882375717163 }, { "auxiliary_loss_clip": 0.01128714, "auxiliary_loss_mlp": 0.01042169, "balance_loss_clip": 1.05184257, "balance_loss_mlp": 1.02592099, "epoch": 0.4600030061626334, "flos": 14830245838080.0, "grad_norm": 2.8986833449878686, "language_loss": 0.844868, "learning_rate": 2.354283194302761e-06, "loss": 0.86657685, "num_input_tokens_seen": 164174590, "step": 7651, "time_per_iteration": 2.624094247817993 }, { "auxiliary_loss_clip": 0.01108337, "auxiliary_loss_mlp": 0.00771732, "balance_loss_clip": 1.04726708, "balance_loss_mlp": 1.00045896, "epoch": 0.46006312941530136, "flos": 18113845582080.0, "grad_norm": 1.8740934460638858, "language_loss": 0.75375748, "learning_rate": 2.3538998850964948e-06, "loss": 0.77255821, "num_input_tokens_seen": 164192935, "step": 7652, "time_per_iteration": 2.7064099311828613 }, { "auxiliary_loss_clip": 0.01083449, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.04353166, "balance_loss_mlp": 1.019364, "epoch": 0.46012325266796933, "flos": 21976468565760.0, "grad_norm": 1.6780448716001595, "language_loss": 0.75990206, "learning_rate": 2.3535165624703097e-06, "loss": 0.78107214, "num_input_tokens_seen": 164213160, "step": 7653, "time_per_iteration": 2.840228319168091 }, { "auxiliary_loss_clip": 0.01090017, "auxiliary_loss_mlp": 0.01037352, "balance_loss_clip": 1.04773235, "balance_loss_mlp": 1.02063906, "epoch": 0.4601833759206373, "flos": 15268068714240.0, "grad_norm": 4.060223218919271, "language_loss": 0.65658432, "learning_rate": 2.353133226438741e-06, "loss": 0.67785805, "num_input_tokens_seen": 164229330, "step": 7654, "time_per_iteration": 2.8097331523895264 }, { "auxiliary_loss_clip": 0.0110323, "auxiliary_loss_mlp": 0.01038674, "balance_loss_clip": 1.04187179, "balance_loss_mlp": 1.02436912, "epoch": 0.46024349917330526, "flos": 27088999061760.0, "grad_norm": 1.8761760458574834, "language_loss": 0.79274917, "learning_rate": 2.3527498770163248e-06, "loss": 0.81416821, "num_input_tokens_seen": 164248240, "step": 7655, "time_per_iteration": 2.758086681365967 }, { "auxiliary_loss_clip": 0.01090903, "auxiliary_loss_mlp": 0.01032546, "balance_loss_clip": 1.0439781, "balance_loss_mlp": 1.01801491, "epoch": 0.4603036224259732, "flos": 24462923731200.0, "grad_norm": 1.6240518023721515, "language_loss": 0.68172526, "learning_rate": 2.3523665142175985e-06, "loss": 0.70295978, "num_input_tokens_seen": 164268020, "step": 7656, "time_per_iteration": 2.740079402923584 }, { "auxiliary_loss_clip": 0.01107571, "auxiliary_loss_mlp": 0.01034222, "balance_loss_clip": 1.04353023, "balance_loss_mlp": 1.02023935, "epoch": 0.4603637456786412, "flos": 28109292883200.0, "grad_norm": 2.01428243239582, "language_loss": 0.80944681, "learning_rate": 2.351983138057098e-06, "loss": 0.83086479, "num_input_tokens_seen": 164287305, "step": 7657, "time_per_iteration": 5.946510314941406 }, { "auxiliary_loss_clip": 0.01130018, "auxiliary_loss_mlp": 0.00771647, "balance_loss_clip": 1.04671657, "balance_loss_mlp": 1.00056028, "epoch": 0.4604238689313092, "flos": 24348942898560.0, "grad_norm": 2.997035997447325, "language_loss": 0.70678955, "learning_rate": 2.3515997485493623e-06, "loss": 0.72580624, "num_input_tokens_seen": 164306835, "step": 7658, "time_per_iteration": 2.710728883743286 }, { "auxiliary_loss_clip": 0.01037878, "auxiliary_loss_mlp": 0.01003053, "balance_loss_clip": 1.01928806, "balance_loss_mlp": 1.00126505, "epoch": 0.4604839921839772, "flos": 53606229431040.0, "grad_norm": 0.9879963677197028, "language_loss": 0.62104321, "learning_rate": 2.351216345708928e-06, "loss": 0.64145255, "num_input_tokens_seen": 164367095, "step": 7659, "time_per_iteration": 4.733903646469116 }, { "auxiliary_loss_clip": 0.01079557, "auxiliary_loss_mlp": 0.01042331, "balance_loss_clip": 1.04242504, "balance_loss_mlp": 1.02548122, "epoch": 0.46054411543664514, "flos": 31248424126080.0, "grad_norm": 1.6833434349921483, "language_loss": 0.68750244, "learning_rate": 2.350832929550336e-06, "loss": 0.70872128, "num_input_tokens_seen": 164388895, "step": 7660, "time_per_iteration": 2.8501877784729004 }, { "auxiliary_loss_clip": 0.01115644, "auxiliary_loss_mlp": 0.01039595, "balance_loss_clip": 1.04312992, "balance_loss_mlp": 1.02450275, "epoch": 0.4606042386893131, "flos": 24092863862400.0, "grad_norm": 4.508470627980692, "language_loss": 0.77059424, "learning_rate": 2.3504495000881227e-06, "loss": 0.79214668, "num_input_tokens_seen": 164409080, "step": 7661, "time_per_iteration": 4.375652313232422 }, { "auxiliary_loss_clip": 0.01111668, "auxiliary_loss_mlp": 0.01045702, "balance_loss_clip": 1.04530478, "balance_loss_mlp": 1.02989531, "epoch": 0.46066436194198107, "flos": 26578457101440.0, "grad_norm": 1.8557827945777399, "language_loss": 0.75165689, "learning_rate": 2.3500660573368305e-06, "loss": 0.77323061, "num_input_tokens_seen": 164427585, "step": 7662, "time_per_iteration": 2.654381513595581 }, { "auxiliary_loss_clip": 0.01104085, "auxiliary_loss_mlp": 0.01041771, "balance_loss_clip": 1.0422461, "balance_loss_mlp": 1.02585697, "epoch": 0.46072448519464904, "flos": 17775602184960.0, "grad_norm": 3.5055114571256922, "language_loss": 0.79886508, "learning_rate": 2.349682601310998e-06, "loss": 0.82032371, "num_input_tokens_seen": 164438455, "step": 7663, "time_per_iteration": 2.6240744590759277 }, { "auxiliary_loss_clip": 0.0111588, "auxiliary_loss_mlp": 0.01034844, "balance_loss_clip": 1.04562616, "balance_loss_mlp": 1.02098536, "epoch": 0.460784608447317, "flos": 15086109392640.0, "grad_norm": 2.0015713101361565, "language_loss": 0.73791528, "learning_rate": 2.3492991320251653e-06, "loss": 0.75942254, "num_input_tokens_seen": 164456830, "step": 7664, "time_per_iteration": 2.673335075378418 }, { "auxiliary_loss_clip": 0.01096445, "auxiliary_loss_mlp": 0.01036863, "balance_loss_clip": 1.04571927, "balance_loss_mlp": 1.02313614, "epoch": 0.46084473169998497, "flos": 18588261438720.0, "grad_norm": 1.5274295482700302, "language_loss": 0.7257731, "learning_rate": 2.3489156494938753e-06, "loss": 0.74710619, "num_input_tokens_seen": 164475375, "step": 7665, "time_per_iteration": 2.7057924270629883 }, { "auxiliary_loss_clip": 0.01104187, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.04968786, "balance_loss_mlp": 1.02148521, "epoch": 0.46090485495265293, "flos": 19494789909120.0, "grad_norm": 1.7665019302136358, "language_loss": 0.78369665, "learning_rate": 2.348532153731669e-06, "loss": 0.80508822, "num_input_tokens_seen": 164492040, "step": 7666, "time_per_iteration": 2.6954169273376465 }, { "auxiliary_loss_clip": 0.0108371, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.04061627, "balance_loss_mlp": 1.01935792, "epoch": 0.4609649782053209, "flos": 33364927163520.0, "grad_norm": 1.7291426769142197, "language_loss": 0.74374932, "learning_rate": 2.348148644753088e-06, "loss": 0.76493704, "num_input_tokens_seen": 164513665, "step": 7667, "time_per_iteration": 2.781087636947632 }, { "auxiliary_loss_clip": 0.01083108, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.04470205, "balance_loss_mlp": 1.02440965, "epoch": 0.46102510145798886, "flos": 23769165473280.0, "grad_norm": 1.4213815945133983, "language_loss": 0.75993818, "learning_rate": 2.347765122572676e-06, "loss": 0.78114939, "num_input_tokens_seen": 164533890, "step": 7668, "time_per_iteration": 2.8653104305267334 }, { "auxiliary_loss_clip": 0.010726, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.04025698, "balance_loss_mlp": 1.02047563, "epoch": 0.4610852247106568, "flos": 23294821443840.0, "grad_norm": 1.7696248586775516, "language_loss": 0.78228277, "learning_rate": 2.347381587204975e-06, "loss": 0.80334735, "num_input_tokens_seen": 164553815, "step": 7669, "time_per_iteration": 2.783662796020508 }, { "auxiliary_loss_clip": 0.01110483, "auxiliary_loss_mlp": 0.01038047, "balance_loss_clip": 1.04095972, "balance_loss_mlp": 1.02259183, "epoch": 0.4611453479633248, "flos": 25447450584960.0, "grad_norm": 1.7322551840105593, "language_loss": 0.82352221, "learning_rate": 2.34699803866453e-06, "loss": 0.84500754, "num_input_tokens_seen": 164573125, "step": 7670, "time_per_iteration": 2.6722826957702637 }, { "auxiliary_loss_clip": 0.01118191, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.04624724, "balance_loss_mlp": 1.02086234, "epoch": 0.4612054712159928, "flos": 21139606523520.0, "grad_norm": 1.6399167633004121, "language_loss": 0.63361788, "learning_rate": 2.3466144769658845e-06, "loss": 0.6551491, "num_input_tokens_seen": 164592575, "step": 7671, "time_per_iteration": 2.6507785320281982 }, { "auxiliary_loss_clip": 0.01038838, "auxiliary_loss_mlp": 0.01005964, "balance_loss_clip": 1.02976012, "balance_loss_mlp": 1.0044564, "epoch": 0.4612655944686608, "flos": 69959266404480.0, "grad_norm": 0.6926647500019024, "language_loss": 0.55842638, "learning_rate": 2.346230902123583e-06, "loss": 0.57887447, "num_input_tokens_seen": 164659795, "step": 7672, "time_per_iteration": 3.330268144607544 }, { "auxiliary_loss_clip": 0.01119098, "auxiliary_loss_mlp": 0.01040288, "balance_loss_clip": 1.04617, "balance_loss_mlp": 1.02645397, "epoch": 0.46132571772132874, "flos": 16837149502080.0, "grad_norm": 1.8809200572873195, "language_loss": 0.70954943, "learning_rate": 2.3458473141521715e-06, "loss": 0.7311433, "num_input_tokens_seen": 164678735, "step": 7673, "time_per_iteration": 2.65659499168396 }, { "auxiliary_loss_clip": 0.01103001, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.04363799, "balance_loss_mlp": 1.01938248, "epoch": 0.4613858409739967, "flos": 35808935431680.0, "grad_norm": 1.9110713796675685, "language_loss": 0.70837104, "learning_rate": 2.345463713066195e-06, "loss": 0.72973394, "num_input_tokens_seen": 164700885, "step": 7674, "time_per_iteration": 2.8332366943359375 }, { "auxiliary_loss_clip": 0.01103023, "auxiliary_loss_mlp": 0.0104104, "balance_loss_clip": 1.04143381, "balance_loss_mlp": 1.02709818, "epoch": 0.4614459642266647, "flos": 35266756567680.0, "grad_norm": 1.6933433527162, "language_loss": 0.65489, "learning_rate": 2.3450800988801996e-06, "loss": 0.67633063, "num_input_tokens_seen": 164726960, "step": 7675, "time_per_iteration": 2.8454952239990234 }, { "auxiliary_loss_clip": 0.01047065, "auxiliary_loss_mlp": 0.01003099, "balance_loss_clip": 1.02009785, "balance_loss_mlp": 1.00131118, "epoch": 0.46150608747933264, "flos": 66704610044160.0, "grad_norm": 0.8598142136337862, "language_loss": 0.58659744, "learning_rate": 2.3446964716087327e-06, "loss": 0.60709906, "num_input_tokens_seen": 164788525, "step": 7676, "time_per_iteration": 3.1523091793060303 }, { "auxiliary_loss_clip": 0.0101473, "auxiliary_loss_mlp": 0.01002448, "balance_loss_clip": 1.01614749, "balance_loss_mlp": 1.00077868, "epoch": 0.4615662107320006, "flos": 55830177025920.0, "grad_norm": 0.7931279707742926, "language_loss": 0.62803817, "learning_rate": 2.344312831266341e-06, "loss": 0.64820993, "num_input_tokens_seen": 164843525, "step": 7677, "time_per_iteration": 3.1055288314819336 }, { "auxiliary_loss_clip": 0.01103004, "auxiliary_loss_mlp": 0.01036602, "balance_loss_clip": 1.04363084, "balance_loss_mlp": 1.02309012, "epoch": 0.46162633398466857, "flos": 15483245137920.0, "grad_norm": 2.4819209870900636, "language_loss": 0.76371491, "learning_rate": 2.3439291778675718e-06, "loss": 0.78511101, "num_input_tokens_seen": 164859895, "step": 7678, "time_per_iteration": 2.6796817779541016 }, { "auxiliary_loss_clip": 0.01131922, "auxiliary_loss_mlp": 0.01035943, "balance_loss_clip": 1.04888463, "balance_loss_mlp": 1.02157795, "epoch": 0.46168645723733653, "flos": 20011437181440.0, "grad_norm": 2.4568506909255974, "language_loss": 0.66881382, "learning_rate": 2.343545511426974e-06, "loss": 0.69049251, "num_input_tokens_seen": 164878030, "step": 7679, "time_per_iteration": 2.669527053833008 }, { "auxiliary_loss_clip": 0.01095986, "auxiliary_loss_mlp": 0.01037988, "balance_loss_clip": 1.04533219, "balance_loss_mlp": 1.02469063, "epoch": 0.4617465804900045, "flos": 20298542590080.0, "grad_norm": 2.335341416202827, "language_loss": 0.70432782, "learning_rate": 2.3431618319590963e-06, "loss": 0.7256676, "num_input_tokens_seen": 164895710, "step": 7680, "time_per_iteration": 2.7286808490753174 }, { "auxiliary_loss_clip": 0.01137583, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.05160725, "balance_loss_mlp": 1.02904963, "epoch": 0.46180670374267246, "flos": 22346312952960.0, "grad_norm": 1.9037139750308347, "language_loss": 0.63464803, "learning_rate": 2.342778139478487e-06, "loss": 0.65646052, "num_input_tokens_seen": 164913365, "step": 7681, "time_per_iteration": 2.6214568614959717 }, { "auxiliary_loss_clip": 0.01116453, "auxiliary_loss_mlp": 0.01029466, "balance_loss_clip": 1.04633749, "balance_loss_mlp": 1.01636481, "epoch": 0.46186682699534043, "flos": 19895696582400.0, "grad_norm": 1.5164971745129476, "language_loss": 0.67357612, "learning_rate": 2.342394433999697e-06, "loss": 0.69503522, "num_input_tokens_seen": 164931620, "step": 7682, "time_per_iteration": 2.647353410720825 }, { "auxiliary_loss_clip": 0.01088835, "auxiliary_loss_mlp": 0.01041013, "balance_loss_clip": 1.04340196, "balance_loss_mlp": 1.02619505, "epoch": 0.4619269502480084, "flos": 31503569408640.0, "grad_norm": 2.227871519060849, "language_loss": 0.73820949, "learning_rate": 2.342010715537275e-06, "loss": 0.75950789, "num_input_tokens_seen": 164950905, "step": 7683, "time_per_iteration": 2.7580692768096924 }, { "auxiliary_loss_clip": 0.01128951, "auxiliary_loss_mlp": 0.01039533, "balance_loss_clip": 1.04759753, "balance_loss_mlp": 1.02627087, "epoch": 0.46198707350067636, "flos": 25009484054400.0, "grad_norm": 1.7711337337418462, "language_loss": 0.76479292, "learning_rate": 2.3416269841057726e-06, "loss": 0.7864778, "num_input_tokens_seen": 164970950, "step": 7684, "time_per_iteration": 2.6827478408813477 }, { "auxiliary_loss_clip": 0.01136661, "auxiliary_loss_mlp": 0.01044253, "balance_loss_clip": 1.0495609, "balance_loss_mlp": 1.02969098, "epoch": 0.4620471967533444, "flos": 18292357198080.0, "grad_norm": 1.8114594945271643, "language_loss": 0.79657519, "learning_rate": 2.3412432397197412e-06, "loss": 0.81838435, "num_input_tokens_seen": 164989855, "step": 7685, "time_per_iteration": 2.6539084911346436 }, { "auxiliary_loss_clip": 0.01085193, "auxiliary_loss_mlp": 0.01046975, "balance_loss_clip": 1.04328656, "balance_loss_mlp": 1.03158486, "epoch": 0.46210732000601235, "flos": 33985104410880.0, "grad_norm": 2.276305365525513, "language_loss": 0.66791403, "learning_rate": 2.340859482393731e-06, "loss": 0.68923569, "num_input_tokens_seen": 165012290, "step": 7686, "time_per_iteration": 2.8229949474334717 }, { "auxiliary_loss_clip": 0.01106797, "auxiliary_loss_mlp": 0.00772257, "balance_loss_clip": 1.04507184, "balance_loss_mlp": 1.00066257, "epoch": 0.4621674432586803, "flos": 25009412227200.0, "grad_norm": 2.1846142929829693, "language_loss": 0.73938292, "learning_rate": 2.340475712142296e-06, "loss": 0.75817347, "num_input_tokens_seen": 165030810, "step": 7687, "time_per_iteration": 2.8577284812927246 }, { "auxiliary_loss_clip": 0.01066455, "auxiliary_loss_mlp": 0.01038717, "balance_loss_clip": 1.0470593, "balance_loss_mlp": 1.02399492, "epoch": 0.4622275665113483, "flos": 22014031213440.0, "grad_norm": 2.1409043019128253, "language_loss": 0.74955392, "learning_rate": 2.3400919289799873e-06, "loss": 0.77060568, "num_input_tokens_seen": 165050205, "step": 7688, "time_per_iteration": 2.8981478214263916 }, { "auxiliary_loss_clip": 0.01076735, "auxiliary_loss_mlp": 0.00771909, "balance_loss_clip": 1.03838563, "balance_loss_mlp": 1.0005393, "epoch": 0.46228768976401624, "flos": 24058820747520.0, "grad_norm": 1.6416992765701228, "language_loss": 0.78753114, "learning_rate": 2.3397081329213585e-06, "loss": 0.80601752, "num_input_tokens_seen": 165069370, "step": 7689, "time_per_iteration": 2.8450090885162354 }, { "auxiliary_loss_clip": 0.01117226, "auxiliary_loss_mlp": 0.01039789, "balance_loss_clip": 1.04319644, "balance_loss_mlp": 1.02512646, "epoch": 0.4623478130166842, "flos": 26651391667200.0, "grad_norm": 2.047300589730092, "language_loss": 0.56996405, "learning_rate": 2.339324323980964e-06, "loss": 0.5915342, "num_input_tokens_seen": 165089610, "step": 7690, "time_per_iteration": 2.6919097900390625 }, { "auxiliary_loss_clip": 0.0111777, "auxiliary_loss_mlp": 0.01042754, "balance_loss_clip": 1.04474783, "balance_loss_mlp": 1.02853799, "epoch": 0.46240793626935217, "flos": 20558428467840.0, "grad_norm": 2.950419828824325, "language_loss": 0.82586032, "learning_rate": 2.3389405021733562e-06, "loss": 0.84746557, "num_input_tokens_seen": 165109050, "step": 7691, "time_per_iteration": 2.695331573486328 }, { "auxiliary_loss_clip": 0.01108828, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.04660177, "balance_loss_mlp": 1.02088761, "epoch": 0.46246805952202014, "flos": 22456055980800.0, "grad_norm": 1.4872733065963748, "language_loss": 0.75199407, "learning_rate": 2.338556667513091e-06, "loss": 0.77342725, "num_input_tokens_seen": 165130130, "step": 7692, "time_per_iteration": 2.6822991371154785 }, { "auxiliary_loss_clip": 0.01097579, "auxiliary_loss_mlp": 0.01044516, "balance_loss_clip": 1.04742086, "balance_loss_mlp": 1.0297097, "epoch": 0.4625281827746881, "flos": 35041308854400.0, "grad_norm": 1.6276482481397991, "language_loss": 0.74345845, "learning_rate": 2.338172820014723e-06, "loss": 0.76487935, "num_input_tokens_seen": 165152685, "step": 7693, "time_per_iteration": 2.8581414222717285 }, { "auxiliary_loss_clip": 0.01087933, "auxiliary_loss_mlp": 0.01056162, "balance_loss_clip": 1.04530871, "balance_loss_mlp": 1.04086781, "epoch": 0.46258830602735607, "flos": 21068647205760.0, "grad_norm": 2.088066659615079, "language_loss": 0.85329688, "learning_rate": 2.337788959692808e-06, "loss": 0.8747378, "num_input_tokens_seen": 165173315, "step": 7694, "time_per_iteration": 2.730196237564087 }, { "auxiliary_loss_clip": 0.01111115, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.04707479, "balance_loss_mlp": 1.02936506, "epoch": 0.46264842928002403, "flos": 26177227205760.0, "grad_norm": 2.853578946778756, "language_loss": 0.79611814, "learning_rate": 2.337405086561902e-06, "loss": 0.81765783, "num_input_tokens_seen": 165192395, "step": 7695, "time_per_iteration": 2.7454562187194824 }, { "auxiliary_loss_clip": 0.01114811, "auxiliary_loss_mlp": 0.01037414, "balance_loss_clip": 1.04553604, "balance_loss_mlp": 1.02390218, "epoch": 0.462708552532692, "flos": 16764214936320.0, "grad_norm": 1.803891217274167, "language_loss": 0.72445035, "learning_rate": 2.3370212006365606e-06, "loss": 0.74597263, "num_input_tokens_seen": 165211355, "step": 7696, "time_per_iteration": 4.214217901229858 }, { "auxiliary_loss_clip": 0.01110882, "auxiliary_loss_mlp": 0.01046867, "balance_loss_clip": 1.04748213, "balance_loss_mlp": 1.03221607, "epoch": 0.46276867578535996, "flos": 15560453422080.0, "grad_norm": 1.5710514609338178, "language_loss": 0.69939005, "learning_rate": 2.3366373019313423e-06, "loss": 0.72096753, "num_input_tokens_seen": 165229380, "step": 7697, "time_per_iteration": 4.213683843612671 }, { "auxiliary_loss_clip": 0.01133171, "auxiliary_loss_mlp": 0.01036334, "balance_loss_clip": 1.05145979, "balance_loss_mlp": 1.02264249, "epoch": 0.462828799038028, "flos": 22415404763520.0, "grad_norm": 1.9243080556164578, "language_loss": 0.84559363, "learning_rate": 2.3362533904608025e-06, "loss": 0.86728865, "num_input_tokens_seen": 165247200, "step": 7698, "time_per_iteration": 2.6434006690979004 }, { "auxiliary_loss_clip": 0.01130166, "auxiliary_loss_mlp": 0.01037324, "balance_loss_clip": 1.04838073, "balance_loss_mlp": 1.02357352, "epoch": 0.46288892229069595, "flos": 21069580959360.0, "grad_norm": 8.31912219741259, "language_loss": 0.71345413, "learning_rate": 2.335869466239502e-06, "loss": 0.73512906, "num_input_tokens_seen": 165265825, "step": 7699, "time_per_iteration": 4.157729387283325 }, { "auxiliary_loss_clip": 0.01073609, "auxiliary_loss_mlp": 0.01040377, "balance_loss_clip": 1.04345739, "balance_loss_mlp": 1.02550519, "epoch": 0.4629490455433639, "flos": 23185688947200.0, "grad_norm": 1.732328117704307, "language_loss": 0.71911675, "learning_rate": 2.335485529281996e-06, "loss": 0.74025667, "num_input_tokens_seen": 165284380, "step": 7700, "time_per_iteration": 2.8432295322418213 }, { "auxiliary_loss_clip": 0.01128125, "auxiliary_loss_mlp": 0.00771852, "balance_loss_clip": 1.04640698, "balance_loss_mlp": 1.00047588, "epoch": 0.4630091687960319, "flos": 18835541642880.0, "grad_norm": 2.4184025660528863, "language_loss": 0.73149109, "learning_rate": 2.3351015796028467e-06, "loss": 0.7504909, "num_input_tokens_seen": 165300320, "step": 7701, "time_per_iteration": 4.2371203899383545 }, { "auxiliary_loss_clip": 0.01087014, "auxiliary_loss_mlp": 0.01044166, "balance_loss_clip": 1.04401398, "balance_loss_mlp": 1.02921128, "epoch": 0.46306929204869984, "flos": 38907020407680.0, "grad_norm": 2.4372676297457216, "language_loss": 0.65005761, "learning_rate": 2.3347176172166114e-06, "loss": 0.67136943, "num_input_tokens_seen": 165318130, "step": 7702, "time_per_iteration": 2.875633716583252 }, { "auxiliary_loss_clip": 0.01103467, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.04441071, "balance_loss_mlp": 1.01875424, "epoch": 0.4631294153013678, "flos": 19644178573440.0, "grad_norm": 1.9024039666922008, "language_loss": 0.73310453, "learning_rate": 2.33433364213785e-06, "loss": 0.75446641, "num_input_tokens_seen": 165336225, "step": 7703, "time_per_iteration": 2.7307324409484863 }, { "auxiliary_loss_clip": 0.01109216, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.04673266, "balance_loss_mlp": 1.02145839, "epoch": 0.4631895385540358, "flos": 24608254158720.0, "grad_norm": 1.9428423147374236, "language_loss": 0.68751299, "learning_rate": 2.3339496543811243e-06, "loss": 0.70897353, "num_input_tokens_seen": 165355005, "step": 7704, "time_per_iteration": 2.7113852500915527 }, { "auxiliary_loss_clip": 0.01120314, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.04720986, "balance_loss_mlp": 1.01935196, "epoch": 0.46324966180670374, "flos": 26320115508480.0, "grad_norm": 2.3420396256779443, "language_loss": 0.81331742, "learning_rate": 2.3335656539609934e-06, "loss": 0.83486044, "num_input_tokens_seen": 165374910, "step": 7705, "time_per_iteration": 2.804708480834961 }, { "auxiliary_loss_clip": 0.01119161, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.04762256, "balance_loss_mlp": 1.02172124, "epoch": 0.4633097850593717, "flos": 19240506552960.0, "grad_norm": 1.6909152504462979, "language_loss": 0.77714217, "learning_rate": 2.3331816408920196e-06, "loss": 0.79869187, "num_input_tokens_seen": 165392590, "step": 7706, "time_per_iteration": 2.67990779876709 }, { "auxiliary_loss_clip": 0.01102016, "auxiliary_loss_mlp": 0.01033802, "balance_loss_clip": 1.04767776, "balance_loss_mlp": 1.02023578, "epoch": 0.46336990831203967, "flos": 22783166161920.0, "grad_norm": 2.039386256395222, "language_loss": 0.699494, "learning_rate": 2.3327976151887654e-06, "loss": 0.7208522, "num_input_tokens_seen": 165411195, "step": 7707, "time_per_iteration": 2.7109720706939697 }, { "auxiliary_loss_clip": 0.01111011, "auxiliary_loss_mlp": 0.01038647, "balance_loss_clip": 1.04469609, "balance_loss_mlp": 1.02306628, "epoch": 0.46343003156470763, "flos": 38210604543360.0, "grad_norm": 1.931472234163978, "language_loss": 0.61287057, "learning_rate": 2.332413576865791e-06, "loss": 0.63436711, "num_input_tokens_seen": 165430150, "step": 7708, "time_per_iteration": 2.8489346504211426 }, { "auxiliary_loss_clip": 0.01089075, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.04273093, "balance_loss_mlp": 1.01930773, "epoch": 0.4634901548173756, "flos": 31938555110400.0, "grad_norm": 2.4081522593734332, "language_loss": 0.77443427, "learning_rate": 2.3320295259376614e-06, "loss": 0.79565972, "num_input_tokens_seen": 165450595, "step": 7709, "time_per_iteration": 2.720604419708252 }, { "auxiliary_loss_clip": 0.01134634, "auxiliary_loss_mlp": 0.0103959, "balance_loss_clip": 1.04938257, "balance_loss_mlp": 1.02433753, "epoch": 0.46355027807004356, "flos": 20082540153600.0, "grad_norm": 1.78810829524809, "language_loss": 0.77216917, "learning_rate": 2.3316454624189385e-06, "loss": 0.79391134, "num_input_tokens_seen": 165469515, "step": 7710, "time_per_iteration": 2.5303022861480713 }, { "auxiliary_loss_clip": 0.01122514, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.04637122, "balance_loss_mlp": 1.02172804, "epoch": 0.4636104013227116, "flos": 24061370613120.0, "grad_norm": 2.2400017320201187, "language_loss": 0.73509276, "learning_rate": 2.3312613863241865e-06, "loss": 0.75669408, "num_input_tokens_seen": 165488125, "step": 7711, "time_per_iteration": 2.5654797554016113 }, { "auxiliary_loss_clip": 0.0110546, "auxiliary_loss_mlp": 0.01046309, "balance_loss_clip": 1.04776788, "balance_loss_mlp": 1.03109789, "epoch": 0.46367052457537955, "flos": 23914639555200.0, "grad_norm": 1.4625168937424313, "language_loss": 0.71734262, "learning_rate": 2.33087729766797e-06, "loss": 0.73886031, "num_input_tokens_seen": 165509225, "step": 7712, "time_per_iteration": 2.6021108627319336 }, { "auxiliary_loss_clip": 0.01109448, "auxiliary_loss_mlp": 0.01039959, "balance_loss_clip": 1.04681897, "balance_loss_mlp": 1.02359128, "epoch": 0.4637306478280475, "flos": 26396533693440.0, "grad_norm": 10.680731903132253, "language_loss": 0.73100054, "learning_rate": 2.3304931964648524e-06, "loss": 0.75249463, "num_input_tokens_seen": 165529945, "step": 7713, "time_per_iteration": 2.7074029445648193 }, { "auxiliary_loss_clip": 0.01098034, "auxiliary_loss_mlp": 0.01037925, "balance_loss_clip": 1.0441041, "balance_loss_mlp": 1.02191556, "epoch": 0.4637907710807155, "flos": 21980706370560.0, "grad_norm": 1.6982870192648571, "language_loss": 0.5889293, "learning_rate": 2.3301090827294e-06, "loss": 0.61028892, "num_input_tokens_seen": 165550690, "step": 7714, "time_per_iteration": 2.710048198699951 }, { "auxiliary_loss_clip": 0.01120282, "auxiliary_loss_mlp": 0.01034073, "balance_loss_clip": 1.04763293, "balance_loss_mlp": 1.01950562, "epoch": 0.46385089433338345, "flos": 12422291846400.0, "grad_norm": 1.91274815186046, "language_loss": 0.70204347, "learning_rate": 2.3297249564761784e-06, "loss": 0.72358704, "num_input_tokens_seen": 165567775, "step": 7715, "time_per_iteration": 2.6403465270996094 }, { "auxiliary_loss_clip": 0.01138235, "auxiliary_loss_mlp": 0.01041941, "balance_loss_clip": 1.04938495, "balance_loss_mlp": 1.02725387, "epoch": 0.4639110175860514, "flos": 23915752876800.0, "grad_norm": 2.6000471859571777, "language_loss": 0.68646967, "learning_rate": 2.3293408177197527e-06, "loss": 0.7082715, "num_input_tokens_seen": 165587010, "step": 7716, "time_per_iteration": 2.6233439445495605 }, { "auxiliary_loss_clip": 0.01132713, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 1.0472188, "balance_loss_mlp": 1.01599193, "epoch": 0.4639711408387194, "flos": 25300396304640.0, "grad_norm": 1.7614766285874086, "language_loss": 0.809901, "learning_rate": 2.328956666474691e-06, "loss": 0.83153987, "num_input_tokens_seen": 165607850, "step": 7717, "time_per_iteration": 2.6267318725585938 }, { "auxiliary_loss_clip": 0.01131786, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.0477078, "balance_loss_mlp": 1.02206373, "epoch": 0.46403126409138734, "flos": 21211822817280.0, "grad_norm": 1.7513215449973674, "language_loss": 0.73192513, "learning_rate": 2.3285725027555593e-06, "loss": 0.75361037, "num_input_tokens_seen": 165627175, "step": 7718, "time_per_iteration": 2.5936009883880615 }, { "auxiliary_loss_clip": 0.01129362, "auxiliary_loss_mlp": 0.00772229, "balance_loss_clip": 1.04671347, "balance_loss_mlp": 1.00063276, "epoch": 0.4640913873440553, "flos": 35845564325760.0, "grad_norm": 1.6991265809872926, "language_loss": 0.70156294, "learning_rate": 2.3281883265769254e-06, "loss": 0.72057891, "num_input_tokens_seen": 165648340, "step": 7719, "time_per_iteration": 2.7047362327575684 }, { "auxiliary_loss_clip": 0.01112084, "auxiliary_loss_mlp": 0.01036441, "balance_loss_clip": 1.05082273, "balance_loss_mlp": 1.02101541, "epoch": 0.46415151059672327, "flos": 19166207270400.0, "grad_norm": 2.142564905802957, "language_loss": 0.86823177, "learning_rate": 2.327804137953357e-06, "loss": 0.88971704, "num_input_tokens_seen": 165667195, "step": 7720, "time_per_iteration": 2.7309963703155518 }, { "auxiliary_loss_clip": 0.01032352, "auxiliary_loss_mlp": 0.01008212, "balance_loss_clip": 1.02414155, "balance_loss_mlp": 1.00647151, "epoch": 0.46421163384939124, "flos": 58912750304640.0, "grad_norm": 0.7188509278747012, "language_loss": 0.55039424, "learning_rate": 2.3274199368994226e-06, "loss": 0.57079989, "num_input_tokens_seen": 165726760, "step": 7721, "time_per_iteration": 3.236877679824829 }, { "auxiliary_loss_clip": 0.01107525, "auxiliary_loss_mlp": 0.01036882, "balance_loss_clip": 1.04643178, "balance_loss_mlp": 1.02240443, "epoch": 0.4642717571020592, "flos": 20157342226560.0, "grad_norm": 2.140310045449241, "language_loss": 0.79792923, "learning_rate": 2.3270357234296918e-06, "loss": 0.81937331, "num_input_tokens_seen": 165745005, "step": 7722, "time_per_iteration": 2.660754919052124 }, { "auxiliary_loss_clip": 0.01135285, "auxiliary_loss_mlp": 0.01039973, "balance_loss_clip": 1.04771972, "balance_loss_mlp": 1.02478552, "epoch": 0.46433188035472717, "flos": 25046184775680.0, "grad_norm": 1.8420199747356898, "language_loss": 0.77947485, "learning_rate": 2.3266514975587332e-06, "loss": 0.80122739, "num_input_tokens_seen": 165765750, "step": 7723, "time_per_iteration": 2.650667667388916 }, { "auxiliary_loss_clip": 0.010296, "auxiliary_loss_mlp": 0.01034411, "balance_loss_clip": 1.03560913, "balance_loss_mlp": 1.01945066, "epoch": 0.4643920036073952, "flos": 28075644817920.0, "grad_norm": 1.6775959652720056, "language_loss": 0.68506896, "learning_rate": 2.326267259301118e-06, "loss": 0.7057091, "num_input_tokens_seen": 165787515, "step": 7724, "time_per_iteration": 3.0586209297180176 }, { "auxiliary_loss_clip": 0.01115779, "auxiliary_loss_mlp": 0.01034262, "balance_loss_clip": 1.04832113, "balance_loss_mlp": 1.0193367, "epoch": 0.46445212686006315, "flos": 18369350000640.0, "grad_norm": 3.606583728635542, "language_loss": 0.67163348, "learning_rate": 2.325883008671415e-06, "loss": 0.69313383, "num_input_tokens_seen": 165806675, "step": 7725, "time_per_iteration": 2.9137332439422607 }, { "auxiliary_loss_clip": 0.01113984, "auxiliary_loss_mlp": 0.01038381, "balance_loss_clip": 1.04604602, "balance_loss_mlp": 1.02554178, "epoch": 0.4645122501127311, "flos": 31721618920320.0, "grad_norm": 1.751091551827286, "language_loss": 0.65037453, "learning_rate": 2.3254987456841955e-06, "loss": 0.67189825, "num_input_tokens_seen": 165829835, "step": 7726, "time_per_iteration": 2.7184534072875977 }, { "auxiliary_loss_clip": 0.0110497, "auxiliary_loss_mlp": 0.00772968, "balance_loss_clip": 1.04436016, "balance_loss_mlp": 1.00061822, "epoch": 0.4645723733653991, "flos": 23768806337280.0, "grad_norm": 1.6559858063545494, "language_loss": 0.74796247, "learning_rate": 2.3251144703540307e-06, "loss": 0.76674187, "num_input_tokens_seen": 165849380, "step": 7727, "time_per_iteration": 2.7193634510040283 }, { "auxiliary_loss_clip": 0.01107461, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.0458529, "balance_loss_mlp": 1.02506471, "epoch": 0.46463249661806705, "flos": 33145512935040.0, "grad_norm": 2.1928121253358293, "language_loss": 0.78549933, "learning_rate": 2.3247301826954936e-06, "loss": 0.80697882, "num_input_tokens_seen": 165868620, "step": 7728, "time_per_iteration": 2.744900703430176 }, { "auxiliary_loss_clip": 0.01092904, "auxiliary_loss_mlp": 0.01038861, "balance_loss_clip": 1.0414784, "balance_loss_mlp": 1.02373958, "epoch": 0.464692619870735, "flos": 18296020385280.0, "grad_norm": 2.0549050897499135, "language_loss": 0.75892472, "learning_rate": 2.324345882723155e-06, "loss": 0.78024244, "num_input_tokens_seen": 165885915, "step": 7729, "time_per_iteration": 2.7145724296569824 }, { "auxiliary_loss_clip": 0.01108829, "auxiliary_loss_mlp": 0.01047351, "balance_loss_clip": 1.0485568, "balance_loss_mlp": 1.03153229, "epoch": 0.464752743123403, "flos": 22638051216000.0, "grad_norm": 1.8824527818993837, "language_loss": 0.79760742, "learning_rate": 2.323961570451588e-06, "loss": 0.81916922, "num_input_tokens_seen": 165905465, "step": 7730, "time_per_iteration": 2.7782390117645264 }, { "auxiliary_loss_clip": 0.01130146, "auxiliary_loss_mlp": 0.01037223, "balance_loss_clip": 1.04756629, "balance_loss_mlp": 1.02265573, "epoch": 0.46481286637607094, "flos": 20412128373120.0, "grad_norm": 1.6262082138117517, "language_loss": 0.77182668, "learning_rate": 2.3235772458953655e-06, "loss": 0.79350036, "num_input_tokens_seen": 165924640, "step": 7731, "time_per_iteration": 2.617314577102661 }, { "auxiliary_loss_clip": 0.01090917, "auxiliary_loss_mlp": 0.01035098, "balance_loss_clip": 1.04506755, "balance_loss_mlp": 1.02119207, "epoch": 0.4648729896287389, "flos": 34275406129920.0, "grad_norm": 1.6446435516271722, "language_loss": 0.65999961, "learning_rate": 2.323192909069061e-06, "loss": 0.68125969, "num_input_tokens_seen": 165945765, "step": 7732, "time_per_iteration": 2.806825876235962 }, { "auxiliary_loss_clip": 0.01109545, "auxiliary_loss_mlp": 0.0104247, "balance_loss_clip": 1.04427695, "balance_loss_mlp": 1.02551866, "epoch": 0.4649331128814069, "flos": 21321781326720.0, "grad_norm": 2.341941786180864, "language_loss": 0.72770941, "learning_rate": 2.32280855998725e-06, "loss": 0.74922955, "num_input_tokens_seen": 165964025, "step": 7733, "time_per_iteration": 2.6884191036224365 }, { "auxiliary_loss_clip": 0.01046209, "auxiliary_loss_mlp": 0.01002418, "balance_loss_clip": 1.01885557, "balance_loss_mlp": 1.00089204, "epoch": 0.46499323613407484, "flos": 58308515717760.0, "grad_norm": 1.2786299900123337, "language_loss": 0.51944834, "learning_rate": 2.3224241986645057e-06, "loss": 0.53993464, "num_input_tokens_seen": 166021950, "step": 7734, "time_per_iteration": 3.0932440757751465 }, { "auxiliary_loss_clip": 0.01111419, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.05044913, "balance_loss_mlp": 1.01990235, "epoch": 0.4650533593867428, "flos": 10889660384640.0, "grad_norm": 2.1631100357564788, "language_loss": 0.75439203, "learning_rate": 2.3220398251154035e-06, "loss": 0.77584982, "num_input_tokens_seen": 166039675, "step": 7735, "time_per_iteration": 4.546087265014648 }, { "auxiliary_loss_clip": 0.01087553, "auxiliary_loss_mlp": 0.01045865, "balance_loss_clip": 1.04543328, "balance_loss_mlp": 1.0305233, "epoch": 0.46511348263941077, "flos": 19974592805760.0, "grad_norm": 2.3653554564968435, "language_loss": 0.69901764, "learning_rate": 2.321655439354519e-06, "loss": 0.72035182, "num_input_tokens_seen": 166057745, "step": 7736, "time_per_iteration": 4.302860498428345 }, { "auxiliary_loss_clip": 0.01128458, "auxiliary_loss_mlp": 0.01036991, "balance_loss_clip": 1.0473057, "balance_loss_mlp": 1.0228653, "epoch": 0.46517360589207873, "flos": 19678401256320.0, "grad_norm": 1.6411657567334208, "language_loss": 0.71995008, "learning_rate": 2.321271041396427e-06, "loss": 0.74160457, "num_input_tokens_seen": 166076440, "step": 7737, "time_per_iteration": 2.566603183746338 }, { "auxiliary_loss_clip": 0.01111802, "auxiliary_loss_mlp": 0.01040407, "balance_loss_clip": 1.05224276, "balance_loss_mlp": 1.02456391, "epoch": 0.46523372914474675, "flos": 16872665074560.0, "grad_norm": 2.50928704064022, "language_loss": 0.83606738, "learning_rate": 2.3208866312557065e-06, "loss": 0.85758948, "num_input_tokens_seen": 166092520, "step": 7738, "time_per_iteration": 2.602149486541748 }, { "auxiliary_loss_clip": 0.0103645, "auxiliary_loss_mlp": 0.01000487, "balance_loss_clip": 1.01920033, "balance_loss_mlp": 0.99899715, "epoch": 0.4652938523974147, "flos": 53439138339840.0, "grad_norm": 0.7761784242108043, "language_loss": 0.57855058, "learning_rate": 2.320502208946932e-06, "loss": 0.59891999, "num_input_tokens_seen": 166156285, "step": 7739, "time_per_iteration": 4.744653940200806 }, { "auxiliary_loss_clip": 0.01111735, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.04867125, "balance_loss_mlp": 1.02728581, "epoch": 0.4653539756500827, "flos": 15231296165760.0, "grad_norm": 1.7825482177936647, "language_loss": 0.85391408, "learning_rate": 2.3201177744846815e-06, "loss": 0.87545103, "num_input_tokens_seen": 166173455, "step": 7740, "time_per_iteration": 4.26358962059021 }, { "auxiliary_loss_clip": 0.01103788, "auxiliary_loss_mlp": 0.01043392, "balance_loss_clip": 1.04354095, "balance_loss_mlp": 1.02769184, "epoch": 0.46541409890275065, "flos": 23732249270400.0, "grad_norm": 1.728452967927443, "language_loss": 0.75540549, "learning_rate": 2.3197333278835327e-06, "loss": 0.77687728, "num_input_tokens_seen": 166194370, "step": 7741, "time_per_iteration": 2.7189860343933105 }, { "auxiliary_loss_clip": 0.01102378, "auxiliary_loss_mlp": 0.0103993, "balance_loss_clip": 1.04642224, "balance_loss_mlp": 1.02583992, "epoch": 0.4654742221554186, "flos": 20847329556480.0, "grad_norm": 1.6912495786690362, "language_loss": 0.80807334, "learning_rate": 2.319348869158064e-06, "loss": 0.82949644, "num_input_tokens_seen": 166213195, "step": 7742, "time_per_iteration": 2.7285542488098145 }, { "auxiliary_loss_clip": 0.01109172, "auxiliary_loss_mlp": 0.01044204, "balance_loss_clip": 1.04378545, "balance_loss_mlp": 1.02846837, "epoch": 0.4655343454080866, "flos": 20704836303360.0, "grad_norm": 2.554211916953899, "language_loss": 0.7287879, "learning_rate": 2.3189643983228555e-06, "loss": 0.75032163, "num_input_tokens_seen": 166231350, "step": 7743, "time_per_iteration": 2.8064794540405273 }, { "auxiliary_loss_clip": 0.01097309, "auxiliary_loss_mlp": 0.01035628, "balance_loss_clip": 1.044186, "balance_loss_mlp": 1.01989281, "epoch": 0.46559446866075455, "flos": 18989850470400.0, "grad_norm": 1.9272268848768948, "language_loss": 0.71113133, "learning_rate": 2.318579915392483e-06, "loss": 0.73246074, "num_input_tokens_seen": 166250530, "step": 7744, "time_per_iteration": 2.7021846771240234 }, { "auxiliary_loss_clip": 0.01081647, "auxiliary_loss_mlp": 0.01033676, "balance_loss_clip": 1.04821372, "balance_loss_mlp": 1.01952028, "epoch": 0.4656545919134225, "flos": 34496364643200.0, "grad_norm": 1.5788774332625253, "language_loss": 0.84865856, "learning_rate": 2.31819542038153e-06, "loss": 0.86981177, "num_input_tokens_seen": 166272545, "step": 7745, "time_per_iteration": 2.8962950706481934 }, { "auxiliary_loss_clip": 0.01118243, "auxiliary_loss_mlp": 0.01044667, "balance_loss_clip": 1.04609525, "balance_loss_mlp": 1.02958083, "epoch": 0.4657147151660905, "flos": 24310554238080.0, "grad_norm": 1.3325532903447972, "language_loss": 0.72868127, "learning_rate": 2.317810913304574e-06, "loss": 0.75031042, "num_input_tokens_seen": 166292135, "step": 7746, "time_per_iteration": 2.654744863510132 }, { "auxiliary_loss_clip": 0.01115957, "auxiliary_loss_mlp": 0.01039896, "balance_loss_clip": 1.04620576, "balance_loss_mlp": 1.02557254, "epoch": 0.46577483841875844, "flos": 58795139220480.0, "grad_norm": 2.5149225133479667, "language_loss": 0.69942105, "learning_rate": 2.3174263941761963e-06, "loss": 0.72097951, "num_input_tokens_seen": 166316710, "step": 7747, "time_per_iteration": 2.946551561355591 }, { "auxiliary_loss_clip": 0.01087715, "auxiliary_loss_mlp": 0.01043482, "balance_loss_clip": 1.04082656, "balance_loss_mlp": 1.0269475, "epoch": 0.4658349616714264, "flos": 31321969223040.0, "grad_norm": 1.543824419854341, "language_loss": 0.67369974, "learning_rate": 2.317041863010978e-06, "loss": 0.69501168, "num_input_tokens_seen": 166338535, "step": 7748, "time_per_iteration": 2.7577450275421143 }, { "auxiliary_loss_clip": 0.01095867, "auxiliary_loss_mlp": 0.01040613, "balance_loss_clip": 1.04655099, "balance_loss_mlp": 1.0242455, "epoch": 0.46589508492409437, "flos": 14860338456960.0, "grad_norm": 2.2493825617355805, "language_loss": 0.6400212, "learning_rate": 2.3166573198235007e-06, "loss": 0.66138601, "num_input_tokens_seen": 166355540, "step": 7749, "time_per_iteration": 2.6768271923065186 }, { "auxiliary_loss_clip": 0.01124878, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 1.04833543, "balance_loss_mlp": 1.01912558, "epoch": 0.46595520817676234, "flos": 12895989431040.0, "grad_norm": 2.0851109379556414, "language_loss": 0.74756414, "learning_rate": 2.3162727646283456e-06, "loss": 0.76916647, "num_input_tokens_seen": 166372635, "step": 7750, "time_per_iteration": 2.6180553436279297 }, { "auxiliary_loss_clip": 0.01112353, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.04888475, "balance_loss_mlp": 1.01699984, "epoch": 0.46601533142943036, "flos": 32854169721600.0, "grad_norm": 2.1197385056246, "language_loss": 0.74433059, "learning_rate": 2.3158881974400963e-06, "loss": 0.76577765, "num_input_tokens_seen": 166393175, "step": 7751, "time_per_iteration": 2.7448816299438477 }, { "auxiliary_loss_clip": 0.01105983, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.049245, "balance_loss_mlp": 1.02301598, "epoch": 0.4660754546820983, "flos": 19967517826560.0, "grad_norm": 2.5234072122891176, "language_loss": 0.73595881, "learning_rate": 2.3155036182733345e-06, "loss": 0.75741076, "num_input_tokens_seen": 166408630, "step": 7752, "time_per_iteration": 2.6944475173950195 }, { "auxiliary_loss_clip": 0.01108633, "auxiliary_loss_mlp": 0.01040109, "balance_loss_clip": 1.04941273, "balance_loss_mlp": 1.02493417, "epoch": 0.4661355779347663, "flos": 26688164215680.0, "grad_norm": 2.044776600528041, "language_loss": 0.69086194, "learning_rate": 2.315119027142644e-06, "loss": 0.7123493, "num_input_tokens_seen": 166428170, "step": 7753, "time_per_iteration": 2.736854076385498 }, { "auxiliary_loss_clip": 0.01099142, "auxiliary_loss_mlp": 0.01040064, "balance_loss_clip": 1.04148221, "balance_loss_mlp": 1.02494824, "epoch": 0.46619570118743425, "flos": 20959442881920.0, "grad_norm": 2.155464287948458, "language_loss": 0.72724748, "learning_rate": 2.3147344240626076e-06, "loss": 0.74863952, "num_input_tokens_seen": 166446705, "step": 7754, "time_per_iteration": 2.6782143115997314 }, { "auxiliary_loss_clip": 0.01113403, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.04633951, "balance_loss_mlp": 1.01993394, "epoch": 0.4662558244401022, "flos": 24426079355520.0, "grad_norm": 1.424199388432646, "language_loss": 0.78797996, "learning_rate": 2.3143498090478114e-06, "loss": 0.80947065, "num_input_tokens_seen": 166466750, "step": 7755, "time_per_iteration": 2.8091399669647217 }, { "auxiliary_loss_clip": 0.01115387, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.04450297, "balance_loss_mlp": 1.01545656, "epoch": 0.4663159476927702, "flos": 20595452411520.0, "grad_norm": 1.631642654170447, "language_loss": 0.72453964, "learning_rate": 2.3139651821128382e-06, "loss": 0.74599707, "num_input_tokens_seen": 166485400, "step": 7756, "time_per_iteration": 2.7136480808258057 }, { "auxiliary_loss_clip": 0.01117973, "auxiliary_loss_mlp": 0.01036177, "balance_loss_clip": 1.04585207, "balance_loss_mlp": 1.02137136, "epoch": 0.46637607094543815, "flos": 25661872823040.0, "grad_norm": 2.024488409117557, "language_loss": 0.78578007, "learning_rate": 2.313580543272274e-06, "loss": 0.80732161, "num_input_tokens_seen": 166505730, "step": 7757, "time_per_iteration": 2.6828832626342773 }, { "auxiliary_loss_clip": 0.01090573, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.04173446, "balance_loss_mlp": 1.01717782, "epoch": 0.4664361941981061, "flos": 24273853516800.0, "grad_norm": 2.116616009232987, "language_loss": 0.6656999, "learning_rate": 2.313195892540705e-06, "loss": 0.68692255, "num_input_tokens_seen": 166523770, "step": 7758, "time_per_iteration": 2.7238266468048096 }, { "auxiliary_loss_clip": 0.01098442, "auxiliary_loss_mlp": 0.01044236, "balance_loss_clip": 1.04272914, "balance_loss_mlp": 1.02916837, "epoch": 0.4664963174507741, "flos": 18405871153920.0, "grad_norm": 1.6471741103867168, "language_loss": 0.74542332, "learning_rate": 2.3128112299327147e-06, "loss": 0.76685011, "num_input_tokens_seen": 166542935, "step": 7759, "time_per_iteration": 2.648406744003296 }, { "auxiliary_loss_clip": 0.01110559, "auxiliary_loss_mlp": 0.01047546, "balance_loss_clip": 1.04692769, "balance_loss_mlp": 1.0325253, "epoch": 0.46655644070344204, "flos": 22455122227200.0, "grad_norm": 1.575011375316493, "language_loss": 0.77734709, "learning_rate": 2.312426555462893e-06, "loss": 0.79892808, "num_input_tokens_seen": 166563935, "step": 7760, "time_per_iteration": 2.715393543243408 }, { "auxiliary_loss_clip": 0.01104604, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.04476929, "balance_loss_mlp": 1.01968408, "epoch": 0.46661656395611, "flos": 13808407731840.0, "grad_norm": 1.8509707336449404, "language_loss": 0.74408627, "learning_rate": 2.3120418691458237e-06, "loss": 0.76547837, "num_input_tokens_seen": 166582175, "step": 7761, "time_per_iteration": 2.679760217666626 }, { "auxiliary_loss_clip": 0.01118037, "auxiliary_loss_mlp": 0.01038779, "balance_loss_clip": 1.04605913, "balance_loss_mlp": 1.02199411, "epoch": 0.466676687208778, "flos": 21652159645440.0, "grad_norm": 1.9428650174374826, "language_loss": 0.78880894, "learning_rate": 2.3116571709960956e-06, "loss": 0.81037712, "num_input_tokens_seen": 166601870, "step": 7762, "time_per_iteration": 2.6236844062805176 }, { "auxiliary_loss_clip": 0.01032755, "auxiliary_loss_mlp": 0.01004567, "balance_loss_clip": 1.01497078, "balance_loss_mlp": 1.00300527, "epoch": 0.46673681046144594, "flos": 68534259068160.0, "grad_norm": 0.7915263755311791, "language_loss": 0.59707403, "learning_rate": 2.311272461028297e-06, "loss": 0.61744726, "num_input_tokens_seen": 166668960, "step": 7763, "time_per_iteration": 3.2309603691101074 }, { "auxiliary_loss_clip": 0.01092007, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.04239237, "balance_loss_mlp": 1.02181077, "epoch": 0.46679693371411396, "flos": 15814449469440.0, "grad_norm": 2.1149132662524766, "language_loss": 0.78707278, "learning_rate": 2.3108877392570146e-06, "loss": 0.80837297, "num_input_tokens_seen": 166686110, "step": 7764, "time_per_iteration": 2.667523145675659 }, { "auxiliary_loss_clip": 0.01102497, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.05066562, "balance_loss_mlp": 1.02470863, "epoch": 0.4668570569667819, "flos": 18514572687360.0, "grad_norm": 1.9076684434806583, "language_loss": 0.72103167, "learning_rate": 2.310503005696839e-06, "loss": 0.74243796, "num_input_tokens_seen": 166703930, "step": 7765, "time_per_iteration": 2.695037364959717 }, { "auxiliary_loss_clip": 0.0108654, "auxiliary_loss_mlp": 0.01041419, "balance_loss_clip": 1.04354358, "balance_loss_mlp": 1.02578509, "epoch": 0.4669171802194499, "flos": 19206643006080.0, "grad_norm": 3.5524770939500763, "language_loss": 0.77958077, "learning_rate": 2.3101182603623576e-06, "loss": 0.80086035, "num_input_tokens_seen": 166719940, "step": 7766, "time_per_iteration": 2.7083003520965576 }, { "auxiliary_loss_clip": 0.01111478, "auxiliary_loss_mlp": 0.01041119, "balance_loss_clip": 1.0413723, "balance_loss_mlp": 1.02596176, "epoch": 0.46697730347211786, "flos": 12276135406080.0, "grad_norm": 2.008926604773062, "language_loss": 0.64852947, "learning_rate": 2.3097335032681607e-06, "loss": 0.67005551, "num_input_tokens_seen": 166738285, "step": 7767, "time_per_iteration": 2.6344571113586426 }, { "auxiliary_loss_clip": 0.01120029, "auxiliary_loss_mlp": 0.0104422, "balance_loss_clip": 1.04623926, "balance_loss_mlp": 1.02955675, "epoch": 0.4670374267247858, "flos": 23586739274880.0, "grad_norm": 1.9514245068590486, "language_loss": 0.74225283, "learning_rate": 2.3093487344288393e-06, "loss": 0.76389533, "num_input_tokens_seen": 166758170, "step": 7768, "time_per_iteration": 2.7037155628204346 }, { "auxiliary_loss_clip": 0.01101883, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.04606605, "balance_loss_mlp": 1.02081776, "epoch": 0.4670975499774538, "flos": 15991093578240.0, "grad_norm": 1.8795722363955685, "language_loss": 0.70699239, "learning_rate": 2.308963953858982e-06, "loss": 0.72837055, "num_input_tokens_seen": 166775750, "step": 7769, "time_per_iteration": 2.6716794967651367 }, { "auxiliary_loss_clip": 0.0112823, "auxiliary_loss_mlp": 0.01035755, "balance_loss_clip": 1.04401624, "balance_loss_mlp": 1.02156949, "epoch": 0.46715767323012175, "flos": 15377596260480.0, "grad_norm": 2.0624542877059158, "language_loss": 0.81268704, "learning_rate": 2.3085791615731803e-06, "loss": 0.83432686, "num_input_tokens_seen": 166791720, "step": 7770, "time_per_iteration": 2.5958662033081055 }, { "auxiliary_loss_clip": 0.01043437, "auxiliary_loss_mlp": 0.01001838, "balance_loss_clip": 1.01635242, "balance_loss_mlp": 1.00027645, "epoch": 0.4672177964827897, "flos": 60252217401600.0, "grad_norm": 0.7961749107066677, "language_loss": 0.5562135, "learning_rate": 2.3081943575860265e-06, "loss": 0.57666636, "num_input_tokens_seen": 166856360, "step": 7771, "time_per_iteration": 3.1569736003875732 }, { "auxiliary_loss_clip": 0.01114939, "auxiliary_loss_mlp": 0.00771824, "balance_loss_clip": 1.04351723, "balance_loss_mlp": 1.00060511, "epoch": 0.4672779197354577, "flos": 27636134002560.0, "grad_norm": 1.896331384644372, "language_loss": 0.65528286, "learning_rate": 2.3078095419121117e-06, "loss": 0.67415047, "num_input_tokens_seen": 166875925, "step": 7772, "time_per_iteration": 2.7263035774230957 }, { "auxiliary_loss_clip": 0.01113556, "auxiliary_loss_mlp": 0.01034989, "balance_loss_clip": 1.04692101, "balance_loss_mlp": 1.02061212, "epoch": 0.46733804298812565, "flos": 31394257344000.0, "grad_norm": 2.0574903106475513, "language_loss": 0.63557553, "learning_rate": 2.3074247145660283e-06, "loss": 0.65706098, "num_input_tokens_seen": 166896520, "step": 7773, "time_per_iteration": 2.691378593444824 }, { "auxiliary_loss_clip": 0.01112174, "auxiliary_loss_mlp": 0.01040289, "balance_loss_clip": 1.04673469, "balance_loss_mlp": 1.02454185, "epoch": 0.4673981662407936, "flos": 19500607912320.0, "grad_norm": 1.9630472969764714, "language_loss": 0.80073929, "learning_rate": 2.3070398755623685e-06, "loss": 0.8222639, "num_input_tokens_seen": 166915370, "step": 7774, "time_per_iteration": 2.661416530609131 }, { "auxiliary_loss_clip": 0.01096265, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.04382384, "balance_loss_mlp": 1.01813269, "epoch": 0.4674582894934616, "flos": 20521835487360.0, "grad_norm": 1.5987951306887498, "language_loss": 0.77369159, "learning_rate": 2.306655024915726e-06, "loss": 0.79499024, "num_input_tokens_seen": 166934875, "step": 7775, "time_per_iteration": 4.281586647033691 }, { "auxiliary_loss_clip": 0.01096609, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.04498506, "balance_loss_mlp": 1.02137041, "epoch": 0.46751841274612954, "flos": 22090952188800.0, "grad_norm": 1.8524613051021832, "language_loss": 0.69526893, "learning_rate": 2.306270162640694e-06, "loss": 0.71659672, "num_input_tokens_seen": 166954285, "step": 7776, "time_per_iteration": 4.289973497390747 }, { "auxiliary_loss_clip": 0.0112105, "auxiliary_loss_mlp": 0.0103614, "balance_loss_clip": 1.04810274, "balance_loss_mlp": 1.02246058, "epoch": 0.46757853599879756, "flos": 26980082046720.0, "grad_norm": 1.5322212077638444, "language_loss": 0.73980904, "learning_rate": 2.3058852887518678e-06, "loss": 0.76138097, "num_input_tokens_seen": 166975975, "step": 7777, "time_per_iteration": 2.7370285987854004 }, { "auxiliary_loss_clip": 0.01118243, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.045416, "balance_loss_mlp": 1.02208281, "epoch": 0.4676386592514655, "flos": 24134053783680.0, "grad_norm": 2.891298768731385, "language_loss": 0.69314432, "learning_rate": 2.3055004032638394e-06, "loss": 0.71469557, "num_input_tokens_seen": 166996140, "step": 7778, "time_per_iteration": 4.159350633621216 }, { "auxiliary_loss_clip": 0.01119786, "auxiliary_loss_mlp": 0.01041292, "balance_loss_clip": 1.04801941, "balance_loss_mlp": 1.02624786, "epoch": 0.4676987825041335, "flos": 25483720343040.0, "grad_norm": 2.158752703527913, "language_loss": 0.73216277, "learning_rate": 2.305115506191206e-06, "loss": 0.75377357, "num_input_tokens_seen": 167016105, "step": 7779, "time_per_iteration": 2.6880576610565186 }, { "auxiliary_loss_clip": 0.0108513, "auxiliary_loss_mlp": 0.01043402, "balance_loss_clip": 1.04270327, "balance_loss_mlp": 1.02963924, "epoch": 0.46775890575680146, "flos": 21945298538880.0, "grad_norm": 1.532169986090066, "language_loss": 0.72447348, "learning_rate": 2.304730597548562e-06, "loss": 0.74575877, "num_input_tokens_seen": 167036185, "step": 7780, "time_per_iteration": 4.378252267837524 }, { "auxiliary_loss_clip": 0.01098995, "auxiliary_loss_mlp": 0.01052099, "balance_loss_clip": 1.03960943, "balance_loss_mlp": 1.03428912, "epoch": 0.4678190290094694, "flos": 25228395492480.0, "grad_norm": 1.8072634784489867, "language_loss": 0.74489224, "learning_rate": 2.3043456773505023e-06, "loss": 0.7664032, "num_input_tokens_seen": 167054515, "step": 7781, "time_per_iteration": 2.684298038482666 }, { "auxiliary_loss_clip": 0.01121556, "auxiliary_loss_mlp": 0.01040282, "balance_loss_clip": 1.04655743, "balance_loss_mlp": 1.02464151, "epoch": 0.4678791522621374, "flos": 32268358811520.0, "grad_norm": 3.3303395339611486, "language_loss": 0.62934184, "learning_rate": 2.3039607456116252e-06, "loss": 0.65096015, "num_input_tokens_seen": 167077245, "step": 7782, "time_per_iteration": 2.801643133163452 }, { "auxiliary_loss_clip": 0.01112208, "auxiliary_loss_mlp": 0.01044015, "balance_loss_clip": 1.04610753, "balance_loss_mlp": 1.02925098, "epoch": 0.46793927551480535, "flos": 27046480337280.0, "grad_norm": 2.527604831052906, "language_loss": 0.63679516, "learning_rate": 2.3035758023465254e-06, "loss": 0.65835738, "num_input_tokens_seen": 167097235, "step": 7783, "time_per_iteration": 2.779493570327759 }, { "auxiliary_loss_clip": 0.01126101, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.04948771, "balance_loss_mlp": 1.02393532, "epoch": 0.4679993987674733, "flos": 17457398576640.0, "grad_norm": 2.4796959185267884, "language_loss": 0.67925286, "learning_rate": 2.303190847569801e-06, "loss": 0.70091814, "num_input_tokens_seen": 167113155, "step": 7784, "time_per_iteration": 2.640165090560913 }, { "auxiliary_loss_clip": 0.01100267, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.04564571, "balance_loss_mlp": 1.0193001, "epoch": 0.4680595220201413, "flos": 17165121609600.0, "grad_norm": 2.0879148282250304, "language_loss": 0.84605902, "learning_rate": 2.3028058812960497e-06, "loss": 0.8673948, "num_input_tokens_seen": 167131765, "step": 7785, "time_per_iteration": 2.6447336673736572 }, { "auxiliary_loss_clip": 0.01095846, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.0473485, "balance_loss_mlp": 1.02278996, "epoch": 0.46811964527280925, "flos": 11327591001600.0, "grad_norm": 1.936392485305852, "language_loss": 0.77363992, "learning_rate": 2.3024209035398678e-06, "loss": 0.79498196, "num_input_tokens_seen": 167149030, "step": 7786, "time_per_iteration": 2.7023332118988037 }, { "auxiliary_loss_clip": 0.01116619, "auxiliary_loss_mlp": 0.01034917, "balance_loss_clip": 1.04685593, "balance_loss_mlp": 1.02089214, "epoch": 0.4681797685254772, "flos": 24278809593600.0, "grad_norm": 2.0886119764466686, "language_loss": 0.74195051, "learning_rate": 2.302035914315856e-06, "loss": 0.76346588, "num_input_tokens_seen": 167167375, "step": 7787, "time_per_iteration": 2.704002618789673 }, { "auxiliary_loss_clip": 0.0110227, "auxiliary_loss_mlp": 0.01041247, "balance_loss_clip": 1.04562151, "balance_loss_mlp": 1.02654815, "epoch": 0.4682398917781452, "flos": 31650372293760.0, "grad_norm": 1.9198703232455803, "language_loss": 0.65471619, "learning_rate": 2.3016509136386116e-06, "loss": 0.67615134, "num_input_tokens_seen": 167188065, "step": 7788, "time_per_iteration": 2.767409324645996 }, { "auxiliary_loss_clip": 0.01117478, "auxiliary_loss_mlp": 0.01034939, "balance_loss_clip": 1.0463376, "balance_loss_mlp": 1.02198708, "epoch": 0.46830001503081314, "flos": 28110765340800.0, "grad_norm": 1.576175997941932, "language_loss": 0.63680893, "learning_rate": 2.3012659015227343e-06, "loss": 0.65833306, "num_input_tokens_seen": 167209675, "step": 7789, "time_per_iteration": 2.686382532119751 }, { "auxiliary_loss_clip": 0.01034678, "auxiliary_loss_mlp": 0.01000229, "balance_loss_clip": 1.01769471, "balance_loss_mlp": 0.99867934, "epoch": 0.4683601382834811, "flos": 57881718316800.0, "grad_norm": 0.6946835696901172, "language_loss": 0.61856973, "learning_rate": 2.300880877982825e-06, "loss": 0.63891876, "num_input_tokens_seen": 167273940, "step": 7790, "time_per_iteration": 3.2082865238189697 }, { "auxiliary_loss_clip": 0.01088531, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.04553008, "balance_loss_mlp": 1.02514648, "epoch": 0.46842026153614913, "flos": 21871933009920.0, "grad_norm": 1.7348641955250894, "language_loss": 0.79120016, "learning_rate": 2.3004958430334808e-06, "loss": 0.81249446, "num_input_tokens_seen": 167292730, "step": 7791, "time_per_iteration": 2.7868592739105225 }, { "auxiliary_loss_clip": 0.0112267, "auxiliary_loss_mlp": 0.01038559, "balance_loss_clip": 1.05027902, "balance_loss_mlp": 1.0236336, "epoch": 0.4684803847888171, "flos": 24900818434560.0, "grad_norm": 1.5319083860586857, "language_loss": 0.7509321, "learning_rate": 2.3001107966893052e-06, "loss": 0.77254432, "num_input_tokens_seen": 167313460, "step": 7792, "time_per_iteration": 2.6591553688049316 }, { "auxiliary_loss_clip": 0.01093652, "auxiliary_loss_mlp": 0.01040808, "balance_loss_clip": 1.03941143, "balance_loss_mlp": 1.02582359, "epoch": 0.46854050804148506, "flos": 26251670142720.0, "grad_norm": 1.6679874379457267, "language_loss": 0.68283308, "learning_rate": 2.299725738964898e-06, "loss": 0.70417762, "num_input_tokens_seen": 167335385, "step": 7793, "time_per_iteration": 2.714614152908325 }, { "auxiliary_loss_clip": 0.01120793, "auxiliary_loss_mlp": 0.00770869, "balance_loss_clip": 1.05047464, "balance_loss_mlp": 1.00063658, "epoch": 0.468600631294153, "flos": 21579799697280.0, "grad_norm": 1.5900503410544595, "language_loss": 0.74045742, "learning_rate": 2.2993406698748607e-06, "loss": 0.75937402, "num_input_tokens_seen": 167353625, "step": 7794, "time_per_iteration": 2.631113052368164 }, { "auxiliary_loss_clip": 0.01101487, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.04786825, "balance_loss_mlp": 1.02505112, "epoch": 0.468660754546821, "flos": 25885632597120.0, "grad_norm": 1.7758607044197945, "language_loss": 0.63441491, "learning_rate": 2.2989555894337953e-06, "loss": 0.65583163, "num_input_tokens_seen": 167374565, "step": 7795, "time_per_iteration": 2.755208969116211 }, { "auxiliary_loss_clip": 0.01090992, "auxiliary_loss_mlp": 0.01033775, "balance_loss_clip": 1.04455793, "balance_loss_mlp": 1.01939869, "epoch": 0.46872087779948896, "flos": 35475001666560.0, "grad_norm": 1.5780628651808217, "language_loss": 0.6815629, "learning_rate": 2.298570497656304e-06, "loss": 0.70281053, "num_input_tokens_seen": 167395010, "step": 7796, "time_per_iteration": 2.8338258266448975 }, { "auxiliary_loss_clip": 0.01132709, "auxiliary_loss_mlp": 0.00772271, "balance_loss_clip": 1.05046582, "balance_loss_mlp": 1.00074291, "epoch": 0.4687810010521569, "flos": 26396425952640.0, "grad_norm": 3.1208322005509705, "language_loss": 0.7061345, "learning_rate": 2.2981853945569894e-06, "loss": 0.72518432, "num_input_tokens_seen": 167415285, "step": 7797, "time_per_iteration": 2.7184929847717285 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01035831, "balance_loss_clip": 1.04716921, "balance_loss_mlp": 1.01992226, "epoch": 0.4688411243048249, "flos": 19972761212160.0, "grad_norm": 2.050220537358762, "language_loss": 0.67158788, "learning_rate": 2.297800280150454e-06, "loss": 0.69302827, "num_input_tokens_seen": 167432405, "step": 7798, "time_per_iteration": 2.707491159439087 }, { "auxiliary_loss_clip": 0.01033434, "auxiliary_loss_mlp": 0.00999628, "balance_loss_clip": 1.01507461, "balance_loss_mlp": 0.99782771, "epoch": 0.46890124755749285, "flos": 63977015900160.0, "grad_norm": 0.9512995219109956, "language_loss": 0.64611268, "learning_rate": 2.2974151544513033e-06, "loss": 0.66644335, "num_input_tokens_seen": 167499365, "step": 7799, "time_per_iteration": 3.3521087169647217 }, { "auxiliary_loss_clip": 0.01103151, "auxiliary_loss_mlp": 0.01029152, "balance_loss_clip": 1.0488441, "balance_loss_mlp": 1.01467967, "epoch": 0.4689613708101608, "flos": 23768985905280.0, "grad_norm": 1.342329921678728, "language_loss": 0.72313237, "learning_rate": 2.2970300174741395e-06, "loss": 0.74445534, "num_input_tokens_seen": 167520390, "step": 7800, "time_per_iteration": 2.7983593940734863 }, { "auxiliary_loss_clip": 0.01128952, "auxiliary_loss_mlp": 0.01035275, "balance_loss_clip": 1.04984462, "balance_loss_mlp": 1.0224781, "epoch": 0.4690214940628288, "flos": 24788705109120.0, "grad_norm": 1.7150056694833848, "language_loss": 0.7285912, "learning_rate": 2.296644869233568e-06, "loss": 0.75023353, "num_input_tokens_seen": 167539865, "step": 7801, "time_per_iteration": 2.635540008544922 }, { "auxiliary_loss_clip": 0.01097741, "auxiliary_loss_mlp": 0.010419, "balance_loss_clip": 1.04270506, "balance_loss_mlp": 1.02579427, "epoch": 0.46908161731549675, "flos": 18077324428800.0, "grad_norm": 1.930712957606368, "language_loss": 0.62748474, "learning_rate": 2.2962597097441936e-06, "loss": 0.64888108, "num_input_tokens_seen": 167558190, "step": 7802, "time_per_iteration": 2.8309857845306396 }, { "auxiliary_loss_clip": 0.01131707, "auxiliary_loss_mlp": 0.01041126, "balance_loss_clip": 1.04824543, "balance_loss_mlp": 1.02705908, "epoch": 0.4691417405681647, "flos": 25703350053120.0, "grad_norm": 2.0983906256852647, "language_loss": 0.73465741, "learning_rate": 2.2958745390206206e-06, "loss": 0.75638568, "num_input_tokens_seen": 167577685, "step": 7803, "time_per_iteration": 2.639453172683716 }, { "auxiliary_loss_clip": 0.01105851, "auxiliary_loss_mlp": 0.00771349, "balance_loss_clip": 1.04883635, "balance_loss_mlp": 1.00065053, "epoch": 0.46920186382083273, "flos": 17457039440640.0, "grad_norm": 2.3177200047102486, "language_loss": 0.77396876, "learning_rate": 2.2954893570774558e-06, "loss": 0.7927407, "num_input_tokens_seen": 167596390, "step": 7804, "time_per_iteration": 2.6661806106567383 }, { "auxiliary_loss_clip": 0.01105528, "auxiliary_loss_mlp": 0.01031688, "balance_loss_clip": 1.04877174, "balance_loss_mlp": 1.01763344, "epoch": 0.4692619870735007, "flos": 20339445202560.0, "grad_norm": 2.089417814933236, "language_loss": 0.77330643, "learning_rate": 2.295104163929305e-06, "loss": 0.79467863, "num_input_tokens_seen": 167614980, "step": 7805, "time_per_iteration": 2.6670541763305664 }, { "auxiliary_loss_clip": 0.01140382, "auxiliary_loss_mlp": 0.01050591, "balance_loss_clip": 1.05195141, "balance_loss_mlp": 1.03487957, "epoch": 0.46932211032616866, "flos": 29496558003840.0, "grad_norm": 1.6834011453476339, "language_loss": 0.82446682, "learning_rate": 2.2947189595907742e-06, "loss": 0.84637654, "num_input_tokens_seen": 167635895, "step": 7806, "time_per_iteration": 2.641126871109009 }, { "auxiliary_loss_clip": 0.01109262, "auxiliary_loss_mlp": 0.01041295, "balance_loss_clip": 1.04739761, "balance_loss_mlp": 1.02634656, "epoch": 0.4693822335788366, "flos": 36211242735360.0, "grad_norm": 1.815437092056069, "language_loss": 0.77320337, "learning_rate": 2.294333744076472e-06, "loss": 0.79470897, "num_input_tokens_seen": 167657440, "step": 7807, "time_per_iteration": 2.768772840499878 }, { "auxiliary_loss_clip": 0.0110914, "auxiliary_loss_mlp": 0.01038695, "balance_loss_clip": 1.05083752, "balance_loss_mlp": 1.02354348, "epoch": 0.4694423568315046, "flos": 20338978325760.0, "grad_norm": 2.201580678066969, "language_loss": 0.51815701, "learning_rate": 2.2939485174010035e-06, "loss": 0.53963536, "num_input_tokens_seen": 167675025, "step": 7808, "time_per_iteration": 2.6565470695495605 }, { "auxiliary_loss_clip": 0.01003405, "auxiliary_loss_mlp": 0.01005455, "balance_loss_clip": 1.0168457, "balance_loss_mlp": 1.00391757, "epoch": 0.46950248008417256, "flos": 64326353621760.0, "grad_norm": 0.78732179125356, "language_loss": 0.57700193, "learning_rate": 2.293563279578978e-06, "loss": 0.59709048, "num_input_tokens_seen": 167729635, "step": 7809, "time_per_iteration": 3.1529645919799805 }, { "auxiliary_loss_clip": 0.01087624, "auxiliary_loss_mlp": 0.01039585, "balance_loss_clip": 1.04826307, "balance_loss_mlp": 1.02535129, "epoch": 0.4695626033368405, "flos": 19200106730880.0, "grad_norm": 2.4452536224375403, "language_loss": 0.7153672, "learning_rate": 2.2931780306250045e-06, "loss": 0.73663932, "num_input_tokens_seen": 167745135, "step": 7810, "time_per_iteration": 2.730975389480591 }, { "auxiliary_loss_clip": 0.01122205, "auxiliary_loss_mlp": 0.01041582, "balance_loss_clip": 1.04927683, "balance_loss_mlp": 1.02719331, "epoch": 0.4696227265895085, "flos": 23002436736000.0, "grad_norm": 3.7864250348919284, "language_loss": 0.81469715, "learning_rate": 2.29279277055369e-06, "loss": 0.83633506, "num_input_tokens_seen": 167763875, "step": 7811, "time_per_iteration": 2.689089059829712 }, { "auxiliary_loss_clip": 0.01117579, "auxiliary_loss_mlp": 0.01038248, "balance_loss_clip": 1.04989529, "balance_loss_mlp": 1.02302504, "epoch": 0.46968284984217645, "flos": 21870855601920.0, "grad_norm": 1.6520361935296233, "language_loss": 0.8041414, "learning_rate": 2.292407499379644e-06, "loss": 0.82569969, "num_input_tokens_seen": 167784895, "step": 7812, "time_per_iteration": 2.6615161895751953 }, { "auxiliary_loss_clip": 0.01075193, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.04313707, "balance_loss_mlp": 1.02170289, "epoch": 0.4697429730948444, "flos": 19974987855360.0, "grad_norm": 1.6393784799199496, "language_loss": 0.74155343, "learning_rate": 2.292022217117477e-06, "loss": 0.76266813, "num_input_tokens_seen": 167803185, "step": 7813, "time_per_iteration": 2.7426726818084717 }, { "auxiliary_loss_clip": 0.01102658, "auxiliary_loss_mlp": 0.01036665, "balance_loss_clip": 1.04594994, "balance_loss_mlp": 1.02108407, "epoch": 0.4698030963475124, "flos": 15156206784000.0, "grad_norm": 2.3178266219619994, "language_loss": 0.84324849, "learning_rate": 2.291636923781798e-06, "loss": 0.86464167, "num_input_tokens_seen": 167816550, "step": 7814, "time_per_iteration": 2.6519999504089355 }, { "auxiliary_loss_clip": 0.01105673, "auxiliary_loss_mlp": 0.01036813, "balance_loss_clip": 1.04427862, "balance_loss_mlp": 1.02291358, "epoch": 0.46986321960018035, "flos": 15151178880000.0, "grad_norm": 1.8698068393605216, "language_loss": 0.81723464, "learning_rate": 2.291251619387217e-06, "loss": 0.83865952, "num_input_tokens_seen": 167831845, "step": 7815, "time_per_iteration": 5.720506906509399 }, { "auxiliary_loss_clip": 0.01088353, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04897821, "balance_loss_mlp": 1.023808, "epoch": 0.4699233428528483, "flos": 23108911626240.0, "grad_norm": 2.071255754681328, "language_loss": 0.77463031, "learning_rate": 2.2908663039483468e-06, "loss": 0.79590356, "num_input_tokens_seen": 167850360, "step": 7816, "time_per_iteration": 2.738074541091919 }, { "auxiliary_loss_clip": 0.01044982, "auxiliary_loss_mlp": 0.01001103, "balance_loss_clip": 1.01830792, "balance_loss_mlp": 0.99944633, "epoch": 0.46998346610551633, "flos": 68105558246400.0, "grad_norm": 0.838650178196428, "language_loss": 0.58987319, "learning_rate": 2.290480977479796e-06, "loss": 0.6103341, "num_input_tokens_seen": 167908660, "step": 7817, "time_per_iteration": 3.1292662620544434 }, { "auxiliary_loss_clip": 0.01107632, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.04874861, "balance_loss_mlp": 1.02005172, "epoch": 0.4700435893581843, "flos": 24129456842880.0, "grad_norm": 1.7123630681211415, "language_loss": 0.79417968, "learning_rate": 2.2900956399961775e-06, "loss": 0.81559694, "num_input_tokens_seen": 167927905, "step": 7818, "time_per_iteration": 5.943104028701782 }, { "auxiliary_loss_clip": 0.0113212, "auxiliary_loss_mlp": 0.01037162, "balance_loss_clip": 1.04868269, "balance_loss_mlp": 1.02325034, "epoch": 0.47010371261085226, "flos": 20150518642560.0, "grad_norm": 1.6838154241149696, "language_loss": 0.83469647, "learning_rate": 2.289710291512104e-06, "loss": 0.85638928, "num_input_tokens_seen": 167945995, "step": 7819, "time_per_iteration": 2.6600770950317383 }, { "auxiliary_loss_clip": 0.01101069, "auxiliary_loss_mlp": 0.0103721, "balance_loss_clip": 1.04507041, "balance_loss_mlp": 1.02214193, "epoch": 0.47016383586352023, "flos": 15122199582720.0, "grad_norm": 2.5448578806987974, "language_loss": 0.7640624, "learning_rate": 2.289324932042186e-06, "loss": 0.78544521, "num_input_tokens_seen": 167963380, "step": 7820, "time_per_iteration": 2.720524549484253 }, { "auxiliary_loss_clip": 0.01114996, "auxiliary_loss_mlp": 0.01040886, "balance_loss_clip": 1.05066848, "balance_loss_mlp": 1.02641368, "epoch": 0.4702239591161882, "flos": 13552975140480.0, "grad_norm": 1.835793139157851, "language_loss": 0.74591041, "learning_rate": 2.288939561601039e-06, "loss": 0.76746929, "num_input_tokens_seen": 167981740, "step": 7821, "time_per_iteration": 2.6208953857421875 }, { "auxiliary_loss_clip": 0.0112785, "auxiliary_loss_mlp": 0.01044502, "balance_loss_clip": 1.04762793, "balance_loss_mlp": 1.03104329, "epoch": 0.47028408236885616, "flos": 24276511123200.0, "grad_norm": 1.8086110799443134, "language_loss": 0.89176404, "learning_rate": 2.2885541802032746e-06, "loss": 0.91348755, "num_input_tokens_seen": 167999380, "step": 7822, "time_per_iteration": 2.641425371170044 }, { "auxiliary_loss_clip": 0.01113329, "auxiliary_loss_mlp": 0.01033656, "balance_loss_clip": 1.04665482, "balance_loss_mlp": 1.01981544, "epoch": 0.4703442056215241, "flos": 22856926740480.0, "grad_norm": 1.7930134528553263, "language_loss": 0.79694283, "learning_rate": 2.2881687878635055e-06, "loss": 0.81841266, "num_input_tokens_seen": 168018395, "step": 7823, "time_per_iteration": 2.632756233215332 }, { "auxiliary_loss_clip": 0.01025068, "auxiliary_loss_mlp": 0.01003424, "balance_loss_clip": 1.02190793, "balance_loss_mlp": 1.00163603, "epoch": 0.4704043288741921, "flos": 69240227950080.0, "grad_norm": 0.8086269167579946, "language_loss": 0.56642514, "learning_rate": 2.2877833845963487e-06, "loss": 0.5867101, "num_input_tokens_seen": 168084080, "step": 7824, "time_per_iteration": 3.3140807151794434 }, { "auxiliary_loss_clip": 0.01104679, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.04395127, "balance_loss_mlp": 1.02718711, "epoch": 0.47046445212686006, "flos": 18041090584320.0, "grad_norm": 1.8843796036347318, "language_loss": 0.81223321, "learning_rate": 2.2873979704164157e-06, "loss": 0.83370888, "num_input_tokens_seen": 168101555, "step": 7825, "time_per_iteration": 2.700547695159912 }, { "auxiliary_loss_clip": 0.01111276, "auxiliary_loss_mlp": 0.01036611, "balance_loss_clip": 1.0480845, "balance_loss_mlp": 1.02218676, "epoch": 0.470524575379528, "flos": 23951448017280.0, "grad_norm": 1.7729512383292405, "language_loss": 0.66719514, "learning_rate": 2.287012545338324e-06, "loss": 0.68867397, "num_input_tokens_seen": 168121530, "step": 7826, "time_per_iteration": 2.6998069286346436 }, { "auxiliary_loss_clip": 0.01105784, "auxiliary_loss_mlp": 0.01039915, "balance_loss_clip": 1.04433072, "balance_loss_mlp": 1.02479887, "epoch": 0.470584698632196, "flos": 18113558273280.0, "grad_norm": 1.8432989970829954, "language_loss": 0.84173524, "learning_rate": 2.2866271093766877e-06, "loss": 0.86319232, "num_input_tokens_seen": 168140335, "step": 7827, "time_per_iteration": 2.692657709121704 }, { "auxiliary_loss_clip": 0.01024445, "auxiliary_loss_mlp": 0.01004787, "balance_loss_clip": 1.01622581, "balance_loss_mlp": 1.00303495, "epoch": 0.47064482188486395, "flos": 57251916224640.0, "grad_norm": 0.8086690003326286, "language_loss": 0.5568617, "learning_rate": 2.286241662546122e-06, "loss": 0.57715398, "num_input_tokens_seen": 168200535, "step": 7828, "time_per_iteration": 3.184593439102173 }, { "auxiliary_loss_clip": 0.01128245, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.04770434, "balance_loss_mlp": 1.02036309, "epoch": 0.4707049451375319, "flos": 17895077798400.0, "grad_norm": 2.799236307786822, "language_loss": 0.80882025, "learning_rate": 2.285856204861245e-06, "loss": 0.8304407, "num_input_tokens_seen": 168219610, "step": 7829, "time_per_iteration": 2.5789284706115723 }, { "auxiliary_loss_clip": 0.01128236, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.04866183, "balance_loss_mlp": 1.02311337, "epoch": 0.47076506839019994, "flos": 25232669210880.0, "grad_norm": 1.589084017915349, "language_loss": 0.76252091, "learning_rate": 2.2854707363366703e-06, "loss": 0.78416359, "num_input_tokens_seen": 168242505, "step": 7830, "time_per_iteration": 2.6604039669036865 }, { "auxiliary_loss_clip": 0.01094201, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.04519463, "balance_loss_mlp": 1.01907206, "epoch": 0.4708251916428679, "flos": 13479681438720.0, "grad_norm": 1.9041514810278948, "language_loss": 0.7839942, "learning_rate": 2.2850852569870177e-06, "loss": 0.8052749, "num_input_tokens_seen": 168260220, "step": 7831, "time_per_iteration": 2.7709531784057617 }, { "auxiliary_loss_clip": 0.01084793, "auxiliary_loss_mlp": 0.01045555, "balance_loss_clip": 1.03967106, "balance_loss_mlp": 1.0289377, "epoch": 0.47088531489553587, "flos": 30147833450880.0, "grad_norm": 3.4524245779244045, "language_loss": 0.75518548, "learning_rate": 2.2846997668269033e-06, "loss": 0.7764889, "num_input_tokens_seen": 168277360, "step": 7832, "time_per_iteration": 2.9078352451324463 }, { "auxiliary_loss_clip": 0.01100887, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.04597783, "balance_loss_mlp": 1.01476312, "epoch": 0.47094543814820383, "flos": 21798280172160.0, "grad_norm": 1.3033633023675582, "language_loss": 0.74446917, "learning_rate": 2.2843142658709454e-06, "loss": 0.76574957, "num_input_tokens_seen": 168296605, "step": 7833, "time_per_iteration": 2.7040505409240723 }, { "auxiliary_loss_clip": 0.01115931, "auxiliary_loss_mlp": 0.01039232, "balance_loss_clip": 1.04605532, "balance_loss_mlp": 1.02489686, "epoch": 0.4710055614008718, "flos": 23003011353600.0, "grad_norm": 1.6784231271486025, "language_loss": 0.75652939, "learning_rate": 2.283928754133762e-06, "loss": 0.778081, "num_input_tokens_seen": 168316205, "step": 7834, "time_per_iteration": 2.651439666748047 }, { "auxiliary_loss_clip": 0.01080958, "auxiliary_loss_mlp": 0.0104352, "balance_loss_clip": 1.04571462, "balance_loss_mlp": 1.02942359, "epoch": 0.47106568465353976, "flos": 42741346452480.0, "grad_norm": 1.5705960877616694, "language_loss": 0.66198736, "learning_rate": 2.283543231629972e-06, "loss": 0.68323219, "num_input_tokens_seen": 168338935, "step": 7835, "time_per_iteration": 2.8833723068237305 }, { "auxiliary_loss_clip": 0.01030822, "auxiliary_loss_mlp": 0.0075266, "balance_loss_clip": 1.01354921, "balance_loss_mlp": 1.00055587, "epoch": 0.4711258079062077, "flos": 68554008570240.0, "grad_norm": 0.8682696962056556, "language_loss": 0.62114525, "learning_rate": 2.283157698374194e-06, "loss": 0.63898003, "num_input_tokens_seen": 168392800, "step": 7836, "time_per_iteration": 3.271106243133545 }, { "auxiliary_loss_clip": 0.01089899, "auxiliary_loss_mlp": 0.00772396, "balance_loss_clip": 1.04188919, "balance_loss_mlp": 1.00066912, "epoch": 0.4711859311588757, "flos": 25446588658560.0, "grad_norm": 2.9726849992756623, "language_loss": 0.69634271, "learning_rate": 2.2827721543810475e-06, "loss": 0.71496564, "num_input_tokens_seen": 168412940, "step": 7837, "time_per_iteration": 2.7227394580841064 }, { "auxiliary_loss_clip": 0.01114908, "auxiliary_loss_mlp": 0.01040024, "balance_loss_clip": 1.04658818, "balance_loss_mlp": 1.02449143, "epoch": 0.47124605441154366, "flos": 21981891519360.0, "grad_norm": 1.834184212780789, "language_loss": 0.66073495, "learning_rate": 2.282386599665153e-06, "loss": 0.68228424, "num_input_tokens_seen": 168431995, "step": 7838, "time_per_iteration": 2.63415265083313 }, { "auxiliary_loss_clip": 0.01101595, "auxiliary_loss_mlp": 0.01040478, "balance_loss_clip": 1.04245853, "balance_loss_mlp": 1.02488542, "epoch": 0.4713061776642116, "flos": 25412689198080.0, "grad_norm": 1.6613879226075605, "language_loss": 0.77071315, "learning_rate": 2.2820010342411304e-06, "loss": 0.79213387, "num_input_tokens_seen": 168454585, "step": 7839, "time_per_iteration": 2.702371835708618 }, { "auxiliary_loss_clip": 0.01089161, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.04446244, "balance_loss_mlp": 1.0215137, "epoch": 0.4713663009168796, "flos": 26542259170560.0, "grad_norm": 2.064347613929302, "language_loss": 0.72607076, "learning_rate": 2.2816154581235993e-06, "loss": 0.74731302, "num_input_tokens_seen": 168471265, "step": 7840, "time_per_iteration": 2.7578155994415283 }, { "auxiliary_loss_clip": 0.01098285, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.04248786, "balance_loss_mlp": 1.01975548, "epoch": 0.47142642416954755, "flos": 23623583650560.0, "grad_norm": 1.634270857219127, "language_loss": 0.75153434, "learning_rate": 2.2812298713271833e-06, "loss": 0.77285522, "num_input_tokens_seen": 168491360, "step": 7841, "time_per_iteration": 2.7571516036987305 }, { "auxiliary_loss_clip": 0.01097356, "auxiliary_loss_mlp": 0.01036522, "balance_loss_clip": 1.04522789, "balance_loss_mlp": 1.02271175, "epoch": 0.4714865474222155, "flos": 22310150935680.0, "grad_norm": 1.514171980299406, "language_loss": 0.70372689, "learning_rate": 2.280844273866501e-06, "loss": 0.72506565, "num_input_tokens_seen": 168511335, "step": 7842, "time_per_iteration": 2.6693220138549805 }, { "auxiliary_loss_clip": 0.01122506, "auxiliary_loss_mlp": 0.01036861, "balance_loss_clip": 1.05041289, "balance_loss_mlp": 1.02272844, "epoch": 0.4715466706748835, "flos": 17822430541440.0, "grad_norm": 2.3877412842319243, "language_loss": 0.78754079, "learning_rate": 2.280458665756177e-06, "loss": 0.80913448, "num_input_tokens_seen": 168529920, "step": 7843, "time_per_iteration": 2.584821939468384 }, { "auxiliary_loss_clip": 0.01112783, "auxiliary_loss_mlp": 0.01033598, "balance_loss_clip": 1.04609227, "balance_loss_mlp": 1.02013922, "epoch": 0.4716067939275515, "flos": 23659530186240.0, "grad_norm": 1.5083750473310347, "language_loss": 0.73945224, "learning_rate": 2.280073047010832e-06, "loss": 0.76091611, "num_input_tokens_seen": 168550595, "step": 7844, "time_per_iteration": 2.6947662830352783 }, { "auxiliary_loss_clip": 0.01103523, "auxiliary_loss_mlp": 0.01045426, "balance_loss_clip": 1.04754925, "balance_loss_mlp": 1.03077483, "epoch": 0.47166691718021947, "flos": 17930162407680.0, "grad_norm": 1.6596812780951513, "language_loss": 0.7849918, "learning_rate": 2.279687417645088e-06, "loss": 0.8064813, "num_input_tokens_seen": 168569765, "step": 7845, "time_per_iteration": 2.64786434173584 }, { "auxiliary_loss_clip": 0.01116093, "auxiliary_loss_mlp": 0.01035695, "balance_loss_clip": 1.04657555, "balance_loss_mlp": 1.02204597, "epoch": 0.47172704043288743, "flos": 26614583205120.0, "grad_norm": 1.4795134607526772, "language_loss": 0.73325998, "learning_rate": 2.2793017776735703e-06, "loss": 0.75477785, "num_input_tokens_seen": 168591525, "step": 7846, "time_per_iteration": 2.6890015602111816 }, { "auxiliary_loss_clip": 0.01112295, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.04567862, "balance_loss_mlp": 1.02053618, "epoch": 0.4717871636855554, "flos": 27922700707200.0, "grad_norm": 1.365245213481775, "language_loss": 0.74306214, "learning_rate": 2.2789161271109e-06, "loss": 0.76451898, "num_input_tokens_seen": 168611235, "step": 7847, "time_per_iteration": 2.664600133895874 }, { "auxiliary_loss_clip": 0.01076671, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.04269147, "balance_loss_mlp": 1.02244806, "epoch": 0.47184728693822336, "flos": 14502237816960.0, "grad_norm": 1.614512390946798, "language_loss": 0.80744767, "learning_rate": 2.278530465971703e-06, "loss": 0.82857651, "num_input_tokens_seen": 168628710, "step": 7848, "time_per_iteration": 2.7662644386291504 }, { "auxiliary_loss_clip": 0.01118674, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.04767179, "balance_loss_mlp": 1.02170014, "epoch": 0.47190741019089133, "flos": 17856545483520.0, "grad_norm": 3.381301580597114, "language_loss": 0.70282733, "learning_rate": 2.2781447942706032e-06, "loss": 0.72437274, "num_input_tokens_seen": 168645645, "step": 7849, "time_per_iteration": 2.628324031829834 }, { "auxiliary_loss_clip": 0.01102555, "auxiliary_loss_mlp": 0.01043039, "balance_loss_clip": 1.04688513, "balance_loss_mlp": 1.02679062, "epoch": 0.4719675334435593, "flos": 17895472848000.0, "grad_norm": 2.2108635677358968, "language_loss": 0.6920523, "learning_rate": 2.277759112022224e-06, "loss": 0.71350825, "num_input_tokens_seen": 168664165, "step": 7850, "time_per_iteration": 2.678515672683716 }, { "auxiliary_loss_clip": 0.01071934, "auxiliary_loss_mlp": 0.0103323, "balance_loss_clip": 1.04294968, "balance_loss_mlp": 1.0192523, "epoch": 0.47202765669622726, "flos": 20704369426560.0, "grad_norm": 1.8559154127156776, "language_loss": 0.75022864, "learning_rate": 2.2773734192411916e-06, "loss": 0.77128029, "num_input_tokens_seen": 168681940, "step": 7851, "time_per_iteration": 2.7907421588897705 }, { "auxiliary_loss_clip": 0.01058717, "auxiliary_loss_mlp": 0.0104416, "balance_loss_clip": 1.03438354, "balance_loss_mlp": 1.02636182, "epoch": 0.4720877799488952, "flos": 16360255607040.0, "grad_norm": 1.8954666463572496, "language_loss": 0.76087546, "learning_rate": 2.276987715942132e-06, "loss": 0.78190422, "num_input_tokens_seen": 168698830, "step": 7852, "time_per_iteration": 2.751862049102783 }, { "auxiliary_loss_clip": 0.01090696, "auxiliary_loss_mlp": 0.01031466, "balance_loss_clip": 1.0440855, "balance_loss_mlp": 1.01667845, "epoch": 0.4721479032015632, "flos": 20668171495680.0, "grad_norm": 1.6687991208994266, "language_loss": 0.69092613, "learning_rate": 2.2766020021396696e-06, "loss": 0.71214771, "num_input_tokens_seen": 168718305, "step": 7853, "time_per_iteration": 2.8860716819763184 }, { "auxiliary_loss_clip": 0.01023698, "auxiliary_loss_mlp": 0.01005171, "balance_loss_clip": 1.03293765, "balance_loss_mlp": 1.00360918, "epoch": 0.47220802645423116, "flos": 67750438435200.0, "grad_norm": 0.7060966439190681, "language_loss": 0.50175303, "learning_rate": 2.276216277848432e-06, "loss": 0.52204174, "num_input_tokens_seen": 168782365, "step": 7854, "time_per_iteration": 4.915671110153198 }, { "auxiliary_loss_clip": 0.0112187, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.04927993, "balance_loss_mlp": 1.02046967, "epoch": 0.4722681497068991, "flos": 20921449271040.0, "grad_norm": 1.8544471627611243, "language_loss": 0.63919318, "learning_rate": 2.2758305430830455e-06, "loss": 0.66076523, "num_input_tokens_seen": 168800485, "step": 7855, "time_per_iteration": 4.303591728210449 }, { "auxiliary_loss_clip": 0.01115964, "auxiliary_loss_mlp": 0.01039633, "balance_loss_clip": 1.04526174, "balance_loss_mlp": 1.02463675, "epoch": 0.4723282729595671, "flos": 28293083798400.0, "grad_norm": 6.403691145457763, "language_loss": 0.75835574, "learning_rate": 2.2754447978581376e-06, "loss": 0.77991176, "num_input_tokens_seen": 168818965, "step": 7856, "time_per_iteration": 2.669156074523926 }, { "auxiliary_loss_clip": 0.01102045, "auxiliary_loss_mlp": 0.01036544, "balance_loss_clip": 1.04435217, "balance_loss_mlp": 1.02334714, "epoch": 0.4723883962122351, "flos": 27125053338240.0, "grad_norm": 1.8316073665627561, "language_loss": 0.7513321, "learning_rate": 2.2750590421883347e-06, "loss": 0.77271795, "num_input_tokens_seen": 168840355, "step": 7857, "time_per_iteration": 5.926163673400879 }, { "auxiliary_loss_clip": 0.0110506, "auxiliary_loss_mlp": 0.01044055, "balance_loss_clip": 1.04619288, "balance_loss_mlp": 1.03164554, "epoch": 0.47244851946490307, "flos": 31537253387520.0, "grad_norm": 1.4352718890089464, "language_loss": 0.64871937, "learning_rate": 2.2746732760882655e-06, "loss": 0.67021048, "num_input_tokens_seen": 168861765, "step": 7858, "time_per_iteration": 2.7516961097717285 }, { "auxiliary_loss_clip": 0.01115653, "auxiliary_loss_mlp": 0.00772171, "balance_loss_clip": 1.04487467, "balance_loss_mlp": 1.00070405, "epoch": 0.47250864271757104, "flos": 20886544229760.0, "grad_norm": 4.333924209566871, "language_loss": 0.70584702, "learning_rate": 2.2742874995725575e-06, "loss": 0.72472525, "num_input_tokens_seen": 168881310, "step": 7859, "time_per_iteration": 2.63272762298584 }, { "auxiliary_loss_clip": 0.01132339, "auxiliary_loss_mlp": 0.01038437, "balance_loss_clip": 1.0472064, "balance_loss_mlp": 1.02420318, "epoch": 0.472568765970239, "flos": 20522086882560.0, "grad_norm": 1.7578939418215658, "language_loss": 0.62056947, "learning_rate": 2.2739017126558413e-06, "loss": 0.64227724, "num_input_tokens_seen": 168899470, "step": 7860, "time_per_iteration": 2.579881429672241 }, { "auxiliary_loss_clip": 0.01104772, "auxiliary_loss_mlp": 0.01042498, "balance_loss_clip": 1.04455113, "balance_loss_mlp": 1.02835417, "epoch": 0.47262888922290697, "flos": 35805200417280.0, "grad_norm": 2.5847882369160584, "language_loss": 0.71352196, "learning_rate": 2.2735159153527445e-06, "loss": 0.73499465, "num_input_tokens_seen": 168921495, "step": 7861, "time_per_iteration": 2.7616021633148193 }, { "auxiliary_loss_clip": 0.01100093, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.04298115, "balance_loss_mlp": 1.02136993, "epoch": 0.47268901247557493, "flos": 20667740532480.0, "grad_norm": 1.877615917676971, "language_loss": 0.85056359, "learning_rate": 2.273130107677896e-06, "loss": 0.87191874, "num_input_tokens_seen": 168940515, "step": 7862, "time_per_iteration": 2.730851173400879 }, { "auxiliary_loss_clip": 0.01126067, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.04310465, "balance_loss_mlp": 1.01836395, "epoch": 0.4727491357282429, "flos": 19573291082880.0, "grad_norm": 1.8403668162610285, "language_loss": 0.84233111, "learning_rate": 2.272744289645927e-06, "loss": 0.86391521, "num_input_tokens_seen": 168958340, "step": 7863, "time_per_iteration": 2.7247161865234375 }, { "auxiliary_loss_clip": 0.01104075, "auxiliary_loss_mlp": 0.01041818, "balance_loss_clip": 1.04576826, "balance_loss_mlp": 1.02810335, "epoch": 0.47280925898091086, "flos": 18217231902720.0, "grad_norm": 2.0137135318025843, "language_loss": 0.66243893, "learning_rate": 2.272358461271467e-06, "loss": 0.68389785, "num_input_tokens_seen": 168974850, "step": 7864, "time_per_iteration": 2.7027535438537598 }, { "auxiliary_loss_clip": 0.01126031, "auxiliary_loss_mlp": 0.01038902, "balance_loss_clip": 1.04373837, "balance_loss_mlp": 1.02402425, "epoch": 0.4728693822335788, "flos": 17821820010240.0, "grad_norm": 1.9458421333469222, "language_loss": 0.64846861, "learning_rate": 2.271972622569147e-06, "loss": 0.67011791, "num_input_tokens_seen": 168992860, "step": 7865, "time_per_iteration": 2.599947214126587 }, { "auxiliary_loss_clip": 0.01095039, "auxiliary_loss_mlp": 0.00771615, "balance_loss_clip": 1.04065597, "balance_loss_mlp": 1.00069022, "epoch": 0.4729295054862468, "flos": 20595057361920.0, "grad_norm": 1.8988594463693396, "language_loss": 0.73979223, "learning_rate": 2.2715867735535976e-06, "loss": 0.75845885, "num_input_tokens_seen": 169010325, "step": 7866, "time_per_iteration": 2.6904079914093018 }, { "auxiliary_loss_clip": 0.01127633, "auxiliary_loss_mlp": 0.01036812, "balance_loss_clip": 1.0444746, "balance_loss_mlp": 1.02215528, "epoch": 0.47298962873891476, "flos": 23368079232000.0, "grad_norm": 1.7138995799513466, "language_loss": 0.82882631, "learning_rate": 2.271200914239451e-06, "loss": 0.85047078, "num_input_tokens_seen": 169029840, "step": 7867, "time_per_iteration": 2.66166353225708 }, { "auxiliary_loss_clip": 0.01113116, "auxiliary_loss_mlp": 0.01035066, "balance_loss_clip": 1.04474282, "balance_loss_mlp": 1.02197099, "epoch": 0.4730497519915827, "flos": 22052240305920.0, "grad_norm": 1.59304374398017, "language_loss": 0.79711115, "learning_rate": 2.2708150446413385e-06, "loss": 0.81859303, "num_input_tokens_seen": 169049975, "step": 7868, "time_per_iteration": 2.639418363571167 }, { "auxiliary_loss_clip": 0.01048577, "auxiliary_loss_mlp": 0.01036292, "balance_loss_clip": 1.03682256, "balance_loss_mlp": 1.02049041, "epoch": 0.4731098752442507, "flos": 21069724613760.0, "grad_norm": 2.2697646545371772, "language_loss": 0.74715841, "learning_rate": 2.2704291647738915e-06, "loss": 0.7680071, "num_input_tokens_seen": 169069540, "step": 7869, "time_per_iteration": 2.822831153869629 }, { "auxiliary_loss_clip": 0.01108509, "auxiliary_loss_mlp": 0.01048779, "balance_loss_clip": 1.04608214, "balance_loss_mlp": 1.03300154, "epoch": 0.4731699984969187, "flos": 22528775064960.0, "grad_norm": 2.141854382789547, "language_loss": 0.73684996, "learning_rate": 2.2700432746517443e-06, "loss": 0.75842285, "num_input_tokens_seen": 169089940, "step": 7870, "time_per_iteration": 2.7175748348236084 }, { "auxiliary_loss_clip": 0.01133545, "auxiliary_loss_mlp": 0.01041593, "balance_loss_clip": 1.04755211, "balance_loss_mlp": 1.02635777, "epoch": 0.4732301217495867, "flos": 24898124914560.0, "grad_norm": 2.253339307670162, "language_loss": 0.81085944, "learning_rate": 2.2696573742895292e-06, "loss": 0.83261085, "num_input_tokens_seen": 169109650, "step": 7871, "time_per_iteration": 2.6193602085113525 }, { "auxiliary_loss_clip": 0.01113818, "auxiliary_loss_mlp": 0.01036061, "balance_loss_clip": 1.04329586, "balance_loss_mlp": 1.02133834, "epoch": 0.47329024500225464, "flos": 22784423137920.0, "grad_norm": 1.5762073479047713, "language_loss": 0.75922841, "learning_rate": 2.269271463701879e-06, "loss": 0.78072715, "num_input_tokens_seen": 169128990, "step": 7872, "time_per_iteration": 2.6391725540161133 }, { "auxiliary_loss_clip": 0.01091788, "auxiliary_loss_mlp": 0.01038432, "balance_loss_clip": 1.04121172, "balance_loss_mlp": 1.02376986, "epoch": 0.4733503682549226, "flos": 38695902220800.0, "grad_norm": 3.094756801604535, "language_loss": 0.67562377, "learning_rate": 2.268885542903428e-06, "loss": 0.696926, "num_input_tokens_seen": 169154645, "step": 7873, "time_per_iteration": 2.8466758728027344 }, { "auxiliary_loss_clip": 0.01117181, "auxiliary_loss_mlp": 0.01036678, "balance_loss_clip": 1.04567063, "balance_loss_mlp": 1.02267087, "epoch": 0.47341049150759057, "flos": 22966849336320.0, "grad_norm": 1.6392218744116203, "language_loss": 0.72839928, "learning_rate": 2.26849961190881e-06, "loss": 0.74993783, "num_input_tokens_seen": 169174995, "step": 7874, "time_per_iteration": 2.721020221710205 }, { "auxiliary_loss_clip": 0.01113028, "auxiliary_loss_mlp": 0.01038664, "balance_loss_clip": 1.04846478, "balance_loss_mlp": 1.02471697, "epoch": 0.47347061476025853, "flos": 14538471661440.0, "grad_norm": 3.032092549096925, "language_loss": 0.65002596, "learning_rate": 2.26811367073266e-06, "loss": 0.67154288, "num_input_tokens_seen": 169191815, "step": 7875, "time_per_iteration": 2.6652960777282715 }, { "auxiliary_loss_clip": 0.01083743, "auxiliary_loss_mlp": 0.01035273, "balance_loss_clip": 1.04805076, "balance_loss_mlp": 1.02059197, "epoch": 0.4735307380129265, "flos": 30263250827520.0, "grad_norm": 2.768907187204124, "language_loss": 0.8101728, "learning_rate": 2.2677277193896125e-06, "loss": 0.83136296, "num_input_tokens_seen": 169210430, "step": 7876, "time_per_iteration": 2.7860774993896484 }, { "auxiliary_loss_clip": 0.01096604, "auxiliary_loss_mlp": 0.01049403, "balance_loss_clip": 1.04034781, "balance_loss_mlp": 1.03362572, "epoch": 0.47359086126559446, "flos": 19391044452480.0, "grad_norm": 1.718915834241656, "language_loss": 0.79123086, "learning_rate": 2.267341757894304e-06, "loss": 0.81269091, "num_input_tokens_seen": 169229295, "step": 7877, "time_per_iteration": 2.6741349697113037 }, { "auxiliary_loss_clip": 0.01119367, "auxiliary_loss_mlp": 0.00771148, "balance_loss_clip": 1.04634619, "balance_loss_mlp": 1.00065994, "epoch": 0.47365098451826243, "flos": 21939408708480.0, "grad_norm": 1.9321122257733154, "language_loss": 0.7070595, "learning_rate": 2.2669557862613685e-06, "loss": 0.72596461, "num_input_tokens_seen": 169247855, "step": 7878, "time_per_iteration": 2.65336012840271 }, { "auxiliary_loss_clip": 0.01091201, "auxiliary_loss_mlp": 0.01041141, "balance_loss_clip": 1.04987168, "balance_loss_mlp": 1.02767622, "epoch": 0.4737111077709304, "flos": 25845053207040.0, "grad_norm": 1.650502341043129, "language_loss": 0.75037253, "learning_rate": 2.2665698045054425e-06, "loss": 0.77169597, "num_input_tokens_seen": 169268860, "step": 7879, "time_per_iteration": 2.731395721435547 }, { "auxiliary_loss_clip": 0.01030587, "auxiliary_loss_mlp": 0.01009103, "balance_loss_clip": 1.02360272, "balance_loss_mlp": 1.00741053, "epoch": 0.47377123102359836, "flos": 67760886314880.0, "grad_norm": 0.7327852929375173, "language_loss": 0.61306548, "learning_rate": 2.266183812641164e-06, "loss": 0.63346243, "num_input_tokens_seen": 169331855, "step": 7880, "time_per_iteration": 3.224714756011963 }, { "auxiliary_loss_clip": 0.0110857, "auxiliary_loss_mlp": 0.01041962, "balance_loss_clip": 1.04677773, "balance_loss_mlp": 1.02690625, "epoch": 0.4738313542762663, "flos": 24315977191680.0, "grad_norm": 1.5081125335533625, "language_loss": 0.68397921, "learning_rate": 2.2657978106831675e-06, "loss": 0.70548451, "num_input_tokens_seen": 169352175, "step": 7881, "time_per_iteration": 2.7536203861236572 }, { "auxiliary_loss_clip": 0.01068036, "auxiliary_loss_mlp": 0.01031577, "balance_loss_clip": 1.04936802, "balance_loss_mlp": 1.01798737, "epoch": 0.4738914775289343, "flos": 20705339093760.0, "grad_norm": 1.7877053000392102, "language_loss": 0.77066004, "learning_rate": 2.265411798646092e-06, "loss": 0.7916562, "num_input_tokens_seen": 169371215, "step": 7882, "time_per_iteration": 2.873434543609619 }, { "auxiliary_loss_clip": 0.01116489, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.04511285, "balance_loss_mlp": 1.02132463, "epoch": 0.4739516007816023, "flos": 25446337263360.0, "grad_norm": 2.3087904075212204, "language_loss": 0.76111883, "learning_rate": 2.2650257765445747e-06, "loss": 0.78264266, "num_input_tokens_seen": 169391745, "step": 7883, "time_per_iteration": 2.7326574325561523 }, { "auxiliary_loss_clip": 0.01107432, "auxiliary_loss_mlp": 0.01031652, "balance_loss_clip": 1.04656231, "balance_loss_mlp": 1.01863456, "epoch": 0.4740117240342703, "flos": 19974341410560.0, "grad_norm": 1.7217647008431887, "language_loss": 0.72281808, "learning_rate": 2.2646397443932525e-06, "loss": 0.74420893, "num_input_tokens_seen": 169409845, "step": 7884, "time_per_iteration": 2.660172462463379 }, { "auxiliary_loss_clip": 0.01123059, "auxiliary_loss_mlp": 0.01037646, "balance_loss_clip": 1.04745269, "balance_loss_mlp": 1.02225614, "epoch": 0.47407184728693824, "flos": 15661146222720.0, "grad_norm": 2.1356892731193557, "language_loss": 0.82255256, "learning_rate": 2.2642537022067655e-06, "loss": 0.8441596, "num_input_tokens_seen": 169426085, "step": 7885, "time_per_iteration": 2.6816513538360596 }, { "auxiliary_loss_clip": 0.01093494, "auxiliary_loss_mlp": 0.01050029, "balance_loss_clip": 1.0418942, "balance_loss_mlp": 1.0338043, "epoch": 0.4741319705396062, "flos": 18588800142720.0, "grad_norm": 1.6528542083339792, "language_loss": 0.73020607, "learning_rate": 2.263867649999751e-06, "loss": 0.75164127, "num_input_tokens_seen": 169444705, "step": 7886, "time_per_iteration": 2.6734073162078857 }, { "auxiliary_loss_clip": 0.01110604, "auxiliary_loss_mlp": 0.01038225, "balance_loss_clip": 1.04582644, "balance_loss_mlp": 1.02251315, "epoch": 0.47419209379227417, "flos": 13261093223040.0, "grad_norm": 2.0346146652784327, "language_loss": 0.74043691, "learning_rate": 2.263481587786849e-06, "loss": 0.76192516, "num_input_tokens_seen": 169460850, "step": 7887, "time_per_iteration": 2.6761467456817627 }, { "auxiliary_loss_clip": 0.01118145, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.0474298, "balance_loss_mlp": 1.01849771, "epoch": 0.47425221704494214, "flos": 20044043752320.0, "grad_norm": 1.7788052130685665, "language_loss": 0.77452385, "learning_rate": 2.2630955155826993e-06, "loss": 0.79602331, "num_input_tokens_seen": 169478890, "step": 7888, "time_per_iteration": 2.6402924060821533 }, { "auxiliary_loss_clip": 0.01118769, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.0469296, "balance_loss_mlp": 1.02044427, "epoch": 0.4743123402976101, "flos": 27271892136960.0, "grad_norm": 4.211713497556063, "language_loss": 0.72521853, "learning_rate": 2.2627094334019406e-06, "loss": 0.7467528, "num_input_tokens_seen": 169499690, "step": 7889, "time_per_iteration": 2.693746566772461 }, { "auxiliary_loss_clip": 0.0104991, "auxiliary_loss_mlp": 0.01005818, "balance_loss_clip": 1.02273417, "balance_loss_mlp": 1.00418472, "epoch": 0.47437246355027807, "flos": 55393970261760.0, "grad_norm": 0.7194077429508707, "language_loss": 0.5605737, "learning_rate": 2.262323341259214e-06, "loss": 0.58113098, "num_input_tokens_seen": 169560475, "step": 7890, "time_per_iteration": 3.180250883102417 }, { "auxiliary_loss_clip": 0.01120493, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.04944348, "balance_loss_mlp": 1.01705146, "epoch": 0.47443258680294603, "flos": 23878477537920.0, "grad_norm": 1.9527728253341778, "language_loss": 0.65866226, "learning_rate": 2.2619372391691605e-06, "loss": 0.68019128, "num_input_tokens_seen": 169580110, "step": 7891, "time_per_iteration": 2.6768221855163574 }, { "auxiliary_loss_clip": 0.01135111, "auxiliary_loss_mlp": 0.01039265, "balance_loss_clip": 1.04865634, "balance_loss_mlp": 1.02342188, "epoch": 0.474492710055614, "flos": 21977761455360.0, "grad_norm": 2.2722368949670493, "language_loss": 0.7100271, "learning_rate": 2.26155112714642e-06, "loss": 0.73177087, "num_input_tokens_seen": 169597510, "step": 7892, "time_per_iteration": 2.5857720375061035 }, { "auxiliary_loss_clip": 0.01021432, "auxiliary_loss_mlp": 0.01001129, "balance_loss_clip": 1.01879561, "balance_loss_mlp": 0.99938869, "epoch": 0.47455283330828196, "flos": 62557180122240.0, "grad_norm": 0.8083016633053688, "language_loss": 0.5854069, "learning_rate": 2.2611650052056355e-06, "loss": 0.60563254, "num_input_tokens_seen": 169660010, "step": 7893, "time_per_iteration": 3.298412799835205 }, { "auxiliary_loss_clip": 0.01119918, "auxiliary_loss_mlp": 0.01040659, "balance_loss_clip": 1.04893851, "balance_loss_mlp": 1.02661026, "epoch": 0.47461295656094993, "flos": 12093637380480.0, "grad_norm": 2.1787400532077608, "language_loss": 0.77515149, "learning_rate": 2.2607788733614463e-06, "loss": 0.79675728, "num_input_tokens_seen": 169678485, "step": 7894, "time_per_iteration": 4.300025463104248 }, { "auxiliary_loss_clip": 0.01119579, "auxiliary_loss_mlp": 0.01038145, "balance_loss_clip": 1.04634869, "balance_loss_mlp": 1.02365553, "epoch": 0.4746730798136179, "flos": 20884568981760.0, "grad_norm": 1.6992264056336024, "language_loss": 0.75134289, "learning_rate": 2.260392731628497e-06, "loss": 0.77292013, "num_input_tokens_seen": 169697335, "step": 7895, "time_per_iteration": 4.2042882442474365 }, { "auxiliary_loss_clip": 0.01115221, "auxiliary_loss_mlp": 0.01035192, "balance_loss_clip": 1.04379582, "balance_loss_mlp": 1.02000451, "epoch": 0.4747332030662859, "flos": 19974808287360.0, "grad_norm": 2.3363867956596462, "language_loss": 0.83016753, "learning_rate": 2.260006580021429e-06, "loss": 0.85167164, "num_input_tokens_seen": 169715395, "step": 7896, "time_per_iteration": 2.6993515491485596 }, { "auxiliary_loss_clip": 0.01115945, "auxiliary_loss_mlp": 0.01033612, "balance_loss_clip": 1.04578996, "balance_loss_mlp": 1.01843619, "epoch": 0.4747933263189539, "flos": 16034186920320.0, "grad_norm": 2.109517003677199, "language_loss": 0.7557857, "learning_rate": 2.259620418554886e-06, "loss": 0.77728134, "num_input_tokens_seen": 169733755, "step": 7897, "time_per_iteration": 4.253166198730469 }, { "auxiliary_loss_clip": 0.01108787, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.04561198, "balance_loss_mlp": 1.02645135, "epoch": 0.47485344957162184, "flos": 13955102876160.0, "grad_norm": 2.267424442093673, "language_loss": 0.63623869, "learning_rate": 2.25923424724351e-06, "loss": 0.65773547, "num_input_tokens_seen": 169751390, "step": 7898, "time_per_iteration": 2.672621011734009 }, { "auxiliary_loss_clip": 0.01091849, "auxiliary_loss_mlp": 0.01057132, "balance_loss_clip": 1.04254556, "balance_loss_mlp": 1.03949475, "epoch": 0.4749135728242898, "flos": 20449080489600.0, "grad_norm": 3.549969153580447, "language_loss": 0.70200998, "learning_rate": 2.258848066101946e-06, "loss": 0.72349977, "num_input_tokens_seen": 169769500, "step": 7899, "time_per_iteration": 2.6986401081085205 }, { "auxiliary_loss_clip": 0.01119057, "auxiliary_loss_mlp": 0.01040719, "balance_loss_clip": 1.04576528, "balance_loss_mlp": 1.02590108, "epoch": 0.4749736960769578, "flos": 28949961767040.0, "grad_norm": 1.9384177803560112, "language_loss": 0.68627715, "learning_rate": 2.258461875144837e-06, "loss": 0.70787489, "num_input_tokens_seen": 169789215, "step": 7900, "time_per_iteration": 2.695420265197754 }, { "auxiliary_loss_clip": 0.01088615, "auxiliary_loss_mlp": 0.01048142, "balance_loss_clip": 1.04223442, "balance_loss_mlp": 1.0335629, "epoch": 0.47503381932962574, "flos": 31938770592000.0, "grad_norm": 2.214181272016126, "language_loss": 0.70571202, "learning_rate": 2.2580756743868273e-06, "loss": 0.72707957, "num_input_tokens_seen": 169808825, "step": 7901, "time_per_iteration": 2.7880799770355225 }, { "auxiliary_loss_clip": 0.01101024, "auxiliary_loss_mlp": 0.01063852, "balance_loss_clip": 1.04344749, "balance_loss_mlp": 1.04805636, "epoch": 0.4750939425822937, "flos": 22127257860480.0, "grad_norm": 1.723548754677231, "language_loss": 0.73669708, "learning_rate": 2.2576894638425636e-06, "loss": 0.75834584, "num_input_tokens_seen": 169827590, "step": 7902, "time_per_iteration": 2.67350172996521 }, { "auxiliary_loss_clip": 0.01087876, "auxiliary_loss_mlp": 0.01040789, "balance_loss_clip": 1.04317856, "balance_loss_mlp": 1.02710962, "epoch": 0.47515406583496167, "flos": 20850094903680.0, "grad_norm": 1.7450056007143964, "language_loss": 0.68050694, "learning_rate": 2.257303243526688e-06, "loss": 0.70179355, "num_input_tokens_seen": 169844925, "step": 7903, "time_per_iteration": 2.7626256942749023 }, { "auxiliary_loss_clip": 0.01104723, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.043818, "balance_loss_mlp": 1.02206981, "epoch": 0.47521418908762963, "flos": 17524802448000.0, "grad_norm": 1.9051075920789844, "language_loss": 0.72356462, "learning_rate": 2.256917013453848e-06, "loss": 0.74496931, "num_input_tokens_seen": 169862705, "step": 7904, "time_per_iteration": 2.6790597438812256 }, { "auxiliary_loss_clip": 0.01065198, "auxiliary_loss_mlp": 0.01045369, "balance_loss_clip": 1.03584373, "balance_loss_mlp": 1.02957416, "epoch": 0.4752743123402976, "flos": 20559434048640.0, "grad_norm": 1.6154437659751681, "language_loss": 0.86472631, "learning_rate": 2.25653077363869e-06, "loss": 0.88583207, "num_input_tokens_seen": 169880155, "step": 7905, "time_per_iteration": 2.733799457550049 }, { "auxiliary_loss_clip": 0.0110676, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.04021764, "balance_loss_mlp": 1.02423561, "epoch": 0.47533443559296557, "flos": 26360623071360.0, "grad_norm": 1.7729713006372103, "language_loss": 0.82212102, "learning_rate": 2.2561445240958583e-06, "loss": 0.84355921, "num_input_tokens_seen": 169901525, "step": 7906, "time_per_iteration": 2.6994829177856445 }, { "auxiliary_loss_clip": 0.01029489, "auxiliary_loss_mlp": 0.01023044, "balance_loss_clip": 1.03056157, "balance_loss_mlp": 1.02150619, "epoch": 0.47539455884563353, "flos": 65949660967680.0, "grad_norm": 0.6767545541611142, "language_loss": 0.58947372, "learning_rate": 2.255758264840002e-06, "loss": 0.60999906, "num_input_tokens_seen": 169970345, "step": 7907, "time_per_iteration": 3.409289836883545 }, { "auxiliary_loss_clip": 0.01112328, "auxiliary_loss_mlp": 0.0103978, "balance_loss_clip": 1.04298031, "balance_loss_mlp": 1.02575445, "epoch": 0.4754546820983015, "flos": 17238128002560.0, "grad_norm": 2.5037076646878664, "language_loss": 0.81147426, "learning_rate": 2.255371995885765e-06, "loss": 0.83299541, "num_input_tokens_seen": 169986440, "step": 7908, "time_per_iteration": 2.6126997470855713 }, { "auxiliary_loss_clip": 0.01120375, "auxiliary_loss_mlp": 0.01045183, "balance_loss_clip": 1.04887652, "balance_loss_mlp": 1.03041351, "epoch": 0.47551480535096946, "flos": 19825886499840.0, "grad_norm": 1.7145689882234993, "language_loss": 0.73805857, "learning_rate": 2.254985717247797e-06, "loss": 0.75971419, "num_input_tokens_seen": 170005705, "step": 7909, "time_per_iteration": 2.7153172492980957 }, { "auxiliary_loss_clip": 0.01098915, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.04232681, "balance_loss_mlp": 1.02348399, "epoch": 0.4755749286036375, "flos": 22163958581760.0, "grad_norm": 1.5099683944930966, "language_loss": 0.75533628, "learning_rate": 2.2545994289407457e-06, "loss": 0.77669942, "num_input_tokens_seen": 170023415, "step": 7910, "time_per_iteration": 2.7330431938171387 }, { "auxiliary_loss_clip": 0.01113687, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.04379678, "balance_loss_mlp": 1.01749897, "epoch": 0.47563505185630545, "flos": 21648280976640.0, "grad_norm": 1.931062443356086, "language_loss": 0.79401493, "learning_rate": 2.2542131309792577e-06, "loss": 0.81545365, "num_input_tokens_seen": 170042395, "step": 7911, "time_per_iteration": 2.6149117946624756 }, { "auxiliary_loss_clip": 0.01098041, "auxiliary_loss_mlp": 0.00773063, "balance_loss_clip": 1.04096794, "balance_loss_mlp": 1.00061882, "epoch": 0.4756951751089734, "flos": 20628777254400.0, "grad_norm": 2.2768804327487113, "language_loss": 0.75414324, "learning_rate": 2.253826823377983e-06, "loss": 0.77285427, "num_input_tokens_seen": 170061610, "step": 7912, "time_per_iteration": 2.680414915084839 }, { "auxiliary_loss_clip": 0.01123715, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.04319668, "balance_loss_mlp": 1.02353013, "epoch": 0.4757552983616414, "flos": 25848788221440.0, "grad_norm": 1.4371041113730632, "language_loss": 0.74065906, "learning_rate": 2.253440506151569e-06, "loss": 0.76227093, "num_input_tokens_seen": 170083505, "step": 7913, "time_per_iteration": 2.6565608978271484 }, { "auxiliary_loss_clip": 0.0110748, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.04591024, "balance_loss_mlp": 1.01694882, "epoch": 0.47581542161430934, "flos": 18223013992320.0, "grad_norm": 2.17158702079863, "language_loss": 0.72123522, "learning_rate": 2.253054179314666e-06, "loss": 0.7426281, "num_input_tokens_seen": 170100690, "step": 7914, "time_per_iteration": 2.6789934635162354 }, { "auxiliary_loss_clip": 0.01103912, "auxiliary_loss_mlp": 0.01042984, "balance_loss_clip": 1.04652143, "balance_loss_mlp": 1.02944756, "epoch": 0.4758755448669773, "flos": 21579763783680.0, "grad_norm": 2.3786315570139345, "language_loss": 0.64855683, "learning_rate": 2.2526678428819227e-06, "loss": 0.67002577, "num_input_tokens_seen": 170119240, "step": 7915, "time_per_iteration": 2.65608549118042 }, { "auxiliary_loss_clip": 0.01123163, "auxiliary_loss_mlp": 0.01041838, "balance_loss_clip": 1.04508734, "balance_loss_mlp": 1.02774107, "epoch": 0.47593566811964527, "flos": 15231152511360.0, "grad_norm": 1.7019759484121837, "language_loss": 0.76935744, "learning_rate": 2.2522814968679896e-06, "loss": 0.79100746, "num_input_tokens_seen": 170136450, "step": 7916, "time_per_iteration": 2.585491418838501 }, { "auxiliary_loss_clip": 0.01125392, "auxiliary_loss_mlp": 0.01036553, "balance_loss_clip": 1.04389, "balance_loss_mlp": 1.02302265, "epoch": 0.47599579137231324, "flos": 21543242630400.0, "grad_norm": 2.0866631919048175, "language_loss": 0.63895321, "learning_rate": 2.2518951412875173e-06, "loss": 0.66057259, "num_input_tokens_seen": 170155295, "step": 7917, "time_per_iteration": 2.5544540882110596 }, { "auxiliary_loss_clip": 0.01017258, "auxiliary_loss_mlp": 0.01002335, "balance_loss_clip": 1.01986837, "balance_loss_mlp": 1.00074983, "epoch": 0.4760559146249812, "flos": 64554602595840.0, "grad_norm": 0.8370962757635343, "language_loss": 0.65689212, "learning_rate": 2.2515087761551557e-06, "loss": 0.67708808, "num_input_tokens_seen": 170222325, "step": 7918, "time_per_iteration": 3.4263010025024414 }, { "auxiliary_loss_clip": 0.01114985, "auxiliary_loss_mlp": 0.00771917, "balance_loss_clip": 1.04313397, "balance_loss_mlp": 1.00057673, "epoch": 0.47611603787764917, "flos": 22233876405120.0, "grad_norm": 2.4555452771674067, "language_loss": 0.68450713, "learning_rate": 2.2511224014855563e-06, "loss": 0.70337617, "num_input_tokens_seen": 170241625, "step": 7919, "time_per_iteration": 2.7581801414489746 }, { "auxiliary_loss_clip": 0.01105197, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.04329574, "balance_loss_mlp": 1.02922797, "epoch": 0.47617616113031713, "flos": 22780005765120.0, "grad_norm": 1.6063666097186406, "language_loss": 0.75389183, "learning_rate": 2.2507360172933694e-06, "loss": 0.77537358, "num_input_tokens_seen": 170262470, "step": 7920, "time_per_iteration": 2.7888362407684326 }, { "auxiliary_loss_clip": 0.01109747, "auxiliary_loss_mlp": 0.01034602, "balance_loss_clip": 1.04727352, "balance_loss_mlp": 1.01956415, "epoch": 0.4762362843829851, "flos": 24133802388480.0, "grad_norm": 1.5207523519625543, "language_loss": 0.7761817, "learning_rate": 2.2503496235932487e-06, "loss": 0.79762518, "num_input_tokens_seen": 170283460, "step": 7921, "time_per_iteration": 2.7462785243988037 }, { "auxiliary_loss_clip": 0.01108901, "auxiliary_loss_mlp": 0.01043608, "balance_loss_clip": 1.0445503, "balance_loss_mlp": 1.02778864, "epoch": 0.47629640763565306, "flos": 22452069571200.0, "grad_norm": 3.2907516590332024, "language_loss": 0.78146785, "learning_rate": 2.249963220399845e-06, "loss": 0.80299294, "num_input_tokens_seen": 170304225, "step": 7922, "time_per_iteration": 2.6893417835235596 }, { "auxiliary_loss_clip": 0.01094796, "auxiliary_loss_mlp": 0.01043063, "balance_loss_clip": 1.04391539, "balance_loss_mlp": 1.02719617, "epoch": 0.4763565308883211, "flos": 11181398647680.0, "grad_norm": 1.6628631162014398, "language_loss": 0.7275365, "learning_rate": 2.2495768077278104e-06, "loss": 0.74891508, "num_input_tokens_seen": 170322110, "step": 7923, "time_per_iteration": 2.732468605041504 }, { "auxiliary_loss_clip": 0.01102187, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.04838657, "balance_loss_mlp": 1.02511382, "epoch": 0.47641665414098905, "flos": 22382151747840.0, "grad_norm": 1.679365493038583, "language_loss": 0.82141626, "learning_rate": 2.2491903855917992e-06, "loss": 0.84282875, "num_input_tokens_seen": 170340700, "step": 7924, "time_per_iteration": 2.7680320739746094 }, { "auxiliary_loss_clip": 0.01126329, "auxiliary_loss_mlp": 0.01038575, "balance_loss_clip": 1.0495019, "balance_loss_mlp": 1.02264822, "epoch": 0.476476777393657, "flos": 25046148862080.0, "grad_norm": 2.2679110024074705, "language_loss": 0.80316466, "learning_rate": 2.2488039540064626e-06, "loss": 0.82481372, "num_input_tokens_seen": 170359780, "step": 7925, "time_per_iteration": 2.649615526199341 }, { "auxiliary_loss_clip": 0.01101728, "auxiliary_loss_mlp": 0.01041222, "balance_loss_clip": 1.04264617, "balance_loss_mlp": 1.02741158, "epoch": 0.476536900646325, "flos": 27269916888960.0, "grad_norm": 1.5530829773494035, "language_loss": 0.72051573, "learning_rate": 2.2484175129864558e-06, "loss": 0.74194521, "num_input_tokens_seen": 170381260, "step": 7926, "time_per_iteration": 2.7393877506256104 }, { "auxiliary_loss_clip": 0.0111858, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.04556048, "balance_loss_mlp": 1.02015448, "epoch": 0.47659702389899294, "flos": 25301401885440.0, "grad_norm": 1.973296217359943, "language_loss": 0.68039131, "learning_rate": 2.248031062546432e-06, "loss": 0.70193255, "num_input_tokens_seen": 170400595, "step": 7927, "time_per_iteration": 2.7364554405212402 }, { "auxiliary_loss_clip": 0.01088729, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.04246449, "balance_loss_mlp": 1.01772344, "epoch": 0.4766571471516609, "flos": 25992861672960.0, "grad_norm": 1.624613635266834, "language_loss": 0.67674315, "learning_rate": 2.247644602701045e-06, "loss": 0.69794345, "num_input_tokens_seen": 170421110, "step": 7928, "time_per_iteration": 2.7200751304626465 }, { "auxiliary_loss_clip": 0.01128959, "auxiliary_loss_mlp": 0.0103446, "balance_loss_clip": 1.04645658, "balance_loss_mlp": 1.01979089, "epoch": 0.4767172704043289, "flos": 16032211672320.0, "grad_norm": 2.0796504226810497, "language_loss": 0.78678215, "learning_rate": 2.2472581334649496e-06, "loss": 0.80841631, "num_input_tokens_seen": 170436700, "step": 7929, "time_per_iteration": 2.6817221641540527 }, { "auxiliary_loss_clip": 0.01102478, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.04257607, "balance_loss_mlp": 1.0262301, "epoch": 0.47677739365699684, "flos": 39235351651200.0, "grad_norm": 1.8131309373477071, "language_loss": 0.6663419, "learning_rate": 2.2468716548528016e-06, "loss": 0.68776643, "num_input_tokens_seen": 170459555, "step": 7930, "time_per_iteration": 2.856072187423706 }, { "auxiliary_loss_clip": 0.0111358, "auxiliary_loss_mlp": 0.01036755, "balance_loss_clip": 1.04616833, "balance_loss_mlp": 1.02318919, "epoch": 0.4768375169096648, "flos": 24717781704960.0, "grad_norm": 7.611219304969564, "language_loss": 0.7973817, "learning_rate": 2.2464851668792555e-06, "loss": 0.81888509, "num_input_tokens_seen": 170479175, "step": 7931, "time_per_iteration": 2.646108865737915 }, { "auxiliary_loss_clip": 0.01100642, "auxiliary_loss_mlp": 0.01036826, "balance_loss_clip": 1.04248762, "balance_loss_mlp": 1.02181768, "epoch": 0.47689764016233277, "flos": 22528667324160.0, "grad_norm": 1.747640555146421, "language_loss": 0.76035368, "learning_rate": 2.2460986695589678e-06, "loss": 0.78172839, "num_input_tokens_seen": 170498450, "step": 7932, "time_per_iteration": 2.6632022857666016 }, { "auxiliary_loss_clip": 0.01103619, "auxiliary_loss_mlp": 0.00770594, "balance_loss_clip": 1.04416108, "balance_loss_mlp": 1.00076032, "epoch": 0.47695776341500074, "flos": 15120619384320.0, "grad_norm": 1.7743205398157191, "language_loss": 0.79733002, "learning_rate": 2.245712162906593e-06, "loss": 0.81607223, "num_input_tokens_seen": 170516255, "step": 7933, "time_per_iteration": 4.2387471199035645 }, { "auxiliary_loss_clip": 0.01123015, "auxiliary_loss_mlp": 0.01041506, "balance_loss_clip": 1.04555225, "balance_loss_mlp": 1.02532899, "epoch": 0.4770178866676687, "flos": 14678917839360.0, "grad_norm": 1.9828909232489866, "language_loss": 0.73883361, "learning_rate": 2.2453256469367888e-06, "loss": 0.76047885, "num_input_tokens_seen": 170532705, "step": 7934, "time_per_iteration": 4.074187517166138 }, { "auxiliary_loss_clip": 0.01116756, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.04362082, "balance_loss_mlp": 1.02075577, "epoch": 0.47707800992033667, "flos": 22565583527040.0, "grad_norm": 1.8305920873714958, "language_loss": 0.80197936, "learning_rate": 2.244939121664211e-06, "loss": 0.8234967, "num_input_tokens_seen": 170551925, "step": 7935, "time_per_iteration": 2.650474786758423 }, { "auxiliary_loss_clip": 0.01101181, "auxiliary_loss_mlp": 0.01043502, "balance_loss_clip": 1.04532123, "balance_loss_mlp": 1.02818346, "epoch": 0.4771381331730047, "flos": 30918225375360.0, "grad_norm": 5.908138115579588, "language_loss": 0.71829689, "learning_rate": 2.2445525871035177e-06, "loss": 0.73974371, "num_input_tokens_seen": 170572320, "step": 7936, "time_per_iteration": 4.428630113601685 }, { "auxiliary_loss_clip": 0.01130752, "auxiliary_loss_mlp": 0.01039041, "balance_loss_clip": 1.04646921, "balance_loss_mlp": 1.02419913, "epoch": 0.47719825642567265, "flos": 25738901539200.0, "grad_norm": 2.4038439056994156, "language_loss": 0.675704, "learning_rate": 2.2441660432693656e-06, "loss": 0.69740188, "num_input_tokens_seen": 170589470, "step": 7937, "time_per_iteration": 4.458148241043091 }, { "auxiliary_loss_clip": 0.01034806, "auxiliary_loss_mlp": 0.00999407, "balance_loss_clip": 1.01822138, "balance_loss_mlp": 0.99804842, "epoch": 0.4772583796783406, "flos": 66355128668160.0, "grad_norm": 0.7105047811157361, "language_loss": 0.56384945, "learning_rate": 2.2437794901764128e-06, "loss": 0.58419156, "num_input_tokens_seen": 170662265, "step": 7938, "time_per_iteration": 3.3967578411102295 }, { "auxiliary_loss_clip": 0.01099667, "auxiliary_loss_mlp": 0.0104562, "balance_loss_clip": 1.04193783, "balance_loss_mlp": 1.02908564, "epoch": 0.4773185029310086, "flos": 22051091070720.0, "grad_norm": 3.053079154163393, "language_loss": 0.88725203, "learning_rate": 2.243392927839317e-06, "loss": 0.90870488, "num_input_tokens_seen": 170679680, "step": 7939, "time_per_iteration": 2.7099897861480713 }, { "auxiliary_loss_clip": 0.01115778, "auxiliary_loss_mlp": 0.01037609, "balance_loss_clip": 1.04160845, "balance_loss_mlp": 1.02400148, "epoch": 0.47737862618367655, "flos": 16727801523840.0, "grad_norm": 1.7393189284877646, "language_loss": 0.77381486, "learning_rate": 2.2430063562727367e-06, "loss": 0.79534876, "num_input_tokens_seen": 170697340, "step": 7940, "time_per_iteration": 2.5913469791412354 }, { "auxiliary_loss_clip": 0.01104457, "auxiliary_loss_mlp": 0.01036057, "balance_loss_clip": 1.04589248, "balance_loss_mlp": 1.02288485, "epoch": 0.4774387494363445, "flos": 19609453100160.0, "grad_norm": 1.5893003235088359, "language_loss": 0.8474015, "learning_rate": 2.2426197754913322e-06, "loss": 0.8688066, "num_input_tokens_seen": 170714905, "step": 7941, "time_per_iteration": 2.605090856552124 }, { "auxiliary_loss_clip": 0.0110803, "auxiliary_loss_mlp": 0.01041538, "balance_loss_clip": 1.04433787, "balance_loss_mlp": 1.02682161, "epoch": 0.4774988726890125, "flos": 16653969118080.0, "grad_norm": 2.1303607813841237, "language_loss": 0.75943714, "learning_rate": 2.24223318550976e-06, "loss": 0.78093278, "num_input_tokens_seen": 170731810, "step": 7942, "time_per_iteration": 2.612901449203491 }, { "auxiliary_loss_clip": 0.01115811, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.04779172, "balance_loss_mlp": 1.02491331, "epoch": 0.47755899594168044, "flos": 20485565729280.0, "grad_norm": 1.7564628488897216, "language_loss": 0.6467554, "learning_rate": 2.241846586342682e-06, "loss": 0.66830152, "num_input_tokens_seen": 170750270, "step": 7943, "time_per_iteration": 2.6675846576690674 }, { "auxiliary_loss_clip": 0.01088131, "auxiliary_loss_mlp": 0.01040732, "balance_loss_clip": 1.04014313, "balance_loss_mlp": 1.02544951, "epoch": 0.4776191191943484, "flos": 21652806090240.0, "grad_norm": 3.30514620611564, "language_loss": 0.73474699, "learning_rate": 2.2414599780047577e-06, "loss": 0.75603563, "num_input_tokens_seen": 170769015, "step": 7944, "time_per_iteration": 2.6938626766204834 }, { "auxiliary_loss_clip": 0.01116316, "auxiliary_loss_mlp": 0.01035661, "balance_loss_clip": 1.04835653, "balance_loss_mlp": 1.01982975, "epoch": 0.4776792424470164, "flos": 18770220760320.0, "grad_norm": 2.01255819211095, "language_loss": 0.67873627, "learning_rate": 2.2410733605106456e-06, "loss": 0.70025599, "num_input_tokens_seen": 170785725, "step": 7945, "time_per_iteration": 2.5940043926239014 }, { "auxiliary_loss_clip": 0.0108787, "auxiliary_loss_mlp": 0.00774963, "balance_loss_clip": 1.03865957, "balance_loss_mlp": 1.00055337, "epoch": 0.47773936569968434, "flos": 29715828577920.0, "grad_norm": 1.9730762461064726, "language_loss": 0.75473535, "learning_rate": 2.240686733875009e-06, "loss": 0.77336371, "num_input_tokens_seen": 170804600, "step": 7946, "time_per_iteration": 2.762983560562134 }, { "auxiliary_loss_clip": 0.01105207, "auxiliary_loss_mlp": 0.01042769, "balance_loss_clip": 1.04477096, "balance_loss_mlp": 1.0274632, "epoch": 0.4777994889523523, "flos": 24791542283520.0, "grad_norm": 2.190560640838335, "language_loss": 0.79071236, "learning_rate": 2.240300098112506e-06, "loss": 0.81219208, "num_input_tokens_seen": 170824230, "step": 7947, "time_per_iteration": 2.692763328552246 }, { "auxiliary_loss_clip": 0.010955, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.0440042, "balance_loss_mlp": 1.02317524, "epoch": 0.47785961220502027, "flos": 17858161595520.0, "grad_norm": 2.294285239078615, "language_loss": 0.7329706, "learning_rate": 2.2399134532377998e-06, "loss": 0.75429833, "num_input_tokens_seen": 170843365, "step": 7948, "time_per_iteration": 2.6743998527526855 }, { "auxiliary_loss_clip": 0.01106692, "auxiliary_loss_mlp": 0.01038667, "balance_loss_clip": 1.04329944, "balance_loss_mlp": 1.0235039, "epoch": 0.4779197354576883, "flos": 20266546550400.0, "grad_norm": 1.7991446580624026, "language_loss": 0.78139675, "learning_rate": 2.2395267992655514e-06, "loss": 0.80285037, "num_input_tokens_seen": 170863515, "step": 7949, "time_per_iteration": 2.694549560546875 }, { "auxiliary_loss_clip": 0.01096582, "auxiliary_loss_mlp": 0.0104007, "balance_loss_clip": 1.04018211, "balance_loss_mlp": 1.0263133, "epoch": 0.47797985871035625, "flos": 17056599644160.0, "grad_norm": 2.242348781817659, "language_loss": 0.74315739, "learning_rate": 2.2391401362104227e-06, "loss": 0.76452386, "num_input_tokens_seen": 170881245, "step": 7950, "time_per_iteration": 2.718254327774048 }, { "auxiliary_loss_clip": 0.01095843, "auxiliary_loss_mlp": 0.01046859, "balance_loss_clip": 1.04172587, "balance_loss_mlp": 1.03179109, "epoch": 0.4780399819630242, "flos": 31358418549120.0, "grad_norm": 1.9896022122587003, "language_loss": 0.74343586, "learning_rate": 2.2387534640870756e-06, "loss": 0.7648629, "num_input_tokens_seen": 170901285, "step": 7951, "time_per_iteration": 2.7827391624450684 }, { "auxiliary_loss_clip": 0.01094802, "auxiliary_loss_mlp": 0.01036097, "balance_loss_clip": 1.04424548, "balance_loss_mlp": 1.02120781, "epoch": 0.4781001052156922, "flos": 24899597372160.0, "grad_norm": 2.198904574593956, "language_loss": 0.80032581, "learning_rate": 2.238366782910174e-06, "loss": 0.82163477, "num_input_tokens_seen": 170919740, "step": 7952, "time_per_iteration": 2.812988519668579 }, { "auxiliary_loss_clip": 0.01107213, "auxiliary_loss_mlp": 0.01044893, "balance_loss_clip": 1.04275584, "balance_loss_mlp": 1.03007555, "epoch": 0.47816022846836015, "flos": 18697717157760.0, "grad_norm": 1.8177204893019177, "language_loss": 0.7794894, "learning_rate": 2.23798009269438e-06, "loss": 0.80101049, "num_input_tokens_seen": 170938510, "step": 7953, "time_per_iteration": 2.6617591381073 }, { "auxiliary_loss_clip": 0.01120456, "auxiliary_loss_mlp": 0.01038164, "balance_loss_clip": 1.04588997, "balance_loss_mlp": 1.0237813, "epoch": 0.4782203517210281, "flos": 11977573559040.0, "grad_norm": 2.347215604083738, "language_loss": 0.84714645, "learning_rate": 2.2375933934543566e-06, "loss": 0.86873269, "num_input_tokens_seen": 170951170, "step": 7954, "time_per_iteration": 2.6208479404449463 }, { "auxiliary_loss_clip": 0.01097068, "auxiliary_loss_mlp": 0.01038972, "balance_loss_clip": 1.0426054, "balance_loss_mlp": 1.0248698, "epoch": 0.4782804749736961, "flos": 20813501923200.0, "grad_norm": 1.4277916214046864, "language_loss": 0.70472121, "learning_rate": 2.237206685204768e-06, "loss": 0.72608161, "num_input_tokens_seen": 170970990, "step": 7955, "time_per_iteration": 2.821913719177246 }, { "auxiliary_loss_clip": 0.0110203, "auxiliary_loss_mlp": 0.01041668, "balance_loss_clip": 1.04433143, "balance_loss_mlp": 1.0281322, "epoch": 0.47834059822636404, "flos": 23840304359040.0, "grad_norm": 1.5047634516845327, "language_loss": 0.82269239, "learning_rate": 2.2368199679602787e-06, "loss": 0.84412932, "num_input_tokens_seen": 170991215, "step": 7956, "time_per_iteration": 2.683924913406372 }, { "auxiliary_loss_clip": 0.01105668, "auxiliary_loss_mlp": 0.01036371, "balance_loss_clip": 1.04529083, "balance_loss_mlp": 1.02021837, "epoch": 0.478400721479032, "flos": 22633777497600.0, "grad_norm": 2.448858103633137, "language_loss": 0.84977531, "learning_rate": 2.2364332417355516e-06, "loss": 0.87119567, "num_input_tokens_seen": 171007325, "step": 7957, "time_per_iteration": 2.6371145248413086 }, { "auxiliary_loss_clip": 0.01118227, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.04562736, "balance_loss_mlp": 1.02653635, "epoch": 0.4784608447317, "flos": 19354954262400.0, "grad_norm": 1.5628888100251457, "language_loss": 0.79777038, "learning_rate": 2.2360465065452527e-06, "loss": 0.81935579, "num_input_tokens_seen": 171025650, "step": 7958, "time_per_iteration": 2.639721632003784 }, { "auxiliary_loss_clip": 0.01085054, "auxiliary_loss_mlp": 0.0077548, "balance_loss_clip": 1.03763032, "balance_loss_mlp": 1.00064015, "epoch": 0.47852096798436794, "flos": 24021114445440.0, "grad_norm": 1.8018566992199279, "language_loss": 0.82972836, "learning_rate": 2.235659762404047e-06, "loss": 0.84833372, "num_input_tokens_seen": 171045045, "step": 7959, "time_per_iteration": 2.733668565750122 }, { "auxiliary_loss_clip": 0.01090487, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04364586, "balance_loss_mlp": 1.02436292, "epoch": 0.4785810912370359, "flos": 25666433850240.0, "grad_norm": 2.7562627438628504, "language_loss": 0.73275614, "learning_rate": 2.235273009326599e-06, "loss": 0.75402862, "num_input_tokens_seen": 171062910, "step": 7960, "time_per_iteration": 2.6994166374206543 }, { "auxiliary_loss_clip": 0.0109086, "auxiliary_loss_mlp": 0.0103472, "balance_loss_clip": 1.04504585, "balance_loss_mlp": 1.02170801, "epoch": 0.47864121448970387, "flos": 21432134885760.0, "grad_norm": 1.6649690841938434, "language_loss": 0.76878142, "learning_rate": 2.2348862473275745e-06, "loss": 0.79003716, "num_input_tokens_seen": 171080875, "step": 7961, "time_per_iteration": 2.7051572799682617 }, { "auxiliary_loss_clip": 0.01087757, "auxiliary_loss_mlp": 0.0103463, "balance_loss_clip": 1.04447055, "balance_loss_mlp": 1.02050352, "epoch": 0.47870133774237184, "flos": 16143894034560.0, "grad_norm": 7.35679067145723, "language_loss": 0.7769649, "learning_rate": 2.2344994764216405e-06, "loss": 0.79818881, "num_input_tokens_seen": 171099190, "step": 7962, "time_per_iteration": 2.7466347217559814 }, { "auxiliary_loss_clip": 0.0110573, "auxiliary_loss_mlp": 0.01042926, "balance_loss_clip": 1.04702401, "balance_loss_mlp": 1.02871001, "epoch": 0.47876146099503986, "flos": 26906788344960.0, "grad_norm": 1.6387698321198922, "language_loss": 0.64764994, "learning_rate": 2.2341126966234635e-06, "loss": 0.66913652, "num_input_tokens_seen": 171119060, "step": 7963, "time_per_iteration": 2.77663516998291 }, { "auxiliary_loss_clip": 0.01117113, "auxiliary_loss_mlp": 0.01035904, "balance_loss_clip": 1.04389668, "balance_loss_mlp": 1.02196217, "epoch": 0.4788215842477078, "flos": 45332085778560.0, "grad_norm": 1.655648847764305, "language_loss": 0.77503848, "learning_rate": 2.2337259079477083e-06, "loss": 0.79656863, "num_input_tokens_seen": 171141900, "step": 7964, "time_per_iteration": 2.9196712970733643 }, { "auxiliary_loss_clip": 0.01120902, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.04660964, "balance_loss_mlp": 1.02042508, "epoch": 0.4788817075003758, "flos": 22237180456320.0, "grad_norm": 2.8996801764774505, "language_loss": 0.76540697, "learning_rate": 2.233339110409044e-06, "loss": 0.78698087, "num_input_tokens_seen": 171161045, "step": 7965, "time_per_iteration": 2.6720781326293945 }, { "auxiliary_loss_clip": 0.0106828, "auxiliary_loss_mlp": 0.0105005, "balance_loss_clip": 1.03929722, "balance_loss_mlp": 1.03433788, "epoch": 0.47894183075304375, "flos": 16471183783680.0, "grad_norm": 1.712219755604538, "language_loss": 0.74560332, "learning_rate": 2.232952304022137e-06, "loss": 0.76678663, "num_input_tokens_seen": 171179675, "step": 7966, "time_per_iteration": 2.7669286727905273 }, { "auxiliary_loss_clip": 0.01101486, "auxiliary_loss_mlp": 0.0103808, "balance_loss_clip": 1.0444622, "balance_loss_mlp": 1.02388787, "epoch": 0.4790019540057117, "flos": 24282688262400.0, "grad_norm": 2.605899190258409, "language_loss": 0.73308432, "learning_rate": 2.232565488801655e-06, "loss": 0.75448, "num_input_tokens_seen": 171201175, "step": 7967, "time_per_iteration": 2.7271900177001953 }, { "auxiliary_loss_clip": 0.01102984, "auxiliary_loss_mlp": 0.01032784, "balance_loss_clip": 1.04409146, "balance_loss_mlp": 1.01838326, "epoch": 0.4790620772583797, "flos": 25666469763840.0, "grad_norm": 2.103515425969552, "language_loss": 0.79279423, "learning_rate": 2.232178664762267e-06, "loss": 0.81415194, "num_input_tokens_seen": 171221750, "step": 7968, "time_per_iteration": 2.707740545272827 }, { "auxiliary_loss_clip": 0.0102077, "auxiliary_loss_mlp": 0.01020427, "balance_loss_clip": 1.02207994, "balance_loss_mlp": 1.01903248, "epoch": 0.47912220051104765, "flos": 69428077102080.0, "grad_norm": 0.7660555925772923, "language_loss": 0.62198806, "learning_rate": 2.2317918319186408e-06, "loss": 0.64240003, "num_input_tokens_seen": 171292235, "step": 7969, "time_per_iteration": 3.3662569522857666 }, { "auxiliary_loss_clip": 0.01087594, "auxiliary_loss_mlp": 0.01029477, "balance_loss_clip": 1.04418397, "balance_loss_mlp": 1.01662636, "epoch": 0.4791823237637156, "flos": 24168922911360.0, "grad_norm": 1.7596129166374368, "language_loss": 0.77306086, "learning_rate": 2.2314049902854446e-06, "loss": 0.79423159, "num_input_tokens_seen": 171312215, "step": 7970, "time_per_iteration": 2.69364857673645 }, { "auxiliary_loss_clip": 0.01116664, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.04511642, "balance_loss_mlp": 1.0235939, "epoch": 0.4792424470163836, "flos": 24751465683840.0, "grad_norm": 1.5706742055007812, "language_loss": 0.70431626, "learning_rate": 2.231018139877349e-06, "loss": 0.72586453, "num_input_tokens_seen": 171332975, "step": 7971, "time_per_iteration": 2.690791130065918 }, { "auxiliary_loss_clip": 0.01072275, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.03982508, "balance_loss_mlp": 1.01899683, "epoch": 0.47930257026905154, "flos": 23257905240960.0, "grad_norm": 1.30993945009872, "language_loss": 0.79995155, "learning_rate": 2.230631280709021e-06, "loss": 0.82101291, "num_input_tokens_seen": 171353880, "step": 7972, "time_per_iteration": 2.829455852508545 }, { "auxiliary_loss_clip": 0.0111891, "auxiliary_loss_mlp": 0.01028077, "balance_loss_clip": 1.0466361, "balance_loss_mlp": 1.01299727, "epoch": 0.4793626935217195, "flos": 14064091718400.0, "grad_norm": 2.2411370214837807, "language_loss": 0.69401908, "learning_rate": 2.2302444127951327e-06, "loss": 0.71548891, "num_input_tokens_seen": 171370930, "step": 7973, "time_per_iteration": 4.2368669509887695 }, { "auxiliary_loss_clip": 0.01120125, "auxiliary_loss_mlp": 0.01039183, "balance_loss_clip": 1.05002046, "balance_loss_mlp": 1.02575445, "epoch": 0.4794228167743875, "flos": 21798854789760.0, "grad_norm": 1.967830357691446, "language_loss": 0.78792048, "learning_rate": 2.2298575361503523e-06, "loss": 0.80951357, "num_input_tokens_seen": 171387575, "step": 7974, "time_per_iteration": 2.666619300842285 }, { "auxiliary_loss_clip": 0.01029245, "auxiliary_loss_mlp": 0.01003452, "balance_loss_clip": 1.02188838, "balance_loss_mlp": 1.00173593, "epoch": 0.47948294002705544, "flos": 66968805553920.0, "grad_norm": 0.7538441683533625, "language_loss": 0.54051983, "learning_rate": 2.2294706507893517e-06, "loss": 0.56084681, "num_input_tokens_seen": 171449980, "step": 7975, "time_per_iteration": 4.964555501937866 }, { "auxiliary_loss_clip": 0.01114672, "auxiliary_loss_mlp": 0.01039108, "balance_loss_clip": 1.04530835, "balance_loss_mlp": 1.02287221, "epoch": 0.47954306327972346, "flos": 12422471414400.0, "grad_norm": 2.0524308160251707, "language_loss": 0.89917016, "learning_rate": 2.2290837567268008e-06, "loss": 0.92070794, "num_input_tokens_seen": 171465290, "step": 7976, "time_per_iteration": 4.202557802200317 }, { "auxiliary_loss_clip": 0.01135185, "auxiliary_loss_mlp": 0.01039598, "balance_loss_clip": 1.05056477, "balance_loss_mlp": 1.02431524, "epoch": 0.4796031865323914, "flos": 18361951799040.0, "grad_norm": 2.222330138734667, "language_loss": 0.73720783, "learning_rate": 2.2286968539773713e-06, "loss": 0.75895566, "num_input_tokens_seen": 171481130, "step": 7977, "time_per_iteration": 2.653036117553711 }, { "auxiliary_loss_clip": 0.01112997, "auxiliary_loss_mlp": 0.00772063, "balance_loss_clip": 1.0468123, "balance_loss_mlp": 1.00047266, "epoch": 0.4796633097850594, "flos": 21835088634240.0, "grad_norm": 1.5823767711410588, "language_loss": 0.78372079, "learning_rate": 2.228309942555734e-06, "loss": 0.80257142, "num_input_tokens_seen": 171501140, "step": 7978, "time_per_iteration": 2.7036852836608887 }, { "auxiliary_loss_clip": 0.01106382, "auxiliary_loss_mlp": 0.01039525, "balance_loss_clip": 1.04519784, "balance_loss_mlp": 1.02526784, "epoch": 0.47972343303772735, "flos": 23437350610560.0, "grad_norm": 2.6635738232298944, "language_loss": 0.89488423, "learning_rate": 2.22792302247656e-06, "loss": 0.91634321, "num_input_tokens_seen": 171519835, "step": 7979, "time_per_iteration": 2.653221845626831 }, { "auxiliary_loss_clip": 0.01122392, "auxiliary_loss_mlp": 0.01040662, "balance_loss_clip": 1.04798067, "balance_loss_mlp": 1.02475905, "epoch": 0.4797835562903953, "flos": 24899776940160.0, "grad_norm": 1.5901773617653536, "language_loss": 0.76710582, "learning_rate": 2.227536093754523e-06, "loss": 0.78873634, "num_input_tokens_seen": 171540980, "step": 7980, "time_per_iteration": 2.6700520515441895 }, { "auxiliary_loss_clip": 0.01103639, "auxiliary_loss_mlp": 0.01039114, "balance_loss_clip": 1.04525447, "balance_loss_mlp": 1.02261567, "epoch": 0.4798436795430633, "flos": 35042996793600.0, "grad_norm": 1.9068781398056245, "language_loss": 0.7128244, "learning_rate": 2.227149156404295e-06, "loss": 0.73425198, "num_input_tokens_seen": 171563600, "step": 7981, "time_per_iteration": 2.817458391189575 }, { "auxiliary_loss_clip": 0.01130721, "auxiliary_loss_mlp": 0.01034361, "balance_loss_clip": 1.05059981, "balance_loss_mlp": 1.02040792, "epoch": 0.47990380279573125, "flos": 20590209025920.0, "grad_norm": 2.189836625005686, "language_loss": 0.70604527, "learning_rate": 2.2267622104405473e-06, "loss": 0.72769606, "num_input_tokens_seen": 171580700, "step": 7982, "time_per_iteration": 2.639772891998291 }, { "auxiliary_loss_clip": 0.01101365, "auxiliary_loss_mlp": 0.01031884, "balance_loss_clip": 1.04456162, "balance_loss_mlp": 1.01928937, "epoch": 0.4799639260483992, "flos": 26359402008960.0, "grad_norm": 6.366705109750511, "language_loss": 0.71019757, "learning_rate": 2.2263752558779544e-06, "loss": 0.73153007, "num_input_tokens_seen": 171602035, "step": 7983, "time_per_iteration": 2.7794454097747803 }, { "auxiliary_loss_clip": 0.01038182, "auxiliary_loss_mlp": 0.00752618, "balance_loss_clip": 1.0209136, "balance_loss_mlp": 1.00015247, "epoch": 0.4800240493010672, "flos": 70979021521920.0, "grad_norm": 0.8025064053403466, "language_loss": 0.59461898, "learning_rate": 2.2259882927311883e-06, "loss": 0.61252695, "num_input_tokens_seen": 171659215, "step": 7984, "time_per_iteration": 3.1715712547302246 }, { "auxiliary_loss_clip": 0.01068728, "auxiliary_loss_mlp": 0.01050145, "balance_loss_clip": 1.03732944, "balance_loss_mlp": 1.03350329, "epoch": 0.48008417255373514, "flos": 17086656349440.0, "grad_norm": 1.9659657952718743, "language_loss": 0.66784835, "learning_rate": 2.2256013210149247e-06, "loss": 0.68903708, "num_input_tokens_seen": 171675710, "step": 7985, "time_per_iteration": 2.8482425212860107 }, { "auxiliary_loss_clip": 0.01105712, "auxiliary_loss_mlp": 0.010384, "balance_loss_clip": 1.04168916, "balance_loss_mlp": 1.02367198, "epoch": 0.4801442958064031, "flos": 15413435055360.0, "grad_norm": 1.731655766205416, "language_loss": 0.69907761, "learning_rate": 2.225214340743835e-06, "loss": 0.72051871, "num_input_tokens_seen": 171692510, "step": 7986, "time_per_iteration": 2.78254771232605 }, { "auxiliary_loss_clip": 0.01094439, "auxiliary_loss_mlp": 0.0104069, "balance_loss_clip": 1.04537976, "balance_loss_mlp": 1.02534223, "epoch": 0.4802044190590711, "flos": 11473747441920.0, "grad_norm": 2.3008677930118844, "language_loss": 0.78930938, "learning_rate": 2.2248273519325956e-06, "loss": 0.81066066, "num_input_tokens_seen": 171710235, "step": 7987, "time_per_iteration": 2.8055880069732666 }, { "auxiliary_loss_clip": 0.01076423, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 1.04216504, "balance_loss_mlp": 1.02793634, "epoch": 0.48026454231173904, "flos": 20951003185920.0, "grad_norm": 2.0041399034857537, "language_loss": 0.75381374, "learning_rate": 2.2244403545958812e-06, "loss": 0.77499998, "num_input_tokens_seen": 171726715, "step": 7988, "time_per_iteration": 2.7931642532348633 }, { "auxiliary_loss_clip": 0.01099185, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.04829884, "balance_loss_mlp": 1.01920891, "epoch": 0.48032466556440706, "flos": 20448110822400.0, "grad_norm": 2.2052350267481984, "language_loss": 0.79056877, "learning_rate": 2.224053348748365e-06, "loss": 0.81189418, "num_input_tokens_seen": 171743605, "step": 7989, "time_per_iteration": 2.7195966243743896 }, { "auxiliary_loss_clip": 0.01109361, "auxiliary_loss_mlp": 0.01046506, "balance_loss_clip": 1.04376316, "balance_loss_mlp": 1.03094292, "epoch": 0.480384788817075, "flos": 37120823861760.0, "grad_norm": 1.9525154549019321, "language_loss": 0.73684812, "learning_rate": 2.223666334404724e-06, "loss": 0.75840676, "num_input_tokens_seen": 171765445, "step": 7990, "time_per_iteration": 2.8826913833618164 }, { "auxiliary_loss_clip": 0.01039921, "auxiliary_loss_mlp": 0.00752733, "balance_loss_clip": 1.02231336, "balance_loss_mlp": 1.00023639, "epoch": 0.480444912069743, "flos": 69552577641600.0, "grad_norm": 0.7651324576674445, "language_loss": 0.59016085, "learning_rate": 2.223279311579633e-06, "loss": 0.60808742, "num_input_tokens_seen": 171830115, "step": 7991, "time_per_iteration": 3.325892448425293 }, { "auxiliary_loss_clip": 0.01119355, "auxiliary_loss_mlp": 0.00772289, "balance_loss_clip": 1.04751837, "balance_loss_mlp": 1.00058734, "epoch": 0.48050503532241096, "flos": 29822231640960.0, "grad_norm": 2.03548436048953, "language_loss": 0.67551184, "learning_rate": 2.222892280287768e-06, "loss": 0.69442832, "num_input_tokens_seen": 171849135, "step": 7992, "time_per_iteration": 2.7717204093933105 }, { "auxiliary_loss_clip": 0.01102719, "auxiliary_loss_mlp": 0.01037971, "balance_loss_clip": 1.04047358, "balance_loss_mlp": 1.02267683, "epoch": 0.4805651585750789, "flos": 23948539015680.0, "grad_norm": 1.7261557593206558, "language_loss": 0.76166683, "learning_rate": 2.2225052405438056e-06, "loss": 0.78307372, "num_input_tokens_seen": 171868880, "step": 7993, "time_per_iteration": 2.739190101623535 }, { "auxiliary_loss_clip": 0.01080291, "auxiliary_loss_mlp": 0.01038498, "balance_loss_clip": 1.04301596, "balance_loss_mlp": 1.02469933, "epoch": 0.4806252818277469, "flos": 25665428269440.0, "grad_norm": 1.8324818551458955, "language_loss": 0.79029763, "learning_rate": 2.222118192362422e-06, "loss": 0.81148541, "num_input_tokens_seen": 171889455, "step": 7994, "time_per_iteration": 2.775120973587036 }, { "auxiliary_loss_clip": 0.01107812, "auxiliary_loss_mlp": 0.0103252, "balance_loss_clip": 1.04342794, "balance_loss_mlp": 1.01851845, "epoch": 0.48068540508041485, "flos": 13151996640000.0, "grad_norm": 2.168964016546684, "language_loss": 0.79452056, "learning_rate": 2.2217311357582946e-06, "loss": 0.81592381, "num_input_tokens_seen": 171906070, "step": 7995, "time_per_iteration": 2.684086561203003 }, { "auxiliary_loss_clip": 0.01071477, "auxiliary_loss_mlp": 0.01034963, "balance_loss_clip": 1.04075575, "balance_loss_mlp": 1.02081871, "epoch": 0.4807455283330828, "flos": 21176738208000.0, "grad_norm": 1.4272883159105954, "language_loss": 0.82732481, "learning_rate": 2.2213440707461e-06, "loss": 0.84838915, "num_input_tokens_seen": 171926515, "step": 7996, "time_per_iteration": 2.801893711090088 }, { "auxiliary_loss_clip": 0.0105538, "auxiliary_loss_mlp": 0.01038724, "balance_loss_clip": 1.03635919, "balance_loss_mlp": 1.02432358, "epoch": 0.4808056515857508, "flos": 12275991751680.0, "grad_norm": 1.7665973767451764, "language_loss": 0.81008822, "learning_rate": 2.220956997340516e-06, "loss": 0.8310293, "num_input_tokens_seen": 171943845, "step": 7997, "time_per_iteration": 2.7309181690216064 }, { "auxiliary_loss_clip": 0.01079437, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.04144287, "balance_loss_mlp": 1.0246973, "epoch": 0.48086577483841875, "flos": 24826052275200.0, "grad_norm": 4.4511101438837555, "language_loss": 0.7285195, "learning_rate": 2.220569915556221e-06, "loss": 0.74970436, "num_input_tokens_seen": 171964970, "step": 7998, "time_per_iteration": 2.793765068054199 }, { "auxiliary_loss_clip": 0.01129175, "auxiliary_loss_mlp": 0.01042213, "balance_loss_clip": 1.04769647, "balance_loss_mlp": 1.02756786, "epoch": 0.4809258980910867, "flos": 24465365856000.0, "grad_norm": 1.6928626075088686, "language_loss": 0.71266204, "learning_rate": 2.220182825407892e-06, "loss": 0.73437595, "num_input_tokens_seen": 171986340, "step": 7999, "time_per_iteration": 2.698373556137085 }, { "auxiliary_loss_clip": 0.01120573, "auxiliary_loss_mlp": 0.01049678, "balance_loss_clip": 1.04650939, "balance_loss_mlp": 1.035707, "epoch": 0.4809860213437547, "flos": 21215952881280.0, "grad_norm": 3.5525090623309525, "language_loss": 0.71445537, "learning_rate": 2.2197957269102083e-06, "loss": 0.73615789, "num_input_tokens_seen": 172007300, "step": 8000, "time_per_iteration": 2.677906036376953 }, { "auxiliary_loss_clip": 0.01120936, "auxiliary_loss_mlp": 0.01045001, "balance_loss_clip": 1.04962945, "balance_loss_mlp": 1.03024244, "epoch": 0.48104614459642264, "flos": 37632084094080.0, "grad_norm": 1.397364252260559, "language_loss": 0.75031364, "learning_rate": 2.2194086200778485e-06, "loss": 0.77197301, "num_input_tokens_seen": 172029585, "step": 8001, "time_per_iteration": 2.8079638481140137 }, { "auxiliary_loss_clip": 0.01120097, "auxiliary_loss_mlp": 0.01045878, "balance_loss_clip": 1.04740191, "balance_loss_mlp": 1.03150105, "epoch": 0.48110626784909066, "flos": 18406122549120.0, "grad_norm": 1.760961408245497, "language_loss": 0.8157444, "learning_rate": 2.219021504925493e-06, "loss": 0.83740413, "num_input_tokens_seen": 172047495, "step": 8002, "time_per_iteration": 2.6615140438079834 }, { "auxiliary_loss_clip": 0.01127724, "auxiliary_loss_mlp": 0.01043569, "balance_loss_clip": 1.05275476, "balance_loss_mlp": 1.02780938, "epoch": 0.48116639110175863, "flos": 28439814856320.0, "grad_norm": 1.7356718355873448, "language_loss": 0.71858382, "learning_rate": 2.218634381467819e-06, "loss": 0.74029678, "num_input_tokens_seen": 172067625, "step": 8003, "time_per_iteration": 2.7304186820983887 }, { "auxiliary_loss_clip": 0.01114781, "auxiliary_loss_mlp": 0.01040333, "balance_loss_clip": 1.04751146, "balance_loss_mlp": 1.02654088, "epoch": 0.4812265143544266, "flos": 21725237865600.0, "grad_norm": 1.7533221004579713, "language_loss": 0.82598346, "learning_rate": 2.218247249719507e-06, "loss": 0.84753454, "num_input_tokens_seen": 172087885, "step": 8004, "time_per_iteration": 2.718576192855835 }, { "auxiliary_loss_clip": 0.01110853, "auxiliary_loss_mlp": 0.01042863, "balance_loss_clip": 1.04705787, "balance_loss_mlp": 1.02601874, "epoch": 0.48128663760709456, "flos": 13224679810560.0, "grad_norm": 2.3721289724239587, "language_loss": 0.77786469, "learning_rate": 2.217860109695239e-06, "loss": 0.79940188, "num_input_tokens_seen": 172105815, "step": 8005, "time_per_iteration": 2.7602009773254395 }, { "auxiliary_loss_clip": 0.01116298, "auxiliary_loss_mlp": 0.01040763, "balance_loss_clip": 1.04861951, "balance_loss_mlp": 1.02662444, "epoch": 0.4813467608597625, "flos": 24243437675520.0, "grad_norm": 1.8330364183017236, "language_loss": 0.70666707, "learning_rate": 2.217472961409692e-06, "loss": 0.72823763, "num_input_tokens_seen": 172126125, "step": 8006, "time_per_iteration": 2.7916948795318604 }, { "auxiliary_loss_clip": 0.01101733, "auxiliary_loss_mlp": 0.01039864, "balance_loss_clip": 1.04409337, "balance_loss_mlp": 1.02521324, "epoch": 0.4814068841124305, "flos": 27480424544640.0, "grad_norm": 1.7951056960252978, "language_loss": 0.70724428, "learning_rate": 2.2170858048775495e-06, "loss": 0.72866029, "num_input_tokens_seen": 172141945, "step": 8007, "time_per_iteration": 2.7661349773406982 }, { "auxiliary_loss_clip": 0.01130133, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.0476191, "balance_loss_mlp": 1.01881254, "epoch": 0.48146700736509845, "flos": 19572896033280.0, "grad_norm": 11.665968104344772, "language_loss": 0.71553946, "learning_rate": 2.2166986401134914e-06, "loss": 0.73716843, "num_input_tokens_seen": 172161095, "step": 8008, "time_per_iteration": 2.7019124031066895 }, { "auxiliary_loss_clip": 0.01096611, "auxiliary_loss_mlp": 0.01050794, "balance_loss_clip": 1.04696894, "balance_loss_mlp": 1.03467739, "epoch": 0.4815271306177664, "flos": 20627771673600.0, "grad_norm": 2.289909942865545, "language_loss": 0.60779428, "learning_rate": 2.216311467132199e-06, "loss": 0.62926841, "num_input_tokens_seen": 172178750, "step": 8009, "time_per_iteration": 2.713092088699341 }, { "auxiliary_loss_clip": 0.01022233, "auxiliary_loss_mlp": 0.01005627, "balance_loss_clip": 1.02350807, "balance_loss_mlp": 1.00431013, "epoch": 0.4815872538704344, "flos": 67691076232320.0, "grad_norm": 0.8584252589427176, "language_loss": 0.61326265, "learning_rate": 2.2159242859483547e-06, "loss": 0.63354123, "num_input_tokens_seen": 172240235, "step": 8010, "time_per_iteration": 3.2182729244232178 }, { "auxiliary_loss_clip": 0.01123367, "auxiliary_loss_mlp": 0.01044563, "balance_loss_clip": 1.0506475, "balance_loss_mlp": 1.02956653, "epoch": 0.48164737712310235, "flos": 22820764723200.0, "grad_norm": 1.7901877328371896, "language_loss": 0.73432398, "learning_rate": 2.215537096576639e-06, "loss": 0.75600326, "num_input_tokens_seen": 172259875, "step": 8011, "time_per_iteration": 2.671487331390381 }, { "auxiliary_loss_clip": 0.01103596, "auxiliary_loss_mlp": 0.01035148, "balance_loss_clip": 1.04422355, "balance_loss_mlp": 1.02199948, "epoch": 0.4817075003757703, "flos": 23733865382400.0, "grad_norm": 1.7774743588215727, "language_loss": 0.79526579, "learning_rate": 2.2151498990317354e-06, "loss": 0.81665325, "num_input_tokens_seen": 172280150, "step": 8012, "time_per_iteration": 5.769195079803467 }, { "auxiliary_loss_clip": 0.01092738, "auxiliary_loss_mlp": 0.01042222, "balance_loss_clip": 1.04738641, "balance_loss_mlp": 1.02718425, "epoch": 0.4817676236284383, "flos": 28182909807360.0, "grad_norm": 1.8494845342416013, "language_loss": 0.73714077, "learning_rate": 2.214762693328326e-06, "loss": 0.75849032, "num_input_tokens_seen": 172300810, "step": 8013, "time_per_iteration": 2.77451491355896 }, { "auxiliary_loss_clip": 0.01105203, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.05056131, "balance_loss_mlp": 1.02266848, "epoch": 0.48182774688110624, "flos": 17091756080640.0, "grad_norm": 2.3240899529345858, "language_loss": 0.90755451, "learning_rate": 2.214375479481094e-06, "loss": 0.92896926, "num_input_tokens_seen": 172317930, "step": 8014, "time_per_iteration": 4.2677695751190186 }, { "auxiliary_loss_clip": 0.0113526, "auxiliary_loss_mlp": 0.01039779, "balance_loss_clip": 1.04945207, "balance_loss_mlp": 1.02497888, "epoch": 0.4818878701337742, "flos": 12567873669120.0, "grad_norm": 3.070306284191698, "language_loss": 0.7404421, "learning_rate": 2.213988257504722e-06, "loss": 0.76219249, "num_input_tokens_seen": 172336340, "step": 8015, "time_per_iteration": 4.188862085342407 }, { "auxiliary_loss_clip": 0.01113922, "auxiliary_loss_mlp": 0.01040149, "balance_loss_clip": 1.04792023, "balance_loss_mlp": 1.02514613, "epoch": 0.48194799338644223, "flos": 24608505553920.0, "grad_norm": 2.1594847398910164, "language_loss": 0.80143541, "learning_rate": 2.213601027413894e-06, "loss": 0.82297611, "num_input_tokens_seen": 172354315, "step": 8016, "time_per_iteration": 2.745352268218994 }, { "auxiliary_loss_clip": 0.01115904, "auxiliary_loss_mlp": 0.010317, "balance_loss_clip": 1.04995775, "balance_loss_mlp": 1.01803231, "epoch": 0.4820081166391102, "flos": 21105204272640.0, "grad_norm": 1.9897571760317019, "language_loss": 0.77120233, "learning_rate": 2.2132137892232933e-06, "loss": 0.79267836, "num_input_tokens_seen": 172372695, "step": 8017, "time_per_iteration": 2.7234907150268555 }, { "auxiliary_loss_clip": 0.01117431, "auxiliary_loss_mlp": 0.01033072, "balance_loss_clip": 1.05067015, "balance_loss_mlp": 1.01848102, "epoch": 0.48206823989177816, "flos": 25264593423360.0, "grad_norm": 2.391907623354337, "language_loss": 0.80211884, "learning_rate": 2.2128265429476043e-06, "loss": 0.8236239, "num_input_tokens_seen": 172390905, "step": 8018, "time_per_iteration": 2.805011749267578 }, { "auxiliary_loss_clip": 0.01113573, "auxiliary_loss_mlp": 0.01031362, "balance_loss_clip": 1.05918038, "balance_loss_mlp": 1.01767111, "epoch": 0.4821283631444461, "flos": 24645062620800.0, "grad_norm": 1.818966225047076, "language_loss": 0.75859058, "learning_rate": 2.2124392886015124e-06, "loss": 0.78003991, "num_input_tokens_seen": 172412295, "step": 8019, "time_per_iteration": 2.767993688583374 }, { "auxiliary_loss_clip": 0.01092977, "auxiliary_loss_mlp": 0.01036734, "balance_loss_clip": 1.04580545, "balance_loss_mlp": 1.02204108, "epoch": 0.4821884863971141, "flos": 23952094462080.0, "grad_norm": 1.8745546244507358, "language_loss": 0.7907865, "learning_rate": 2.212052026199701e-06, "loss": 0.8120836, "num_input_tokens_seen": 172432625, "step": 8020, "time_per_iteration": 2.708779811859131 }, { "auxiliary_loss_clip": 0.01127117, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.04847205, "balance_loss_mlp": 1.02219176, "epoch": 0.48224860964978206, "flos": 17160668323200.0, "grad_norm": 2.712415162483374, "language_loss": 0.69893312, "learning_rate": 2.211664755756855e-06, "loss": 0.72057003, "num_input_tokens_seen": 172450010, "step": 8021, "time_per_iteration": 2.6083900928497314 }, { "auxiliary_loss_clip": 0.01102125, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.04406881, "balance_loss_mlp": 1.01672244, "epoch": 0.48230873290245, "flos": 23075838178560.0, "grad_norm": 1.7410194963021717, "language_loss": 0.62778926, "learning_rate": 2.2112774772876603e-06, "loss": 0.6491257, "num_input_tokens_seen": 172469080, "step": 8022, "time_per_iteration": 2.677368640899658 }, { "auxiliary_loss_clip": 0.01108316, "auxiliary_loss_mlp": 0.00770954, "balance_loss_clip": 1.04996586, "balance_loss_mlp": 1.00044918, "epoch": 0.482368856155118, "flos": 19353517718400.0, "grad_norm": 2.505400955117215, "language_loss": 0.66446078, "learning_rate": 2.2108901908068028e-06, "loss": 0.68325341, "num_input_tokens_seen": 172484850, "step": 8023, "time_per_iteration": 2.6412739753723145 }, { "auxiliary_loss_clip": 0.01054811, "auxiliary_loss_mlp": 0.0104073, "balance_loss_clip": 1.03875041, "balance_loss_mlp": 1.02531052, "epoch": 0.48242897940778595, "flos": 20078984707200.0, "grad_norm": 1.7010312143912936, "language_loss": 0.76777267, "learning_rate": 2.2105028963289683e-06, "loss": 0.78872806, "num_input_tokens_seen": 172503525, "step": 8024, "time_per_iteration": 2.858891010284424 }, { "auxiliary_loss_clip": 0.01109606, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.04908574, "balance_loss_mlp": 1.02432442, "epoch": 0.4824891026604539, "flos": 23403989854080.0, "grad_norm": 1.4778625856906076, "language_loss": 0.75417542, "learning_rate": 2.2101155938688423e-06, "loss": 0.77566242, "num_input_tokens_seen": 172524360, "step": 8025, "time_per_iteration": 2.6743719577789307 }, { "auxiliary_loss_clip": 0.01129031, "auxiliary_loss_mlp": 0.01034028, "balance_loss_clip": 1.04835987, "balance_loss_mlp": 1.01994324, "epoch": 0.4825492259131219, "flos": 20368675895040.0, "grad_norm": 1.785974704334164, "language_loss": 0.71310222, "learning_rate": 2.209728283441112e-06, "loss": 0.73473275, "num_input_tokens_seen": 172541480, "step": 8026, "time_per_iteration": 2.5739991664886475 }, { "auxiliary_loss_clip": 0.01115668, "auxiliary_loss_mlp": 0.01045724, "balance_loss_clip": 1.04429471, "balance_loss_mlp": 1.02949929, "epoch": 0.48260934916578985, "flos": 14319021519360.0, "grad_norm": 2.0186797289800182, "language_loss": 0.74956793, "learning_rate": 2.209340965060465e-06, "loss": 0.77118182, "num_input_tokens_seen": 172559005, "step": 8027, "time_per_iteration": 2.7139828205108643 }, { "auxiliary_loss_clip": 0.01105318, "auxiliary_loss_mlp": 0.01037258, "balance_loss_clip": 1.04597318, "balance_loss_mlp": 1.02348971, "epoch": 0.4826694724184578, "flos": 22121152548480.0, "grad_norm": 1.6779938031508344, "language_loss": 0.67332339, "learning_rate": 2.2089536387415868e-06, "loss": 0.69474924, "num_input_tokens_seen": 172578435, "step": 8028, "time_per_iteration": 2.809757709503174 }, { "auxiliary_loss_clip": 0.01105459, "auxiliary_loss_mlp": 0.01039975, "balance_loss_clip": 1.04472148, "balance_loss_mlp": 1.02583039, "epoch": 0.48272959567112583, "flos": 16181169373440.0, "grad_norm": 1.5400710398474027, "language_loss": 0.72719157, "learning_rate": 2.2085663044991655e-06, "loss": 0.7486459, "num_input_tokens_seen": 172596095, "step": 8029, "time_per_iteration": 2.692643165588379 }, { "auxiliary_loss_clip": 0.01103521, "auxiliary_loss_mlp": 0.01033131, "balance_loss_clip": 1.04666233, "balance_loss_mlp": 1.01880252, "epoch": 0.4827897189237938, "flos": 23180445561600.0, "grad_norm": 1.8484439777749806, "language_loss": 0.84841061, "learning_rate": 2.2081789623478896e-06, "loss": 0.86977708, "num_input_tokens_seen": 172615255, "step": 8030, "time_per_iteration": 2.6717677116394043 }, { "auxiliary_loss_clip": 0.01094989, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.04217124, "balance_loss_mlp": 1.02120733, "epoch": 0.48284984217646176, "flos": 21652626522240.0, "grad_norm": 2.0183604756392715, "language_loss": 0.74026352, "learning_rate": 2.2077916123024466e-06, "loss": 0.76156056, "num_input_tokens_seen": 172633185, "step": 8031, "time_per_iteration": 2.640707015991211 }, { "auxiliary_loss_clip": 0.01099826, "auxiliary_loss_mlp": 0.0104306, "balance_loss_clip": 1.04307055, "balance_loss_mlp": 1.02747965, "epoch": 0.48290996542912973, "flos": 31467443304960.0, "grad_norm": 1.5998759210668847, "language_loss": 0.71785772, "learning_rate": 2.2074042543775245e-06, "loss": 0.7392866, "num_input_tokens_seen": 172654280, "step": 8032, "time_per_iteration": 2.803567886352539 }, { "auxiliary_loss_clip": 0.0110819, "auxiliary_loss_mlp": 0.01037978, "balance_loss_clip": 1.04093766, "balance_loss_mlp": 1.02310669, "epoch": 0.4829700886817977, "flos": 24461954064000.0, "grad_norm": 1.7179702458807065, "language_loss": 0.73965132, "learning_rate": 2.2070168885878126e-06, "loss": 0.76111305, "num_input_tokens_seen": 172675545, "step": 8033, "time_per_iteration": 2.7292799949645996 }, { "auxiliary_loss_clip": 0.01073662, "auxiliary_loss_mlp": 0.01036715, "balance_loss_clip": 1.04669857, "balance_loss_mlp": 1.0225054, "epoch": 0.48303021193446566, "flos": 25702164904320.0, "grad_norm": 1.7431687715385025, "language_loss": 0.83544624, "learning_rate": 2.2066295149479996e-06, "loss": 0.85655004, "num_input_tokens_seen": 172696455, "step": 8034, "time_per_iteration": 2.807359218597412 }, { "auxiliary_loss_clip": 0.01095417, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.04668856, "balance_loss_mlp": 1.01843822, "epoch": 0.4830903351871336, "flos": 20085233673600.0, "grad_norm": 1.6936524854207098, "language_loss": 0.79185474, "learning_rate": 2.2062421334727744e-06, "loss": 0.81312621, "num_input_tokens_seen": 172716720, "step": 8035, "time_per_iteration": 2.7641072273254395 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.00772882, "balance_loss_clip": 1.04296494, "balance_loss_mlp": 1.00034285, "epoch": 0.4831504584398016, "flos": 39452216014080.0, "grad_norm": 1.8720500560152205, "language_loss": 0.69804895, "learning_rate": 2.2058547441768267e-06, "loss": 0.71679878, "num_input_tokens_seen": 172737435, "step": 8036, "time_per_iteration": 2.8137052059173584 }, { "auxiliary_loss_clip": 0.01112606, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.04274416, "balance_loss_mlp": 1.01839805, "epoch": 0.48321058169246955, "flos": 20006588845440.0, "grad_norm": 1.9208219105474362, "language_loss": 0.72910142, "learning_rate": 2.205467347074847e-06, "loss": 0.75054711, "num_input_tokens_seen": 172755700, "step": 8037, "time_per_iteration": 2.635277271270752 }, { "auxiliary_loss_clip": 0.01078506, "auxiliary_loss_mlp": 0.0104898, "balance_loss_clip": 1.04335546, "balance_loss_mlp": 1.03224301, "epoch": 0.4832707049451375, "flos": 20741465197440.0, "grad_norm": 3.147603880487906, "language_loss": 0.68890101, "learning_rate": 2.205079942181525e-06, "loss": 0.71017587, "num_input_tokens_seen": 172775185, "step": 8038, "time_per_iteration": 2.782864570617676 }, { "auxiliary_loss_clip": 0.01090364, "auxiliary_loss_mlp": 0.01038251, "balance_loss_clip": 1.04244566, "balance_loss_mlp": 1.02438653, "epoch": 0.4833308281978055, "flos": 33145584762240.0, "grad_norm": 1.8173480840244864, "language_loss": 0.79258525, "learning_rate": 2.20469252951155e-06, "loss": 0.81387138, "num_input_tokens_seen": 172796990, "step": 8039, "time_per_iteration": 2.7726707458496094 }, { "auxiliary_loss_clip": 0.01115294, "auxiliary_loss_mlp": 0.01034301, "balance_loss_clip": 1.04610348, "balance_loss_mlp": 1.02035379, "epoch": 0.48339095145047345, "flos": 19099234362240.0, "grad_norm": 1.6327731998252513, "language_loss": 0.77608567, "learning_rate": 2.2043051090796143e-06, "loss": 0.79758161, "num_input_tokens_seen": 172814915, "step": 8040, "time_per_iteration": 2.634373903274536 }, { "auxiliary_loss_clip": 0.01117481, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.04517746, "balance_loss_mlp": 1.02007651, "epoch": 0.4834510747031414, "flos": 34459448440320.0, "grad_norm": 1.603418513383397, "language_loss": 0.75737631, "learning_rate": 2.203917680900409e-06, "loss": 0.7789005, "num_input_tokens_seen": 172837060, "step": 8041, "time_per_iteration": 2.7551445960998535 }, { "auxiliary_loss_clip": 0.01089791, "auxiliary_loss_mlp": 0.01038452, "balance_loss_clip": 1.04363966, "balance_loss_mlp": 1.02388501, "epoch": 0.48351119795580944, "flos": 27380845065600.0, "grad_norm": 1.7873938615261085, "language_loss": 0.6681267, "learning_rate": 2.203530244988624e-06, "loss": 0.6894092, "num_input_tokens_seen": 172856545, "step": 8042, "time_per_iteration": 2.7318594455718994 }, { "auxiliary_loss_clip": 0.01029662, "auxiliary_loss_mlp": 0.0100431, "balance_loss_clip": 1.0224936, "balance_loss_mlp": 1.00289762, "epoch": 0.4835713212084774, "flos": 67143941291520.0, "grad_norm": 0.6894070214322334, "language_loss": 0.5854131, "learning_rate": 2.2031428013589517e-06, "loss": 0.60575283, "num_input_tokens_seen": 172923055, "step": 8043, "time_per_iteration": 3.2759408950805664 }, { "auxiliary_loss_clip": 0.01104355, "auxiliary_loss_mlp": 0.01041979, "balance_loss_clip": 1.04400086, "balance_loss_mlp": 1.02605903, "epoch": 0.48363144446114537, "flos": 17967473660160.0, "grad_norm": 1.92903629391714, "language_loss": 0.71673858, "learning_rate": 2.2027553500260847e-06, "loss": 0.73820192, "num_input_tokens_seen": 172940700, "step": 8044, "time_per_iteration": 2.6627197265625 }, { "auxiliary_loss_clip": 0.01073602, "auxiliary_loss_mlp": 0.01033421, "balance_loss_clip": 1.04103553, "balance_loss_mlp": 1.01863277, "epoch": 0.48369156771381333, "flos": 20593513077120.0, "grad_norm": 1.3783700874379357, "language_loss": 0.75982356, "learning_rate": 2.202367891004714e-06, "loss": 0.7808938, "num_input_tokens_seen": 172961125, "step": 8045, "time_per_iteration": 2.7301156520843506 }, { "auxiliary_loss_clip": 0.01083343, "auxiliary_loss_mlp": 0.01040882, "balance_loss_clip": 1.04626942, "balance_loss_mlp": 1.02615929, "epoch": 0.4837516909664813, "flos": 22675075159680.0, "grad_norm": 1.8085917066759625, "language_loss": 0.70038342, "learning_rate": 2.201980424309533e-06, "loss": 0.72162569, "num_input_tokens_seen": 172980405, "step": 8046, "time_per_iteration": 2.853160858154297 }, { "auxiliary_loss_clip": 0.01127438, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04603601, "balance_loss_mlp": 1.02220488, "epoch": 0.48381181421914926, "flos": 25518625384320.0, "grad_norm": 2.1605387354357193, "language_loss": 0.82558095, "learning_rate": 2.2015929499552337e-06, "loss": 0.84722322, "num_input_tokens_seen": 172999105, "step": 8047, "time_per_iteration": 2.711172103881836 }, { "auxiliary_loss_clip": 0.01095021, "auxiliary_loss_mlp": 0.01034535, "balance_loss_clip": 1.04198444, "balance_loss_mlp": 1.02066541, "epoch": 0.4838719374718172, "flos": 24207491139840.0, "grad_norm": 1.6956601095110444, "language_loss": 0.80573416, "learning_rate": 2.2012054679565092e-06, "loss": 0.82702971, "num_input_tokens_seen": 173019935, "step": 8048, "time_per_iteration": 2.714733839035034 }, { "auxiliary_loss_clip": 0.01119221, "auxiliary_loss_mlp": 0.01039156, "balance_loss_clip": 1.04571271, "balance_loss_mlp": 1.02458251, "epoch": 0.4839320607244852, "flos": 26724577628160.0, "grad_norm": 1.6136989522042802, "language_loss": 0.81565118, "learning_rate": 2.200817978328054e-06, "loss": 0.83723497, "num_input_tokens_seen": 173039700, "step": 8049, "time_per_iteration": 2.740396738052368 }, { "auxiliary_loss_clip": 0.0110148, "auxiliary_loss_mlp": 0.01032329, "balance_loss_clip": 1.04652369, "balance_loss_mlp": 1.01979959, "epoch": 0.48399218397715316, "flos": 20448900921600.0, "grad_norm": 1.738899019363266, "language_loss": 0.72696805, "learning_rate": 2.2004304810845602e-06, "loss": 0.74830616, "num_input_tokens_seen": 173059170, "step": 8050, "time_per_iteration": 2.671696424484253 }, { "auxiliary_loss_clip": 0.01036049, "auxiliary_loss_mlp": 0.00752282, "balance_loss_clip": 1.01914835, "balance_loss_mlp": 1.00025868, "epoch": 0.4840523072298211, "flos": 67180570185600.0, "grad_norm": 0.6909377773009905, "language_loss": 0.562814, "learning_rate": 2.200042976240723e-06, "loss": 0.5806973, "num_input_tokens_seen": 173119000, "step": 8051, "time_per_iteration": 6.922944784164429 }, { "auxiliary_loss_clip": 0.01088902, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.04290557, "balance_loss_mlp": 1.0208869, "epoch": 0.4841124304824891, "flos": 22411490181120.0, "grad_norm": 1.8410570377760342, "language_loss": 0.75224304, "learning_rate": 2.199655463811236e-06, "loss": 0.77348751, "num_input_tokens_seen": 173137570, "step": 8052, "time_per_iteration": 2.7672088146209717 }, { "auxiliary_loss_clip": 0.01115072, "auxiliary_loss_mlp": 0.01037343, "balance_loss_clip": 1.04730511, "balance_loss_mlp": 1.02388382, "epoch": 0.48417255373515705, "flos": 13843959217920.0, "grad_norm": 2.7757616025011296, "language_loss": 0.6599009, "learning_rate": 2.1992679438107936e-06, "loss": 0.68142503, "num_input_tokens_seen": 173154355, "step": 8053, "time_per_iteration": 2.7092020511627197 }, { "auxiliary_loss_clip": 0.01118659, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.04970407, "balance_loss_mlp": 1.02048898, "epoch": 0.484232676987825, "flos": 31649689935360.0, "grad_norm": 1.9021914395644282, "language_loss": 0.69075954, "learning_rate": 2.198880416254091e-06, "loss": 0.7122823, "num_input_tokens_seen": 173174845, "step": 8054, "time_per_iteration": 5.934173583984375 }, { "auxiliary_loss_clip": 0.01055753, "auxiliary_loss_mlp": 0.01032099, "balance_loss_clip": 1.03702974, "balance_loss_mlp": 1.01789522, "epoch": 0.48429280024049304, "flos": 24095377814400.0, "grad_norm": 1.7332498206286664, "language_loss": 0.69624376, "learning_rate": 2.1984928811558233e-06, "loss": 0.71712232, "num_input_tokens_seen": 173195025, "step": 8055, "time_per_iteration": 2.811734676361084 }, { "auxiliary_loss_clip": 0.01121016, "auxiliary_loss_mlp": 0.01038771, "balance_loss_clip": 1.04966474, "balance_loss_mlp": 1.02396512, "epoch": 0.484352923493161, "flos": 17530081747200.0, "grad_norm": 2.8015304711701154, "language_loss": 0.63522434, "learning_rate": 2.198105338530685e-06, "loss": 0.6568222, "num_input_tokens_seen": 173213065, "step": 8056, "time_per_iteration": 2.6111772060394287 }, { "auxiliary_loss_clip": 0.01115568, "auxiliary_loss_mlp": 0.01036144, "balance_loss_clip": 1.04465592, "balance_loss_mlp": 1.0212791, "epoch": 0.48441304674582897, "flos": 29166862043520.0, "grad_norm": 2.044514393553715, "language_loss": 0.67968506, "learning_rate": 2.1977177883933726e-06, "loss": 0.70120221, "num_input_tokens_seen": 173234545, "step": 8057, "time_per_iteration": 2.678311824798584 }, { "auxiliary_loss_clip": 0.01089017, "auxiliary_loss_mlp": 0.01041569, "balance_loss_clip": 1.04114962, "balance_loss_mlp": 1.02560723, "epoch": 0.48447316999849693, "flos": 15886701676800.0, "grad_norm": 1.6304795591829788, "language_loss": 0.8145591, "learning_rate": 2.1973302307585827e-06, "loss": 0.83586496, "num_input_tokens_seen": 173252175, "step": 8058, "time_per_iteration": 2.676553964614868 }, { "auxiliary_loss_clip": 0.0111574, "auxiliary_loss_mlp": 0.01037327, "balance_loss_clip": 1.04488969, "balance_loss_mlp": 1.02229452, "epoch": 0.4845332932511649, "flos": 24381405815040.0, "grad_norm": 1.66967797618368, "language_loss": 0.79851902, "learning_rate": 2.1969426656410097e-06, "loss": 0.82004976, "num_input_tokens_seen": 173268790, "step": 8059, "time_per_iteration": 2.672071933746338 }, { "auxiliary_loss_clip": 0.01134552, "auxiliary_loss_mlp": 0.010436, "balance_loss_clip": 1.04998326, "balance_loss_mlp": 1.02804327, "epoch": 0.48459341650383286, "flos": 37116478316160.0, "grad_norm": 1.8700605031219397, "language_loss": 0.6685822, "learning_rate": 2.196555093055352e-06, "loss": 0.69036371, "num_input_tokens_seen": 173288030, "step": 8060, "time_per_iteration": 2.7481517791748047 }, { "auxiliary_loss_clip": 0.01115717, "auxiliary_loss_mlp": 0.01047797, "balance_loss_clip": 1.04782832, "balance_loss_mlp": 1.03283644, "epoch": 0.48465353975650083, "flos": 22966777509120.0, "grad_norm": 1.918934253409618, "language_loss": 0.67403054, "learning_rate": 2.1961675130163046e-06, "loss": 0.69566566, "num_input_tokens_seen": 173305965, "step": 8061, "time_per_iteration": 2.6991710662841797 }, { "auxiliary_loss_clip": 0.01112971, "auxiliary_loss_mlp": 0.01047446, "balance_loss_clip": 1.0495888, "balance_loss_mlp": 1.03176975, "epoch": 0.4847136630091688, "flos": 17707695523200.0, "grad_norm": 2.027913918653662, "language_loss": 0.82387316, "learning_rate": 2.1957799255385653e-06, "loss": 0.84547728, "num_input_tokens_seen": 173321985, "step": 8062, "time_per_iteration": 2.6427886486053467 }, { "auxiliary_loss_clip": 0.01062707, "auxiliary_loss_mlp": 0.0103913, "balance_loss_clip": 1.04044425, "balance_loss_mlp": 1.02433586, "epoch": 0.48477378626183676, "flos": 22018269018240.0, "grad_norm": 1.5908761940571217, "language_loss": 0.74599862, "learning_rate": 2.1953923306368325e-06, "loss": 0.76701701, "num_input_tokens_seen": 173341315, "step": 8063, "time_per_iteration": 2.767857313156128 }, { "auxiliary_loss_clip": 0.01103538, "auxiliary_loss_mlp": 0.01036681, "balance_loss_clip": 1.04380846, "balance_loss_mlp": 1.02177346, "epoch": 0.4848339095145047, "flos": 27962956874880.0, "grad_norm": 1.679199539296889, "language_loss": 0.7897141, "learning_rate": 2.1950047283258023e-06, "loss": 0.81111628, "num_input_tokens_seen": 173361055, "step": 8064, "time_per_iteration": 2.702838182449341 }, { "auxiliary_loss_clip": 0.01127143, "auxiliary_loss_mlp": 0.0077039, "balance_loss_clip": 1.04982877, "balance_loss_mlp": 1.00042999, "epoch": 0.4848940327671727, "flos": 21688752625920.0, "grad_norm": 1.758395032785765, "language_loss": 0.78960353, "learning_rate": 2.194617118620173e-06, "loss": 0.80857891, "num_input_tokens_seen": 173379255, "step": 8065, "time_per_iteration": 2.6464266777038574 }, { "auxiliary_loss_clip": 0.01109206, "auxiliary_loss_mlp": 0.00771166, "balance_loss_clip": 1.04239869, "balance_loss_mlp": 1.00034332, "epoch": 0.48495415601984065, "flos": 20631578515200.0, "grad_norm": 1.717828669503626, "language_loss": 0.76373905, "learning_rate": 2.194229501534644e-06, "loss": 0.78254277, "num_input_tokens_seen": 173398370, "step": 8066, "time_per_iteration": 2.622279405593872 }, { "auxiliary_loss_clip": 0.01129705, "auxiliary_loss_mlp": 0.01032468, "balance_loss_clip": 1.05031133, "balance_loss_mlp": 1.0188905, "epoch": 0.4850142792725086, "flos": 25628152930560.0, "grad_norm": 1.606995638926956, "language_loss": 0.7245208, "learning_rate": 2.193841877083912e-06, "loss": 0.74614257, "num_input_tokens_seen": 173419595, "step": 8067, "time_per_iteration": 2.6863858699798584 }, { "auxiliary_loss_clip": 0.01062315, "auxiliary_loss_mlp": 0.01036403, "balance_loss_clip": 1.04658556, "balance_loss_mlp": 1.02155542, "epoch": 0.4850744025251766, "flos": 13771958405760.0, "grad_norm": 2.9723717970034826, "language_loss": 0.79098403, "learning_rate": 2.1934542452826767e-06, "loss": 0.81197119, "num_input_tokens_seen": 173435390, "step": 8068, "time_per_iteration": 2.736361503601074 }, { "auxiliary_loss_clip": 0.01096742, "auxiliary_loss_mlp": 0.01035763, "balance_loss_clip": 1.04122019, "balance_loss_mlp": 1.02254295, "epoch": 0.4851345257778446, "flos": 20261339078400.0, "grad_norm": 1.4037595191012704, "language_loss": 0.84329617, "learning_rate": 2.193066606145638e-06, "loss": 0.86462128, "num_input_tokens_seen": 173454095, "step": 8069, "time_per_iteration": 2.6671814918518066 }, { "auxiliary_loss_clip": 0.01091404, "auxiliary_loss_mlp": 0.01033062, "balance_loss_clip": 1.04400659, "balance_loss_mlp": 1.01972818, "epoch": 0.48519464903051257, "flos": 27089681420160.0, "grad_norm": 1.7638547734342187, "language_loss": 0.78171504, "learning_rate": 2.192678959687493e-06, "loss": 0.80295968, "num_input_tokens_seen": 173475300, "step": 8070, "time_per_iteration": 2.7715907096862793 }, { "auxiliary_loss_clip": 0.01066151, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.04079247, "balance_loss_mlp": 1.01808023, "epoch": 0.48525477228318054, "flos": 17127235739520.0, "grad_norm": 1.9176398781406192, "language_loss": 0.78054178, "learning_rate": 2.192291305922943e-06, "loss": 0.80152905, "num_input_tokens_seen": 173492005, "step": 8071, "time_per_iteration": 2.7427566051483154 }, { "auxiliary_loss_clip": 0.01063848, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.04013515, "balance_loss_mlp": 1.01852274, "epoch": 0.4853148955358485, "flos": 28180324028160.0, "grad_norm": 1.9286974806008035, "language_loss": 0.72312587, "learning_rate": 2.1919036448666873e-06, "loss": 0.7440955, "num_input_tokens_seen": 173511995, "step": 8072, "time_per_iteration": 2.8457834720611572 }, { "auxiliary_loss_clip": 0.01077736, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.04195118, "balance_loss_mlp": 1.02361333, "epoch": 0.48537501878851647, "flos": 17493309198720.0, "grad_norm": 2.206546835183074, "language_loss": 0.87933266, "learning_rate": 2.1915159765334262e-06, "loss": 0.90049368, "num_input_tokens_seen": 173530215, "step": 8073, "time_per_iteration": 2.7190656661987305 }, { "auxiliary_loss_clip": 0.01081944, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.03932655, "balance_loss_mlp": 1.02555168, "epoch": 0.48543514204118443, "flos": 28584857975040.0, "grad_norm": 1.6453725477912577, "language_loss": 0.60954368, "learning_rate": 2.19112830093786e-06, "loss": 0.63077909, "num_input_tokens_seen": 173550920, "step": 8074, "time_per_iteration": 2.757408857345581 }, { "auxiliary_loss_clip": 0.01088022, "auxiliary_loss_mlp": 0.00773092, "balance_loss_clip": 1.0409627, "balance_loss_mlp": 1.00044906, "epoch": 0.4854952652938524, "flos": 20959981585920.0, "grad_norm": 1.6130644581425704, "language_loss": 0.735416, "learning_rate": 2.19074061809469e-06, "loss": 0.75402713, "num_input_tokens_seen": 173569065, "step": 8075, "time_per_iteration": 2.8191847801208496 }, { "auxiliary_loss_clip": 0.01121809, "auxiliary_loss_mlp": 0.01039314, "balance_loss_clip": 1.04537582, "balance_loss_mlp": 1.02567613, "epoch": 0.48555538854652036, "flos": 66529543155840.0, "grad_norm": 2.2867687714704665, "language_loss": 0.81751764, "learning_rate": 2.1903529280186163e-06, "loss": 0.83912885, "num_input_tokens_seen": 173596085, "step": 8076, "time_per_iteration": 3.0270113945007324 }, { "auxiliary_loss_clip": 0.01107841, "auxiliary_loss_mlp": 0.01038327, "balance_loss_clip": 1.04600549, "balance_loss_mlp": 1.02161372, "epoch": 0.4856155117991883, "flos": 15924982596480.0, "grad_norm": 2.702312951735234, "language_loss": 0.86105502, "learning_rate": 2.1899652307243407e-06, "loss": 0.88251674, "num_input_tokens_seen": 173613900, "step": 8077, "time_per_iteration": 2.6272876262664795 }, { "auxiliary_loss_clip": 0.01006449, "auxiliary_loss_mlp": 0.0100721, "balance_loss_clip": 1.01856184, "balance_loss_mlp": 1.00564885, "epoch": 0.4856756350518563, "flos": 71047395060480.0, "grad_norm": 0.8998346956373826, "language_loss": 0.58465588, "learning_rate": 2.189577526226564e-06, "loss": 0.60479248, "num_input_tokens_seen": 173671305, "step": 8078, "time_per_iteration": 3.254561424255371 }, { "auxiliary_loss_clip": 0.01132159, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.04961872, "balance_loss_mlp": 1.01946878, "epoch": 0.48573575830452426, "flos": 29825679346560.0, "grad_norm": 1.7198368274974891, "language_loss": 0.72365242, "learning_rate": 2.1891898145399884e-06, "loss": 0.74531311, "num_input_tokens_seen": 173692070, "step": 8079, "time_per_iteration": 2.6532506942749023 }, { "auxiliary_loss_clip": 0.01088509, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.04440176, "balance_loss_mlp": 1.01868141, "epoch": 0.4857958815571922, "flos": 17639501552640.0, "grad_norm": 2.749999314487442, "language_loss": 0.79557705, "learning_rate": 2.1888020956793172e-06, "loss": 0.81678975, "num_input_tokens_seen": 173709785, "step": 8080, "time_per_iteration": 2.6242940425872803 }, { "auxiliary_loss_clip": 0.01097632, "auxiliary_loss_mlp": 0.01033589, "balance_loss_clip": 1.04023981, "balance_loss_mlp": 1.01881862, "epoch": 0.4858560048098602, "flos": 21105491581440.0, "grad_norm": 1.9603729393952303, "language_loss": 0.84016395, "learning_rate": 2.188414369659251e-06, "loss": 0.86147618, "num_input_tokens_seen": 173728770, "step": 8081, "time_per_iteration": 2.6701998710632324 }, { "auxiliary_loss_clip": 0.01110096, "auxiliary_loss_mlp": 0.01036956, "balance_loss_clip": 1.04121375, "balance_loss_mlp": 1.02081513, "epoch": 0.4859161280625282, "flos": 22090844448000.0, "grad_norm": 1.4026106187948555, "language_loss": 0.83353597, "learning_rate": 2.1880266364944924e-06, "loss": 0.85500646, "num_input_tokens_seen": 173747355, "step": 8082, "time_per_iteration": 2.6535134315490723 }, { "auxiliary_loss_clip": 0.01102933, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.04525304, "balance_loss_mlp": 1.02117527, "epoch": 0.4859762513151962, "flos": 17493452853120.0, "grad_norm": 1.9462739217424578, "language_loss": 0.87314546, "learning_rate": 2.187638896199746e-06, "loss": 0.89451694, "num_input_tokens_seen": 173764825, "step": 8083, "time_per_iteration": 2.6324520111083984 }, { "auxiliary_loss_clip": 0.01080799, "auxiliary_loss_mlp": 0.01047109, "balance_loss_clip": 1.04719186, "balance_loss_mlp": 1.03410375, "epoch": 0.48603637456786414, "flos": 18004246208640.0, "grad_norm": 1.6025248177358018, "language_loss": 0.80759108, "learning_rate": 2.1872511487897126e-06, "loss": 0.82887018, "num_input_tokens_seen": 173783215, "step": 8084, "time_per_iteration": 2.679032325744629 }, { "auxiliary_loss_clip": 0.01114846, "auxiliary_loss_mlp": 0.01035804, "balance_loss_clip": 1.04544878, "balance_loss_mlp": 1.02149308, "epoch": 0.4860964978205321, "flos": 22492038430080.0, "grad_norm": 1.9539653340908196, "language_loss": 0.68145066, "learning_rate": 2.186863394279098e-06, "loss": 0.70295715, "num_input_tokens_seen": 173801905, "step": 8085, "time_per_iteration": 2.6305296421051025 }, { "auxiliary_loss_clip": 0.01113875, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.04487717, "balance_loss_mlp": 1.02714896, "epoch": 0.48615662107320007, "flos": 23372532518400.0, "grad_norm": 1.3763064439222144, "language_loss": 0.77494752, "learning_rate": 2.1864756326826046e-06, "loss": 0.79649526, "num_input_tokens_seen": 173824690, "step": 8086, "time_per_iteration": 2.6941890716552734 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.04536629, "balance_loss_mlp": 1.01461661, "epoch": 0.48621674432586803, "flos": 34418833136640.0, "grad_norm": 2.3947564981199347, "language_loss": 0.7014342, "learning_rate": 2.1860878640149355e-06, "loss": 0.72297299, "num_input_tokens_seen": 173844450, "step": 8087, "time_per_iteration": 2.7329354286193848 }, { "auxiliary_loss_clip": 0.01119086, "auxiliary_loss_mlp": 0.01040298, "balance_loss_clip": 1.04627323, "balance_loss_mlp": 1.0251466, "epoch": 0.486276867578536, "flos": 33107555237760.0, "grad_norm": 1.710106545545042, "language_loss": 0.72521967, "learning_rate": 2.1857000882907974e-06, "loss": 0.74681354, "num_input_tokens_seen": 173864975, "step": 8088, "time_per_iteration": 2.747058391571045 }, { "auxiliary_loss_clip": 0.01103115, "auxiliary_loss_mlp": 0.01037287, "balance_loss_clip": 1.04365635, "balance_loss_mlp": 1.02306569, "epoch": 0.48633699083120396, "flos": 21470703114240.0, "grad_norm": 1.7297894528285667, "language_loss": 0.7543239, "learning_rate": 2.185312305524892e-06, "loss": 0.77572793, "num_input_tokens_seen": 173883805, "step": 8089, "time_per_iteration": 2.6639740467071533 }, { "auxiliary_loss_clip": 0.01092992, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.04379344, "balance_loss_mlp": 1.01733255, "epoch": 0.48639711408387193, "flos": 20084335833600.0, "grad_norm": 1.6351614757671693, "language_loss": 0.84245062, "learning_rate": 2.184924515731926e-06, "loss": 0.86369717, "num_input_tokens_seen": 173903520, "step": 8090, "time_per_iteration": 4.404139757156372 }, { "auxiliary_loss_clip": 0.01122239, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.04544723, "balance_loss_mlp": 1.0203594, "epoch": 0.4864572373365399, "flos": 20778884190720.0, "grad_norm": 1.7197214823091769, "language_loss": 0.76290631, "learning_rate": 2.1845367189266045e-06, "loss": 0.78446829, "num_input_tokens_seen": 173924255, "step": 8091, "time_per_iteration": 2.7133665084838867 }, { "auxiliary_loss_clip": 0.01115621, "auxiliary_loss_mlp": 0.01029044, "balance_loss_clip": 1.04440069, "balance_loss_mlp": 1.01553202, "epoch": 0.48651736058920786, "flos": 26025360503040.0, "grad_norm": 1.4953838782762103, "language_loss": 0.80510461, "learning_rate": 2.184148915123631e-06, "loss": 0.82655126, "num_input_tokens_seen": 173943285, "step": 8092, "time_per_iteration": 2.682349920272827 }, { "auxiliary_loss_clip": 0.0110052, "auxiliary_loss_mlp": 0.00775072, "balance_loss_clip": 1.04398346, "balance_loss_mlp": 1.00031447, "epoch": 0.4865774838418758, "flos": 20485601642880.0, "grad_norm": 1.434156215667662, "language_loss": 0.71867287, "learning_rate": 2.1837611043377126e-06, "loss": 0.73742878, "num_input_tokens_seen": 173962205, "step": 8093, "time_per_iteration": 5.686015367507935 }, { "auxiliary_loss_clip": 0.01123791, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.04521751, "balance_loss_mlp": 1.02074885, "epoch": 0.4866376070945438, "flos": 23547704169600.0, "grad_norm": 1.581585117496142, "language_loss": 0.67704266, "learning_rate": 2.1833732865835545e-06, "loss": 0.69862258, "num_input_tokens_seen": 173980945, "step": 8094, "time_per_iteration": 2.5890355110168457 }, { "auxiliary_loss_clip": 0.01109259, "auxiliary_loss_mlp": 0.01038119, "balance_loss_clip": 1.04752278, "balance_loss_mlp": 1.02342701, "epoch": 0.4866977303472118, "flos": 16690598012160.0, "grad_norm": 2.317379685093866, "language_loss": 0.66784161, "learning_rate": 2.1829854618758636e-06, "loss": 0.68931544, "num_input_tokens_seen": 173998860, "step": 8095, "time_per_iteration": 2.640468120574951 }, { "auxiliary_loss_clip": 0.01110152, "auxiliary_loss_mlp": 0.0103636, "balance_loss_clip": 1.04456031, "balance_loss_mlp": 1.02123296, "epoch": 0.4867578535998798, "flos": 17896011552000.0, "grad_norm": 2.1481069791390346, "language_loss": 0.78540075, "learning_rate": 2.182597630229345e-06, "loss": 0.80686581, "num_input_tokens_seen": 174016665, "step": 8096, "time_per_iteration": 2.585015058517456 }, { "auxiliary_loss_clip": 0.01092726, "auxiliary_loss_mlp": 0.01036143, "balance_loss_clip": 1.03732872, "balance_loss_mlp": 1.02165902, "epoch": 0.48681797685254774, "flos": 22637799820800.0, "grad_norm": 1.880706326191671, "language_loss": 0.67753577, "learning_rate": 2.1822097916587067e-06, "loss": 0.69882447, "num_input_tokens_seen": 174034800, "step": 8097, "time_per_iteration": 2.6526336669921875 }, { "auxiliary_loss_clip": 0.01097124, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.04311764, "balance_loss_mlp": 1.02491093, "epoch": 0.4868781001052157, "flos": 20886077352960.0, "grad_norm": 1.6144910396326548, "language_loss": 0.71414316, "learning_rate": 2.1818219461786543e-06, "loss": 0.73550731, "num_input_tokens_seen": 174054445, "step": 8098, "time_per_iteration": 2.6669986248016357 }, { "auxiliary_loss_clip": 0.01119656, "auxiliary_loss_mlp": 0.01037345, "balance_loss_clip": 1.04642081, "balance_loss_mlp": 1.02226543, "epoch": 0.48693822335788367, "flos": 41974940937600.0, "grad_norm": 2.9804894060925458, "language_loss": 0.66267806, "learning_rate": 2.1814340938038956e-06, "loss": 0.68424809, "num_input_tokens_seen": 174077890, "step": 8099, "time_per_iteration": 2.7542026042938232 }, { "auxiliary_loss_clip": 0.01070284, "auxiliary_loss_mlp": 0.01040695, "balance_loss_clip": 1.0372566, "balance_loss_mlp": 1.02712917, "epoch": 0.48699834661055164, "flos": 24243294021120.0, "grad_norm": 1.700994432394141, "language_loss": 0.66787708, "learning_rate": 2.181046234549138e-06, "loss": 0.6889869, "num_input_tokens_seen": 174097460, "step": 8100, "time_per_iteration": 2.7499735355377197 }, { "auxiliary_loss_clip": 0.01087635, "auxiliary_loss_mlp": 0.01033762, "balance_loss_clip": 1.04155445, "balance_loss_mlp": 1.02084517, "epoch": 0.4870584698632196, "flos": 25923877603200.0, "grad_norm": 1.427277688843355, "language_loss": 0.76812327, "learning_rate": 2.180658368429088e-06, "loss": 0.78933728, "num_input_tokens_seen": 174120775, "step": 8101, "time_per_iteration": 2.7710418701171875 }, { "auxiliary_loss_clip": 0.010432, "auxiliary_loss_mlp": 0.00999689, "balance_loss_clip": 1.01742899, "balance_loss_mlp": 0.99847281, "epoch": 0.48711859311588757, "flos": 70211933648640.0, "grad_norm": 0.6877166097191185, "language_loss": 0.52341712, "learning_rate": 2.1802704954584565e-06, "loss": 0.54384601, "num_input_tokens_seen": 174189135, "step": 8102, "time_per_iteration": 3.3232975006103516 }, { "auxiliary_loss_clip": 0.0109639, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.04584694, "balance_loss_mlp": 1.02250218, "epoch": 0.48717871636855553, "flos": 12342964659840.0, "grad_norm": 2.1242457938350885, "language_loss": 0.7405737, "learning_rate": 2.1798826156519484e-06, "loss": 0.7618984, "num_input_tokens_seen": 174203250, "step": 8103, "time_per_iteration": 2.6988277435302734 }, { "auxiliary_loss_clip": 0.01116672, "auxiliary_loss_mlp": 0.01043644, "balance_loss_clip": 1.04631233, "balance_loss_mlp": 1.0288384, "epoch": 0.4872388396212235, "flos": 23477139901440.0, "grad_norm": 1.6106517558680102, "language_loss": 0.63064033, "learning_rate": 2.1794947290242737e-06, "loss": 0.65224349, "num_input_tokens_seen": 174224145, "step": 8104, "time_per_iteration": 2.629725456237793 }, { "auxiliary_loss_clip": 0.01125564, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.04695344, "balance_loss_mlp": 1.01885152, "epoch": 0.48729896287389146, "flos": 31427582186880.0, "grad_norm": 2.7588286364308217, "language_loss": 0.69136071, "learning_rate": 2.1791068355901413e-06, "loss": 0.71294117, "num_input_tokens_seen": 174244435, "step": 8105, "time_per_iteration": 2.6670045852661133 }, { "auxiliary_loss_clip": 0.01084626, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.04264283, "balance_loss_mlp": 1.01766491, "epoch": 0.4873590861265594, "flos": 19057936700160.0, "grad_norm": 2.072109036230495, "language_loss": 0.73534381, "learning_rate": 2.178718935364259e-06, "loss": 0.75649679, "num_input_tokens_seen": 174262710, "step": 8106, "time_per_iteration": 2.679194927215576 }, { "auxiliary_loss_clip": 0.01107932, "auxiliary_loss_mlp": 0.00772241, "balance_loss_clip": 1.04675412, "balance_loss_mlp": 1.00038791, "epoch": 0.4874192093792274, "flos": 24348296453760.0, "grad_norm": 2.6438945384360157, "language_loss": 0.76877642, "learning_rate": 2.1783310283613373e-06, "loss": 0.78757817, "num_input_tokens_seen": 174281545, "step": 8107, "time_per_iteration": 2.6732285022735596 }, { "auxiliary_loss_clip": 0.01071333, "auxiliary_loss_mlp": 0.01032073, "balance_loss_clip": 1.04327512, "balance_loss_mlp": 1.01932359, "epoch": 0.4874793326318954, "flos": 23112610727040.0, "grad_norm": 3.5135482389125583, "language_loss": 0.75034302, "learning_rate": 2.1779431145960853e-06, "loss": 0.77137709, "num_input_tokens_seen": 174300290, "step": 8108, "time_per_iteration": 2.8071932792663574 }, { "auxiliary_loss_clip": 0.01111368, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.04524517, "balance_loss_mlp": 1.01917136, "epoch": 0.4875394558845634, "flos": 19026156142080.0, "grad_norm": 1.7033835018380465, "language_loss": 0.73611033, "learning_rate": 2.177555194083212e-06, "loss": 0.75753379, "num_input_tokens_seen": 174318490, "step": 8109, "time_per_iteration": 2.642854928970337 }, { "auxiliary_loss_clip": 0.01108586, "auxiliary_loss_mlp": 0.01031639, "balance_loss_clip": 1.04274952, "balance_loss_mlp": 1.01813245, "epoch": 0.48759957913723134, "flos": 21433607343360.0, "grad_norm": 1.8383730211114537, "language_loss": 0.78698927, "learning_rate": 2.177167266837428e-06, "loss": 0.80839157, "num_input_tokens_seen": 174335505, "step": 8110, "time_per_iteration": 2.6471641063690186 }, { "auxiliary_loss_clip": 0.01114056, "auxiliary_loss_mlp": 0.01041552, "balance_loss_clip": 1.04712057, "balance_loss_mlp": 1.02802181, "epoch": 0.4876597023898993, "flos": 17748669962880.0, "grad_norm": 1.8514316559502986, "language_loss": 0.72086185, "learning_rate": 2.176779332873444e-06, "loss": 0.74241793, "num_input_tokens_seen": 174353990, "step": 8111, "time_per_iteration": 2.6277401447296143 }, { "auxiliary_loss_clip": 0.01113402, "auxiliary_loss_mlp": 0.01036579, "balance_loss_clip": 1.04676926, "balance_loss_mlp": 1.02329946, "epoch": 0.4877198256425673, "flos": 17019647527680.0, "grad_norm": 1.5795214961704311, "language_loss": 0.76318377, "learning_rate": 2.17639139220597e-06, "loss": 0.78468353, "num_input_tokens_seen": 174373425, "step": 8112, "time_per_iteration": 2.598010301589966 }, { "auxiliary_loss_clip": 0.01117365, "auxiliary_loss_mlp": 0.01038377, "balance_loss_clip": 1.04562628, "balance_loss_mlp": 1.02425683, "epoch": 0.48777994889523524, "flos": 22384091082240.0, "grad_norm": 1.710789031048389, "language_loss": 0.75035822, "learning_rate": 2.1760034448497166e-06, "loss": 0.77191567, "num_input_tokens_seen": 174393070, "step": 8113, "time_per_iteration": 2.6348531246185303 }, { "auxiliary_loss_clip": 0.01028141, "auxiliary_loss_mlp": 0.0075288, "balance_loss_clip": 1.02038229, "balance_loss_mlp": 1.0004046, "epoch": 0.4878400721479032, "flos": 61241772159360.0, "grad_norm": 0.77879843500845, "language_loss": 0.4887349, "learning_rate": 2.1756154908193943e-06, "loss": 0.50654507, "num_input_tokens_seen": 174446880, "step": 8114, "time_per_iteration": 3.1273062229156494 }, { "auxiliary_loss_clip": 0.0109717, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.04649258, "balance_loss_mlp": 1.02591658, "epoch": 0.48790019540057117, "flos": 24536612482560.0, "grad_norm": 1.616579350296871, "language_loss": 0.76760268, "learning_rate": 2.1752275301297155e-06, "loss": 0.78897941, "num_input_tokens_seen": 174468485, "step": 8115, "time_per_iteration": 2.759444236755371 }, { "auxiliary_loss_clip": 0.01107443, "auxiliary_loss_mlp": 0.01033169, "balance_loss_clip": 1.0478245, "balance_loss_mlp": 1.01930535, "epoch": 0.48796031865323913, "flos": 21833939399040.0, "grad_norm": 2.031601085778298, "language_loss": 0.71910083, "learning_rate": 2.1748395627953915e-06, "loss": 0.74050689, "num_input_tokens_seen": 174486360, "step": 8116, "time_per_iteration": 2.7063751220703125 }, { "auxiliary_loss_clip": 0.01088547, "auxiliary_loss_mlp": 0.01035995, "balance_loss_clip": 1.04164481, "balance_loss_mlp": 1.02276874, "epoch": 0.4880204419059071, "flos": 18588907883520.0, "grad_norm": 3.4734402196051, "language_loss": 0.63002747, "learning_rate": 2.1744515888311335e-06, "loss": 0.65127283, "num_input_tokens_seen": 174505075, "step": 8117, "time_per_iteration": 2.713792562484741 }, { "auxiliary_loss_clip": 0.01093551, "auxiliary_loss_mlp": 0.01042447, "balance_loss_clip": 1.04097366, "balance_loss_mlp": 1.02740264, "epoch": 0.48808056515857506, "flos": 19172168928000.0, "grad_norm": 1.6679530296862457, "language_loss": 0.79487926, "learning_rate": 2.1740636082516533e-06, "loss": 0.81623924, "num_input_tokens_seen": 174523385, "step": 8118, "time_per_iteration": 2.6479125022888184 }, { "auxiliary_loss_clip": 0.01102071, "auxiliary_loss_mlp": 0.01036823, "balance_loss_clip": 1.04363036, "balance_loss_mlp": 1.02295303, "epoch": 0.48814068841124303, "flos": 20120497850880.0, "grad_norm": 1.8682176240686432, "language_loss": 0.6328088, "learning_rate": 2.1736756210716645e-06, "loss": 0.65419775, "num_input_tokens_seen": 174542200, "step": 8119, "time_per_iteration": 2.6599643230438232 }, { "auxiliary_loss_clip": 0.01061047, "auxiliary_loss_mlp": 0.00770426, "balance_loss_clip": 1.04209542, "balance_loss_mlp": 1.00037444, "epoch": 0.488200811663911, "flos": 22965592360320.0, "grad_norm": 1.676805190577927, "language_loss": 0.72166741, "learning_rate": 2.173287627305878e-06, "loss": 0.73998219, "num_input_tokens_seen": 174563620, "step": 8120, "time_per_iteration": 2.795185089111328 }, { "auxiliary_loss_clip": 0.01118613, "auxiliary_loss_mlp": 0.01031295, "balance_loss_clip": 1.0469954, "balance_loss_mlp": 1.01728177, "epoch": 0.48826093491657896, "flos": 33910697387520.0, "grad_norm": 2.388334225725702, "language_loss": 0.63951784, "learning_rate": 2.1728996269690075e-06, "loss": 0.66101694, "num_input_tokens_seen": 174586465, "step": 8121, "time_per_iteration": 2.7527153491973877 }, { "auxiliary_loss_clip": 0.01112786, "auxiliary_loss_mlp": 0.01036976, "balance_loss_clip": 1.04261351, "balance_loss_mlp": 1.02283835, "epoch": 0.488321058169247, "flos": 23070307484160.0, "grad_norm": 1.985568603421553, "language_loss": 0.82805705, "learning_rate": 2.1725116200757664e-06, "loss": 0.84955472, "num_input_tokens_seen": 174604035, "step": 8122, "time_per_iteration": 2.668754816055298 }, { "auxiliary_loss_clip": 0.0111403, "auxiliary_loss_mlp": 0.01043394, "balance_loss_clip": 1.04526711, "balance_loss_mlp": 1.02749181, "epoch": 0.48838118142191494, "flos": 19317714837120.0, "grad_norm": 1.7149683973709622, "language_loss": 0.85272485, "learning_rate": 2.172123606640866e-06, "loss": 0.87429905, "num_input_tokens_seen": 174621715, "step": 8123, "time_per_iteration": 2.6014883518218994 }, { "auxiliary_loss_clip": 0.01090574, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.04448855, "balance_loss_mlp": 1.0185523, "epoch": 0.4884413046745829, "flos": 25410678036480.0, "grad_norm": 1.3909354864913257, "language_loss": 0.85614896, "learning_rate": 2.1717355866790227e-06, "loss": 0.87737238, "num_input_tokens_seen": 174643835, "step": 8124, "time_per_iteration": 2.754786968231201 }, { "auxiliary_loss_clip": 0.01103222, "auxiliary_loss_mlp": 0.01031579, "balance_loss_clip": 1.04439664, "balance_loss_mlp": 1.0179534, "epoch": 0.4885014279272509, "flos": 20991546662400.0, "grad_norm": 1.926010658269172, "language_loss": 0.79547518, "learning_rate": 2.171347560204948e-06, "loss": 0.81682324, "num_input_tokens_seen": 174660955, "step": 8125, "time_per_iteration": 2.667335271835327 }, { "auxiliary_loss_clip": 0.01078395, "auxiliary_loss_mlp": 0.01040727, "balance_loss_clip": 1.04347515, "balance_loss_mlp": 1.0263145, "epoch": 0.48856155117991884, "flos": 13771599269760.0, "grad_norm": 2.02778788313487, "language_loss": 0.72584462, "learning_rate": 2.170959527233356e-06, "loss": 0.74703586, "num_input_tokens_seen": 174678270, "step": 8126, "time_per_iteration": 2.7370314598083496 }, { "auxiliary_loss_clip": 0.0111111, "auxiliary_loss_mlp": 0.01038149, "balance_loss_clip": 1.0410614, "balance_loss_mlp": 1.02405286, "epoch": 0.4886216744325868, "flos": 32087764206720.0, "grad_norm": 1.7703486674415694, "language_loss": 0.68917644, "learning_rate": 2.1705714877789633e-06, "loss": 0.71066898, "num_input_tokens_seen": 174698360, "step": 8127, "time_per_iteration": 2.811074733734131 }, { "auxiliary_loss_clip": 0.01125381, "auxiliary_loss_mlp": 0.01033584, "balance_loss_clip": 1.04334533, "balance_loss_mlp": 1.01993454, "epoch": 0.48868179768525477, "flos": 19610063631360.0, "grad_norm": 1.5960676368468543, "language_loss": 0.76178646, "learning_rate": 2.170183441856481e-06, "loss": 0.78337616, "num_input_tokens_seen": 174716755, "step": 8128, "time_per_iteration": 2.5751638412475586 }, { "auxiliary_loss_clip": 0.01126548, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 1.04598355, "balance_loss_mlp": 1.01818776, "epoch": 0.48874192093792274, "flos": 21286912199040.0, "grad_norm": 1.5334009671548041, "language_loss": 0.7574327, "learning_rate": 2.1697953894806265e-06, "loss": 0.77901042, "num_input_tokens_seen": 174735560, "step": 8129, "time_per_iteration": 4.080120325088501 }, { "auxiliary_loss_clip": 0.01113338, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.04372275, "balance_loss_mlp": 1.0174098, "epoch": 0.4888020441905907, "flos": 14173439696640.0, "grad_norm": 2.756799094025314, "language_loss": 0.64951944, "learning_rate": 2.169407330666114e-06, "loss": 0.67096692, "num_input_tokens_seen": 174752730, "step": 8130, "time_per_iteration": 4.153359413146973 }, { "auxiliary_loss_clip": 0.01087218, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.0399828, "balance_loss_mlp": 1.02282333, "epoch": 0.48886216744325867, "flos": 24097891766400.0, "grad_norm": 1.9114203912665453, "language_loss": 0.72505724, "learning_rate": 2.169019265427658e-06, "loss": 0.746292, "num_input_tokens_seen": 174772520, "step": 8131, "time_per_iteration": 2.751070499420166 }, { "auxiliary_loss_clip": 0.0111646, "auxiliary_loss_mlp": 0.01041385, "balance_loss_clip": 1.04625905, "balance_loss_mlp": 1.0270561, "epoch": 0.48892229069592663, "flos": 38431419402240.0, "grad_norm": 1.3981624070335212, "language_loss": 0.69684219, "learning_rate": 2.1686311937799745e-06, "loss": 0.71842068, "num_input_tokens_seen": 174796540, "step": 8132, "time_per_iteration": 4.478942632675171 }, { "auxiliary_loss_clip": 0.01109765, "auxiliary_loss_mlp": 0.01030128, "balance_loss_clip": 1.04673529, "balance_loss_mlp": 1.01630616, "epoch": 0.4889824139485946, "flos": 23843321101440.0, "grad_norm": 1.328560083390073, "language_loss": 0.69882882, "learning_rate": 2.1682431157377797e-06, "loss": 0.72022772, "num_input_tokens_seen": 174817840, "step": 8133, "time_per_iteration": 4.2415807247161865 }, { "auxiliary_loss_clip": 0.01062397, "auxiliary_loss_mlp": 0.01042948, "balance_loss_clip": 1.03593254, "balance_loss_mlp": 1.02922726, "epoch": 0.48904253720126256, "flos": 24425827960320.0, "grad_norm": 1.919712430573748, "language_loss": 0.70950568, "learning_rate": 2.1678550313157883e-06, "loss": 0.73055917, "num_input_tokens_seen": 174837885, "step": 8134, "time_per_iteration": 2.772383689880371 }, { "auxiliary_loss_clip": 0.01084139, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.04342508, "balance_loss_mlp": 1.02082086, "epoch": 0.4891026604539306, "flos": 24170682677760.0, "grad_norm": 1.9244253075686233, "language_loss": 0.80356431, "learning_rate": 2.167466940528718e-06, "loss": 0.82475942, "num_input_tokens_seen": 174855240, "step": 8135, "time_per_iteration": 2.7362964153289795 }, { "auxiliary_loss_clip": 0.01124035, "auxiliary_loss_mlp": 0.01035694, "balance_loss_clip": 1.04567957, "balance_loss_mlp": 1.0232842, "epoch": 0.48916278370659855, "flos": 21470954509440.0, "grad_norm": 1.8037329109010316, "language_loss": 0.74794912, "learning_rate": 2.1670788433912843e-06, "loss": 0.76954633, "num_input_tokens_seen": 174875145, "step": 8136, "time_per_iteration": 2.766477346420288 }, { "auxiliary_loss_clip": 0.01097387, "auxiliary_loss_mlp": 0.01043558, "balance_loss_clip": 1.04352307, "balance_loss_mlp": 1.02971756, "epoch": 0.4892229069592665, "flos": 22309755886080.0, "grad_norm": 1.6588593954338173, "language_loss": 0.73403543, "learning_rate": 2.166690739918204e-06, "loss": 0.75544488, "num_input_tokens_seen": 174894770, "step": 8137, "time_per_iteration": 2.720778703689575 }, { "auxiliary_loss_clip": 0.01051073, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.03699243, "balance_loss_mlp": 1.01726234, "epoch": 0.4892830302119345, "flos": 12786856934400.0, "grad_norm": 2.090077124931452, "language_loss": 0.75336611, "learning_rate": 2.1663026301241944e-06, "loss": 0.77418739, "num_input_tokens_seen": 174912780, "step": 8138, "time_per_iteration": 2.7975735664367676 }, { "auxiliary_loss_clip": 0.01091927, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.04700375, "balance_loss_mlp": 1.02536893, "epoch": 0.48934315346460244, "flos": 20813896972800.0, "grad_norm": 1.6152276292204855, "language_loss": 0.74018902, "learning_rate": 2.165914514023972e-06, "loss": 0.76149184, "num_input_tokens_seen": 174931250, "step": 8139, "time_per_iteration": 2.7135186195373535 }, { "auxiliary_loss_clip": 0.01115319, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.04502773, "balance_loss_mlp": 1.02416921, "epoch": 0.4894032767172704, "flos": 19755537713280.0, "grad_norm": 1.878714628680016, "language_loss": 0.62168998, "learning_rate": 2.165526391632255e-06, "loss": 0.64321709, "num_input_tokens_seen": 174951105, "step": 8140, "time_per_iteration": 2.6594550609588623 }, { "auxiliary_loss_clip": 0.0109215, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.04310822, "balance_loss_mlp": 1.02509928, "epoch": 0.4894633999699384, "flos": 17818982835840.0, "grad_norm": 1.7004882369900214, "language_loss": 0.82400143, "learning_rate": 2.1651382629637608e-06, "loss": 0.84532392, "num_input_tokens_seen": 174969120, "step": 8141, "time_per_iteration": 2.648696184158325 }, { "auxiliary_loss_clip": 0.01095522, "auxiliary_loss_mlp": 0.01034005, "balance_loss_clip": 1.04897892, "balance_loss_mlp": 1.01975965, "epoch": 0.48952352322260634, "flos": 25523222325120.0, "grad_norm": 1.6750975318537598, "language_loss": 0.72031653, "learning_rate": 2.1647501280332066e-06, "loss": 0.74161184, "num_input_tokens_seen": 174991295, "step": 8142, "time_per_iteration": 2.770524740219116 }, { "auxiliary_loss_clip": 0.01124129, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.04588366, "balance_loss_mlp": 1.02094769, "epoch": 0.4895836464752743, "flos": 29055502903680.0, "grad_norm": 8.902000760681485, "language_loss": 0.66877794, "learning_rate": 2.1643619868553105e-06, "loss": 0.6903578, "num_input_tokens_seen": 175012830, "step": 8143, "time_per_iteration": 2.717714786529541 }, { "auxiliary_loss_clip": 0.01116098, "auxiliary_loss_mlp": 0.00770078, "balance_loss_clip": 1.04774415, "balance_loss_mlp": 1.00015235, "epoch": 0.48964376972794227, "flos": 33546958312320.0, "grad_norm": 1.880195910988658, "language_loss": 0.75596797, "learning_rate": 2.163973839444793e-06, "loss": 0.77482975, "num_input_tokens_seen": 175035695, "step": 8144, "time_per_iteration": 2.801825761795044 }, { "auxiliary_loss_clip": 0.01099436, "auxiliary_loss_mlp": 0.01031587, "balance_loss_clip": 1.04169714, "balance_loss_mlp": 1.01753187, "epoch": 0.48970389298061023, "flos": 22054035985920.0, "grad_norm": 1.9123659180679726, "language_loss": 0.75693774, "learning_rate": 2.1635856858163695e-06, "loss": 0.77824795, "num_input_tokens_seen": 175056425, "step": 8145, "time_per_iteration": 2.781550168991089 }, { "auxiliary_loss_clip": 0.01108869, "auxiliary_loss_mlp": 0.0077212, "balance_loss_clip": 1.04549527, "balance_loss_mlp": 1.00018287, "epoch": 0.4897640162332782, "flos": 20084299920000.0, "grad_norm": 1.6675270752681912, "language_loss": 0.80437362, "learning_rate": 2.163197525984761e-06, "loss": 0.82318354, "num_input_tokens_seen": 175074800, "step": 8146, "time_per_iteration": 2.699277400970459 }, { "auxiliary_loss_clip": 0.01109996, "auxiliary_loss_mlp": 0.01033581, "balance_loss_clip": 1.04312873, "balance_loss_mlp": 1.02007508, "epoch": 0.48982413948594616, "flos": 23806225330560.0, "grad_norm": 2.022171046548427, "language_loss": 0.74193209, "learning_rate": 2.162809359964687e-06, "loss": 0.76336789, "num_input_tokens_seen": 175094500, "step": 8147, "time_per_iteration": 2.732973337173462 }, { "auxiliary_loss_clip": 0.01095071, "auxiliary_loss_mlp": 0.01032519, "balance_loss_clip": 1.0448947, "balance_loss_mlp": 1.0193938, "epoch": 0.4898842627386142, "flos": 17639645207040.0, "grad_norm": 2.1017800501084882, "language_loss": 0.8286857, "learning_rate": 2.162421187770864e-06, "loss": 0.84996164, "num_input_tokens_seen": 175112920, "step": 8148, "time_per_iteration": 2.662179708480835 }, { "auxiliary_loss_clip": 0.01091374, "auxiliary_loss_mlp": 0.01033444, "balance_loss_clip": 1.04345882, "balance_loss_mlp": 1.0213387, "epoch": 0.48994438599128215, "flos": 16617914841600.0, "grad_norm": 1.9007753197415815, "language_loss": 0.74256468, "learning_rate": 2.162033009418015e-06, "loss": 0.76381284, "num_input_tokens_seen": 175129910, "step": 8149, "time_per_iteration": 2.7373321056365967 }, { "auxiliary_loss_clip": 0.01130985, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.04766726, "balance_loss_mlp": 1.02247095, "epoch": 0.4900045092439501, "flos": 26614834600320.0, "grad_norm": 1.7000980888808985, "language_loss": 0.76319683, "learning_rate": 2.1616448249208567e-06, "loss": 0.78487676, "num_input_tokens_seen": 175148705, "step": 8150, "time_per_iteration": 2.653003692626953 }, { "auxiliary_loss_clip": 0.01103787, "auxiliary_loss_mlp": 0.01035673, "balance_loss_clip": 1.04736936, "balance_loss_mlp": 1.02152276, "epoch": 0.4900646324966181, "flos": 19902125116800.0, "grad_norm": 2.127966402053614, "language_loss": 0.72754669, "learning_rate": 2.1612566342941106e-06, "loss": 0.7489413, "num_input_tokens_seen": 175167425, "step": 8151, "time_per_iteration": 2.7142715454101562 }, { "auxiliary_loss_clip": 0.01018676, "auxiliary_loss_mlp": 0.01008139, "balance_loss_clip": 1.02870607, "balance_loss_mlp": 1.00680435, "epoch": 0.49012475574928605, "flos": 59189620337280.0, "grad_norm": 0.8300028938034224, "language_loss": 0.54350889, "learning_rate": 2.1608684375524977e-06, "loss": 0.56377703, "num_input_tokens_seen": 175227985, "step": 8152, "time_per_iteration": 3.218646764755249 }, { "auxiliary_loss_clip": 0.01066533, "auxiliary_loss_mlp": 0.01034489, "balance_loss_clip": 1.04041779, "balance_loss_mlp": 1.02058959, "epoch": 0.490184879001954, "flos": 45259797657600.0, "grad_norm": 1.9767488244056508, "language_loss": 0.61212152, "learning_rate": 2.1604802347107364e-06, "loss": 0.6331318, "num_input_tokens_seen": 175251895, "step": 8153, "time_per_iteration": 3.043501615524292 }, { "auxiliary_loss_clip": 0.01091315, "auxiliary_loss_mlp": 0.01034977, "balance_loss_clip": 1.04408598, "balance_loss_mlp": 1.02139306, "epoch": 0.490245002254622, "flos": 28002135634560.0, "grad_norm": 1.494326859026801, "language_loss": 0.767699, "learning_rate": 2.160092025783549e-06, "loss": 0.78896195, "num_input_tokens_seen": 175272770, "step": 8154, "time_per_iteration": 2.783686399459839 }, { "auxiliary_loss_clip": 0.01032948, "auxiliary_loss_mlp": 0.01009488, "balance_loss_clip": 1.02573824, "balance_loss_mlp": 1.00805795, "epoch": 0.49030512550728994, "flos": 58951318533120.0, "grad_norm": 0.9569310885457037, "language_loss": 0.6699397, "learning_rate": 2.1597038107856564e-06, "loss": 0.69036406, "num_input_tokens_seen": 175336320, "step": 8155, "time_per_iteration": 3.2836861610412598 }, { "auxiliary_loss_clip": 0.01128627, "auxiliary_loss_mlp": 0.01033153, "balance_loss_clip": 1.04858041, "balance_loss_mlp": 1.01990271, "epoch": 0.4903652487599579, "flos": 19791843384960.0, "grad_norm": 1.7952288566158678, "language_loss": 0.76406527, "learning_rate": 2.1593155897317784e-06, "loss": 0.78568316, "num_input_tokens_seen": 175353540, "step": 8156, "time_per_iteration": 2.77978515625 }, { "auxiliary_loss_clip": 0.01115952, "auxiliary_loss_mlp": 0.01033945, "balance_loss_clip": 1.04693031, "balance_loss_mlp": 1.02066517, "epoch": 0.49042537201262587, "flos": 21762082241280.0, "grad_norm": 2.671892010748055, "language_loss": 0.83756495, "learning_rate": 2.1589273626366377e-06, "loss": 0.85906386, "num_input_tokens_seen": 175370445, "step": 8157, "time_per_iteration": 2.6860296726226807 }, { "auxiliary_loss_clip": 0.01116981, "auxiliary_loss_mlp": 0.0103483, "balance_loss_clip": 1.04626417, "balance_loss_mlp": 1.02103734, "epoch": 0.49048549526529384, "flos": 18953042008320.0, "grad_norm": 1.6916175452091182, "language_loss": 0.79447746, "learning_rate": 2.158539129514956e-06, "loss": 0.81599557, "num_input_tokens_seen": 175389020, "step": 8158, "time_per_iteration": 2.723398208618164 }, { "auxiliary_loss_clip": 0.01130092, "auxiliary_loss_mlp": 0.01036013, "balance_loss_clip": 1.0493114, "balance_loss_mlp": 1.02237535, "epoch": 0.4905456185179618, "flos": 26906393295360.0, "grad_norm": 1.5924994780725177, "language_loss": 0.69469124, "learning_rate": 2.158150890381454e-06, "loss": 0.71635228, "num_input_tokens_seen": 175409545, "step": 8159, "time_per_iteration": 2.685887575149536 }, { "auxiliary_loss_clip": 0.01109209, "auxiliary_loss_mlp": 0.01041597, "balance_loss_clip": 1.04416955, "balance_loss_mlp": 1.02719688, "epoch": 0.49060574177062977, "flos": 20412343854720.0, "grad_norm": 1.8488353997421354, "language_loss": 0.73372805, "learning_rate": 2.157762645250854e-06, "loss": 0.75523615, "num_input_tokens_seen": 175429335, "step": 8160, "time_per_iteration": 2.7002642154693604 }, { "auxiliary_loss_clip": 0.01111433, "auxiliary_loss_mlp": 0.01040851, "balance_loss_clip": 1.04374194, "balance_loss_mlp": 1.02655184, "epoch": 0.4906658650232978, "flos": 17493704248320.0, "grad_norm": 4.058452856445761, "language_loss": 0.71791285, "learning_rate": 2.1573743941378796e-06, "loss": 0.73943567, "num_input_tokens_seen": 175446955, "step": 8161, "time_per_iteration": 2.641211748123169 }, { "auxiliary_loss_clip": 0.01077408, "auxiliary_loss_mlp": 0.01036857, "balance_loss_clip": 1.04114866, "balance_loss_mlp": 1.02337408, "epoch": 0.49072598827596575, "flos": 26614439550720.0, "grad_norm": 1.5881872934975843, "language_loss": 0.68676394, "learning_rate": 2.1569861370572517e-06, "loss": 0.7079066, "num_input_tokens_seen": 175468195, "step": 8162, "time_per_iteration": 2.7768666744232178 }, { "auxiliary_loss_clip": 0.01114289, "auxiliary_loss_mlp": 0.01037181, "balance_loss_clip": 1.04699993, "balance_loss_mlp": 1.02219641, "epoch": 0.4907861115286337, "flos": 20412595249920.0, "grad_norm": 1.6090900616469643, "language_loss": 0.63697332, "learning_rate": 2.1565978740236944e-06, "loss": 0.65848798, "num_input_tokens_seen": 175487455, "step": 8163, "time_per_iteration": 2.658141851425171 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01032891, "balance_loss_clip": 1.03996313, "balance_loss_mlp": 1.01987886, "epoch": 0.4908462347813017, "flos": 14064271286400.0, "grad_norm": 2.5242130171230954, "language_loss": 0.77383208, "learning_rate": 2.1562096050519293e-06, "loss": 0.79502106, "num_input_tokens_seen": 175504450, "step": 8164, "time_per_iteration": 2.6626484394073486 }, { "auxiliary_loss_clip": 0.01110027, "auxiliary_loss_mlp": 0.01037706, "balance_loss_clip": 1.04298282, "balance_loss_mlp": 1.0221138, "epoch": 0.49090635803396965, "flos": 18735100237440.0, "grad_norm": 1.6753117148295888, "language_loss": 0.76749474, "learning_rate": 2.1558213301566806e-06, "loss": 0.78897208, "num_input_tokens_seen": 175523600, "step": 8165, "time_per_iteration": 2.5757079124450684 }, { "auxiliary_loss_clip": 0.0110394, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.04666007, "balance_loss_mlp": 1.02205336, "epoch": 0.4909664812866376, "flos": 20558500295040.0, "grad_norm": 1.5531816235742995, "language_loss": 0.77461708, "learning_rate": 2.1554330493526716e-06, "loss": 0.79601395, "num_input_tokens_seen": 175542720, "step": 8166, "time_per_iteration": 2.7169244289398193 }, { "auxiliary_loss_clip": 0.01040608, "auxiliary_loss_mlp": 0.00998968, "balance_loss_clip": 1.02393854, "balance_loss_mlp": 0.99768084, "epoch": 0.4910266045393056, "flos": 54684017948160.0, "grad_norm": 0.7914566078875801, "language_loss": 0.54175258, "learning_rate": 2.1550447626546253e-06, "loss": 0.56214833, "num_input_tokens_seen": 175598640, "step": 8167, "time_per_iteration": 3.192706823348999 }, { "auxiliary_loss_clip": 0.01081549, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.04554164, "balance_loss_mlp": 1.02288687, "epoch": 0.49108672779197354, "flos": 16246454342400.0, "grad_norm": 1.702915470367474, "language_loss": 0.85894108, "learning_rate": 2.1546564700772665e-06, "loss": 0.88012105, "num_input_tokens_seen": 175615675, "step": 8168, "time_per_iteration": 2.7353274822235107 }, { "auxiliary_loss_clip": 0.01107152, "auxiliary_loss_mlp": 0.01045094, "balance_loss_clip": 1.04374826, "balance_loss_mlp": 1.030586, "epoch": 0.4911468510446415, "flos": 19825419623040.0, "grad_norm": 1.7298624053450853, "language_loss": 0.73407066, "learning_rate": 2.1542681716353193e-06, "loss": 0.75559318, "num_input_tokens_seen": 175632255, "step": 8169, "time_per_iteration": 5.773583173751831 }, { "auxiliary_loss_clip": 0.01112799, "auxiliary_loss_mlp": 0.01029653, "balance_loss_clip": 1.04443777, "balance_loss_mlp": 1.01692092, "epoch": 0.4912069742973095, "flos": 21212684743680.0, "grad_norm": 1.4410309608870682, "language_loss": 0.77824241, "learning_rate": 2.1538798673435068e-06, "loss": 0.79966694, "num_input_tokens_seen": 175651625, "step": 8170, "time_per_iteration": 2.6583240032196045 }, { "auxiliary_loss_clip": 0.01096689, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.04164565, "balance_loss_mlp": 1.02643037, "epoch": 0.49126709754997744, "flos": 19537129065600.0, "grad_norm": 2.2423824181328853, "language_loss": 0.76314211, "learning_rate": 2.1534915572165545e-06, "loss": 0.78449798, "num_input_tokens_seen": 175669265, "step": 8171, "time_per_iteration": 4.3524169921875 }, { "auxiliary_loss_clip": 0.01104096, "auxiliary_loss_mlp": 0.01036347, "balance_loss_clip": 1.04284763, "balance_loss_mlp": 1.02299559, "epoch": 0.4913272208026454, "flos": 12239686080000.0, "grad_norm": 1.898078833449508, "language_loss": 0.82055932, "learning_rate": 2.1531032412691875e-06, "loss": 0.84196377, "num_input_tokens_seen": 175686065, "step": 8172, "time_per_iteration": 4.201699495315552 }, { "auxiliary_loss_clip": 0.0104227, "auxiliary_loss_mlp": 0.01009809, "balance_loss_clip": 1.02604604, "balance_loss_mlp": 1.00842655, "epoch": 0.49138734405531337, "flos": 65465871661440.0, "grad_norm": 0.6872688544677212, "language_loss": 0.53258997, "learning_rate": 2.1527149195161295e-06, "loss": 0.55311078, "num_input_tokens_seen": 175748595, "step": 8173, "time_per_iteration": 3.1827917098999023 }, { "auxiliary_loss_clip": 0.0111451, "auxiliary_loss_mlp": 0.00771219, "balance_loss_clip": 1.04312336, "balance_loss_mlp": 1.00013208, "epoch": 0.4914474673079814, "flos": 18439052342400.0, "grad_norm": 2.1937948702767054, "language_loss": 0.63081181, "learning_rate": 2.152326591972107e-06, "loss": 0.64966911, "num_input_tokens_seen": 175766770, "step": 8174, "time_per_iteration": 2.591662883758545 }, { "auxiliary_loss_clip": 0.01086287, "auxiliary_loss_mlp": 0.01044728, "balance_loss_clip": 1.04296112, "balance_loss_mlp": 1.02985096, "epoch": 0.49150759056064935, "flos": 21685053525120.0, "grad_norm": 1.9252900771693722, "language_loss": 0.69252932, "learning_rate": 2.1519382586518445e-06, "loss": 0.71383941, "num_input_tokens_seen": 175783605, "step": 8175, "time_per_iteration": 2.7286670207977295 }, { "auxiliary_loss_clip": 0.01112428, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.0438236, "balance_loss_mlp": 1.02018952, "epoch": 0.4915677138133173, "flos": 22382439056640.0, "grad_norm": 1.7316891792167346, "language_loss": 0.74424642, "learning_rate": 2.151549919570068e-06, "loss": 0.76570022, "num_input_tokens_seen": 175801390, "step": 8176, "time_per_iteration": 2.623328685760498 }, { "auxiliary_loss_clip": 0.01117272, "auxiliary_loss_mlp": 0.0104375, "balance_loss_clip": 1.04691124, "balance_loss_mlp": 1.03022528, "epoch": 0.4916278370659853, "flos": 18402890325120.0, "grad_norm": 1.776030453931397, "language_loss": 0.70309961, "learning_rate": 2.1511615747415036e-06, "loss": 0.72470981, "num_input_tokens_seen": 175819830, "step": 8177, "time_per_iteration": 2.642073154449463 }, { "auxiliary_loss_clip": 0.01031811, "auxiliary_loss_mlp": 0.00752155, "balance_loss_clip": 1.02581143, "balance_loss_mlp": 0.99997473, "epoch": 0.49168796031865325, "flos": 66609124715520.0, "grad_norm": 0.6890109431226723, "language_loss": 0.46192822, "learning_rate": 2.150773224180877e-06, "loss": 0.47976786, "num_input_tokens_seen": 175881765, "step": 8178, "time_per_iteration": 3.195594072341919 }, { "auxiliary_loss_clip": 0.0112992, "auxiliary_loss_mlp": 0.01036689, "balance_loss_clip": 1.04735565, "balance_loss_mlp": 1.02215147, "epoch": 0.4917480835713212, "flos": 20959335141120.0, "grad_norm": 1.748461689040465, "language_loss": 0.65961659, "learning_rate": 2.1503848679029147e-06, "loss": 0.6812827, "num_input_tokens_seen": 175901795, "step": 8179, "time_per_iteration": 2.675170421600342 }, { "auxiliary_loss_clip": 0.01036062, "auxiliary_loss_mlp": 0.01047888, "balance_loss_clip": 1.03444839, "balance_loss_mlp": 1.031497, "epoch": 0.4918082068239892, "flos": 15772900412160.0, "grad_norm": 2.3413868243180493, "language_loss": 0.70163, "learning_rate": 2.149996505922343e-06, "loss": 0.72246957, "num_input_tokens_seen": 175917770, "step": 8180, "time_per_iteration": 2.9436681270599365 }, { "auxiliary_loss_clip": 0.01099418, "auxiliary_loss_mlp": 0.01037201, "balance_loss_clip": 1.04268646, "balance_loss_mlp": 1.02306247, "epoch": 0.49186833007665715, "flos": 24604806453120.0, "grad_norm": 1.915055420772654, "language_loss": 0.84369922, "learning_rate": 2.1496081382538895e-06, "loss": 0.86506534, "num_input_tokens_seen": 175937000, "step": 8181, "time_per_iteration": 2.8556039333343506 }, { "auxiliary_loss_clip": 0.01125975, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.04886341, "balance_loss_mlp": 1.0226841, "epoch": 0.4919284533293251, "flos": 22090557139200.0, "grad_norm": 2.841846979456106, "language_loss": 0.72482812, "learning_rate": 2.1492197649122793e-06, "loss": 0.74644387, "num_input_tokens_seen": 175955170, "step": 8182, "time_per_iteration": 2.5908985137939453 }, { "auxiliary_loss_clip": 0.01088743, "auxiliary_loss_mlp": 0.01035989, "balance_loss_clip": 1.04323542, "balance_loss_mlp": 1.0227685, "epoch": 0.4919885765819931, "flos": 23368043318400.0, "grad_norm": 2.038591418033226, "language_loss": 0.72608387, "learning_rate": 2.1488313859122412e-06, "loss": 0.74733126, "num_input_tokens_seen": 175973725, "step": 8183, "time_per_iteration": 2.7704007625579834 }, { "auxiliary_loss_clip": 0.0106529, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.03853834, "balance_loss_mlp": 1.0204556, "epoch": 0.49204869983466104, "flos": 21360493209600.0, "grad_norm": 3.5725391309360406, "language_loss": 0.77354276, "learning_rate": 2.1484430012685015e-06, "loss": 0.79454923, "num_input_tokens_seen": 175993885, "step": 8184, "time_per_iteration": 2.8195126056671143 }, { "auxiliary_loss_clip": 0.01094147, "auxiliary_loss_mlp": 0.01040773, "balance_loss_clip": 1.04233742, "balance_loss_mlp": 1.02739143, "epoch": 0.492108823087329, "flos": 21142695093120.0, "grad_norm": 1.8939343643350832, "language_loss": 0.70917577, "learning_rate": 2.148054610995789e-06, "loss": 0.73052496, "num_input_tokens_seen": 176014210, "step": 8185, "time_per_iteration": 2.678464412689209 }, { "auxiliary_loss_clip": 0.01108334, "auxiliary_loss_mlp": 0.01037918, "balance_loss_clip": 1.0468477, "balance_loss_mlp": 1.02306461, "epoch": 0.49216894633999697, "flos": 25116605389440.0, "grad_norm": 1.7900274786799464, "language_loss": 0.75134045, "learning_rate": 2.147666215108831e-06, "loss": 0.77280295, "num_input_tokens_seen": 176033890, "step": 8186, "time_per_iteration": 2.754204273223877 }, { "auxiliary_loss_clip": 0.01116557, "auxiliary_loss_mlp": 0.01034757, "balance_loss_clip": 1.04770708, "balance_loss_mlp": 1.02050531, "epoch": 0.49222906959266494, "flos": 22637943475200.0, "grad_norm": 2.9803647414716945, "language_loss": 0.67526996, "learning_rate": 2.1472778136223545e-06, "loss": 0.69678307, "num_input_tokens_seen": 176052720, "step": 8187, "time_per_iteration": 2.6845459938049316 }, { "auxiliary_loss_clip": 0.0108036, "auxiliary_loss_mlp": 0.01036841, "balance_loss_clip": 1.04077077, "balance_loss_mlp": 1.02301288, "epoch": 0.49228919284533296, "flos": 20410548174720.0, "grad_norm": 1.410632675975, "language_loss": 0.67109811, "learning_rate": 2.1468894065510894e-06, "loss": 0.6922701, "num_input_tokens_seen": 176072545, "step": 8188, "time_per_iteration": 2.8322603702545166 }, { "auxiliary_loss_clip": 0.01119978, "auxiliary_loss_mlp": 0.01034509, "balance_loss_clip": 1.04967701, "balance_loss_mlp": 1.02131248, "epoch": 0.4923493160980009, "flos": 27122359818240.0, "grad_norm": 1.8145698664310643, "language_loss": 0.74643195, "learning_rate": 2.1465009939097623e-06, "loss": 0.76797676, "num_input_tokens_seen": 176091490, "step": 8189, "time_per_iteration": 2.700728178024292 }, { "auxiliary_loss_clip": 0.01102804, "auxiliary_loss_mlp": 0.01027439, "balance_loss_clip": 1.04349804, "balance_loss_mlp": 1.0138967, "epoch": 0.4924094393506689, "flos": 35736683224320.0, "grad_norm": 1.5012400452063497, "language_loss": 0.63989937, "learning_rate": 2.146112575713104e-06, "loss": 0.66120183, "num_input_tokens_seen": 176113200, "step": 8190, "time_per_iteration": 2.781034231185913 }, { "auxiliary_loss_clip": 0.01127618, "auxiliary_loss_mlp": 0.0103068, "balance_loss_clip": 1.04802811, "balance_loss_mlp": 1.01666641, "epoch": 0.49246956260333685, "flos": 20412487509120.0, "grad_norm": 2.59854956867769, "language_loss": 0.71723747, "learning_rate": 2.1457241519758413e-06, "loss": 0.73882031, "num_input_tokens_seen": 176132485, "step": 8191, "time_per_iteration": 2.6378936767578125 }, { "auxiliary_loss_clip": 0.01125365, "auxiliary_loss_mlp": 0.00771087, "balance_loss_clip": 1.04543817, "balance_loss_mlp": 1.00005293, "epoch": 0.4925296858560048, "flos": 38976938231040.0, "grad_norm": 1.5444009886503365, "language_loss": 0.71964842, "learning_rate": 2.1453357227127043e-06, "loss": 0.73861289, "num_input_tokens_seen": 176155755, "step": 8192, "time_per_iteration": 2.748840570449829 }, { "auxiliary_loss_clip": 0.01029185, "auxiliary_loss_mlp": 0.01001084, "balance_loss_clip": 1.02257538, "balance_loss_mlp": 0.9996711, "epoch": 0.4925898091086728, "flos": 64278917712000.0, "grad_norm": 0.718294486843201, "language_loss": 0.52137887, "learning_rate": 2.1449472879384224e-06, "loss": 0.54168153, "num_input_tokens_seen": 176216295, "step": 8193, "time_per_iteration": 3.264312267303467 }, { "auxiliary_loss_clip": 0.01125829, "auxiliary_loss_mlp": 0.01041308, "balance_loss_clip": 1.04740691, "balance_loss_mlp": 1.02760482, "epoch": 0.49264993236134075, "flos": 23036372110080.0, "grad_norm": 1.4111181716707888, "language_loss": 0.76839447, "learning_rate": 2.1445588476677246e-06, "loss": 0.79006582, "num_input_tokens_seen": 176235925, "step": 8194, "time_per_iteration": 2.7086539268493652 }, { "auxiliary_loss_clip": 0.01098073, "auxiliary_loss_mlp": 0.0103376, "balance_loss_clip": 1.04026222, "balance_loss_mlp": 1.02031338, "epoch": 0.4927100556140087, "flos": 24718212668160.0, "grad_norm": 1.9420104205554047, "language_loss": 0.70233512, "learning_rate": 2.144170401915341e-06, "loss": 0.72365344, "num_input_tokens_seen": 176253865, "step": 8195, "time_per_iteration": 2.6881814002990723 }, { "auxiliary_loss_clip": 0.01087059, "auxiliary_loss_mlp": 0.01033387, "balance_loss_clip": 1.04724264, "balance_loss_mlp": 1.02013052, "epoch": 0.4927701788666767, "flos": 23505544581120.0, "grad_norm": 2.097647655801467, "language_loss": 0.81090224, "learning_rate": 2.143781950696001e-06, "loss": 0.83210671, "num_input_tokens_seen": 176271525, "step": 8196, "time_per_iteration": 2.7997779846191406 }, { "auxiliary_loss_clip": 0.01092387, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.04048955, "balance_loss_mlp": 1.0212965, "epoch": 0.49283030211934464, "flos": 22928891639040.0, "grad_norm": 1.9754651417860998, "language_loss": 0.70963365, "learning_rate": 2.1433934940244356e-06, "loss": 0.73091799, "num_input_tokens_seen": 176290810, "step": 8197, "time_per_iteration": 2.687640428543091 }, { "auxiliary_loss_clip": 0.01113685, "auxiliary_loss_mlp": 0.01037816, "balance_loss_clip": 1.04734302, "balance_loss_mlp": 1.0245595, "epoch": 0.4928904253720126, "flos": 16873024210560.0, "grad_norm": 2.0854468186505133, "language_loss": 0.84519106, "learning_rate": 2.143005031915374e-06, "loss": 0.86670601, "num_input_tokens_seen": 176309165, "step": 8198, "time_per_iteration": 2.660125255584717 }, { "auxiliary_loss_clip": 0.01120431, "auxiliary_loss_mlp": 0.01037402, "balance_loss_clip": 1.04784405, "balance_loss_mlp": 1.02326965, "epoch": 0.4929505486246806, "flos": 14866551509760.0, "grad_norm": 1.8081780640264744, "language_loss": 0.76137328, "learning_rate": 2.1426165643835467e-06, "loss": 0.78295165, "num_input_tokens_seen": 176324960, "step": 8199, "time_per_iteration": 2.6528286933898926 }, { "auxiliary_loss_clip": 0.0110111, "auxiliary_loss_mlp": 0.0103715, "balance_loss_clip": 1.0420711, "balance_loss_mlp": 1.02215934, "epoch": 0.49301067187734854, "flos": 23842351434240.0, "grad_norm": 1.5743655623972015, "language_loss": 0.60060918, "learning_rate": 2.1422280914436864e-06, "loss": 0.62199175, "num_input_tokens_seen": 176346195, "step": 8200, "time_per_iteration": 2.725208044052124 }, { "auxiliary_loss_clip": 0.01112367, "auxiliary_loss_mlp": 0.01042559, "balance_loss_clip": 1.04529691, "balance_loss_mlp": 1.0288918, "epoch": 0.49307079513001656, "flos": 22491284244480.0, "grad_norm": 1.489817328340962, "language_loss": 0.79219347, "learning_rate": 2.1418396131105213e-06, "loss": 0.81374276, "num_input_tokens_seen": 176366735, "step": 8201, "time_per_iteration": 2.6749329566955566 }, { "auxiliary_loss_clip": 0.0112059, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.04529119, "balance_loss_mlp": 1.02063608, "epoch": 0.4931309183826845, "flos": 15924587546880.0, "grad_norm": 2.8764138588073527, "language_loss": 0.67214566, "learning_rate": 2.141451129398785e-06, "loss": 0.69371456, "num_input_tokens_seen": 176384475, "step": 8202, "time_per_iteration": 2.6964852809906006 }, { "auxiliary_loss_clip": 0.01101254, "auxiliary_loss_mlp": 0.01032575, "balance_loss_clip": 1.04416037, "balance_loss_mlp": 1.01929486, "epoch": 0.4931910416353525, "flos": 27309059735040.0, "grad_norm": 2.180124290012348, "language_loss": 0.75387114, "learning_rate": 2.1410626403232076e-06, "loss": 0.77520943, "num_input_tokens_seen": 176402645, "step": 8203, "time_per_iteration": 2.725586175918579 }, { "auxiliary_loss_clip": 0.01070891, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.04055309, "balance_loss_mlp": 1.02355599, "epoch": 0.49325116488802045, "flos": 20806139635200.0, "grad_norm": 2.514240753505036, "language_loss": 0.8037259, "learning_rate": 2.1406741458985197e-06, "loss": 0.82481205, "num_input_tokens_seen": 176416715, "step": 8204, "time_per_iteration": 2.6802115440368652 }, { "auxiliary_loss_clip": 0.01112932, "auxiliary_loss_mlp": 0.01040079, "balance_loss_clip": 1.04543495, "balance_loss_mlp": 1.02662015, "epoch": 0.4933112881406884, "flos": 19865963099520.0, "grad_norm": 1.919360097124168, "language_loss": 0.65891969, "learning_rate": 2.140285646139455e-06, "loss": 0.68044984, "num_input_tokens_seen": 176435755, "step": 8205, "time_per_iteration": 2.6556243896484375 }, { "auxiliary_loss_clip": 0.01131728, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.04643822, "balance_loss_mlp": 1.02157259, "epoch": 0.4933714113933564, "flos": 21827977741440.0, "grad_norm": 2.0939603582763207, "language_loss": 0.66682738, "learning_rate": 2.139897141060744e-06, "loss": 0.68851495, "num_input_tokens_seen": 176453915, "step": 8206, "time_per_iteration": 2.6004998683929443 }, { "auxiliary_loss_clip": 0.01078434, "auxiliary_loss_mlp": 0.01042651, "balance_loss_clip": 1.04006064, "balance_loss_mlp": 1.02803612, "epoch": 0.49343153464602435, "flos": 27890130049920.0, "grad_norm": 1.7303473596412533, "language_loss": 0.76393557, "learning_rate": 2.1395086306771196e-06, "loss": 0.78514642, "num_input_tokens_seen": 176475175, "step": 8207, "time_per_iteration": 2.7545268535614014 }, { "auxiliary_loss_clip": 0.01104435, "auxiliary_loss_mlp": 0.01037384, "balance_loss_clip": 1.04703426, "balance_loss_mlp": 1.02245331, "epoch": 0.4934916578986923, "flos": 24681080983680.0, "grad_norm": 2.36511926609042, "language_loss": 0.60212123, "learning_rate": 2.1391201150033147e-06, "loss": 0.62353945, "num_input_tokens_seen": 176494250, "step": 8208, "time_per_iteration": 4.556094408035278 }, { "auxiliary_loss_clip": 0.01108642, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.04619265, "balance_loss_mlp": 1.01990545, "epoch": 0.4935517811513603, "flos": 23405139089280.0, "grad_norm": 1.7507431161374047, "language_loss": 0.78938925, "learning_rate": 2.1387315940540598e-06, "loss": 0.81082511, "num_input_tokens_seen": 176513325, "step": 8209, "time_per_iteration": 4.171698093414307 }, { "auxiliary_loss_clip": 0.01094204, "auxiliary_loss_mlp": 0.00774879, "balance_loss_clip": 1.03905034, "balance_loss_mlp": 1.00007224, "epoch": 0.49361190440402825, "flos": 21944508439680.0, "grad_norm": 2.001694580419455, "language_loss": 0.79098332, "learning_rate": 2.138343067844089e-06, "loss": 0.80967414, "num_input_tokens_seen": 176532915, "step": 8210, "time_per_iteration": 4.38470196723938 }, { "auxiliary_loss_clip": 0.01113566, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.04458427, "balance_loss_mlp": 1.02467823, "epoch": 0.4936720276566962, "flos": 25115671635840.0, "grad_norm": 1.6707024820379262, "language_loss": 0.81313854, "learning_rate": 2.1379545363881363e-06, "loss": 0.83466691, "num_input_tokens_seen": 176552775, "step": 8211, "time_per_iteration": 4.290592193603516 }, { "auxiliary_loss_clip": 0.01082515, "auxiliary_loss_mlp": 0.01050398, "balance_loss_clip": 1.04066169, "balance_loss_mlp": 1.03376865, "epoch": 0.4937321509093642, "flos": 26358935132160.0, "grad_norm": 2.2904212815365477, "language_loss": 0.9144789, "learning_rate": 2.137565999700933e-06, "loss": 0.93580806, "num_input_tokens_seen": 176572185, "step": 8212, "time_per_iteration": 2.77516508102417 }, { "auxiliary_loss_clip": 0.010785, "auxiliary_loss_mlp": 0.01041938, "balance_loss_clip": 1.03849816, "balance_loss_mlp": 1.02666783, "epoch": 0.49379227416203214, "flos": 22961390469120.0, "grad_norm": 2.314209741920176, "language_loss": 0.65430582, "learning_rate": 2.1371774577972138e-06, "loss": 0.67551017, "num_input_tokens_seen": 176591490, "step": 8213, "time_per_iteration": 2.844672203063965 }, { "auxiliary_loss_clip": 0.01074353, "auxiliary_loss_mlp": 0.00772712, "balance_loss_clip": 1.03954375, "balance_loss_mlp": 1.00013876, "epoch": 0.49385239741470016, "flos": 32489101843200.0, "grad_norm": 1.8844803433311228, "language_loss": 0.7592994, "learning_rate": 2.136788910691711e-06, "loss": 0.77777004, "num_input_tokens_seen": 176612715, "step": 8214, "time_per_iteration": 2.828538179397583 }, { "auxiliary_loss_clip": 0.01131168, "auxiliary_loss_mlp": 0.01038594, "balance_loss_clip": 1.0492506, "balance_loss_mlp": 1.02410388, "epoch": 0.4939125206673681, "flos": 22492864442880.0, "grad_norm": 2.152096163807918, "language_loss": 0.84490359, "learning_rate": 2.1364003583991594e-06, "loss": 0.86660123, "num_input_tokens_seen": 176631950, "step": 8215, "time_per_iteration": 2.6413228511810303 }, { "auxiliary_loss_clip": 0.01108159, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.04206347, "balance_loss_mlp": 1.02092147, "epoch": 0.4939726439200361, "flos": 31176351486720.0, "grad_norm": 1.5888417840027016, "language_loss": 0.83245987, "learning_rate": 2.136011800934292e-06, "loss": 0.8538785, "num_input_tokens_seen": 176653060, "step": 8216, "time_per_iteration": 2.67913818359375 }, { "auxiliary_loss_clip": 0.01097989, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.04419255, "balance_loss_mlp": 1.02112412, "epoch": 0.49403276717270406, "flos": 22674213233280.0, "grad_norm": 2.8019860461659087, "language_loss": 0.74546432, "learning_rate": 2.1356232383118442e-06, "loss": 0.76678985, "num_input_tokens_seen": 176673895, "step": 8217, "time_per_iteration": 2.686866283416748 }, { "auxiliary_loss_clip": 0.0112431, "auxiliary_loss_mlp": 0.00771315, "balance_loss_clip": 1.04717755, "balance_loss_mlp": 1.00011575, "epoch": 0.494092890425372, "flos": 20741070147840.0, "grad_norm": 1.5679905275329922, "language_loss": 0.78933907, "learning_rate": 2.1352346705465494e-06, "loss": 0.80829537, "num_input_tokens_seen": 176692550, "step": 8218, "time_per_iteration": 2.6126081943511963 }, { "auxiliary_loss_clip": 0.01073156, "auxiliary_loss_mlp": 0.00770777, "balance_loss_clip": 1.03962803, "balance_loss_mlp": 1.000103, "epoch": 0.49415301367804, "flos": 18369026778240.0, "grad_norm": 2.059466953332075, "language_loss": 0.77003837, "learning_rate": 2.134846097653142e-06, "loss": 0.78847766, "num_input_tokens_seen": 176709335, "step": 8219, "time_per_iteration": 2.705432176589966 }, { "auxiliary_loss_clip": 0.01103123, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.04458046, "balance_loss_mlp": 1.02009845, "epoch": 0.49421313693070795, "flos": 17530620451200.0, "grad_norm": 1.9177646932354293, "language_loss": 0.62838733, "learning_rate": 2.134457519646357e-06, "loss": 0.64975989, "num_input_tokens_seen": 176727715, "step": 8220, "time_per_iteration": 2.615745782852173 }, { "auxiliary_loss_clip": 0.01124834, "auxiliary_loss_mlp": 0.01032605, "balance_loss_clip": 1.04509032, "balance_loss_mlp": 1.01844347, "epoch": 0.4942732601833759, "flos": 20812173120000.0, "grad_norm": 1.9687050610151906, "language_loss": 0.72233951, "learning_rate": 2.1340689365409296e-06, "loss": 0.74391389, "num_input_tokens_seen": 176747530, "step": 8221, "time_per_iteration": 2.6178054809570312 }, { "auxiliary_loss_clip": 0.01085939, "auxiliary_loss_mlp": 0.01035447, "balance_loss_clip": 1.04544675, "balance_loss_mlp": 1.02218497, "epoch": 0.4943333834360439, "flos": 15048941794560.0, "grad_norm": 1.861092907129918, "language_loss": 0.792252, "learning_rate": 2.133680348351595e-06, "loss": 0.81346589, "num_input_tokens_seen": 176765260, "step": 8222, "time_per_iteration": 2.679504632949829 }, { "auxiliary_loss_clip": 0.01115599, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.04686999, "balance_loss_mlp": 1.022048, "epoch": 0.49439350668871185, "flos": 16070420764800.0, "grad_norm": 2.9899447612273784, "language_loss": 0.72679973, "learning_rate": 2.133291755093088e-06, "loss": 0.7483207, "num_input_tokens_seen": 176781770, "step": 8223, "time_per_iteration": 2.581552028656006 }, { "auxiliary_loss_clip": 0.01116938, "auxiliary_loss_mlp": 0.01040425, "balance_loss_clip": 1.04635167, "balance_loss_mlp": 1.0257324, "epoch": 0.4944536299413798, "flos": 20880079781760.0, "grad_norm": 2.0609443486265784, "language_loss": 0.75248039, "learning_rate": 2.132903156780144e-06, "loss": 0.77405405, "num_input_tokens_seen": 176800655, "step": 8224, "time_per_iteration": 2.6427581310272217 }, { "auxiliary_loss_clip": 0.0110423, "auxiliary_loss_mlp": 0.01033189, "balance_loss_clip": 1.04815972, "balance_loss_mlp": 1.01925385, "epoch": 0.4945137531940478, "flos": 26608908856320.0, "grad_norm": 2.070444808683487, "language_loss": 0.6428299, "learning_rate": 2.1325145534274997e-06, "loss": 0.66420412, "num_input_tokens_seen": 176820610, "step": 8225, "time_per_iteration": 2.685084104537964 }, { "auxiliary_loss_clip": 0.01105728, "auxiliary_loss_mlp": 0.01034446, "balance_loss_clip": 1.04689407, "balance_loss_mlp": 1.02097511, "epoch": 0.49457387644671574, "flos": 23988148738560.0, "grad_norm": 2.0654038990553834, "language_loss": 0.76539797, "learning_rate": 2.1321259450498893e-06, "loss": 0.78679967, "num_input_tokens_seen": 176840520, "step": 8226, "time_per_iteration": 2.776888132095337 }, { "auxiliary_loss_clip": 0.01130995, "auxiliary_loss_mlp": 0.01043657, "balance_loss_clip": 1.04843736, "balance_loss_mlp": 1.02849376, "epoch": 0.49463399969938376, "flos": 26976598427520.0, "grad_norm": 1.7138853183765776, "language_loss": 0.71274078, "learning_rate": 2.131737331662051e-06, "loss": 0.7344873, "num_input_tokens_seen": 176860265, "step": 8227, "time_per_iteration": 2.6920416355133057 }, { "auxiliary_loss_clip": 0.01109805, "auxiliary_loss_mlp": 0.01042947, "balance_loss_clip": 1.04749131, "balance_loss_mlp": 1.02879047, "epoch": 0.49469412295205173, "flos": 29681534067840.0, "grad_norm": 1.5610614491128025, "language_loss": 0.7156117, "learning_rate": 2.131348713278718e-06, "loss": 0.73713928, "num_input_tokens_seen": 176882910, "step": 8228, "time_per_iteration": 2.7586421966552734 }, { "auxiliary_loss_clip": 0.01126513, "auxiliary_loss_mlp": 0.01030651, "balance_loss_clip": 1.04834974, "balance_loss_mlp": 1.01664948, "epoch": 0.4947542462047197, "flos": 24131791226880.0, "grad_norm": 1.7062154527873281, "language_loss": 0.83690989, "learning_rate": 2.1309600899146304e-06, "loss": 0.85848153, "num_input_tokens_seen": 176903030, "step": 8229, "time_per_iteration": 2.643385887145996 }, { "auxiliary_loss_clip": 0.01117283, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.04470325, "balance_loss_mlp": 1.0201304, "epoch": 0.49481436945738766, "flos": 20045049333120.0, "grad_norm": 1.8291146066570236, "language_loss": 0.74686736, "learning_rate": 2.1305714615845227e-06, "loss": 0.76839477, "num_input_tokens_seen": 176919025, "step": 8230, "time_per_iteration": 2.6726033687591553 }, { "auxiliary_loss_clip": 0.01112312, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.04797947, "balance_loss_mlp": 1.01941717, "epoch": 0.4948744927100556, "flos": 15669550005120.0, "grad_norm": 1.946821067065893, "language_loss": 0.79830235, "learning_rate": 2.1301828283031314e-06, "loss": 0.81975138, "num_input_tokens_seen": 176937945, "step": 8231, "time_per_iteration": 2.627202272415161 }, { "auxiliary_loss_clip": 0.01038701, "auxiliary_loss_mlp": 0.01000467, "balance_loss_clip": 1.02304196, "balance_loss_mlp": 0.99924535, "epoch": 0.4949346159627236, "flos": 68872071502080.0, "grad_norm": 0.7441317598934056, "language_loss": 0.60252988, "learning_rate": 2.1297941900851944e-06, "loss": 0.62292159, "num_input_tokens_seen": 177004575, "step": 8232, "time_per_iteration": 3.299022912979126 }, { "auxiliary_loss_clip": 0.01103975, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.04270494, "balance_loss_mlp": 1.0220201, "epoch": 0.49499473921539155, "flos": 24790285307520.0, "grad_norm": 1.6243536723265515, "language_loss": 0.69376481, "learning_rate": 2.1294055469454496e-06, "loss": 0.71517295, "num_input_tokens_seen": 177024155, "step": 8233, "time_per_iteration": 2.7124898433685303 }, { "auxiliary_loss_clip": 0.01069129, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.03902805, "balance_loss_mlp": 1.02584291, "epoch": 0.4950548624680595, "flos": 32707905540480.0, "grad_norm": 1.998308286461765, "language_loss": 0.66344726, "learning_rate": 2.129016898898633e-06, "loss": 0.68455309, "num_input_tokens_seen": 177046185, "step": 8234, "time_per_iteration": 2.7932980060577393 }, { "auxiliary_loss_clip": 0.01031932, "auxiliary_loss_mlp": 0.01001723, "balance_loss_clip": 1.02630067, "balance_loss_mlp": 1.00048304, "epoch": 0.4951149857207275, "flos": 50082173066880.0, "grad_norm": 0.7974470380945157, "language_loss": 0.58048564, "learning_rate": 2.128628245959482e-06, "loss": 0.60082221, "num_input_tokens_seen": 177099025, "step": 8235, "time_per_iteration": 3.095088481903076 }, { "auxiliary_loss_clip": 0.01096356, "auxiliary_loss_mlp": 0.01043085, "balance_loss_clip": 1.0431416, "balance_loss_mlp": 1.02861345, "epoch": 0.49517510897339545, "flos": 22236785406720.0, "grad_norm": 1.5745194755893521, "language_loss": 0.77200663, "learning_rate": 2.1282395881427355e-06, "loss": 0.793401, "num_input_tokens_seen": 177118365, "step": 8236, "time_per_iteration": 2.7678022384643555 }, { "auxiliary_loss_clip": 0.01081616, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.0420413, "balance_loss_mlp": 1.02397156, "epoch": 0.4952352322260634, "flos": 25374120969600.0, "grad_norm": 1.6979000405196067, "language_loss": 0.73080051, "learning_rate": 2.1278509254631315e-06, "loss": 0.75199521, "num_input_tokens_seen": 177136415, "step": 8237, "time_per_iteration": 2.764728307723999 }, { "auxiliary_loss_clip": 0.01124754, "auxiliary_loss_mlp": 0.01035631, "balance_loss_clip": 1.04693317, "balance_loss_mlp": 1.02215445, "epoch": 0.4952953554787314, "flos": 24608721035520.0, "grad_norm": 1.914497446494958, "language_loss": 0.75439888, "learning_rate": 2.127462257935406e-06, "loss": 0.77600276, "num_input_tokens_seen": 177155690, "step": 8238, "time_per_iteration": 2.66549015045166 }, { "auxiliary_loss_clip": 0.01084433, "auxiliary_loss_mlp": 0.0104692, "balance_loss_clip": 1.04372036, "balance_loss_mlp": 1.03062415, "epoch": 0.49535547873139935, "flos": 17311278049920.0, "grad_norm": 2.2478036902932508, "language_loss": 0.73706102, "learning_rate": 2.1270735855743008e-06, "loss": 0.75837457, "num_input_tokens_seen": 177173350, "step": 8239, "time_per_iteration": 2.703118324279785 }, { "auxiliary_loss_clip": 0.0104307, "auxiliary_loss_mlp": 0.01038928, "balance_loss_clip": 1.04188919, "balance_loss_mlp": 1.0223105, "epoch": 0.4954156019840673, "flos": 20740315962240.0, "grad_norm": 2.5033228354450667, "language_loss": 0.7926327, "learning_rate": 2.126684908394552e-06, "loss": 0.8134526, "num_input_tokens_seen": 177191115, "step": 8240, "time_per_iteration": 2.9256656169891357 }, { "auxiliary_loss_clip": 0.01116686, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.04832554, "balance_loss_mlp": 1.0278666, "epoch": 0.49547572523673533, "flos": 12820684567680.0, "grad_norm": 2.1558656465787367, "language_loss": 0.8547368, "learning_rate": 2.126296226410898e-06, "loss": 0.87631238, "num_input_tokens_seen": 177206155, "step": 8241, "time_per_iteration": 2.9096901416778564 }, { "auxiliary_loss_clip": 0.01067537, "auxiliary_loss_mlp": 0.01039414, "balance_loss_clip": 1.04159331, "balance_loss_mlp": 1.02591348, "epoch": 0.4955358484894033, "flos": 15597046402560.0, "grad_norm": 1.820610909823573, "language_loss": 0.77092397, "learning_rate": 2.1259075396380794e-06, "loss": 0.7919935, "num_input_tokens_seen": 177224815, "step": 8242, "time_per_iteration": 2.6902410984039307 }, { "auxiliary_loss_clip": 0.01104403, "auxiliary_loss_mlp": 0.00771127, "balance_loss_clip": 1.04569447, "balance_loss_mlp": 1.00017774, "epoch": 0.49559597174207126, "flos": 26464368528000.0, "grad_norm": 1.9730293387874334, "language_loss": 0.67737073, "learning_rate": 2.125518848090833e-06, "loss": 0.69612604, "num_input_tokens_seen": 177244490, "step": 8243, "time_per_iteration": 2.6972243785858154 }, { "auxiliary_loss_clip": 0.01112124, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.04816341, "balance_loss_mlp": 1.02076697, "epoch": 0.4956560949947392, "flos": 23148234040320.0, "grad_norm": 2.2375947263106526, "language_loss": 0.67908239, "learning_rate": 2.125130151783901e-06, "loss": 0.70054448, "num_input_tokens_seen": 177264340, "step": 8244, "time_per_iteration": 2.762528419494629 }, { "auxiliary_loss_clip": 0.01097015, "auxiliary_loss_mlp": 0.01040284, "balance_loss_clip": 1.04337358, "balance_loss_mlp": 1.02460194, "epoch": 0.4957162182474072, "flos": 20773461237120.0, "grad_norm": 1.8772229473228363, "language_loss": 0.74776495, "learning_rate": 2.12474145073202e-06, "loss": 0.76913798, "num_input_tokens_seen": 177283055, "step": 8245, "time_per_iteration": 2.7792561054229736 }, { "auxiliary_loss_clip": 0.01115174, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.04705966, "balance_loss_mlp": 1.02214909, "epoch": 0.49577634150007516, "flos": 18734202397440.0, "grad_norm": 1.8990901453025917, "language_loss": 0.8153336, "learning_rate": 2.1243527449499306e-06, "loss": 0.83684695, "num_input_tokens_seen": 177301140, "step": 8246, "time_per_iteration": 2.5740935802459717 }, { "auxiliary_loss_clip": 0.01090358, "auxiliary_loss_mlp": 0.0104326, "balance_loss_clip": 1.04562306, "balance_loss_mlp": 1.02767944, "epoch": 0.4958364647527431, "flos": 25554176870400.0, "grad_norm": 1.8707658617569873, "language_loss": 0.83808625, "learning_rate": 2.1239640344523733e-06, "loss": 0.85942245, "num_input_tokens_seen": 177323095, "step": 8247, "time_per_iteration": 4.410465955734253 }, { "auxiliary_loss_clip": 0.01102086, "auxiliary_loss_mlp": 0.01030712, "balance_loss_clip": 1.05016184, "balance_loss_mlp": 1.01716995, "epoch": 0.4958965880054111, "flos": 24425325169920.0, "grad_norm": 1.9625896451991354, "language_loss": 0.83650881, "learning_rate": 2.123575319254087e-06, "loss": 0.85783684, "num_input_tokens_seen": 177339845, "step": 8248, "time_per_iteration": 4.395894289016724 }, { "auxiliary_loss_clip": 0.01118567, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.04729056, "balance_loss_mlp": 1.01836419, "epoch": 0.49595671125807905, "flos": 25083460114560.0, "grad_norm": 1.8247689581014963, "language_loss": 0.73558569, "learning_rate": 2.123186599369812e-06, "loss": 0.75709867, "num_input_tokens_seen": 177359980, "step": 8249, "time_per_iteration": 4.36426305770874 }, { "auxiliary_loss_clip": 0.01110094, "auxiliary_loss_mlp": 0.01046161, "balance_loss_clip": 1.04773486, "balance_loss_mlp": 1.03169477, "epoch": 0.496016834510747, "flos": 16435883692800.0, "grad_norm": 1.900690676640245, "language_loss": 0.75902295, "learning_rate": 2.122797874814289e-06, "loss": 0.78058553, "num_input_tokens_seen": 177378580, "step": 8250, "time_per_iteration": 4.203567266464233 }, { "auxiliary_loss_clip": 0.011299, "auxiliary_loss_mlp": 0.01042712, "balance_loss_clip": 1.04861271, "balance_loss_mlp": 1.02788305, "epoch": 0.496076957763415, "flos": 23437925228160.0, "grad_norm": 1.7086851316152774, "language_loss": 0.69983917, "learning_rate": 2.1224091456022585e-06, "loss": 0.72156531, "num_input_tokens_seen": 177398790, "step": 8251, "time_per_iteration": 2.6825788021087646 }, { "auxiliary_loss_clip": 0.01092939, "auxiliary_loss_mlp": 0.00771421, "balance_loss_clip": 1.04950809, "balance_loss_mlp": 1.00016773, "epoch": 0.49613708101608295, "flos": 16909509450240.0, "grad_norm": 1.9257049963935782, "language_loss": 0.80088174, "learning_rate": 2.122020411748461e-06, "loss": 0.81952536, "num_input_tokens_seen": 177416515, "step": 8252, "time_per_iteration": 2.7017300128936768 }, { "auxiliary_loss_clip": 0.01130139, "auxiliary_loss_mlp": 0.01033677, "balance_loss_clip": 1.04937637, "balance_loss_mlp": 1.01769102, "epoch": 0.4961972042687509, "flos": 16618094409600.0, "grad_norm": 1.7413302103337327, "language_loss": 0.81005448, "learning_rate": 2.1216316732676363e-06, "loss": 0.83169258, "num_input_tokens_seen": 177434425, "step": 8253, "time_per_iteration": 2.5844311714172363 }, { "auxiliary_loss_clip": 0.01092121, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.04245412, "balance_loss_mlp": 1.01743925, "epoch": 0.49625732752141893, "flos": 28956749437440.0, "grad_norm": 1.4814612406319185, "language_loss": 0.67246485, "learning_rate": 2.1212429301745275e-06, "loss": 0.69369686, "num_input_tokens_seen": 177459675, "step": 8254, "time_per_iteration": 2.815851926803589 }, { "auxiliary_loss_clip": 0.01091336, "auxiliary_loss_mlp": 0.01052712, "balance_loss_clip": 1.04560924, "balance_loss_mlp": 1.03665471, "epoch": 0.4963174507740869, "flos": 23112359331840.0, "grad_norm": 1.7981030707772934, "language_loss": 0.74278247, "learning_rate": 2.1208541824838743e-06, "loss": 0.76422298, "num_input_tokens_seen": 177478895, "step": 8255, "time_per_iteration": 2.7599687576293945 }, { "auxiliary_loss_clip": 0.01098276, "auxiliary_loss_mlp": 0.01036505, "balance_loss_clip": 1.04286051, "balance_loss_mlp": 1.02203858, "epoch": 0.49637757402675486, "flos": 13917863450880.0, "grad_norm": 1.736601635944992, "language_loss": 0.81702995, "learning_rate": 2.1204654302104183e-06, "loss": 0.83837777, "num_input_tokens_seen": 177494920, "step": 8256, "time_per_iteration": 2.640913724899292 }, { "auxiliary_loss_clip": 0.01096211, "auxiliary_loss_mlp": 0.01033961, "balance_loss_clip": 1.04346132, "balance_loss_mlp": 1.02055597, "epoch": 0.49643769727942283, "flos": 22309001700480.0, "grad_norm": 1.6034861047711904, "language_loss": 0.81197649, "learning_rate": 2.120076673368901e-06, "loss": 0.83327824, "num_input_tokens_seen": 177515455, "step": 8257, "time_per_iteration": 2.724745512008667 }, { "auxiliary_loss_clip": 0.01133163, "auxiliary_loss_mlp": 0.01039711, "balance_loss_clip": 1.04763043, "balance_loss_mlp": 1.02435732, "epoch": 0.4964978205320908, "flos": 19500248776320.0, "grad_norm": 1.9280789180083706, "language_loss": 0.66280329, "learning_rate": 2.1196879119740647e-06, "loss": 0.68453205, "num_input_tokens_seen": 177534040, "step": 8258, "time_per_iteration": 2.570275068283081 }, { "auxiliary_loss_clip": 0.01110241, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.04361916, "balance_loss_mlp": 1.01942396, "epoch": 0.49655794378475876, "flos": 23436524597760.0, "grad_norm": 1.42579071834104, "language_loss": 0.77627164, "learning_rate": 2.1192991460406502e-06, "loss": 0.79769588, "num_input_tokens_seen": 177554510, "step": 8259, "time_per_iteration": 2.676722288131714 }, { "auxiliary_loss_clip": 0.01097253, "auxiliary_loss_mlp": 0.01038217, "balance_loss_clip": 1.04436278, "balance_loss_mlp": 1.02406085, "epoch": 0.4966180670374267, "flos": 26831124345600.0, "grad_norm": 1.5162865829701626, "language_loss": 0.78461975, "learning_rate": 2.1189103755834e-06, "loss": 0.80597448, "num_input_tokens_seen": 177575780, "step": 8260, "time_per_iteration": 2.7226130962371826 }, { "auxiliary_loss_clip": 0.01100503, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.04154015, "balance_loss_mlp": 1.02135992, "epoch": 0.4966781902900947, "flos": 22009326531840.0, "grad_norm": 3.0057343325073456, "language_loss": 0.76335442, "learning_rate": 2.1185216006170573e-06, "loss": 0.78471756, "num_input_tokens_seen": 177588965, "step": 8261, "time_per_iteration": 2.6477174758911133 }, { "auxiliary_loss_clip": 0.01071745, "auxiliary_loss_mlp": 0.0103251, "balance_loss_clip": 1.03892851, "balance_loss_mlp": 1.01939654, "epoch": 0.49673831354276266, "flos": 26213353309440.0, "grad_norm": 1.835251427236856, "language_loss": 0.89503151, "learning_rate": 2.1181328211563627e-06, "loss": 0.9160741, "num_input_tokens_seen": 177608425, "step": 8262, "time_per_iteration": 2.757200241088867 }, { "auxiliary_loss_clip": 0.01068117, "auxiliary_loss_mlp": 0.01035611, "balance_loss_clip": 1.04000998, "balance_loss_mlp": 1.0223608, "epoch": 0.4967984367954306, "flos": 23182277155200.0, "grad_norm": 1.5869779774184047, "language_loss": 0.73859417, "learning_rate": 2.11774403721606e-06, "loss": 0.7596314, "num_input_tokens_seen": 177628240, "step": 8263, "time_per_iteration": 2.799468994140625 }, { "auxiliary_loss_clip": 0.0108327, "auxiliary_loss_mlp": 0.01039108, "balance_loss_clip": 1.0480659, "balance_loss_mlp": 1.02325881, "epoch": 0.4968585600480986, "flos": 19281445079040.0, "grad_norm": 3.1164108836460036, "language_loss": 0.70163679, "learning_rate": 2.1173552488108923e-06, "loss": 0.72286057, "num_input_tokens_seen": 177645920, "step": 8264, "time_per_iteration": 2.720449447631836 }, { "auxiliary_loss_clip": 0.01098192, "auxiliary_loss_mlp": 0.01032461, "balance_loss_clip": 1.04328251, "balance_loss_mlp": 1.01837087, "epoch": 0.49691868330076655, "flos": 22528703237760.0, "grad_norm": 1.6446636391121152, "language_loss": 0.65104395, "learning_rate": 2.1169664559556007e-06, "loss": 0.67235053, "num_input_tokens_seen": 177667185, "step": 8265, "time_per_iteration": 2.683858633041382 }, { "auxiliary_loss_clip": 0.01028918, "auxiliary_loss_mlp": 0.01002907, "balance_loss_clip": 1.0220778, "balance_loss_mlp": 1.00148249, "epoch": 0.4969788065534345, "flos": 66577128675840.0, "grad_norm": 0.930084427968553, "language_loss": 0.53491867, "learning_rate": 2.1165776586649304e-06, "loss": 0.55523694, "num_input_tokens_seen": 177733020, "step": 8266, "time_per_iteration": 3.2566375732421875 }, { "auxiliary_loss_clip": 0.01113371, "auxiliary_loss_mlp": 0.01032636, "balance_loss_clip": 1.04611242, "balance_loss_mlp": 1.01834857, "epoch": 0.49703892980610254, "flos": 24059503105920.0, "grad_norm": 1.764439361537035, "language_loss": 0.79587245, "learning_rate": 2.1161888569536223e-06, "loss": 0.81733251, "num_input_tokens_seen": 177753370, "step": 8267, "time_per_iteration": 2.6278576850891113 }, { "auxiliary_loss_clip": 0.01102001, "auxiliary_loss_mlp": 0.01039107, "balance_loss_clip": 1.04590034, "balance_loss_mlp": 1.02316856, "epoch": 0.4970990530587705, "flos": 29126174912640.0, "grad_norm": 2.2169439003129385, "language_loss": 0.74835396, "learning_rate": 2.1158000508364223e-06, "loss": 0.76976496, "num_input_tokens_seen": 177771530, "step": 8268, "time_per_iteration": 2.734259843826294 }, { "auxiliary_loss_clip": 0.011141, "auxiliary_loss_mlp": 0.00771431, "balance_loss_clip": 1.04348183, "balance_loss_mlp": 1.00014162, "epoch": 0.49715917631143847, "flos": 46026167258880.0, "grad_norm": 4.0839840126254225, "language_loss": 0.68041855, "learning_rate": 2.115411240328073e-06, "loss": 0.69927382, "num_input_tokens_seen": 177796355, "step": 8269, "time_per_iteration": 2.90146541595459 }, { "auxiliary_loss_clip": 0.01097171, "auxiliary_loss_mlp": 0.01041712, "balance_loss_clip": 1.04262531, "balance_loss_mlp": 1.02837276, "epoch": 0.49721929956410643, "flos": 20191277600640.0, "grad_norm": 2.5681883642378436, "language_loss": 0.85533005, "learning_rate": 2.1150224254433167e-06, "loss": 0.87671888, "num_input_tokens_seen": 177814300, "step": 8270, "time_per_iteration": 2.8005404472351074 }, { "auxiliary_loss_clip": 0.01081529, "auxiliary_loss_mlp": 0.00771255, "balance_loss_clip": 1.04315615, "balance_loss_mlp": 1.00016665, "epoch": 0.4972794228167744, "flos": 21653560275840.0, "grad_norm": 1.8215552302583695, "language_loss": 0.70831466, "learning_rate": 2.114633606196899e-06, "loss": 0.72684252, "num_input_tokens_seen": 177833615, "step": 8271, "time_per_iteration": 2.91554594039917 }, { "auxiliary_loss_clip": 0.01112057, "auxiliary_loss_mlp": 0.01035877, "balance_loss_clip": 1.04666567, "balance_loss_mlp": 1.02128029, "epoch": 0.49733954606944236, "flos": 24279743347200.0, "grad_norm": 1.5312065445139798, "language_loss": 0.78403968, "learning_rate": 2.1142447826035635e-06, "loss": 0.80551904, "num_input_tokens_seen": 177855315, "step": 8272, "time_per_iteration": 2.6702592372894287 }, { "auxiliary_loss_clip": 0.01090488, "auxiliary_loss_mlp": 0.01040546, "balance_loss_clip": 1.0464623, "balance_loss_mlp": 1.02679515, "epoch": 0.4973996693221103, "flos": 37852575730560.0, "grad_norm": 2.547664660385474, "language_loss": 0.6682387, "learning_rate": 2.1138559546780544e-06, "loss": 0.68954909, "num_input_tokens_seen": 177875590, "step": 8273, "time_per_iteration": 2.8257791996002197 }, { "auxiliary_loss_clip": 0.01089829, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.04431605, "balance_loss_mlp": 1.02347827, "epoch": 0.4974597925747783, "flos": 21361426963200.0, "grad_norm": 1.5692617693087136, "language_loss": 0.78097814, "learning_rate": 2.1134671224351163e-06, "loss": 0.80224848, "num_input_tokens_seen": 177894175, "step": 8274, "time_per_iteration": 2.6786539554595947 }, { "auxiliary_loss_clip": 0.01087892, "auxiliary_loss_mlp": 0.01037968, "balance_loss_clip": 1.04171109, "balance_loss_mlp": 1.02315021, "epoch": 0.49751991582744626, "flos": 30738133560960.0, "grad_norm": 1.7539763145915706, "language_loss": 0.75727397, "learning_rate": 2.113078285889493e-06, "loss": 0.77853251, "num_input_tokens_seen": 177913920, "step": 8275, "time_per_iteration": 2.7289958000183105 }, { "auxiliary_loss_clip": 0.01117048, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.04600728, "balance_loss_mlp": 1.02240443, "epoch": 0.4975800390801142, "flos": 14100541044480.0, "grad_norm": 2.0869085379368717, "language_loss": 0.84277642, "learning_rate": 2.1126894450559303e-06, "loss": 0.86433506, "num_input_tokens_seen": 177930425, "step": 8276, "time_per_iteration": 2.612114667892456 }, { "auxiliary_loss_clip": 0.01122283, "auxiliary_loss_mlp": 0.00770821, "balance_loss_clip": 1.04578209, "balance_loss_mlp": 1.00012255, "epoch": 0.4976401623327822, "flos": 24207275658240.0, "grad_norm": 2.0722406374843283, "language_loss": 0.70213616, "learning_rate": 2.112300599949172e-06, "loss": 0.72106719, "num_input_tokens_seen": 177949885, "step": 8277, "time_per_iteration": 2.627364158630371 }, { "auxiliary_loss_clip": 0.01109969, "auxiliary_loss_mlp": 0.01038763, "balance_loss_clip": 1.04542017, "balance_loss_mlp": 1.02430928, "epoch": 0.49770028558545015, "flos": 21136769349120.0, "grad_norm": 1.855614041136712, "language_loss": 0.82644826, "learning_rate": 2.111911750583964e-06, "loss": 0.84793556, "num_input_tokens_seen": 177965720, "step": 8278, "time_per_iteration": 2.653998613357544 }, { "auxiliary_loss_clip": 0.01117237, "auxiliary_loss_mlp": 0.01041122, "balance_loss_clip": 1.04625261, "balance_loss_mlp": 1.02723408, "epoch": 0.4977604088381181, "flos": 16763927627520.0, "grad_norm": 2.0212653893375276, "language_loss": 0.67471039, "learning_rate": 2.111522896975052e-06, "loss": 0.69629395, "num_input_tokens_seen": 177983190, "step": 8279, "time_per_iteration": 2.607090473175049 }, { "auxiliary_loss_clip": 0.01115839, "auxiliary_loss_mlp": 0.01041996, "balance_loss_clip": 1.04406691, "balance_loss_mlp": 1.02692842, "epoch": 0.49782053209078614, "flos": 15703521292800.0, "grad_norm": 2.1427811758671527, "language_loss": 0.70507026, "learning_rate": 2.1111340391371794e-06, "loss": 0.72664863, "num_input_tokens_seen": 178000155, "step": 8280, "time_per_iteration": 2.636384963989258 }, { "auxiliary_loss_clip": 0.01090186, "auxiliary_loss_mlp": 0.01035589, "balance_loss_clip": 1.04237318, "balance_loss_mlp": 1.02177858, "epoch": 0.4978806553434541, "flos": 24753692327040.0, "grad_norm": 2.860421271049928, "language_loss": 0.64889467, "learning_rate": 2.1107451770850936e-06, "loss": 0.67015243, "num_input_tokens_seen": 178021060, "step": 8281, "time_per_iteration": 2.6961820125579834 }, { "auxiliary_loss_clip": 0.0111999, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.0478642, "balance_loss_mlp": 1.02102113, "epoch": 0.49794077859612207, "flos": 13115726881920.0, "grad_norm": 2.7426965878502845, "language_loss": 0.73226738, "learning_rate": 2.1103563108335387e-06, "loss": 0.75382769, "num_input_tokens_seen": 178038180, "step": 8282, "time_per_iteration": 2.7749152183532715 }, { "auxiliary_loss_clip": 0.01095648, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.04499686, "balance_loss_mlp": 1.02106822, "epoch": 0.49800090184879003, "flos": 27525133998720.0, "grad_norm": 1.749404235674241, "language_loss": 0.73327482, "learning_rate": 2.109967440397263e-06, "loss": 0.75457078, "num_input_tokens_seen": 178057565, "step": 8283, "time_per_iteration": 2.7039520740509033 }, { "auxiliary_loss_clip": 0.01068275, "auxiliary_loss_mlp": 0.01054525, "balance_loss_clip": 1.0405463, "balance_loss_mlp": 1.03883147, "epoch": 0.498061025101458, "flos": 19792489829760.0, "grad_norm": 2.5573951668279102, "language_loss": 0.7842927, "learning_rate": 2.1095785657910095e-06, "loss": 0.80552071, "num_input_tokens_seen": 178076965, "step": 8284, "time_per_iteration": 2.7534518241882324 }, { "auxiliary_loss_clip": 0.01103825, "auxiliary_loss_mlp": 0.0104233, "balance_loss_clip": 1.045488, "balance_loss_mlp": 1.02733326, "epoch": 0.49812114835412596, "flos": 29893909230720.0, "grad_norm": 1.7317298938274186, "language_loss": 0.73607123, "learning_rate": 2.109189687029526e-06, "loss": 0.75753278, "num_input_tokens_seen": 178095105, "step": 8285, "time_per_iteration": 2.696913719177246 }, { "auxiliary_loss_clip": 0.01114659, "auxiliary_loss_mlp": 0.01033722, "balance_loss_clip": 1.0496074, "balance_loss_mlp": 1.01902318, "epoch": 0.49818127160679393, "flos": 23147048891520.0, "grad_norm": 1.6428187648074233, "language_loss": 0.74194658, "learning_rate": 2.1088008041275598e-06, "loss": 0.76343036, "num_input_tokens_seen": 178114505, "step": 8286, "time_per_iteration": 4.164494752883911 }, { "auxiliary_loss_clip": 0.01106668, "auxiliary_loss_mlp": 0.0104423, "balance_loss_clip": 1.04752493, "balance_loss_mlp": 1.02986491, "epoch": 0.4982413948594619, "flos": 21652806090240.0, "grad_norm": 1.7990587687461415, "language_loss": 0.85529351, "learning_rate": 2.1084119170998545e-06, "loss": 0.87680244, "num_input_tokens_seen": 178131595, "step": 8287, "time_per_iteration": 4.236407279968262 }, { "auxiliary_loss_clip": 0.01076576, "auxiliary_loss_mlp": 0.01032511, "balance_loss_clip": 1.04194725, "balance_loss_mlp": 1.01822948, "epoch": 0.49830151811212986, "flos": 32486982940800.0, "grad_norm": 1.6860437652999367, "language_loss": 0.72530627, "learning_rate": 2.108023025961159e-06, "loss": 0.74639714, "num_input_tokens_seen": 178152055, "step": 8288, "time_per_iteration": 4.404609680175781 }, { "auxiliary_loss_clip": 0.01106449, "auxiliary_loss_mlp": 0.01040352, "balance_loss_clip": 1.04326916, "balance_loss_mlp": 1.02459288, "epoch": 0.4983616413647978, "flos": 18142358002560.0, "grad_norm": 3.334734045415943, "language_loss": 0.79885554, "learning_rate": 2.10763413072622e-06, "loss": 0.82032353, "num_input_tokens_seen": 178168150, "step": 8289, "time_per_iteration": 2.6629836559295654 }, { "auxiliary_loss_clip": 0.01114454, "auxiliary_loss_mlp": 0.0103885, "balance_loss_clip": 1.0446074, "balance_loss_mlp": 1.02460992, "epoch": 0.4984217646174658, "flos": 19718836992000.0, "grad_norm": 2.0640091139098256, "language_loss": 0.72874933, "learning_rate": 2.107245231409784e-06, "loss": 0.75028241, "num_input_tokens_seen": 178186150, "step": 8290, "time_per_iteration": 4.18574333190918 }, { "auxiliary_loss_clip": 0.0112064, "auxiliary_loss_mlp": 0.01040925, "balance_loss_clip": 1.04972208, "balance_loss_mlp": 1.02428377, "epoch": 0.49848188787013376, "flos": 24936549488640.0, "grad_norm": 1.4927804425375188, "language_loss": 0.8397218, "learning_rate": 2.106856328026598e-06, "loss": 0.86133754, "num_input_tokens_seen": 178207665, "step": 8291, "time_per_iteration": 2.716386556625366 }, { "auxiliary_loss_clip": 0.01103944, "auxiliary_loss_mlp": 0.01046379, "balance_loss_clip": 1.04420066, "balance_loss_mlp": 1.02930808, "epoch": 0.4985420111228017, "flos": 22382439056640.0, "grad_norm": 1.6316694600084898, "language_loss": 0.67022264, "learning_rate": 2.106467420591409e-06, "loss": 0.69172579, "num_input_tokens_seen": 178226325, "step": 8292, "time_per_iteration": 2.7027721405029297 }, { "auxiliary_loss_clip": 0.01127175, "auxiliary_loss_mlp": 0.01039323, "balance_loss_clip": 1.04806566, "balance_loss_mlp": 1.02625203, "epoch": 0.4986021343754697, "flos": 16216469464320.0, "grad_norm": 1.6633361946509924, "language_loss": 0.66995132, "learning_rate": 2.106078509118965e-06, "loss": 0.6916163, "num_input_tokens_seen": 178244960, "step": 8293, "time_per_iteration": 2.5719261169433594 }, { "auxiliary_loss_clip": 0.01111406, "auxiliary_loss_mlp": 0.01029657, "balance_loss_clip": 1.04379749, "balance_loss_mlp": 1.01533389, "epoch": 0.4986622576281377, "flos": 23403594804480.0, "grad_norm": 1.8610494318021187, "language_loss": 0.82020485, "learning_rate": 2.1056895936240133e-06, "loss": 0.84161556, "num_input_tokens_seen": 178265400, "step": 8294, "time_per_iteration": 2.6504080295562744 }, { "auxiliary_loss_clip": 0.01116097, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.04479063, "balance_loss_mlp": 1.01604557, "epoch": 0.49872238088080567, "flos": 19974556892160.0, "grad_norm": 2.2309244250260183, "language_loss": 0.72901344, "learning_rate": 2.1053006741213016e-06, "loss": 0.75048614, "num_input_tokens_seen": 178284535, "step": 8295, "time_per_iteration": 2.6195027828216553 }, { "auxiliary_loss_clip": 0.01059073, "auxiliary_loss_mlp": 0.01038092, "balance_loss_clip": 1.03994107, "balance_loss_mlp": 1.02466345, "epoch": 0.49878250413347364, "flos": 22893016930560.0, "grad_norm": 1.8092757241660187, "language_loss": 0.67607826, "learning_rate": 2.1049117506255775e-06, "loss": 0.69704998, "num_input_tokens_seen": 178302425, "step": 8296, "time_per_iteration": 2.755263090133667 }, { "auxiliary_loss_clip": 0.01104221, "auxiliary_loss_mlp": 0.01042078, "balance_loss_clip": 1.04649234, "balance_loss_mlp": 1.02715254, "epoch": 0.4988426273861416, "flos": 32598449821440.0, "grad_norm": 2.862724254512052, "language_loss": 0.64573205, "learning_rate": 2.1045228231515895e-06, "loss": 0.66719502, "num_input_tokens_seen": 178323065, "step": 8297, "time_per_iteration": 2.77134108543396 }, { "auxiliary_loss_clip": 0.01068772, "auxiliary_loss_mlp": 0.01035076, "balance_loss_clip": 1.04186463, "balance_loss_mlp": 1.02241552, "epoch": 0.49890275063880957, "flos": 20923604087040.0, "grad_norm": 1.6802177929429785, "language_loss": 0.70005518, "learning_rate": 2.1041338917140857e-06, "loss": 0.72109365, "num_input_tokens_seen": 178343985, "step": 8298, "time_per_iteration": 2.7644965648651123 }, { "auxiliary_loss_clip": 0.01123634, "auxiliary_loss_mlp": 0.01037158, "balance_loss_clip": 1.04611015, "balance_loss_mlp": 1.02383053, "epoch": 0.49896287389147753, "flos": 18624459369600.0, "grad_norm": 2.15895128631453, "language_loss": 0.85060012, "learning_rate": 2.103744956327814e-06, "loss": 0.87220806, "num_input_tokens_seen": 178362345, "step": 8299, "time_per_iteration": 2.6582682132720947 }, { "auxiliary_loss_clip": 0.0109908, "auxiliary_loss_mlp": 0.01042644, "balance_loss_clip": 1.04576635, "balance_loss_mlp": 1.02676535, "epoch": 0.4990229971441455, "flos": 24826555065600.0, "grad_norm": 3.5746156367417177, "language_loss": 0.69598472, "learning_rate": 2.1033560170075234e-06, "loss": 0.71740198, "num_input_tokens_seen": 178383190, "step": 8300, "time_per_iteration": 2.725041151046753 }, { "auxiliary_loss_clip": 0.01026277, "auxiliary_loss_mlp": 0.01006258, "balance_loss_clip": 1.02488732, "balance_loss_mlp": 1.00483894, "epoch": 0.49908312039681346, "flos": 71384525136000.0, "grad_norm": 0.7557607717879434, "language_loss": 0.51092541, "learning_rate": 2.1029670737679623e-06, "loss": 0.53125077, "num_input_tokens_seen": 178444250, "step": 8301, "time_per_iteration": 3.2866220474243164 }, { "auxiliary_loss_clip": 0.01096877, "auxiliary_loss_mlp": 0.01045659, "balance_loss_clip": 1.04223108, "balance_loss_mlp": 1.03140736, "epoch": 0.4991432436494814, "flos": 19828651847040.0, "grad_norm": 1.7177443948136444, "language_loss": 0.84648693, "learning_rate": 2.102578126623879e-06, "loss": 0.86791229, "num_input_tokens_seen": 178463250, "step": 8302, "time_per_iteration": 2.66215181350708 }, { "auxiliary_loss_clip": 0.01112659, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.04628754, "balance_loss_mlp": 1.02111602, "epoch": 0.4992033669021494, "flos": 15121912273920.0, "grad_norm": 5.640508686379792, "language_loss": 0.68928391, "learning_rate": 2.102189175590024e-06, "loss": 0.71075243, "num_input_tokens_seen": 178481340, "step": 8303, "time_per_iteration": 2.6031181812286377 }, { "auxiliary_loss_clip": 0.01126853, "auxiliary_loss_mlp": 0.01035164, "balance_loss_clip": 1.04641497, "balance_loss_mlp": 1.02095485, "epoch": 0.49926349015481736, "flos": 31207952476800.0, "grad_norm": 1.6560759996443648, "language_loss": 0.72727203, "learning_rate": 2.101800220681144e-06, "loss": 0.74889231, "num_input_tokens_seen": 178501545, "step": 8304, "time_per_iteration": 2.706022262573242 }, { "auxiliary_loss_clip": 0.01116141, "auxiliary_loss_mlp": 0.01037357, "balance_loss_clip": 1.0475409, "balance_loss_mlp": 1.02420211, "epoch": 0.4993236134074853, "flos": 24900207903360.0, "grad_norm": 2.1644384364092684, "language_loss": 0.81342846, "learning_rate": 2.10141126191199e-06, "loss": 0.83496344, "num_input_tokens_seen": 178519700, "step": 8305, "time_per_iteration": 2.6671528816223145 }, { "auxiliary_loss_clip": 0.01024768, "auxiliary_loss_mlp": 0.01003944, "balance_loss_clip": 1.02671385, "balance_loss_mlp": 1.00258529, "epoch": 0.4993837366601533, "flos": 70420573797120.0, "grad_norm": 0.7597400638433706, "language_loss": 0.56867081, "learning_rate": 2.1010222992973107e-06, "loss": 0.58895797, "num_input_tokens_seen": 178576740, "step": 8306, "time_per_iteration": 3.322448492050171 }, { "auxiliary_loss_clip": 0.01127996, "auxiliary_loss_mlp": 0.01039143, "balance_loss_clip": 1.04948568, "balance_loss_mlp": 1.02432525, "epoch": 0.4994438599128213, "flos": 15961216440960.0, "grad_norm": 2.2302114161499236, "language_loss": 0.82741839, "learning_rate": 2.1006333328518556e-06, "loss": 0.84908974, "num_input_tokens_seen": 178594745, "step": 8307, "time_per_iteration": 2.583996295928955 }, { "auxiliary_loss_clip": 0.01126994, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.04805601, "balance_loss_mlp": 1.02157855, "epoch": 0.4995039831654893, "flos": 27928303228800.0, "grad_norm": 1.7094622949229625, "language_loss": 0.60939324, "learning_rate": 2.1002443625903748e-06, "loss": 0.63102394, "num_input_tokens_seen": 178614110, "step": 8308, "time_per_iteration": 2.6170315742492676 }, { "auxiliary_loss_clip": 0.01120806, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.04421234, "balance_loss_mlp": 1.01890182, "epoch": 0.49956410641815724, "flos": 24204797619840.0, "grad_norm": 1.8375312667766532, "language_loss": 0.74889386, "learning_rate": 2.0998553885276168e-06, "loss": 0.77042031, "num_input_tokens_seen": 178634170, "step": 8309, "time_per_iteration": 2.6147258281707764 }, { "auxiliary_loss_clip": 0.01102514, "auxiliary_loss_mlp": 0.0103405, "balance_loss_clip": 1.04401636, "balance_loss_mlp": 1.02106261, "epoch": 0.4996242296708252, "flos": 16180127879040.0, "grad_norm": 3.148005555228763, "language_loss": 0.79502416, "learning_rate": 2.0994664106783335e-06, "loss": 0.8163898, "num_input_tokens_seen": 178651775, "step": 8310, "time_per_iteration": 2.6420629024505615 }, { "auxiliary_loss_clip": 0.01111922, "auxiliary_loss_mlp": 0.01040825, "balance_loss_clip": 1.04564738, "balance_loss_mlp": 1.02757514, "epoch": 0.49968435292349317, "flos": 16873527000960.0, "grad_norm": 1.4976626914983278, "language_loss": 0.70989597, "learning_rate": 2.0990774290572735e-06, "loss": 0.73142344, "num_input_tokens_seen": 178669720, "step": 8311, "time_per_iteration": 2.5778110027313232 }, { "auxiliary_loss_clip": 0.01098554, "auxiliary_loss_mlp": 0.01036482, "balance_loss_clip": 1.04628289, "balance_loss_mlp": 1.02355957, "epoch": 0.49974447617616113, "flos": 14939521989120.0, "grad_norm": 2.0443790290482498, "language_loss": 0.77375191, "learning_rate": 2.098688443679187e-06, "loss": 0.79510236, "num_input_tokens_seen": 178686765, "step": 8312, "time_per_iteration": 2.6517751216888428 }, { "auxiliary_loss_clip": 0.01095231, "auxiliary_loss_mlp": 0.01035354, "balance_loss_clip": 1.04635751, "balance_loss_mlp": 1.02135265, "epoch": 0.4998045994288291, "flos": 26651535321600.0, "grad_norm": 1.7937215644313522, "language_loss": 0.84479403, "learning_rate": 2.0982994545588256e-06, "loss": 0.86609983, "num_input_tokens_seen": 178705845, "step": 8313, "time_per_iteration": 2.7882683277130127 }, { "auxiliary_loss_clip": 0.01098533, "auxiliary_loss_mlp": 0.01032666, "balance_loss_clip": 1.04393864, "balance_loss_mlp": 1.01856351, "epoch": 0.49986472268149706, "flos": 20953768533120.0, "grad_norm": 1.8469644022391951, "language_loss": 0.80625784, "learning_rate": 2.097910461710939e-06, "loss": 0.82756978, "num_input_tokens_seen": 178723410, "step": 8314, "time_per_iteration": 2.6792070865631104 }, { "auxiliary_loss_clip": 0.01093189, "auxiliary_loss_mlp": 0.00772869, "balance_loss_clip": 1.04282761, "balance_loss_mlp": 1.00018048, "epoch": 0.49992484593416503, "flos": 22783884433920.0, "grad_norm": 1.9116629548957604, "language_loss": 0.79824436, "learning_rate": 2.0975214651502773e-06, "loss": 0.8169049, "num_input_tokens_seen": 178743560, "step": 8315, "time_per_iteration": 2.885185718536377 }, { "auxiliary_loss_clip": 0.01126333, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.04775071, "balance_loss_mlp": 1.02025628, "epoch": 0.499984969186833, "flos": 46786970252160.0, "grad_norm": 1.6207947092177402, "language_loss": 0.74976832, "learning_rate": 2.0971324648915926e-06, "loss": 0.77136528, "num_input_tokens_seen": 178767225, "step": 8316, "time_per_iteration": 2.865182399749756 }, { "auxiliary_loss_clip": 0.01104962, "auxiliary_loss_mlp": 0.0103454, "balance_loss_clip": 1.04472423, "balance_loss_mlp": 1.02195168, "epoch": 0.500045092439501, "flos": 25556978131200.0, "grad_norm": 1.839667572981257, "language_loss": 0.81122506, "learning_rate": 2.0967434609496343e-06, "loss": 0.83262014, "num_input_tokens_seen": 178786810, "step": 8317, "time_per_iteration": 2.781627893447876 }, { "auxiliary_loss_clip": 0.011005, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.04331255, "balance_loss_mlp": 1.02368522, "epoch": 0.5001052156921689, "flos": 20704764476160.0, "grad_norm": 1.6607654789374993, "language_loss": 0.83369392, "learning_rate": 2.0963544533391548e-06, "loss": 0.8550871, "num_input_tokens_seen": 178805660, "step": 8318, "time_per_iteration": 2.790937662124634 }, { "auxiliary_loss_clip": 0.01114137, "auxiliary_loss_mlp": 0.01032915, "balance_loss_clip": 1.04552984, "balance_loss_mlp": 1.01974225, "epoch": 0.500165338944837, "flos": 21251109317760.0, "grad_norm": 1.7594247797212967, "language_loss": 0.81800634, "learning_rate": 2.0959654420749045e-06, "loss": 0.83947688, "num_input_tokens_seen": 178824780, "step": 8319, "time_per_iteration": 2.6710760593414307 }, { "auxiliary_loss_clip": 0.01080263, "auxiliary_loss_mlp": 0.01030013, "balance_loss_clip": 1.03828013, "balance_loss_mlp": 1.01689363, "epoch": 0.5002254621975049, "flos": 27854398995840.0, "grad_norm": 1.5279258864896563, "language_loss": 0.71943277, "learning_rate": 2.095576427171635e-06, "loss": 0.7405355, "num_input_tokens_seen": 178845640, "step": 8320, "time_per_iteration": 2.7864880561828613 }, { "auxiliary_loss_clip": 0.01093478, "auxiliary_loss_mlp": 0.01044698, "balance_loss_clip": 1.04542255, "balance_loss_mlp": 1.02964222, "epoch": 0.5002855854501729, "flos": 15551941898880.0, "grad_norm": 2.783304711521318, "language_loss": 0.76481223, "learning_rate": 2.0951874086440978e-06, "loss": 0.78619403, "num_input_tokens_seen": 178862290, "step": 8321, "time_per_iteration": 2.7580785751342773 }, { "auxiliary_loss_clip": 0.01115908, "auxiliary_loss_mlp": 0.00771212, "balance_loss_clip": 1.04681301, "balance_loss_mlp": 1.00017464, "epoch": 0.5003457087028408, "flos": 16107408794880.0, "grad_norm": 6.807525102727238, "language_loss": 0.82965297, "learning_rate": 2.0947983865070455e-06, "loss": 0.84852415, "num_input_tokens_seen": 178879805, "step": 8322, "time_per_iteration": 2.6580779552459717 }, { "auxiliary_loss_clip": 0.01117442, "auxiliary_loss_mlp": 0.0103527, "balance_loss_clip": 1.0458411, "balance_loss_mlp": 1.02163804, "epoch": 0.5004058319555088, "flos": 22710518904960.0, "grad_norm": 2.2579769372834257, "language_loss": 0.73329234, "learning_rate": 2.094409360775228e-06, "loss": 0.75481945, "num_input_tokens_seen": 178896985, "step": 8323, "time_per_iteration": 2.6743083000183105 }, { "auxiliary_loss_clip": 0.01086486, "auxiliary_loss_mlp": 0.01036398, "balance_loss_clip": 1.04470778, "balance_loss_mlp": 1.02264738, "epoch": 0.5004659552081767, "flos": 30117956313600.0, "grad_norm": 1.846103580376976, "language_loss": 0.69483137, "learning_rate": 2.0940203314633977e-06, "loss": 0.71606022, "num_input_tokens_seen": 178920605, "step": 8324, "time_per_iteration": 2.783973217010498 }, { "auxiliary_loss_clip": 0.01106501, "auxiliary_loss_mlp": 0.00771259, "balance_loss_clip": 1.0422833, "balance_loss_mlp": 1.0000751, "epoch": 0.5005260784608447, "flos": 18624710764800.0, "grad_norm": 3.4520936591258224, "language_loss": 0.72325313, "learning_rate": 2.0936312985863077e-06, "loss": 0.74203074, "num_input_tokens_seen": 178937760, "step": 8325, "time_per_iteration": 4.274277448654175 }, { "auxiliary_loss_clip": 0.01089915, "auxiliary_loss_mlp": 0.01041836, "balance_loss_clip": 1.04158878, "balance_loss_mlp": 1.02669656, "epoch": 0.5005862017135126, "flos": 24859987649280.0, "grad_norm": 1.7422514730064806, "language_loss": 0.73518062, "learning_rate": 2.093242262158709e-06, "loss": 0.7564981, "num_input_tokens_seen": 178957985, "step": 8326, "time_per_iteration": 4.3523108959198 }, { "auxiliary_loss_clip": 0.01094661, "auxiliary_loss_mlp": 0.01032547, "balance_loss_clip": 1.04201293, "balance_loss_mlp": 1.01984525, "epoch": 0.5006463249661807, "flos": 18734381965440.0, "grad_norm": 1.5476902232241379, "language_loss": 0.78111005, "learning_rate": 2.0928532221953544e-06, "loss": 0.80238211, "num_input_tokens_seen": 178977070, "step": 8327, "time_per_iteration": 4.4682557582855225 }, { "auxiliary_loss_clip": 0.01128169, "auxiliary_loss_mlp": 0.01040162, "balance_loss_clip": 1.04810429, "balance_loss_mlp": 1.02641153, "epoch": 0.5007064482188487, "flos": 13042145871360.0, "grad_norm": 2.1714411479157296, "language_loss": 0.88089001, "learning_rate": 2.092464178710997e-06, "loss": 0.90257335, "num_input_tokens_seen": 178994175, "step": 8328, "time_per_iteration": 2.5710413455963135 }, { "auxiliary_loss_clip": 0.01091641, "auxiliary_loss_mlp": 0.01034728, "balance_loss_clip": 1.04136801, "balance_loss_mlp": 1.02050591, "epoch": 0.5007665714715166, "flos": 21288671965440.0, "grad_norm": 2.863428491996577, "language_loss": 0.73827946, "learning_rate": 2.092075131720388e-06, "loss": 0.75954318, "num_input_tokens_seen": 179013710, "step": 8329, "time_per_iteration": 2.7770020961761475 }, { "auxiliary_loss_clip": 0.01124061, "auxiliary_loss_mlp": 0.0103094, "balance_loss_clip": 1.04667771, "balance_loss_mlp": 1.01824427, "epoch": 0.5008266947241846, "flos": 29754576374400.0, "grad_norm": 1.6131098934363575, "language_loss": 0.79715234, "learning_rate": 2.091686081238281e-06, "loss": 0.81870234, "num_input_tokens_seen": 179035255, "step": 8330, "time_per_iteration": 4.167505979537964 }, { "auxiliary_loss_clip": 0.01021039, "auxiliary_loss_mlp": 0.00752271, "balance_loss_clip": 1.02094173, "balance_loss_mlp": 0.9997682, "epoch": 0.5008868179768525, "flos": 63557829204480.0, "grad_norm": 0.7263095406539528, "language_loss": 0.5601325, "learning_rate": 2.0912970272794282e-06, "loss": 0.5778656, "num_input_tokens_seen": 179090915, "step": 8331, "time_per_iteration": 3.008077621459961 }, { "auxiliary_loss_clip": 0.01112181, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.04617071, "balance_loss_mlp": 1.02216136, "epoch": 0.5009469412295205, "flos": 27375637593600.0, "grad_norm": 2.025315423078993, "language_loss": 0.65264666, "learning_rate": 2.0909079698585833e-06, "loss": 0.67412001, "num_input_tokens_seen": 179109160, "step": 8332, "time_per_iteration": 2.6730518341064453 }, { "auxiliary_loss_clip": 0.01120357, "auxiliary_loss_mlp": 0.01033936, "balance_loss_clip": 1.04410577, "balance_loss_mlp": 1.02124023, "epoch": 0.5010070644821885, "flos": 27378833904000.0, "grad_norm": 1.5954618594032755, "language_loss": 0.75023079, "learning_rate": 2.0905189089904993e-06, "loss": 0.7717737, "num_input_tokens_seen": 179130610, "step": 8333, "time_per_iteration": 2.685154914855957 }, { "auxiliary_loss_clip": 0.01125291, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.04558921, "balance_loss_mlp": 1.02145159, "epoch": 0.5010671877348565, "flos": 20662748542080.0, "grad_norm": 1.9338828530124208, "language_loss": 0.80424768, "learning_rate": 2.090129844689929e-06, "loss": 0.82585168, "num_input_tokens_seen": 179147860, "step": 8334, "time_per_iteration": 2.627230405807495 }, { "auxiliary_loss_clip": 0.01037349, "auxiliary_loss_mlp": 0.01004574, "balance_loss_clip": 1.02146554, "balance_loss_mlp": 1.00316703, "epoch": 0.5011273109875244, "flos": 59128645000320.0, "grad_norm": 0.8902108893007158, "language_loss": 0.62708843, "learning_rate": 2.089740776971626e-06, "loss": 0.64750767, "num_input_tokens_seen": 179210490, "step": 8335, "time_per_iteration": 3.2171308994293213 }, { "auxiliary_loss_clip": 0.01110054, "auxiliary_loss_mlp": 0.01029223, "balance_loss_clip": 1.04289985, "balance_loss_mlp": 1.01612818, "epoch": 0.5011874342401924, "flos": 25336342840320.0, "grad_norm": 1.3859166459285381, "language_loss": 0.79553854, "learning_rate": 2.0893517058503435e-06, "loss": 0.81693137, "num_input_tokens_seen": 179231360, "step": 8336, "time_per_iteration": 2.6930394172668457 }, { "auxiliary_loss_clip": 0.01082861, "auxiliary_loss_mlp": 0.01032761, "balance_loss_clip": 1.03948808, "balance_loss_mlp": 1.01899827, "epoch": 0.5012475574928603, "flos": 20229953569920.0, "grad_norm": 2.2337029404169457, "language_loss": 0.80255198, "learning_rate": 2.088962631340836e-06, "loss": 0.82370824, "num_input_tokens_seen": 179250625, "step": 8337, "time_per_iteration": 2.725379467010498 }, { "auxiliary_loss_clip": 0.01129165, "auxiliary_loss_mlp": 0.01038167, "balance_loss_clip": 1.04644942, "balance_loss_mlp": 1.0239507, "epoch": 0.5013076807455283, "flos": 22710123855360.0, "grad_norm": 2.0126131839523835, "language_loss": 0.79470736, "learning_rate": 2.0885735534578555e-06, "loss": 0.81638074, "num_input_tokens_seen": 179267360, "step": 8338, "time_per_iteration": 2.6641087532043457 }, { "auxiliary_loss_clip": 0.01100565, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.04381251, "balance_loss_mlp": 1.01617527, "epoch": 0.5013678039981962, "flos": 24245161528320.0, "grad_norm": 1.6605427604759349, "language_loss": 0.85052264, "learning_rate": 2.0881844722161583e-06, "loss": 0.87182683, "num_input_tokens_seen": 179289810, "step": 8339, "time_per_iteration": 2.7899603843688965 }, { "auxiliary_loss_clip": 0.0111167, "auxiliary_loss_mlp": 0.01037127, "balance_loss_clip": 1.04381561, "balance_loss_mlp": 1.02343023, "epoch": 0.5014279272508643, "flos": 26176688501760.0, "grad_norm": 1.4822129376950433, "language_loss": 0.70713747, "learning_rate": 2.0877953876304962e-06, "loss": 0.72862542, "num_input_tokens_seen": 179310620, "step": 8340, "time_per_iteration": 2.773681402206421 }, { "auxiliary_loss_clip": 0.01088541, "auxiliary_loss_mlp": 0.01043525, "balance_loss_clip": 1.04147744, "balance_loss_mlp": 1.02764666, "epoch": 0.5014880505035323, "flos": 21430446946560.0, "grad_norm": 1.9911594693512178, "language_loss": 0.78301972, "learning_rate": 2.0874062997156245e-06, "loss": 0.80434036, "num_input_tokens_seen": 179329005, "step": 8341, "time_per_iteration": 2.7607786655426025 }, { "auxiliary_loss_clip": 0.01096808, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.04584622, "balance_loss_mlp": 1.02391934, "epoch": 0.5015481737562002, "flos": 15770745596160.0, "grad_norm": 4.243666050944008, "language_loss": 0.89054161, "learning_rate": 2.0870172084862975e-06, "loss": 0.9118948, "num_input_tokens_seen": 179343785, "step": 8342, "time_per_iteration": 2.7108232975006104 }, { "auxiliary_loss_clip": 0.01103427, "auxiliary_loss_mlp": 0.01036162, "balance_loss_clip": 1.04467797, "balance_loss_mlp": 1.02273893, "epoch": 0.5016082970088682, "flos": 26830801123200.0, "grad_norm": 1.768885433843204, "language_loss": 0.76325786, "learning_rate": 2.0866281139572682e-06, "loss": 0.78465378, "num_input_tokens_seen": 179364070, "step": 8343, "time_per_iteration": 2.6551196575164795 }, { "auxiliary_loss_clip": 0.01113632, "auxiliary_loss_mlp": 0.01028707, "balance_loss_clip": 1.04612589, "balance_loss_mlp": 1.01574898, "epoch": 0.5016684202615361, "flos": 21470595373440.0, "grad_norm": 1.8502078003194165, "language_loss": 0.6725269, "learning_rate": 2.086239016143293e-06, "loss": 0.6939503, "num_input_tokens_seen": 179384225, "step": 8344, "time_per_iteration": 2.634850263595581 }, { "auxiliary_loss_clip": 0.01104392, "auxiliary_loss_mlp": 0.0103805, "balance_loss_clip": 1.04439509, "balance_loss_mlp": 1.025056, "epoch": 0.5017285435142042, "flos": 26246821806720.0, "grad_norm": 2.403480744645997, "language_loss": 0.75519335, "learning_rate": 2.0858499150591258e-06, "loss": 0.77661783, "num_input_tokens_seen": 179402595, "step": 8345, "time_per_iteration": 2.7551872730255127 }, { "auxiliary_loss_clip": 0.01111042, "auxiliary_loss_mlp": 0.01031467, "balance_loss_clip": 1.04757214, "balance_loss_mlp": 1.01661348, "epoch": 0.5017886667668721, "flos": 20777555387520.0, "grad_norm": 2.18282722391055, "language_loss": 0.78664625, "learning_rate": 2.0854608107195203e-06, "loss": 0.80807132, "num_input_tokens_seen": 179419635, "step": 8346, "time_per_iteration": 2.661569833755493 }, { "auxiliary_loss_clip": 0.01102528, "auxiliary_loss_mlp": 0.00770029, "balance_loss_clip": 1.04322028, "balance_loss_mlp": 1.00006032, "epoch": 0.5018487900195401, "flos": 20156408472960.0, "grad_norm": 1.5952257408001917, "language_loss": 0.69384575, "learning_rate": 2.0850717031392333e-06, "loss": 0.71257138, "num_input_tokens_seen": 179438770, "step": 8347, "time_per_iteration": 2.7273542881011963 }, { "auxiliary_loss_clip": 0.0108784, "auxiliary_loss_mlp": 0.01037703, "balance_loss_clip": 1.04173744, "balance_loss_mlp": 1.02352858, "epoch": 0.501908913272208, "flos": 18150689957760.0, "grad_norm": 1.852088117198485, "language_loss": 0.70635176, "learning_rate": 2.0846825923330174e-06, "loss": 0.72760713, "num_input_tokens_seen": 179457475, "step": 8348, "time_per_iteration": 2.7395875453948975 }, { "auxiliary_loss_clip": 0.01110808, "auxiliary_loss_mlp": 0.01035347, "balance_loss_clip": 1.04538929, "balance_loss_mlp": 1.02306843, "epoch": 0.501969036524876, "flos": 23112287504640.0, "grad_norm": 1.775170825025465, "language_loss": 0.74760187, "learning_rate": 2.0842934783156303e-06, "loss": 0.76906341, "num_input_tokens_seen": 179478140, "step": 8349, "time_per_iteration": 2.6996099948883057 }, { "auxiliary_loss_clip": 0.01112401, "auxiliary_loss_mlp": 0.01034238, "balance_loss_clip": 1.0427202, "balance_loss_mlp": 1.01971805, "epoch": 0.5020291597775439, "flos": 11363214314880.0, "grad_norm": 2.078287176668375, "language_loss": 0.63625813, "learning_rate": 2.0839043611018266e-06, "loss": 0.6577245, "num_input_tokens_seen": 179494325, "step": 8350, "time_per_iteration": 2.6264822483062744 }, { "auxiliary_loss_clip": 0.01015981, "auxiliary_loss_mlp": 0.01015388, "balance_loss_clip": 1.01908755, "balance_loss_mlp": 1.01377916, "epoch": 0.5020892830302119, "flos": 64011094928640.0, "grad_norm": 0.7752505604108973, "language_loss": 0.59761232, "learning_rate": 2.0835152407063597e-06, "loss": 0.617926, "num_input_tokens_seen": 179553545, "step": 8351, "time_per_iteration": 3.4168505668640137 }, { "auxiliary_loss_clip": 0.01100468, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.04387021, "balance_loss_mlp": 1.02232814, "epoch": 0.5021494062828799, "flos": 23732859801600.0, "grad_norm": 1.746970205481512, "language_loss": 0.74981982, "learning_rate": 2.0831261171439873e-06, "loss": 0.77118939, "num_input_tokens_seen": 179573645, "step": 8352, "time_per_iteration": 2.7219762802124023 }, { "auxiliary_loss_clip": 0.01097371, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.04458284, "balance_loss_mlp": 1.02211952, "epoch": 0.5022095295355479, "flos": 21576747041280.0, "grad_norm": 1.6929263676943664, "language_loss": 0.71971965, "learning_rate": 2.082736990429464e-06, "loss": 0.74105263, "num_input_tokens_seen": 179591435, "step": 8353, "time_per_iteration": 2.6912848949432373 }, { "auxiliary_loss_clip": 0.01123337, "auxiliary_loss_mlp": 0.01037374, "balance_loss_clip": 1.05196476, "balance_loss_mlp": 1.02265787, "epoch": 0.5022696527882159, "flos": 21397229844480.0, "grad_norm": 1.8297806631316527, "language_loss": 0.74025398, "learning_rate": 2.0823478605775455e-06, "loss": 0.76186109, "num_input_tokens_seen": 179609955, "step": 8354, "time_per_iteration": 2.7325775623321533 }, { "auxiliary_loss_clip": 0.0110051, "auxiliary_loss_mlp": 0.01042571, "balance_loss_clip": 1.04367399, "balance_loss_mlp": 1.02817094, "epoch": 0.5023297760408838, "flos": 27160712565120.0, "grad_norm": 1.8324523966840642, "language_loss": 0.72395205, "learning_rate": 2.0819587276029884e-06, "loss": 0.74538279, "num_input_tokens_seen": 179630875, "step": 8355, "time_per_iteration": 2.717954158782959 }, { "auxiliary_loss_clip": 0.01117118, "auxiliary_loss_mlp": 0.01041207, "balance_loss_clip": 1.0459739, "balance_loss_mlp": 1.02644253, "epoch": 0.5023898992935518, "flos": 26213820186240.0, "grad_norm": 1.6992540953340016, "language_loss": 0.81400853, "learning_rate": 2.081569591520548e-06, "loss": 0.83559179, "num_input_tokens_seen": 179649835, "step": 8356, "time_per_iteration": 2.7149479389190674 }, { "auxiliary_loss_clip": 0.01117006, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.04384911, "balance_loss_mlp": 1.02906859, "epoch": 0.5024500225462197, "flos": 13440323111040.0, "grad_norm": 2.281950898223197, "language_loss": 0.76235557, "learning_rate": 2.0811804523449803e-06, "loss": 0.78396809, "num_input_tokens_seen": 179667605, "step": 8357, "time_per_iteration": 2.6641504764556885 }, { "auxiliary_loss_clip": 0.01115092, "auxiliary_loss_mlp": 0.01038737, "balance_loss_clip": 1.04538774, "balance_loss_mlp": 1.02369308, "epoch": 0.5025101457988878, "flos": 21579584215680.0, "grad_norm": 1.606830870939079, "language_loss": 0.766074, "learning_rate": 2.0807913100910417e-06, "loss": 0.78761232, "num_input_tokens_seen": 179686910, "step": 8358, "time_per_iteration": 2.715304136276245 }, { "auxiliary_loss_clip": 0.01101769, "auxiliary_loss_mlp": 0.0103829, "balance_loss_clip": 1.04243326, "balance_loss_mlp": 1.02330494, "epoch": 0.5025702690515557, "flos": 24645134448000.0, "grad_norm": 2.4091387510851354, "language_loss": 0.72286153, "learning_rate": 2.0804021647734887e-06, "loss": 0.7442621, "num_input_tokens_seen": 179706395, "step": 8359, "time_per_iteration": 2.7783002853393555 }, { "auxiliary_loss_clip": 0.01097913, "auxiliary_loss_mlp": 0.01045718, "balance_loss_clip": 1.04463625, "balance_loss_mlp": 1.03208613, "epoch": 0.5026303923042237, "flos": 22090162089600.0, "grad_norm": 1.9040983502257391, "language_loss": 0.76839483, "learning_rate": 2.080013016407077e-06, "loss": 0.7898311, "num_input_tokens_seen": 179725735, "step": 8360, "time_per_iteration": 2.6632778644561768 }, { "auxiliary_loss_clip": 0.01085631, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.04737091, "balance_loss_mlp": 1.02541208, "epoch": 0.5026905155568916, "flos": 23697200574720.0, "grad_norm": 1.9221287440607566, "language_loss": 0.7667141, "learning_rate": 2.0796238650065645e-06, "loss": 0.78795838, "num_input_tokens_seen": 179746150, "step": 8361, "time_per_iteration": 2.7411348819732666 }, { "auxiliary_loss_clip": 0.01096697, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.04426289, "balance_loss_mlp": 1.01988244, "epoch": 0.5027506388095596, "flos": 25812410722560.0, "grad_norm": 1.5686217043676736, "language_loss": 0.85069525, "learning_rate": 2.0792347105867065e-06, "loss": 0.87201089, "num_input_tokens_seen": 179767550, "step": 8362, "time_per_iteration": 2.827319622039795 }, { "auxiliary_loss_clip": 0.01102707, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.0435946, "balance_loss_mlp": 1.02022946, "epoch": 0.5028107620622275, "flos": 27526606456320.0, "grad_norm": 1.54737690881779, "language_loss": 0.78134143, "learning_rate": 2.0788455531622605e-06, "loss": 0.80270725, "num_input_tokens_seen": 179790075, "step": 8363, "time_per_iteration": 2.76174259185791 }, { "auxiliary_loss_clip": 0.01111576, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.04562223, "balance_loss_mlp": 1.02060819, "epoch": 0.5028708853148955, "flos": 24534278098560.0, "grad_norm": 3.229087026174198, "language_loss": 0.75995886, "learning_rate": 2.0784563927479838e-06, "loss": 0.78142548, "num_input_tokens_seen": 179806515, "step": 8364, "time_per_iteration": 4.35154914855957 }, { "auxiliary_loss_clip": 0.01124922, "auxiliary_loss_mlp": 0.01030963, "balance_loss_clip": 1.04685044, "balance_loss_mlp": 1.01810658, "epoch": 0.5029310085675635, "flos": 20813609664000.0, "grad_norm": 1.5241312757107228, "language_loss": 0.69465041, "learning_rate": 2.0780672293586317e-06, "loss": 0.71620929, "num_input_tokens_seen": 179826450, "step": 8365, "time_per_iteration": 2.619415283203125 }, { "auxiliary_loss_clip": 0.01103666, "auxiliary_loss_mlp": 0.01034829, "balance_loss_clip": 1.04435158, "balance_loss_mlp": 1.0207144, "epoch": 0.5029911318202315, "flos": 22342470197760.0, "grad_norm": 1.4884180792885182, "language_loss": 0.73293805, "learning_rate": 2.0776780630089635e-06, "loss": 0.75432301, "num_input_tokens_seen": 179846770, "step": 8366, "time_per_iteration": 4.228264331817627 }, { "auxiliary_loss_clip": 0.01113401, "auxiliary_loss_mlp": 0.01032302, "balance_loss_clip": 1.04693627, "balance_loss_mlp": 1.0189749, "epoch": 0.5030512550728995, "flos": 24352713826560.0, "grad_norm": 1.4343945223262573, "language_loss": 0.7806654, "learning_rate": 2.077288893713735e-06, "loss": 0.80212247, "num_input_tokens_seen": 179866585, "step": 8367, "time_per_iteration": 4.1336071491241455 }, { "auxiliary_loss_clip": 0.01113589, "auxiliary_loss_mlp": 0.01031042, "balance_loss_clip": 1.0443697, "balance_loss_mlp": 1.01778555, "epoch": 0.5031113783255674, "flos": 18259930195200.0, "grad_norm": 1.686368676940742, "language_loss": 0.69880998, "learning_rate": 2.0768997214877035e-06, "loss": 0.72025627, "num_input_tokens_seen": 179885575, "step": 8368, "time_per_iteration": 2.5836374759674072 }, { "auxiliary_loss_clip": 0.01036914, "auxiliary_loss_mlp": 0.01003217, "balance_loss_clip": 1.0201298, "balance_loss_mlp": 1.00156045, "epoch": 0.5031715015782354, "flos": 57253173200640.0, "grad_norm": 0.8467965026864039, "language_loss": 0.63315928, "learning_rate": 2.0765105463456274e-06, "loss": 0.65356052, "num_input_tokens_seen": 179939650, "step": 8369, "time_per_iteration": 4.438805103302002 }, { "auxiliary_loss_clip": 0.011076, "auxiliary_loss_mlp": 0.01034663, "balance_loss_clip": 1.04427028, "balance_loss_mlp": 1.0215379, "epoch": 0.5032316248309033, "flos": 27527360641920.0, "grad_norm": 2.0752589468807043, "language_loss": 0.60782373, "learning_rate": 2.076121368302263e-06, "loss": 0.62924629, "num_input_tokens_seen": 179961765, "step": 8370, "time_per_iteration": 2.65816330909729 }, { "auxiliary_loss_clip": 0.01076531, "auxiliary_loss_mlp": 0.01043773, "balance_loss_clip": 1.04144311, "balance_loss_mlp": 1.02868104, "epoch": 0.5032917480835714, "flos": 34495825939200.0, "grad_norm": 1.8954281033433134, "language_loss": 0.68462563, "learning_rate": 2.0757321873723695e-06, "loss": 0.70582867, "num_input_tokens_seen": 179983015, "step": 8371, "time_per_iteration": 2.8479132652282715 }, { "auxiliary_loss_clip": 0.01097422, "auxiliary_loss_mlp": 0.01034396, "balance_loss_clip": 1.04120922, "balance_loss_mlp": 1.019364, "epoch": 0.5033518713362393, "flos": 33656773167360.0, "grad_norm": 1.6611598690743674, "language_loss": 0.67656618, "learning_rate": 2.0753430035707042e-06, "loss": 0.69788438, "num_input_tokens_seen": 180003210, "step": 8372, "time_per_iteration": 2.767489194869995 }, { "auxiliary_loss_clip": 0.01085092, "auxiliary_loss_mlp": 0.01043333, "balance_loss_clip": 1.04139996, "balance_loss_mlp": 1.02714443, "epoch": 0.5034119945889073, "flos": 28185495586560.0, "grad_norm": 1.9018001021824607, "language_loss": 0.66726547, "learning_rate": 2.0749538169120235e-06, "loss": 0.68854976, "num_input_tokens_seen": 180025530, "step": 8373, "time_per_iteration": 2.7779579162597656 }, { "auxiliary_loss_clip": 0.0109703, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.04184651, "balance_loss_mlp": 1.02208042, "epoch": 0.5034721178415752, "flos": 21358697529600.0, "grad_norm": 1.7065424378128664, "language_loss": 0.74679291, "learning_rate": 2.0745646274110872e-06, "loss": 0.76811939, "num_input_tokens_seen": 180043180, "step": 8374, "time_per_iteration": 2.673182487487793 }, { "auxiliary_loss_clip": 0.01100104, "auxiliary_loss_mlp": 0.01040932, "balance_loss_clip": 1.04264212, "balance_loss_mlp": 1.02604842, "epoch": 0.5035322410942432, "flos": 22674823764480.0, "grad_norm": 1.5424981365737231, "language_loss": 0.68154198, "learning_rate": 2.0741754350826525e-06, "loss": 0.70295238, "num_input_tokens_seen": 180062905, "step": 8375, "time_per_iteration": 2.6842665672302246 }, { "auxiliary_loss_clip": 0.01077033, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.04517126, "balance_loss_mlp": 1.0195334, "epoch": 0.5035923643469111, "flos": 19828723674240.0, "grad_norm": 3.5828954699990656, "language_loss": 0.79316169, "learning_rate": 2.0737862399414777e-06, "loss": 0.81427765, "num_input_tokens_seen": 180082000, "step": 8376, "time_per_iteration": 2.7780654430389404 }, { "auxiliary_loss_clip": 0.01117369, "auxiliary_loss_mlp": 0.00771622, "balance_loss_clip": 1.04441619, "balance_loss_mlp": 1.00016475, "epoch": 0.5036524875995791, "flos": 30514625182080.0, "grad_norm": 2.6140774214814693, "language_loss": 0.59478593, "learning_rate": 2.0733970420023213e-06, "loss": 0.61367583, "num_input_tokens_seen": 180101340, "step": 8377, "time_per_iteration": 2.8071539402008057 }, { "auxiliary_loss_clip": 0.01101437, "auxiliary_loss_mlp": 0.01036815, "balance_loss_clip": 1.04309344, "balance_loss_mlp": 1.02237928, "epoch": 0.5037126108522471, "flos": 14720574637440.0, "grad_norm": 2.0235166884987663, "language_loss": 0.76598781, "learning_rate": 2.0730078412799425e-06, "loss": 0.78737032, "num_input_tokens_seen": 180119160, "step": 8378, "time_per_iteration": 2.7332303524017334 }, { "auxiliary_loss_clip": 0.01086538, "auxiliary_loss_mlp": 0.01035008, "balance_loss_clip": 1.04592919, "balance_loss_mlp": 1.02190685, "epoch": 0.5037727341049151, "flos": 25297702784640.0, "grad_norm": 1.7029006786118923, "language_loss": 0.75000858, "learning_rate": 2.0726186377890985e-06, "loss": 0.77122402, "num_input_tokens_seen": 180138730, "step": 8379, "time_per_iteration": 2.8803420066833496 }, { "auxiliary_loss_clip": 0.0111301, "auxiliary_loss_mlp": 0.01035016, "balance_loss_clip": 1.04890418, "balance_loss_mlp": 1.02151, "epoch": 0.5038328573575831, "flos": 28541764632960.0, "grad_norm": 2.071075437448324, "language_loss": 0.67026305, "learning_rate": 2.072229431544548e-06, "loss": 0.69174337, "num_input_tokens_seen": 180158810, "step": 8380, "time_per_iteration": 2.7347092628479004 }, { "auxiliary_loss_clip": 0.01070606, "auxiliary_loss_mlp": 0.01037412, "balance_loss_clip": 1.04154301, "balance_loss_mlp": 1.02420914, "epoch": 0.503892980610251, "flos": 31649869503360.0, "grad_norm": 1.7540511910669407, "language_loss": 0.63245583, "learning_rate": 2.071840222561051e-06, "loss": 0.65353596, "num_input_tokens_seen": 180179700, "step": 8381, "time_per_iteration": 2.836247444152832 }, { "auxiliary_loss_clip": 0.01101604, "auxiliary_loss_mlp": 0.01039312, "balance_loss_clip": 1.04428375, "balance_loss_mlp": 1.02624631, "epoch": 0.503953103862919, "flos": 27089358197760.0, "grad_norm": 1.4852984664170332, "language_loss": 0.67586917, "learning_rate": 2.071451010853365e-06, "loss": 0.69727832, "num_input_tokens_seen": 180199890, "step": 8382, "time_per_iteration": 2.776895523071289 }, { "auxiliary_loss_clip": 0.01115945, "auxiliary_loss_mlp": 0.010349, "balance_loss_clip": 1.04923749, "balance_loss_mlp": 1.02039194, "epoch": 0.5040132271155869, "flos": 15632957024640.0, "grad_norm": 2.370012953933875, "language_loss": 0.62379169, "learning_rate": 2.0710617964362506e-06, "loss": 0.64530009, "num_input_tokens_seen": 180217840, "step": 8383, "time_per_iteration": 2.7200045585632324 }, { "auxiliary_loss_clip": 0.0108883, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.04611087, "balance_loss_mlp": 1.02349341, "epoch": 0.504073350368255, "flos": 13590106824960.0, "grad_norm": 1.70449565256652, "language_loss": 0.66918409, "learning_rate": 2.070672579324465e-06, "loss": 0.69044316, "num_input_tokens_seen": 180236465, "step": 8384, "time_per_iteration": 2.7442476749420166 }, { "auxiliary_loss_clip": 0.01108405, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 1.04502487, "balance_loss_mlp": 1.02765775, "epoch": 0.5041334736209229, "flos": 29058160510080.0, "grad_norm": 3.2853523964565072, "language_loss": 0.7103979, "learning_rate": 2.0702833595327674e-06, "loss": 0.73188871, "num_input_tokens_seen": 180258025, "step": 8385, "time_per_iteration": 2.7480194568634033 }, { "auxiliary_loss_clip": 0.01110668, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.0450182, "balance_loss_mlp": 1.01644468, "epoch": 0.5041935968735909, "flos": 24608361899520.0, "grad_norm": 1.9814049774657359, "language_loss": 0.83344412, "learning_rate": 2.069894137075919e-06, "loss": 0.8548454, "num_input_tokens_seen": 180277825, "step": 8386, "time_per_iteration": 2.703789234161377 }, { "auxiliary_loss_clip": 0.01108831, "auxiliary_loss_mlp": 0.01037004, "balance_loss_clip": 1.04437232, "balance_loss_mlp": 1.02313972, "epoch": 0.5042537201262588, "flos": 26286934320000.0, "grad_norm": 1.592773103928685, "language_loss": 0.66832674, "learning_rate": 2.0695049119686766e-06, "loss": 0.68978512, "num_input_tokens_seen": 180300465, "step": 8387, "time_per_iteration": 2.8348472118377686 }, { "auxiliary_loss_clip": 0.0106703, "auxiliary_loss_mlp": 0.01033704, "balance_loss_clip": 1.03972006, "balance_loss_mlp": 1.02091861, "epoch": 0.5043138433789268, "flos": 22017371178240.0, "grad_norm": 1.386335560273684, "language_loss": 0.80273068, "learning_rate": 2.0691156842258016e-06, "loss": 0.82373804, "num_input_tokens_seen": 180321050, "step": 8388, "time_per_iteration": 2.8797311782836914 }, { "auxiliary_loss_clip": 0.01112016, "auxiliary_loss_mlp": 0.01032606, "balance_loss_clip": 1.04459918, "balance_loss_mlp": 1.01927233, "epoch": 0.5043739666315947, "flos": 28767104605440.0, "grad_norm": 2.1659708262729436, "language_loss": 0.69815123, "learning_rate": 2.0687264538620537e-06, "loss": 0.7195974, "num_input_tokens_seen": 180338870, "step": 8389, "time_per_iteration": 2.7739861011505127 }, { "auxiliary_loss_clip": 0.01090981, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.04124045, "balance_loss_mlp": 1.02756596, "epoch": 0.5044340898842627, "flos": 27599253713280.0, "grad_norm": 1.6276843858059296, "language_loss": 0.6986587, "learning_rate": 2.068337220892191e-06, "loss": 0.71997494, "num_input_tokens_seen": 180361285, "step": 8390, "time_per_iteration": 2.844275712966919 }, { "auxiliary_loss_clip": 0.01033792, "auxiliary_loss_mlp": 0.01003101, "balance_loss_clip": 1.02656126, "balance_loss_mlp": 1.00192666, "epoch": 0.5044942131369307, "flos": 67458050749440.0, "grad_norm": 0.9139771068710668, "language_loss": 0.52933067, "learning_rate": 2.067947985330974e-06, "loss": 0.54969966, "num_input_tokens_seen": 180415170, "step": 8391, "time_per_iteration": 3.054262638092041 }, { "auxiliary_loss_clip": 0.01015619, "auxiliary_loss_mlp": 0.01001074, "balance_loss_clip": 1.02201819, "balance_loss_mlp": 0.99963111, "epoch": 0.5045543363895987, "flos": 58630849390080.0, "grad_norm": 0.853635093218063, "language_loss": 0.60675329, "learning_rate": 2.0675587471931628e-06, "loss": 0.62692022, "num_input_tokens_seen": 180468060, "step": 8392, "time_per_iteration": 3.0727028846740723 }, { "auxiliary_loss_clip": 0.01085218, "auxiliary_loss_mlp": 0.01036141, "balance_loss_clip": 1.04148042, "balance_loss_mlp": 1.02351034, "epoch": 0.5046144596422667, "flos": 22526620248960.0, "grad_norm": 2.343143032045354, "language_loss": 0.84343797, "learning_rate": 2.067169506493517e-06, "loss": 0.86465156, "num_input_tokens_seen": 180486610, "step": 8393, "time_per_iteration": 2.7260749340057373 }, { "auxiliary_loss_clip": 0.01087949, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.04098725, "balance_loss_mlp": 1.02107096, "epoch": 0.5046745828949346, "flos": 27454246508160.0, "grad_norm": 1.8418334138160795, "language_loss": 0.50936127, "learning_rate": 2.0667802632467974e-06, "loss": 0.53057826, "num_input_tokens_seen": 180508135, "step": 8394, "time_per_iteration": 2.827000617980957 }, { "auxiliary_loss_clip": 0.01121524, "auxiliary_loss_mlp": 0.0103809, "balance_loss_clip": 1.04323471, "balance_loss_mlp": 1.02311766, "epoch": 0.5047347061476026, "flos": 17274541415040.0, "grad_norm": 1.5679941994223312, "language_loss": 0.75414777, "learning_rate": 2.0663910174677627e-06, "loss": 0.7757439, "num_input_tokens_seen": 180527000, "step": 8395, "time_per_iteration": 2.6535708904266357 }, { "auxiliary_loss_clip": 0.01106012, "auxiliary_loss_mlp": 0.01041618, "balance_loss_clip": 1.04312563, "balance_loss_mlp": 1.02860057, "epoch": 0.5047948294002705, "flos": 16649515831680.0, "grad_norm": 2.0910564250698562, "language_loss": 0.68781769, "learning_rate": 2.0660017691711737e-06, "loss": 0.70929396, "num_input_tokens_seen": 180544715, "step": 8396, "time_per_iteration": 2.700747013092041 }, { "auxiliary_loss_clip": 0.01111788, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.04604292, "balance_loss_mlp": 1.02059579, "epoch": 0.5048549526529386, "flos": 26865706164480.0, "grad_norm": 3.479269791703844, "language_loss": 0.78899479, "learning_rate": 2.065612518371792e-06, "loss": 0.81044173, "num_input_tokens_seen": 180565365, "step": 8397, "time_per_iteration": 2.716320514678955 }, { "auxiliary_loss_clip": 0.01078686, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.04137075, "balance_loss_mlp": 1.02079701, "epoch": 0.5049150759056065, "flos": 21833939399040.0, "grad_norm": 3.435063442023246, "language_loss": 0.66291559, "learning_rate": 2.065223265084376e-06, "loss": 0.68404007, "num_input_tokens_seen": 180586670, "step": 8398, "time_per_iteration": 2.773245334625244 }, { "auxiliary_loss_clip": 0.01113858, "auxiliary_loss_mlp": 0.00770983, "balance_loss_clip": 1.04783058, "balance_loss_mlp": 1.00018215, "epoch": 0.5049751991582745, "flos": 21685807710720.0, "grad_norm": 1.5640615321007765, "language_loss": 0.720043, "learning_rate": 2.064834009323688e-06, "loss": 0.73889136, "num_input_tokens_seen": 180605085, "step": 8399, "time_per_iteration": 2.697688341140747 }, { "auxiliary_loss_clip": 0.01091578, "auxiliary_loss_mlp": 0.01053063, "balance_loss_clip": 1.04215539, "balance_loss_mlp": 1.03741038, "epoch": 0.5050353224109424, "flos": 21359379888000.0, "grad_norm": 3.5795224523825695, "language_loss": 0.81615806, "learning_rate": 2.0644447511044878e-06, "loss": 0.8376044, "num_input_tokens_seen": 180624370, "step": 8400, "time_per_iteration": 2.7172608375549316 }, { "auxiliary_loss_clip": 0.01084985, "auxiliary_loss_mlp": 0.01039311, "balance_loss_clip": 1.04359269, "balance_loss_mlp": 1.02413547, "epoch": 0.5050954456636104, "flos": 22820082364800.0, "grad_norm": 1.9975954417395212, "language_loss": 0.78901821, "learning_rate": 2.0640554904415362e-06, "loss": 0.81026119, "num_input_tokens_seen": 180642450, "step": 8401, "time_per_iteration": 2.790361166000366 }, { "auxiliary_loss_clip": 0.01125612, "auxiliary_loss_mlp": 0.00770602, "balance_loss_clip": 1.04576373, "balance_loss_mlp": 1.00024748, "epoch": 0.5051555689162783, "flos": 30448226891520.0, "grad_norm": 1.6142524162989784, "language_loss": 0.70102769, "learning_rate": 2.063666227349593e-06, "loss": 0.7199899, "num_input_tokens_seen": 180665250, "step": 8402, "time_per_iteration": 2.6950721740722656 }, { "auxiliary_loss_clip": 0.01112822, "auxiliary_loss_mlp": 0.00771289, "balance_loss_clip": 1.04341567, "balance_loss_mlp": 1.00022268, "epoch": 0.5052156921689464, "flos": 21287953693440.0, "grad_norm": 2.3922403816883433, "language_loss": 0.69298434, "learning_rate": 2.063276961843422e-06, "loss": 0.71182549, "num_input_tokens_seen": 180687425, "step": 8403, "time_per_iteration": 4.257136344909668 }, { "auxiliary_loss_clip": 0.01109967, "auxiliary_loss_mlp": 0.01043124, "balance_loss_clip": 1.04455948, "balance_loss_mlp": 1.03021932, "epoch": 0.5052758154216143, "flos": 25081305298560.0, "grad_norm": 1.6578366313908228, "language_loss": 0.85693455, "learning_rate": 2.062887693937781e-06, "loss": 0.87846541, "num_input_tokens_seen": 180708725, "step": 8404, "time_per_iteration": 2.725935459136963 }, { "auxiliary_loss_clip": 0.01087696, "auxiliary_loss_mlp": 0.00769912, "balance_loss_clip": 1.04370379, "balance_loss_mlp": 1.00018847, "epoch": 0.5053359386742823, "flos": 20885502735360.0, "grad_norm": 1.5507323053673605, "language_loss": 0.75329977, "learning_rate": 2.0624984236474322e-06, "loss": 0.77187586, "num_input_tokens_seen": 180727990, "step": 8405, "time_per_iteration": 4.237490653991699 }, { "auxiliary_loss_clip": 0.01124188, "auxiliary_loss_mlp": 0.01031903, "balance_loss_clip": 1.04560125, "balance_loss_mlp": 1.01756775, "epoch": 0.5053960619269503, "flos": 37743335493120.0, "grad_norm": 1.5851552924914987, "language_loss": 0.73046809, "learning_rate": 2.0621091509871378e-06, "loss": 0.75202894, "num_input_tokens_seen": 180749765, "step": 8406, "time_per_iteration": 4.387450218200684 }, { "auxiliary_loss_clip": 0.0108276, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.04293895, "balance_loss_mlp": 1.01945066, "epoch": 0.5054561851796182, "flos": 23513840622720.0, "grad_norm": 1.8244341787972256, "language_loss": 0.76631331, "learning_rate": 2.0617198759716568e-06, "loss": 0.78746021, "num_input_tokens_seen": 180769580, "step": 8407, "time_per_iteration": 2.765031099319458 }, { "auxiliary_loss_clip": 0.01085678, "auxiliary_loss_mlp": 0.01030739, "balance_loss_clip": 1.04038286, "balance_loss_mlp": 1.01838887, "epoch": 0.5055163084322862, "flos": 30410233280640.0, "grad_norm": 1.769865286909125, "language_loss": 0.63482308, "learning_rate": 2.0613305986157535e-06, "loss": 0.65598726, "num_input_tokens_seen": 180790295, "step": 8408, "time_per_iteration": 2.7497997283935547 }, { "auxiliary_loss_clip": 0.01094613, "auxiliary_loss_mlp": 0.01046938, "balance_loss_clip": 1.04494774, "balance_loss_mlp": 1.03097582, "epoch": 0.5055764316849541, "flos": 20259651139200.0, "grad_norm": 1.9259425074827412, "language_loss": 0.63427341, "learning_rate": 2.0609413189341865e-06, "loss": 0.655689, "num_input_tokens_seen": 180807875, "step": 8409, "time_per_iteration": 4.083381652832031 }, { "auxiliary_loss_clip": 0.01099903, "auxiliary_loss_mlp": 0.01029856, "balance_loss_clip": 1.04535913, "balance_loss_mlp": 1.01790488, "epoch": 0.5056365549376222, "flos": 26070895969920.0, "grad_norm": 2.0381050127162528, "language_loss": 0.71175253, "learning_rate": 2.0605520369417193e-06, "loss": 0.73305017, "num_input_tokens_seen": 180831300, "step": 8410, "time_per_iteration": 2.7279632091522217 }, { "auxiliary_loss_clip": 0.01097675, "auxiliary_loss_mlp": 0.0104194, "balance_loss_clip": 1.04237318, "balance_loss_mlp": 1.02787888, "epoch": 0.5056966781902901, "flos": 19279074781440.0, "grad_norm": 1.4485544779958848, "language_loss": 0.79037184, "learning_rate": 2.060162752653113e-06, "loss": 0.81176794, "num_input_tokens_seen": 180849055, "step": 8411, "time_per_iteration": 2.6837332248687744 }, { "auxiliary_loss_clip": 0.01125313, "auxiliary_loss_mlp": 0.01039106, "balance_loss_clip": 1.04655755, "balance_loss_mlp": 1.02372837, "epoch": 0.5057568014429581, "flos": 21323325611520.0, "grad_norm": 1.8986612146492552, "language_loss": 0.81808418, "learning_rate": 2.0597734660831285e-06, "loss": 0.83972836, "num_input_tokens_seen": 180867395, "step": 8412, "time_per_iteration": 2.615809679031372 }, { "auxiliary_loss_clip": 0.01103779, "auxiliary_loss_mlp": 0.01041145, "balance_loss_clip": 1.04390502, "balance_loss_mlp": 1.02739954, "epoch": 0.505816924695626, "flos": 17493596507520.0, "grad_norm": 1.9029826105260268, "language_loss": 0.80660832, "learning_rate": 2.0593841772465283e-06, "loss": 0.82805753, "num_input_tokens_seen": 180886670, "step": 8413, "time_per_iteration": 2.7692911624908447 }, { "auxiliary_loss_clip": 0.0109162, "auxiliary_loss_mlp": 0.00771431, "balance_loss_clip": 1.04406643, "balance_loss_mlp": 1.00020945, "epoch": 0.505877047948294, "flos": 21142084561920.0, "grad_norm": 1.9410580169313951, "language_loss": 0.80582374, "learning_rate": 2.0589948861580737e-06, "loss": 0.82445419, "num_input_tokens_seen": 180904645, "step": 8414, "time_per_iteration": 2.6970348358154297 }, { "auxiliary_loss_clip": 0.01107406, "auxiliary_loss_mlp": 0.01030257, "balance_loss_clip": 1.03923571, "balance_loss_mlp": 1.0169946, "epoch": 0.5059371712009619, "flos": 36350036887680.0, "grad_norm": 2.0609800291463225, "language_loss": 0.62233627, "learning_rate": 2.058605592832528e-06, "loss": 0.64371288, "num_input_tokens_seen": 180922340, "step": 8415, "time_per_iteration": 2.7197422981262207 }, { "auxiliary_loss_clip": 0.01087332, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.04092574, "balance_loss_mlp": 1.01899433, "epoch": 0.50599729445363, "flos": 22673387220480.0, "grad_norm": 1.6231002317718672, "language_loss": 0.81935573, "learning_rate": 2.0582162972846515e-06, "loss": 0.84055215, "num_input_tokens_seen": 180941350, "step": 8416, "time_per_iteration": 2.782719612121582 }, { "auxiliary_loss_clip": 0.01091272, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.04698849, "balance_loss_mlp": 1.02498984, "epoch": 0.5060574177062979, "flos": 22747866071040.0, "grad_norm": 1.5803053727793945, "language_loss": 0.78981423, "learning_rate": 2.0578269995292078e-06, "loss": 0.81110072, "num_input_tokens_seen": 180960720, "step": 8417, "time_per_iteration": 2.7089340686798096 }, { "auxiliary_loss_clip": 0.01070059, "auxiliary_loss_mlp": 0.01039058, "balance_loss_clip": 1.0394783, "balance_loss_mlp": 1.02599227, "epoch": 0.5061175409589659, "flos": 21653201139840.0, "grad_norm": 1.8562945560748794, "language_loss": 0.62433213, "learning_rate": 2.0574376995809588e-06, "loss": 0.64542329, "num_input_tokens_seen": 180979725, "step": 8418, "time_per_iteration": 2.719282388687134 }, { "auxiliary_loss_clip": 0.0109094, "auxiliary_loss_mlp": 0.01035325, "balance_loss_clip": 1.04258347, "balance_loss_mlp": 1.02194929, "epoch": 0.5061776642116339, "flos": 21616249023360.0, "grad_norm": 2.2787836153634724, "language_loss": 0.77394211, "learning_rate": 2.0570483974546653e-06, "loss": 0.79520482, "num_input_tokens_seen": 180998980, "step": 8419, "time_per_iteration": 2.741727113723755 }, { "auxiliary_loss_clip": 0.01062039, "auxiliary_loss_mlp": 0.0103574, "balance_loss_clip": 1.04027188, "balance_loss_mlp": 1.02160168, "epoch": 0.5062377874643018, "flos": 24426294837120.0, "grad_norm": 1.7570247471688223, "language_loss": 0.77180004, "learning_rate": 2.0566590931650917e-06, "loss": 0.79277784, "num_input_tokens_seen": 181019165, "step": 8420, "time_per_iteration": 2.8240675926208496 }, { "auxiliary_loss_clip": 0.01123562, "auxiliary_loss_mlp": 0.01036164, "balance_loss_clip": 1.04462767, "balance_loss_mlp": 1.02188277, "epoch": 0.5062979107169698, "flos": 22524429519360.0, "grad_norm": 1.730716034871051, "language_loss": 0.77317429, "learning_rate": 2.056269786726999e-06, "loss": 0.79477155, "num_input_tokens_seen": 181037110, "step": 8421, "time_per_iteration": 2.6797008514404297 }, { "auxiliary_loss_clip": 0.01106529, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.04212284, "balance_loss_mlp": 1.01860261, "epoch": 0.5063580339696377, "flos": 24571984400640.0, "grad_norm": 1.4584078249019805, "language_loss": 0.66635919, "learning_rate": 2.0558804781551512e-06, "loss": 0.68774974, "num_input_tokens_seen": 181057775, "step": 8422, "time_per_iteration": 2.80218505859375 }, { "auxiliary_loss_clip": 0.01123775, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.04679537, "balance_loss_mlp": 1.01939869, "epoch": 0.5064181572223058, "flos": 22596143022720.0, "grad_norm": 1.7069001340883154, "language_loss": 0.818717, "learning_rate": 2.05549116746431e-06, "loss": 0.84028399, "num_input_tokens_seen": 181078260, "step": 8423, "time_per_iteration": 2.6722168922424316 }, { "auxiliary_loss_clip": 0.01124994, "auxiliary_loss_mlp": 0.00771759, "balance_loss_clip": 1.04458904, "balance_loss_mlp": 1.00021005, "epoch": 0.5064782804749737, "flos": 25994944661760.0, "grad_norm": 1.7762047243227106, "language_loss": 0.74689841, "learning_rate": 2.055101854669237e-06, "loss": 0.76586592, "num_input_tokens_seen": 181098755, "step": 8424, "time_per_iteration": 2.657538652420044 }, { "auxiliary_loss_clip": 0.01121266, "auxiliary_loss_mlp": 0.01037955, "balance_loss_clip": 1.04494393, "balance_loss_mlp": 1.02427602, "epoch": 0.5065384037276417, "flos": 28553041503360.0, "grad_norm": 1.7147939268792267, "language_loss": 0.71541035, "learning_rate": 2.0547125397846975e-06, "loss": 0.73700261, "num_input_tokens_seen": 181121570, "step": 8425, "time_per_iteration": 2.6696951389312744 }, { "auxiliary_loss_clip": 0.0108314, "auxiliary_loss_mlp": 0.01043142, "balance_loss_clip": 1.04042649, "balance_loss_mlp": 1.02828813, "epoch": 0.5065985269803096, "flos": 22966023323520.0, "grad_norm": 1.7834107132976578, "language_loss": 0.7868796, "learning_rate": 2.0543232228254524e-06, "loss": 0.80814242, "num_input_tokens_seen": 181140240, "step": 8426, "time_per_iteration": 2.702861785888672 }, { "auxiliary_loss_clip": 0.01116039, "auxiliary_loss_mlp": 0.01039376, "balance_loss_clip": 1.0481956, "balance_loss_mlp": 1.0255599, "epoch": 0.5066586502329776, "flos": 21608563512960.0, "grad_norm": 2.9338643206598713, "language_loss": 0.7762264, "learning_rate": 2.053933903806265e-06, "loss": 0.79778051, "num_input_tokens_seen": 181158630, "step": 8427, "time_per_iteration": 2.5964066982269287 }, { "auxiliary_loss_clip": 0.0112123, "auxiliary_loss_mlp": 0.01028788, "balance_loss_clip": 1.04505837, "balance_loss_mlp": 1.014763, "epoch": 0.5067187734856455, "flos": 20339912079360.0, "grad_norm": 2.519773325925209, "language_loss": 0.71591479, "learning_rate": 2.0535445827418997e-06, "loss": 0.73741496, "num_input_tokens_seen": 181176405, "step": 8428, "time_per_iteration": 2.5878183841705322 }, { "auxiliary_loss_clip": 0.01105053, "auxiliary_loss_mlp": 0.00769921, "balance_loss_clip": 1.041857, "balance_loss_mlp": 1.00016701, "epoch": 0.5067788967383136, "flos": 28841080665600.0, "grad_norm": 1.637474951892814, "language_loss": 0.83266222, "learning_rate": 2.0531552596471168e-06, "loss": 0.85141206, "num_input_tokens_seen": 181197595, "step": 8429, "time_per_iteration": 2.6528842449188232 }, { "auxiliary_loss_clip": 0.01094205, "auxiliary_loss_mlp": 0.0103555, "balance_loss_clip": 1.04527116, "balance_loss_mlp": 1.02068472, "epoch": 0.5068390199909815, "flos": 32450174478720.0, "grad_norm": 1.986559953193462, "language_loss": 0.73507559, "learning_rate": 2.052765934536682e-06, "loss": 0.75637317, "num_input_tokens_seen": 181218560, "step": 8430, "time_per_iteration": 2.8031511306762695 }, { "auxiliary_loss_clip": 0.01057925, "auxiliary_loss_mlp": 0.01041942, "balance_loss_clip": 1.03520572, "balance_loss_mlp": 1.02702332, "epoch": 0.5068991432436495, "flos": 23146582014720.0, "grad_norm": 2.0458094547910766, "language_loss": 0.77132332, "learning_rate": 2.0523766074253575e-06, "loss": 0.79232198, "num_input_tokens_seen": 181237095, "step": 8431, "time_per_iteration": 2.7593939304351807 }, { "auxiliary_loss_clip": 0.01108688, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.04256523, "balance_loss_mlp": 1.02171338, "epoch": 0.5069592664963174, "flos": 19936096404480.0, "grad_norm": 1.5904348009832192, "language_loss": 0.72110546, "learning_rate": 2.0519872783279074e-06, "loss": 0.74255085, "num_input_tokens_seen": 181255940, "step": 8432, "time_per_iteration": 2.6104278564453125 }, { "auxiliary_loss_clip": 0.0100252, "auxiliary_loss_mlp": 0.01010781, "balance_loss_clip": 1.01845694, "balance_loss_mlp": 1.00870693, "epoch": 0.5070193897489854, "flos": 65793771941760.0, "grad_norm": 0.7570764213883562, "language_loss": 0.63648349, "learning_rate": 2.0515979472590945e-06, "loss": 0.65661651, "num_input_tokens_seen": 181316945, "step": 8433, "time_per_iteration": 3.395040273666382 }, { "auxiliary_loss_clip": 0.01089015, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04288781, "balance_loss_mlp": 1.02685428, "epoch": 0.5070795130016534, "flos": 17275331514240.0, "grad_norm": 2.2603713431070194, "language_loss": 0.78218484, "learning_rate": 2.051208614233681e-06, "loss": 0.80348414, "num_input_tokens_seen": 181335555, "step": 8434, "time_per_iteration": 2.705864667892456 }, { "auxiliary_loss_clip": 0.01099616, "auxiliary_loss_mlp": 0.01035206, "balance_loss_clip": 1.04088449, "balance_loss_mlp": 1.02169967, "epoch": 0.5071396362543213, "flos": 21069940095360.0, "grad_norm": 1.6177485307205706, "language_loss": 0.70698971, "learning_rate": 2.0508192792664326e-06, "loss": 0.72833788, "num_input_tokens_seen": 181354580, "step": 8435, "time_per_iteration": 2.699631929397583 }, { "auxiliary_loss_clip": 0.01115814, "auxiliary_loss_mlp": 0.01036717, "balance_loss_clip": 1.04539943, "balance_loss_mlp": 1.02220905, "epoch": 0.5071997595069894, "flos": 23144822248320.0, "grad_norm": 1.8141877812584497, "language_loss": 0.72254074, "learning_rate": 2.050429942372112e-06, "loss": 0.74406612, "num_input_tokens_seen": 181374320, "step": 8436, "time_per_iteration": 2.6646859645843506 }, { "auxiliary_loss_clip": 0.01124514, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.04597569, "balance_loss_mlp": 1.01978946, "epoch": 0.5072598827596573, "flos": 22747183712640.0, "grad_norm": 1.5423854267163515, "language_loss": 0.83801168, "learning_rate": 2.050040603565483e-06, "loss": 0.85959864, "num_input_tokens_seen": 181392190, "step": 8437, "time_per_iteration": 2.6614348888397217 }, { "auxiliary_loss_clip": 0.01110359, "auxiliary_loss_mlp": 0.01028112, "balance_loss_clip": 1.04387856, "balance_loss_mlp": 1.01448607, "epoch": 0.5073200060123253, "flos": 22566301799040.0, "grad_norm": 2.7232019997829924, "language_loss": 0.80638587, "learning_rate": 2.049651262861309e-06, "loss": 0.82777059, "num_input_tokens_seen": 181413890, "step": 8438, "time_per_iteration": 2.6778056621551514 }, { "auxiliary_loss_clip": 0.01081177, "auxiliary_loss_mlp": 0.01037532, "balance_loss_clip": 1.04218078, "balance_loss_mlp": 1.02235103, "epoch": 0.5073801292649932, "flos": 25806341324160.0, "grad_norm": 1.4751942737164592, "language_loss": 0.7943362, "learning_rate": 2.0492619202743543e-06, "loss": 0.81552327, "num_input_tokens_seen": 181433240, "step": 8439, "time_per_iteration": 2.694603443145752 }, { "auxiliary_loss_clip": 0.01088705, "auxiliary_loss_mlp": 0.0077357, "balance_loss_clip": 1.04178834, "balance_loss_mlp": 1.00020123, "epoch": 0.5074402525176612, "flos": 25373941401600.0, "grad_norm": 1.5360675692672114, "language_loss": 0.71413541, "learning_rate": 2.048872575819383e-06, "loss": 0.7327581, "num_input_tokens_seen": 181453535, "step": 8440, "time_per_iteration": 2.68709397315979 }, { "auxiliary_loss_clip": 0.01096271, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.04103327, "balance_loss_mlp": 1.0227561, "epoch": 0.5075003757703291, "flos": 26064431521920.0, "grad_norm": 1.6763306182018036, "language_loss": 0.7087847, "learning_rate": 2.048483229511158e-06, "loss": 0.73011339, "num_input_tokens_seen": 181474195, "step": 8441, "time_per_iteration": 2.728649377822876 }, { "auxiliary_loss_clip": 0.01113949, "auxiliary_loss_mlp": 0.00771406, "balance_loss_clip": 1.04312265, "balance_loss_mlp": 1.00021851, "epoch": 0.5075604990229972, "flos": 21835447770240.0, "grad_norm": 1.794299641086803, "language_loss": 0.63846874, "learning_rate": 2.0480938813644445e-06, "loss": 0.65732235, "num_input_tokens_seen": 181494000, "step": 8442, "time_per_iteration": 4.1495561599731445 }, { "auxiliary_loss_clip": 0.01065064, "auxiliary_loss_mlp": 0.01028245, "balance_loss_clip": 1.03900802, "balance_loss_mlp": 1.01582956, "epoch": 0.5076206222756651, "flos": 31978703537280.0, "grad_norm": 1.7729718848020288, "language_loss": 0.7149542, "learning_rate": 2.047704531394006e-06, "loss": 0.73588729, "num_input_tokens_seen": 181515955, "step": 8443, "time_per_iteration": 2.84781551361084 }, { "auxiliary_loss_clip": 0.01033895, "auxiliary_loss_mlp": 0.01036606, "balance_loss_clip": 1.03034997, "balance_loss_mlp": 1.02093554, "epoch": 0.5076807455283331, "flos": 36904031326080.0, "grad_norm": 1.237062481884337, "language_loss": 0.62134659, "learning_rate": 2.047315179614607e-06, "loss": 0.64205158, "num_input_tokens_seen": 181540225, "step": 8444, "time_per_iteration": 3.2103631496429443 }, { "auxiliary_loss_clip": 0.01086312, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.04043984, "balance_loss_mlp": 1.0172112, "epoch": 0.507740868781001, "flos": 29862415981440.0, "grad_norm": 1.7245082556223335, "language_loss": 0.64173615, "learning_rate": 2.046925826041012e-06, "loss": 0.66290236, "num_input_tokens_seen": 181560125, "step": 8445, "time_per_iteration": 4.46838903427124 }, { "auxiliary_loss_clip": 0.01013224, "auxiliary_loss_mlp": 0.01008254, "balance_loss_clip": 1.02398801, "balance_loss_mlp": 1.00686538, "epoch": 0.507800992033669, "flos": 61918974247680.0, "grad_norm": 0.8265855466772786, "language_loss": 0.61854541, "learning_rate": 2.0465364706879845e-06, "loss": 0.63876021, "num_input_tokens_seen": 181618830, "step": 8446, "time_per_iteration": 3.267681121826172 }, { "auxiliary_loss_clip": 0.01080886, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.0391748, "balance_loss_mlp": 1.0157063, "epoch": 0.507861115286337, "flos": 20700490757760.0, "grad_norm": 1.574417237275669, "language_loss": 0.8065623, "learning_rate": 2.04614711357029e-06, "loss": 0.82765681, "num_input_tokens_seen": 181637120, "step": 8447, "time_per_iteration": 2.761584758758545 }, { "auxiliary_loss_clip": 0.01111406, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 1.04490948, "balance_loss_mlp": 1.01859963, "epoch": 0.507921238539005, "flos": 30847050576000.0, "grad_norm": 1.8510365938740598, "language_loss": 0.70990604, "learning_rate": 2.0457577547026916e-06, "loss": 0.73133665, "num_input_tokens_seen": 181659965, "step": 8448, "time_per_iteration": 4.335421085357666 }, { "auxiliary_loss_clip": 0.01121931, "auxiliary_loss_mlp": 0.00769587, "balance_loss_clip": 1.04565167, "balance_loss_mlp": 1.00020599, "epoch": 0.507981361791673, "flos": 35700197984640.0, "grad_norm": 3.0099403095172557, "language_loss": 0.71958399, "learning_rate": 2.045368394099955e-06, "loss": 0.73849922, "num_input_tokens_seen": 181685290, "step": 8449, "time_per_iteration": 2.7780673503875732 }, { "auxiliary_loss_clip": 0.01094628, "auxiliary_loss_mlp": 0.0103001, "balance_loss_clip": 1.04017317, "balance_loss_mlp": 1.01767778, "epoch": 0.5080414850443409, "flos": 27161466750720.0, "grad_norm": 1.5810099588314865, "language_loss": 0.73045403, "learning_rate": 2.044979031776844e-06, "loss": 0.7517004, "num_input_tokens_seen": 181706080, "step": 8450, "time_per_iteration": 2.744396448135376 }, { "auxiliary_loss_clip": 0.01123333, "auxiliary_loss_mlp": 0.01027947, "balance_loss_clip": 1.04468369, "balance_loss_mlp": 1.01485837, "epoch": 0.5081016082970089, "flos": 27085192220160.0, "grad_norm": 1.7103931675901212, "language_loss": 0.77190459, "learning_rate": 2.0445896677481234e-06, "loss": 0.79341733, "num_input_tokens_seen": 181724805, "step": 8451, "time_per_iteration": 2.683182716369629 }, { "auxiliary_loss_clip": 0.01122238, "auxiliary_loss_mlp": 0.01037138, "balance_loss_clip": 1.04372776, "balance_loss_mlp": 1.02413273, "epoch": 0.5081617315496768, "flos": 22856531690880.0, "grad_norm": 1.9627256153454082, "language_loss": 0.85055304, "learning_rate": 2.044200302028559e-06, "loss": 0.87214684, "num_input_tokens_seen": 181743725, "step": 8452, "time_per_iteration": 2.684624671936035 }, { "auxiliary_loss_clip": 0.01126785, "auxiliary_loss_mlp": 0.01034895, "balance_loss_clip": 1.04584098, "balance_loss_mlp": 1.02078056, "epoch": 0.5082218548023448, "flos": 16281898087680.0, "grad_norm": 4.065129026902181, "language_loss": 0.77099299, "learning_rate": 2.0438109346329143e-06, "loss": 0.79260981, "num_input_tokens_seen": 181757720, "step": 8453, "time_per_iteration": 2.572178602218628 }, { "auxiliary_loss_clip": 0.01084848, "auxiliary_loss_mlp": 0.01032198, "balance_loss_clip": 1.04113591, "balance_loss_mlp": 1.02010989, "epoch": 0.5082819780550127, "flos": 24460768915200.0, "grad_norm": 1.6244227223176155, "language_loss": 0.76530403, "learning_rate": 2.0434215655759544e-06, "loss": 0.78647447, "num_input_tokens_seen": 181778545, "step": 8454, "time_per_iteration": 2.8153836727142334 }, { "auxiliary_loss_clip": 0.01097667, "auxiliary_loss_mlp": 0.01036941, "balance_loss_clip": 1.03992426, "balance_loss_mlp": 1.02275562, "epoch": 0.5083421013076808, "flos": 23403271582080.0, "grad_norm": 1.5351507829324025, "language_loss": 0.89199609, "learning_rate": 2.0430321948724446e-06, "loss": 0.91334224, "num_input_tokens_seen": 181799495, "step": 8455, "time_per_iteration": 2.7793357372283936 }, { "auxiliary_loss_clip": 0.01106838, "auxiliary_loss_mlp": 0.00772606, "balance_loss_clip": 1.04346323, "balance_loss_mlp": 1.00026703, "epoch": 0.5084022245603487, "flos": 23872695448320.0, "grad_norm": 1.6166334009695327, "language_loss": 0.62119138, "learning_rate": 2.042642822537149e-06, "loss": 0.63998592, "num_input_tokens_seen": 181818400, "step": 8456, "time_per_iteration": 2.7200372219085693 }, { "auxiliary_loss_clip": 0.01034029, "auxiliary_loss_mlp": 0.01006279, "balance_loss_clip": 1.01840019, "balance_loss_mlp": 1.00490177, "epoch": 0.5084623478130167, "flos": 62873336655360.0, "grad_norm": 0.8116383799523507, "language_loss": 0.6243, "learning_rate": 2.0422534485848343e-06, "loss": 0.64470303, "num_input_tokens_seen": 181875975, "step": 8457, "time_per_iteration": 3.087890625 }, { "auxiliary_loss_clip": 0.01113045, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.0439477, "balance_loss_mlp": 1.01853776, "epoch": 0.5085224710656846, "flos": 22346133384960.0, "grad_norm": 1.6206653077395385, "language_loss": 0.67609936, "learning_rate": 2.0418640730302644e-06, "loss": 0.6975553, "num_input_tokens_seen": 181896450, "step": 8458, "time_per_iteration": 2.6950957775115967 }, { "auxiliary_loss_clip": 0.011096, "auxiliary_loss_mlp": 0.01034441, "balance_loss_clip": 1.04140186, "balance_loss_mlp": 1.01998079, "epoch": 0.5085825943183526, "flos": 26066263115520.0, "grad_norm": 1.6983738136244226, "language_loss": 0.77766174, "learning_rate": 2.0414746958882043e-06, "loss": 0.79910213, "num_input_tokens_seen": 181916770, "step": 8459, "time_per_iteration": 2.699784278869629 }, { "auxiliary_loss_clip": 0.01127851, "auxiliary_loss_mlp": 0.01035156, "balance_loss_clip": 1.04686987, "balance_loss_mlp": 1.02099431, "epoch": 0.5086427175710206, "flos": 17420733768960.0, "grad_norm": 10.198892862393663, "language_loss": 0.8050856, "learning_rate": 2.0410853171734196e-06, "loss": 0.82671559, "num_input_tokens_seen": 181932710, "step": 8460, "time_per_iteration": 2.632998466491699 }, { "auxiliary_loss_clip": 0.01101605, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.04293346, "balance_loss_mlp": 1.0255115, "epoch": 0.5087028408236886, "flos": 20631758083200.0, "grad_norm": 1.5613520556763807, "language_loss": 0.68347144, "learning_rate": 2.0406959369006754e-06, "loss": 0.70487332, "num_input_tokens_seen": 181950665, "step": 8461, "time_per_iteration": 2.7463462352752686 }, { "auxiliary_loss_clip": 0.01118492, "auxiliary_loss_mlp": 0.01030227, "balance_loss_clip": 1.04215729, "balance_loss_mlp": 1.01677442, "epoch": 0.5087629640763566, "flos": 25593822506880.0, "grad_norm": 1.9214201788253797, "language_loss": 0.76016432, "learning_rate": 2.0403065550847375e-06, "loss": 0.7816515, "num_input_tokens_seen": 181971270, "step": 8462, "time_per_iteration": 2.780043363571167 }, { "auxiliary_loss_clip": 0.01081215, "auxiliary_loss_mlp": 0.01037857, "balance_loss_clip": 1.0401057, "balance_loss_mlp": 1.02322388, "epoch": 0.5088230873290245, "flos": 13261631927040.0, "grad_norm": 2.117801536001897, "language_loss": 0.81441897, "learning_rate": 2.0399171717403706e-06, "loss": 0.83560967, "num_input_tokens_seen": 181988410, "step": 8463, "time_per_iteration": 2.7101564407348633 }, { "auxiliary_loss_clip": 0.0110518, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.04148602, "balance_loss_mlp": 1.02201426, "epoch": 0.5088832105816925, "flos": 20043469134720.0, "grad_norm": 2.6576302734312733, "language_loss": 0.76305163, "learning_rate": 2.039527786882341e-06, "loss": 0.78445399, "num_input_tokens_seen": 182006530, "step": 8464, "time_per_iteration": 2.6081295013427734 }, { "auxiliary_loss_clip": 0.01034964, "auxiliary_loss_mlp": 0.0100043, "balance_loss_clip": 1.01882601, "balance_loss_mlp": 0.99929708, "epoch": 0.5089433338343604, "flos": 67422179018880.0, "grad_norm": 0.6843560168430419, "language_loss": 0.59347767, "learning_rate": 2.0391384005254133e-06, "loss": 0.61383158, "num_input_tokens_seen": 182074240, "step": 8465, "time_per_iteration": 3.308885097503662 }, { "auxiliary_loss_clip": 0.0111949, "auxiliary_loss_mlp": 0.01033343, "balance_loss_clip": 1.04262543, "balance_loss_mlp": 1.0203197, "epoch": 0.5090034570870284, "flos": 22710339336960.0, "grad_norm": 2.5778248190048787, "language_loss": 0.80206662, "learning_rate": 2.038749012684354e-06, "loss": 0.82359493, "num_input_tokens_seen": 182093360, "step": 8466, "time_per_iteration": 2.6912481784820557 }, { "auxiliary_loss_clip": 0.01107512, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.03987598, "balance_loss_mlp": 1.01634204, "epoch": 0.5090635803396963, "flos": 20445812352000.0, "grad_norm": 1.5056043379234754, "language_loss": 0.78307688, "learning_rate": 2.0383596233739286e-06, "loss": 0.80444586, "num_input_tokens_seen": 182110170, "step": 8467, "time_per_iteration": 2.61828875541687 }, { "auxiliary_loss_clip": 0.01119026, "auxiliary_loss_mlp": 0.01034745, "balance_loss_clip": 1.04424381, "balance_loss_mlp": 1.02226961, "epoch": 0.5091237035923644, "flos": 23768878164480.0, "grad_norm": 1.9340722959801353, "language_loss": 0.74676347, "learning_rate": 2.0379702326089013e-06, "loss": 0.76830113, "num_input_tokens_seen": 182129570, "step": 8468, "time_per_iteration": 2.6233344078063965 }, { "auxiliary_loss_clip": 0.01119943, "auxiliary_loss_mlp": 0.01029058, "balance_loss_clip": 1.04366863, "balance_loss_mlp": 1.01651728, "epoch": 0.5091838268450323, "flos": 18327908684160.0, "grad_norm": 1.884666390581893, "language_loss": 0.77613342, "learning_rate": 2.03758084040404e-06, "loss": 0.7976234, "num_input_tokens_seen": 182147565, "step": 8469, "time_per_iteration": 2.579117774963379 }, { "auxiliary_loss_clip": 0.01107532, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.04521155, "balance_loss_mlp": 1.02425504, "epoch": 0.5092439500977003, "flos": 29057621806080.0, "grad_norm": 1.5718905230515574, "language_loss": 0.69481277, "learning_rate": 2.037191446774109e-06, "loss": 0.71627223, "num_input_tokens_seen": 182169695, "step": 8470, "time_per_iteration": 2.6437594890594482 }, { "auxiliary_loss_clip": 0.01096004, "auxiliary_loss_mlp": 0.01045395, "balance_loss_clip": 1.04067326, "balance_loss_mlp": 1.02993393, "epoch": 0.5093040733503682, "flos": 13553908894080.0, "grad_norm": 2.534594931806725, "language_loss": 0.73583853, "learning_rate": 2.0368020517338745e-06, "loss": 0.75725245, "num_input_tokens_seen": 182186385, "step": 8471, "time_per_iteration": 2.6213905811309814 }, { "auxiliary_loss_clip": 0.01043282, "auxiliary_loss_mlp": 0.00999685, "balance_loss_clip": 1.01733398, "balance_loss_mlp": 0.99825424, "epoch": 0.5093641966030362, "flos": 68906617407360.0, "grad_norm": 0.7545989611287492, "language_loss": 0.58065605, "learning_rate": 2.036412655298103e-06, "loss": 0.60108572, "num_input_tokens_seen": 182247095, "step": 8472, "time_per_iteration": 3.1640241146087646 }, { "auxiliary_loss_clip": 0.01069354, "auxiliary_loss_mlp": 0.01036283, "balance_loss_clip": 1.03772914, "balance_loss_mlp": 1.0235815, "epoch": 0.5094243198557042, "flos": 21580948932480.0, "grad_norm": 2.4665832849090994, "language_loss": 0.68956393, "learning_rate": 2.03602325748156e-06, "loss": 0.71062028, "num_input_tokens_seen": 182266380, "step": 8473, "time_per_iteration": 2.806593179702759 }, { "auxiliary_loss_clip": 0.01097364, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.04190159, "balance_loss_mlp": 1.02250814, "epoch": 0.5094844431083722, "flos": 28840721529600.0, "grad_norm": 1.8851162187904098, "language_loss": 0.85464561, "learning_rate": 2.0356338582990105e-06, "loss": 0.87597537, "num_input_tokens_seen": 182284685, "step": 8474, "time_per_iteration": 2.7467737197875977 }, { "auxiliary_loss_clip": 0.01097916, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.04213905, "balance_loss_mlp": 1.02201009, "epoch": 0.5095445663610402, "flos": 14976114969600.0, "grad_norm": 2.1580860587409867, "language_loss": 0.65563238, "learning_rate": 2.035244457765222e-06, "loss": 0.6769644, "num_input_tokens_seen": 182301810, "step": 8475, "time_per_iteration": 2.653343439102173 }, { "auxiliary_loss_clip": 0.01101978, "auxiliary_loss_mlp": 0.01044707, "balance_loss_clip": 1.04155195, "balance_loss_mlp": 1.03043771, "epoch": 0.5096046896137081, "flos": 20777088510720.0, "grad_norm": 2.3692417745384886, "language_loss": 0.82122153, "learning_rate": 2.0348550558949605e-06, "loss": 0.84268838, "num_input_tokens_seen": 182320285, "step": 8476, "time_per_iteration": 2.735163927078247 }, { "auxiliary_loss_clip": 0.01069648, "auxiliary_loss_mlp": 0.01043833, "balance_loss_clip": 1.03814852, "balance_loss_mlp": 1.02698851, "epoch": 0.5096648128663761, "flos": 23185078416000.0, "grad_norm": 5.724576330634238, "language_loss": 0.80651575, "learning_rate": 2.0344656527029917e-06, "loss": 0.82765061, "num_input_tokens_seen": 182339465, "step": 8477, "time_per_iteration": 2.8972108364105225 }, { "auxiliary_loss_clip": 0.01096525, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.04044962, "balance_loss_mlp": 1.01321959, "epoch": 0.509724936119044, "flos": 22309432663680.0, "grad_norm": 1.8365176357872317, "language_loss": 0.6178633, "learning_rate": 2.034076248204082e-06, "loss": 0.63911271, "num_input_tokens_seen": 182358375, "step": 8478, "time_per_iteration": 2.77237606048584 }, { "auxiliary_loss_clip": 0.01105596, "auxiliary_loss_mlp": 0.01039662, "balance_loss_clip": 1.04185414, "balance_loss_mlp": 1.02667403, "epoch": 0.509785059371712, "flos": 26287077974400.0, "grad_norm": 1.8436515105252975, "language_loss": 0.66209054, "learning_rate": 2.0336868424129968e-06, "loss": 0.68354309, "num_input_tokens_seen": 182377935, "step": 8479, "time_per_iteration": 2.667865514755249 }, { "auxiliary_loss_clip": 0.01108822, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.0434258, "balance_loss_mlp": 1.02382231, "epoch": 0.50984518262438, "flos": 22964586779520.0, "grad_norm": 1.5755275700627138, "language_loss": 0.69447386, "learning_rate": 2.0332974353445037e-06, "loss": 0.71592748, "num_input_tokens_seen": 182396440, "step": 8480, "time_per_iteration": 2.630505323410034 }, { "auxiliary_loss_clip": 0.01124122, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.04386926, "balance_loss_mlp": 1.02133346, "epoch": 0.509905305877048, "flos": 26213389223040.0, "grad_norm": 1.7899171043779052, "language_loss": 0.79267204, "learning_rate": 2.0329080270133688e-06, "loss": 0.81426102, "num_input_tokens_seen": 182415890, "step": 8481, "time_per_iteration": 2.6193926334381104 }, { "auxiliary_loss_clip": 0.01104496, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.04157507, "balance_loss_mlp": 1.02124786, "epoch": 0.5099654291297159, "flos": 20340055733760.0, "grad_norm": 1.468990392476105, "language_loss": 0.83301556, "learning_rate": 2.0325186174343578e-06, "loss": 0.85440642, "num_input_tokens_seen": 182434235, "step": 8482, "time_per_iteration": 4.149403095245361 }, { "auxiliary_loss_clip": 0.01113898, "auxiliary_loss_mlp": 0.00771464, "balance_loss_clip": 1.04287457, "balance_loss_mlp": 1.00025356, "epoch": 0.5100255523823839, "flos": 29054820545280.0, "grad_norm": 1.9010351115161617, "language_loss": 0.85379988, "learning_rate": 2.032129206622238e-06, "loss": 0.87265354, "num_input_tokens_seen": 182454360, "step": 8483, "time_per_iteration": 2.7000234127044678 }, { "auxiliary_loss_clip": 0.01109801, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.04242575, "balance_loss_mlp": 1.0214082, "epoch": 0.5100856756350518, "flos": 22455912326400.0, "grad_norm": 2.079288328100567, "language_loss": 0.82931423, "learning_rate": 2.031739794591775e-06, "loss": 0.85075212, "num_input_tokens_seen": 182471940, "step": 8484, "time_per_iteration": 4.3401288986206055 }, { "auxiliary_loss_clip": 0.01095037, "auxiliary_loss_mlp": 0.01033642, "balance_loss_clip": 1.0400697, "balance_loss_mlp": 1.0194087, "epoch": 0.5101457988877198, "flos": 19171055606400.0, "grad_norm": 2.530206097433835, "language_loss": 0.81594586, "learning_rate": 2.031350381357736e-06, "loss": 0.83723271, "num_input_tokens_seen": 182490685, "step": 8485, "time_per_iteration": 2.6573400497436523 }, { "auxiliary_loss_clip": 0.01092909, "auxiliary_loss_mlp": 0.01038281, "balance_loss_clip": 1.03726983, "balance_loss_mlp": 1.02494788, "epoch": 0.5102059221403878, "flos": 14866371941760.0, "grad_norm": 1.9374375358888782, "language_loss": 0.74155819, "learning_rate": 2.0309609669348874e-06, "loss": 0.76287007, "num_input_tokens_seen": 182508325, "step": 8486, "time_per_iteration": 2.676863670349121 }, { "auxiliary_loss_clip": 0.01078995, "auxiliary_loss_mlp": 0.01037671, "balance_loss_clip": 1.03769588, "balance_loss_mlp": 1.0228231, "epoch": 0.5102660453930558, "flos": 22961103160320.0, "grad_norm": 1.4946123985675848, "language_loss": 0.70439661, "learning_rate": 2.0305715513379953e-06, "loss": 0.72556329, "num_input_tokens_seen": 182527020, "step": 8487, "time_per_iteration": 2.740612030029297 }, { "auxiliary_loss_clip": 0.01099488, "auxiliary_loss_mlp": 0.01039832, "balance_loss_clip": 1.04223216, "balance_loss_mlp": 1.02521729, "epoch": 0.5103261686457238, "flos": 23149311448320.0, "grad_norm": 2.286550245787084, "language_loss": 0.73022705, "learning_rate": 2.030182134581827e-06, "loss": 0.75162029, "num_input_tokens_seen": 182543505, "step": 8488, "time_per_iteration": 4.345505714416504 }, { "auxiliary_loss_clip": 0.01081446, "auxiliary_loss_mlp": 0.00771801, "balance_loss_clip": 1.04138601, "balance_loss_mlp": 1.00030088, "epoch": 0.5103862918983917, "flos": 14319237000960.0, "grad_norm": 1.7726796746163496, "language_loss": 0.69465196, "learning_rate": 2.0297927166811503e-06, "loss": 0.71318448, "num_input_tokens_seen": 182562250, "step": 8489, "time_per_iteration": 2.7057676315307617 }, { "auxiliary_loss_clip": 0.01096056, "auxiliary_loss_mlp": 0.01035357, "balance_loss_clip": 1.04011536, "balance_loss_mlp": 1.02176082, "epoch": 0.5104464151510597, "flos": 25848536826240.0, "grad_norm": 2.097372581248088, "language_loss": 0.73219633, "learning_rate": 2.0294032976507297e-06, "loss": 0.75351048, "num_input_tokens_seen": 182581910, "step": 8490, "time_per_iteration": 2.7062344551086426 }, { "auxiliary_loss_clip": 0.01093699, "auxiliary_loss_mlp": 0.01030609, "balance_loss_clip": 1.04015577, "balance_loss_mlp": 1.01796126, "epoch": 0.5105065384037276, "flos": 21652913831040.0, "grad_norm": 1.454492701867694, "language_loss": 0.80228478, "learning_rate": 2.0290138775053337e-06, "loss": 0.82352787, "num_input_tokens_seen": 182601350, "step": 8491, "time_per_iteration": 2.670520782470703 }, { "auxiliary_loss_clip": 0.01108835, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.04258561, "balance_loss_mlp": 1.01813614, "epoch": 0.5105666616563956, "flos": 22491571553280.0, "grad_norm": 1.8545470770344947, "language_loss": 0.78970987, "learning_rate": 2.028624456259728e-06, "loss": 0.81110907, "num_input_tokens_seen": 182619660, "step": 8492, "time_per_iteration": 2.681852102279663 }, { "auxiliary_loss_clip": 0.01088193, "auxiliary_loss_mlp": 0.01045644, "balance_loss_clip": 1.04025435, "balance_loss_mlp": 1.03187561, "epoch": 0.5106267849090635, "flos": 22455768672000.0, "grad_norm": 1.9312934890574833, "language_loss": 0.77364743, "learning_rate": 2.0282350339286804e-06, "loss": 0.79498577, "num_input_tokens_seen": 182639815, "step": 8493, "time_per_iteration": 2.71234393119812 }, { "auxiliary_loss_clip": 0.01079322, "auxiliary_loss_mlp": 0.01035175, "balance_loss_clip": 1.04074192, "balance_loss_mlp": 1.02040458, "epoch": 0.5106869081617316, "flos": 23547093638400.0, "grad_norm": 1.7772442138937719, "language_loss": 0.84122825, "learning_rate": 2.0278456105269574e-06, "loss": 0.86237323, "num_input_tokens_seen": 182659655, "step": 8494, "time_per_iteration": 2.737844944000244 }, { "auxiliary_loss_clip": 0.0112627, "auxiliary_loss_mlp": 0.01037758, "balance_loss_clip": 1.04641843, "balance_loss_mlp": 1.02502632, "epoch": 0.5107470314143995, "flos": 26792987080320.0, "grad_norm": 1.9716326999087717, "language_loss": 0.78846836, "learning_rate": 2.027456186069326e-06, "loss": 0.81010866, "num_input_tokens_seen": 182677075, "step": 8495, "time_per_iteration": 2.5992324352264404 }, { "auxiliary_loss_clip": 0.01088486, "auxiliary_loss_mlp": 0.0103671, "balance_loss_clip": 1.04210663, "balance_loss_mlp": 1.02254176, "epoch": 0.5108071546670675, "flos": 25739691638400.0, "grad_norm": 1.7860993635097173, "language_loss": 0.78245926, "learning_rate": 2.0270667605705535e-06, "loss": 0.80371118, "num_input_tokens_seen": 182699625, "step": 8496, "time_per_iteration": 2.764511823654175 }, { "auxiliary_loss_clip": 0.01107232, "auxiliary_loss_mlp": 0.01031296, "balance_loss_clip": 1.04186177, "balance_loss_mlp": 1.01885021, "epoch": 0.5108672779197354, "flos": 18697537589760.0, "grad_norm": 2.583960220786706, "language_loss": 0.78615016, "learning_rate": 2.0266773340454066e-06, "loss": 0.80753547, "num_input_tokens_seen": 182717020, "step": 8497, "time_per_iteration": 2.614715337753296 }, { "auxiliary_loss_clip": 0.01119749, "auxiliary_loss_mlp": 0.01032774, "balance_loss_clip": 1.04238069, "balance_loss_mlp": 1.01958323, "epoch": 0.5109274011724034, "flos": 26688164215680.0, "grad_norm": 1.8043712312754003, "language_loss": 0.81731009, "learning_rate": 2.0262879065086525e-06, "loss": 0.83883524, "num_input_tokens_seen": 182736955, "step": 8498, "time_per_iteration": 2.670713186264038 }, { "auxiliary_loss_clip": 0.01086895, "auxiliary_loss_mlp": 0.00771568, "balance_loss_clip": 1.03893542, "balance_loss_mlp": 1.00021791, "epoch": 0.5109875244250714, "flos": 22784028088320.0, "grad_norm": 1.9502410959783398, "language_loss": 0.70963287, "learning_rate": 2.0258984779750584e-06, "loss": 0.72821754, "num_input_tokens_seen": 182757620, "step": 8499, "time_per_iteration": 2.6890597343444824 }, { "auxiliary_loss_clip": 0.01063023, "auxiliary_loss_mlp": 0.01039504, "balance_loss_clip": 1.03797197, "balance_loss_mlp": 1.0247463, "epoch": 0.5110476476777394, "flos": 35588515622400.0, "grad_norm": 1.532594294583486, "language_loss": 0.72400367, "learning_rate": 2.0255090484593914e-06, "loss": 0.74502897, "num_input_tokens_seen": 182780195, "step": 8500, "time_per_iteration": 2.8889389038085938 }, { "auxiliary_loss_clip": 0.01113898, "auxiliary_loss_mlp": 0.01039834, "balance_loss_clip": 1.04150367, "balance_loss_mlp": 1.0244801, "epoch": 0.5111077709304074, "flos": 19280798634240.0, "grad_norm": 2.6334939898019867, "language_loss": 0.62424856, "learning_rate": 2.0251196179764183e-06, "loss": 0.64578593, "num_input_tokens_seen": 182795765, "step": 8501, "time_per_iteration": 2.564922571182251 }, { "auxiliary_loss_clip": 0.01120814, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.04017985, "balance_loss_mlp": 1.0265801, "epoch": 0.5111678941830753, "flos": 20668207409280.0, "grad_norm": 2.184561184824311, "language_loss": 0.87622821, "learning_rate": 2.024730186540907e-06, "loss": 0.89784235, "num_input_tokens_seen": 182813120, "step": 8502, "time_per_iteration": 2.6287243366241455 }, { "auxiliary_loss_clip": 0.01106628, "auxiliary_loss_mlp": 0.01038615, "balance_loss_clip": 1.04065216, "balance_loss_mlp": 1.02592492, "epoch": 0.5112280174357433, "flos": 26287903987200.0, "grad_norm": 1.480449524900748, "language_loss": 0.82794261, "learning_rate": 2.0243407541676253e-06, "loss": 0.84939504, "num_input_tokens_seen": 182835745, "step": 8503, "time_per_iteration": 2.682711124420166 }, { "auxiliary_loss_clip": 0.01025632, "auxiliary_loss_mlp": 0.01004613, "balance_loss_clip": 1.0205853, "balance_loss_mlp": 1.00336099, "epoch": 0.5112881406884112, "flos": 59474247707520.0, "grad_norm": 0.8583626669635097, "language_loss": 0.63898063, "learning_rate": 2.023951320871339e-06, "loss": 0.65928316, "num_input_tokens_seen": 182892540, "step": 8504, "time_per_iteration": 3.216397523880005 }, { "auxiliary_loss_clip": 0.01091882, "auxiliary_loss_mlp": 0.00771622, "balance_loss_clip": 1.04488444, "balance_loss_mlp": 1.00014472, "epoch": 0.5113482639410792, "flos": 26468857728000.0, "grad_norm": 1.826391287063558, "language_loss": 0.84206301, "learning_rate": 2.023561886666816e-06, "loss": 0.86069804, "num_input_tokens_seen": 182911515, "step": 8505, "time_per_iteration": 2.8032052516937256 }, { "auxiliary_loss_clip": 0.0110904, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.04468179, "balance_loss_mlp": 1.01698971, "epoch": 0.5114083871937471, "flos": 29895848565120.0, "grad_norm": 1.983310033112748, "language_loss": 0.75608075, "learning_rate": 2.0231724515688246e-06, "loss": 0.77747381, "num_input_tokens_seen": 182930860, "step": 8506, "time_per_iteration": 2.699448347091675 }, { "auxiliary_loss_clip": 0.01122646, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.04428148, "balance_loss_mlp": 1.01986337, "epoch": 0.5114685104464152, "flos": 24314576561280.0, "grad_norm": 1.918965700593569, "language_loss": 0.58023655, "learning_rate": 2.022783015592131e-06, "loss": 0.60180998, "num_input_tokens_seen": 182949960, "step": 8507, "time_per_iteration": 2.5828280448913574 }, { "auxiliary_loss_clip": 0.01114406, "auxiliary_loss_mlp": 0.01042669, "balance_loss_clip": 1.04659033, "balance_loss_mlp": 1.02820277, "epoch": 0.5115286336990831, "flos": 17019288391680.0, "grad_norm": 1.7197846358145388, "language_loss": 0.85691231, "learning_rate": 2.022393578751503e-06, "loss": 0.87848306, "num_input_tokens_seen": 182968085, "step": 8508, "time_per_iteration": 2.691185235977173 }, { "auxiliary_loss_clip": 0.01090388, "auxiliary_loss_mlp": 0.00770619, "balance_loss_clip": 1.04480338, "balance_loss_mlp": 1.00018072, "epoch": 0.5115887569517511, "flos": 23659386531840.0, "grad_norm": 1.8624731533798382, "language_loss": 0.72326827, "learning_rate": 2.022004141061709e-06, "loss": 0.74187839, "num_input_tokens_seen": 182987275, "step": 8509, "time_per_iteration": 2.7239418029785156 }, { "auxiliary_loss_clip": 0.01120525, "auxiliary_loss_mlp": 0.00770526, "balance_loss_clip": 1.04470599, "balance_loss_mlp": 1.00009036, "epoch": 0.511648880204419, "flos": 16107193313280.0, "grad_norm": 2.5868792605641477, "language_loss": 0.76204944, "learning_rate": 2.0216147025375153e-06, "loss": 0.78095996, "num_input_tokens_seen": 183004700, "step": 8510, "time_per_iteration": 2.6135294437408447 }, { "auxiliary_loss_clip": 0.0112199, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.04560411, "balance_loss_mlp": 1.022668, "epoch": 0.511709003457087, "flos": 32634970974720.0, "grad_norm": 4.709097064233808, "language_loss": 0.70997655, "learning_rate": 2.0212252631936907e-06, "loss": 0.73155165, "num_input_tokens_seen": 183025830, "step": 8511, "time_per_iteration": 2.7760493755340576 }, { "auxiliary_loss_clip": 0.01095679, "auxiliary_loss_mlp": 0.01029146, "balance_loss_clip": 1.04216874, "balance_loss_mlp": 1.01593149, "epoch": 0.511769126709755, "flos": 21762082241280.0, "grad_norm": 2.953853433531297, "language_loss": 0.66357356, "learning_rate": 2.020835823045001e-06, "loss": 0.68482178, "num_input_tokens_seen": 183045140, "step": 8512, "time_per_iteration": 2.723987340927124 }, { "auxiliary_loss_clip": 0.01060265, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.0384953, "balance_loss_mlp": 1.02158666, "epoch": 0.511829249962423, "flos": 23915357827200.0, "grad_norm": 1.7575723482240548, "language_loss": 0.67203867, "learning_rate": 2.0204463821062146e-06, "loss": 0.69301212, "num_input_tokens_seen": 183063935, "step": 8513, "time_per_iteration": 2.759958505630493 }, { "auxiliary_loss_clip": 0.01083159, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.04507256, "balance_loss_mlp": 1.02201903, "epoch": 0.511889373215091, "flos": 23727005884800.0, "grad_norm": 2.3341144576485116, "language_loss": 0.68508673, "learning_rate": 2.0200569403921e-06, "loss": 0.70627999, "num_input_tokens_seen": 183084135, "step": 8514, "time_per_iteration": 2.7791545391082764 }, { "auxiliary_loss_clip": 0.01119085, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.04411948, "balance_loss_mlp": 1.01689076, "epoch": 0.5119494964677589, "flos": 28111519526400.0, "grad_norm": 1.6536407135597841, "language_loss": 0.66139281, "learning_rate": 2.019667497917424e-06, "loss": 0.68287694, "num_input_tokens_seen": 183104570, "step": 8515, "time_per_iteration": 2.6567435264587402 }, { "auxiliary_loss_clip": 0.01109629, "auxiliary_loss_mlp": 0.01035907, "balance_loss_clip": 1.04417586, "balance_loss_mlp": 1.02317524, "epoch": 0.5120096197204269, "flos": 24973214296320.0, "grad_norm": 1.939516836327544, "language_loss": 0.7526269, "learning_rate": 2.019278054696955e-06, "loss": 0.77408224, "num_input_tokens_seen": 183123850, "step": 8516, "time_per_iteration": 2.7218270301818848 }, { "auxiliary_loss_clip": 0.01093123, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.04275799, "balance_loss_mlp": 1.02562129, "epoch": 0.5120697429730948, "flos": 17968012364160.0, "grad_norm": 2.066446678045309, "language_loss": 0.78090644, "learning_rate": 2.0188886107454595e-06, "loss": 0.80223525, "num_input_tokens_seen": 183141725, "step": 8517, "time_per_iteration": 2.6922826766967773 }, { "auxiliary_loss_clip": 0.01114661, "auxiliary_loss_mlp": 0.01034987, "balance_loss_clip": 1.0449543, "balance_loss_mlp": 1.02086043, "epoch": 0.5121298662257628, "flos": 23292343405440.0, "grad_norm": 1.7160803061965533, "language_loss": 0.74111056, "learning_rate": 2.0184991660777063e-06, "loss": 0.76260698, "num_input_tokens_seen": 183161300, "step": 8518, "time_per_iteration": 2.6781773567199707 }, { "auxiliary_loss_clip": 0.01107849, "auxiliary_loss_mlp": 0.0104112, "balance_loss_clip": 1.04497719, "balance_loss_mlp": 1.02699947, "epoch": 0.5121899894784308, "flos": 17311062568320.0, "grad_norm": 1.7790366802945887, "language_loss": 0.78405094, "learning_rate": 2.0181097207084625e-06, "loss": 0.80554068, "num_input_tokens_seen": 183180495, "step": 8519, "time_per_iteration": 2.634488582611084 }, { "auxiliary_loss_clip": 0.01126735, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.04811025, "balance_loss_mlp": 1.02241898, "epoch": 0.5122501127310988, "flos": 24930085040640.0, "grad_norm": 1.8142627745056843, "language_loss": 0.79518384, "learning_rate": 2.017720274652497e-06, "loss": 0.81681275, "num_input_tokens_seen": 183200330, "step": 8520, "time_per_iteration": 2.6977620124816895 }, { "auxiliary_loss_clip": 0.01104965, "auxiliary_loss_mlp": 0.01041606, "balance_loss_clip": 1.0438292, "balance_loss_mlp": 1.02683616, "epoch": 0.5123102359837667, "flos": 18442859184000.0, "grad_norm": 2.180675544150299, "language_loss": 0.81294155, "learning_rate": 2.0173308279245765e-06, "loss": 0.83440727, "num_input_tokens_seen": 183218230, "step": 8521, "time_per_iteration": 4.264198303222656 }, { "auxiliary_loss_clip": 0.0111372, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.04381251, "balance_loss_mlp": 1.01808071, "epoch": 0.5123703592364347, "flos": 26684860164480.0, "grad_norm": 1.8350455385455566, "language_loss": 0.68333864, "learning_rate": 2.0169413805394692e-06, "loss": 0.70480323, "num_input_tokens_seen": 183236735, "step": 8522, "time_per_iteration": 2.755563735961914 }, { "auxiliary_loss_clip": 0.0109986, "auxiliary_loss_mlp": 0.01043615, "balance_loss_clip": 1.04744244, "balance_loss_mlp": 1.02636552, "epoch": 0.5124304824891026, "flos": 28803948981120.0, "grad_norm": 1.6735611690288588, "language_loss": 0.61849087, "learning_rate": 2.0165519325119433e-06, "loss": 0.6399256, "num_input_tokens_seen": 183257550, "step": 8523, "time_per_iteration": 2.752614974975586 }, { "auxiliary_loss_clip": 0.01088964, "auxiliary_loss_mlp": 0.01041136, "balance_loss_clip": 1.04488027, "balance_loss_mlp": 1.02776718, "epoch": 0.5124906057417706, "flos": 21761830846080.0, "grad_norm": 2.1631882282248966, "language_loss": 0.7807008, "learning_rate": 2.0161624838567656e-06, "loss": 0.80200177, "num_input_tokens_seen": 183275515, "step": 8524, "time_per_iteration": 5.938940763473511 }, { "auxiliary_loss_clip": 0.0110059, "auxiliary_loss_mlp": 0.01035868, "balance_loss_clip": 1.04444933, "balance_loss_mlp": 1.02287436, "epoch": 0.5125507289944387, "flos": 18880538405760.0, "grad_norm": 2.5285806743725834, "language_loss": 0.7489953, "learning_rate": 2.015773034588706e-06, "loss": 0.77035987, "num_input_tokens_seen": 183293880, "step": 8525, "time_per_iteration": 2.6603550910949707 }, { "auxiliary_loss_clip": 0.01100341, "auxiliary_loss_mlp": 0.01045872, "balance_loss_clip": 1.04424882, "balance_loss_mlp": 1.02996945, "epoch": 0.5126108522471066, "flos": 35627838036480.0, "grad_norm": 1.6545403659553666, "language_loss": 0.74193799, "learning_rate": 2.015383584722531e-06, "loss": 0.76340014, "num_input_tokens_seen": 183315860, "step": 8526, "time_per_iteration": 2.7631187438964844 }, { "auxiliary_loss_clip": 0.01117967, "auxiliary_loss_mlp": 0.010412, "balance_loss_clip": 1.04805541, "balance_loss_mlp": 1.02755094, "epoch": 0.5126709754997746, "flos": 20190918464640.0, "grad_norm": 1.7970307477050764, "language_loss": 0.65624464, "learning_rate": 2.0149941342730088e-06, "loss": 0.6778363, "num_input_tokens_seen": 183335480, "step": 8527, "time_per_iteration": 4.185753107070923 }, { "auxiliary_loss_clip": 0.01099112, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.04767573, "balance_loss_mlp": 1.02663493, "epoch": 0.5127310987524425, "flos": 18588548747520.0, "grad_norm": 1.4652981434759074, "language_loss": 0.74246556, "learning_rate": 2.014604683254908e-06, "loss": 0.76384449, "num_input_tokens_seen": 183354395, "step": 8528, "time_per_iteration": 2.647268056869507 }, { "auxiliary_loss_clip": 0.01110552, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.04382324, "balance_loss_mlp": 1.02143764, "epoch": 0.5127912220051105, "flos": 22454691264000.0, "grad_norm": 1.6345499952693072, "language_loss": 0.82838154, "learning_rate": 2.014215231682995e-06, "loss": 0.84983552, "num_input_tokens_seen": 183372980, "step": 8529, "time_per_iteration": 2.6546859741210938 }, { "auxiliary_loss_clip": 0.0107231, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.04131067, "balance_loss_mlp": 1.02149725, "epoch": 0.5128513452577784, "flos": 19093703667840.0, "grad_norm": 2.6019709601767866, "language_loss": 0.73687661, "learning_rate": 2.01382577957204e-06, "loss": 0.75794935, "num_input_tokens_seen": 183390160, "step": 8530, "time_per_iteration": 2.754840612411499 }, { "auxiliary_loss_clip": 0.01018433, "auxiliary_loss_mlp": 0.01003338, "balance_loss_clip": 1.02142978, "balance_loss_mlp": 1.00163293, "epoch": 0.5129114685104464, "flos": 67892285243520.0, "grad_norm": 0.7482622882096543, "language_loss": 0.60775113, "learning_rate": 2.0134363269368095e-06, "loss": 0.62796879, "num_input_tokens_seen": 183455280, "step": 8531, "time_per_iteration": 3.331425666809082 }, { "auxiliary_loss_clip": 0.01096599, "auxiliary_loss_mlp": 0.01039227, "balance_loss_clip": 1.04599643, "balance_loss_mlp": 1.02387309, "epoch": 0.5129715917631144, "flos": 20449152316800.0, "grad_norm": 1.6723134032232012, "language_loss": 0.76866412, "learning_rate": 2.0130468737920725e-06, "loss": 0.79002237, "num_input_tokens_seen": 183473955, "step": 8532, "time_per_iteration": 2.8071939945220947 }, { "auxiliary_loss_clip": 0.0110043, "auxiliary_loss_mlp": 0.01036596, "balance_loss_clip": 1.0434345, "balance_loss_mlp": 1.02273178, "epoch": 0.5130317150157824, "flos": 35116146840960.0, "grad_norm": 4.28948987854823, "language_loss": 0.67031407, "learning_rate": 2.012657420152597e-06, "loss": 0.69168431, "num_input_tokens_seen": 183497195, "step": 8533, "time_per_iteration": 2.7799179553985596 }, { "auxiliary_loss_clip": 0.01094678, "auxiliary_loss_mlp": 0.01039401, "balance_loss_clip": 1.04602468, "balance_loss_mlp": 1.02452362, "epoch": 0.5130918382684503, "flos": 19791627903360.0, "grad_norm": 1.9915175591272611, "language_loss": 0.8200537, "learning_rate": 2.01226796603315e-06, "loss": 0.84139454, "num_input_tokens_seen": 183513675, "step": 8534, "time_per_iteration": 2.6692066192626953 }, { "auxiliary_loss_clip": 0.01111793, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.04316652, "balance_loss_mlp": 1.02398574, "epoch": 0.5131519615211183, "flos": 26323096337280.0, "grad_norm": 1.4683279633381257, "language_loss": 0.63850307, "learning_rate": 2.0118785114485017e-06, "loss": 0.66000712, "num_input_tokens_seen": 183535165, "step": 8535, "time_per_iteration": 2.6881463527679443 }, { "auxiliary_loss_clip": 0.01118055, "auxiliary_loss_mlp": 0.01031488, "balance_loss_clip": 1.04930139, "balance_loss_mlp": 1.01707554, "epoch": 0.5132120847737862, "flos": 19171917532800.0, "grad_norm": 1.558826189326605, "language_loss": 0.69832361, "learning_rate": 2.011489056413418e-06, "loss": 0.71981907, "num_input_tokens_seen": 183553780, "step": 8536, "time_per_iteration": 2.7181568145751953 }, { "auxiliary_loss_clip": 0.01116762, "auxiliary_loss_mlp": 0.01038725, "balance_loss_clip": 1.04751253, "balance_loss_mlp": 1.02378178, "epoch": 0.5132722080264542, "flos": 20230420446720.0, "grad_norm": 1.9464397996960447, "language_loss": 0.70725036, "learning_rate": 2.011099600942669e-06, "loss": 0.72880518, "num_input_tokens_seen": 183572285, "step": 8537, "time_per_iteration": 2.6996657848358154 }, { "auxiliary_loss_clip": 0.01080908, "auxiliary_loss_mlp": 0.01034474, "balance_loss_clip": 1.04291606, "balance_loss_mlp": 1.02007353, "epoch": 0.5133323312791223, "flos": 16469459930880.0, "grad_norm": 1.8282608051087097, "language_loss": 0.8028723, "learning_rate": 2.0107101450510214e-06, "loss": 0.82402611, "num_input_tokens_seen": 183589330, "step": 8538, "time_per_iteration": 2.752685308456421 }, { "auxiliary_loss_clip": 0.01113197, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 1.0443325, "balance_loss_mlp": 1.01739144, "epoch": 0.5133924545317902, "flos": 26068094709120.0, "grad_norm": 2.0083592119837403, "language_loss": 0.78388107, "learning_rate": 2.0103206887532437e-06, "loss": 0.80532658, "num_input_tokens_seen": 183609205, "step": 8539, "time_per_iteration": 2.6856329441070557 }, { "auxiliary_loss_clip": 0.0109867, "auxiliary_loss_mlp": 0.01033877, "balance_loss_clip": 1.04138374, "balance_loss_mlp": 1.01994729, "epoch": 0.5134525777844582, "flos": 29131023248640.0, "grad_norm": 1.7382927125385157, "language_loss": 0.76111883, "learning_rate": 2.009931232064105e-06, "loss": 0.78244424, "num_input_tokens_seen": 183629985, "step": 8540, "time_per_iteration": 2.780198574066162 }, { "auxiliary_loss_clip": 0.01074682, "auxiliary_loss_mlp": 0.01038818, "balance_loss_clip": 1.04355264, "balance_loss_mlp": 1.02344, "epoch": 0.5135127010371261, "flos": 17454776883840.0, "grad_norm": 1.7132610384814069, "language_loss": 0.746566, "learning_rate": 2.0095417749983724e-06, "loss": 0.76770097, "num_input_tokens_seen": 183648220, "step": 8541, "time_per_iteration": 2.6982674598693848 }, { "auxiliary_loss_clip": 0.01060333, "auxiliary_loss_mlp": 0.01039276, "balance_loss_clip": 1.0412941, "balance_loss_mlp": 1.02475083, "epoch": 0.5135728242897941, "flos": 21944975316480.0, "grad_norm": 1.5289233613121331, "language_loss": 0.70432508, "learning_rate": 2.0091523175708162e-06, "loss": 0.72532117, "num_input_tokens_seen": 183668230, "step": 8542, "time_per_iteration": 2.783440113067627 }, { "auxiliary_loss_clip": 0.01102439, "auxiliary_loss_mlp": 0.01029643, "balance_loss_clip": 1.04426861, "balance_loss_mlp": 1.01601708, "epoch": 0.513632947542462, "flos": 22674859678080.0, "grad_norm": 1.886898343071389, "language_loss": 0.79691696, "learning_rate": 2.0087628597962023e-06, "loss": 0.81823772, "num_input_tokens_seen": 183687800, "step": 8543, "time_per_iteration": 2.906564950942993 }, { "auxiliary_loss_clip": 0.01101285, "auxiliary_loss_mlp": 0.01044679, "balance_loss_clip": 1.04514194, "balance_loss_mlp": 1.03012979, "epoch": 0.51369307079513, "flos": 29457163762560.0, "grad_norm": 1.7217499667212701, "language_loss": 0.67941636, "learning_rate": 2.008373401689299e-06, "loss": 0.700876, "num_input_tokens_seen": 183709025, "step": 8544, "time_per_iteration": 2.815377950668335 }, { "auxiliary_loss_clip": 0.01086355, "auxiliary_loss_mlp": 0.01049073, "balance_loss_clip": 1.03878117, "balance_loss_mlp": 1.03430903, "epoch": 0.513753194047798, "flos": 18989347680000.0, "grad_norm": 2.2112374430559214, "language_loss": 0.72265953, "learning_rate": 2.0079839432648765e-06, "loss": 0.74401385, "num_input_tokens_seen": 183725740, "step": 8545, "time_per_iteration": 2.7677536010742188 }, { "auxiliary_loss_clip": 0.01115821, "auxiliary_loss_mlp": 0.01045255, "balance_loss_clip": 1.04458177, "balance_loss_mlp": 1.03013897, "epoch": 0.513813317300466, "flos": 17821855923840.0, "grad_norm": 2.431720560794894, "language_loss": 0.82277304, "learning_rate": 2.0075944845377016e-06, "loss": 0.84438378, "num_input_tokens_seen": 183743995, "step": 8546, "time_per_iteration": 2.6764519214630127 }, { "auxiliary_loss_clip": 0.01110159, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.0421015, "balance_loss_mlp": 1.02272379, "epoch": 0.5138734405531339, "flos": 24061191045120.0, "grad_norm": 1.829642419824105, "language_loss": 0.73038638, "learning_rate": 2.007205025522544e-06, "loss": 0.75185841, "num_input_tokens_seen": 183764150, "step": 8547, "time_per_iteration": 2.664536714553833 }, { "auxiliary_loss_clip": 0.01112692, "auxiliary_loss_mlp": 0.01048016, "balance_loss_clip": 1.04215682, "balance_loss_mlp": 1.03369892, "epoch": 0.5139335638058019, "flos": 26097253574400.0, "grad_norm": 1.6776951969003835, "language_loss": 0.73548347, "learning_rate": 2.0068155662341702e-06, "loss": 0.75709057, "num_input_tokens_seen": 183783280, "step": 8548, "time_per_iteration": 2.6639697551727295 }, { "auxiliary_loss_clip": 0.01086334, "auxiliary_loss_mlp": 0.01037281, "balance_loss_clip": 1.03931546, "balance_loss_mlp": 1.02296984, "epoch": 0.5139936870584698, "flos": 18917095472640.0, "grad_norm": 1.6001321585074282, "language_loss": 0.82261604, "learning_rate": 2.0064261066873495e-06, "loss": 0.84385222, "num_input_tokens_seen": 183800725, "step": 8549, "time_per_iteration": 2.748581886291504 }, { "auxiliary_loss_clip": 0.01115178, "auxiliary_loss_mlp": 0.01033379, "balance_loss_clip": 1.04665935, "balance_loss_mlp": 1.0205524, "epoch": 0.5140538103111378, "flos": 16144001775360.0, "grad_norm": 1.9742432137522015, "language_loss": 0.71977437, "learning_rate": 2.0060366468968504e-06, "loss": 0.74125993, "num_input_tokens_seen": 183818735, "step": 8550, "time_per_iteration": 2.651068687438965 }, { "auxiliary_loss_clip": 0.01112958, "auxiliary_loss_mlp": 0.01041915, "balance_loss_clip": 1.04612732, "balance_loss_mlp": 1.02725196, "epoch": 0.5141139335638057, "flos": 22420145358720.0, "grad_norm": 1.8069208573649895, "language_loss": 0.75043917, "learning_rate": 2.0056471868774408e-06, "loss": 0.77198792, "num_input_tokens_seen": 183840015, "step": 8551, "time_per_iteration": 2.7058589458465576 }, { "auxiliary_loss_clip": 0.01093993, "auxiliary_loss_mlp": 0.01037756, "balance_loss_clip": 1.0459106, "balance_loss_mlp": 1.0240587, "epoch": 0.5141740568164738, "flos": 27089645506560.0, "grad_norm": 1.6630090206247619, "language_loss": 0.69182396, "learning_rate": 2.0052577266438897e-06, "loss": 0.71314144, "num_input_tokens_seen": 183860145, "step": 8552, "time_per_iteration": 2.7040834426879883 }, { "auxiliary_loss_clip": 0.01114038, "auxiliary_loss_mlp": 0.01039378, "balance_loss_clip": 1.04381299, "balance_loss_mlp": 1.02445841, "epoch": 0.5142341800691418, "flos": 24973250209920.0, "grad_norm": 2.1567314432200364, "language_loss": 0.753088, "learning_rate": 2.004868266210965e-06, "loss": 0.7746222, "num_input_tokens_seen": 183880540, "step": 8553, "time_per_iteration": 2.6321310997009277 }, { "auxiliary_loss_clip": 0.01125852, "auxiliary_loss_mlp": 0.0104126, "balance_loss_clip": 1.04767513, "balance_loss_mlp": 1.02800989, "epoch": 0.5142943033218097, "flos": 20704513080960.0, "grad_norm": 1.7807872167537822, "language_loss": 0.67740041, "learning_rate": 2.004478805593435e-06, "loss": 0.69907153, "num_input_tokens_seen": 183900895, "step": 8554, "time_per_iteration": 2.5353291034698486 }, { "auxiliary_loss_clip": 0.01118225, "auxiliary_loss_mlp": 0.01040414, "balance_loss_clip": 1.04483485, "balance_loss_mlp": 1.02390337, "epoch": 0.5143544265744777, "flos": 22925479847040.0, "grad_norm": 1.822401657137422, "language_loss": 0.73321033, "learning_rate": 2.004089344806068e-06, "loss": 0.75479674, "num_input_tokens_seen": 183920335, "step": 8555, "time_per_iteration": 2.8193295001983643 }, { "auxiliary_loss_clip": 0.01089525, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.04645813, "balance_loss_mlp": 1.02570128, "epoch": 0.5144145498271456, "flos": 15921391236480.0, "grad_norm": 2.4707318139003327, "language_loss": 0.74175709, "learning_rate": 2.003699883863633e-06, "loss": 0.76304758, "num_input_tokens_seen": 183936220, "step": 8556, "time_per_iteration": 2.721573829650879 }, { "auxiliary_loss_clip": 0.0109284, "auxiliary_loss_mlp": 0.01036355, "balance_loss_clip": 1.04400861, "balance_loss_mlp": 1.02320015, "epoch": 0.5144746730798136, "flos": 19681238430720.0, "grad_norm": 1.790105253554859, "language_loss": 0.85782719, "learning_rate": 2.003310422780898e-06, "loss": 0.87911922, "num_input_tokens_seen": 183953250, "step": 8557, "time_per_iteration": 2.70686674118042 }, { "auxiliary_loss_clip": 0.01106764, "auxiliary_loss_mlp": 0.01043673, "balance_loss_clip": 1.04357624, "balance_loss_mlp": 1.0292908, "epoch": 0.5145347963324816, "flos": 23914711382400.0, "grad_norm": 1.6124493392185149, "language_loss": 0.88770819, "learning_rate": 2.0029209615726307e-06, "loss": 0.90921259, "num_input_tokens_seen": 183973865, "step": 8558, "time_per_iteration": 2.7256360054016113 }, { "auxiliary_loss_clip": 0.01123218, "auxiliary_loss_mlp": 0.00770892, "balance_loss_clip": 1.04631722, "balance_loss_mlp": 1.00014222, "epoch": 0.5145949195851496, "flos": 18260002022400.0, "grad_norm": 2.0888380287595196, "language_loss": 0.65300936, "learning_rate": 2.002531500253602e-06, "loss": 0.67195046, "num_input_tokens_seen": 183992555, "step": 8559, "time_per_iteration": 2.64591646194458 }, { "auxiliary_loss_clip": 0.01108519, "auxiliary_loss_mlp": 0.00771269, "balance_loss_clip": 1.04542136, "balance_loss_mlp": 1.00025797, "epoch": 0.5146550428378175, "flos": 26213425136640.0, "grad_norm": 1.9572467781311524, "language_loss": 0.63094109, "learning_rate": 2.002142038838577e-06, "loss": 0.64973897, "num_input_tokens_seen": 184010825, "step": 8560, "time_per_iteration": 4.225303888320923 }, { "auxiliary_loss_clip": 0.0112394, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.04584384, "balance_loss_mlp": 1.01820433, "epoch": 0.5147151660904855, "flos": 22674177319680.0, "grad_norm": 1.85112269234195, "language_loss": 0.70142567, "learning_rate": 2.0017525773423265e-06, "loss": 0.72298455, "num_input_tokens_seen": 184030155, "step": 8561, "time_per_iteration": 2.6462759971618652 }, { "auxiliary_loss_clip": 0.01099376, "auxiliary_loss_mlp": 0.01032154, "balance_loss_clip": 1.04134226, "balance_loss_mlp": 1.01888585, "epoch": 0.5147752893431534, "flos": 24972388283520.0, "grad_norm": 1.6885707870282478, "language_loss": 0.66502726, "learning_rate": 2.0013631157796177e-06, "loss": 0.6863426, "num_input_tokens_seen": 184051440, "step": 8562, "time_per_iteration": 2.6790151596069336 }, { "auxiliary_loss_clip": 0.01118509, "auxiliary_loss_mlp": 0.01035134, "balance_loss_clip": 1.04731929, "balance_loss_mlp": 1.02153838, "epoch": 0.5148354125958214, "flos": 22744669760640.0, "grad_norm": 1.6641105551237323, "language_loss": 0.77625287, "learning_rate": 2.0009736541652188e-06, "loss": 0.79778934, "num_input_tokens_seen": 184070205, "step": 8563, "time_per_iteration": 5.86843466758728 }, { "auxiliary_loss_clip": 0.01117165, "auxiliary_loss_mlp": 0.01035106, "balance_loss_clip": 1.04520798, "balance_loss_mlp": 1.01931095, "epoch": 0.5148955358484893, "flos": 23068763199360.0, "grad_norm": 1.8668644890701778, "language_loss": 0.82346904, "learning_rate": 2.0005841925139e-06, "loss": 0.84499174, "num_input_tokens_seen": 184087345, "step": 8564, "time_per_iteration": 2.6531171798706055 }, { "auxiliary_loss_clip": 0.01105481, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.04333782, "balance_loss_mlp": 1.02130592, "epoch": 0.5149556591011574, "flos": 20340127560960.0, "grad_norm": 1.6929228826937828, "language_loss": 0.73255026, "learning_rate": 2.0001947308404283e-06, "loss": 0.75396281, "num_input_tokens_seen": 184107110, "step": 8565, "time_per_iteration": 2.8100740909576416 }, { "auxiliary_loss_clip": 0.0111614, "auxiliary_loss_mlp": 0.01036767, "balance_loss_clip": 1.04448807, "balance_loss_mlp": 1.02056694, "epoch": 0.5150157823538254, "flos": 22638230784000.0, "grad_norm": 2.0356075529568596, "language_loss": 0.68441874, "learning_rate": 1.9998052691595715e-06, "loss": 0.70594788, "num_input_tokens_seen": 184127105, "step": 8566, "time_per_iteration": 4.174206972122192 }, { "auxiliary_loss_clip": 0.01126685, "auxiliary_loss_mlp": 0.00772285, "balance_loss_clip": 1.04328656, "balance_loss_mlp": 1.00031221, "epoch": 0.5150759056064933, "flos": 26067627832320.0, "grad_norm": 1.624621701105177, "language_loss": 0.78153682, "learning_rate": 1.9994158074861005e-06, "loss": 0.80052656, "num_input_tokens_seen": 184148060, "step": 8567, "time_per_iteration": 2.6405906677246094 }, { "auxiliary_loss_clip": 0.01115866, "auxiliary_loss_mlp": 0.01034427, "balance_loss_clip": 1.0444839, "balance_loss_mlp": 1.01929939, "epoch": 0.5151360288591613, "flos": 25952641418880.0, "grad_norm": 2.181301277452511, "language_loss": 0.79243255, "learning_rate": 1.9990263458347806e-06, "loss": 0.81393552, "num_input_tokens_seen": 184166175, "step": 8568, "time_per_iteration": 2.6806869506835938 }, { "auxiliary_loss_clip": 0.01100678, "auxiliary_loss_mlp": 0.01033449, "balance_loss_clip": 1.04264474, "balance_loss_mlp": 1.02017546, "epoch": 0.5151961521118292, "flos": 18507246312960.0, "grad_norm": 2.356580017264164, "language_loss": 0.9131906, "learning_rate": 1.9986368842203825e-06, "loss": 0.93453181, "num_input_tokens_seen": 184182600, "step": 8569, "time_per_iteration": 2.6493630409240723 }, { "auxiliary_loss_clip": 0.01128863, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.04688525, "balance_loss_mlp": 1.0198164, "epoch": 0.5152562753644973, "flos": 22233696837120.0, "grad_norm": 2.0115285980006967, "language_loss": 0.76725376, "learning_rate": 1.998247422657674e-06, "loss": 0.78888762, "num_input_tokens_seen": 184202020, "step": 8570, "time_per_iteration": 2.6327102184295654 }, { "auxiliary_loss_clip": 0.01115897, "auxiliary_loss_mlp": 0.01044719, "balance_loss_clip": 1.04504037, "balance_loss_mlp": 1.02880454, "epoch": 0.5153163986171652, "flos": 38436555047040.0, "grad_norm": 1.735564613465363, "language_loss": 0.73986542, "learning_rate": 1.9978579611614227e-06, "loss": 0.76147163, "num_input_tokens_seen": 184224850, "step": 8571, "time_per_iteration": 2.879904270172119 }, { "auxiliary_loss_clip": 0.01031454, "auxiliary_loss_mlp": 0.01001432, "balance_loss_clip": 1.02375364, "balance_loss_mlp": 1.00009048, "epoch": 0.5153765218698332, "flos": 66384503015040.0, "grad_norm": 0.7786581254678329, "language_loss": 0.52855021, "learning_rate": 1.9974684997463984e-06, "loss": 0.54887909, "num_input_tokens_seen": 184288520, "step": 8572, "time_per_iteration": 3.2987639904022217 }, { "auxiliary_loss_clip": 0.01112833, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.04641247, "balance_loss_mlp": 1.02542353, "epoch": 0.5154366451225011, "flos": 24024669891840.0, "grad_norm": 1.82770535610101, "language_loss": 0.76185274, "learning_rate": 1.9970790384273687e-06, "loss": 0.78336841, "num_input_tokens_seen": 184308565, "step": 8573, "time_per_iteration": 2.6767003536224365 }, { "auxiliary_loss_clip": 0.01111651, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.04382682, "balance_loss_mlp": 1.01498199, "epoch": 0.5154967683751691, "flos": 23468843859840.0, "grad_norm": 2.7144169534848976, "language_loss": 0.77198601, "learning_rate": 1.996689577219102e-06, "loss": 0.7933901, "num_input_tokens_seen": 184326795, "step": 8574, "time_per_iteration": 2.6607704162597656 }, { "auxiliary_loss_clip": 0.01099994, "auxiliary_loss_mlp": 0.01033635, "balance_loss_clip": 1.04476404, "balance_loss_mlp": 1.02018237, "epoch": 0.515556891627837, "flos": 23805650712960.0, "grad_norm": 3.244613949266341, "language_loss": 0.8558231, "learning_rate": 1.996300116136367e-06, "loss": 0.87715936, "num_input_tokens_seen": 184345990, "step": 8575, "time_per_iteration": 2.6699635982513428 }, { "auxiliary_loss_clip": 0.01113561, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.04307377, "balance_loss_mlp": 1.02077138, "epoch": 0.515617014880505, "flos": 19828544106240.0, "grad_norm": 1.6301780240264319, "language_loss": 0.76920515, "learning_rate": 1.995910655193932e-06, "loss": 0.79069233, "num_input_tokens_seen": 184366300, "step": 8576, "time_per_iteration": 2.7603139877319336 }, { "auxiliary_loss_clip": 0.01078348, "auxiliary_loss_mlp": 0.00773356, "balance_loss_clip": 1.04196084, "balance_loss_mlp": 1.00032973, "epoch": 0.515677138133173, "flos": 14245907385600.0, "grad_norm": 2.480047069773859, "language_loss": 0.76414418, "learning_rate": 1.9955211944065654e-06, "loss": 0.78266126, "num_input_tokens_seen": 184383030, "step": 8577, "time_per_iteration": 2.694549083709717 }, { "auxiliary_loss_clip": 0.01099471, "auxiliary_loss_mlp": 0.01044811, "balance_loss_clip": 1.04260516, "balance_loss_mlp": 1.0279547, "epoch": 0.515737261385841, "flos": 28289707920000.0, "grad_norm": 1.7162174586848327, "language_loss": 0.80910254, "learning_rate": 1.9951317337890353e-06, "loss": 0.83054537, "num_input_tokens_seen": 184403410, "step": 8578, "time_per_iteration": 2.740527391433716 }, { "auxiliary_loss_clip": 0.01121615, "auxiliary_loss_mlp": 0.01032969, "balance_loss_clip": 1.04364657, "balance_loss_mlp": 1.01914644, "epoch": 0.515797384638509, "flos": 27891925729920.0, "grad_norm": 1.8526777225789184, "language_loss": 0.75880611, "learning_rate": 1.9947422733561105e-06, "loss": 0.780352, "num_input_tokens_seen": 184423830, "step": 8579, "time_per_iteration": 2.6643004417419434 }, { "auxiliary_loss_clip": 0.01087857, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.04332745, "balance_loss_mlp": 1.01849377, "epoch": 0.5158575078911769, "flos": 23040071210880.0, "grad_norm": 3.647152473791378, "language_loss": 0.7862978, "learning_rate": 1.994352813122559e-06, "loss": 0.80749989, "num_input_tokens_seen": 184445050, "step": 8580, "time_per_iteration": 2.74796986579895 }, { "auxiliary_loss_clip": 0.01086006, "auxiliary_loss_mlp": 0.0104917, "balance_loss_clip": 1.04050803, "balance_loss_mlp": 1.03265989, "epoch": 0.5159176311438449, "flos": 12641346938880.0, "grad_norm": 2.0718752995567966, "language_loss": 0.73151392, "learning_rate": 1.99396335310315e-06, "loss": 0.75286567, "num_input_tokens_seen": 184460775, "step": 8581, "time_per_iteration": 2.6738648414611816 }, { "auxiliary_loss_clip": 0.01114558, "auxiliary_loss_mlp": 0.01033417, "balance_loss_clip": 1.0463438, "balance_loss_mlp": 1.01976788, "epoch": 0.5159777543965128, "flos": 15558154951680.0, "grad_norm": 2.080206363710033, "language_loss": 0.74150515, "learning_rate": 1.9935738933126508e-06, "loss": 0.76298487, "num_input_tokens_seen": 184477365, "step": 8582, "time_per_iteration": 2.649186134338379 }, { "auxiliary_loss_clip": 0.01085634, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.04351485, "balance_loss_mlp": 1.02202952, "epoch": 0.5160378776491809, "flos": 23221671396480.0, "grad_norm": 4.912834420865202, "language_loss": 0.65803373, "learning_rate": 1.99318443376583e-06, "loss": 0.67924196, "num_input_tokens_seen": 184497045, "step": 8583, "time_per_iteration": 2.7025017738342285 }, { "auxiliary_loss_clip": 0.0111508, "auxiliary_loss_mlp": 0.01037055, "balance_loss_clip": 1.04503357, "balance_loss_mlp": 1.02199888, "epoch": 0.5160980009018488, "flos": 21944616180480.0, "grad_norm": 1.4135833939266678, "language_loss": 0.76130998, "learning_rate": 1.9927949744774568e-06, "loss": 0.78283131, "num_input_tokens_seen": 184517675, "step": 8584, "time_per_iteration": 2.662471294403076 }, { "auxiliary_loss_clip": 0.01093144, "auxiliary_loss_mlp": 0.01043062, "balance_loss_clip": 1.0425117, "balance_loss_mlp": 1.02877474, "epoch": 0.5161581241545168, "flos": 22784064001920.0, "grad_norm": 2.700643227023907, "language_loss": 0.79112214, "learning_rate": 1.9924055154622983e-06, "loss": 0.81248415, "num_input_tokens_seen": 184537745, "step": 8585, "time_per_iteration": 2.727789878845215 }, { "auxiliary_loss_clip": 0.01105983, "auxiliary_loss_mlp": 0.01033747, "balance_loss_clip": 1.0444293, "balance_loss_mlp": 1.02064013, "epoch": 0.5162182474071847, "flos": 19675384513920.0, "grad_norm": 2.398879690546405, "language_loss": 0.81236124, "learning_rate": 1.9920160567351238e-06, "loss": 0.83375853, "num_input_tokens_seen": 184553630, "step": 8586, "time_per_iteration": 2.6371195316314697 }, { "auxiliary_loss_clip": 0.01106215, "auxiliary_loss_mlp": 0.0103541, "balance_loss_clip": 1.04690671, "balance_loss_mlp": 1.02083015, "epoch": 0.5162783706598527, "flos": 20046198568320.0, "grad_norm": 1.819724898525227, "language_loss": 0.71372288, "learning_rate": 1.991626598310701e-06, "loss": 0.73513913, "num_input_tokens_seen": 184573530, "step": 8587, "time_per_iteration": 2.7760136127471924 }, { "auxiliary_loss_clip": 0.01038098, "auxiliary_loss_mlp": 0.01008101, "balance_loss_clip": 1.02063632, "balance_loss_mlp": 1.00669408, "epoch": 0.5163384939125206, "flos": 69959553713280.0, "grad_norm": 0.7288340121404665, "language_loss": 0.57740283, "learning_rate": 1.9912371402037984e-06, "loss": 0.59786481, "num_input_tokens_seen": 184637875, "step": 8588, "time_per_iteration": 3.183241844177246 }, { "auxiliary_loss_clip": 0.01101129, "auxiliary_loss_mlp": 0.01040283, "balance_loss_clip": 1.04456651, "balance_loss_mlp": 1.02572727, "epoch": 0.5163986171651886, "flos": 17417034668160.0, "grad_norm": 1.7775907605960104, "language_loss": 0.75007761, "learning_rate": 1.990847682429185e-06, "loss": 0.77149177, "num_input_tokens_seen": 184656125, "step": 8589, "time_per_iteration": 2.8228790760040283 }, { "auxiliary_loss_clip": 0.01117201, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.04574263, "balance_loss_mlp": 1.01678646, "epoch": 0.5164587404178566, "flos": 21322679166720.0, "grad_norm": 1.76753328713407, "language_loss": 0.67530292, "learning_rate": 1.990458225001627e-06, "loss": 0.69677365, "num_input_tokens_seen": 184675920, "step": 8590, "time_per_iteration": 2.6443076133728027 }, { "auxiliary_loss_clip": 0.0104106, "auxiliary_loss_mlp": 0.01004207, "balance_loss_clip": 1.02416718, "balance_loss_mlp": 1.00274086, "epoch": 0.5165188636705246, "flos": 68057149691520.0, "grad_norm": 1.576071766619913, "language_loss": 0.55832803, "learning_rate": 1.990068767935895e-06, "loss": 0.57878071, "num_input_tokens_seen": 184730520, "step": 8591, "time_per_iteration": 3.062364101409912 }, { "auxiliary_loss_clip": 0.01096175, "auxiliary_loss_mlp": 0.0102813, "balance_loss_clip": 1.04139185, "balance_loss_mlp": 1.01549983, "epoch": 0.5165789869231926, "flos": 19385657412480.0, "grad_norm": 1.5710435869577224, "language_loss": 0.81707442, "learning_rate": 1.9896793112467566e-06, "loss": 0.83831745, "num_input_tokens_seen": 184748340, "step": 8592, "time_per_iteration": 2.6631641387939453 }, { "auxiliary_loss_clip": 0.01108366, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.04346967, "balance_loss_mlp": 1.01837873, "epoch": 0.5166391101758605, "flos": 20960197067520.0, "grad_norm": 2.447309188835127, "language_loss": 0.83472121, "learning_rate": 1.989289854948979e-06, "loss": 0.85612202, "num_input_tokens_seen": 184766615, "step": 8593, "time_per_iteration": 2.6486148834228516 }, { "auxiliary_loss_clip": 0.01097046, "auxiliary_loss_mlp": 0.01044386, "balance_loss_clip": 1.04197097, "balance_loss_mlp": 1.02946699, "epoch": 0.5166992334285285, "flos": 29462407148160.0, "grad_norm": 2.3092045349550374, "language_loss": 0.69423366, "learning_rate": 1.9889003990573314e-06, "loss": 0.71564794, "num_input_tokens_seen": 184788075, "step": 8594, "time_per_iteration": 2.7182230949401855 }, { "auxiliary_loss_clip": 0.01082123, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.04193354, "balance_loss_mlp": 1.01663446, "epoch": 0.5167593566811964, "flos": 20304360593280.0, "grad_norm": 1.4197237581629922, "language_loss": 0.77434355, "learning_rate": 1.988510943586582e-06, "loss": 0.79547119, "num_input_tokens_seen": 184808710, "step": 8595, "time_per_iteration": 2.7374019622802734 }, { "auxiliary_loss_clip": 0.01123588, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.0457046, "balance_loss_mlp": 1.02551079, "epoch": 0.5168194799338645, "flos": 14611370313600.0, "grad_norm": 1.5026096017220443, "language_loss": 0.650635, "learning_rate": 1.9881214885514986e-06, "loss": 0.67226291, "num_input_tokens_seen": 184826475, "step": 8596, "time_per_iteration": 2.581263542175293 }, { "auxiliary_loss_clip": 0.01083842, "auxiliary_loss_mlp": 0.01032453, "balance_loss_clip": 1.0427258, "balance_loss_mlp": 1.01740873, "epoch": 0.5168796031865324, "flos": 25007257411200.0, "grad_norm": 1.5566562133380693, "language_loss": 0.75481033, "learning_rate": 1.9877320339668492e-06, "loss": 0.77597326, "num_input_tokens_seen": 184845245, "step": 8597, "time_per_iteration": 2.741926670074463 }, { "auxiliary_loss_clip": 0.01124007, "auxiliary_loss_mlp": 0.01026784, "balance_loss_clip": 1.04456997, "balance_loss_mlp": 1.01349235, "epoch": 0.5169397264392004, "flos": 26939969533440.0, "grad_norm": 1.5821649734534613, "language_loss": 0.81177652, "learning_rate": 1.987342579847403e-06, "loss": 0.83328438, "num_input_tokens_seen": 184866605, "step": 8598, "time_per_iteration": 2.690035343170166 }, { "auxiliary_loss_clip": 0.01071801, "auxiliary_loss_mlp": 0.01046328, "balance_loss_clip": 1.03745472, "balance_loss_mlp": 1.03122449, "epoch": 0.5169998496918683, "flos": 25407804948480.0, "grad_norm": 1.4930779887062733, "language_loss": 0.75179017, "learning_rate": 1.9869531262079273e-06, "loss": 0.77297151, "num_input_tokens_seen": 184886945, "step": 8599, "time_per_iteration": 2.8392081260681152 }, { "auxiliary_loss_clip": 0.01105064, "auxiliary_loss_mlp": 0.01033083, "balance_loss_clip": 1.04534984, "balance_loss_mlp": 1.02013683, "epoch": 0.5170599729445363, "flos": 24680793674880.0, "grad_norm": 2.7626803107212825, "language_loss": 0.72095126, "learning_rate": 1.9865636730631904e-06, "loss": 0.7423327, "num_input_tokens_seen": 184905590, "step": 8600, "time_per_iteration": 4.393568515777588 }, { "auxiliary_loss_clip": 0.01085277, "auxiliary_loss_mlp": 0.01034751, "balance_loss_clip": 1.03932548, "balance_loss_mlp": 1.02074337, "epoch": 0.5171200961972042, "flos": 20994455664000.0, "grad_norm": 1.381905387614244, "language_loss": 0.73886168, "learning_rate": 1.9861742204279602e-06, "loss": 0.76006198, "num_input_tokens_seen": 184925555, "step": 8601, "time_per_iteration": 2.7736306190490723 }, { "auxiliary_loss_clip": 0.01114158, "auxiliary_loss_mlp": 0.01040835, "balance_loss_clip": 1.04510868, "balance_loss_mlp": 1.02620816, "epoch": 0.5171802194498722, "flos": 22745639427840.0, "grad_norm": 2.1013626788591817, "language_loss": 0.83703583, "learning_rate": 1.9857847683170045e-06, "loss": 0.85858572, "num_input_tokens_seen": 184944490, "step": 8602, "time_per_iteration": 4.306191444396973 }, { "auxiliary_loss_clip": 0.01124659, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.04496753, "balance_loss_mlp": 1.01937509, "epoch": 0.5172403427025402, "flos": 28176732668160.0, "grad_norm": 1.7451034136925476, "language_loss": 0.74647379, "learning_rate": 1.9853953167450926e-06, "loss": 0.76805902, "num_input_tokens_seen": 184963190, "step": 8603, "time_per_iteration": 2.73425030708313 }, { "auxiliary_loss_clip": 0.01101467, "auxiliary_loss_mlp": 0.01037433, "balance_loss_clip": 1.04518127, "balance_loss_mlp": 1.02431369, "epoch": 0.5173004659552082, "flos": 20337829090560.0, "grad_norm": 2.1792209860390503, "language_loss": 0.72349811, "learning_rate": 1.9850058657269915e-06, "loss": 0.74488711, "num_input_tokens_seen": 184981220, "step": 8604, "time_per_iteration": 2.740248441696167 }, { "auxiliary_loss_clip": 0.01107237, "auxiliary_loss_mlp": 0.01042176, "balance_loss_clip": 1.04422593, "balance_loss_mlp": 1.02716208, "epoch": 0.5173605892078762, "flos": 19063323740160.0, "grad_norm": 1.7719196350127329, "language_loss": 0.85052991, "learning_rate": 1.984616415277469e-06, "loss": 0.87202406, "num_input_tokens_seen": 184998810, "step": 8605, "time_per_iteration": 4.264687538146973 }, { "auxiliary_loss_clip": 0.01107777, "auxiliary_loss_mlp": 0.01027945, "balance_loss_clip": 1.04396403, "balance_loss_mlp": 1.01552308, "epoch": 0.5174207124605441, "flos": 27995168396160.0, "grad_norm": 1.6794634480750013, "language_loss": 0.64467752, "learning_rate": 1.984226965411294e-06, "loss": 0.6660347, "num_input_tokens_seen": 185021185, "step": 8606, "time_per_iteration": 2.7390646934509277 }, { "auxiliary_loss_clip": 0.01096289, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.04330635, "balance_loss_mlp": 1.01885414, "epoch": 0.5174808357132121, "flos": 19496657416320.0, "grad_norm": 1.503605725156866, "language_loss": 0.77918422, "learning_rate": 1.983837516143234e-06, "loss": 0.80046678, "num_input_tokens_seen": 185038465, "step": 8607, "time_per_iteration": 2.718864917755127 }, { "auxiliary_loss_clip": 0.01114878, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.04531431, "balance_loss_mlp": 1.0226177, "epoch": 0.51754095896588, "flos": 22784171742720.0, "grad_norm": 2.7158797821524585, "language_loss": 0.72334993, "learning_rate": 1.983448067488057e-06, "loss": 0.74486864, "num_input_tokens_seen": 185057340, "step": 8608, "time_per_iteration": 2.767817258834839 }, { "auxiliary_loss_clip": 0.01119837, "auxiliary_loss_mlp": 0.01034295, "balance_loss_clip": 1.04469681, "balance_loss_mlp": 1.01979923, "epoch": 0.5176010822185481, "flos": 22669257156480.0, "grad_norm": 1.8609844806921267, "language_loss": 0.8623482, "learning_rate": 1.983058619460531e-06, "loss": 0.88388956, "num_input_tokens_seen": 185074935, "step": 8609, "time_per_iteration": 2.8063855171203613 }, { "auxiliary_loss_clip": 0.01111694, "auxiliary_loss_mlp": 0.01037765, "balance_loss_clip": 1.04306316, "balance_loss_mlp": 1.02484906, "epoch": 0.517661205471216, "flos": 23951196622080.0, "grad_norm": 2.050130502752804, "language_loss": 0.73473549, "learning_rate": 1.9826691720754237e-06, "loss": 0.75623012, "num_input_tokens_seen": 185095050, "step": 8610, "time_per_iteration": 2.740083694458008 }, { "auxiliary_loss_clip": 0.01129954, "auxiliary_loss_mlp": 0.01038598, "balance_loss_clip": 1.04616904, "balance_loss_mlp": 1.02353036, "epoch": 0.517721328723884, "flos": 15596076735360.0, "grad_norm": 2.3590336184711926, "language_loss": 0.67205131, "learning_rate": 1.9822797253475034e-06, "loss": 0.69373685, "num_input_tokens_seen": 185112275, "step": 8611, "time_per_iteration": 2.648165464401245 }, { "auxiliary_loss_clip": 0.01122336, "auxiliary_loss_mlp": 0.01039403, "balance_loss_clip": 1.0434556, "balance_loss_mlp": 1.02535403, "epoch": 0.5177814519765519, "flos": 20960197067520.0, "grad_norm": 2.3905761842565485, "language_loss": 0.77420157, "learning_rate": 1.9818902792915373e-06, "loss": 0.79581904, "num_input_tokens_seen": 185132165, "step": 8612, "time_per_iteration": 2.663339376449585 }, { "auxiliary_loss_clip": 0.01114318, "auxiliary_loss_mlp": 0.01040798, "balance_loss_clip": 1.04297137, "balance_loss_mlp": 1.02688015, "epoch": 0.5178415752292199, "flos": 17967832796160.0, "grad_norm": 2.1474229546439174, "language_loss": 0.8168264, "learning_rate": 1.981500833922294e-06, "loss": 0.83837759, "num_input_tokens_seen": 185151025, "step": 8613, "time_per_iteration": 2.6589057445526123 }, { "auxiliary_loss_clip": 0.01128171, "auxiliary_loss_mlp": 0.01042961, "balance_loss_clip": 1.04804301, "balance_loss_mlp": 1.02832222, "epoch": 0.5179016984818878, "flos": 17821496787840.0, "grad_norm": 2.274335348251239, "language_loss": 0.66216785, "learning_rate": 1.981111389254541e-06, "loss": 0.6838792, "num_input_tokens_seen": 185168455, "step": 8614, "time_per_iteration": 2.692133903503418 }, { "auxiliary_loss_clip": 0.01100612, "auxiliary_loss_mlp": 0.01034486, "balance_loss_clip": 1.04462051, "balance_loss_mlp": 1.01982355, "epoch": 0.5179618217345558, "flos": 17820455293440.0, "grad_norm": 2.0015033819610055, "language_loss": 0.8693983, "learning_rate": 1.9807219453030453e-06, "loss": 0.89074928, "num_input_tokens_seen": 185184415, "step": 8615, "time_per_iteration": 2.690483808517456 }, { "auxiliary_loss_clip": 0.01113112, "auxiliary_loss_mlp": 0.01044655, "balance_loss_clip": 1.04499412, "balance_loss_mlp": 1.03147638, "epoch": 0.5180219449872238, "flos": 22522131048960.0, "grad_norm": 1.8105595259457619, "language_loss": 0.8084923, "learning_rate": 1.9803325020825763e-06, "loss": 0.83007002, "num_input_tokens_seen": 185202910, "step": 8616, "time_per_iteration": 2.6410508155822754 }, { "auxiliary_loss_clip": 0.01120148, "auxiliary_loss_mlp": 0.00772211, "balance_loss_clip": 1.04987717, "balance_loss_mlp": 1.00035763, "epoch": 0.5180820682398918, "flos": 23915465568000.0, "grad_norm": 2.1203191332986675, "language_loss": 0.75104189, "learning_rate": 1.9799430596079e-06, "loss": 0.76996547, "num_input_tokens_seen": 185223085, "step": 8617, "time_per_iteration": 2.6979870796203613 }, { "auxiliary_loss_clip": 0.01126304, "auxiliary_loss_mlp": 0.01042481, "balance_loss_clip": 1.04557788, "balance_loss_mlp": 1.02717435, "epoch": 0.5181421914925598, "flos": 16979930064000.0, "grad_norm": 1.6549706674723104, "language_loss": 0.70240247, "learning_rate": 1.979553617893785e-06, "loss": 0.72409028, "num_input_tokens_seen": 185241295, "step": 8618, "time_per_iteration": 2.6166911125183105 }, { "auxiliary_loss_clip": 0.01038523, "auxiliary_loss_mlp": 0.01004843, "balance_loss_clip": 1.02117562, "balance_loss_mlp": 1.00342429, "epoch": 0.5182023147452277, "flos": 66059870872320.0, "grad_norm": 0.9503620431523022, "language_loss": 0.67223799, "learning_rate": 1.979164176954999e-06, "loss": 0.69267166, "num_input_tokens_seen": 185298295, "step": 8619, "time_per_iteration": 3.186922550201416 }, { "auxiliary_loss_clip": 0.01079843, "auxiliary_loss_mlp": 0.01035858, "balance_loss_clip": 1.04400134, "balance_loss_mlp": 1.02230954, "epoch": 0.5182624379978957, "flos": 18187749815040.0, "grad_norm": 1.8983764009380637, "language_loss": 0.79863739, "learning_rate": 1.97877473680631e-06, "loss": 0.8197943, "num_input_tokens_seen": 185317000, "step": 8620, "time_per_iteration": 2.8446528911590576 }, { "auxiliary_loss_clip": 0.01060893, "auxiliary_loss_mlp": 0.00772403, "balance_loss_clip": 1.04089034, "balance_loss_mlp": 1.00029039, "epoch": 0.5183225612505636, "flos": 14026708638720.0, "grad_norm": 2.0819192927399586, "language_loss": 0.82402205, "learning_rate": 1.9783852974624846e-06, "loss": 0.84235501, "num_input_tokens_seen": 185331185, "step": 8621, "time_per_iteration": 2.753957509994507 }, { "auxiliary_loss_clip": 0.01097265, "auxiliary_loss_mlp": 0.010405, "balance_loss_clip": 1.03958249, "balance_loss_mlp": 1.02750611, "epoch": 0.5183826845032317, "flos": 23659781581440.0, "grad_norm": 2.428940739700658, "language_loss": 0.65491748, "learning_rate": 1.9779958589382905e-06, "loss": 0.67629516, "num_input_tokens_seen": 185348955, "step": 8622, "time_per_iteration": 2.7421741485595703 }, { "auxiliary_loss_clip": 0.01106105, "auxiliary_loss_mlp": 0.01044986, "balance_loss_clip": 1.04371572, "balance_loss_mlp": 1.03016257, "epoch": 0.5184428077558996, "flos": 15888605097600.0, "grad_norm": 2.083884784089921, "language_loss": 0.60552382, "learning_rate": 1.977606421248497e-06, "loss": 0.62703472, "num_input_tokens_seen": 185367330, "step": 8623, "time_per_iteration": 2.690345048904419 }, { "auxiliary_loss_clip": 0.0112578, "auxiliary_loss_mlp": 0.01032047, "balance_loss_clip": 1.04534173, "balance_loss_mlp": 1.01890421, "epoch": 0.5185029310085676, "flos": 21030833162880.0, "grad_norm": 1.609281256747452, "language_loss": 0.76150465, "learning_rate": 1.9772169844078685e-06, "loss": 0.78308284, "num_input_tokens_seen": 185385060, "step": 8624, "time_per_iteration": 2.613788366317749 }, { "auxiliary_loss_clip": 0.0107795, "auxiliary_loss_mlp": 0.01043066, "balance_loss_clip": 1.03900456, "balance_loss_mlp": 1.02859426, "epoch": 0.5185630542612355, "flos": 26542690133760.0, "grad_norm": 2.373822325498003, "language_loss": 0.70952767, "learning_rate": 1.9768275484311756e-06, "loss": 0.73073781, "num_input_tokens_seen": 185403745, "step": 8625, "time_per_iteration": 2.7548205852508545 }, { "auxiliary_loss_clip": 0.01100948, "auxiliary_loss_mlp": 0.0103515, "balance_loss_clip": 1.04119349, "balance_loss_mlp": 1.02260327, "epoch": 0.5186231775139035, "flos": 20668422890880.0, "grad_norm": 1.9009704883002407, "language_loss": 0.67718256, "learning_rate": 1.976438113333184e-06, "loss": 0.69854349, "num_input_tokens_seen": 185422620, "step": 8626, "time_per_iteration": 2.731328248977661 }, { "auxiliary_loss_clip": 0.0111085, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.04271841, "balance_loss_mlp": 1.02022982, "epoch": 0.5186833007665714, "flos": 20885502735360.0, "grad_norm": 1.960489278080422, "language_loss": 0.70780122, "learning_rate": 1.9760486791286612e-06, "loss": 0.72924662, "num_input_tokens_seen": 185439380, "step": 8627, "time_per_iteration": 2.6464414596557617 }, { "auxiliary_loss_clip": 0.011279, "auxiliary_loss_mlp": 0.00772067, "balance_loss_clip": 1.04576206, "balance_loss_mlp": 1.00029826, "epoch": 0.5187434240192395, "flos": 20886903365760.0, "grad_norm": 2.0333805073835007, "language_loss": 0.7303592, "learning_rate": 1.9756592458323753e-06, "loss": 0.74935889, "num_input_tokens_seen": 185458830, "step": 8628, "time_per_iteration": 2.7327346801757812 }, { "auxiliary_loss_clip": 0.01102356, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.04561651, "balance_loss_mlp": 1.01927686, "epoch": 0.5188035472719074, "flos": 19859929614720.0, "grad_norm": 1.6190117042724865, "language_loss": 0.77354944, "learning_rate": 1.9752698134590927e-06, "loss": 0.79489267, "num_input_tokens_seen": 185477270, "step": 8629, "time_per_iteration": 2.77992582321167 }, { "auxiliary_loss_clip": 0.01115143, "auxiliary_loss_mlp": 0.01034186, "balance_loss_clip": 1.04428935, "balance_loss_mlp": 1.01932621, "epoch": 0.5188636705245754, "flos": 21138313633920.0, "grad_norm": 2.228815370750346, "language_loss": 0.75078702, "learning_rate": 1.9748803820235815e-06, "loss": 0.77228034, "num_input_tokens_seen": 185495795, "step": 8630, "time_per_iteration": 2.6749987602233887 }, { "auxiliary_loss_clip": 0.01112188, "auxiliary_loss_mlp": 0.01038971, "balance_loss_clip": 1.04358792, "balance_loss_mlp": 1.02446306, "epoch": 0.5189237937772434, "flos": 22419786222720.0, "grad_norm": 2.002083188679526, "language_loss": 0.80665708, "learning_rate": 1.9744909515406093e-06, "loss": 0.82816863, "num_input_tokens_seen": 185514885, "step": 8631, "time_per_iteration": 2.7432682514190674 }, { "auxiliary_loss_clip": 0.01114617, "auxiliary_loss_mlp": 0.01034953, "balance_loss_clip": 1.04478788, "balance_loss_mlp": 1.02031374, "epoch": 0.5189839170299113, "flos": 25446696399360.0, "grad_norm": 1.4933919289773454, "language_loss": 0.74756616, "learning_rate": 1.974101522024942e-06, "loss": 0.76906186, "num_input_tokens_seen": 185537155, "step": 8632, "time_per_iteration": 2.726018190383911 }, { "auxiliary_loss_clip": 0.01093075, "auxiliary_loss_mlp": 0.01033441, "balance_loss_clip": 1.04612803, "balance_loss_mlp": 1.01946926, "epoch": 0.5190440402825793, "flos": 18587722734720.0, "grad_norm": 1.8814471450767234, "language_loss": 0.78911304, "learning_rate": 1.9737120934913477e-06, "loss": 0.81037819, "num_input_tokens_seen": 185555520, "step": 8633, "time_per_iteration": 2.715510606765747 }, { "auxiliary_loss_clip": 0.0111596, "auxiliary_loss_mlp": 0.01028973, "balance_loss_clip": 1.04581857, "balance_loss_mlp": 1.01619983, "epoch": 0.5191041635352472, "flos": 21908633731200.0, "grad_norm": 5.606824878452593, "language_loss": 0.80551088, "learning_rate": 1.9733226659545936e-06, "loss": 0.82696015, "num_input_tokens_seen": 185573855, "step": 8634, "time_per_iteration": 2.6477181911468506 }, { "auxiliary_loss_clip": 0.01122619, "auxiliary_loss_mlp": 0.0103901, "balance_loss_clip": 1.04603028, "balance_loss_mlp": 1.02571273, "epoch": 0.5191642867879153, "flos": 27527971173120.0, "grad_norm": 1.5734156514364543, "language_loss": 0.69467652, "learning_rate": 1.9729332394294467e-06, "loss": 0.71629286, "num_input_tokens_seen": 185595145, "step": 8635, "time_per_iteration": 2.713585615158081 }, { "auxiliary_loss_clip": 0.01102259, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.0431217, "balance_loss_mlp": 1.02210498, "epoch": 0.5192244100405832, "flos": 15705999331200.0, "grad_norm": 1.6343728145872918, "language_loss": 0.77876496, "learning_rate": 1.9725438139306742e-06, "loss": 0.80014527, "num_input_tokens_seen": 185613320, "step": 8636, "time_per_iteration": 2.6876139640808105 }, { "auxiliary_loss_clip": 0.01127572, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.04695189, "balance_loss_mlp": 1.01938009, "epoch": 0.5192845332932512, "flos": 12057080313600.0, "grad_norm": 2.1121159964360596, "language_loss": 0.71433318, "learning_rate": 1.9721543894730425e-06, "loss": 0.73593867, "num_input_tokens_seen": 185630730, "step": 8637, "time_per_iteration": 2.6093368530273438 }, { "auxiliary_loss_clip": 0.01088299, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.04357982, "balance_loss_mlp": 1.01999319, "epoch": 0.5193446565459191, "flos": 18953185662720.0, "grad_norm": 2.05486546466365, "language_loss": 0.76026344, "learning_rate": 1.9717649660713194e-06, "loss": 0.78147888, "num_input_tokens_seen": 185648515, "step": 8638, "time_per_iteration": 2.680696725845337 }, { "auxiliary_loss_clip": 0.0109108, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 1.04291189, "balance_loss_mlp": 1.01578116, "epoch": 0.5194047797985871, "flos": 20374960775040.0, "grad_norm": 13.373516582231533, "language_loss": 0.74382144, "learning_rate": 1.971375543740272e-06, "loss": 0.7650196, "num_input_tokens_seen": 185665220, "step": 8639, "time_per_iteration": 4.318557500839233 }, { "auxiliary_loss_clip": 0.01123361, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.04529893, "balance_loss_mlp": 1.01838636, "epoch": 0.519464903051255, "flos": 24353001135360.0, "grad_norm": 1.5657899745454023, "language_loss": 0.77311909, "learning_rate": 1.9709861224946665e-06, "loss": 0.79467607, "num_input_tokens_seen": 185683750, "step": 8640, "time_per_iteration": 2.5864639282226562 }, { "auxiliary_loss_clip": 0.01082849, "auxiliary_loss_mlp": 0.0103216, "balance_loss_clip": 1.04260516, "balance_loss_mlp": 1.01930904, "epoch": 0.519525026303923, "flos": 14061829161600.0, "grad_norm": 2.0170540453425714, "language_loss": 0.66183293, "learning_rate": 1.97059670234927e-06, "loss": 0.68298292, "num_input_tokens_seen": 185700625, "step": 8641, "time_per_iteration": 2.692979574203491 }, { "auxiliary_loss_clip": 0.01123177, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.04594493, "balance_loss_mlp": 1.02172363, "epoch": 0.519585149556591, "flos": 28835873193600.0, "grad_norm": 1.7554954360005686, "language_loss": 0.76535702, "learning_rate": 1.97020728331885e-06, "loss": 0.78693068, "num_input_tokens_seen": 185721155, "step": 8642, "time_per_iteration": 5.96128249168396 }, { "auxiliary_loss_clip": 0.0112288, "auxiliary_loss_mlp": 0.01031224, "balance_loss_clip": 1.04584873, "balance_loss_mlp": 1.01806307, "epoch": 0.519645272809259, "flos": 25373007648000.0, "grad_norm": 2.255175934024536, "language_loss": 0.83165199, "learning_rate": 1.9698178654181726e-06, "loss": 0.85319304, "num_input_tokens_seen": 185740990, "step": 8643, "time_per_iteration": 2.81384539604187 }, { "auxiliary_loss_clip": 0.01126122, "auxiliary_loss_mlp": 0.01041822, "balance_loss_clip": 1.04520261, "balance_loss_mlp": 1.02785623, "epoch": 0.519705396061927, "flos": 25372863993600.0, "grad_norm": 2.2020503225508645, "language_loss": 0.7044059, "learning_rate": 1.969428448662004e-06, "loss": 0.72608531, "num_input_tokens_seen": 185762235, "step": 8644, "time_per_iteration": 2.7107033729553223 }, { "auxiliary_loss_clip": 0.01111108, "auxiliary_loss_mlp": 0.00770711, "balance_loss_clip": 1.04354811, "balance_loss_mlp": 1.00015676, "epoch": 0.5197655193145949, "flos": 28476228268800.0, "grad_norm": 1.5309653957313616, "language_loss": 0.80272603, "learning_rate": 1.9690390330651133e-06, "loss": 0.82154423, "num_input_tokens_seen": 185783415, "step": 8645, "time_per_iteration": 4.246826171875 }, { "auxiliary_loss_clip": 0.01122573, "auxiliary_loss_mlp": 0.01033869, "balance_loss_clip": 1.04362488, "balance_loss_mlp": 1.02058911, "epoch": 0.5198256425672629, "flos": 20009138711040.0, "grad_norm": 1.7778396167930446, "language_loss": 0.7800498, "learning_rate": 1.968649618642264e-06, "loss": 0.80161417, "num_input_tokens_seen": 185801345, "step": 8646, "time_per_iteration": 2.630892276763916 }, { "auxiliary_loss_clip": 0.01117401, "auxiliary_loss_mlp": 0.01035003, "balance_loss_clip": 1.04832959, "balance_loss_mlp": 1.02218235, "epoch": 0.5198857658199308, "flos": 19828867328640.0, "grad_norm": 1.6794769864367036, "language_loss": 0.65647638, "learning_rate": 1.9682602054082252e-06, "loss": 0.67800039, "num_input_tokens_seen": 185820815, "step": 8647, "time_per_iteration": 2.6543033123016357 }, { "auxiliary_loss_clip": 0.01127292, "auxiliary_loss_mlp": 0.01036653, "balance_loss_clip": 1.04618931, "balance_loss_mlp": 1.02208591, "epoch": 0.5199458890725989, "flos": 24461918150400.0, "grad_norm": 1.7193073170603235, "language_loss": 0.71425897, "learning_rate": 1.967870793377763e-06, "loss": 0.73589844, "num_input_tokens_seen": 185841450, "step": 8648, "time_per_iteration": 2.6632113456726074 }, { "auxiliary_loss_clip": 0.0110717, "auxiliary_loss_mlp": 0.01035133, "balance_loss_clip": 1.0474503, "balance_loss_mlp": 1.02016664, "epoch": 0.5200060123252668, "flos": 23404779953280.0, "grad_norm": 2.0932120653926853, "language_loss": 0.64383608, "learning_rate": 1.967481382565642e-06, "loss": 0.66525912, "num_input_tokens_seen": 185859935, "step": 8649, "time_per_iteration": 2.708676815032959 }, { "auxiliary_loss_clip": 0.01101881, "auxiliary_loss_mlp": 0.01035641, "balance_loss_clip": 1.04480278, "balance_loss_mlp": 1.02039409, "epoch": 0.5200661355779348, "flos": 17201355454080.0, "grad_norm": 2.0779038173518978, "language_loss": 0.70331943, "learning_rate": 1.9670919729866315e-06, "loss": 0.72469461, "num_input_tokens_seen": 185876795, "step": 8650, "time_per_iteration": 2.650996446609497 }, { "auxiliary_loss_clip": 0.01123307, "auxiliary_loss_mlp": 0.01030812, "balance_loss_clip": 1.04483724, "balance_loss_mlp": 1.01754415, "epoch": 0.5201262588306027, "flos": 18515075477760.0, "grad_norm": 1.793577075652819, "language_loss": 0.77560079, "learning_rate": 1.966702564655496e-06, "loss": 0.79714197, "num_input_tokens_seen": 185895570, "step": 8651, "time_per_iteration": 2.6181790828704834 }, { "auxiliary_loss_clip": 0.01068752, "auxiliary_loss_mlp": 0.01040289, "balance_loss_clip": 1.04241145, "balance_loss_mlp": 1.02557862, "epoch": 0.5201863820832707, "flos": 18619395552000.0, "grad_norm": 1.579276828195563, "language_loss": 0.78716815, "learning_rate": 1.966313157587003e-06, "loss": 0.80825853, "num_input_tokens_seen": 185913700, "step": 8652, "time_per_iteration": 2.81169056892395 }, { "auxiliary_loss_clip": 0.01087589, "auxiliary_loss_mlp": 0.0103997, "balance_loss_clip": 1.04238617, "balance_loss_mlp": 1.02496183, "epoch": 0.5202465053359386, "flos": 22857142222080.0, "grad_norm": 2.456126746607985, "language_loss": 0.70069832, "learning_rate": 1.9659237517959187e-06, "loss": 0.7219739, "num_input_tokens_seen": 185932460, "step": 8653, "time_per_iteration": 2.8110082149505615 }, { "auxiliary_loss_clip": 0.01094035, "auxiliary_loss_mlp": 0.01042704, "balance_loss_clip": 1.04702687, "balance_loss_mlp": 1.02864337, "epoch": 0.5203066285886067, "flos": 21981532383360.0, "grad_norm": 1.546190224311193, "language_loss": 0.78555804, "learning_rate": 1.965534347297008e-06, "loss": 0.80692542, "num_input_tokens_seen": 185952030, "step": 8654, "time_per_iteration": 2.8240180015563965 }, { "auxiliary_loss_clip": 0.01115002, "auxiliary_loss_mlp": 0.01046231, "balance_loss_clip": 1.04417038, "balance_loss_mlp": 1.03130579, "epoch": 0.5203667518412746, "flos": 20233329448320.0, "grad_norm": 1.7757606906195533, "language_loss": 0.84137118, "learning_rate": 1.9651449441050393e-06, "loss": 0.86298347, "num_input_tokens_seen": 185973130, "step": 8655, "time_per_iteration": 2.767338752746582 }, { "auxiliary_loss_clip": 0.01113773, "auxiliary_loss_mlp": 0.01038813, "balance_loss_clip": 1.04705739, "balance_loss_mlp": 1.02643943, "epoch": 0.5204268750939426, "flos": 15705460627200.0, "grad_norm": 2.3853440972465227, "language_loss": 0.66374946, "learning_rate": 1.9647555422347777e-06, "loss": 0.68527532, "num_input_tokens_seen": 185990200, "step": 8656, "time_per_iteration": 2.6653099060058594 }, { "auxiliary_loss_clip": 0.01083984, "auxiliary_loss_mlp": 0.01043204, "balance_loss_clip": 1.04517853, "balance_loss_mlp": 1.02981043, "epoch": 0.5204869983466105, "flos": 27449469999360.0, "grad_norm": 1.9804929730339849, "language_loss": 0.73262924, "learning_rate": 1.9643661417009893e-06, "loss": 0.75390112, "num_input_tokens_seen": 186009880, "step": 8657, "time_per_iteration": 2.8447728157043457 }, { "auxiliary_loss_clip": 0.01091042, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.0432241, "balance_loss_mlp": 1.02489877, "epoch": 0.5205471215992785, "flos": 20595452411520.0, "grad_norm": 1.769785544944644, "language_loss": 0.71705246, "learning_rate": 1.9639767425184408e-06, "loss": 0.73835564, "num_input_tokens_seen": 186026680, "step": 8658, "time_per_iteration": 2.8423781394958496 }, { "auxiliary_loss_clip": 0.01123437, "auxiliary_loss_mlp": 0.01039751, "balance_loss_clip": 1.04425454, "balance_loss_mlp": 1.02607751, "epoch": 0.5206072448519465, "flos": 22127904305280.0, "grad_norm": 1.7936056694778655, "language_loss": 0.83181685, "learning_rate": 1.963587344701897e-06, "loss": 0.85344875, "num_input_tokens_seen": 186046920, "step": 8659, "time_per_iteration": 2.662799596786499 }, { "auxiliary_loss_clip": 0.01103478, "auxiliary_loss_mlp": 0.01045743, "balance_loss_clip": 1.043998, "balance_loss_mlp": 1.02959061, "epoch": 0.5206673681046144, "flos": 18330422636160.0, "grad_norm": 1.9906097398392346, "language_loss": 0.75777173, "learning_rate": 1.9631979482661253e-06, "loss": 0.77926397, "num_input_tokens_seen": 186062090, "step": 8660, "time_per_iteration": 2.6635682582855225 }, { "auxiliary_loss_clip": 0.01123245, "auxiliary_loss_mlp": 0.01039579, "balance_loss_clip": 1.04523396, "balance_loss_mlp": 1.02638865, "epoch": 0.5207274913572825, "flos": 20230240878720.0, "grad_norm": 1.836365427627734, "language_loss": 0.77897781, "learning_rate": 1.9628085532258906e-06, "loss": 0.80060601, "num_input_tokens_seen": 186081135, "step": 8661, "time_per_iteration": 2.6036980152130127 }, { "auxiliary_loss_clip": 0.01101785, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.04206395, "balance_loss_mlp": 1.02354193, "epoch": 0.5207876146099504, "flos": 22127042378880.0, "grad_norm": 1.6821546298299666, "language_loss": 0.70456815, "learning_rate": 1.9624191595959603e-06, "loss": 0.72595346, "num_input_tokens_seen": 186099700, "step": 8662, "time_per_iteration": 2.6941347122192383 }, { "auxiliary_loss_clip": 0.01108537, "auxiliary_loss_mlp": 0.01034478, "balance_loss_clip": 1.04286838, "balance_loss_mlp": 1.01910543, "epoch": 0.5208477378626184, "flos": 23878908501120.0, "grad_norm": 1.571076572398917, "language_loss": 0.69488823, "learning_rate": 1.962029767391098e-06, "loss": 0.71631837, "num_input_tokens_seen": 186119740, "step": 8663, "time_per_iteration": 2.648148536682129 }, { "auxiliary_loss_clip": 0.01096912, "auxiliary_loss_mlp": 0.00772823, "balance_loss_clip": 1.04340351, "balance_loss_mlp": 1.00029683, "epoch": 0.5209078611152863, "flos": 20961525870720.0, "grad_norm": 1.508064062466455, "language_loss": 0.77011776, "learning_rate": 1.961640376626072e-06, "loss": 0.78881508, "num_input_tokens_seen": 186140645, "step": 8664, "time_per_iteration": 2.713656187057495 }, { "auxiliary_loss_clip": 0.01099911, "auxiliary_loss_mlp": 0.01035751, "balance_loss_clip": 1.04555953, "balance_loss_mlp": 1.02207136, "epoch": 0.5209679843679543, "flos": 20667740532480.0, "grad_norm": 2.174055653698437, "language_loss": 0.76443201, "learning_rate": 1.961250987315646e-06, "loss": 0.78578866, "num_input_tokens_seen": 186160130, "step": 8665, "time_per_iteration": 2.6254820823669434 }, { "auxiliary_loss_clip": 0.0111827, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.04986227, "balance_loss_mlp": 1.02577186, "epoch": 0.5210281076206222, "flos": 20227295963520.0, "grad_norm": 1.6491776532454103, "language_loss": 0.72156572, "learning_rate": 1.960861599474586e-06, "loss": 0.74313289, "num_input_tokens_seen": 186179485, "step": 8666, "time_per_iteration": 2.680417060852051 }, { "auxiliary_loss_clip": 0.01108853, "auxiliary_loss_mlp": 0.01038135, "balance_loss_clip": 1.04408336, "balance_loss_mlp": 1.02222097, "epoch": 0.5210882308732903, "flos": 16069989801600.0, "grad_norm": 2.5838170040517583, "language_loss": 0.68477565, "learning_rate": 1.9604722131176592e-06, "loss": 0.70624554, "num_input_tokens_seen": 186197140, "step": 8667, "time_per_iteration": 2.665583372116089 }, { "auxiliary_loss_clip": 0.01089337, "auxiliary_loss_mlp": 0.01039011, "balance_loss_clip": 1.05282402, "balance_loss_mlp": 1.02584982, "epoch": 0.5211483541259582, "flos": 24825298089600.0, "grad_norm": 1.3808961063616443, "language_loss": 0.81199509, "learning_rate": 1.960082828259629e-06, "loss": 0.83327854, "num_input_tokens_seen": 186216800, "step": 8668, "time_per_iteration": 2.802410125732422 }, { "auxiliary_loss_clip": 0.01105597, "auxiliary_loss_mlp": 0.01031995, "balance_loss_clip": 1.04507339, "balance_loss_mlp": 1.01803613, "epoch": 0.5212084773786262, "flos": 20370651143040.0, "grad_norm": 2.086648647266329, "language_loss": 0.63722765, "learning_rate": 1.9596934449152623e-06, "loss": 0.65860361, "num_input_tokens_seen": 186235320, "step": 8669, "time_per_iteration": 2.681579113006592 }, { "auxiliary_loss_clip": 0.01102666, "auxiliary_loss_mlp": 0.00771955, "balance_loss_clip": 1.04595864, "balance_loss_mlp": 1.00027704, "epoch": 0.5212686006312941, "flos": 23145468693120.0, "grad_norm": 1.5766402887224458, "language_loss": 0.66502392, "learning_rate": 1.959304063099325e-06, "loss": 0.68377018, "num_input_tokens_seen": 186254460, "step": 8670, "time_per_iteration": 2.7425742149353027 }, { "auxiliary_loss_clip": 0.01085453, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.04303861, "balance_loss_mlp": 1.02063334, "epoch": 0.5213287238839621, "flos": 27774030314880.0, "grad_norm": 2.122031398938641, "language_loss": 0.76534224, "learning_rate": 1.9589146828265806e-06, "loss": 0.78653324, "num_input_tokens_seen": 186269465, "step": 8671, "time_per_iteration": 2.7530081272125244 }, { "auxiliary_loss_clip": 0.01096106, "auxiliary_loss_mlp": 0.01041463, "balance_loss_clip": 1.04865241, "balance_loss_mlp": 1.02665734, "epoch": 0.5213888471366301, "flos": 19937676602880.0, "grad_norm": 2.569347871916013, "language_loss": 0.78284293, "learning_rate": 1.958525304111796e-06, "loss": 0.80421865, "num_input_tokens_seen": 186288660, "step": 8672, "time_per_iteration": 2.7782974243164062 }, { "auxiliary_loss_clip": 0.01085385, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.04014993, "balance_loss_mlp": 1.02035856, "epoch": 0.521448970389298, "flos": 16982731324800.0, "grad_norm": 1.8835859039826313, "language_loss": 0.72004962, "learning_rate": 1.958135926969736e-06, "loss": 0.74123341, "num_input_tokens_seen": 186305760, "step": 8673, "time_per_iteration": 2.7094011306762695 }, { "auxiliary_loss_clip": 0.01108751, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.04249072, "balance_loss_mlp": 1.02049243, "epoch": 0.5215090936419661, "flos": 18989706816000.0, "grad_norm": 1.4914552209414809, "language_loss": 0.74901187, "learning_rate": 1.957746551415166e-06, "loss": 0.77044559, "num_input_tokens_seen": 186324135, "step": 8674, "time_per_iteration": 2.6582236289978027 }, { "auxiliary_loss_clip": 0.01097767, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.0421474, "balance_loss_mlp": 1.02030408, "epoch": 0.521569216894634, "flos": 16143427157760.0, "grad_norm": 2.0310628766426615, "language_loss": 0.86121237, "learning_rate": 1.9573571774628506e-06, "loss": 0.88254112, "num_input_tokens_seen": 186340205, "step": 8675, "time_per_iteration": 2.659674882888794 }, { "auxiliary_loss_clip": 0.01022959, "auxiliary_loss_mlp": 0.01006796, "balance_loss_clip": 1.01756668, "balance_loss_mlp": 1.00524664, "epoch": 0.521629340147302, "flos": 57579493282560.0, "grad_norm": 0.8681331347139113, "language_loss": 0.63129932, "learning_rate": 1.9569678051275556e-06, "loss": 0.65159684, "num_input_tokens_seen": 186396940, "step": 8676, "time_per_iteration": 3.205299139022827 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01030098, "balance_loss_clip": 1.04485834, "balance_loss_mlp": 1.0172416, "epoch": 0.5216894633999699, "flos": 26796901662720.0, "grad_norm": 1.5700830686566873, "language_loss": 0.68696839, "learning_rate": 1.956578434424046e-06, "loss": 0.70837998, "num_input_tokens_seen": 186418680, "step": 8677, "time_per_iteration": 2.7582013607025146 }, { "auxiliary_loss_clip": 0.0111011, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.04261422, "balance_loss_mlp": 1.01857519, "epoch": 0.5217495866526379, "flos": 26358719650560.0, "grad_norm": 1.8246312930355708, "language_loss": 0.65474886, "learning_rate": 1.956189065367086e-06, "loss": 0.67617249, "num_input_tokens_seen": 186438265, "step": 8678, "time_per_iteration": 4.216279029846191 }, { "auxiliary_loss_clip": 0.01101119, "auxiliary_loss_mlp": 0.01036814, "balance_loss_clip": 1.03927827, "balance_loss_mlp": 1.02188301, "epoch": 0.5218097099053058, "flos": 23584009841280.0, "grad_norm": 2.0476762683914287, "language_loss": 0.67981493, "learning_rate": 1.9557996979714414e-06, "loss": 0.70119429, "num_input_tokens_seen": 186456870, "step": 8679, "time_per_iteration": 2.7411186695098877 }, { "auxiliary_loss_clip": 0.01125585, "auxiliary_loss_mlp": 0.01038661, "balance_loss_clip": 1.04630351, "balance_loss_mlp": 1.02463043, "epoch": 0.5218698331579739, "flos": 18077396256000.0, "grad_norm": 1.6988813784316565, "language_loss": 0.66861475, "learning_rate": 1.9554103322518764e-06, "loss": 0.69025725, "num_input_tokens_seen": 186476425, "step": 8680, "time_per_iteration": 2.656953811645508 }, { "auxiliary_loss_clip": 0.0112586, "auxiliary_loss_mlp": 0.01039387, "balance_loss_clip": 1.04645705, "balance_loss_mlp": 1.02533197, "epoch": 0.5219299564106418, "flos": 19281121856640.0, "grad_norm": 2.024829019659845, "language_loss": 0.83280826, "learning_rate": 1.955020968223156e-06, "loss": 0.85446072, "num_input_tokens_seen": 186492555, "step": 8681, "time_per_iteration": 4.351206541061401 }, { "auxiliary_loss_clip": 0.01098299, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.0424881, "balance_loss_mlp": 1.02001929, "epoch": 0.5219900796633098, "flos": 26651355753600.0, "grad_norm": 2.0563808347758563, "language_loss": 0.77594543, "learning_rate": 1.9546316059000454e-06, "loss": 0.79726237, "num_input_tokens_seen": 186513190, "step": 8682, "time_per_iteration": 2.836205005645752 }, { "auxiliary_loss_clip": 0.01084257, "auxiliary_loss_mlp": 0.01048472, "balance_loss_clip": 1.03948176, "balance_loss_mlp": 1.03558517, "epoch": 0.5220502029159777, "flos": 34312717382400.0, "grad_norm": 1.4694894100116993, "language_loss": 0.68905342, "learning_rate": 1.9542422452973082e-06, "loss": 0.71038067, "num_input_tokens_seen": 186534830, "step": 8683, "time_per_iteration": 2.8703176975250244 }, { "auxiliary_loss_clip": 0.01091474, "auxiliary_loss_mlp": 0.01042368, "balance_loss_clip": 1.04399586, "balance_loss_mlp": 1.02824771, "epoch": 0.5221103261686457, "flos": 22156488552960.0, "grad_norm": 1.7170989726331638, "language_loss": 0.76116288, "learning_rate": 1.9538528864297104e-06, "loss": 0.78250128, "num_input_tokens_seen": 186554390, "step": 8684, "time_per_iteration": 2.8443922996520996 }, { "auxiliary_loss_clip": 0.0110091, "auxiliary_loss_mlp": 0.00771126, "balance_loss_clip": 1.0387888, "balance_loss_mlp": 1.00024819, "epoch": 0.5221704494213137, "flos": 19208402772480.0, "grad_norm": 1.8259321745961588, "language_loss": 0.75595027, "learning_rate": 1.9534635293120153e-06, "loss": 0.7746706, "num_input_tokens_seen": 186572360, "step": 8685, "time_per_iteration": 4.343646049499512 }, { "auxiliary_loss_clip": 0.01101598, "auxiliary_loss_mlp": 0.01041734, "balance_loss_clip": 1.04539514, "balance_loss_mlp": 1.02856123, "epoch": 0.5222305726739817, "flos": 19354056422400.0, "grad_norm": 1.8098495762940472, "language_loss": 0.80820441, "learning_rate": 1.9530741739589876e-06, "loss": 0.82963777, "num_input_tokens_seen": 186590655, "step": 8686, "time_per_iteration": 2.9524481296539307 }, { "auxiliary_loss_clip": 0.01102372, "auxiliary_loss_mlp": 0.01034624, "balance_loss_clip": 1.04477715, "balance_loss_mlp": 1.02207708, "epoch": 0.5222906959266497, "flos": 27814789272960.0, "grad_norm": 1.5584733304526452, "language_loss": 0.69955659, "learning_rate": 1.9526848203853927e-06, "loss": 0.72092646, "num_input_tokens_seen": 186610345, "step": 8687, "time_per_iteration": 2.8442130088806152 }, { "auxiliary_loss_clip": 0.01119347, "auxiliary_loss_mlp": 0.01033746, "balance_loss_clip": 1.04286504, "balance_loss_mlp": 1.02110982, "epoch": 0.5223508191793176, "flos": 12712988615040.0, "grad_norm": 2.218511460216324, "language_loss": 0.83229095, "learning_rate": 1.9522954686059936e-06, "loss": 0.85382187, "num_input_tokens_seen": 186624360, "step": 8688, "time_per_iteration": 2.6338348388671875 }, { "auxiliary_loss_clip": 0.01111374, "auxiliary_loss_mlp": 0.00771369, "balance_loss_clip": 1.04469848, "balance_loss_mlp": 1.00028682, "epoch": 0.5224109424319856, "flos": 15632238752640.0, "grad_norm": 2.3403806989505744, "language_loss": 0.73484588, "learning_rate": 1.9519061186355558e-06, "loss": 0.75367332, "num_input_tokens_seen": 186638680, "step": 8689, "time_per_iteration": 2.7219626903533936 }, { "auxiliary_loss_clip": 0.01098413, "auxiliary_loss_mlp": 0.01039301, "balance_loss_clip": 1.04080057, "balance_loss_mlp": 1.02569962, "epoch": 0.5224710656846535, "flos": 15742233175680.0, "grad_norm": 1.8348188856486891, "language_loss": 0.83713108, "learning_rate": 1.9515167704888417e-06, "loss": 0.85850823, "num_input_tokens_seen": 186655840, "step": 8690, "time_per_iteration": 2.7358436584472656 }, { "auxiliary_loss_clip": 0.01088108, "auxiliary_loss_mlp": 0.01042101, "balance_loss_clip": 1.04381537, "balance_loss_mlp": 1.0276053, "epoch": 0.5225311889373215, "flos": 26030998938240.0, "grad_norm": 2.015928049267595, "language_loss": 0.79080188, "learning_rate": 1.9511274241806173e-06, "loss": 0.81210393, "num_input_tokens_seen": 186674150, "step": 8691, "time_per_iteration": 2.813861131668091 }, { "auxiliary_loss_clip": 0.01120671, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.04700625, "balance_loss_mlp": 1.02552676, "epoch": 0.5225913121899894, "flos": 18369278173440.0, "grad_norm": 2.3023499072102194, "language_loss": 0.76491982, "learning_rate": 1.950738079725646e-06, "loss": 0.78652847, "num_input_tokens_seen": 186690675, "step": 8692, "time_per_iteration": 2.73480224609375 }, { "auxiliary_loss_clip": 0.01108877, "auxiliary_loss_mlp": 0.01039055, "balance_loss_clip": 1.04479527, "balance_loss_mlp": 1.02631116, "epoch": 0.5226514354426575, "flos": 29273516501760.0, "grad_norm": 1.6247734368015925, "language_loss": 0.72325015, "learning_rate": 1.950348737138691e-06, "loss": 0.7447294, "num_input_tokens_seen": 186710380, "step": 8693, "time_per_iteration": 2.782871723175049 }, { "auxiliary_loss_clip": 0.01126187, "auxiliary_loss_mlp": 0.01042643, "balance_loss_clip": 1.04384446, "balance_loss_mlp": 1.02753901, "epoch": 0.5227115586953254, "flos": 22853299466880.0, "grad_norm": 7.53216872329228, "language_loss": 0.8220976, "learning_rate": 1.949959396434517e-06, "loss": 0.84378588, "num_input_tokens_seen": 186729135, "step": 8694, "time_per_iteration": 2.6748385429382324 }, { "auxiliary_loss_clip": 0.01013741, "auxiliary_loss_mlp": 0.01003883, "balance_loss_clip": 1.02031374, "balance_loss_mlp": 1.00224972, "epoch": 0.5227716819479934, "flos": 57474419022720.0, "grad_norm": 0.775564151874101, "language_loss": 0.55647832, "learning_rate": 1.949570057627888e-06, "loss": 0.57665455, "num_input_tokens_seen": 186791115, "step": 8695, "time_per_iteration": 3.345134973526001 }, { "auxiliary_loss_clip": 0.01061261, "auxiliary_loss_mlp": 0.01041707, "balance_loss_clip": 1.04356098, "balance_loss_mlp": 1.0283134, "epoch": 0.5228318052006613, "flos": 13808264077440.0, "grad_norm": 1.8671615474987673, "language_loss": 0.732638, "learning_rate": 1.9491807207335672e-06, "loss": 0.75366765, "num_input_tokens_seen": 186808660, "step": 8696, "time_per_iteration": 2.782350540161133 }, { "auxiliary_loss_clip": 0.01099328, "auxiliary_loss_mlp": 0.01039177, "balance_loss_clip": 1.0429219, "balance_loss_mlp": 1.02538478, "epoch": 0.5228919284533293, "flos": 15596184476160.0, "grad_norm": 1.7190001113055795, "language_loss": 0.71068561, "learning_rate": 1.948791385766319e-06, "loss": 0.73207062, "num_input_tokens_seen": 186825900, "step": 8697, "time_per_iteration": 2.781651735305786 }, { "auxiliary_loss_clip": 0.01092255, "auxiliary_loss_mlp": 0.01037704, "balance_loss_clip": 1.04413819, "balance_loss_mlp": 1.02498996, "epoch": 0.5229520517059973, "flos": 22491499726080.0, "grad_norm": 1.9475868659159346, "language_loss": 0.80332339, "learning_rate": 1.948402052740906e-06, "loss": 0.82462299, "num_input_tokens_seen": 186843735, "step": 8698, "time_per_iteration": 2.7078070640563965 }, { "auxiliary_loss_clip": 0.01110911, "auxiliary_loss_mlp": 0.01038923, "balance_loss_clip": 1.04286766, "balance_loss_mlp": 1.02576292, "epoch": 0.5230121749586653, "flos": 22090880361600.0, "grad_norm": 1.6510046342053804, "language_loss": 0.74265802, "learning_rate": 1.948012721672093e-06, "loss": 0.7641564, "num_input_tokens_seen": 186862440, "step": 8699, "time_per_iteration": 2.667205333709717 }, { "auxiliary_loss_clip": 0.01113513, "auxiliary_loss_mlp": 0.00773315, "balance_loss_clip": 1.04171407, "balance_loss_mlp": 1.00029182, "epoch": 0.5230722982113333, "flos": 22127150119680.0, "grad_norm": 1.8535119798273105, "language_loss": 0.73102427, "learning_rate": 1.947623392574642e-06, "loss": 0.74989247, "num_input_tokens_seen": 186880940, "step": 8700, "time_per_iteration": 2.7250688076019287 }, { "auxiliary_loss_clip": 0.01100202, "auxiliary_loss_mlp": 0.01039746, "balance_loss_clip": 1.04480553, "balance_loss_mlp": 1.02510738, "epoch": 0.5231324214640012, "flos": 25009268572800.0, "grad_norm": 1.8378710861613805, "language_loss": 0.67156309, "learning_rate": 1.947234065463318e-06, "loss": 0.69296253, "num_input_tokens_seen": 186900785, "step": 8701, "time_per_iteration": 2.830300807952881 }, { "auxiliary_loss_clip": 0.0110603, "auxiliary_loss_mlp": 0.00771586, "balance_loss_clip": 1.04569697, "balance_loss_mlp": 1.0002594, "epoch": 0.5231925447166692, "flos": 25740517651200.0, "grad_norm": 1.7245960424067608, "language_loss": 0.66710031, "learning_rate": 1.9468447403528826e-06, "loss": 0.68587643, "num_input_tokens_seen": 186920895, "step": 8702, "time_per_iteration": 2.725583791732788 }, { "auxiliary_loss_clip": 0.01100659, "auxiliary_loss_mlp": 0.01039254, "balance_loss_clip": 1.04362679, "balance_loss_mlp": 1.02464485, "epoch": 0.5232526679693371, "flos": 21433930565760.0, "grad_norm": 1.7906940342438376, "language_loss": 0.76647937, "learning_rate": 1.946455417258101e-06, "loss": 0.78787845, "num_input_tokens_seen": 186940605, "step": 8703, "time_per_iteration": 2.7585973739624023 }, { "auxiliary_loss_clip": 0.01117607, "auxiliary_loss_mlp": 0.01043637, "balance_loss_clip": 1.04529738, "balance_loss_mlp": 1.02807403, "epoch": 0.5233127912220051, "flos": 35298393471360.0, "grad_norm": 2.3077994186551036, "language_loss": 0.76945215, "learning_rate": 1.9460660961937348e-06, "loss": 0.79106462, "num_input_tokens_seen": 186960820, "step": 8704, "time_per_iteration": 2.8613169193267822 }, { "auxiliary_loss_clip": 0.01102832, "auxiliary_loss_mlp": 0.0104096, "balance_loss_clip": 1.04692268, "balance_loss_mlp": 1.02798438, "epoch": 0.523372914474673, "flos": 17051320344960.0, "grad_norm": 1.8023730932949449, "language_loss": 0.78725791, "learning_rate": 1.9456767771745474e-06, "loss": 0.80869591, "num_input_tokens_seen": 186976240, "step": 8705, "time_per_iteration": 2.741025924682617 }, { "auxiliary_loss_clip": 0.01106252, "auxiliary_loss_mlp": 0.01037077, "balance_loss_clip": 1.04467177, "balance_loss_mlp": 1.02273059, "epoch": 0.5234330377273411, "flos": 18406302117120.0, "grad_norm": 2.80572723928073, "language_loss": 0.69824338, "learning_rate": 1.9452874602153027e-06, "loss": 0.71967667, "num_input_tokens_seen": 186992855, "step": 8706, "time_per_iteration": 2.6872975826263428 }, { "auxiliary_loss_clip": 0.01035877, "auxiliary_loss_mlp": 0.01013693, "balance_loss_clip": 1.01881003, "balance_loss_mlp": 1.01213157, "epoch": 0.523493160980009, "flos": 65850296970240.0, "grad_norm": 0.6808139995313122, "language_loss": 0.52465838, "learning_rate": 1.9448981453307623e-06, "loss": 0.54515409, "num_input_tokens_seen": 187051205, "step": 8707, "time_per_iteration": 3.2341713905334473 }, { "auxiliary_loss_clip": 0.01098509, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.04139447, "balance_loss_mlp": 1.02380002, "epoch": 0.523553284232677, "flos": 21872076664320.0, "grad_norm": 1.6877057679435725, "language_loss": 0.74618769, "learning_rate": 1.9445088325356904e-06, "loss": 0.76754665, "num_input_tokens_seen": 187070540, "step": 8708, "time_per_iteration": 2.8342666625976562 }, { "auxiliary_loss_clip": 0.0109528, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.04457259, "balance_loss_mlp": 1.01772881, "epoch": 0.5236134074853449, "flos": 20848191482880.0, "grad_norm": 1.566541485414049, "language_loss": 0.7730183, "learning_rate": 1.944119521844849e-06, "loss": 0.79428267, "num_input_tokens_seen": 187089975, "step": 8709, "time_per_iteration": 2.708807945251465 }, { "auxiliary_loss_clip": 0.01074175, "auxiliary_loss_mlp": 0.0103878, "balance_loss_clip": 1.03733826, "balance_loss_mlp": 1.02211428, "epoch": 0.5236735307380129, "flos": 25520421064320.0, "grad_norm": 2.041376547108184, "language_loss": 0.83508044, "learning_rate": 1.9437302132730003e-06, "loss": 0.85620999, "num_input_tokens_seen": 187108775, "step": 8710, "time_per_iteration": 2.7781410217285156 }, { "auxiliary_loss_clip": 0.01093974, "auxiliary_loss_mlp": 0.01031634, "balance_loss_clip": 1.04229414, "balance_loss_mlp": 1.01794267, "epoch": 0.523733653990681, "flos": 23583112001280.0, "grad_norm": 2.2254848949827983, "language_loss": 0.69715381, "learning_rate": 1.943340906834908e-06, "loss": 0.7184099, "num_input_tokens_seen": 187128830, "step": 8711, "time_per_iteration": 2.7991995811462402 }, { "auxiliary_loss_clip": 0.01114283, "auxiliary_loss_mlp": 0.01039219, "balance_loss_clip": 1.04482269, "balance_loss_mlp": 1.02475893, "epoch": 0.5237937772433489, "flos": 21106245767040.0, "grad_norm": 2.0479693285364764, "language_loss": 0.8319692, "learning_rate": 1.9429516025453345e-06, "loss": 0.85350424, "num_input_tokens_seen": 187149570, "step": 8712, "time_per_iteration": 2.6913018226623535 }, { "auxiliary_loss_clip": 0.01126488, "auxiliary_loss_mlp": 0.01042299, "balance_loss_clip": 1.04477775, "balance_loss_mlp": 1.02704, "epoch": 0.5238539004960169, "flos": 19172887200000.0, "grad_norm": 2.12392132979159, "language_loss": 0.69795638, "learning_rate": 1.9425623004190415e-06, "loss": 0.71964419, "num_input_tokens_seen": 187170575, "step": 8713, "time_per_iteration": 2.6037533283233643 }, { "auxiliary_loss_clip": 0.01087813, "auxiliary_loss_mlp": 0.01040708, "balance_loss_clip": 1.03908944, "balance_loss_mlp": 1.02369666, "epoch": 0.5239140237486848, "flos": 17888218300800.0, "grad_norm": 2.8914750795344233, "language_loss": 0.76703346, "learning_rate": 1.9421730004707925e-06, "loss": 0.78831869, "num_input_tokens_seen": 187187190, "step": 8714, "time_per_iteration": 2.717984676361084 }, { "auxiliary_loss_clip": 0.01086969, "auxiliary_loss_mlp": 0.01044306, "balance_loss_clip": 1.0413481, "balance_loss_mlp": 1.02729511, "epoch": 0.5239741470013528, "flos": 17930413802880.0, "grad_norm": 1.9287276707329408, "language_loss": 0.7608462, "learning_rate": 1.9417837027153483e-06, "loss": 0.78215897, "num_input_tokens_seen": 187204350, "step": 8715, "time_per_iteration": 2.6999671459198 }, { "auxiliary_loss_clip": 0.01099192, "auxiliary_loss_mlp": 0.01035578, "balance_loss_clip": 1.0417552, "balance_loss_mlp": 1.02110636, "epoch": 0.5240342702540207, "flos": 30993386584320.0, "grad_norm": 2.1294970054785622, "language_loss": 0.71165496, "learning_rate": 1.9413944071674723e-06, "loss": 0.73300266, "num_input_tokens_seen": 187225605, "step": 8716, "time_per_iteration": 2.744347333908081 }, { "auxiliary_loss_clip": 0.01121973, "auxiliary_loss_mlp": 0.0103854, "balance_loss_clip": 1.04380643, "balance_loss_mlp": 1.02563596, "epoch": 0.5240943935066887, "flos": 25005066681600.0, "grad_norm": 3.2118480553546087, "language_loss": 0.87086689, "learning_rate": 1.941005113841926e-06, "loss": 0.89247203, "num_input_tokens_seen": 187241335, "step": 8717, "time_per_iteration": 4.158156394958496 }, { "auxiliary_loss_clip": 0.01109045, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.0454371, "balance_loss_mlp": 1.02164412, "epoch": 0.5241545167593566, "flos": 23659099223040.0, "grad_norm": 1.880090780763199, "language_loss": 0.61121464, "learning_rate": 1.9406158227534723e-06, "loss": 0.63266253, "num_input_tokens_seen": 187259925, "step": 8718, "time_per_iteration": 2.671760320663452 }, { "auxiliary_loss_clip": 0.01094217, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.04272294, "balance_loss_mlp": 1.02387953, "epoch": 0.5242146400120247, "flos": 23400398494080.0, "grad_norm": 1.8098933087704439, "language_loss": 0.72060192, "learning_rate": 1.940226533916872e-06, "loss": 0.74193311, "num_input_tokens_seen": 187279035, "step": 8719, "time_per_iteration": 2.815864324569702 }, { "auxiliary_loss_clip": 0.01109147, "auxiliary_loss_mlp": 0.01029429, "balance_loss_clip": 1.04305363, "balance_loss_mlp": 1.01676893, "epoch": 0.5242747632646926, "flos": 17749065012480.0, "grad_norm": 1.9600898858885738, "language_loss": 0.73258477, "learning_rate": 1.9398372473468877e-06, "loss": 0.7539705, "num_input_tokens_seen": 187297555, "step": 8720, "time_per_iteration": 4.34027624130249 }, { "auxiliary_loss_clip": 0.01110975, "auxiliary_loss_mlp": 0.01037749, "balance_loss_clip": 1.042588, "balance_loss_mlp": 1.02323568, "epoch": 0.5243348865173606, "flos": 32597731549440.0, "grad_norm": 1.7136870064395262, "language_loss": 0.7059021, "learning_rate": 1.939447963058281e-06, "loss": 0.72738934, "num_input_tokens_seen": 187320265, "step": 8721, "time_per_iteration": 4.457958698272705 }, { "auxiliary_loss_clip": 0.01064422, "auxiliary_loss_mlp": 0.0103891, "balance_loss_clip": 1.03628516, "balance_loss_mlp": 1.02399719, "epoch": 0.5243950097700285, "flos": 25484115392640.0, "grad_norm": 1.8741175153878353, "language_loss": 0.86506796, "learning_rate": 1.939058681065813e-06, "loss": 0.88610125, "num_input_tokens_seen": 187338045, "step": 8722, "time_per_iteration": 2.851713180541992 }, { "auxiliary_loss_clip": 0.01122948, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.0449574, "balance_loss_mlp": 1.01830578, "epoch": 0.5244551330226965, "flos": 15268391936640.0, "grad_norm": 1.8614764349338224, "language_loss": 0.79853708, "learning_rate": 1.938669401384247e-06, "loss": 0.82009959, "num_input_tokens_seen": 187356040, "step": 8723, "time_per_iteration": 2.567403554916382 }, { "auxiliary_loss_clip": 0.01111191, "auxiliary_loss_mlp": 0.0104214, "balance_loss_clip": 1.04611158, "balance_loss_mlp": 1.02747166, "epoch": 0.5245152562753645, "flos": 22237108629120.0, "grad_norm": 2.070314434964904, "language_loss": 0.75515735, "learning_rate": 1.9382801240283426e-06, "loss": 0.77669066, "num_input_tokens_seen": 187374185, "step": 8724, "time_per_iteration": 4.372815847396851 }, { "auxiliary_loss_clip": 0.01128433, "auxiliary_loss_mlp": 0.01038668, "balance_loss_clip": 1.04391563, "balance_loss_mlp": 1.02228856, "epoch": 0.5245753795280325, "flos": 29426460612480.0, "grad_norm": 1.7393951886603523, "language_loss": 0.70450562, "learning_rate": 1.9378908490128625e-06, "loss": 0.72617668, "num_input_tokens_seen": 187396640, "step": 8725, "time_per_iteration": 2.691462278366089 }, { "auxiliary_loss_clip": 0.01014562, "auxiliary_loss_mlp": 0.0100467, "balance_loss_clip": 1.01748943, "balance_loss_mlp": 1.0025723, "epoch": 0.5246355027807005, "flos": 58834392785280.0, "grad_norm": 0.751972672191828, "language_loss": 0.55635381, "learning_rate": 1.937501576352568e-06, "loss": 0.57654613, "num_input_tokens_seen": 187455945, "step": 8726, "time_per_iteration": 3.2482144832611084 }, { "auxiliary_loss_clip": 0.01023582, "auxiliary_loss_mlp": 0.01000951, "balance_loss_clip": 1.02279115, "balance_loss_mlp": 0.9995268, "epoch": 0.5246956260333684, "flos": 64526592965760.0, "grad_norm": 0.7878423938979384, "language_loss": 0.58313322, "learning_rate": 1.937112306062219e-06, "loss": 0.60337853, "num_input_tokens_seen": 187519975, "step": 8727, "time_per_iteration": 3.2606794834136963 }, { "auxiliary_loss_clip": 0.01114413, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.0418663, "balance_loss_mlp": 1.02111006, "epoch": 0.5247557492860364, "flos": 24533631653760.0, "grad_norm": 1.3167349097133665, "language_loss": 0.70678449, "learning_rate": 1.9367230381565786e-06, "loss": 0.72828913, "num_input_tokens_seen": 187541775, "step": 8728, "time_per_iteration": 2.6979823112487793 }, { "auxiliary_loss_clip": 0.01110188, "auxiliary_loss_mlp": 0.01029551, "balance_loss_clip": 1.04107904, "balance_loss_mlp": 1.01636648, "epoch": 0.5248158725387043, "flos": 18806131382400.0, "grad_norm": 1.4052080718589413, "language_loss": 0.69816244, "learning_rate": 1.9363337726504062e-06, "loss": 0.71955991, "num_input_tokens_seen": 187560425, "step": 8729, "time_per_iteration": 2.6898272037506104 }, { "auxiliary_loss_clip": 0.01084395, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.04138565, "balance_loss_mlp": 1.02001655, "epoch": 0.5248759957913723, "flos": 20955851521920.0, "grad_norm": 1.9953537122640765, "language_loss": 0.83565557, "learning_rate": 1.935944509558464e-06, "loss": 0.85684621, "num_input_tokens_seen": 187579930, "step": 8730, "time_per_iteration": 2.719953775405884 }, { "auxiliary_loss_clip": 0.01087481, "auxiliary_loss_mlp": 0.01037052, "balance_loss_clip": 1.04011822, "balance_loss_mlp": 1.02177548, "epoch": 0.5249361190440403, "flos": 18660980522880.0, "grad_norm": 2.0205964009231816, "language_loss": 0.79403269, "learning_rate": 1.9355552488955125e-06, "loss": 0.81527805, "num_input_tokens_seen": 187595365, "step": 8731, "time_per_iteration": 2.741563081741333 }, { "auxiliary_loss_clip": 0.01105082, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.03996611, "balance_loss_mlp": 1.0172075, "epoch": 0.5249962422967083, "flos": 24863327614080.0, "grad_norm": 1.917738069421625, "language_loss": 0.83558822, "learning_rate": 1.935165990676312e-06, "loss": 0.85694802, "num_input_tokens_seen": 187614715, "step": 8732, "time_per_iteration": 2.672537326812744 }, { "auxiliary_loss_clip": 0.01109755, "auxiliary_loss_mlp": 0.01037546, "balance_loss_clip": 1.04267287, "balance_loss_mlp": 1.0239923, "epoch": 0.5250563655493762, "flos": 15262681674240.0, "grad_norm": 1.7357983281517446, "language_loss": 0.77602309, "learning_rate": 1.9347767349156237e-06, "loss": 0.79749608, "num_input_tokens_seen": 187630745, "step": 8733, "time_per_iteration": 2.651329278945923 }, { "auxiliary_loss_clip": 0.01126312, "auxiliary_loss_mlp": 0.01036227, "balance_loss_clip": 1.04450274, "balance_loss_mlp": 1.02157617, "epoch": 0.5251164888020442, "flos": 18625177641600.0, "grad_norm": 1.892740616554097, "language_loss": 0.8202911, "learning_rate": 1.934387481628208e-06, "loss": 0.84191644, "num_input_tokens_seen": 187648200, "step": 8734, "time_per_iteration": 2.608727216720581 }, { "auxiliary_loss_clip": 0.01091339, "auxiliary_loss_mlp": 0.01028225, "balance_loss_clip": 1.04116642, "balance_loss_mlp": 1.01467109, "epoch": 0.5251766120547121, "flos": 29710764760320.0, "grad_norm": 1.3668287037138613, "language_loss": 0.76932037, "learning_rate": 1.933998230828826e-06, "loss": 0.79051596, "num_input_tokens_seen": 187669205, "step": 8735, "time_per_iteration": 2.703274965286255 }, { "auxiliary_loss_clip": 0.01112983, "auxiliary_loss_mlp": 0.01038692, "balance_loss_clip": 1.04413259, "balance_loss_mlp": 1.02544188, "epoch": 0.5252367353073801, "flos": 23440295525760.0, "grad_norm": 1.7627870360178364, "language_loss": 0.80808437, "learning_rate": 1.9336089825322376e-06, "loss": 0.82960117, "num_input_tokens_seen": 187690890, "step": 8736, "time_per_iteration": 2.6869864463806152 }, { "auxiliary_loss_clip": 0.01124902, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04460597, "balance_loss_mlp": 1.02199018, "epoch": 0.5252968585600482, "flos": 30810708990720.0, "grad_norm": 2.2019442049314626, "language_loss": 0.69824821, "learning_rate": 1.9332197367532033e-06, "loss": 0.71986508, "num_input_tokens_seen": 187713045, "step": 8737, "time_per_iteration": 2.694178342819214 }, { "auxiliary_loss_clip": 0.01101601, "auxiliary_loss_mlp": 0.01038957, "balance_loss_clip": 1.04274702, "balance_loss_mlp": 1.02473521, "epoch": 0.5253569818127161, "flos": 20628274464000.0, "grad_norm": 1.4444028137471083, "language_loss": 0.77386785, "learning_rate": 1.9328304935064833e-06, "loss": 0.79527342, "num_input_tokens_seen": 187733640, "step": 8738, "time_per_iteration": 2.7655301094055176 }, { "auxiliary_loss_clip": 0.01012696, "auxiliary_loss_mlp": 0.00752303, "balance_loss_clip": 1.01498532, "balance_loss_mlp": 0.99995118, "epoch": 0.5254171050653841, "flos": 63428695810560.0, "grad_norm": 0.7418872270660203, "language_loss": 0.54437888, "learning_rate": 1.932441252806837e-06, "loss": 0.56202877, "num_input_tokens_seen": 187792930, "step": 8739, "time_per_iteration": 3.183931350708008 }, { "auxiliary_loss_clip": 0.01093164, "auxiliary_loss_mlp": 0.01039099, "balance_loss_clip": 1.03987527, "balance_loss_mlp": 1.02572989, "epoch": 0.525477228318052, "flos": 34670782108800.0, "grad_norm": 1.6115423077763054, "language_loss": 0.84719479, "learning_rate": 1.9320520146690263e-06, "loss": 0.8685174, "num_input_tokens_seen": 187812495, "step": 8740, "time_per_iteration": 2.8701846599578857 }, { "auxiliary_loss_clip": 0.01106251, "auxiliary_loss_mlp": 0.00771888, "balance_loss_clip": 1.03936994, "balance_loss_mlp": 1.00030541, "epoch": 0.52553735157072, "flos": 17930844766080.0, "grad_norm": 2.112576285349714, "language_loss": 0.69466913, "learning_rate": 1.9316627791078093e-06, "loss": 0.71345055, "num_input_tokens_seen": 187829685, "step": 8741, "time_per_iteration": 2.721233606338501 }, { "auxiliary_loss_clip": 0.01101687, "auxiliary_loss_mlp": 0.0103584, "balance_loss_clip": 1.04140949, "balance_loss_mlp": 1.02171421, "epoch": 0.5255974748233879, "flos": 9940864584960.0, "grad_norm": 1.8031333880336204, "language_loss": 0.66328311, "learning_rate": 1.931273546137947e-06, "loss": 0.68465841, "num_input_tokens_seen": 187846495, "step": 8742, "time_per_iteration": 2.695504903793335 }, { "auxiliary_loss_clip": 0.01086092, "auxiliary_loss_mlp": 0.01042238, "balance_loss_clip": 1.03882444, "balance_loss_mlp": 1.02666903, "epoch": 0.5256575980760559, "flos": 16868427269760.0, "grad_norm": 1.9909144400242709, "language_loss": 0.63219392, "learning_rate": 1.9308843157741983e-06, "loss": 0.65347725, "num_input_tokens_seen": 187862010, "step": 8743, "time_per_iteration": 2.712376832962036 }, { "auxiliary_loss_clip": 0.0102969, "auxiliary_loss_mlp": 0.01008337, "balance_loss_clip": 1.01230693, "balance_loss_mlp": 1.00641751, "epoch": 0.5257177213287239, "flos": 62386210362240.0, "grad_norm": 0.7739828883360421, "language_loss": 0.5410347, "learning_rate": 1.930495088031323e-06, "loss": 0.56141496, "num_input_tokens_seen": 187922730, "step": 8744, "time_per_iteration": 3.281756639480591 }, { "auxiliary_loss_clip": 0.01106094, "auxiliary_loss_mlp": 0.01037818, "balance_loss_clip": 1.04534447, "balance_loss_mlp": 1.02202296, "epoch": 0.5257778445813919, "flos": 20776908942720.0, "grad_norm": 2.5030900138953274, "language_loss": 0.75859022, "learning_rate": 1.9301058629240814e-06, "loss": 0.7800293, "num_input_tokens_seen": 187940160, "step": 8745, "time_per_iteration": 2.642817258834839 }, { "auxiliary_loss_clip": 0.01110515, "auxiliary_loss_mlp": 0.0104281, "balance_loss_clip": 1.04153466, "balance_loss_mlp": 1.02948213, "epoch": 0.5258379678340598, "flos": 17018606033280.0, "grad_norm": 1.7823830080970366, "language_loss": 0.8089028, "learning_rate": 1.9297166404672324e-06, "loss": 0.83043599, "num_input_tokens_seen": 187958625, "step": 8746, "time_per_iteration": 2.5678205490112305 }, { "auxiliary_loss_clip": 0.01108698, "auxiliary_loss_mlp": 0.01036575, "balance_loss_clip": 1.04006267, "balance_loss_mlp": 1.02191806, "epoch": 0.5258980910867278, "flos": 21068754946560.0, "grad_norm": 2.1394959039376475, "language_loss": 0.75231433, "learning_rate": 1.9293274206755353e-06, "loss": 0.77376711, "num_input_tokens_seen": 187977575, "step": 8747, "time_per_iteration": -0.009610652923583984 }, { "auxiliary_loss_clip": 0.0105854, "auxiliary_loss_mlp": 0.01033909, "balance_loss_clip": 1.03949201, "balance_loss_mlp": 1.01987767, "epoch": 0.5259582143393957, "flos": 18004461690240.0, "grad_norm": 2.0175880820051058, "language_loss": 0.82632613, "learning_rate": 1.9289382035637505e-06, "loss": 0.84725058, "num_input_tokens_seen": 187996650, "step": 8748, "time_per_iteration": 2.7604665756225586 }, { "auxiliary_loss_clip": 0.01099486, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.03856742, "balance_loss_mlp": 1.01846862, "epoch": 0.5260183375920637, "flos": 22783848520320.0, "grad_norm": 2.328081087853481, "language_loss": 0.80873966, "learning_rate": 1.9285489891466345e-06, "loss": 0.83006573, "num_input_tokens_seen": 188013510, "step": 8749, "time_per_iteration": 2.6853184700012207 }, { "auxiliary_loss_clip": 0.01109749, "auxiliary_loss_mlp": 0.01040189, "balance_loss_clip": 1.04381132, "balance_loss_mlp": 1.02556193, "epoch": 0.5260784608447318, "flos": 27052406081280.0, "grad_norm": 1.7699462129252088, "language_loss": 0.72291499, "learning_rate": 1.9281597774389487e-06, "loss": 0.74441439, "num_input_tokens_seen": 188032085, "step": 8750, "time_per_iteration": 2.6771364212036133 }, { "auxiliary_loss_clip": 0.01098374, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.03887165, "balance_loss_mlp": 1.02362585, "epoch": 0.5261385840973997, "flos": 20662820369280.0, "grad_norm": 1.3348346616556535, "language_loss": 0.76186317, "learning_rate": 1.9277705684554517e-06, "loss": 0.78322065, "num_input_tokens_seen": 188050590, "step": 8751, "time_per_iteration": 2.7016804218292236 }, { "auxiliary_loss_clip": 0.01119796, "auxiliary_loss_mlp": 0.01039709, "balance_loss_clip": 1.04339051, "balance_loss_mlp": 1.02622056, "epoch": 0.5261987073500677, "flos": 23622649896960.0, "grad_norm": 1.7424279065253616, "language_loss": 0.75831163, "learning_rate": 1.927381362210902e-06, "loss": 0.77990663, "num_input_tokens_seen": 188071620, "step": 8752, "time_per_iteration": 2.7128703594207764 }, { "auxiliary_loss_clip": 0.01112565, "auxiliary_loss_mlp": 0.01033514, "balance_loss_clip": 1.04177046, "balance_loss_mlp": 1.01780224, "epoch": 0.5262588306027356, "flos": 27636241743360.0, "grad_norm": 2.1757268908288707, "language_loss": 0.67754769, "learning_rate": 1.926992158720058e-06, "loss": 0.69900852, "num_input_tokens_seen": 188091740, "step": 8753, "time_per_iteration": 2.678269147872925 }, { "auxiliary_loss_clip": 0.01111599, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.04266751, "balance_loss_mlp": 1.02072084, "epoch": 0.5263189538554036, "flos": 21759711943680.0, "grad_norm": 1.6342208992061138, "language_loss": 0.84114075, "learning_rate": 1.9266029579976785e-06, "loss": 0.86259949, "num_input_tokens_seen": 188111165, "step": 8754, "time_per_iteration": 2.6858248710632324 }, { "auxiliary_loss_clip": 0.01109767, "auxiliary_loss_mlp": 0.01035863, "balance_loss_clip": 1.04159164, "balance_loss_mlp": 1.02159333, "epoch": 0.5263790771080715, "flos": 14276359140480.0, "grad_norm": 2.0064086672514323, "language_loss": 0.87360156, "learning_rate": 1.926213760058522e-06, "loss": 0.89505792, "num_input_tokens_seen": 188127825, "step": 8755, "time_per_iteration": 2.5783674716949463 }, { "auxiliary_loss_clip": 0.01007681, "auxiliary_loss_mlp": 0.01000927, "balance_loss_clip": 1.01328659, "balance_loss_mlp": 0.99918669, "epoch": 0.5264392003607395, "flos": 65806413528960.0, "grad_norm": 0.7404552494369754, "language_loss": 0.5880959, "learning_rate": 1.9258245649173477e-06, "loss": 0.60818201, "num_input_tokens_seen": 188194050, "step": 8756, "time_per_iteration": 3.308302402496338 }, { "auxiliary_loss_clip": 0.01094156, "auxiliary_loss_mlp": 0.01036712, "balance_loss_clip": 1.0415833, "balance_loss_mlp": 1.02182269, "epoch": 0.5264993236134075, "flos": 21032413361280.0, "grad_norm": 1.6572717697992079, "language_loss": 0.70703959, "learning_rate": 1.925435372588913e-06, "loss": 0.72834826, "num_input_tokens_seen": 188212565, "step": 8757, "time_per_iteration": 4.195650100708008 }, { "auxiliary_loss_clip": 0.0110952, "auxiliary_loss_mlp": 0.01040036, "balance_loss_clip": 1.04061294, "balance_loss_mlp": 1.02590346, "epoch": 0.5265594468660755, "flos": 16618202150400.0, "grad_norm": 2.0494500796269577, "language_loss": 0.88039553, "learning_rate": 1.9250461830879768e-06, "loss": 0.90189111, "num_input_tokens_seen": 188229505, "step": 8758, "time_per_iteration": 2.63089656829834 }, { "auxiliary_loss_clip": 0.01061465, "auxiliary_loss_mlp": 0.01037569, "balance_loss_clip": 1.03887105, "balance_loss_mlp": 1.02301979, "epoch": 0.5266195701187434, "flos": 24134125610880.0, "grad_norm": 1.4473751902891179, "language_loss": 0.75895298, "learning_rate": 1.9246569964292965e-06, "loss": 0.77994329, "num_input_tokens_seen": 188250395, "step": 8759, "time_per_iteration": 4.702188968658447 }, { "auxiliary_loss_clip": 0.01098136, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.04185557, "balance_loss_mlp": 1.0181073, "epoch": 0.5266796933714114, "flos": 15844111125120.0, "grad_norm": 1.7900777891811301, "language_loss": 0.71485013, "learning_rate": 1.9242678126276307e-06, "loss": 0.73614085, "num_input_tokens_seen": 188266785, "step": 8760, "time_per_iteration": 4.256975412368774 }, { "auxiliary_loss_clip": 0.01098696, "auxiliary_loss_mlp": 0.01040967, "balance_loss_clip": 1.04177952, "balance_loss_mlp": 1.02593493, "epoch": 0.5267398166240793, "flos": 20951434149120.0, "grad_norm": 2.6157951761776697, "language_loss": 0.75801802, "learning_rate": 1.923878631697736e-06, "loss": 0.77941465, "num_input_tokens_seen": 188282525, "step": 8761, "time_per_iteration": 2.685028553009033 }, { "auxiliary_loss_clip": 0.01104735, "auxiliary_loss_mlp": 0.00771727, "balance_loss_clip": 1.03871739, "balance_loss_mlp": 1.00023258, "epoch": 0.5267999398767473, "flos": 20996394998400.0, "grad_norm": 1.8739254444127986, "language_loss": 0.70466101, "learning_rate": 1.923489453654373e-06, "loss": 0.72342563, "num_input_tokens_seen": 188301395, "step": 8762, "time_per_iteration": 2.727120876312256 }, { "auxiliary_loss_clip": 0.01014324, "auxiliary_loss_mlp": 0.00999661, "balance_loss_clip": 1.00980198, "balance_loss_mlp": 0.99816543, "epoch": 0.5268600631294152, "flos": 66849401767680.0, "grad_norm": 0.9282030794038212, "language_loss": 0.65443593, "learning_rate": 1.9231002785122963e-06, "loss": 0.67457575, "num_input_tokens_seen": 188357665, "step": 8763, "time_per_iteration": 3.109525203704834 }, { "auxiliary_loss_clip": 0.01109455, "auxiliary_loss_mlp": 0.01030406, "balance_loss_clip": 1.04166603, "balance_loss_mlp": 1.01676226, "epoch": 0.5269201863820833, "flos": 17165552572800.0, "grad_norm": 1.6243900815433006, "language_loss": 0.71050072, "learning_rate": 1.922711106286265e-06, "loss": 0.73189938, "num_input_tokens_seen": 188376935, "step": 8764, "time_per_iteration": 4.168430328369141 }, { "auxiliary_loss_clip": 0.01080487, "auxiliary_loss_mlp": 0.01033821, "balance_loss_clip": 1.03809977, "balance_loss_mlp": 1.01832938, "epoch": 0.5269803096347513, "flos": 20522589672960.0, "grad_norm": 1.5962933914095123, "language_loss": 0.74318087, "learning_rate": 1.9223219369910368e-06, "loss": 0.76432389, "num_input_tokens_seen": 188394995, "step": 8765, "time_per_iteration": 2.7441658973693848 }, { "auxiliary_loss_clip": 0.01098499, "auxiliary_loss_mlp": 0.01037098, "balance_loss_clip": 1.03631091, "balance_loss_mlp": 1.02200055, "epoch": 0.5270404328874192, "flos": 27230989524480.0, "grad_norm": 1.60818818085183, "language_loss": 0.85403508, "learning_rate": 1.9219327706413677e-06, "loss": 0.87539107, "num_input_tokens_seen": 188415475, "step": 8766, "time_per_iteration": 2.7902116775512695 }, { "auxiliary_loss_clip": 0.0112556, "auxiliary_loss_mlp": 0.01039583, "balance_loss_clip": 1.0449605, "balance_loss_mlp": 1.02492046, "epoch": 0.5271005561400872, "flos": 23110491824640.0, "grad_norm": 1.780636206979604, "language_loss": 0.79070592, "learning_rate": 1.921543607252017e-06, "loss": 0.81235737, "num_input_tokens_seen": 188435665, "step": 8767, "time_per_iteration": 2.6986846923828125 }, { "auxiliary_loss_clip": 0.01114967, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.04406393, "balance_loss_mlp": 1.02407432, "epoch": 0.5271606793927551, "flos": 22564793427840.0, "grad_norm": 1.6576657234027676, "language_loss": 0.73513746, "learning_rate": 1.9211544468377394e-06, "loss": 0.75668073, "num_input_tokens_seen": 188455405, "step": 8768, "time_per_iteration": 2.695497989654541 }, { "auxiliary_loss_clip": 0.01092606, "auxiliary_loss_mlp": 0.01048135, "balance_loss_clip": 1.03795791, "balance_loss_mlp": 1.03445613, "epoch": 0.5272208026454231, "flos": 18764259102720.0, "grad_norm": 1.9012673693956994, "language_loss": 0.7428031, "learning_rate": 1.9207652894132933e-06, "loss": 0.76421046, "num_input_tokens_seen": 188472940, "step": 8769, "time_per_iteration": 2.7763235569000244 }, { "auxiliary_loss_clip": 0.01082308, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.03746688, "balance_loss_mlp": 1.02675128, "epoch": 0.5272809258980911, "flos": 20412164286720.0, "grad_norm": 1.8328085669464766, "language_loss": 0.7360974, "learning_rate": 1.920376134993436e-06, "loss": 0.75732535, "num_input_tokens_seen": 188493035, "step": 8770, "time_per_iteration": 2.7274930477142334 }, { "auxiliary_loss_clip": 0.011224, "auxiliary_loss_mlp": 0.01035685, "balance_loss_clip": 1.04366255, "balance_loss_mlp": 1.02199364, "epoch": 0.5273410491507591, "flos": 28256742213120.0, "grad_norm": 1.7661010025178618, "language_loss": 0.68258119, "learning_rate": 1.9199869835929224e-06, "loss": 0.704162, "num_input_tokens_seen": 188513860, "step": 8771, "time_per_iteration": 2.6751418113708496 }, { "auxiliary_loss_clip": 0.01109367, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.0429647, "balance_loss_mlp": 1.02500653, "epoch": 0.527401172403427, "flos": 22455158140800.0, "grad_norm": 1.9220412670697933, "language_loss": 0.76438117, "learning_rate": 1.9195978352265115e-06, "loss": 0.78586286, "num_input_tokens_seen": 188533345, "step": 8772, "time_per_iteration": 2.7865138053894043 }, { "auxiliary_loss_clip": 0.01107055, "auxiliary_loss_mlp": 0.01047604, "balance_loss_clip": 1.04159784, "balance_loss_mlp": 1.03290582, "epoch": 0.527461295656095, "flos": 21031084558080.0, "grad_norm": 2.1683746410962472, "language_loss": 0.65569091, "learning_rate": 1.9192086899089585e-06, "loss": 0.67723751, "num_input_tokens_seen": 188551550, "step": 8773, "time_per_iteration": 2.648556709289551 }, { "auxiliary_loss_clip": 0.01089634, "auxiliary_loss_mlp": 0.01040938, "balance_loss_clip": 1.04127073, "balance_loss_mlp": 1.02838576, "epoch": 0.5275214189087629, "flos": 26322018929280.0, "grad_norm": 1.7479537399696432, "language_loss": 0.85893595, "learning_rate": 1.91881954765502e-06, "loss": 0.88024169, "num_input_tokens_seen": 188571615, "step": 8774, "time_per_iteration": 2.8036038875579834 }, { "auxiliary_loss_clip": 0.01088366, "auxiliary_loss_mlp": 0.01035546, "balance_loss_clip": 1.03889024, "balance_loss_mlp": 1.02204525, "epoch": 0.5275815421614309, "flos": 20047024581120.0, "grad_norm": 1.657417688760408, "language_loss": 0.80199802, "learning_rate": 1.9184304084794523e-06, "loss": 0.82323706, "num_input_tokens_seen": 188591965, "step": 8775, "time_per_iteration": 2.7011687755584717 }, { "auxiliary_loss_clip": 0.01096581, "auxiliary_loss_mlp": 0.01042615, "balance_loss_clip": 1.03883219, "balance_loss_mlp": 1.02843523, "epoch": 0.5276416654140988, "flos": 21432206712960.0, "grad_norm": 1.7666023716485497, "language_loss": 0.83578467, "learning_rate": 1.918041272397012e-06, "loss": 0.85717654, "num_input_tokens_seen": 188610675, "step": 8776, "time_per_iteration": 2.6593801975250244 }, { "auxiliary_loss_clip": 0.01093105, "auxiliary_loss_mlp": 0.01036597, "balance_loss_clip": 1.04135871, "balance_loss_mlp": 1.0225482, "epoch": 0.5277017886667669, "flos": 17165085696000.0, "grad_norm": 1.7073238735749807, "language_loss": 0.67856812, "learning_rate": 1.9176521394224547e-06, "loss": 0.6998651, "num_input_tokens_seen": 188628235, "step": 8777, "time_per_iteration": 2.684119462966919 }, { "auxiliary_loss_clip": 0.01098291, "auxiliary_loss_mlp": 0.01042578, "balance_loss_clip": 1.0435065, "balance_loss_mlp": 1.02887487, "epoch": 0.5277619119194349, "flos": 20448146736000.0, "grad_norm": 1.6906001817136074, "language_loss": 0.8258512, "learning_rate": 1.9172630095705358e-06, "loss": 0.84725994, "num_input_tokens_seen": 188648925, "step": 8778, "time_per_iteration": 2.682415723800659 }, { "auxiliary_loss_clip": 0.01111904, "auxiliary_loss_mlp": 0.01042858, "balance_loss_clip": 1.04339361, "balance_loss_mlp": 1.02807617, "epoch": 0.5278220351721028, "flos": 24061083304320.0, "grad_norm": 2.7851808389493913, "language_loss": 0.79809994, "learning_rate": 1.916873882856013e-06, "loss": 0.81964755, "num_input_tokens_seen": 188668125, "step": 8779, "time_per_iteration": 2.6585779190063477 }, { "auxiliary_loss_clip": 0.01105817, "auxiliary_loss_mlp": 0.01036255, "balance_loss_clip": 1.04011083, "balance_loss_mlp": 1.02326131, "epoch": 0.5278821584247708, "flos": 24642907804800.0, "grad_norm": 2.3801118784221487, "language_loss": 0.76782715, "learning_rate": 1.9164847592936406e-06, "loss": 0.78924787, "num_input_tokens_seen": 188684410, "step": 8780, "time_per_iteration": 2.64528489112854 }, { "auxiliary_loss_clip": 0.01092369, "auxiliary_loss_mlp": 0.01031311, "balance_loss_clip": 1.04324102, "balance_loss_mlp": 1.01723862, "epoch": 0.5279422816774387, "flos": 35408244240000.0, "grad_norm": 1.6460087018057796, "language_loss": 0.7001918, "learning_rate": 1.916095638898174e-06, "loss": 0.72142857, "num_input_tokens_seen": 188706130, "step": 8781, "time_per_iteration": 2.8247299194335938 }, { "auxiliary_loss_clip": 0.01107498, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.04195011, "balance_loss_mlp": 1.02773809, "epoch": 0.5280024049301068, "flos": 22967028904320.0, "grad_norm": 1.5355974889681627, "language_loss": 0.72236538, "learning_rate": 1.9157065216843696e-06, "loss": 0.7438432, "num_input_tokens_seen": 188725030, "step": 8782, "time_per_iteration": 2.6150832176208496 }, { "auxiliary_loss_clip": 0.01090709, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.03973758, "balance_loss_mlp": 1.0204308, "epoch": 0.5280625281827747, "flos": 21507619317120.0, "grad_norm": 1.8366229943518229, "language_loss": 0.68489599, "learning_rate": 1.915317407666982e-06, "loss": 0.70613807, "num_input_tokens_seen": 188744325, "step": 8783, "time_per_iteration": 2.7228338718414307 }, { "auxiliary_loss_clip": 0.01120029, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 1.04475784, "balance_loss_mlp": 1.02599382, "epoch": 0.5281226514354427, "flos": 31208167958400.0, "grad_norm": 1.8621065563663965, "language_loss": 0.69557488, "learning_rate": 1.9149282968607674e-06, "loss": 0.71719718, "num_input_tokens_seen": 188765100, "step": 8784, "time_per_iteration": 2.756030797958374 }, { "auxiliary_loss_clip": 0.01124818, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.04128921, "balance_loss_mlp": 1.01935077, "epoch": 0.5281827746881106, "flos": 25077821679360.0, "grad_norm": 3.8002246773271238, "language_loss": 0.7503646, "learning_rate": 1.91453918928048e-06, "loss": 0.77195537, "num_input_tokens_seen": 188783995, "step": 8785, "time_per_iteration": 2.6486949920654297 }, { "auxiliary_loss_clip": 0.01110957, "auxiliary_loss_mlp": 0.01035187, "balance_loss_clip": 1.04315662, "balance_loss_mlp": 1.02070904, "epoch": 0.5282428979407786, "flos": 20631255292800.0, "grad_norm": 1.5855662273934061, "language_loss": 0.83260286, "learning_rate": 1.9141500849408745e-06, "loss": 0.85406423, "num_input_tokens_seen": 188803120, "step": 8786, "time_per_iteration": 2.6352970600128174 }, { "auxiliary_loss_clip": 0.01083443, "auxiliary_loss_mlp": 0.01025911, "balance_loss_clip": 1.04014111, "balance_loss_mlp": 1.0136745, "epoch": 0.5283030211934465, "flos": 22419391173120.0, "grad_norm": 2.305341017618089, "language_loss": 0.82486933, "learning_rate": 1.9137609838567076e-06, "loss": 0.84596282, "num_input_tokens_seen": 188820960, "step": 8787, "time_per_iteration": 2.712639570236206 }, { "auxiliary_loss_clip": 0.01066097, "auxiliary_loss_mlp": 0.01026546, "balance_loss_clip": 1.03866088, "balance_loss_mlp": 1.01387453, "epoch": 0.5283631444461145, "flos": 23615467176960.0, "grad_norm": 1.663088771358256, "language_loss": 0.83609009, "learning_rate": 1.9133718860427316e-06, "loss": 0.85701656, "num_input_tokens_seen": 188837165, "step": 8788, "time_per_iteration": 2.7158761024475098 }, { "auxiliary_loss_clip": 0.01087908, "auxiliary_loss_mlp": 0.01041692, "balance_loss_clip": 1.04602289, "balance_loss_mlp": 1.02696919, "epoch": 0.5284232676987825, "flos": 32671994918400.0, "grad_norm": 1.8980499308542007, "language_loss": 0.75046682, "learning_rate": 1.9129827915137027e-06, "loss": 0.77176291, "num_input_tokens_seen": 188858555, "step": 8789, "time_per_iteration": 2.806339979171753 }, { "auxiliary_loss_clip": 0.01113755, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.04411733, "balance_loss_mlp": 1.02322817, "epoch": 0.5284833909514505, "flos": 26760919213440.0, "grad_norm": 1.5263217177178625, "language_loss": 0.69562709, "learning_rate": 1.9125937002843754e-06, "loss": 0.71713525, "num_input_tokens_seen": 188879050, "step": 8790, "time_per_iteration": 2.701814651489258 }, { "auxiliary_loss_clip": 0.01117978, "auxiliary_loss_mlp": 0.01029171, "balance_loss_clip": 1.04194212, "balance_loss_mlp": 1.01685631, "epoch": 0.5285435142041185, "flos": 22090700793600.0, "grad_norm": 1.472851859989372, "language_loss": 0.79096156, "learning_rate": 1.9122046123695036e-06, "loss": 0.812433, "num_input_tokens_seen": 188898885, "step": 8791, "time_per_iteration": 2.609342575073242 }, { "auxiliary_loss_clip": 0.01063984, "auxiliary_loss_mlp": 0.01029869, "balance_loss_clip": 1.04006243, "balance_loss_mlp": 1.01632702, "epoch": 0.5286036374567864, "flos": 20375463565440.0, "grad_norm": 2.747278304747908, "language_loss": 0.66302419, "learning_rate": 1.9118155277838423e-06, "loss": 0.6839627, "num_input_tokens_seen": 188917225, "step": 8792, "time_per_iteration": 2.713622570037842 }, { "auxiliary_loss_clip": 0.01090251, "auxiliary_loss_mlp": 0.01040633, "balance_loss_clip": 1.03743482, "balance_loss_mlp": 1.02670956, "epoch": 0.5286637607094544, "flos": 24352175122560.0, "grad_norm": 1.9116255636929125, "language_loss": 0.79727674, "learning_rate": 1.9114264465421443e-06, "loss": 0.81858563, "num_input_tokens_seen": 188936120, "step": 8793, "time_per_iteration": 2.6645493507385254 }, { "auxiliary_loss_clip": 0.01121499, "auxiliary_loss_mlp": 0.01045468, "balance_loss_clip": 1.04323554, "balance_loss_mlp": 1.03118658, "epoch": 0.5287238839621223, "flos": 17271165536640.0, "grad_norm": 2.655732529836172, "language_loss": 0.84749115, "learning_rate": 1.9110373686591645e-06, "loss": 0.86916077, "num_input_tokens_seen": 188953405, "step": 8794, "time_per_iteration": 2.8306803703308105 }, { "auxiliary_loss_clip": 0.01097868, "auxiliary_loss_mlp": 0.0103516, "balance_loss_clip": 1.03908813, "balance_loss_mlp": 1.02062225, "epoch": 0.5287840072147904, "flos": 17566890209280.0, "grad_norm": 2.1997369626435894, "language_loss": 0.676875, "learning_rate": 1.9106482941496564e-06, "loss": 0.69820529, "num_input_tokens_seen": 188971150, "step": 8795, "time_per_iteration": 2.703134059906006 }, { "auxiliary_loss_clip": 0.01098455, "auxiliary_loss_mlp": 0.010334, "balance_loss_clip": 1.04339266, "balance_loss_mlp": 1.01989961, "epoch": 0.5288441304674583, "flos": 18552099421440.0, "grad_norm": 2.036052201037856, "language_loss": 0.80291003, "learning_rate": 1.910259223028374e-06, "loss": 0.82422858, "num_input_tokens_seen": 188989550, "step": 8796, "time_per_iteration": 2.6733570098876953 }, { "auxiliary_loss_clip": 0.01079591, "auxiliary_loss_mlp": 0.01043571, "balance_loss_clip": 1.03867388, "balance_loss_mlp": 1.02758455, "epoch": 0.5289042537201263, "flos": 20814507504000.0, "grad_norm": 1.5572831824692925, "language_loss": 0.69010925, "learning_rate": 1.909870155310071e-06, "loss": 0.71134079, "num_input_tokens_seen": 189008795, "step": 8797, "time_per_iteration": 4.254164934158325 }, { "auxiliary_loss_clip": 0.01101135, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.04237545, "balance_loss_mlp": 1.02374518, "epoch": 0.5289643769727942, "flos": 15735265937280.0, "grad_norm": 1.6872492204324914, "language_loss": 0.82684171, "learning_rate": 1.9094810910095005e-06, "loss": 0.84821934, "num_input_tokens_seen": 189025540, "step": 8798, "time_per_iteration": 2.7167000770568848 }, { "auxiliary_loss_clip": 0.01096424, "auxiliary_loss_mlp": 0.00774405, "balance_loss_clip": 1.03896332, "balance_loss_mlp": 1.00029516, "epoch": 0.5290245002254622, "flos": 19537308633600.0, "grad_norm": 1.9585595365508919, "language_loss": 0.70825863, "learning_rate": 1.9090920301414166e-06, "loss": 0.72696698, "num_input_tokens_seen": 189044885, "step": 8799, "time_per_iteration": 4.350652694702148 }, { "auxiliary_loss_clip": 0.01111399, "auxiliary_loss_mlp": 0.01038005, "balance_loss_clip": 1.04659581, "balance_loss_mlp": 1.02507114, "epoch": 0.5290846234781301, "flos": 15815131827840.0, "grad_norm": 2.2031970702340704, "language_loss": 0.69286144, "learning_rate": 1.9087029727205716e-06, "loss": 0.71435547, "num_input_tokens_seen": 189061280, "step": 8800, "time_per_iteration": 4.109759569168091 }, { "auxiliary_loss_clip": 0.01017957, "auxiliary_loss_mlp": 0.01037827, "balance_loss_clip": 1.01865292, "balance_loss_mlp": 1.03631306, "epoch": 0.5291447467307981, "flos": 70057624821120.0, "grad_norm": 0.9935539305247675, "language_loss": 0.56959099, "learning_rate": 1.9083139187617193e-06, "loss": 0.59014881, "num_input_tokens_seen": 189114775, "step": 8801, "time_per_iteration": 3.1419920921325684 }, { "auxiliary_loss_clip": 0.01110756, "auxiliary_loss_mlp": 0.01036206, "balance_loss_clip": 1.04886377, "balance_loss_mlp": 1.02271795, "epoch": 0.529204869983466, "flos": 28364186770560.0, "grad_norm": 1.5688016044474997, "language_loss": 0.6425091, "learning_rate": 1.9079248682796123e-06, "loss": 0.6639787, "num_input_tokens_seen": 189134700, "step": 8802, "time_per_iteration": 2.7467000484466553 }, { "auxiliary_loss_clip": 0.01101463, "auxiliary_loss_mlp": 0.01031341, "balance_loss_clip": 1.04380429, "balance_loss_mlp": 1.01772761, "epoch": 0.5292649932361341, "flos": 33758830684800.0, "grad_norm": 3.351871019760029, "language_loss": 0.69098222, "learning_rate": 1.907535821289003e-06, "loss": 0.71231019, "num_input_tokens_seen": 189155365, "step": 8803, "time_per_iteration": 4.278867721557617 }, { "auxiliary_loss_clip": 0.01106005, "auxiliary_loss_mlp": 0.00770288, "balance_loss_clip": 1.04076648, "balance_loss_mlp": 1.00028872, "epoch": 0.5293251164888021, "flos": 20447679859200.0, "grad_norm": 1.7989646267917587, "language_loss": 0.76156348, "learning_rate": 1.9071467778046458e-06, "loss": 0.78032649, "num_input_tokens_seen": 189173885, "step": 8804, "time_per_iteration": 2.683661699295044 }, { "auxiliary_loss_clip": 0.01032487, "auxiliary_loss_mlp": 0.01019664, "balance_loss_clip": 1.01553822, "balance_loss_mlp": 1.01836514, "epoch": 0.52938523974147, "flos": 66545312204160.0, "grad_norm": 0.7526453486337231, "language_loss": 0.5290755, "learning_rate": 1.906757737841291e-06, "loss": 0.54959702, "num_input_tokens_seen": 189236515, "step": 8805, "time_per_iteration": 3.243603467941284 }, { "auxiliary_loss_clip": 0.0103203, "auxiliary_loss_mlp": 0.01016047, "balance_loss_clip": 1.01495409, "balance_loss_mlp": 1.01418769, "epoch": 0.529445362994138, "flos": 67151734542720.0, "grad_norm": 0.7522317031499139, "language_loss": 0.6378004, "learning_rate": 1.906368701413693e-06, "loss": 0.65828121, "num_input_tokens_seen": 189300500, "step": 8806, "time_per_iteration": 3.185899257659912 }, { "auxiliary_loss_clip": 0.01112977, "auxiliary_loss_mlp": 0.01034283, "balance_loss_clip": 1.04236031, "balance_loss_mlp": 1.02053213, "epoch": 0.5295054862468059, "flos": 17749316407680.0, "grad_norm": 1.5696878511475738, "language_loss": 0.72756052, "learning_rate": 1.9059796685366026e-06, "loss": 0.74903309, "num_input_tokens_seen": 189319745, "step": 8807, "time_per_iteration": 2.652667284011841 }, { "auxiliary_loss_clip": 0.01079975, "auxiliary_loss_mlp": 0.01030639, "balance_loss_clip": 1.04053009, "balance_loss_mlp": 1.01760888, "epoch": 0.529565609499474, "flos": 11397401084160.0, "grad_norm": 2.191041401806776, "language_loss": 0.69626606, "learning_rate": 1.9055906392247723e-06, "loss": 0.71737224, "num_input_tokens_seen": 189334550, "step": 8808, "time_per_iteration": 2.6991183757781982 }, { "auxiliary_loss_clip": 0.01109251, "auxiliary_loss_mlp": 0.01032489, "balance_loss_clip": 1.041991, "balance_loss_mlp": 1.01962066, "epoch": 0.5296257327521419, "flos": 17196363463680.0, "grad_norm": 1.8261828078243632, "language_loss": 0.8653447, "learning_rate": 1.9052016134929554e-06, "loss": 0.88676214, "num_input_tokens_seen": 189351735, "step": 8809, "time_per_iteration": 2.5995731353759766 }, { "auxiliary_loss_clip": 0.0111469, "auxiliary_loss_mlp": 0.01041403, "balance_loss_clip": 1.04281509, "balance_loss_mlp": 1.02607894, "epoch": 0.5296858560048099, "flos": 39964086777600.0, "grad_norm": 1.9222242916722383, "language_loss": 0.64388674, "learning_rate": 1.9048125913559016e-06, "loss": 0.66544765, "num_input_tokens_seen": 189373105, "step": 8810, "time_per_iteration": 2.776230573654175 }, { "auxiliary_loss_clip": 0.01119011, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.04296374, "balance_loss_mlp": 1.02509344, "epoch": 0.5297459792574778, "flos": 20961418129920.0, "grad_norm": 1.8063937788931883, "language_loss": 0.68213391, "learning_rate": 1.9044235728283646e-06, "loss": 0.70371044, "num_input_tokens_seen": 189394615, "step": 8811, "time_per_iteration": 2.684617757797241 }, { "auxiliary_loss_clip": 0.01007367, "auxiliary_loss_mlp": 0.0100546, "balance_loss_clip": 1.01854634, "balance_loss_mlp": 1.00402915, "epoch": 0.5298061025101458, "flos": 66523620389760.0, "grad_norm": 0.689972629111167, "language_loss": 0.53345251, "learning_rate": 1.9040345579250953e-06, "loss": 0.55358076, "num_input_tokens_seen": 189459750, "step": 8812, "time_per_iteration": 3.3905134201049805 }, { "auxiliary_loss_clip": 0.01023218, "auxiliary_loss_mlp": 0.01004548, "balance_loss_clip": 1.01716316, "balance_loss_mlp": 1.00321257, "epoch": 0.5298662257628137, "flos": 67662994775040.0, "grad_norm": 0.7359658604916758, "language_loss": 0.56288284, "learning_rate": 1.9036455466608453e-06, "loss": 0.58316052, "num_input_tokens_seen": 189527540, "step": 8813, "time_per_iteration": 3.2840702533721924 }, { "auxiliary_loss_clip": 0.01064136, "auxiliary_loss_mlp": 0.01033208, "balance_loss_clip": 1.0387466, "balance_loss_mlp": 1.01986289, "epoch": 0.5299263490154817, "flos": 19646405216640.0, "grad_norm": 1.8723589062576662, "language_loss": 0.81484783, "learning_rate": 1.9032565390503657e-06, "loss": 0.83582127, "num_input_tokens_seen": 189546900, "step": 8814, "time_per_iteration": 2.7889370918273926 }, { "auxiliary_loss_clip": 0.01129463, "auxiliary_loss_mlp": 0.01035706, "balance_loss_clip": 1.04835963, "balance_loss_mlp": 1.02225351, "epoch": 0.5299864722681497, "flos": 22055005653120.0, "grad_norm": 1.8736963674991467, "language_loss": 0.85159796, "learning_rate": 1.9028675351084076e-06, "loss": 0.87324965, "num_input_tokens_seen": 189566490, "step": 8815, "time_per_iteration": 2.588376998901367 }, { "auxiliary_loss_clip": 0.01119356, "auxiliary_loss_mlp": 0.01030819, "balance_loss_clip": 1.04443836, "balance_loss_mlp": 1.01802766, "epoch": 0.5300465955208177, "flos": 21763698353280.0, "grad_norm": 2.360835312755498, "language_loss": 0.66173548, "learning_rate": 1.9024785348497225e-06, "loss": 0.6832372, "num_input_tokens_seen": 189585580, "step": 8816, "time_per_iteration": 2.6367204189300537 }, { "auxiliary_loss_clip": 0.01098885, "auxiliary_loss_mlp": 0.01037316, "balance_loss_clip": 1.04165578, "balance_loss_mlp": 1.02370238, "epoch": 0.5301067187734857, "flos": 42996491735040.0, "grad_norm": 1.8428826452353317, "language_loss": 0.72204578, "learning_rate": 1.9020895382890611e-06, "loss": 0.74340779, "num_input_tokens_seen": 189608485, "step": 8817, "time_per_iteration": 2.8511815071105957 }, { "auxiliary_loss_clip": 0.01093351, "auxiliary_loss_mlp": 0.0103512, "balance_loss_clip": 1.03981018, "balance_loss_mlp": 1.01959896, "epoch": 0.5301668420261536, "flos": 20554298403840.0, "grad_norm": 1.7783802077805728, "language_loss": 0.65400332, "learning_rate": 1.9017005454411743e-06, "loss": 0.67528808, "num_input_tokens_seen": 189627815, "step": 8818, "time_per_iteration": 2.757228374481201 }, { "auxiliary_loss_clip": 0.01075022, "auxiliary_loss_mlp": 0.01033272, "balance_loss_clip": 1.04101062, "balance_loss_mlp": 1.01816273, "epoch": 0.5302269652788216, "flos": 17486665182720.0, "grad_norm": 1.8529738404346974, "language_loss": 0.75020683, "learning_rate": 1.9013115563208126e-06, "loss": 0.77128971, "num_input_tokens_seen": 189644850, "step": 8819, "time_per_iteration": 2.7458016872406006 }, { "auxiliary_loss_clip": 0.01088004, "auxiliary_loss_mlp": 0.01047287, "balance_loss_clip": 1.04190588, "balance_loss_mlp": 1.03143191, "epoch": 0.5302870885314895, "flos": 14574202715520.0, "grad_norm": 2.236781046268797, "language_loss": 0.81955135, "learning_rate": 1.9009225709427267e-06, "loss": 0.84090424, "num_input_tokens_seen": 189660945, "step": 8820, "time_per_iteration": 2.7917025089263916 }, { "auxiliary_loss_clip": 0.01101102, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.04137421, "balance_loss_mlp": 1.02192223, "epoch": 0.5303472117841576, "flos": 23438032968960.0, "grad_norm": 1.5105877277652986, "language_loss": 0.72733676, "learning_rate": 1.9005335893216667e-06, "loss": 0.74869215, "num_input_tokens_seen": 189680425, "step": 8821, "time_per_iteration": 2.664912462234497 }, { "auxiliary_loss_clip": 0.01092575, "auxiliary_loss_mlp": 0.01032249, "balance_loss_clip": 1.04237318, "balance_loss_mlp": 1.01958346, "epoch": 0.5304073350368255, "flos": 22709010533760.0, "grad_norm": 1.4432589414019072, "language_loss": 0.74112785, "learning_rate": 1.9001446114723824e-06, "loss": 0.76237607, "num_input_tokens_seen": 189700375, "step": 8822, "time_per_iteration": 2.7494471073150635 }, { "auxiliary_loss_clip": 0.01087967, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.03945005, "balance_loss_mlp": 1.02029884, "epoch": 0.5304674582894935, "flos": 27928554624000.0, "grad_norm": 1.6561028390766985, "language_loss": 0.67739707, "learning_rate": 1.8997556374096257e-06, "loss": 0.69862658, "num_input_tokens_seen": 189721225, "step": 8823, "time_per_iteration": 2.8298280239105225 }, { "auxiliary_loss_clip": 0.01127487, "auxiliary_loss_mlp": 0.01042695, "balance_loss_clip": 1.0455004, "balance_loss_mlp": 1.02722192, "epoch": 0.5305275815421614, "flos": 21250642440960.0, "grad_norm": 1.7679489191905855, "language_loss": 0.69459474, "learning_rate": 1.8993666671481444e-06, "loss": 0.71629655, "num_input_tokens_seen": 189740170, "step": 8824, "time_per_iteration": 2.7093706130981445 }, { "auxiliary_loss_clip": 0.01098459, "auxiliary_loss_mlp": 0.00770579, "balance_loss_clip": 1.04351103, "balance_loss_mlp": 1.00028551, "epoch": 0.5305877047948294, "flos": 17603088140160.0, "grad_norm": 2.079936946962719, "language_loss": 0.7578221, "learning_rate": 1.898977700702689e-06, "loss": 0.77651244, "num_input_tokens_seen": 189757890, "step": 8825, "time_per_iteration": 2.7240397930145264 }, { "auxiliary_loss_clip": 0.01042177, "auxiliary_loss_mlp": 0.01041743, "balance_loss_clip": 1.03510904, "balance_loss_mlp": 1.02771175, "epoch": 0.5306478280474973, "flos": 15195493284480.0, "grad_norm": 1.902170532497994, "language_loss": 0.85671568, "learning_rate": 1.8985887380880103e-06, "loss": 0.87755489, "num_input_tokens_seen": 189775390, "step": 8826, "time_per_iteration": 2.786893367767334 }, { "auxiliary_loss_clip": 0.0112111, "auxiliary_loss_mlp": 0.0103337, "balance_loss_clip": 1.04376101, "balance_loss_mlp": 1.01967907, "epoch": 0.5307079513001653, "flos": 15341218761600.0, "grad_norm": 1.3295158202050776, "language_loss": 0.64655942, "learning_rate": 1.8981997793188558e-06, "loss": 0.66810423, "num_input_tokens_seen": 189793975, "step": 8827, "time_per_iteration": 2.650259017944336 }, { "auxiliary_loss_clip": 0.01100521, "auxiliary_loss_mlp": 0.01041689, "balance_loss_clip": 1.04230511, "balance_loss_mlp": 1.02720535, "epoch": 0.5307680745528333, "flos": 43544452688640.0, "grad_norm": 1.5763280036459053, "language_loss": 0.60055244, "learning_rate": 1.8978108244099762e-06, "loss": 0.62197453, "num_input_tokens_seen": 189817870, "step": 8828, "time_per_iteration": 2.9273712635040283 }, { "auxiliary_loss_clip": 0.01115165, "auxiliary_loss_mlp": 0.01032955, "balance_loss_clip": 1.04400516, "balance_loss_mlp": 1.01779199, "epoch": 0.5308281978055013, "flos": 20048928001920.0, "grad_norm": 1.6623375431972864, "language_loss": 0.81171465, "learning_rate": 1.8974218733761208e-06, "loss": 0.83319587, "num_input_tokens_seen": 189837905, "step": 8829, "time_per_iteration": 2.6640090942382812 }, { "auxiliary_loss_clip": 0.01104846, "auxiliary_loss_mlp": 0.01035043, "balance_loss_clip": 1.043993, "balance_loss_mlp": 1.02136946, "epoch": 0.5308883210581693, "flos": 20703938463360.0, "grad_norm": 1.3895948203919835, "language_loss": 0.78245443, "learning_rate": 1.8970329262320375e-06, "loss": 0.80385327, "num_input_tokens_seen": 189856970, "step": 8830, "time_per_iteration": 2.736316680908203 }, { "auxiliary_loss_clip": 0.01111385, "auxiliary_loss_mlp": 0.01033264, "balance_loss_clip": 1.04335451, "balance_loss_mlp": 1.02036524, "epoch": 0.5309484443108372, "flos": 14355506759040.0, "grad_norm": 2.4391763831493165, "language_loss": 0.8031435, "learning_rate": 1.8966439829924768e-06, "loss": 0.82458997, "num_input_tokens_seen": 189872830, "step": 8831, "time_per_iteration": 2.6151957511901855 }, { "auxiliary_loss_clip": 0.01108777, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.0430057, "balance_loss_mlp": 1.01951742, "epoch": 0.5310085675635052, "flos": 20010503427840.0, "grad_norm": 4.592110703983282, "language_loss": 0.73025942, "learning_rate": 1.896255043672186e-06, "loss": 0.75167632, "num_input_tokens_seen": 189891635, "step": 8832, "time_per_iteration": 2.6464226245880127 }, { "auxiliary_loss_clip": 0.01089691, "auxiliary_loss_mlp": 0.01036866, "balance_loss_clip": 1.04126275, "balance_loss_mlp": 1.02198887, "epoch": 0.5310686908161731, "flos": 22127293774080.0, "grad_norm": 2.4188792138763513, "language_loss": 0.75694382, "learning_rate": 1.8958661082859143e-06, "loss": 0.77820939, "num_input_tokens_seen": 189909050, "step": 8833, "time_per_iteration": 2.757716178894043 }, { "auxiliary_loss_clip": 0.01087272, "auxiliary_loss_mlp": 0.01036493, "balance_loss_clip": 1.03743505, "balance_loss_mlp": 1.02260494, "epoch": 0.5311288140688412, "flos": 24717889445760.0, "grad_norm": 1.6684529348681687, "language_loss": 0.73618537, "learning_rate": 1.8954771768484103e-06, "loss": 0.75742298, "num_input_tokens_seen": 189927405, "step": 8834, "time_per_iteration": 2.7447376251220703 }, { "auxiliary_loss_clip": 0.01127832, "auxiliary_loss_mlp": 0.01042563, "balance_loss_clip": 1.04435921, "balance_loss_mlp": 1.02734029, "epoch": 0.5311889373215091, "flos": 24097712198400.0, "grad_norm": 1.9940250251862053, "language_loss": 0.77417272, "learning_rate": 1.8950882493744226e-06, "loss": 0.79587668, "num_input_tokens_seen": 189947740, "step": 8835, "time_per_iteration": 2.654860734939575 }, { "auxiliary_loss_clip": 0.01097251, "auxiliary_loss_mlp": 0.01046402, "balance_loss_clip": 1.04259109, "balance_loss_mlp": 1.03138208, "epoch": 0.5312490605741771, "flos": 22017012042240.0, "grad_norm": 2.4706637723930505, "language_loss": 0.72355223, "learning_rate": 1.8946993258786985e-06, "loss": 0.7449888, "num_input_tokens_seen": 189966495, "step": 8836, "time_per_iteration": 2.694772243499756 }, { "auxiliary_loss_clip": 0.01104585, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.04374099, "balance_loss_mlp": 1.02537167, "epoch": 0.531309183826845, "flos": 19390541662080.0, "grad_norm": 1.705704926785557, "language_loss": 0.81026083, "learning_rate": 1.894310406375987e-06, "loss": 0.8317101, "num_input_tokens_seen": 189985325, "step": 8837, "time_per_iteration": 4.218893527984619 }, { "auxiliary_loss_clip": 0.01107393, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.04489708, "balance_loss_mlp": 1.02216005, "epoch": 0.531369307079513, "flos": 20190056538240.0, "grad_norm": 1.8031911656804687, "language_loss": 0.8618502, "learning_rate": 1.893921490881035e-06, "loss": 0.88328624, "num_input_tokens_seen": 190003290, "step": 8838, "time_per_iteration": 4.327972888946533 }, { "auxiliary_loss_clip": 0.01097617, "auxiliary_loss_mlp": 0.01036447, "balance_loss_clip": 1.04136765, "balance_loss_mlp": 1.02366185, "epoch": 0.5314294303321809, "flos": 18880143356160.0, "grad_norm": 1.7768925166398193, "language_loss": 0.72961235, "learning_rate": 1.8935325794085906e-06, "loss": 0.75095296, "num_input_tokens_seen": 190023260, "step": 8839, "time_per_iteration": 4.2734081745147705 }, { "auxiliary_loss_clip": 0.0110159, "auxiliary_loss_mlp": 0.01042278, "balance_loss_clip": 1.04086304, "balance_loss_mlp": 1.02885473, "epoch": 0.531489553584849, "flos": 23040035297280.0, "grad_norm": 1.7238696185302183, "language_loss": 0.76902539, "learning_rate": 1.8931436719734023e-06, "loss": 0.79046404, "num_input_tokens_seen": 190042035, "step": 8840, "time_per_iteration": 2.708387613296509 }, { "auxiliary_loss_clip": 0.01085488, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.04072022, "balance_loss_mlp": 1.01934612, "epoch": 0.5315496768375169, "flos": 19790478668160.0, "grad_norm": 2.0047240823259385, "language_loss": 0.77301592, "learning_rate": 1.892754768590216e-06, "loss": 0.7942099, "num_input_tokens_seen": 190057545, "step": 8841, "time_per_iteration": 2.6982758045196533 }, { "auxiliary_loss_clip": 0.0102526, "auxiliary_loss_mlp": 0.01022764, "balance_loss_clip": 1.01826656, "balance_loss_mlp": 1.02119017, "epoch": 0.5316098000901849, "flos": 71023228185600.0, "grad_norm": 0.6981779601463162, "language_loss": 0.56741858, "learning_rate": 1.8923658692737793e-06, "loss": 0.58789885, "num_input_tokens_seen": 190123800, "step": 8842, "time_per_iteration": 4.895024299621582 }, { "auxiliary_loss_clip": 0.01102673, "auxiliary_loss_mlp": 0.01041259, "balance_loss_clip": 1.04331183, "balance_loss_mlp": 1.02621484, "epoch": 0.5316699233428529, "flos": 16435560470400.0, "grad_norm": 1.8735975877067965, "language_loss": 0.73998511, "learning_rate": 1.8919769740388407e-06, "loss": 0.76142448, "num_input_tokens_seen": 190141625, "step": 8843, "time_per_iteration": 2.66169810295105 }, { "auxiliary_loss_clip": 0.01023627, "auxiliary_loss_mlp": 0.0100589, "balance_loss_clip": 1.01690733, "balance_loss_mlp": 1.00456095, "epoch": 0.5317300465955208, "flos": 67420814302080.0, "grad_norm": 0.8814346849515853, "language_loss": 0.61057651, "learning_rate": 1.891588082900145e-06, "loss": 0.63087165, "num_input_tokens_seen": 190198110, "step": 8844, "time_per_iteration": 3.297545909881592 }, { "auxiliary_loss_clip": 0.01032752, "auxiliary_loss_mlp": 0.01005725, "balance_loss_clip": 1.01528263, "balance_loss_mlp": 1.00425863, "epoch": 0.5317901698481888, "flos": 59508075340800.0, "grad_norm": 0.8422745451421196, "language_loss": 0.62147105, "learning_rate": 1.8911991958724411e-06, "loss": 0.64185584, "num_input_tokens_seen": 190259950, "step": 8845, "time_per_iteration": 3.1747312545776367 }, { "auxiliary_loss_clip": 0.01088974, "auxiliary_loss_mlp": 0.01040872, "balance_loss_clip": 1.04063165, "balance_loss_mlp": 1.02521944, "epoch": 0.5318502931008567, "flos": 19129219240320.0, "grad_norm": 1.8386701394288745, "language_loss": 0.74980247, "learning_rate": 1.890810312970474e-06, "loss": 0.77110094, "num_input_tokens_seen": 190278265, "step": 8846, "time_per_iteration": 2.734652519226074 }, { "auxiliary_loss_clip": 0.01111858, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.04369533, "balance_loss_mlp": 1.0226109, "epoch": 0.5319104163535248, "flos": 24681045070080.0, "grad_norm": 1.562458752543025, "language_loss": 0.75478411, "learning_rate": 1.8904214342089903e-06, "loss": 0.77625251, "num_input_tokens_seen": 190298400, "step": 8847, "time_per_iteration": 2.7175981998443604 }, { "auxiliary_loss_clip": 0.0110005, "auxiliary_loss_mlp": 0.01032122, "balance_loss_clip": 1.04175198, "balance_loss_mlp": 1.0193609, "epoch": 0.5319705396061927, "flos": 19385513758080.0, "grad_norm": 1.5938668259379032, "language_loss": 0.87875456, "learning_rate": 1.8900325596027378e-06, "loss": 0.90007627, "num_input_tokens_seen": 190316235, "step": 8848, "time_per_iteration": 2.777731418609619 }, { "auxiliary_loss_clip": 0.01084561, "auxiliary_loss_mlp": 0.01041363, "balance_loss_clip": 1.04119325, "balance_loss_mlp": 1.02549624, "epoch": 0.5320306628588607, "flos": 18259319664000.0, "grad_norm": 2.1051582434291833, "language_loss": 0.74326992, "learning_rate": 1.8896436891664609e-06, "loss": 0.76452917, "num_input_tokens_seen": 190335060, "step": 8849, "time_per_iteration": 2.7248313426971436 }, { "auxiliary_loss_clip": 0.01107496, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.03895473, "balance_loss_mlp": 1.0154624, "epoch": 0.5320907861115286, "flos": 23732321097600.0, "grad_norm": 1.8915242874982603, "language_loss": 0.79657137, "learning_rate": 1.8892548229149066e-06, "loss": 0.81794947, "num_input_tokens_seen": 190353265, "step": 8850, "time_per_iteration": 2.7357401847839355 }, { "auxiliary_loss_clip": 0.01121659, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.04192996, "balance_loss_mlp": 1.01804209, "epoch": 0.5321509093641966, "flos": 34495251321600.0, "grad_norm": 1.633301633467878, "language_loss": 0.55076206, "learning_rate": 1.888865960862821e-06, "loss": 0.57229722, "num_input_tokens_seen": 190376575, "step": 8851, "time_per_iteration": 2.730081081390381 }, { "auxiliary_loss_clip": 0.01110617, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.04243159, "balance_loss_mlp": 1.0228914, "epoch": 0.5322110326168645, "flos": 20010934391040.0, "grad_norm": 1.5393101812132837, "language_loss": 0.68206942, "learning_rate": 1.8884771030249484e-06, "loss": 0.70353764, "num_input_tokens_seen": 190395185, "step": 8852, "time_per_iteration": 2.685267925262451 }, { "auxiliary_loss_clip": 0.01020981, "auxiliary_loss_mlp": 0.00752764, "balance_loss_clip": 1.01425028, "balance_loss_mlp": 0.99977398, "epoch": 0.5322711558695326, "flos": 64631164435200.0, "grad_norm": 0.7921902417648442, "language_loss": 0.62794167, "learning_rate": 1.8880882494160357e-06, "loss": 0.64567912, "num_input_tokens_seen": 190452595, "step": 8853, "time_per_iteration": 3.154197931289673 }, { "auxiliary_loss_clip": 0.01113411, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 1.04064846, "balance_loss_mlp": 1.01379788, "epoch": 0.5323312791222005, "flos": 14939342421120.0, "grad_norm": 2.437651920606879, "language_loss": 0.79789698, "learning_rate": 1.8876994000508278e-06, "loss": 0.81930912, "num_input_tokens_seen": 190469140, "step": 8854, "time_per_iteration": 2.6569535732269287 }, { "auxiliary_loss_clip": 0.01092841, "auxiliary_loss_mlp": 0.0102808, "balance_loss_clip": 1.0418992, "balance_loss_mlp": 1.01586115, "epoch": 0.5323914023748685, "flos": 23440834229760.0, "grad_norm": 1.7182223658194644, "language_loss": 0.73290253, "learning_rate": 1.8873105549440698e-06, "loss": 0.75411177, "num_input_tokens_seen": 190489015, "step": 8855, "time_per_iteration": 2.6984002590179443 }, { "auxiliary_loss_clip": 0.01095667, "auxiliary_loss_mlp": 0.0077104, "balance_loss_clip": 1.03969502, "balance_loss_mlp": 1.00030267, "epoch": 0.5324515256275365, "flos": 26286180134400.0, "grad_norm": 1.9960339019119333, "language_loss": 0.6505388, "learning_rate": 1.886921714110507e-06, "loss": 0.66920584, "num_input_tokens_seen": 190508065, "step": 8856, "time_per_iteration": 2.7057278156280518 }, { "auxiliary_loss_clip": 0.01100444, "auxiliary_loss_mlp": 0.0103908, "balance_loss_clip": 1.04079795, "balance_loss_mlp": 1.02341616, "epoch": 0.5325116488802044, "flos": 26870913636480.0, "grad_norm": 2.078757662178109, "language_loss": 0.77651089, "learning_rate": 1.8865328775648842e-06, "loss": 0.79790616, "num_input_tokens_seen": 190527045, "step": 8857, "time_per_iteration": 2.764199733734131 }, { "auxiliary_loss_clip": 0.01092407, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.04279578, "balance_loss_mlp": 1.02039194, "epoch": 0.5325717721328724, "flos": 25884734757120.0, "grad_norm": 2.3746118235231592, "language_loss": 0.70823711, "learning_rate": 1.8861440453219456e-06, "loss": 0.72950727, "num_input_tokens_seen": 190544075, "step": 8858, "time_per_iteration": 2.735534191131592 }, { "auxiliary_loss_clip": 0.01108427, "auxiliary_loss_mlp": 0.01040186, "balance_loss_clip": 1.0411067, "balance_loss_mlp": 1.02518916, "epoch": 0.5326318953855403, "flos": 21799321666560.0, "grad_norm": 1.83105211007431, "language_loss": 0.69232476, "learning_rate": 1.8857552173964367e-06, "loss": 0.71381092, "num_input_tokens_seen": 190566030, "step": 8859, "time_per_iteration": 2.773764133453369 }, { "auxiliary_loss_clip": 0.01109944, "auxiliary_loss_mlp": 0.01028838, "balance_loss_clip": 1.04517436, "balance_loss_mlp": 1.01671481, "epoch": 0.5326920186382084, "flos": 20922921728640.0, "grad_norm": 1.8423028887831514, "language_loss": 0.69617528, "learning_rate": 1.8853663938031013e-06, "loss": 0.71756315, "num_input_tokens_seen": 190585605, "step": 8860, "time_per_iteration": 2.689471483230591 }, { "auxiliary_loss_clip": 0.01102885, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.0451107, "balance_loss_mlp": 1.02258921, "epoch": 0.5327521418908763, "flos": 21433427775360.0, "grad_norm": 2.3281195979693297, "language_loss": 0.78340018, "learning_rate": 1.884977574556683e-06, "loss": 0.80478734, "num_input_tokens_seen": 190604625, "step": 8861, "time_per_iteration": 2.66679048538208 }, { "auxiliary_loss_clip": 0.01077125, "auxiliary_loss_mlp": 0.01040454, "balance_loss_clip": 1.03987145, "balance_loss_mlp": 1.02606571, "epoch": 0.5328122651435443, "flos": 21760250647680.0, "grad_norm": 1.7664447291359346, "language_loss": 0.85554659, "learning_rate": 1.8845887596719279e-06, "loss": 0.87672234, "num_input_tokens_seen": 190625060, "step": 8862, "time_per_iteration": 2.7928006649017334 }, { "auxiliary_loss_clip": 0.0109879, "auxiliary_loss_mlp": 0.01039782, "balance_loss_clip": 1.03952289, "balance_loss_mlp": 1.0237242, "epoch": 0.5328723883962122, "flos": 18296487262080.0, "grad_norm": 2.2696975914187116, "language_loss": 0.62147439, "learning_rate": 1.8841999491635778e-06, "loss": 0.64286011, "num_input_tokens_seen": 190643150, "step": 8863, "time_per_iteration": 2.685253381729126 }, { "auxiliary_loss_clip": 0.01098767, "auxiliary_loss_mlp": 0.01040661, "balance_loss_clip": 1.04511809, "balance_loss_mlp": 1.02661765, "epoch": 0.5329325116488802, "flos": 25374911068800.0, "grad_norm": 1.8529881391436633, "language_loss": 0.73310483, "learning_rate": 1.883811143046377e-06, "loss": 0.75449914, "num_input_tokens_seen": 190662725, "step": 8864, "time_per_iteration": 2.703639030456543 }, { "auxiliary_loss_clip": 0.01120661, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.04301071, "balance_loss_mlp": 1.02275968, "epoch": 0.5329926349015481, "flos": 25592098654080.0, "grad_norm": 1.6333657309737846, "language_loss": 0.64201105, "learning_rate": 1.8834223413350702e-06, "loss": 0.66357499, "num_input_tokens_seen": 190683680, "step": 8865, "time_per_iteration": 2.691087245941162 }, { "auxiliary_loss_clip": 0.01113033, "auxiliary_loss_mlp": 0.01029706, "balance_loss_clip": 1.0424211, "balance_loss_mlp": 1.01641965, "epoch": 0.5330527581542162, "flos": 22889605138560.0, "grad_norm": 3.0767575494694985, "language_loss": 0.78091645, "learning_rate": 1.8830335440443989e-06, "loss": 0.80234385, "num_input_tokens_seen": 190703350, "step": 8866, "time_per_iteration": 2.674612283706665 }, { "auxiliary_loss_clip": 0.01108068, "auxiliary_loss_mlp": 0.0103023, "balance_loss_clip": 1.04092908, "balance_loss_mlp": 1.01696241, "epoch": 0.5331128814068841, "flos": 16026752805120.0, "grad_norm": 1.842224927457961, "language_loss": 0.73840493, "learning_rate": 1.882644751189108e-06, "loss": 0.75978798, "num_input_tokens_seen": 190721170, "step": 8867, "time_per_iteration": 2.6963648796081543 }, { "auxiliary_loss_clip": 0.01098718, "auxiliary_loss_mlp": 0.01039247, "balance_loss_clip": 1.04040504, "balance_loss_mlp": 1.02402985, "epoch": 0.5331730046595521, "flos": 39344699629440.0, "grad_norm": 1.5703549780422514, "language_loss": 0.71881396, "learning_rate": 1.88225596278394e-06, "loss": 0.74019361, "num_input_tokens_seen": 190743795, "step": 8868, "time_per_iteration": 2.830118417739868 }, { "auxiliary_loss_clip": 0.01090763, "auxiliary_loss_mlp": 0.01034625, "balance_loss_clip": 1.04197335, "balance_loss_mlp": 1.0212791, "epoch": 0.5332331279122201, "flos": 24024382583040.0, "grad_norm": 5.550281122060094, "language_loss": 0.78397369, "learning_rate": 1.881867178843637e-06, "loss": 0.80522758, "num_input_tokens_seen": 190761560, "step": 8869, "time_per_iteration": 2.738565444946289 }, { "auxiliary_loss_clip": 0.01114623, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.0432862, "balance_loss_mlp": 1.02336633, "epoch": 0.533293251164888, "flos": 17129318728320.0, "grad_norm": 1.7588400416982446, "language_loss": 0.75840724, "learning_rate": 1.8814783993829434e-06, "loss": 0.77992487, "num_input_tokens_seen": 190778875, "step": 8870, "time_per_iteration": 2.598963499069214 }, { "auxiliary_loss_clip": 0.01100618, "auxiliary_loss_mlp": 0.01038316, "balance_loss_clip": 1.04231286, "balance_loss_mlp": 1.02373052, "epoch": 0.533353374417556, "flos": 22126360020480.0, "grad_norm": 5.617051153369423, "language_loss": 0.75663799, "learning_rate": 1.8810896244165997e-06, "loss": 0.7780273, "num_input_tokens_seen": 190799830, "step": 8871, "time_per_iteration": 2.7459628582000732 }, { "auxiliary_loss_clip": 0.01099152, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.04201055, "balance_loss_mlp": 1.0202924, "epoch": 0.533413497670224, "flos": 15011091838080.0, "grad_norm": 1.8041252581471448, "language_loss": 0.7247498, "learning_rate": 1.8807008539593498e-06, "loss": 0.74607694, "num_input_tokens_seen": 190817155, "step": 8872, "time_per_iteration": 2.6604373455047607 }, { "auxiliary_loss_clip": 0.01100126, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.04733372, "balance_loss_mlp": 1.02694392, "epoch": 0.533473620922892, "flos": 19609955890560.0, "grad_norm": 1.7875555414889834, "language_loss": 0.65306997, "learning_rate": 1.880312088025936e-06, "loss": 0.67448598, "num_input_tokens_seen": 190835240, "step": 8873, "time_per_iteration": 2.6587424278259277 }, { "auxiliary_loss_clip": 0.01098214, "auxiliary_loss_mlp": 0.0104372, "balance_loss_clip": 1.04254389, "balance_loss_mlp": 1.03035116, "epoch": 0.5335337441755599, "flos": 14282644020480.0, "grad_norm": 2.157272820213575, "language_loss": 0.80225539, "learning_rate": 1.879923326631099e-06, "loss": 0.82367474, "num_input_tokens_seen": 190851620, "step": 8874, "time_per_iteration": 2.723454475402832 }, { "auxiliary_loss_clip": 0.01112328, "auxiliary_loss_mlp": 0.01030163, "balance_loss_clip": 1.04300058, "balance_loss_mlp": 1.01653171, "epoch": 0.5335938674282279, "flos": 20814830726400.0, "grad_norm": 1.8315602861194333, "language_loss": 0.69789159, "learning_rate": 1.879534569789582e-06, "loss": 0.71931654, "num_input_tokens_seen": 190870545, "step": 8875, "time_per_iteration": 2.6051578521728516 }, { "auxiliary_loss_clip": 0.01045431, "auxiliary_loss_mlp": 0.01001312, "balance_loss_clip": 1.01922286, "balance_loss_mlp": 0.99979252, "epoch": 0.5336539906808958, "flos": 71396448451200.0, "grad_norm": 0.7211200965927701, "language_loss": 0.59631079, "learning_rate": 1.879145817516126e-06, "loss": 0.61677825, "num_input_tokens_seen": 190931995, "step": 8876, "time_per_iteration": 3.3114185333251953 }, { "auxiliary_loss_clip": 0.01113481, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.04467189, "balance_loss_mlp": 1.02705741, "epoch": 0.5337141139335638, "flos": 20152996680960.0, "grad_norm": 1.6786856291224888, "language_loss": 0.74847406, "learning_rate": 1.8787570698254727e-06, "loss": 0.77000904, "num_input_tokens_seen": 190949890, "step": 8877, "time_per_iteration": 4.474783182144165 }, { "auxiliary_loss_clip": 0.01030394, "auxiliary_loss_mlp": 0.01002162, "balance_loss_clip": 1.01585436, "balance_loss_mlp": 1.00046921, "epoch": 0.5337742371862317, "flos": 67728387484800.0, "grad_norm": 0.7582021069840851, "language_loss": 0.57155037, "learning_rate": 1.8783683267323629e-06, "loss": 0.59187591, "num_input_tokens_seen": 191008480, "step": 8878, "time_per_iteration": 4.623803615570068 }, { "auxiliary_loss_clip": 0.0112711, "auxiliary_loss_mlp": 0.0103613, "balance_loss_clip": 1.04414368, "balance_loss_mlp": 1.02169418, "epoch": 0.5338343604388998, "flos": 25008909436800.0, "grad_norm": 1.4672061419232192, "language_loss": 0.72301328, "learning_rate": 1.8779795882515395e-06, "loss": 0.74464571, "num_input_tokens_seen": 191028995, "step": 8879, "time_per_iteration": 2.646631956100464 }, { "auxiliary_loss_clip": 0.01126385, "auxiliary_loss_mlp": 0.01039416, "balance_loss_clip": 1.04535294, "balance_loss_mlp": 1.02487254, "epoch": 0.5338944836915677, "flos": 17601256546560.0, "grad_norm": 2.878615745391383, "language_loss": 0.83403212, "learning_rate": 1.8775908543977416e-06, "loss": 0.85569012, "num_input_tokens_seen": 191045285, "step": 8880, "time_per_iteration": 2.578953504562378 }, { "auxiliary_loss_clip": 0.01053817, "auxiliary_loss_mlp": 0.01036139, "balance_loss_clip": 1.03627348, "balance_loss_mlp": 1.02279377, "epoch": 0.5339546069442357, "flos": 21724124544000.0, "grad_norm": 1.3711441541735603, "language_loss": 0.79637486, "learning_rate": 1.8772021251857107e-06, "loss": 0.81727445, "num_input_tokens_seen": 191066105, "step": 8881, "time_per_iteration": 4.335238695144653 }, { "auxiliary_loss_clip": 0.0102058, "auxiliary_loss_mlp": 0.00999984, "balance_loss_clip": 1.01616335, "balance_loss_mlp": 0.99846381, "epoch": 0.5340147301969036, "flos": 69723583315200.0, "grad_norm": 0.7924040124288975, "language_loss": 0.59248376, "learning_rate": 1.8768134006301882e-06, "loss": 0.61268938, "num_input_tokens_seen": 191126315, "step": 8882, "time_per_iteration": 3.1252357959747314 }, { "auxiliary_loss_clip": 0.01025577, "auxiliary_loss_mlp": 0.01019116, "balance_loss_clip": 1.01780772, "balance_loss_mlp": 1.01768577, "epoch": 0.5340748534495716, "flos": 63880701580800.0, "grad_norm": 0.8651438881324313, "language_loss": 0.63574433, "learning_rate": 1.876424680745913e-06, "loss": 0.65619123, "num_input_tokens_seen": 191174240, "step": 8883, "time_per_iteration": 3.0245001316070557 }, { "auxiliary_loss_clip": 0.01079245, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.03873086, "balance_loss_mlp": 1.02523685, "epoch": 0.5341349767022396, "flos": 28694313694080.0, "grad_norm": 2.1049960022330385, "language_loss": 0.8200773, "learning_rate": 1.8760359655476272e-06, "loss": 0.8412708, "num_input_tokens_seen": 191193335, "step": 8884, "time_per_iteration": 2.8096158504486084 }, { "auxiliary_loss_clip": 0.01088886, "auxiliary_loss_mlp": 0.01042992, "balance_loss_clip": 1.0403688, "balance_loss_mlp": 1.02865684, "epoch": 0.5341950999549075, "flos": 16289691338880.0, "grad_norm": 1.6281705583461854, "language_loss": 0.72372848, "learning_rate": 1.8756472550500695e-06, "loss": 0.74504721, "num_input_tokens_seen": 191210900, "step": 8885, "time_per_iteration": 2.6555016040802 }, { "auxiliary_loss_clip": 0.01103878, "auxiliary_loss_mlp": 0.01037971, "balance_loss_clip": 1.04014146, "balance_loss_mlp": 1.02301598, "epoch": 0.5342552232075756, "flos": 14355650413440.0, "grad_norm": 2.9046192596208846, "language_loss": 0.79004246, "learning_rate": 1.87525854926798e-06, "loss": 0.81146097, "num_input_tokens_seen": 191226730, "step": 8886, "time_per_iteration": 2.6476478576660156 }, { "auxiliary_loss_clip": 0.01083524, "auxiliary_loss_mlp": 0.00772223, "balance_loss_clip": 1.04013681, "balance_loss_mlp": 1.00027037, "epoch": 0.5343153464602435, "flos": 30297976300800.0, "grad_norm": 1.5332505330022492, "language_loss": 0.750615, "learning_rate": 1.8748698482160996e-06, "loss": 0.76917243, "num_input_tokens_seen": 191250435, "step": 8887, "time_per_iteration": 2.7690041065216064 }, { "auxiliary_loss_clip": 0.01095123, "auxiliary_loss_mlp": 0.01034453, "balance_loss_clip": 1.03800249, "balance_loss_mlp": 1.02050543, "epoch": 0.5343754697129115, "flos": 15596292216960.0, "grad_norm": 2.322348043408552, "language_loss": 0.68717337, "learning_rate": 1.8744811519091663e-06, "loss": 0.70846909, "num_input_tokens_seen": 191268315, "step": 8888, "time_per_iteration": 2.631999969482422 }, { "auxiliary_loss_clip": 0.01118819, "auxiliary_loss_mlp": 0.01041785, "balance_loss_clip": 1.04266095, "balance_loss_mlp": 1.02738404, "epoch": 0.5344355929655794, "flos": 16909617191040.0, "grad_norm": 2.080624189448151, "language_loss": 0.77346873, "learning_rate": 1.8740924603619208e-06, "loss": 0.79507482, "num_input_tokens_seen": 191287000, "step": 8889, "time_per_iteration": 2.621675729751587 }, { "auxiliary_loss_clip": 0.01122598, "auxiliary_loss_mlp": 0.01042684, "balance_loss_clip": 1.04449213, "balance_loss_mlp": 1.02922511, "epoch": 0.5344957162182474, "flos": 16798186224000.0, "grad_norm": 2.052201989860069, "language_loss": 0.69323713, "learning_rate": 1.873703773589102e-06, "loss": 0.71489, "num_input_tokens_seen": 191304565, "step": 8890, "time_per_iteration": 2.6052801609039307 }, { "auxiliary_loss_clip": 0.01128191, "auxiliary_loss_mlp": 0.01052498, "balance_loss_clip": 1.04494905, "balance_loss_mlp": 1.0359515, "epoch": 0.5345558394709153, "flos": 12705590413440.0, "grad_norm": 2.21737658698942, "language_loss": 0.77022809, "learning_rate": 1.8733150916054483e-06, "loss": 0.79203498, "num_input_tokens_seen": 191318300, "step": 8891, "time_per_iteration": 2.533200263977051 }, { "auxiliary_loss_clip": 0.01103794, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.04030669, "balance_loss_mlp": 1.02807951, "epoch": 0.5346159627235834, "flos": 22455050400000.0, "grad_norm": 2.8109589169570857, "language_loss": 0.74259919, "learning_rate": 1.872926414425699e-06, "loss": 0.76406056, "num_input_tokens_seen": 191337925, "step": 8892, "time_per_iteration": 2.674466609954834 }, { "auxiliary_loss_clip": 0.01107598, "auxiliary_loss_mlp": 0.01038096, "balance_loss_clip": 1.04592252, "balance_loss_mlp": 1.02414215, "epoch": 0.5346760859762513, "flos": 22415763899520.0, "grad_norm": 1.9745937936433648, "language_loss": 0.87865257, "learning_rate": 1.8725377420645932e-06, "loss": 0.90010953, "num_input_tokens_seen": 191357120, "step": 8893, "time_per_iteration": 2.7012922763824463 }, { "auxiliary_loss_clip": 0.0111971, "auxiliary_loss_mlp": 0.01036459, "balance_loss_clip": 1.04291701, "balance_loss_mlp": 1.02377474, "epoch": 0.5347362092289193, "flos": 22816131868800.0, "grad_norm": 1.9421223728327293, "language_loss": 0.72379559, "learning_rate": 1.872149074536869e-06, "loss": 0.74535728, "num_input_tokens_seen": 191375395, "step": 8894, "time_per_iteration": 2.590670108795166 }, { "auxiliary_loss_clip": 0.01111441, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.04253268, "balance_loss_mlp": 1.01799238, "epoch": 0.5347963324815872, "flos": 23219480666880.0, "grad_norm": 1.965554622310178, "language_loss": 0.74611443, "learning_rate": 1.8717604118572648e-06, "loss": 0.76754439, "num_input_tokens_seen": 191395595, "step": 8895, "time_per_iteration": 2.6462347507476807 }, { "auxiliary_loss_clip": 0.01089565, "auxiliary_loss_mlp": 0.01036535, "balance_loss_clip": 1.04067063, "balance_loss_mlp": 1.02246881, "epoch": 0.5348564557342552, "flos": 22601350494720.0, "grad_norm": 1.8148089507657776, "language_loss": 0.76860476, "learning_rate": 1.8713717540405178e-06, "loss": 0.78986579, "num_input_tokens_seen": 191413730, "step": 8896, "time_per_iteration": 2.6798579692840576 }, { "auxiliary_loss_clip": 0.01093639, "auxiliary_loss_mlp": 0.01027964, "balance_loss_clip": 1.04279101, "balance_loss_mlp": 1.01502943, "epoch": 0.5349165789869232, "flos": 18002378701440.0, "grad_norm": 1.8518658883520687, "language_loss": 0.78188956, "learning_rate": 1.8709831011013676e-06, "loss": 0.80310559, "num_input_tokens_seen": 191432400, "step": 8897, "time_per_iteration": 2.6509950160980225 }, { "auxiliary_loss_clip": 0.01113143, "auxiliary_loss_mlp": 0.01032016, "balance_loss_clip": 1.04366183, "balance_loss_mlp": 1.01799703, "epoch": 0.5349767022395912, "flos": 17159770483200.0, "grad_norm": 1.7403204910626056, "language_loss": 0.75393677, "learning_rate": 1.8705944530545509e-06, "loss": 0.7753883, "num_input_tokens_seen": 191448855, "step": 8898, "time_per_iteration": 2.682753086090088 }, { "auxiliary_loss_clip": 0.01037971, "auxiliary_loss_mlp": 0.01005108, "balance_loss_clip": 1.0205543, "balance_loss_mlp": 1.00373161, "epoch": 0.5350368254922592, "flos": 70992058158720.0, "grad_norm": 0.9010106507685076, "language_loss": 0.57955837, "learning_rate": 1.8702058099148052e-06, "loss": 0.59998918, "num_input_tokens_seen": 191519690, "step": 8899, "time_per_iteration": 3.3475701808929443 }, { "auxiliary_loss_clip": 0.01101715, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.04445124, "balance_loss_mlp": 1.02107263, "epoch": 0.5350969487449271, "flos": 27417833095680.0, "grad_norm": 2.547752496503206, "language_loss": 0.69974548, "learning_rate": 1.869817171696868e-06, "loss": 0.72110939, "num_input_tokens_seen": 191539380, "step": 8900, "time_per_iteration": 2.7260618209838867 }, { "auxiliary_loss_clip": 0.01099442, "auxiliary_loss_mlp": 0.01035798, "balance_loss_clip": 1.03943968, "balance_loss_mlp": 1.02212465, "epoch": 0.5351570719975951, "flos": 19316134638720.0, "grad_norm": 1.7903210344042488, "language_loss": 0.71756148, "learning_rate": 1.8694285384154777e-06, "loss": 0.73891389, "num_input_tokens_seen": 191557400, "step": 8901, "time_per_iteration": 2.661510467529297 }, { "auxiliary_loss_clip": 0.01087314, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.03631806, "balance_loss_mlp": 1.02237511, "epoch": 0.535217195250263, "flos": 19828580019840.0, "grad_norm": 1.7989041746924002, "language_loss": 0.77021015, "learning_rate": 1.8690399100853699e-06, "loss": 0.791453, "num_input_tokens_seen": 191575860, "step": 8902, "time_per_iteration": 2.69665789604187 }, { "auxiliary_loss_clip": 0.01087231, "auxiliary_loss_mlp": 0.01041891, "balance_loss_clip": 1.04053831, "balance_loss_mlp": 1.0283792, "epoch": 0.535277318502931, "flos": 22127868391680.0, "grad_norm": 1.509633063766185, "language_loss": 0.70147592, "learning_rate": 1.868651286721281e-06, "loss": 0.72276717, "num_input_tokens_seen": 191595775, "step": 8903, "time_per_iteration": 2.676028251647949 }, { "auxiliary_loss_clip": 0.0111537, "auxiliary_loss_mlp": 0.00772296, "balance_loss_clip": 1.04395127, "balance_loss_mlp": 1.00028765, "epoch": 0.5353374417555989, "flos": 25045897466880.0, "grad_norm": 1.6001480056643833, "language_loss": 0.72911739, "learning_rate": 1.86826266833795e-06, "loss": 0.74799401, "num_input_tokens_seen": 191617785, "step": 8904, "time_per_iteration": 2.7466139793395996 }, { "auxiliary_loss_clip": 0.01099985, "auxiliary_loss_mlp": 0.01041546, "balance_loss_clip": 1.04453778, "balance_loss_mlp": 1.02705002, "epoch": 0.535397565008267, "flos": 19388710068480.0, "grad_norm": 1.8242307652956307, "language_loss": 0.73365581, "learning_rate": 1.8678740549501103e-06, "loss": 0.7550711, "num_input_tokens_seen": 191636900, "step": 8905, "time_per_iteration": 2.772406578063965 }, { "auxiliary_loss_clip": 0.01105525, "auxiliary_loss_mlp": 0.0103776, "balance_loss_clip": 1.04141188, "balance_loss_mlp": 1.02607787, "epoch": 0.5354576882609349, "flos": 21471205904640.0, "grad_norm": 1.6628200467542797, "language_loss": 0.83795619, "learning_rate": 1.8674854465725005e-06, "loss": 0.85938901, "num_input_tokens_seen": 191656720, "step": 8906, "time_per_iteration": 2.7151100635528564 }, { "auxiliary_loss_clip": 0.01115256, "auxiliary_loss_mlp": 0.00771962, "balance_loss_clip": 1.04406035, "balance_loss_mlp": 1.00027847, "epoch": 0.5355178115136029, "flos": 20777519473920.0, "grad_norm": 1.884591574516044, "language_loss": 0.74096596, "learning_rate": 1.8670968432198563e-06, "loss": 0.75983804, "num_input_tokens_seen": 191674445, "step": 8907, "time_per_iteration": 2.6978471279144287 }, { "auxiliary_loss_clip": 0.01106969, "auxiliary_loss_mlp": 0.01040191, "balance_loss_clip": 1.04144001, "balance_loss_mlp": 1.02508759, "epoch": 0.5355779347662708, "flos": 23514020190720.0, "grad_norm": 2.160786888323469, "language_loss": 0.76593792, "learning_rate": 1.866708244906912e-06, "loss": 0.7874096, "num_input_tokens_seen": 191695000, "step": 8908, "time_per_iteration": 2.6536221504211426 }, { "auxiliary_loss_clip": 0.01097449, "auxiliary_loss_mlp": 0.00772377, "balance_loss_clip": 1.04248428, "balance_loss_mlp": 1.00030112, "epoch": 0.5356380580189388, "flos": 20303211358080.0, "grad_norm": 3.03117864576072, "language_loss": 0.740637, "learning_rate": 1.8663196516484055e-06, "loss": 0.75933528, "num_input_tokens_seen": 191713295, "step": 8909, "time_per_iteration": 2.665473461151123 }, { "auxiliary_loss_clip": 0.01082798, "auxiliary_loss_mlp": 0.01042054, "balance_loss_clip": 1.0436362, "balance_loss_mlp": 1.02891159, "epoch": 0.5356981812716068, "flos": 21361642444800.0, "grad_norm": 2.1999922776778233, "language_loss": 0.84319562, "learning_rate": 1.8659310634590702e-06, "loss": 0.86444414, "num_input_tokens_seen": 191732725, "step": 8910, "time_per_iteration": 2.715521812438965 }, { "auxiliary_loss_clip": 0.01102329, "auxiliary_loss_mlp": 0.0103318, "balance_loss_clip": 1.04114723, "balance_loss_mlp": 1.01928067, "epoch": 0.5357583045242748, "flos": 23111246010240.0, "grad_norm": 1.6725390900013062, "language_loss": 0.81822705, "learning_rate": 1.8655424803536427e-06, "loss": 0.8395822, "num_input_tokens_seen": 191753765, "step": 8911, "time_per_iteration": 2.715254068374634 }, { "auxiliary_loss_clip": 0.0108401, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.04012454, "balance_loss_mlp": 1.03019536, "epoch": 0.5358184277769428, "flos": 21141761339520.0, "grad_norm": 5.639232337071921, "language_loss": 0.69078076, "learning_rate": 1.8651539023468585e-06, "loss": 0.71206129, "num_input_tokens_seen": 191773560, "step": 8912, "time_per_iteration": 2.6743216514587402 }, { "auxiliary_loss_clip": 0.01098459, "auxiliary_loss_mlp": 0.01036297, "balance_loss_clip": 1.04129279, "balance_loss_mlp": 1.02273059, "epoch": 0.5358785510296107, "flos": 16282400878080.0, "grad_norm": 2.041064157993178, "language_loss": 0.71507263, "learning_rate": 1.8647653294534509e-06, "loss": 0.73642015, "num_input_tokens_seen": 191791255, "step": 8913, "time_per_iteration": 2.6959731578826904 }, { "auxiliary_loss_clip": 0.01092724, "auxiliary_loss_mlp": 0.01039004, "balance_loss_clip": 1.04161441, "balance_loss_mlp": 1.02512836, "epoch": 0.5359386742822787, "flos": 16976877408000.0, "grad_norm": 1.9206134465038889, "language_loss": 0.72290546, "learning_rate": 1.864376761688156e-06, "loss": 0.74422276, "num_input_tokens_seen": 191809325, "step": 8914, "time_per_iteration": 2.678020477294922 }, { "auxiliary_loss_clip": 0.01104699, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.04611683, "balance_loss_mlp": 1.02468383, "epoch": 0.5359987975349466, "flos": 20812927305600.0, "grad_norm": 1.8719693529557881, "language_loss": 0.70668626, "learning_rate": 1.8639881990657079e-06, "loss": 0.72813135, "num_input_tokens_seen": 191829795, "step": 8915, "time_per_iteration": 2.653940200805664 }, { "auxiliary_loss_clip": 0.01094002, "auxiliary_loss_mlp": 0.01045487, "balance_loss_clip": 1.04047489, "balance_loss_mlp": 1.03118742, "epoch": 0.5360589207876146, "flos": 22199941031040.0, "grad_norm": 1.5982896811499068, "language_loss": 0.74664176, "learning_rate": 1.8635996416008408e-06, "loss": 0.76803666, "num_input_tokens_seen": 191850840, "step": 8916, "time_per_iteration": 4.3477959632873535 }, { "auxiliary_loss_clip": 0.01081313, "auxiliary_loss_mlp": 0.00772126, "balance_loss_clip": 1.04081666, "balance_loss_mlp": 1.00021815, "epoch": 0.5361190440402825, "flos": 31394365084800.0, "grad_norm": 1.8553858112595492, "language_loss": 0.72677946, "learning_rate": 1.863211089308289e-06, "loss": 0.74531382, "num_input_tokens_seen": 191869520, "step": 8917, "time_per_iteration": 2.808074712753296 }, { "auxiliary_loss_clip": 0.01102423, "auxiliary_loss_mlp": 0.01041518, "balance_loss_clip": 1.0441047, "balance_loss_mlp": 1.02715325, "epoch": 0.5361791672929506, "flos": 16069882060800.0, "grad_norm": 1.960367430660897, "language_loss": 0.71014392, "learning_rate": 1.8628225422027865e-06, "loss": 0.73158336, "num_input_tokens_seen": 191887240, "step": 8918, "time_per_iteration": 4.185984134674072 }, { "auxiliary_loss_clip": 0.01106012, "auxiliary_loss_mlp": 0.01036881, "balance_loss_clip": 1.0469594, "balance_loss_mlp": 1.02306461, "epoch": 0.5362392905456185, "flos": 20740926493440.0, "grad_norm": 1.4605213362212828, "language_loss": 0.74976659, "learning_rate": 1.862434000299067e-06, "loss": 0.77119553, "num_input_tokens_seen": 191905690, "step": 8919, "time_per_iteration": 2.694120407104492 }, { "auxiliary_loss_clip": 0.01093376, "auxiliary_loss_mlp": 0.01035953, "balance_loss_clip": 1.04010797, "balance_loss_mlp": 1.02207744, "epoch": 0.5362994137982865, "flos": 17340077779200.0, "grad_norm": 1.9976483392210334, "language_loss": 0.71690488, "learning_rate": 1.862045463611864e-06, "loss": 0.73819816, "num_input_tokens_seen": 191920725, "step": 8920, "time_per_iteration": 2.6273410320281982 }, { "auxiliary_loss_clip": 0.01105087, "auxiliary_loss_mlp": 0.01040608, "balance_loss_clip": 1.03961456, "balance_loss_mlp": 1.02532554, "epoch": 0.5363595370509544, "flos": 42813957795840.0, "grad_norm": 1.3877970230156793, "language_loss": 0.68828928, "learning_rate": 1.8616569321559105e-06, "loss": 0.70974618, "num_input_tokens_seen": 191944645, "step": 8921, "time_per_iteration": 4.31537938117981 }, { "auxiliary_loss_clip": 0.01114121, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.04631782, "balance_loss_mlp": 1.0227288, "epoch": 0.5364196603036224, "flos": 19171953446400.0, "grad_norm": 1.8336561717381605, "language_loss": 0.81926084, "learning_rate": 1.86126840594594e-06, "loss": 0.84076393, "num_input_tokens_seen": 191962265, "step": 8922, "time_per_iteration": 2.6045267581939697 }, { "auxiliary_loss_clip": 0.01117037, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.04637003, "balance_loss_mlp": 1.01782727, "epoch": 0.5364797835562904, "flos": 17931060247680.0, "grad_norm": 2.029402038210475, "language_loss": 0.76969302, "learning_rate": 1.860879884996686e-06, "loss": 0.79117376, "num_input_tokens_seen": 191978850, "step": 8923, "time_per_iteration": 2.627131223678589 }, { "auxiliary_loss_clip": 0.01097305, "auxiliary_loss_mlp": 0.01035531, "balance_loss_clip": 1.04099584, "balance_loss_mlp": 1.02144074, "epoch": 0.5365399068089584, "flos": 30228058477440.0, "grad_norm": 1.4696173336709724, "language_loss": 0.70680726, "learning_rate": 1.8604913693228804e-06, "loss": 0.72813559, "num_input_tokens_seen": 192002000, "step": 8924, "time_per_iteration": 2.7947139739990234 }, { "auxiliary_loss_clip": 0.01093943, "auxiliary_loss_mlp": 0.01040337, "balance_loss_clip": 1.0430336, "balance_loss_mlp": 1.02501917, "epoch": 0.5366000300616264, "flos": 24891696380160.0, "grad_norm": 2.0937693746484456, "language_loss": 0.87335229, "learning_rate": 1.8601028589392558e-06, "loss": 0.8946951, "num_input_tokens_seen": 192019100, "step": 8925, "time_per_iteration": 2.768362045288086 }, { "auxiliary_loss_clip": 0.01123484, "auxiliary_loss_mlp": 0.01031699, "balance_loss_clip": 1.04188776, "balance_loss_mlp": 1.01764417, "epoch": 0.5366601533142943, "flos": 29826649013760.0, "grad_norm": 1.5047259348419413, "language_loss": 0.77962756, "learning_rate": 1.8597143538605455e-06, "loss": 0.80117941, "num_input_tokens_seen": 192041660, "step": 8926, "time_per_iteration": 2.715451955795288 }, { "auxiliary_loss_clip": 0.01087054, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.04502523, "balance_loss_mlp": 1.01944578, "epoch": 0.5367202765669623, "flos": 27199352620800.0, "grad_norm": 1.5425961750104156, "language_loss": 0.66906953, "learning_rate": 1.85932585410148e-06, "loss": 0.69026089, "num_input_tokens_seen": 192063540, "step": 8927, "time_per_iteration": 2.7890443801879883 }, { "auxiliary_loss_clip": 0.0111207, "auxiliary_loss_mlp": 0.0103082, "balance_loss_clip": 1.04044211, "balance_loss_mlp": 1.01719475, "epoch": 0.5367803998196302, "flos": 20229953569920.0, "grad_norm": 1.7627850836145547, "language_loss": 0.73644257, "learning_rate": 1.8589373596767929e-06, "loss": 0.75787145, "num_input_tokens_seen": 192081760, "step": 8928, "time_per_iteration": 2.6679322719573975 }, { "auxiliary_loss_clip": 0.01097621, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.03983617, "balance_loss_mlp": 1.02038312, "epoch": 0.5368405230722982, "flos": 32154629374080.0, "grad_norm": 1.8947277080350169, "language_loss": 0.63138568, "learning_rate": 1.8585488706012154e-06, "loss": 0.65269947, "num_input_tokens_seen": 192101620, "step": 8929, "time_per_iteration": 2.77915620803833 }, { "auxiliary_loss_clip": 0.01112721, "auxiliary_loss_mlp": 0.01035038, "balance_loss_clip": 1.04284871, "balance_loss_mlp": 1.02102494, "epoch": 0.5369006463249661, "flos": 26247935128320.0, "grad_norm": 1.6504217106645076, "language_loss": 0.65814567, "learning_rate": 1.8581603868894781e-06, "loss": 0.67962325, "num_input_tokens_seen": 192121805, "step": 8930, "time_per_iteration": 2.671699285507202 }, { "auxiliary_loss_clip": 0.01070837, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.03888655, "balance_loss_mlp": 1.01519203, "epoch": 0.5369607695776342, "flos": 26211306234240.0, "grad_norm": 1.4657060850123025, "language_loss": 0.67106915, "learning_rate": 1.8577719085563136e-06, "loss": 0.69207126, "num_input_tokens_seen": 192141765, "step": 8931, "time_per_iteration": 2.791450023651123 }, { "auxiliary_loss_clip": 0.0107183, "auxiliary_loss_mlp": 0.01035308, "balance_loss_clip": 1.03937209, "balance_loss_mlp": 1.02028155, "epoch": 0.5370208928303021, "flos": 25009017177600.0, "grad_norm": 1.6675319791175172, "language_loss": 0.76147091, "learning_rate": 1.8573834356164525e-06, "loss": 0.78254229, "num_input_tokens_seen": 192161560, "step": 8932, "time_per_iteration": 2.817074775695801 }, { "auxiliary_loss_clip": 0.0108812, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.04271507, "balance_loss_mlp": 1.02086663, "epoch": 0.5370810160829701, "flos": 31792147274880.0, "grad_norm": 1.7321457922490968, "language_loss": 0.66103363, "learning_rate": 1.8569949680846261e-06, "loss": 0.68226647, "num_input_tokens_seen": 192180190, "step": 8933, "time_per_iteration": 2.7999963760375977 }, { "auxiliary_loss_clip": 0.01106374, "auxiliary_loss_mlp": 0.0077107, "balance_loss_clip": 1.04321599, "balance_loss_mlp": 1.00030327, "epoch": 0.537141139335638, "flos": 23842602829440.0, "grad_norm": 1.7096623259043264, "language_loss": 0.83137345, "learning_rate": 1.856606505975565e-06, "loss": 0.8501479, "num_input_tokens_seen": 192198855, "step": 8934, "time_per_iteration": 2.77140474319458 }, { "auxiliary_loss_clip": 0.01083657, "auxiliary_loss_mlp": 0.01038537, "balance_loss_clip": 1.03906775, "balance_loss_mlp": 1.02371967, "epoch": 0.537201262588306, "flos": 18508826511360.0, "grad_norm": 1.9684207217946548, "language_loss": 0.79907835, "learning_rate": 1.856218049303999e-06, "loss": 0.82030034, "num_input_tokens_seen": 192216555, "step": 8935, "time_per_iteration": 2.714343547821045 }, { "auxiliary_loss_clip": 0.01111571, "auxiliary_loss_mlp": 0.01041616, "balance_loss_clip": 1.04217649, "balance_loss_mlp": 1.02750206, "epoch": 0.537261385840974, "flos": 25662950231040.0, "grad_norm": 2.937428754588345, "language_loss": 0.84070867, "learning_rate": 1.855829598084659e-06, "loss": 0.86224055, "num_input_tokens_seen": 192236910, "step": 8936, "time_per_iteration": 2.6816179752349854 }, { "auxiliary_loss_clip": 0.01092497, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.04575956, "balance_loss_mlp": 1.018255, "epoch": 0.537321509093642, "flos": 40735017406080.0, "grad_norm": 1.2320449417851727, "language_loss": 0.72774732, "learning_rate": 1.8554411523322754e-06, "loss": 0.74897963, "num_input_tokens_seen": 192260790, "step": 8937, "time_per_iteration": 2.9294662475585938 }, { "auxiliary_loss_clip": 0.01097303, "auxiliary_loss_mlp": 0.0103947, "balance_loss_clip": 1.03866911, "balance_loss_mlp": 1.02411556, "epoch": 0.53738163234631, "flos": 17238487138560.0, "grad_norm": 2.4958463124017825, "language_loss": 0.82070464, "learning_rate": 1.8550527120615778e-06, "loss": 0.84207237, "num_input_tokens_seen": 192277230, "step": 8938, "time_per_iteration": 2.7016329765319824 }, { "auxiliary_loss_clip": 0.01128942, "auxiliary_loss_mlp": 0.01037787, "balance_loss_clip": 1.04445028, "balance_loss_mlp": 1.02425027, "epoch": 0.5374417555989779, "flos": 12821977457280.0, "grad_norm": 2.39037719214814, "language_loss": 0.80410939, "learning_rate": 1.8546642772872957e-06, "loss": 0.8257767, "num_input_tokens_seen": 192292840, "step": 8939, "time_per_iteration": 2.588257312774658 }, { "auxiliary_loss_clip": 0.01012372, "auxiliary_loss_mlp": 0.01007323, "balance_loss_clip": 1.01498079, "balance_loss_mlp": 1.00561845, "epoch": 0.5375018788516459, "flos": 67256018703360.0, "grad_norm": 0.706070728219951, "language_loss": 0.52408826, "learning_rate": 1.8542758480241589e-06, "loss": 0.5442853, "num_input_tokens_seen": 192358240, "step": 8940, "time_per_iteration": 3.276360273361206 }, { "auxiliary_loss_clip": 0.01083174, "auxiliary_loss_mlp": 0.01033229, "balance_loss_clip": 1.04148936, "balance_loss_mlp": 1.01995516, "epoch": 0.5375620021043138, "flos": 18114168804480.0, "grad_norm": 2.0987581231461725, "language_loss": 0.71804386, "learning_rate": 1.8538874242868965e-06, "loss": 0.73920786, "num_input_tokens_seen": 192377370, "step": 8941, "time_per_iteration": 2.732537269592285 }, { "auxiliary_loss_clip": 0.01092897, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.03881931, "balance_loss_mlp": 1.01767242, "epoch": 0.5376221253569818, "flos": 23149383275520.0, "grad_norm": 1.733585832372728, "language_loss": 0.79825974, "learning_rate": 1.853499006090237e-06, "loss": 0.81949472, "num_input_tokens_seen": 192396450, "step": 8942, "time_per_iteration": 2.723686695098877 }, { "auxiliary_loss_clip": 0.01126783, "auxiliary_loss_mlp": 0.01038334, "balance_loss_clip": 1.04432559, "balance_loss_mlp": 1.02416599, "epoch": 0.5376822486096497, "flos": 29972302663680.0, "grad_norm": 1.8527940596038397, "language_loss": 0.70161736, "learning_rate": 1.853110593448911e-06, "loss": 0.72326851, "num_input_tokens_seen": 192417390, "step": 8943, "time_per_iteration": 2.683830499649048 }, { "auxiliary_loss_clip": 0.01030181, "auxiliary_loss_mlp": 0.01002794, "balance_loss_clip": 1.01417148, "balance_loss_mlp": 1.00145841, "epoch": 0.5377423718623178, "flos": 54168950874240.0, "grad_norm": 0.8559023322108498, "language_loss": 0.5964179, "learning_rate": 1.852722186377645e-06, "loss": 0.61674768, "num_input_tokens_seen": 192478060, "step": 8944, "time_per_iteration": 3.195451498031616 }, { "auxiliary_loss_clip": 0.01075816, "auxiliary_loss_mlp": 0.01037224, "balance_loss_clip": 1.04020023, "balance_loss_mlp": 1.02198291, "epoch": 0.5378024951149857, "flos": 23257079228160.0, "grad_norm": 2.0363151234070567, "language_loss": 0.77896553, "learning_rate": 1.852333784891169e-06, "loss": 0.80009592, "num_input_tokens_seen": 192495985, "step": 8945, "time_per_iteration": 2.7992632389068604 }, { "auxiliary_loss_clip": 0.01114593, "auxiliary_loss_mlp": 0.01035525, "balance_loss_clip": 1.04309297, "balance_loss_mlp": 1.02173805, "epoch": 0.5378626183676537, "flos": 24024095274240.0, "grad_norm": 1.6722587357114949, "language_loss": 0.68561995, "learning_rate": 1.8519453890042112e-06, "loss": 0.70712113, "num_input_tokens_seen": 192515445, "step": 8946, "time_per_iteration": 2.6522717475891113 }, { "auxiliary_loss_clip": 0.01078154, "auxiliary_loss_mlp": 0.0104253, "balance_loss_clip": 1.04271758, "balance_loss_mlp": 1.02895761, "epoch": 0.5379227416203216, "flos": 27161789973120.0, "grad_norm": 1.8248631368800923, "language_loss": 0.76991701, "learning_rate": 1.851556998731498e-06, "loss": 0.79112387, "num_input_tokens_seen": 192536530, "step": 8947, "time_per_iteration": 2.796123743057251 }, { "auxiliary_loss_clip": 0.0111442, "auxiliary_loss_mlp": 0.01032597, "balance_loss_clip": 1.04487777, "balance_loss_mlp": 1.01940608, "epoch": 0.5379828648729896, "flos": 24681619687680.0, "grad_norm": 1.55307874766799, "language_loss": 0.60198331, "learning_rate": 1.8511686140877592e-06, "loss": 0.6234535, "num_input_tokens_seen": 192556075, "step": 8948, "time_per_iteration": 2.7054309844970703 }, { "auxiliary_loss_clip": 0.01082153, "auxiliary_loss_mlp": 0.01037517, "balance_loss_clip": 1.03970575, "balance_loss_mlp": 1.02415979, "epoch": 0.5380429881256577, "flos": 22523280284160.0, "grad_norm": 1.6281037537893495, "language_loss": 0.79697102, "learning_rate": 1.8507802350877205e-06, "loss": 0.81816769, "num_input_tokens_seen": 192575535, "step": 8949, "time_per_iteration": 2.8140738010406494 }, { "auxiliary_loss_clip": 0.01078335, "auxiliary_loss_mlp": 0.01042356, "balance_loss_clip": 1.03704572, "balance_loss_mlp": 1.02679944, "epoch": 0.5381031113783256, "flos": 26979543342720.0, "grad_norm": 2.0888170828860444, "language_loss": 0.77963328, "learning_rate": 1.850391861746111e-06, "loss": 0.80084026, "num_input_tokens_seen": 192594490, "step": 8950, "time_per_iteration": 2.7498505115509033 }, { "auxiliary_loss_clip": 0.01110071, "auxiliary_loss_mlp": 0.01029664, "balance_loss_clip": 1.05072141, "balance_loss_mlp": 1.01671791, "epoch": 0.5381632346309936, "flos": 24754087376640.0, "grad_norm": 1.5816580812213883, "language_loss": 0.72668755, "learning_rate": 1.8500034940776573e-06, "loss": 0.7480849, "num_input_tokens_seen": 192615650, "step": 8951, "time_per_iteration": 2.7927658557891846 }, { "auxiliary_loss_clip": 0.01122901, "auxiliary_loss_mlp": 0.00772698, "balance_loss_clip": 1.04232633, "balance_loss_mlp": 1.00031877, "epoch": 0.5382233578836615, "flos": 15560058372480.0, "grad_norm": 1.7038907930473366, "language_loss": 0.74791837, "learning_rate": 1.849615132097085e-06, "loss": 0.76687431, "num_input_tokens_seen": 192633840, "step": 8952, "time_per_iteration": 2.663555860519409 }, { "auxiliary_loss_clip": 0.01103413, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.04635072, "balance_loss_mlp": 1.02090442, "epoch": 0.5382834811363295, "flos": 25084501608960.0, "grad_norm": 1.486507819644587, "language_loss": 0.79733002, "learning_rate": 1.8492267758191228e-06, "loss": 0.81871235, "num_input_tokens_seen": 192655890, "step": 8953, "time_per_iteration": 2.7213597297668457 }, { "auxiliary_loss_clip": 0.01092412, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.04632258, "balance_loss_mlp": 1.02147865, "epoch": 0.5383436043889974, "flos": 13297901685120.0, "grad_norm": 1.8841614793520622, "language_loss": 0.80665779, "learning_rate": 1.8488384252584964e-06, "loss": 0.82794857, "num_input_tokens_seen": 192673025, "step": 8954, "time_per_iteration": 2.7119338512420654 }, { "auxiliary_loss_clip": 0.01124989, "auxiliary_loss_mlp": 0.0103348, "balance_loss_clip": 1.04552889, "balance_loss_mlp": 1.0192287, "epoch": 0.5384037276416654, "flos": 23039388852480.0, "grad_norm": 2.080642260770838, "language_loss": 0.76782274, "learning_rate": 1.8484500804299318e-06, "loss": 0.78940743, "num_input_tokens_seen": 192692190, "step": 8955, "time_per_iteration": 4.170248746871948 }, { "auxiliary_loss_clip": 0.01100368, "auxiliary_loss_mlp": 0.01043375, "balance_loss_clip": 1.04422796, "balance_loss_mlp": 1.02911186, "epoch": 0.5384638508943334, "flos": 20631147552000.0, "grad_norm": 1.64526518725267, "language_loss": 0.78446829, "learning_rate": 1.8480617413481557e-06, "loss": 0.8059057, "num_input_tokens_seen": 192710380, "step": 8956, "time_per_iteration": 4.346608638763428 }, { "auxiliary_loss_clip": 0.01014882, "auxiliary_loss_mlp": 0.01009567, "balance_loss_clip": 1.01641572, "balance_loss_mlp": 1.00802886, "epoch": 0.5385239741470014, "flos": 66737683491840.0, "grad_norm": 0.8632221777835867, "language_loss": 0.63366526, "learning_rate": 1.8476734080278932e-06, "loss": 0.6539098, "num_input_tokens_seen": 192768995, "step": 8957, "time_per_iteration": 4.689607381820679 }, { "auxiliary_loss_clip": 0.01003314, "auxiliary_loss_mlp": 0.00999601, "balance_loss_clip": 1.01686144, "balance_loss_mlp": 0.99808067, "epoch": 0.5385840973996693, "flos": 64716058229760.0, "grad_norm": 0.7163688318545376, "language_loss": 0.5155347, "learning_rate": 1.8472850804838705e-06, "loss": 0.53556383, "num_input_tokens_seen": 192825585, "step": 8958, "time_per_iteration": 3.263490676879883 }, { "auxiliary_loss_clip": 0.01118278, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.04870462, "balance_loss_mlp": 1.01945472, "epoch": 0.5386442206523373, "flos": 26141783460480.0, "grad_norm": 1.5599827476789179, "language_loss": 0.77335596, "learning_rate": 1.8468967587308128e-06, "loss": 0.79488432, "num_input_tokens_seen": 192847335, "step": 8959, "time_per_iteration": 2.6936423778533936 }, { "auxiliary_loss_clip": 0.01078149, "auxiliary_loss_mlp": 0.01035897, "balance_loss_clip": 1.04148221, "balance_loss_mlp": 1.02258778, "epoch": 0.5387043439050052, "flos": 18251849635200.0, "grad_norm": 2.554990268603387, "language_loss": 0.84077597, "learning_rate": 1.8465084427834455e-06, "loss": 0.86191648, "num_input_tokens_seen": 192862205, "step": 8960, "time_per_iteration": 4.281194686889648 }, { "auxiliary_loss_clip": 0.01114712, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.0460726, "balance_loss_mlp": 1.01955807, "epoch": 0.5387644671576732, "flos": 29788296266880.0, "grad_norm": 1.4386251393877574, "language_loss": 0.78275657, "learning_rate": 1.8461201326564933e-06, "loss": 0.80423284, "num_input_tokens_seen": 192883695, "step": 8961, "time_per_iteration": 2.7518913745880127 }, { "auxiliary_loss_clip": 0.01089107, "auxiliary_loss_mlp": 0.01035524, "balance_loss_clip": 1.041345, "balance_loss_mlp": 1.02189803, "epoch": 0.5388245904103413, "flos": 22374466237440.0, "grad_norm": 11.100507002897315, "language_loss": 0.84070158, "learning_rate": 1.845731828364681e-06, "loss": 0.86194789, "num_input_tokens_seen": 192900190, "step": 8962, "time_per_iteration": 2.745964288711548 }, { "auxiliary_loss_clip": 0.01020426, "auxiliary_loss_mlp": 0.01002497, "balance_loss_clip": 1.01872444, "balance_loss_mlp": 1.00114429, "epoch": 0.5388847136630092, "flos": 69807794751360.0, "grad_norm": 0.7287303599556714, "language_loss": 0.5418579, "learning_rate": 1.8453435299227333e-06, "loss": 0.56208712, "num_input_tokens_seen": 192958675, "step": 8963, "time_per_iteration": 3.0952982902526855 }, { "auxiliary_loss_clip": 0.01022568, "auxiliary_loss_mlp": 0.01009564, "balance_loss_clip": 1.01615238, "balance_loss_mlp": 1.00817513, "epoch": 0.5389448369156772, "flos": 69822303845760.0, "grad_norm": 1.4175775222807738, "language_loss": 0.63305563, "learning_rate": 1.8449552373453744e-06, "loss": 0.65337688, "num_input_tokens_seen": 193033135, "step": 8964, "time_per_iteration": 3.2670536041259766 }, { "auxiliary_loss_clip": 0.01065573, "auxiliary_loss_mlp": 0.01035006, "balance_loss_clip": 1.04052043, "balance_loss_mlp": 1.02049828, "epoch": 0.5390049601683451, "flos": 31722444933120.0, "grad_norm": 1.4839412969014603, "language_loss": 0.69941193, "learning_rate": 1.8445669506473287e-06, "loss": 0.72041768, "num_input_tokens_seen": 193055570, "step": 8965, "time_per_iteration": 2.8793537616729736 }, { "auxiliary_loss_clip": 0.01097921, "auxiliary_loss_mlp": 0.00772841, "balance_loss_clip": 1.04318738, "balance_loss_mlp": 1.00031877, "epoch": 0.5390650834210131, "flos": 18113486446080.0, "grad_norm": 3.9331383698311297, "language_loss": 0.82359982, "learning_rate": 1.8441786698433192e-06, "loss": 0.84230745, "num_input_tokens_seen": 193073120, "step": 8966, "time_per_iteration": 2.7008259296417236 }, { "auxiliary_loss_clip": 0.0112489, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.04688132, "balance_loss_mlp": 1.01831603, "epoch": 0.539125206673681, "flos": 17416711445760.0, "grad_norm": 1.8273360824105822, "language_loss": 0.72234643, "learning_rate": 1.8437903949480706e-06, "loss": 0.74391627, "num_input_tokens_seen": 193090105, "step": 8967, "time_per_iteration": 2.536813974380493 }, { "auxiliary_loss_clip": 0.01101272, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.04193211, "balance_loss_mlp": 1.02177858, "epoch": 0.539185329926349, "flos": 22198935450240.0, "grad_norm": 2.8461637045489394, "language_loss": 0.81760883, "learning_rate": 1.8434021259763065e-06, "loss": 0.83896482, "num_input_tokens_seen": 193109325, "step": 8968, "time_per_iteration": 2.6812336444854736 }, { "auxiliary_loss_clip": 0.01095464, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04489422, "balance_loss_mlp": 1.0244931, "epoch": 0.539245453179017, "flos": 21434397442560.0, "grad_norm": 1.479768408322399, "language_loss": 0.74093103, "learning_rate": 1.8430138629427484e-06, "loss": 0.76228386, "num_input_tokens_seen": 193130595, "step": 8969, "time_per_iteration": 2.775066614151001 }, { "auxiliary_loss_clip": 0.01089398, "auxiliary_loss_mlp": 0.00772297, "balance_loss_clip": 1.03885353, "balance_loss_mlp": 1.00019646, "epoch": 0.539305576431685, "flos": 20735000749440.0, "grad_norm": 1.789523366494458, "language_loss": 0.82301641, "learning_rate": 1.8426256058621205e-06, "loss": 0.84163332, "num_input_tokens_seen": 193148930, "step": 8970, "time_per_iteration": 2.709660053253174 }, { "auxiliary_loss_clip": 0.0109962, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.04434752, "balance_loss_mlp": 1.02398705, "epoch": 0.5393656996843529, "flos": 30920452018560.0, "grad_norm": 1.3749735874734272, "language_loss": 0.75481087, "learning_rate": 1.842237354749146e-06, "loss": 0.77617574, "num_input_tokens_seen": 193170140, "step": 8971, "time_per_iteration": 2.759859800338745 }, { "auxiliary_loss_clip": 0.01031428, "auxiliary_loss_mlp": 0.01020808, "balance_loss_clip": 1.01404476, "balance_loss_mlp": 1.01906729, "epoch": 0.5394258229370209, "flos": 50317781351040.0, "grad_norm": 0.8852076637627846, "language_loss": 0.60268009, "learning_rate": 1.8418491096185465e-06, "loss": 0.62320244, "num_input_tokens_seen": 193227235, "step": 8972, "time_per_iteration": 3.1906497478485107 }, { "auxiliary_loss_clip": 0.01113524, "auxiliary_loss_mlp": 0.01042903, "balance_loss_clip": 1.0430851, "balance_loss_mlp": 1.02806175, "epoch": 0.5394859461896888, "flos": 25411935012480.0, "grad_norm": 1.3798913966673876, "language_loss": 0.78418267, "learning_rate": 1.841460870485045e-06, "loss": 0.80574697, "num_input_tokens_seen": 193248435, "step": 8973, "time_per_iteration": 2.67616868019104 }, { "auxiliary_loss_clip": 0.01119952, "auxiliary_loss_mlp": 0.01038926, "balance_loss_clip": 1.04402721, "balance_loss_mlp": 1.0234288, "epoch": 0.5395460694423568, "flos": 25478476957440.0, "grad_norm": 1.97267381364002, "language_loss": 0.73745018, "learning_rate": 1.8410726373633623e-06, "loss": 0.75903904, "num_input_tokens_seen": 193267490, "step": 8974, "time_per_iteration": 2.6896610260009766 }, { "auxiliary_loss_clip": 0.01038786, "auxiliary_loss_mlp": 0.01002204, "balance_loss_clip": 1.01252413, "balance_loss_mlp": 1.00089288, "epoch": 0.5396061926950249, "flos": 53249493507840.0, "grad_norm": 0.7368178577125409, "language_loss": 0.51070768, "learning_rate": 1.8406844102682215e-06, "loss": 0.53111756, "num_input_tokens_seen": 193326050, "step": 8975, "time_per_iteration": 3.1316938400268555 }, { "auxiliary_loss_clip": 0.01110433, "auxiliary_loss_mlp": 0.01042663, "balance_loss_clip": 1.04242885, "balance_loss_mlp": 1.02821445, "epoch": 0.5396663159476928, "flos": 26725080418560.0, "grad_norm": 2.630341512403146, "language_loss": 0.72291577, "learning_rate": 1.840296189214344e-06, "loss": 0.74444675, "num_input_tokens_seen": 193348785, "step": 8976, "time_per_iteration": 2.722482681274414 }, { "auxiliary_loss_clip": 0.01107068, "auxiliary_loss_mlp": 0.00771891, "balance_loss_clip": 1.0392096, "balance_loss_mlp": 1.00027895, "epoch": 0.5397264392003608, "flos": 23253380127360.0, "grad_norm": 1.6269165395400453, "language_loss": 0.69827849, "learning_rate": 1.8399079742164509e-06, "loss": 0.71706808, "num_input_tokens_seen": 193367080, "step": 8977, "time_per_iteration": 2.661503553390503 }, { "auxiliary_loss_clip": 0.0105269, "auxiliary_loss_mlp": 0.01038261, "balance_loss_clip": 1.03996563, "balance_loss_mlp": 1.02390814, "epoch": 0.5397865624530287, "flos": 18294188791680.0, "grad_norm": 1.662156020825611, "language_loss": 0.7259683, "learning_rate": 1.8395197652892636e-06, "loss": 0.74687779, "num_input_tokens_seen": 193383715, "step": 8978, "time_per_iteration": 2.7381365299224854 }, { "auxiliary_loss_clip": 0.01087228, "auxiliary_loss_mlp": 0.010397, "balance_loss_clip": 1.04297757, "balance_loss_mlp": 1.02373815, "epoch": 0.5398466857056967, "flos": 15297514888320.0, "grad_norm": 1.853626793115837, "language_loss": 0.74536407, "learning_rate": 1.8391315624475028e-06, "loss": 0.76663339, "num_input_tokens_seen": 193400560, "step": 8979, "time_per_iteration": 2.694063425064087 }, { "auxiliary_loss_clip": 0.01072362, "auxiliary_loss_mlp": 0.01049968, "balance_loss_clip": 1.04104912, "balance_loss_mlp": 1.03438091, "epoch": 0.5399068089583646, "flos": 17821748183040.0, "grad_norm": 1.8942057962212562, "language_loss": 0.76699525, "learning_rate": 1.8387433657058892e-06, "loss": 0.78821856, "num_input_tokens_seen": 193418680, "step": 8980, "time_per_iteration": 2.820065498352051 }, { "auxiliary_loss_clip": 0.01123296, "auxiliary_loss_mlp": 0.01035485, "balance_loss_clip": 1.04266453, "balance_loss_mlp": 1.02159715, "epoch": 0.5399669322110326, "flos": 27381635164800.0, "grad_norm": 1.799033275645953, "language_loss": 0.82047689, "learning_rate": 1.8383551750791431e-06, "loss": 0.84206468, "num_input_tokens_seen": 193439310, "step": 8981, "time_per_iteration": 2.6362786293029785 }, { "auxiliary_loss_clip": 0.01114328, "auxiliary_loss_mlp": 0.01033767, "balance_loss_clip": 1.0414052, "balance_loss_mlp": 1.01837707, "epoch": 0.5400270554637006, "flos": 20449116403200.0, "grad_norm": 1.8414706821019682, "language_loss": 0.66744691, "learning_rate": 1.8379669905819857e-06, "loss": 0.68892789, "num_input_tokens_seen": 193458115, "step": 8982, "time_per_iteration": 2.621446371078491 }, { "auxiliary_loss_clip": 0.01087174, "auxiliary_loss_mlp": 0.00771772, "balance_loss_clip": 1.04236412, "balance_loss_mlp": 1.00037217, "epoch": 0.5400871787163686, "flos": 21689578638720.0, "grad_norm": 1.585959219226275, "language_loss": 0.82838899, "learning_rate": 1.8375788122291358e-06, "loss": 0.84697849, "num_input_tokens_seen": 193477365, "step": 8983, "time_per_iteration": 2.725118637084961 }, { "auxiliary_loss_clip": 0.0107373, "auxiliary_loss_mlp": 0.01037262, "balance_loss_clip": 1.03868723, "balance_loss_mlp": 1.0226711, "epoch": 0.5401473019690365, "flos": 19204739585280.0, "grad_norm": 1.7940455633993566, "language_loss": 0.71052921, "learning_rate": 1.8371906400353138e-06, "loss": 0.73163915, "num_input_tokens_seen": 193495595, "step": 8984, "time_per_iteration": 2.7552812099456787 }, { "auxiliary_loss_clip": 0.01129583, "auxiliary_loss_mlp": 0.01039978, "balance_loss_clip": 1.04673409, "balance_loss_mlp": 1.02464724, "epoch": 0.5402074252217045, "flos": 20627376624000.0, "grad_norm": 1.7153215255445333, "language_loss": 0.80088288, "learning_rate": 1.8368024740152386e-06, "loss": 0.82257855, "num_input_tokens_seen": 193514035, "step": 8985, "time_per_iteration": 2.6251611709594727 }, { "auxiliary_loss_clip": 0.01076326, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.03776312, "balance_loss_mlp": 1.01603341, "epoch": 0.5402675484743724, "flos": 24973465691520.0, "grad_norm": 1.6597478268739005, "language_loss": 0.79092562, "learning_rate": 1.83641431418363e-06, "loss": 0.81199366, "num_input_tokens_seen": 193535445, "step": 8986, "time_per_iteration": 2.7512738704681396 }, { "auxiliary_loss_clip": 0.01105948, "auxiliary_loss_mlp": 0.01041249, "balance_loss_clip": 1.0403738, "balance_loss_mlp": 1.02647913, "epoch": 0.5403276717270404, "flos": 19459022941440.0, "grad_norm": 1.5813568652048575, "language_loss": 0.77027225, "learning_rate": 1.8360261605552075e-06, "loss": 0.79174423, "num_input_tokens_seen": 193554780, "step": 8987, "time_per_iteration": 2.678215265274048 }, { "auxiliary_loss_clip": 0.01094562, "auxiliary_loss_mlp": 0.01035835, "balance_loss_clip": 1.04025865, "balance_loss_mlp": 1.021613, "epoch": 0.5403877949797083, "flos": 18442140912000.0, "grad_norm": 3.169719409567684, "language_loss": 0.71186262, "learning_rate": 1.8356380131446887e-06, "loss": 0.73316658, "num_input_tokens_seen": 193573580, "step": 8988, "time_per_iteration": 2.779327869415283 }, { "auxiliary_loss_clip": 0.01073421, "auxiliary_loss_mlp": 0.01040131, "balance_loss_clip": 1.03765535, "balance_loss_mlp": 1.02508116, "epoch": 0.5404479182323764, "flos": 28292868316800.0, "grad_norm": 2.25930737507901, "language_loss": 0.67611122, "learning_rate": 1.8352498719667934e-06, "loss": 0.69724679, "num_input_tokens_seen": 193590490, "step": 8989, "time_per_iteration": 2.7891674041748047 }, { "auxiliary_loss_clip": 0.01111206, "auxiliary_loss_mlp": 0.01041114, "balance_loss_clip": 1.04164839, "balance_loss_mlp": 1.02667785, "epoch": 0.5405080414850444, "flos": 23367325046400.0, "grad_norm": 1.5585472280182338, "language_loss": 0.77394271, "learning_rate": 1.8348617370362399e-06, "loss": 0.79546589, "num_input_tokens_seen": 193609900, "step": 8990, "time_per_iteration": 2.6976635456085205 }, { "auxiliary_loss_clip": 0.01106061, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.03980994, "balance_loss_mlp": 1.01427758, "epoch": 0.5405681647377123, "flos": 21106425335040.0, "grad_norm": 1.9802166321118257, "language_loss": 0.69258702, "learning_rate": 1.834473608367745e-06, "loss": 0.71391636, "num_input_tokens_seen": 193629775, "step": 8991, "time_per_iteration": 2.6734046936035156 }, { "auxiliary_loss_clip": 0.01061373, "auxiliary_loss_mlp": 0.01034138, "balance_loss_clip": 1.03470838, "balance_loss_mlp": 1.01864719, "epoch": 0.5406282879903803, "flos": 20449188230400.0, "grad_norm": 1.8615919781627641, "language_loss": 0.75722122, "learning_rate": 1.8340854859760277e-06, "loss": 0.77817637, "num_input_tokens_seen": 193648070, "step": 8992, "time_per_iteration": 2.7986576557159424 }, { "auxiliary_loss_clip": 0.01094937, "auxiliary_loss_mlp": 0.01042345, "balance_loss_clip": 1.03807545, "balance_loss_mlp": 1.02672255, "epoch": 0.5406884112430482, "flos": 14209493973120.0, "grad_norm": 2.5485108966117704, "language_loss": 0.76453286, "learning_rate": 1.8336973698758056e-06, "loss": 0.78590572, "num_input_tokens_seen": 193665060, "step": 8993, "time_per_iteration": 2.7208335399627686 }, { "auxiliary_loss_clip": 0.01106981, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.03966081, "balance_loss_mlp": 1.01783895, "epoch": 0.5407485344957162, "flos": 23875568536320.0, "grad_norm": 1.7082267966393392, "language_loss": 0.70645487, "learning_rate": 1.8333092600817959e-06, "loss": 0.72783911, "num_input_tokens_seen": 193683620, "step": 8994, "time_per_iteration": 2.724794626235962 }, { "auxiliary_loss_clip": 0.01107598, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.03957391, "balance_loss_mlp": 1.01583362, "epoch": 0.5408086577483842, "flos": 23148485435520.0, "grad_norm": 3.058822592256831, "language_loss": 0.75407541, "learning_rate": 1.8329211566087157e-06, "loss": 0.77546465, "num_input_tokens_seen": 193702990, "step": 8995, "time_per_iteration": 5.971833229064941 }, { "auxiliary_loss_clip": 0.0110732, "auxiliary_loss_mlp": 0.01036119, "balance_loss_clip": 1.04115582, "balance_loss_mlp": 1.02335215, "epoch": 0.5408687810010522, "flos": 18771046773120.0, "grad_norm": 1.7630879917097735, "language_loss": 0.73701608, "learning_rate": 1.832533059471282e-06, "loss": 0.75845045, "num_input_tokens_seen": 193721785, "step": 8996, "time_per_iteration": 4.209546327590942 }, { "auxiliary_loss_clip": 0.0107249, "auxiliary_loss_mlp": 0.0103344, "balance_loss_clip": 1.03679025, "balance_loss_mlp": 1.02018428, "epoch": 0.5409289042537201, "flos": 13881557779200.0, "grad_norm": 2.7958611639566557, "language_loss": 0.73200142, "learning_rate": 1.8321449686842115e-06, "loss": 0.75306082, "num_input_tokens_seen": 193740315, "step": 8997, "time_per_iteration": 2.6815428733825684 }, { "auxiliary_loss_clip": 0.0112099, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.04214144, "balance_loss_mlp": 1.02241874, "epoch": 0.5409890275063881, "flos": 14465357527680.0, "grad_norm": 2.1382567541010706, "language_loss": 0.71990108, "learning_rate": 1.8317568842622207e-06, "loss": 0.74147719, "num_input_tokens_seen": 193757580, "step": 8998, "time_per_iteration": 2.516322374343872 }, { "auxiliary_loss_clip": 0.01084198, "auxiliary_loss_mlp": 0.01038336, "balance_loss_clip": 1.03824925, "balance_loss_mlp": 1.02481771, "epoch": 0.541049150759056, "flos": 48977449349760.0, "grad_norm": 1.4737906597538892, "language_loss": 0.7077291, "learning_rate": 1.8313688062200256e-06, "loss": 0.72895443, "num_input_tokens_seen": 193780965, "step": 8999, "time_per_iteration": 4.582181215286255 }, { "auxiliary_loss_clip": 0.01092675, "auxiliary_loss_mlp": 0.01037736, "balance_loss_clip": 1.04016924, "balance_loss_mlp": 1.02372253, "epoch": 0.541109274011724, "flos": 18147601388160.0, "grad_norm": 2.7892757576067972, "language_loss": 0.80210066, "learning_rate": 1.8309807345723422e-06, "loss": 0.82340479, "num_input_tokens_seen": 193797855, "step": 9000, "time_per_iteration": 2.6335151195526123 }, { "auxiliary_loss_clip": 0.01069713, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.03577805, "balance_loss_mlp": 1.01837265, "epoch": 0.541169397264392, "flos": 20522553759360.0, "grad_norm": 1.6231589706551275, "language_loss": 0.73037231, "learning_rate": 1.8305926693338863e-06, "loss": 0.75140095, "num_input_tokens_seen": 193817375, "step": 9001, "time_per_iteration": 2.854574680328369 }, { "auxiliary_loss_clip": 0.01088976, "auxiliary_loss_mlp": 0.01037285, "balance_loss_clip": 1.03875196, "balance_loss_mlp": 1.0225749, "epoch": 0.54122952051706, "flos": 20044043752320.0, "grad_norm": 2.3946252475459704, "language_loss": 0.85775471, "learning_rate": 1.8302046105193734e-06, "loss": 0.87901723, "num_input_tokens_seen": 193832205, "step": 9002, "time_per_iteration": 2.83799409866333 }, { "auxiliary_loss_clip": 0.01071827, "auxiliary_loss_mlp": 0.01036651, "balance_loss_clip": 1.03876507, "balance_loss_mlp": 1.0244441, "epoch": 0.541289643769728, "flos": 19062246332160.0, "grad_norm": 1.9022782971983632, "language_loss": 0.78010678, "learning_rate": 1.8298165581435183e-06, "loss": 0.80119157, "num_input_tokens_seen": 193849830, "step": 9003, "time_per_iteration": 2.8771512508392334 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.03998888, "balance_loss_mlp": 1.01659191, "epoch": 0.5413497670223959, "flos": 22382295402240.0, "grad_norm": 2.4815464780266905, "language_loss": 0.69489288, "learning_rate": 1.8294285122210372e-06, "loss": 0.71626127, "num_input_tokens_seen": 193869945, "step": 9004, "time_per_iteration": 2.7296600341796875 }, { "auxiliary_loss_clip": 0.01027886, "auxiliary_loss_mlp": 0.01000864, "balance_loss_clip": 1.01221299, "balance_loss_mlp": 0.99943334, "epoch": 0.5414098902750639, "flos": 70031734093440.0, "grad_norm": 0.9691738453098017, "language_loss": 0.59067202, "learning_rate": 1.8290404727666434e-06, "loss": 0.61095953, "num_input_tokens_seen": 193930860, "step": 9005, "time_per_iteration": 3.2482104301452637 }, { "auxiliary_loss_clip": 0.011229, "auxiliary_loss_mlp": 0.00771475, "balance_loss_clip": 1.04402518, "balance_loss_mlp": 1.00026715, "epoch": 0.5414700135277318, "flos": 21798962530560.0, "grad_norm": 3.1081571461352357, "language_loss": 0.78251934, "learning_rate": 1.8286524397950517e-06, "loss": 0.80146307, "num_input_tokens_seen": 193949075, "step": 9006, "time_per_iteration": 2.646697521209717 }, { "auxiliary_loss_clip": 0.01099607, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.04162097, "balance_loss_mlp": 1.02380466, "epoch": 0.5415301367803999, "flos": 16907929251840.0, "grad_norm": 2.04905315291525, "language_loss": 0.82968152, "learning_rate": 1.8282644133209777e-06, "loss": 0.85103542, "num_input_tokens_seen": 193967630, "step": 9007, "time_per_iteration": 2.6906566619873047 }, { "auxiliary_loss_clip": 0.01105367, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.04186976, "balance_loss_mlp": 1.02084875, "epoch": 0.5415902600330678, "flos": 25704176065920.0, "grad_norm": 2.002533361325265, "language_loss": 0.67188275, "learning_rate": 1.8278763933591334e-06, "loss": 0.69328809, "num_input_tokens_seen": 193988730, "step": 9008, "time_per_iteration": 2.6538190841674805 }, { "auxiliary_loss_clip": 0.01126211, "auxiliary_loss_mlp": 0.01033213, "balance_loss_clip": 1.04396832, "balance_loss_mlp": 1.01836514, "epoch": 0.5416503832857358, "flos": 19208151377280.0, "grad_norm": 1.9615897276879948, "language_loss": 0.73713046, "learning_rate": 1.827488379924234e-06, "loss": 0.75872469, "num_input_tokens_seen": 194005160, "step": 9009, "time_per_iteration": 2.5716910362243652 }, { "auxiliary_loss_clip": 0.01072637, "auxiliary_loss_mlp": 0.01036076, "balance_loss_clip": 1.04184818, "balance_loss_mlp": 1.02171135, "epoch": 0.5417105065384037, "flos": 12713706887040.0, "grad_norm": 2.1963503735452417, "language_loss": 0.87984347, "learning_rate": 1.8271003730309923e-06, "loss": 0.90093064, "num_input_tokens_seen": 194021700, "step": 9010, "time_per_iteration": 2.725271701812744 }, { "auxiliary_loss_clip": 0.01120446, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.04260874, "balance_loss_mlp": 1.02448332, "epoch": 0.5417706297910717, "flos": 30335933998080.0, "grad_norm": 1.8667479755469423, "language_loss": 0.65187848, "learning_rate": 1.826712372694122e-06, "loss": 0.67345679, "num_input_tokens_seen": 194042620, "step": 9011, "time_per_iteration": 2.6546692848205566 }, { "auxiliary_loss_clip": 0.01111756, "auxiliary_loss_mlp": 0.01036661, "balance_loss_clip": 1.04458547, "balance_loss_mlp": 1.02324426, "epoch": 0.5418307530437396, "flos": 29020992912000.0, "grad_norm": 2.8570982701345797, "language_loss": 0.79252279, "learning_rate": 1.8263243789283362e-06, "loss": 0.81400692, "num_input_tokens_seen": 194061800, "step": 9012, "time_per_iteration": 2.6907572746276855 }, { "auxiliary_loss_clip": 0.01119813, "auxiliary_loss_mlp": 0.01033195, "balance_loss_clip": 1.04184949, "balance_loss_mlp": 1.01965845, "epoch": 0.5418908762964076, "flos": 16873455173760.0, "grad_norm": 2.191987247231765, "language_loss": 0.74450612, "learning_rate": 1.8259363917483466e-06, "loss": 0.76603615, "num_input_tokens_seen": 194079890, "step": 9013, "time_per_iteration": 2.6294262409210205 }, { "auxiliary_loss_clip": 0.01085863, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.04200959, "balance_loss_mlp": 1.01776361, "epoch": 0.5419509995490756, "flos": 18949702043520.0, "grad_norm": 2.094538198423721, "language_loss": 0.72306025, "learning_rate": 1.8255484111688667e-06, "loss": 0.74422872, "num_input_tokens_seen": 194097625, "step": 9014, "time_per_iteration": 2.653125524520874 }, { "auxiliary_loss_clip": 0.01099897, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.04301429, "balance_loss_mlp": 1.01888418, "epoch": 0.5420111228017436, "flos": 18077719478400.0, "grad_norm": 1.5497382301526352, "language_loss": 0.807073, "learning_rate": 1.8251604372046085e-06, "loss": 0.82839555, "num_input_tokens_seen": 194116055, "step": 9015, "time_per_iteration": 2.6197831630706787 }, { "auxiliary_loss_clip": 0.01117394, "auxiliary_loss_mlp": 0.01039918, "balance_loss_clip": 1.04648256, "balance_loss_mlp": 1.02635145, "epoch": 0.5420712460544116, "flos": 19061779455360.0, "grad_norm": 2.4637362060141053, "language_loss": 0.81252277, "learning_rate": 1.8247724698702843e-06, "loss": 0.83409584, "num_input_tokens_seen": 194130365, "step": 9016, "time_per_iteration": 2.617722988128662 }, { "auxiliary_loss_clip": 0.01121755, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.04375124, "balance_loss_mlp": 1.01745152, "epoch": 0.5421313693070795, "flos": 18187103370240.0, "grad_norm": 1.6999373176246328, "language_loss": 0.81182349, "learning_rate": 1.8243845091806053e-06, "loss": 0.83334422, "num_input_tokens_seen": 194148975, "step": 9017, "time_per_iteration": 2.629488706588745 }, { "auxiliary_loss_clip": 0.01119384, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.04308951, "balance_loss_mlp": 1.02270925, "epoch": 0.5421914925597475, "flos": 13005947940480.0, "grad_norm": 1.767329743248484, "language_loss": 0.77847707, "learning_rate": 1.8239965551502837e-06, "loss": 0.80002874, "num_input_tokens_seen": 194167185, "step": 9018, "time_per_iteration": 2.595520257949829 }, { "auxiliary_loss_clip": 0.01121333, "auxiliary_loss_mlp": 0.010389, "balance_loss_clip": 1.04014397, "balance_loss_mlp": 1.02462447, "epoch": 0.5422516158124154, "flos": 46758457831680.0, "grad_norm": 1.6302803515957, "language_loss": 0.66417134, "learning_rate": 1.8236086077940303e-06, "loss": 0.68577361, "num_input_tokens_seen": 194192840, "step": 9019, "time_per_iteration": 2.8572912216186523 }, { "auxiliary_loss_clip": 0.01101197, "auxiliary_loss_mlp": 0.01036576, "balance_loss_clip": 1.03910589, "balance_loss_mlp": 1.02315295, "epoch": 0.5423117390650835, "flos": 31758642864000.0, "grad_norm": 1.5350920710342792, "language_loss": 0.69515598, "learning_rate": 1.8232206671265555e-06, "loss": 0.71653378, "num_input_tokens_seen": 194213150, "step": 9020, "time_per_iteration": 2.710081100463867 }, { "auxiliary_loss_clip": 0.01082322, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.03962088, "balance_loss_mlp": 1.02462053, "epoch": 0.5423718623177514, "flos": 27201974313600.0, "grad_norm": 1.5706670053852172, "language_loss": 0.80494618, "learning_rate": 1.8228327331625717e-06, "loss": 0.82614136, "num_input_tokens_seen": 194234665, "step": 9021, "time_per_iteration": 2.760133743286133 }, { "auxiliary_loss_clip": 0.01069543, "auxiliary_loss_mlp": 0.01037659, "balance_loss_clip": 1.03820395, "balance_loss_mlp": 1.02405667, "epoch": 0.5424319855704194, "flos": 23546447193600.0, "grad_norm": 2.2946341773433496, "language_loss": 0.78887641, "learning_rate": 1.822444805916788e-06, "loss": 0.80994844, "num_input_tokens_seen": 194253790, "step": 9022, "time_per_iteration": 2.8245437145233154 }, { "auxiliary_loss_clip": 0.01085662, "auxiliary_loss_mlp": 0.00771451, "balance_loss_clip": 1.03742123, "balance_loss_mlp": 1.00025558, "epoch": 0.5424921088230873, "flos": 26615624699520.0, "grad_norm": 1.6811700220554942, "language_loss": 0.8234387, "learning_rate": 1.822056885403915e-06, "loss": 0.84200984, "num_input_tokens_seen": 194274950, "step": 9023, "time_per_iteration": 2.722637891769409 }, { "auxiliary_loss_clip": 0.01105066, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.04266286, "balance_loss_mlp": 1.01798785, "epoch": 0.5425522320757553, "flos": 23586811102080.0, "grad_norm": 1.7285453701222258, "language_loss": 0.71582222, "learning_rate": 1.8216689716386627e-06, "loss": 0.73718333, "num_input_tokens_seen": 194296155, "step": 9024, "time_per_iteration": 2.6643166542053223 }, { "auxiliary_loss_clip": 0.01109023, "auxiliary_loss_mlp": 0.01034832, "balance_loss_clip": 1.03978658, "balance_loss_mlp": 1.02231479, "epoch": 0.5426123553284232, "flos": 30592264429440.0, "grad_norm": 1.7605396052132907, "language_loss": 0.65074313, "learning_rate": 1.8212810646357405e-06, "loss": 0.67218173, "num_input_tokens_seen": 194318025, "step": 9025, "time_per_iteration": 2.6963577270507812 }, { "auxiliary_loss_clip": 0.0109579, "auxiliary_loss_mlp": 0.00769932, "balance_loss_clip": 1.04664063, "balance_loss_mlp": 1.00038803, "epoch": 0.5426724785810912, "flos": 12495118671360.0, "grad_norm": 2.055737651503127, "language_loss": 0.73914909, "learning_rate": 1.8208931644098591e-06, "loss": 0.7578063, "num_input_tokens_seen": 194336150, "step": 9026, "time_per_iteration": 2.6317172050476074 }, { "auxiliary_loss_clip": 0.01095155, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.03804421, "balance_loss_mlp": 1.02545154, "epoch": 0.5427326018337592, "flos": 26064611089920.0, "grad_norm": 2.1949475938623224, "language_loss": 0.7840718, "learning_rate": 1.8205052709757265e-06, "loss": 0.80542773, "num_input_tokens_seen": 194355980, "step": 9027, "time_per_iteration": 2.6076927185058594 }, { "auxiliary_loss_clip": 0.01004652, "auxiliary_loss_mlp": 0.01011362, "balance_loss_clip": 1.00918782, "balance_loss_mlp": 1.00950241, "epoch": 0.5427927250864272, "flos": 65984745576960.0, "grad_norm": 0.759944437260396, "language_loss": 0.56566465, "learning_rate": 1.8201173843480515e-06, "loss": 0.58582479, "num_input_tokens_seen": 194422660, "step": 9028, "time_per_iteration": 3.173718214035034 }, { "auxiliary_loss_clip": 0.01078653, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.0437665, "balance_loss_mlp": 1.01519442, "epoch": 0.5428528483390952, "flos": 19975382904960.0, "grad_norm": 2.1789279213341857, "language_loss": 0.7763471, "learning_rate": 1.8197295045415442e-06, "loss": 0.79742968, "num_input_tokens_seen": 194438545, "step": 9029, "time_per_iteration": 2.6010968685150146 }, { "auxiliary_loss_clip": 0.01080602, "auxiliary_loss_mlp": 0.01029952, "balance_loss_clip": 1.0426538, "balance_loss_mlp": 1.01611137, "epoch": 0.5429129715917631, "flos": 21832323287040.0, "grad_norm": 1.5227150839007966, "language_loss": 0.8289423, "learning_rate": 1.8193416315709112e-06, "loss": 0.85004783, "num_input_tokens_seen": 194458060, "step": 9030, "time_per_iteration": 2.673872232437134 }, { "auxiliary_loss_clip": 0.01119103, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 1.04308653, "balance_loss_mlp": 1.01801896, "epoch": 0.5429730948444311, "flos": 27782685492480.0, "grad_norm": 1.5242093045096456, "language_loss": 0.74554878, "learning_rate": 1.8189537654508623e-06, "loss": 0.76705134, "num_input_tokens_seen": 194477405, "step": 9031, "time_per_iteration": 2.6361796855926514 }, { "auxiliary_loss_clip": 0.01099875, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.03957534, "balance_loss_mlp": 1.02664721, "epoch": 0.543033218097099, "flos": 26760452336640.0, "grad_norm": 1.8557133497087115, "language_loss": 0.85526693, "learning_rate": 1.8185659061961045e-06, "loss": 0.87666059, "num_input_tokens_seen": 194497085, "step": 9032, "time_per_iteration": 2.633051872253418 }, { "auxiliary_loss_clip": 0.01101785, "auxiliary_loss_mlp": 0.01037126, "balance_loss_clip": 1.04154074, "balance_loss_mlp": 1.02405477, "epoch": 0.5430933413497671, "flos": 22675254727680.0, "grad_norm": 1.789713495487195, "language_loss": 0.74318242, "learning_rate": 1.8181780538213457e-06, "loss": 0.76457155, "num_input_tokens_seen": 194516785, "step": 9033, "time_per_iteration": 2.654573917388916 }, { "auxiliary_loss_clip": 0.01080113, "auxiliary_loss_mlp": 0.01040958, "balance_loss_clip": 1.03826129, "balance_loss_mlp": 1.0267365, "epoch": 0.543153464602435, "flos": 24607499973120.0, "grad_norm": 1.5302152204895145, "language_loss": 0.75507742, "learning_rate": 1.8177902083412935e-06, "loss": 0.77628815, "num_input_tokens_seen": 194536475, "step": 9034, "time_per_iteration": 6.07684326171875 }, { "auxiliary_loss_clip": 0.01080457, "auxiliary_loss_mlp": 0.01035889, "balance_loss_clip": 1.04235947, "balance_loss_mlp": 1.02360463, "epoch": 0.543213587855103, "flos": 19025725178880.0, "grad_norm": 1.697596865274133, "language_loss": 0.84559906, "learning_rate": 1.817402369770655e-06, "loss": 0.86676252, "num_input_tokens_seen": 194554495, "step": 9035, "time_per_iteration": 4.246930122375488 }, { "auxiliary_loss_clip": 0.01010369, "auxiliary_loss_mlp": 0.01004655, "balance_loss_clip": 1.01446867, "balance_loss_mlp": 1.00328398, "epoch": 0.5432737111077709, "flos": 65686435125120.0, "grad_norm": 0.7105133860132232, "language_loss": 0.55900681, "learning_rate": 1.8170145381241364e-06, "loss": 0.57915699, "num_input_tokens_seen": 194617620, "step": 9036, "time_per_iteration": 3.214927911758423 }, { "auxiliary_loss_clip": 0.0106374, "auxiliary_loss_mlp": 0.01035958, "balance_loss_clip": 1.04064369, "balance_loss_mlp": 1.02285123, "epoch": 0.5433338343604389, "flos": 22091670460800.0, "grad_norm": 1.4967561616212492, "language_loss": 0.75198317, "learning_rate": 1.8166267134164451e-06, "loss": 0.77298009, "num_input_tokens_seen": 194637690, "step": 9037, "time_per_iteration": 2.815127372741699 }, { "auxiliary_loss_clip": 0.01089499, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.039361, "balance_loss_mlp": 1.02274799, "epoch": 0.5433939576131068, "flos": 34672649616000.0, "grad_norm": 1.6562121389813547, "language_loss": 0.66519392, "learning_rate": 1.8162388956622875e-06, "loss": 0.68645203, "num_input_tokens_seen": 194659520, "step": 9038, "time_per_iteration": 2.788142681121826 }, { "auxiliary_loss_clip": 0.01105433, "auxiliary_loss_mlp": 0.01036988, "balance_loss_clip": 1.03904057, "balance_loss_mlp": 1.02456677, "epoch": 0.5434540808657748, "flos": 20303355012480.0, "grad_norm": 1.9500381910938636, "language_loss": 0.7809025, "learning_rate": 1.8158510848763692e-06, "loss": 0.80232668, "num_input_tokens_seen": 194677645, "step": 9039, "time_per_iteration": 4.200030326843262 }, { "auxiliary_loss_clip": 0.01076379, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.03707099, "balance_loss_mlp": 1.02523017, "epoch": 0.5435142041184428, "flos": 23112790295040.0, "grad_norm": 1.9066978344822971, "language_loss": 0.76675421, "learning_rate": 1.8154632810733962e-06, "loss": 0.7879017, "num_input_tokens_seen": 194697400, "step": 9040, "time_per_iteration": 2.752359628677368 }, { "auxiliary_loss_clip": 0.01021921, "auxiliary_loss_mlp": 0.0101024, "balance_loss_clip": 1.01599014, "balance_loss_mlp": 1.00891709, "epoch": 0.5435743273711108, "flos": 64012746954240.0, "grad_norm": 0.6657326543890927, "language_loss": 0.52456856, "learning_rate": 1.815075484268074e-06, "loss": 0.54489017, "num_input_tokens_seen": 194761205, "step": 9041, "time_per_iteration": 3.19743275642395 }, { "auxiliary_loss_clip": 0.01092893, "auxiliary_loss_mlp": 0.01043232, "balance_loss_clip": 1.04014623, "balance_loss_mlp": 1.0300709, "epoch": 0.5436344506237788, "flos": 25118903859840.0, "grad_norm": 1.6935261425615555, "language_loss": 0.76397556, "learning_rate": 1.8146876944751078e-06, "loss": 0.78533685, "num_input_tokens_seen": 194782445, "step": 9042, "time_per_iteration": 2.7176172733306885 }, { "auxiliary_loss_clip": 0.01082719, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.04040313, "balance_loss_mlp": 1.01886773, "epoch": 0.5436945738764467, "flos": 19572967860480.0, "grad_norm": 1.7014237411229687, "language_loss": 0.67346215, "learning_rate": 1.8142999117092033e-06, "loss": 0.69459915, "num_input_tokens_seen": 194800325, "step": 9043, "time_per_iteration": 2.7166213989257812 }, { "auxiliary_loss_clip": 0.0107861, "auxiliary_loss_mlp": 0.01032764, "balance_loss_clip": 1.03779316, "balance_loss_mlp": 1.01971054, "epoch": 0.5437546971291147, "flos": 21142515525120.0, "grad_norm": 1.5921714365650326, "language_loss": 0.84577447, "learning_rate": 1.8139121359850644e-06, "loss": 0.86688828, "num_input_tokens_seen": 194818675, "step": 9044, "time_per_iteration": 2.758593797683716 }, { "auxiliary_loss_clip": 0.01123207, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.04196227, "balance_loss_mlp": 1.01723039, "epoch": 0.5438148203817826, "flos": 25118688378240.0, "grad_norm": 1.5431059852471993, "language_loss": 0.62074721, "learning_rate": 1.8135243673173956e-06, "loss": 0.64228952, "num_input_tokens_seen": 194836595, "step": 9045, "time_per_iteration": 2.6207923889160156 }, { "auxiliary_loss_clip": 0.0112166, "auxiliary_loss_mlp": 0.01035257, "balance_loss_clip": 1.04318917, "balance_loss_mlp": 1.02179205, "epoch": 0.5438749436344507, "flos": 23002939526400.0, "grad_norm": 1.4293832885602564, "language_loss": 0.70140386, "learning_rate": 1.8131366057209023e-06, "loss": 0.72297299, "num_input_tokens_seen": 194857520, "step": 9046, "time_per_iteration": 2.6262285709381104 }, { "auxiliary_loss_clip": 0.01117279, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.04171467, "balance_loss_mlp": 1.01709127, "epoch": 0.5439350668871186, "flos": 15487016065920.0, "grad_norm": 1.95554521575616, "language_loss": 0.7724129, "learning_rate": 1.8127488512102868e-06, "loss": 0.79387808, "num_input_tokens_seen": 194876020, "step": 9047, "time_per_iteration": 2.592041492462158 }, { "auxiliary_loss_clip": 0.01094716, "auxiliary_loss_mlp": 0.01047772, "balance_loss_clip": 1.04039311, "balance_loss_mlp": 1.03321636, "epoch": 0.5439951901397866, "flos": 17238415311360.0, "grad_norm": 1.5854248061222735, "language_loss": 0.7262761, "learning_rate": 1.8123611038002547e-06, "loss": 0.74770093, "num_input_tokens_seen": 194894650, "step": 9048, "time_per_iteration": 2.667393684387207 }, { "auxiliary_loss_clip": 0.01069346, "auxiliary_loss_mlp": 0.01045305, "balance_loss_clip": 1.03664947, "balance_loss_mlp": 1.0298202, "epoch": 0.5440553133924545, "flos": 18661016436480.0, "grad_norm": 1.9805900660696516, "language_loss": 0.93650311, "learning_rate": 1.8119733635055076e-06, "loss": 0.95764971, "num_input_tokens_seen": 194911935, "step": 9049, "time_per_iteration": 2.7119088172912598 }, { "auxiliary_loss_clip": 0.0110651, "auxiliary_loss_mlp": 0.01032835, "balance_loss_clip": 1.03992295, "balance_loss_mlp": 1.02054429, "epoch": 0.5441154366451225, "flos": 27122934435840.0, "grad_norm": 1.7800719649484351, "language_loss": 0.73936987, "learning_rate": 1.8115856303407492e-06, "loss": 0.76076329, "num_input_tokens_seen": 194931620, "step": 9050, "time_per_iteration": 2.631661891937256 }, { "auxiliary_loss_clip": 0.01111441, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.0437777, "balance_loss_mlp": 1.01755428, "epoch": 0.5441755598977904, "flos": 25993867253760.0, "grad_norm": 1.737903905046117, "language_loss": 0.66990525, "learning_rate": 1.8111979043206832e-06, "loss": 0.69132841, "num_input_tokens_seen": 194952560, "step": 9051, "time_per_iteration": 2.648484230041504 }, { "auxiliary_loss_clip": 0.01080337, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.03722811, "balance_loss_mlp": 1.02039015, "epoch": 0.5442356831504584, "flos": 32380041173760.0, "grad_norm": 2.245844605247971, "language_loss": 0.67334735, "learning_rate": 1.810810185460011e-06, "loss": 0.69448429, "num_input_tokens_seen": 194973915, "step": 9052, "time_per_iteration": 2.778211832046509 }, { "auxiliary_loss_clip": 0.01121064, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.04266417, "balance_loss_mlp": 1.02010286, "epoch": 0.5442958064031264, "flos": 24164290056960.0, "grad_norm": 1.8200748140762566, "language_loss": 0.92835879, "learning_rate": 1.810422473773436e-06, "loss": 0.9499042, "num_input_tokens_seen": 194990170, "step": 9053, "time_per_iteration": 2.6110095977783203 }, { "auxiliary_loss_clip": 0.01093907, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.04024363, "balance_loss_mlp": 1.02203834, "epoch": 0.5443559296557944, "flos": 18764690065920.0, "grad_norm": 2.3950140888374687, "language_loss": 0.83948398, "learning_rate": 1.8100347692756595e-06, "loss": 0.86077261, "num_input_tokens_seen": 195006395, "step": 9054, "time_per_iteration": 2.6261367797851562 }, { "auxiliary_loss_clip": 0.01090647, "auxiliary_loss_mlp": 0.01034581, "balance_loss_clip": 1.03965771, "balance_loss_mlp": 1.02094352, "epoch": 0.5444160529084624, "flos": 22632556435200.0, "grad_norm": 2.6065175825707327, "language_loss": 0.68213475, "learning_rate": 1.8096470719813836e-06, "loss": 0.70338708, "num_input_tokens_seen": 195025080, "step": 9055, "time_per_iteration": 2.623518705368042 }, { "auxiliary_loss_clip": 0.01000083, "auxiliary_loss_mlp": 0.00999074, "balance_loss_clip": 1.01110244, "balance_loss_mlp": 0.99770337, "epoch": 0.5444761761611303, "flos": 69671909600640.0, "grad_norm": 0.7426728731430834, "language_loss": 0.57650024, "learning_rate": 1.80925938190531e-06, "loss": 0.59649181, "num_input_tokens_seen": 195085725, "step": 9056, "time_per_iteration": 3.2228453159332275 }, { "auxiliary_loss_clip": 0.01087409, "auxiliary_loss_mlp": 0.01036027, "balance_loss_clip": 1.04208684, "balance_loss_mlp": 1.02234185, "epoch": 0.5445362994137983, "flos": 14278442129280.0, "grad_norm": 1.75653480561415, "language_loss": 0.69749284, "learning_rate": 1.8088716990621395e-06, "loss": 0.71872711, "num_input_tokens_seen": 195102585, "step": 9057, "time_per_iteration": 2.7110843658447266 }, { "auxiliary_loss_clip": 0.01106044, "auxiliary_loss_mlp": 0.01038419, "balance_loss_clip": 1.04014075, "balance_loss_mlp": 1.02472818, "epoch": 0.5445964226664662, "flos": 28986195611520.0, "grad_norm": 2.0738816921888366, "language_loss": 0.75373238, "learning_rate": 1.8084840234665738e-06, "loss": 0.775177, "num_input_tokens_seen": 195120055, "step": 9058, "time_per_iteration": 2.7001023292541504 }, { "auxiliary_loss_clip": 0.01003793, "auxiliary_loss_mlp": 0.01003874, "balance_loss_clip": 1.01181531, "balance_loss_mlp": 1.00230026, "epoch": 0.5446565459191343, "flos": 68620230270720.0, "grad_norm": 0.7925901763726337, "language_loss": 0.6261481, "learning_rate": 1.808096355133312e-06, "loss": 0.6462248, "num_input_tokens_seen": 195181045, "step": 9059, "time_per_iteration": 3.355748414993286 }, { "auxiliary_loss_clip": 0.01107073, "auxiliary_loss_mlp": 0.0103287, "balance_loss_clip": 1.0414511, "balance_loss_mlp": 1.01922059, "epoch": 0.5447166691718022, "flos": 16216469464320.0, "grad_norm": 1.790354282478879, "language_loss": 0.79365647, "learning_rate": 1.8077086940770572e-06, "loss": 0.81505585, "num_input_tokens_seen": 195198840, "step": 9060, "time_per_iteration": 2.6523141860961914 }, { "auxiliary_loss_clip": 0.01111799, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.04219317, "balance_loss_mlp": 1.01976824, "epoch": 0.5447767924244702, "flos": 25849039616640.0, "grad_norm": 1.7487339019361072, "language_loss": 0.8006283, "learning_rate": 1.8073210403125072e-06, "loss": 0.82207763, "num_input_tokens_seen": 195218720, "step": 9061, "time_per_iteration": 2.660477876663208 }, { "auxiliary_loss_clip": 0.01107514, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.04152489, "balance_loss_mlp": 1.01595628, "epoch": 0.5448369156771381, "flos": 19677718897920.0, "grad_norm": 1.667542325640746, "language_loss": 0.8699556, "learning_rate": 1.8069333938543627e-06, "loss": 0.89131653, "num_input_tokens_seen": 195235770, "step": 9062, "time_per_iteration": 2.6527698040008545 }, { "auxiliary_loss_clip": 0.0109274, "auxiliary_loss_mlp": 0.01037371, "balance_loss_clip": 1.03916395, "balance_loss_mlp": 1.02188551, "epoch": 0.5448970389298061, "flos": 19281804215040.0, "grad_norm": 1.6766222611874342, "language_loss": 0.82069784, "learning_rate": 1.8065457547173233e-06, "loss": 0.84199893, "num_input_tokens_seen": 195254870, "step": 9063, "time_per_iteration": 2.651977062225342 }, { "auxiliary_loss_clip": 0.01118028, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.0406127, "balance_loss_mlp": 1.01958823, "epoch": 0.544957162182474, "flos": 20991690316800.0, "grad_norm": 1.769153488212037, "language_loss": 0.63484013, "learning_rate": 1.8061581229160878e-06, "loss": 0.65634954, "num_input_tokens_seen": 195273390, "step": 9064, "time_per_iteration": 2.595914602279663 }, { "auxiliary_loss_clip": 0.0112242, "auxiliary_loss_mlp": 0.01037085, "balance_loss_clip": 1.04264021, "balance_loss_mlp": 1.02337003, "epoch": 0.545017285435142, "flos": 25374587846400.0, "grad_norm": 1.6143269954810184, "language_loss": 0.79795569, "learning_rate": 1.8057704984653566e-06, "loss": 0.81955075, "num_input_tokens_seen": 195295635, "step": 9065, "time_per_iteration": 2.647632360458374 }, { "auxiliary_loss_clip": 0.01082455, "auxiliary_loss_mlp": 0.01032837, "balance_loss_clip": 1.04022825, "balance_loss_mlp": 1.0211482, "epoch": 0.54507740868781, "flos": 19134749934720.0, "grad_norm": 2.1024584454927626, "language_loss": 0.77589709, "learning_rate": 1.805382881379827e-06, "loss": 0.79705, "num_input_tokens_seen": 195312545, "step": 9066, "time_per_iteration": 2.750904083251953 }, { "auxiliary_loss_clip": 0.01106868, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.04005289, "balance_loss_mlp": 1.01794958, "epoch": 0.545137531940478, "flos": 26249802635520.0, "grad_norm": 2.0527073359497665, "language_loss": 0.75859725, "learning_rate": 1.8049952716741975e-06, "loss": 0.77997983, "num_input_tokens_seen": 195332955, "step": 9067, "time_per_iteration": 2.68332839012146 }, { "auxiliary_loss_clip": 0.0108798, "auxiliary_loss_mlp": 0.01038091, "balance_loss_clip": 1.04256892, "balance_loss_mlp": 1.02183652, "epoch": 0.545197655193146, "flos": 37555629995520.0, "grad_norm": 6.876378009840058, "language_loss": 0.63596183, "learning_rate": 1.8046076693631682e-06, "loss": 0.65722257, "num_input_tokens_seen": 195355930, "step": 9068, "time_per_iteration": 2.893052816390991 }, { "auxiliary_loss_clip": 0.01080095, "auxiliary_loss_mlp": 0.01041608, "balance_loss_clip": 1.0446372, "balance_loss_mlp": 1.02935874, "epoch": 0.5452577784458139, "flos": 26031250333440.0, "grad_norm": 1.5002235169223528, "language_loss": 0.7186054, "learning_rate": 1.8042200744614343e-06, "loss": 0.73982239, "num_input_tokens_seen": 195376445, "step": 9069, "time_per_iteration": 2.7437844276428223 }, { "auxiliary_loss_clip": 0.01118098, "auxiliary_loss_mlp": 0.01028881, "balance_loss_clip": 1.04397726, "balance_loss_mlp": 1.0169543, "epoch": 0.5453179016984819, "flos": 17639034675840.0, "grad_norm": 1.9248359915141238, "language_loss": 0.73836279, "learning_rate": 1.8038324869836957e-06, "loss": 0.75983256, "num_input_tokens_seen": 195393725, "step": 9070, "time_per_iteration": 2.629026174545288 }, { "auxiliary_loss_clip": 0.01104842, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.0405302, "balance_loss_mlp": 1.02508879, "epoch": 0.5453780249511498, "flos": 23216679406080.0, "grad_norm": 2.895965777257026, "language_loss": 0.60386193, "learning_rate": 1.8034449069446489e-06, "loss": 0.62529415, "num_input_tokens_seen": 195411380, "step": 9071, "time_per_iteration": 2.787898540496826 }, { "auxiliary_loss_clip": 0.0103628, "auxiliary_loss_mlp": 0.01019994, "balance_loss_clip": 1.01031959, "balance_loss_mlp": 1.01858091, "epoch": 0.5454381482038179, "flos": 68696504801280.0, "grad_norm": 0.701915733274622, "language_loss": 0.57096583, "learning_rate": 1.80305733435899e-06, "loss": 0.59152853, "num_input_tokens_seen": 195482015, "step": 9072, "time_per_iteration": 3.3096070289611816 }, { "auxiliary_loss_clip": 0.01088718, "auxiliary_loss_mlp": 0.0104092, "balance_loss_clip": 1.03829658, "balance_loss_mlp": 1.02696621, "epoch": 0.5454982714564858, "flos": 13260626346240.0, "grad_norm": 1.6985686628313852, "language_loss": 0.6941787, "learning_rate": 1.8026697692414174e-06, "loss": 0.71547508, "num_input_tokens_seen": 195500440, "step": 9073, "time_per_iteration": 5.942334413528442 }, { "auxiliary_loss_clip": 0.01094077, "auxiliary_loss_mlp": 0.01042156, "balance_loss_clip": 1.03799677, "balance_loss_mlp": 1.02981734, "epoch": 0.5455583947091538, "flos": 21835878733440.0, "grad_norm": 1.7477774368009211, "language_loss": 0.7124452, "learning_rate": 1.802282211606627e-06, "loss": 0.73380756, "num_input_tokens_seen": 195520860, "step": 9074, "time_per_iteration": 2.6760778427124023 }, { "auxiliary_loss_clip": 0.0110625, "auxiliary_loss_mlp": 0.01038683, "balance_loss_clip": 1.04050887, "balance_loss_mlp": 1.02611828, "epoch": 0.5456185179618217, "flos": 17817438551040.0, "grad_norm": 1.854490114521215, "language_loss": 0.68543398, "learning_rate": 1.8018946614693148e-06, "loss": 0.70688331, "num_input_tokens_seen": 195538615, "step": 9075, "time_per_iteration": 4.19740891456604 }, { "auxiliary_loss_clip": 0.01109026, "auxiliary_loss_mlp": 0.01034737, "balance_loss_clip": 1.04411292, "balance_loss_mlp": 1.02303696, "epoch": 0.5456786412144897, "flos": 21069401391360.0, "grad_norm": 1.8542702472429493, "language_loss": 0.80530715, "learning_rate": 1.8015071188441768e-06, "loss": 0.82674479, "num_input_tokens_seen": 195557460, "step": 9076, "time_per_iteration": 2.6821329593658447 }, { "auxiliary_loss_clip": 0.01109363, "auxiliary_loss_mlp": 0.01032135, "balance_loss_clip": 1.04109383, "balance_loss_mlp": 1.01970196, "epoch": 0.5457387644671576, "flos": 23294965098240.0, "grad_norm": 1.6176910715306643, "language_loss": 0.80137533, "learning_rate": 1.8011195837459089e-06, "loss": 0.82279032, "num_input_tokens_seen": 195577985, "step": 9077, "time_per_iteration": 2.6378607749938965 }, { "auxiliary_loss_clip": 0.01103737, "auxiliary_loss_mlp": 0.01035436, "balance_loss_clip": 1.04032636, "balance_loss_mlp": 1.02293682, "epoch": 0.5457988877198257, "flos": 21617039122560.0, "grad_norm": 2.2183628478116346, "language_loss": 0.67997038, "learning_rate": 1.8007320561892064e-06, "loss": 0.70136213, "num_input_tokens_seen": 195597620, "step": 9078, "time_per_iteration": 4.261017560958862 }, { "auxiliary_loss_clip": 0.01114465, "auxiliary_loss_mlp": 0.01039359, "balance_loss_clip": 1.04379976, "balance_loss_mlp": 1.02579284, "epoch": 0.5458590109724936, "flos": 23762485543680.0, "grad_norm": 1.8448340723101526, "language_loss": 0.80507636, "learning_rate": 1.800344536188764e-06, "loss": 0.82661462, "num_input_tokens_seen": 195615910, "step": 9079, "time_per_iteration": 2.6384685039520264 }, { "auxiliary_loss_clip": 0.01124513, "auxiliary_loss_mlp": 0.01034947, "balance_loss_clip": 1.04221058, "balance_loss_mlp": 1.02032018, "epoch": 0.5459191342251616, "flos": 24424283675520.0, "grad_norm": 1.6928746227882223, "language_loss": 0.75848919, "learning_rate": 1.799957023759277e-06, "loss": 0.78008378, "num_input_tokens_seen": 195635620, "step": 9080, "time_per_iteration": 2.6506381034851074 }, { "auxiliary_loss_clip": 0.01080273, "auxiliary_loss_mlp": 0.01037485, "balance_loss_clip": 1.03795743, "balance_loss_mlp": 1.0230484, "epoch": 0.5459792574778296, "flos": 23623009032960.0, "grad_norm": 2.0769433103494737, "language_loss": 0.83164978, "learning_rate": 1.7995695189154392e-06, "loss": 0.85282731, "num_input_tokens_seen": 195652495, "step": 9081, "time_per_iteration": 2.705381393432617 }, { "auxiliary_loss_clip": 0.0112596, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.04470921, "balance_loss_mlp": 1.01884151, "epoch": 0.5460393807304975, "flos": 19135540033920.0, "grad_norm": 1.688461125774873, "language_loss": 0.70063365, "learning_rate": 1.7991820216719461e-06, "loss": 0.72221684, "num_input_tokens_seen": 195671965, "step": 9082, "time_per_iteration": 2.6176023483276367 }, { "auxiliary_loss_clip": 0.01115168, "auxiliary_loss_mlp": 0.01030163, "balance_loss_clip": 1.03972983, "balance_loss_mlp": 1.01709151, "epoch": 0.5460995039831655, "flos": 35918534805120.0, "grad_norm": 1.559424348169526, "language_loss": 0.66653717, "learning_rate": 1.7987945320434906e-06, "loss": 0.68799043, "num_input_tokens_seen": 195694725, "step": 9083, "time_per_iteration": 2.710636854171753 }, { "auxiliary_loss_clip": 0.01091037, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.03879106, "balance_loss_mlp": 1.01998401, "epoch": 0.5461596272358334, "flos": 26759231274240.0, "grad_norm": 1.7271294710436846, "language_loss": 0.78584135, "learning_rate": 1.798407050044766e-06, "loss": 0.80708218, "num_input_tokens_seen": 195714090, "step": 9084, "time_per_iteration": 2.6876227855682373 }, { "auxiliary_loss_clip": 0.01111571, "auxiliary_loss_mlp": 0.01037411, "balance_loss_clip": 1.042117, "balance_loss_mlp": 1.02412558, "epoch": 0.5462197504885015, "flos": 20886580143360.0, "grad_norm": 2.0534049917888852, "language_loss": 0.75331509, "learning_rate": 1.7980195756904675e-06, "loss": 0.77480489, "num_input_tokens_seen": 195733585, "step": 9085, "time_per_iteration": 2.710315704345703 }, { "auxiliary_loss_clip": 0.01098293, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.0397166, "balance_loss_mlp": 1.02216959, "epoch": 0.5462798737411694, "flos": 25804976607360.0, "grad_norm": 2.0038443585531174, "language_loss": 0.75082123, "learning_rate": 1.7976321089952857e-06, "loss": 0.7721619, "num_input_tokens_seen": 195752820, "step": 9086, "time_per_iteration": 2.7101428508758545 }, { "auxiliary_loss_clip": 0.01102837, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.03951812, "balance_loss_mlp": 1.02227759, "epoch": 0.5463399969938374, "flos": 25775027642880.0, "grad_norm": 1.6829711206227542, "language_loss": 0.77097058, "learning_rate": 1.7972446499739155e-06, "loss": 0.79235566, "num_input_tokens_seen": 195773740, "step": 9087, "time_per_iteration": 2.6439003944396973 }, { "auxiliary_loss_clip": 0.01114018, "auxiliary_loss_mlp": 0.01042361, "balance_loss_clip": 1.04376245, "balance_loss_mlp": 1.02707863, "epoch": 0.5464001202465053, "flos": 18843298980480.0, "grad_norm": 1.9617582228039958, "language_loss": 0.77464199, "learning_rate": 1.7968571986410484e-06, "loss": 0.79620576, "num_input_tokens_seen": 195792125, "step": 9088, "time_per_iteration": 2.62850022315979 }, { "auxiliary_loss_clip": 0.00993547, "auxiliary_loss_mlp": 0.00999929, "balance_loss_clip": 1.02517176, "balance_loss_mlp": 0.99852258, "epoch": 0.5464602434991733, "flos": 69049541623680.0, "grad_norm": 0.7268281858475805, "language_loss": 0.57717931, "learning_rate": 1.7964697550113758e-06, "loss": 0.59711409, "num_input_tokens_seen": 195854935, "step": 9089, "time_per_iteration": 3.532050371170044 }, { "auxiliary_loss_clip": 0.01085451, "auxiliary_loss_mlp": 0.01038489, "balance_loss_clip": 1.03805399, "balance_loss_mlp": 1.02422571, "epoch": 0.5465203667518412, "flos": 27560039040000.0, "grad_norm": 1.7593878993297172, "language_loss": 0.76682436, "learning_rate": 1.7960823190995918e-06, "loss": 0.78806376, "num_input_tokens_seen": 195874715, "step": 9090, "time_per_iteration": 3.0779287815093994 }, { "auxiliary_loss_clip": 0.01106384, "auxiliary_loss_mlp": 0.01039408, "balance_loss_clip": 1.03928399, "balance_loss_mlp": 1.0233984, "epoch": 0.5465804900045093, "flos": 21210206705280.0, "grad_norm": 1.8843979676244431, "language_loss": 0.74037111, "learning_rate": 1.7956948909203855e-06, "loss": 0.76182902, "num_input_tokens_seen": 195892610, "step": 9091, "time_per_iteration": 2.6843886375427246 }, { "auxiliary_loss_clip": 0.01103772, "auxiliary_loss_mlp": 0.01037785, "balance_loss_clip": 1.04514658, "balance_loss_mlp": 1.02397454, "epoch": 0.5466406132571772, "flos": 22488949860480.0, "grad_norm": 1.8168674877061988, "language_loss": 0.78466463, "learning_rate": 1.7953074704884498e-06, "loss": 0.80608022, "num_input_tokens_seen": 195911085, "step": 9092, "time_per_iteration": 2.6951024532318115 }, { "auxiliary_loss_clip": 0.01125215, "auxiliary_loss_mlp": 0.01034303, "balance_loss_clip": 1.04363537, "balance_loss_mlp": 1.01997435, "epoch": 0.5467007365098452, "flos": 17675843137920.0, "grad_norm": 2.188123152779193, "language_loss": 0.74691254, "learning_rate": 1.794920057818476e-06, "loss": 0.76850772, "num_input_tokens_seen": 195929845, "step": 9093, "time_per_iteration": 2.596165657043457 }, { "auxiliary_loss_clip": 0.01112494, "auxiliary_loss_mlp": 0.01040653, "balance_loss_clip": 1.04044032, "balance_loss_mlp": 1.02444029, "epoch": 0.5467608597625132, "flos": 15698852524800.0, "grad_norm": 2.4498750676664414, "language_loss": 0.6874221, "learning_rate": 1.7945326529251533e-06, "loss": 0.70895356, "num_input_tokens_seen": 195946350, "step": 9094, "time_per_iteration": 2.617203712463379 }, { "auxiliary_loss_clip": 0.01100239, "auxiliary_loss_mlp": 0.0103544, "balance_loss_clip": 1.04255402, "balance_loss_mlp": 1.02238083, "epoch": 0.5468209830151811, "flos": 24312816794880.0, "grad_norm": 3.189829826251606, "language_loss": 0.67888498, "learning_rate": 1.7941452558231731e-06, "loss": 0.70024174, "num_input_tokens_seen": 195959840, "step": 9095, "time_per_iteration": 2.709214687347412 }, { "auxiliary_loss_clip": 0.01085979, "auxiliary_loss_mlp": 0.01036228, "balance_loss_clip": 1.0412364, "balance_loss_mlp": 1.0228703, "epoch": 0.5468811062678491, "flos": 29166323339520.0, "grad_norm": 1.772487886139895, "language_loss": 0.66687673, "learning_rate": 1.7937578665272256e-06, "loss": 0.68809879, "num_input_tokens_seen": 195981125, "step": 9096, "time_per_iteration": 2.768289804458618 }, { "auxiliary_loss_clip": 0.01013718, "auxiliary_loss_mlp": 0.01003083, "balance_loss_clip": 1.01639581, "balance_loss_mlp": 1.00179529, "epoch": 0.546941229520517, "flos": 67867037982720.0, "grad_norm": 0.7380745619271847, "language_loss": 0.57528484, "learning_rate": 1.7933704850520007e-06, "loss": 0.59545285, "num_input_tokens_seen": 196038880, "step": 9097, "time_per_iteration": 3.353034496307373 }, { "auxiliary_loss_clip": 0.01023908, "auxiliary_loss_mlp": 0.00999165, "balance_loss_clip": 1.01245689, "balance_loss_mlp": 0.99754351, "epoch": 0.5470013527731851, "flos": 58270306625280.0, "grad_norm": 0.9199423088856966, "language_loss": 0.64710629, "learning_rate": 1.7929831114121868e-06, "loss": 0.66733694, "num_input_tokens_seen": 196099215, "step": 9098, "time_per_iteration": 3.1356828212738037 }, { "auxiliary_loss_clip": 0.01114825, "auxiliary_loss_mlp": 0.01037808, "balance_loss_clip": 1.04415989, "balance_loss_mlp": 1.02378869, "epoch": 0.547061476025853, "flos": 22965915582720.0, "grad_norm": 2.132166365058938, "language_loss": 0.73123235, "learning_rate": 1.7925957456224753e-06, "loss": 0.75275862, "num_input_tokens_seen": 196120370, "step": 9099, "time_per_iteration": 2.662252426147461 }, { "auxiliary_loss_clip": 0.01097751, "auxiliary_loss_mlp": 0.01035706, "balance_loss_clip": 1.04278708, "balance_loss_mlp": 1.02327275, "epoch": 0.547121599278521, "flos": 29968244426880.0, "grad_norm": 1.880355780986747, "language_loss": 0.72515011, "learning_rate": 1.7922083876975537e-06, "loss": 0.74648476, "num_input_tokens_seen": 196139075, "step": 9100, "time_per_iteration": 2.859636068344116 }, { "auxiliary_loss_clip": 0.01106059, "auxiliary_loss_mlp": 0.00770753, "balance_loss_clip": 1.04162157, "balance_loss_mlp": 1.00017691, "epoch": 0.5471817225311889, "flos": 36535443914880.0, "grad_norm": 1.8314110929237357, "language_loss": 0.68211091, "learning_rate": 1.7918210376521102e-06, "loss": 0.70087898, "num_input_tokens_seen": 196159990, "step": 9101, "time_per_iteration": 2.747811794281006 }, { "auxiliary_loss_clip": 0.01123228, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.04393971, "balance_loss_mlp": 1.02121687, "epoch": 0.5472418457838569, "flos": 25775243124480.0, "grad_norm": 1.907951209204745, "language_loss": 0.77796781, "learning_rate": 1.7914336955008343e-06, "loss": 0.79954892, "num_input_tokens_seen": 196180570, "step": 9102, "time_per_iteration": 2.6425788402557373 }, { "auxiliary_loss_clip": 0.01087581, "auxiliary_loss_mlp": 0.01039397, "balance_loss_clip": 1.04114008, "balance_loss_mlp": 1.02447212, "epoch": 0.5473019690365248, "flos": 27887687925120.0, "grad_norm": 1.553646996990172, "language_loss": 0.72080058, "learning_rate": 1.791046361258413e-06, "loss": 0.74207032, "num_input_tokens_seen": 196200300, "step": 9103, "time_per_iteration": 2.7307486534118652 }, { "auxiliary_loss_clip": 0.01088884, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.0425241, "balance_loss_mlp": 1.01806211, "epoch": 0.5473620922891929, "flos": 57631490219520.0, "grad_norm": 1.4283303897304696, "language_loss": 0.65195155, "learning_rate": 1.7906590349395356e-06, "loss": 0.67315584, "num_input_tokens_seen": 196228525, "step": 9104, "time_per_iteration": 3.0792930126190186 }, { "auxiliary_loss_clip": 0.01109949, "auxiliary_loss_mlp": 0.0103298, "balance_loss_clip": 1.04480743, "balance_loss_mlp": 1.01883578, "epoch": 0.5474222155418608, "flos": 19354056422400.0, "grad_norm": 1.90483998435302, "language_loss": 0.82428771, "learning_rate": 1.790271716558888e-06, "loss": 0.84571701, "num_input_tokens_seen": 196247690, "step": 9105, "time_per_iteration": 3.3235061168670654 }, { "auxiliary_loss_clip": 0.01119165, "auxiliary_loss_mlp": 0.01030088, "balance_loss_clip": 1.04210079, "balance_loss_mlp": 1.01735604, "epoch": 0.5474823387945288, "flos": 25120448144640.0, "grad_norm": 1.6592382133296117, "language_loss": 0.80052161, "learning_rate": 1.7898844061311575e-06, "loss": 0.82201409, "num_input_tokens_seen": 196268555, "step": 9106, "time_per_iteration": 2.7082676887512207 }, { "auxiliary_loss_clip": 0.01115376, "auxiliary_loss_mlp": 0.01036861, "balance_loss_clip": 1.04689944, "balance_loss_mlp": 1.02419519, "epoch": 0.5475424620471967, "flos": 18004174381440.0, "grad_norm": 1.7933883779040884, "language_loss": 0.69402343, "learning_rate": 1.7894971036710322e-06, "loss": 0.71554577, "num_input_tokens_seen": 196285585, "step": 9107, "time_per_iteration": 2.626214027404785 }, { "auxiliary_loss_clip": 0.01115289, "auxiliary_loss_mlp": 0.01035057, "balance_loss_clip": 1.04319263, "balance_loss_mlp": 1.02166939, "epoch": 0.5476025852998647, "flos": 22309324922880.0, "grad_norm": 2.6929722220667824, "language_loss": 0.63537276, "learning_rate": 1.789109809193197e-06, "loss": 0.65687621, "num_input_tokens_seen": 196305085, "step": 9108, "time_per_iteration": 2.6056766510009766 }, { "auxiliary_loss_clip": 0.01122102, "auxiliary_loss_mlp": 0.01029913, "balance_loss_clip": 1.0446291, "balance_loss_mlp": 1.01750922, "epoch": 0.5476627085525327, "flos": 20120497850880.0, "grad_norm": 1.7311986454715018, "language_loss": 0.75234431, "learning_rate": 1.7887225227123396e-06, "loss": 0.77386445, "num_input_tokens_seen": 196323945, "step": 9109, "time_per_iteration": 2.562833786010742 }, { "auxiliary_loss_clip": 0.01093609, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.04307365, "balance_loss_mlp": 1.02143562, "epoch": 0.5477228318052006, "flos": 17712579772800.0, "grad_norm": 1.7887859684809904, "language_loss": 0.77939326, "learning_rate": 1.7883352442431457e-06, "loss": 0.800686, "num_input_tokens_seen": 196342200, "step": 9110, "time_per_iteration": 2.62839674949646 }, { "auxiliary_loss_clip": 0.01106302, "auxiliary_loss_mlp": 0.01032426, "balance_loss_clip": 1.04262304, "balance_loss_mlp": 1.01997423, "epoch": 0.5477829550578687, "flos": 25848895962240.0, "grad_norm": 1.525983194059855, "language_loss": 0.71175343, "learning_rate": 1.7879479738002993e-06, "loss": 0.73314071, "num_input_tokens_seen": 196362940, "step": 9111, "time_per_iteration": 2.664486885070801 }, { "auxiliary_loss_clip": 0.01111586, "auxiliary_loss_mlp": 0.01044961, "balance_loss_clip": 1.0436976, "balance_loss_mlp": 1.0317409, "epoch": 0.5478430783105366, "flos": 23039676161280.0, "grad_norm": 1.5197619181850293, "language_loss": 0.71096945, "learning_rate": 1.7875607113984876e-06, "loss": 0.73253489, "num_input_tokens_seen": 196383070, "step": 9112, "time_per_iteration": 2.7334086894989014 }, { "auxiliary_loss_clip": 0.01067523, "auxiliary_loss_mlp": 0.01034936, "balance_loss_clip": 1.03873658, "balance_loss_mlp": 1.02179968, "epoch": 0.5479032015632046, "flos": 16071210864000.0, "grad_norm": 2.172543516099556, "language_loss": 0.87877554, "learning_rate": 1.7871734570523953e-06, "loss": 0.89980012, "num_input_tokens_seen": 196398485, "step": 9113, "time_per_iteration": 5.9666571617126465 }, { "auxiliary_loss_clip": 0.01070074, "auxiliary_loss_mlp": 0.01032166, "balance_loss_clip": 1.04229951, "balance_loss_mlp": 1.01853991, "epoch": 0.5479633248158725, "flos": 24278701852800.0, "grad_norm": 1.4694487805740626, "language_loss": 0.73041236, "learning_rate": 1.7867862107767067e-06, "loss": 0.7514348, "num_input_tokens_seen": 196417725, "step": 9114, "time_per_iteration": 4.333765745162964 }, { "auxiliary_loss_clip": 0.01093195, "auxiliary_loss_mlp": 0.00770887, "balance_loss_clip": 1.03821266, "balance_loss_mlp": 1.00027823, "epoch": 0.5480234480685405, "flos": 26358216860160.0, "grad_norm": 1.6145561495164014, "language_loss": 0.72155976, "learning_rate": 1.7863989725861066e-06, "loss": 0.74020058, "num_input_tokens_seen": 196437840, "step": 9115, "time_per_iteration": 2.6793766021728516 }, { "auxiliary_loss_clip": 0.01084634, "auxiliary_loss_mlp": 0.00774539, "balance_loss_clip": 1.03983831, "balance_loss_mlp": 1.00038791, "epoch": 0.5480835713212084, "flos": 22055077480320.0, "grad_norm": 1.7266092862770852, "language_loss": 0.72229278, "learning_rate": 1.7860117424952781e-06, "loss": 0.74088448, "num_input_tokens_seen": 196457300, "step": 9116, "time_per_iteration": 2.738142490386963 }, { "auxiliary_loss_clip": 0.01095127, "auxiliary_loss_mlp": 0.01039685, "balance_loss_clip": 1.04102373, "balance_loss_mlp": 1.0259639, "epoch": 0.5481436945738765, "flos": 25301042749440.0, "grad_norm": 4.413930764564679, "language_loss": 0.76158273, "learning_rate": 1.7856245205189063e-06, "loss": 0.78293079, "num_input_tokens_seen": 196476720, "step": 9117, "time_per_iteration": 2.693359613418579 }, { "auxiliary_loss_clip": 0.01070482, "auxiliary_loss_mlp": 0.01035701, "balance_loss_clip": 1.03514457, "balance_loss_mlp": 1.02292752, "epoch": 0.5482038178265444, "flos": 33580857772800.0, "grad_norm": 1.575829874902699, "language_loss": 0.62537289, "learning_rate": 1.785237306671674e-06, "loss": 0.64643478, "num_input_tokens_seen": 196496765, "step": 9118, "time_per_iteration": 4.42430305480957 }, { "auxiliary_loss_clip": 0.01124628, "auxiliary_loss_mlp": 0.01036624, "balance_loss_clip": 1.04479444, "balance_loss_mlp": 1.02259278, "epoch": 0.5482639410792124, "flos": 19026192055680.0, "grad_norm": 2.694246810130355, "language_loss": 0.79018009, "learning_rate": 1.7848501009682646e-06, "loss": 0.81179261, "num_input_tokens_seen": 196516220, "step": 9119, "time_per_iteration": 2.606593608856201 }, { "auxiliary_loss_clip": 0.01092726, "auxiliary_loss_mlp": 0.00769453, "balance_loss_clip": 1.04150975, "balance_loss_mlp": 1.00022948, "epoch": 0.5483240643318803, "flos": 25410318900480.0, "grad_norm": 1.8682271604905119, "language_loss": 0.82534289, "learning_rate": 1.7844629034233604e-06, "loss": 0.8439647, "num_input_tokens_seen": 196533860, "step": 9120, "time_per_iteration": 2.694546699523926 }, { "auxiliary_loss_clip": 0.01089359, "auxiliary_loss_mlp": 0.01039031, "balance_loss_clip": 1.04395008, "balance_loss_mlp": 1.02531016, "epoch": 0.5483841875845483, "flos": 21466896272640.0, "grad_norm": 1.8000226938726367, "language_loss": 0.80031526, "learning_rate": 1.7840757140516455e-06, "loss": 0.82159919, "num_input_tokens_seen": 196551305, "step": 9121, "time_per_iteration": 2.7422945499420166 }, { "auxiliary_loss_clip": 0.01076146, "auxiliary_loss_mlp": 0.01038978, "balance_loss_clip": 1.03803313, "balance_loss_mlp": 1.02408934, "epoch": 0.5484443108372163, "flos": 24747263792640.0, "grad_norm": 1.9827939120507885, "language_loss": 0.60996848, "learning_rate": 1.7836885328678008e-06, "loss": 0.63111973, "num_input_tokens_seen": 196569420, "step": 9122, "time_per_iteration": 2.782677412033081 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.04853153, "balance_loss_mlp": 1.0268079, "epoch": 0.5485044340898843, "flos": 25375377945600.0, "grad_norm": 1.5587852273808862, "language_loss": 0.71594763, "learning_rate": 1.7833013598865084e-06, "loss": 0.73739696, "num_input_tokens_seen": 196590610, "step": 9123, "time_per_iteration": 2.756350517272949 }, { "auxiliary_loss_clip": 0.01121133, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.04210067, "balance_loss_mlp": 1.0208813, "epoch": 0.5485645573425523, "flos": 12641167370880.0, "grad_norm": 2.3735658261361845, "language_loss": 0.83559448, "learning_rate": 1.7829141951224505e-06, "loss": 0.85714072, "num_input_tokens_seen": 196606495, "step": 9124, "time_per_iteration": 2.61197829246521 }, { "auxiliary_loss_clip": 0.01094486, "auxiliary_loss_mlp": 0.01033029, "balance_loss_clip": 1.04321349, "balance_loss_mlp": 1.01992834, "epoch": 0.5486246805952202, "flos": 28329425383680.0, "grad_norm": 1.5486509111854319, "language_loss": 0.80518043, "learning_rate": 1.7825270385903075e-06, "loss": 0.82645559, "num_input_tokens_seen": 196626365, "step": 9125, "time_per_iteration": 2.773972749710083 }, { "auxiliary_loss_clip": 0.01111849, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.04336679, "balance_loss_mlp": 1.01903141, "epoch": 0.5486848038478882, "flos": 16800017817600.0, "grad_norm": 4.333134351852335, "language_loss": 0.74312758, "learning_rate": 1.7821398903047617e-06, "loss": 0.76456887, "num_input_tokens_seen": 196644465, "step": 9126, "time_per_iteration": 2.654529333114624 }, { "auxiliary_loss_clip": 0.01107646, "auxiliary_loss_mlp": 0.01037249, "balance_loss_clip": 1.03968537, "balance_loss_mlp": 1.02193701, "epoch": 0.5487449271005561, "flos": 17236224581760.0, "grad_norm": 2.710645426319007, "language_loss": 0.66802239, "learning_rate": 1.7817527502804928e-06, "loss": 0.6894713, "num_input_tokens_seen": 196659160, "step": 9127, "time_per_iteration": 2.615807294845581 }, { "auxiliary_loss_clip": 0.01078683, "auxiliary_loss_mlp": 0.01039383, "balance_loss_clip": 1.03928149, "balance_loss_mlp": 1.0249052, "epoch": 0.5488050503532241, "flos": 17340867878400.0, "grad_norm": 2.0894273864631225, "language_loss": 0.82909453, "learning_rate": 1.781365618532181e-06, "loss": 0.85027516, "num_input_tokens_seen": 196677410, "step": 9128, "time_per_iteration": 2.681060791015625 }, { "auxiliary_loss_clip": 0.01074302, "auxiliary_loss_mlp": 0.01037438, "balance_loss_clip": 1.03565645, "balance_loss_mlp": 1.02254319, "epoch": 0.548865173605892, "flos": 17239169496960.0, "grad_norm": 1.9025486027385248, "language_loss": 0.74247289, "learning_rate": 1.7809784950745078e-06, "loss": 0.76359022, "num_input_tokens_seen": 196696765, "step": 9129, "time_per_iteration": 2.681459426879883 }, { "auxiliary_loss_clip": 0.01077104, "auxiliary_loss_mlp": 0.01037347, "balance_loss_clip": 1.03771412, "balance_loss_mlp": 1.02210581, "epoch": 0.5489252968585601, "flos": 17456716218240.0, "grad_norm": 3.0707794644461854, "language_loss": 0.63489515, "learning_rate": 1.7805913799221511e-06, "loss": 0.65603966, "num_input_tokens_seen": 196714895, "step": 9130, "time_per_iteration": 2.743734359741211 }, { "auxiliary_loss_clip": 0.01124543, "auxiliary_loss_mlp": 0.00771634, "balance_loss_clip": 1.04329586, "balance_loss_mlp": 1.00023222, "epoch": 0.548985420111228, "flos": 26323383646080.0, "grad_norm": 1.7961020275949398, "language_loss": 0.62998879, "learning_rate": 1.7802042730897915e-06, "loss": 0.64895058, "num_input_tokens_seen": 196735510, "step": 9131, "time_per_iteration": 2.7136600017547607 }, { "auxiliary_loss_clip": 0.01109321, "auxiliary_loss_mlp": 0.01039388, "balance_loss_clip": 1.04004657, "balance_loss_mlp": 1.02416492, "epoch": 0.549045543363896, "flos": 18693730748160.0, "grad_norm": 1.6718560353245449, "language_loss": 0.7504952, "learning_rate": 1.7798171745921084e-06, "loss": 0.77198231, "num_input_tokens_seen": 196752855, "step": 9132, "time_per_iteration": 2.686460494995117 }, { "auxiliary_loss_clip": 0.01107553, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.03815818, "balance_loss_mlp": 1.02046108, "epoch": 0.5491056666165639, "flos": 24717386655360.0, "grad_norm": 1.5443073078358045, "language_loss": 0.81107825, "learning_rate": 1.7794300844437795e-06, "loss": 0.83248657, "num_input_tokens_seen": 196772230, "step": 9133, "time_per_iteration": 2.607304811477661 }, { "auxiliary_loss_clip": 0.0109676, "auxiliary_loss_mlp": 0.00770878, "balance_loss_clip": 1.04211152, "balance_loss_mlp": 1.00023055, "epoch": 0.5491657898692319, "flos": 21576926609280.0, "grad_norm": 2.2143971437865275, "language_loss": 0.69978988, "learning_rate": 1.7790430026594841e-06, "loss": 0.71846628, "num_input_tokens_seen": 196790405, "step": 9134, "time_per_iteration": 2.655400037765503 }, { "auxiliary_loss_clip": 0.01085592, "auxiliary_loss_mlp": 0.0104003, "balance_loss_clip": 1.03952289, "balance_loss_mlp": 1.0263567, "epoch": 0.5492259131219, "flos": 50476432746240.0, "grad_norm": 2.156005038881863, "language_loss": 0.61240542, "learning_rate": 1.7786559292539004e-06, "loss": 0.63366163, "num_input_tokens_seen": 196813785, "step": 9135, "time_per_iteration": 2.911567449569702 }, { "auxiliary_loss_clip": 0.01112825, "auxiliary_loss_mlp": 0.01036574, "balance_loss_clip": 1.042696, "balance_loss_mlp": 1.02169049, "epoch": 0.5492860363745679, "flos": 25119262995840.0, "grad_norm": 1.746391133416305, "language_loss": 0.72368252, "learning_rate": 1.7782688642417058e-06, "loss": 0.74517649, "num_input_tokens_seen": 196834390, "step": 9136, "time_per_iteration": 2.6732101440429688 }, { "auxiliary_loss_clip": 0.01060281, "auxiliary_loss_mlp": 0.0104408, "balance_loss_clip": 1.03961897, "balance_loss_mlp": 1.02839267, "epoch": 0.5493461596272359, "flos": 22633777497600.0, "grad_norm": 2.424259272269788, "language_loss": 0.68256485, "learning_rate": 1.7778818076375781e-06, "loss": 0.70360851, "num_input_tokens_seen": 196853290, "step": 9137, "time_per_iteration": 2.7947540283203125 }, { "auxiliary_loss_clip": 0.01030828, "auxiliary_loss_mlp": 0.01011299, "balance_loss_clip": 1.01489806, "balance_loss_mlp": 1.00992203, "epoch": 0.5494062828799038, "flos": 66151800754560.0, "grad_norm": 0.7420439748923869, "language_loss": 0.65270352, "learning_rate": 1.7774947594561947e-06, "loss": 0.67312479, "num_input_tokens_seen": 196913120, "step": 9138, "time_per_iteration": 3.2256250381469727 }, { "auxiliary_loss_clip": 0.0111256, "auxiliary_loss_mlp": 0.01032689, "balance_loss_clip": 1.04488194, "balance_loss_mlp": 1.01902211, "epoch": 0.5494664061325718, "flos": 21105958458240.0, "grad_norm": 1.8659950166851553, "language_loss": 0.75243253, "learning_rate": 1.7771077197122321e-06, "loss": 0.77388501, "num_input_tokens_seen": 196931530, "step": 9139, "time_per_iteration": 2.7239251136779785 }, { "auxiliary_loss_clip": 0.01110681, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.04175556, "balance_loss_mlp": 1.01932561, "epoch": 0.5495265293852397, "flos": 14392566616320.0, "grad_norm": 1.6260992267363037, "language_loss": 0.70765269, "learning_rate": 1.7767206884203672e-06, "loss": 0.72909158, "num_input_tokens_seen": 196949430, "step": 9140, "time_per_iteration": 2.647174119949341 }, { "auxiliary_loss_clip": 0.01090583, "auxiliary_loss_mlp": 0.01036785, "balance_loss_clip": 1.03731537, "balance_loss_mlp": 1.02207434, "epoch": 0.5495866526379077, "flos": 25549148966400.0, "grad_norm": 1.8985191424105816, "language_loss": 0.7687242, "learning_rate": 1.7763336655952762e-06, "loss": 0.78999794, "num_input_tokens_seen": 196968265, "step": 9141, "time_per_iteration": 2.65411639213562 }, { "auxiliary_loss_clip": 0.01084812, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.0427072, "balance_loss_mlp": 1.02342081, "epoch": 0.5496467758905756, "flos": 21317256213120.0, "grad_norm": 2.1277262842794697, "language_loss": 0.7463578, "learning_rate": 1.7759466512516346e-06, "loss": 0.7675755, "num_input_tokens_seen": 196984930, "step": 9142, "time_per_iteration": 2.7200329303741455 }, { "auxiliary_loss_clip": 0.01098795, "auxiliary_loss_mlp": 0.01036884, "balance_loss_clip": 1.04416585, "balance_loss_mlp": 1.02186954, "epoch": 0.5497068991432437, "flos": 22233086305920.0, "grad_norm": 5.155975597587774, "language_loss": 0.7661894, "learning_rate": 1.7755596454041192e-06, "loss": 0.78754616, "num_input_tokens_seen": 197002320, "step": 9143, "time_per_iteration": 2.6951520442962646 }, { "auxiliary_loss_clip": 0.01091779, "auxiliary_loss_mlp": 0.01037521, "balance_loss_clip": 1.03912258, "balance_loss_mlp": 1.02332926, "epoch": 0.5497670223959116, "flos": 18479093028480.0, "grad_norm": 2.8186227807908466, "language_loss": 0.79572552, "learning_rate": 1.7751726480674044e-06, "loss": 0.81701857, "num_input_tokens_seen": 197020825, "step": 9144, "time_per_iteration": 2.661098003387451 }, { "auxiliary_loss_clip": 0.01112844, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.04339552, "balance_loss_mlp": 1.02086163, "epoch": 0.5498271456485796, "flos": 29205107049600.0, "grad_norm": 1.6865855857111283, "language_loss": 0.70998669, "learning_rate": 1.7747856592561645e-06, "loss": 0.731462, "num_input_tokens_seen": 197040450, "step": 9145, "time_per_iteration": 2.6857175827026367 }, { "auxiliary_loss_clip": 0.01109884, "auxiliary_loss_mlp": 0.01033378, "balance_loss_clip": 1.04158354, "balance_loss_mlp": 1.02063489, "epoch": 0.5498872689012475, "flos": 34824372664320.0, "grad_norm": 1.7292068512536125, "language_loss": 0.70875257, "learning_rate": 1.774398678985076e-06, "loss": 0.73018515, "num_input_tokens_seen": 197063930, "step": 9146, "time_per_iteration": 2.7719805240631104 }, { "auxiliary_loss_clip": 0.01096176, "auxiliary_loss_mlp": 0.01029792, "balance_loss_clip": 1.04054928, "balance_loss_mlp": 1.01708448, "epoch": 0.5499473921539155, "flos": 25921938268800.0, "grad_norm": 1.7336366982972622, "language_loss": 0.63770372, "learning_rate": 1.7740117072688113e-06, "loss": 0.65896338, "num_input_tokens_seen": 197082660, "step": 9147, "time_per_iteration": 2.6603379249572754 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.04582083, "balance_loss_mlp": 1.01920164, "epoch": 0.5500075154065835, "flos": 22273701609600.0, "grad_norm": 2.1607061922348088, "language_loss": 0.81009579, "learning_rate": 1.7736247441220458e-06, "loss": 0.8316772, "num_input_tokens_seen": 197100675, "step": 9148, "time_per_iteration": 2.620183229446411 }, { "auxiliary_loss_clip": 0.01101315, "auxiliary_loss_mlp": 0.01039357, "balance_loss_clip": 1.04367983, "balance_loss_mlp": 1.02550507, "epoch": 0.5500676386592515, "flos": 28037507552640.0, "grad_norm": 1.7340881050910257, "language_loss": 0.79154336, "learning_rate": 1.773237789559453e-06, "loss": 0.81295007, "num_input_tokens_seen": 197121320, "step": 9149, "time_per_iteration": 2.734495162963867 }, { "auxiliary_loss_clip": 0.01082615, "auxiliary_loss_mlp": 0.0102795, "balance_loss_clip": 1.0412097, "balance_loss_mlp": 1.01476002, "epoch": 0.5501277619119195, "flos": 23914819123200.0, "grad_norm": 4.0693062888880185, "language_loss": 0.72006851, "learning_rate": 1.7728508435957052e-06, "loss": 0.74117416, "num_input_tokens_seen": 197138965, "step": 9150, "time_per_iteration": 2.66481876373291 }, { "auxiliary_loss_clip": 0.01099742, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.03804266, "balance_loss_mlp": 1.0189085, "epoch": 0.5501878851645874, "flos": 20923783655040.0, "grad_norm": 3.1249847499070014, "language_loss": 0.75043446, "learning_rate": 1.772463906245477e-06, "loss": 0.77176708, "num_input_tokens_seen": 197156460, "step": 9151, "time_per_iteration": 2.704946517944336 }, { "auxiliary_loss_clip": 0.0109205, "auxiliary_loss_mlp": 0.01033656, "balance_loss_clip": 1.03899741, "balance_loss_mlp": 1.01981556, "epoch": 0.5502480084172554, "flos": 20665298407680.0, "grad_norm": 2.3903222148465035, "language_loss": 0.76302028, "learning_rate": 1.7720769775234394e-06, "loss": 0.78427732, "num_input_tokens_seen": 197175140, "step": 9152, "time_per_iteration": 5.871058464050293 }, { "auxiliary_loss_clip": 0.01098821, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.04291546, "balance_loss_mlp": 1.02058983, "epoch": 0.5503081316699233, "flos": 26432552056320.0, "grad_norm": 1.865148989318078, "language_loss": 0.82033801, "learning_rate": 1.7716900574442662e-06, "loss": 0.84166336, "num_input_tokens_seen": 197194345, "step": 9153, "time_per_iteration": 2.741382598876953 }, { "auxiliary_loss_clip": 0.01110131, "auxiliary_loss_mlp": 0.01029037, "balance_loss_clip": 1.04423809, "balance_loss_mlp": 1.01572764, "epoch": 0.5503682549225913, "flos": 30629144718720.0, "grad_norm": 1.7497509025726563, "language_loss": 0.74392802, "learning_rate": 1.7713031460226294e-06, "loss": 0.76531971, "num_input_tokens_seen": 197215535, "step": 9154, "time_per_iteration": 4.345115900039673 }, { "auxiliary_loss_clip": 0.01104154, "auxiliary_loss_mlp": 0.01039546, "balance_loss_clip": 1.04041803, "balance_loss_mlp": 1.02451348, "epoch": 0.5504283781752592, "flos": 22565439872640.0, "grad_norm": 1.5994441828682415, "language_loss": 0.73138744, "learning_rate": 1.770916243273199e-06, "loss": 0.75282443, "num_input_tokens_seen": 197234945, "step": 9155, "time_per_iteration": 2.6851611137390137 }, { "auxiliary_loss_clip": 0.01021957, "auxiliary_loss_mlp": 0.01001594, "balance_loss_clip": 1.01543474, "balance_loss_mlp": 1.00016963, "epoch": 0.5504885014279273, "flos": 67901009270400.0, "grad_norm": 0.7575867212346565, "language_loss": 0.55399221, "learning_rate": 1.7705293492106483e-06, "loss": 0.57422775, "num_input_tokens_seen": 197302285, "step": 9156, "time_per_iteration": 3.300373077392578 }, { "auxiliary_loss_clip": 0.0110824, "auxiliary_loss_mlp": 0.01037205, "balance_loss_clip": 1.03954601, "balance_loss_mlp": 1.02354383, "epoch": 0.5505486246805952, "flos": 22450058409600.0, "grad_norm": 1.7338338818713679, "language_loss": 0.82676858, "learning_rate": 1.7701424638496475e-06, "loss": 0.84822297, "num_input_tokens_seen": 197321575, "step": 9157, "time_per_iteration": 4.260001182556152 }, { "auxiliary_loss_clip": 0.01128779, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.04512608, "balance_loss_mlp": 1.02101421, "epoch": 0.5506087479332632, "flos": 26906896085760.0, "grad_norm": 2.1665568405651916, "language_loss": 0.7574966, "learning_rate": 1.7697555872048677e-06, "loss": 0.77914703, "num_input_tokens_seen": 197340255, "step": 9158, "time_per_iteration": 2.634035587310791 }, { "auxiliary_loss_clip": 0.01079995, "auxiliary_loss_mlp": 0.01032346, "balance_loss_clip": 1.04036868, "balance_loss_mlp": 1.01919723, "epoch": 0.5506688711859311, "flos": 22930256355840.0, "grad_norm": 1.7765349720842452, "language_loss": 0.7011236, "learning_rate": 1.769368719290979e-06, "loss": 0.72224694, "num_input_tokens_seen": 197360360, "step": 9159, "time_per_iteration": 2.765982151031494 }, { "auxiliary_loss_clip": 0.01074937, "auxiliary_loss_mlp": 0.00772606, "balance_loss_clip": 1.03859997, "balance_loss_mlp": 1.00024915, "epoch": 0.5507289944385991, "flos": 29606408772480.0, "grad_norm": 1.5184177470515237, "language_loss": 0.6844312, "learning_rate": 1.7689818601226516e-06, "loss": 0.70290661, "num_input_tokens_seen": 197381905, "step": 9160, "time_per_iteration": 2.7715611457824707 }, { "auxiliary_loss_clip": 0.01121201, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.04361653, "balance_loss_mlp": 1.02297473, "epoch": 0.5507891176912671, "flos": 15334431091200.0, "grad_norm": 2.346039254378587, "language_loss": 0.71789527, "learning_rate": 1.7685950097145552e-06, "loss": 0.7394737, "num_input_tokens_seen": 197398555, "step": 9161, "time_per_iteration": 2.641042470932007 }, { "auxiliary_loss_clip": 0.01112875, "auxiliary_loss_mlp": 0.01042589, "balance_loss_clip": 1.04357731, "balance_loss_mlp": 1.02879643, "epoch": 0.5508492409439351, "flos": 26578313447040.0, "grad_norm": 1.6233913779896265, "language_loss": 0.69443804, "learning_rate": 1.768208168081359e-06, "loss": 0.71599269, "num_input_tokens_seen": 197419630, "step": 9162, "time_per_iteration": 2.693645715713501 }, { "auxiliary_loss_clip": 0.01122811, "auxiliary_loss_mlp": 0.01038789, "balance_loss_clip": 1.04462349, "balance_loss_mlp": 1.02506185, "epoch": 0.5509093641966031, "flos": 25443428261760.0, "grad_norm": 1.863003505887403, "language_loss": 0.85338551, "learning_rate": 1.767821335237733e-06, "loss": 0.87500155, "num_input_tokens_seen": 197438480, "step": 9163, "time_per_iteration": 2.6538877487182617 }, { "auxiliary_loss_clip": 0.01088872, "auxiliary_loss_mlp": 0.01032282, "balance_loss_clip": 1.04132617, "balance_loss_mlp": 1.01908576, "epoch": 0.550969487449271, "flos": 18698543170560.0, "grad_norm": 1.8611061255519936, "language_loss": 0.80892253, "learning_rate": 1.7674345111983441e-06, "loss": 0.83013415, "num_input_tokens_seen": 197456755, "step": 9164, "time_per_iteration": 2.813016891479492 }, { "auxiliary_loss_clip": 0.0110727, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.04617882, "balance_loss_mlp": 1.01856649, "epoch": 0.551029610701939, "flos": 22708723224960.0, "grad_norm": 1.8149479270660511, "language_loss": 0.73350954, "learning_rate": 1.767047695977863e-06, "loss": 0.75491256, "num_input_tokens_seen": 197475530, "step": 9165, "time_per_iteration": 2.6487855911254883 }, { "auxiliary_loss_clip": 0.01103747, "auxiliary_loss_mlp": 0.01041326, "balance_loss_clip": 1.04083133, "balance_loss_mlp": 1.02677011, "epoch": 0.5510897339546069, "flos": 12420496166400.0, "grad_norm": 1.9553906281347788, "language_loss": 0.78998721, "learning_rate": 1.7666608895909563e-06, "loss": 0.8114379, "num_input_tokens_seen": 197490835, "step": 9166, "time_per_iteration": 2.578125 }, { "auxiliary_loss_clip": 0.01089384, "auxiliary_loss_mlp": 0.01032199, "balance_loss_clip": 1.03881669, "balance_loss_mlp": 1.01822138, "epoch": 0.5511498572072749, "flos": 18770579896320.0, "grad_norm": 2.156469581369372, "language_loss": 0.76529676, "learning_rate": 1.7662740920522913e-06, "loss": 0.78651255, "num_input_tokens_seen": 197508770, "step": 9167, "time_per_iteration": 2.7045888900756836 }, { "auxiliary_loss_clip": 0.01112145, "auxiliary_loss_mlp": 0.01032029, "balance_loss_clip": 1.04281187, "balance_loss_mlp": 1.01811707, "epoch": 0.5512099804599428, "flos": 19573326996480.0, "grad_norm": 2.0156954118398227, "language_loss": 0.79765004, "learning_rate": 1.7658873033765374e-06, "loss": 0.81909174, "num_input_tokens_seen": 197527340, "step": 9168, "time_per_iteration": 2.669908046722412 }, { "auxiliary_loss_clip": 0.0111534, "auxiliary_loss_mlp": 0.0104239, "balance_loss_clip": 1.04542589, "balance_loss_mlp": 1.02830565, "epoch": 0.5512701037126109, "flos": 26245600744320.0, "grad_norm": 1.6113858397633185, "language_loss": 0.69293267, "learning_rate": 1.7655005235783591e-06, "loss": 0.71450996, "num_input_tokens_seen": 197547280, "step": 9169, "time_per_iteration": 2.70609450340271 }, { "auxiliary_loss_clip": 0.01106964, "auxiliary_loss_mlp": 0.01029287, "balance_loss_clip": 1.04113257, "balance_loss_mlp": 1.01710367, "epoch": 0.5513302269652788, "flos": 21945406279680.0, "grad_norm": 1.9890616519308366, "language_loss": 0.85510826, "learning_rate": 1.7651137526724251e-06, "loss": 0.87647074, "num_input_tokens_seen": 197565045, "step": 9170, "time_per_iteration": 2.670785427093506 }, { "auxiliary_loss_clip": 0.01022762, "auxiliary_loss_mlp": 0.01003909, "balance_loss_clip": 1.02287233, "balance_loss_mlp": 1.00240731, "epoch": 0.5513903502179468, "flos": 68235948616320.0, "grad_norm": 0.7781167580815929, "language_loss": 0.59840322, "learning_rate": 1.7647269906734017e-06, "loss": 0.61866993, "num_input_tokens_seen": 197625005, "step": 9171, "time_per_iteration": 3.2524025440216064 }, { "auxiliary_loss_clip": 0.01085077, "auxiliary_loss_mlp": 0.01041997, "balance_loss_clip": 1.03855562, "balance_loss_mlp": 1.02763844, "epoch": 0.5514504734706147, "flos": 18734238311040.0, "grad_norm": 1.556060427891405, "language_loss": 0.70670319, "learning_rate": 1.7643402375959533e-06, "loss": 0.72797394, "num_input_tokens_seen": 197645050, "step": 9172, "time_per_iteration": 2.708811044692993 }, { "auxiliary_loss_clip": 0.01120195, "auxiliary_loss_mlp": 0.01038202, "balance_loss_clip": 1.04229403, "balance_loss_mlp": 1.02470756, "epoch": 0.5515105967232827, "flos": 22270972176000.0, "grad_norm": 1.7490660409709138, "language_loss": 0.75727642, "learning_rate": 1.7639534934547474e-06, "loss": 0.77886033, "num_input_tokens_seen": 197663910, "step": 9173, "time_per_iteration": 2.6022469997406006 }, { "auxiliary_loss_clip": 0.01083041, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.04071558, "balance_loss_mlp": 1.02043712, "epoch": 0.5515707199759508, "flos": 22557682535040.0, "grad_norm": 1.9060639151270278, "language_loss": 0.75156957, "learning_rate": 1.7635667582644484e-06, "loss": 0.77273941, "num_input_tokens_seen": 197681580, "step": 9174, "time_per_iteration": 2.758668899536133 }, { "auxiliary_loss_clip": 0.01102936, "auxiliary_loss_mlp": 0.01034738, "balance_loss_clip": 1.0414834, "balance_loss_mlp": 1.02056456, "epoch": 0.5516308432286187, "flos": 28291072636800.0, "grad_norm": 2.209520073538634, "language_loss": 0.72830188, "learning_rate": 1.7631800320397217e-06, "loss": 0.74967873, "num_input_tokens_seen": 197702095, "step": 9175, "time_per_iteration": 2.6674885749816895 }, { "auxiliary_loss_clip": 0.01112767, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.04439914, "balance_loss_mlp": 1.02324057, "epoch": 0.5516909664812867, "flos": 18764474584320.0, "grad_norm": 1.7828415192194789, "language_loss": 0.69321132, "learning_rate": 1.7627933147952318e-06, "loss": 0.71470201, "num_input_tokens_seen": 197720720, "step": 9176, "time_per_iteration": 2.721855878829956 }, { "auxiliary_loss_clip": 0.01112205, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.04404604, "balance_loss_mlp": 1.02004051, "epoch": 0.5517510897339546, "flos": 27740346336000.0, "grad_norm": 1.6320384621008008, "language_loss": 0.70890021, "learning_rate": 1.7624066065456435e-06, "loss": 0.73034984, "num_input_tokens_seen": 197741820, "step": 9177, "time_per_iteration": 2.6951122283935547 }, { "auxiliary_loss_clip": 0.01111799, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.0442878, "balance_loss_mlp": 1.01811981, "epoch": 0.5518112129866226, "flos": 18404470523520.0, "grad_norm": 1.5626252071778102, "language_loss": 0.80647016, "learning_rate": 1.7620199073056204e-06, "loss": 0.82790309, "num_input_tokens_seen": 197759160, "step": 9178, "time_per_iteration": 2.6048829555511475 }, { "auxiliary_loss_clip": 0.01063405, "auxiliary_loss_mlp": 0.01046955, "balance_loss_clip": 1.04167509, "balance_loss_mlp": 1.03129053, "epoch": 0.5518713362392905, "flos": 25082670015360.0, "grad_norm": 2.211793529411812, "language_loss": 0.7505163, "learning_rate": 1.761633217089826e-06, "loss": 0.77161986, "num_input_tokens_seen": 197779760, "step": 9179, "time_per_iteration": 2.808234453201294 }, { "auxiliary_loss_clip": 0.01114825, "auxiliary_loss_mlp": 0.0104374, "balance_loss_clip": 1.04556203, "balance_loss_mlp": 1.02984655, "epoch": 0.5519314594919585, "flos": 36538999361280.0, "grad_norm": 1.9934221112233521, "language_loss": 0.7009306, "learning_rate": 1.761246535912924e-06, "loss": 0.7225163, "num_input_tokens_seen": 197801545, "step": 9180, "time_per_iteration": 2.788222551345825 }, { "auxiliary_loss_clip": 0.01106377, "auxiliary_loss_mlp": 0.01041353, "balance_loss_clip": 1.0398531, "balance_loss_mlp": 1.02672613, "epoch": 0.5519915827446265, "flos": 20448613612800.0, "grad_norm": 1.9005454733047327, "language_loss": 0.67093515, "learning_rate": 1.7608598637895776e-06, "loss": 0.69241244, "num_input_tokens_seen": 197820760, "step": 9181, "time_per_iteration": 2.7013533115386963 }, { "auxiliary_loss_clip": 0.01126813, "auxiliary_loss_mlp": 0.0103403, "balance_loss_clip": 1.0449146, "balance_loss_mlp": 1.02041602, "epoch": 0.5520517059972945, "flos": 23768052151680.0, "grad_norm": 2.0355295280850347, "language_loss": 0.79382825, "learning_rate": 1.7604732007344486e-06, "loss": 0.8154366, "num_input_tokens_seen": 197840195, "step": 9182, "time_per_iteration": 2.6580309867858887 }, { "auxiliary_loss_clip": 0.0108505, "auxiliary_loss_mlp": 0.01029722, "balance_loss_clip": 1.0405935, "balance_loss_mlp": 1.01576233, "epoch": 0.5521118292499624, "flos": 22196457411840.0, "grad_norm": 2.3123904881057524, "language_loss": 0.83006704, "learning_rate": 1.7600865467622003e-06, "loss": 0.85121477, "num_input_tokens_seen": 197859475, "step": 9183, "time_per_iteration": 2.744466543197632 }, { "auxiliary_loss_clip": 0.01100335, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.0419153, "balance_loss_mlp": 1.01544046, "epoch": 0.5521719525026304, "flos": 23583291569280.0, "grad_norm": 1.2881660479793424, "language_loss": 0.67605364, "learning_rate": 1.7596999018874936e-06, "loss": 0.6973449, "num_input_tokens_seen": 197879395, "step": 9184, "time_per_iteration": 2.6846580505371094 }, { "auxiliary_loss_clip": 0.01110729, "auxiliary_loss_mlp": 0.01028759, "balance_loss_clip": 1.04261684, "balance_loss_mlp": 1.01442409, "epoch": 0.5522320757552983, "flos": 26137617482880.0, "grad_norm": 1.486667996359971, "language_loss": 0.76359147, "learning_rate": 1.7593132661249917e-06, "loss": 0.78498632, "num_input_tokens_seen": 197900815, "step": 9185, "time_per_iteration": 2.6278598308563232 }, { "auxiliary_loss_clip": 0.01084681, "auxiliary_loss_mlp": 0.01041899, "balance_loss_clip": 1.04073203, "balance_loss_mlp": 1.02742732, "epoch": 0.5522921990079663, "flos": 24676160820480.0, "grad_norm": 1.6270174778631188, "language_loss": 0.74294305, "learning_rate": 1.7589266394893536e-06, "loss": 0.76420891, "num_input_tokens_seen": 197918985, "step": 9186, "time_per_iteration": 2.7178421020507812 }, { "auxiliary_loss_clip": 0.01094897, "auxiliary_loss_mlp": 0.0103984, "balance_loss_clip": 1.04445529, "balance_loss_mlp": 1.02626204, "epoch": 0.5523523222606344, "flos": 22748153379840.0, "grad_norm": 2.1270117067296725, "language_loss": 0.66701925, "learning_rate": 1.7585400219952421e-06, "loss": 0.68836665, "num_input_tokens_seen": 197937725, "step": 9187, "time_per_iteration": 2.7278029918670654 }, { "auxiliary_loss_clip": 0.01101824, "auxiliary_loss_mlp": 0.01034427, "balance_loss_clip": 1.04459238, "balance_loss_mlp": 1.02054477, "epoch": 0.5524124455133023, "flos": 19755825022080.0, "grad_norm": 1.575939713951601, "language_loss": 0.7774123, "learning_rate": 1.758153413657318e-06, "loss": 0.79877484, "num_input_tokens_seen": 197955635, "step": 9188, "time_per_iteration": 2.753506660461426 }, { "auxiliary_loss_clip": 0.01095705, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.04053175, "balance_loss_mlp": 1.01806509, "epoch": 0.5524725687659703, "flos": 23294821443840.0, "grad_norm": 1.82344252580878, "language_loss": 0.81139189, "learning_rate": 1.7577668144902394e-06, "loss": 0.83266759, "num_input_tokens_seen": 197974490, "step": 9189, "time_per_iteration": 2.7089128494262695 }, { "auxiliary_loss_clip": 0.01104025, "auxiliary_loss_mlp": 0.00770543, "balance_loss_clip": 1.04259682, "balance_loss_mlp": 1.00024211, "epoch": 0.5525326920186382, "flos": 24862178378880.0, "grad_norm": 1.4850448399521246, "language_loss": 0.76478475, "learning_rate": 1.7573802245086684e-06, "loss": 0.78353041, "num_input_tokens_seen": 197995735, "step": 9190, "time_per_iteration": 2.611971855163574 }, { "auxiliary_loss_clip": 0.01125599, "auxiliary_loss_mlp": 0.01041501, "balance_loss_clip": 1.04273391, "balance_loss_mlp": 1.02648067, "epoch": 0.5525928152713062, "flos": 13735580906880.0, "grad_norm": 2.4141637541410508, "language_loss": 0.78987861, "learning_rate": 1.7569936437272627e-06, "loss": 0.81154966, "num_input_tokens_seen": 198009685, "step": 9191, "time_per_iteration": 2.545794725418091 }, { "auxiliary_loss_clip": 0.01050104, "auxiliary_loss_mlp": 0.01035439, "balance_loss_clip": 1.03439641, "balance_loss_mlp": 1.02133703, "epoch": 0.5526529385239741, "flos": 13071592045440.0, "grad_norm": 2.484462687188894, "language_loss": 0.68966973, "learning_rate": 1.7566070721606829e-06, "loss": 0.71052521, "num_input_tokens_seen": 198026845, "step": 9192, "time_per_iteration": 6.08718204498291 }, { "auxiliary_loss_clip": 0.01110861, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.04424548, "balance_loss_mlp": 1.02356553, "epoch": 0.5527130617766421, "flos": 23148377694720.0, "grad_norm": 1.4810056841060688, "language_loss": 0.77680272, "learning_rate": 1.756220509823588e-06, "loss": 0.7982707, "num_input_tokens_seen": 198045275, "step": 9193, "time_per_iteration": 4.1960039138793945 }, { "auxiliary_loss_clip": 0.01083568, "auxiliary_loss_mlp": 0.01034731, "balance_loss_clip": 1.03722787, "balance_loss_mlp": 1.02139795, "epoch": 0.55277318502931, "flos": 21285547482240.0, "grad_norm": 1.4323494490195217, "language_loss": 0.78473246, "learning_rate": 1.7558339567306344e-06, "loss": 0.80591547, "num_input_tokens_seen": 198065760, "step": 9194, "time_per_iteration": 2.730219841003418 }, { "auxiliary_loss_clip": 0.01089289, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.04286909, "balance_loss_mlp": 1.02309823, "epoch": 0.5528333082819781, "flos": 38324549462400.0, "grad_norm": 2.5114324389353224, "language_loss": 0.69563878, "learning_rate": 1.7554474128964825e-06, "loss": 0.71689719, "num_input_tokens_seen": 198087595, "step": 9195, "time_per_iteration": 2.898447275161743 }, { "auxiliary_loss_clip": 0.01107137, "auxiliary_loss_mlp": 0.01036404, "balance_loss_clip": 1.04293728, "balance_loss_mlp": 1.02215791, "epoch": 0.552893431534646, "flos": 13553621585280.0, "grad_norm": 1.952206040801574, "language_loss": 0.74276292, "learning_rate": 1.7550608783357887e-06, "loss": 0.76419842, "num_input_tokens_seen": 198104620, "step": 9196, "time_per_iteration": 2.775261878967285 }, { "auxiliary_loss_clip": 0.01105394, "auxiliary_loss_mlp": 0.01038639, "balance_loss_clip": 1.04212689, "balance_loss_mlp": 1.02461457, "epoch": 0.552953554787314, "flos": 21939408708480.0, "grad_norm": 2.1600616911977384, "language_loss": 0.76948142, "learning_rate": 1.7546743530632115e-06, "loss": 0.79092181, "num_input_tokens_seen": 198123565, "step": 9197, "time_per_iteration": 4.16440224647522 }, { "auxiliary_loss_clip": 0.01097995, "auxiliary_loss_mlp": 0.01032629, "balance_loss_clip": 1.03984201, "balance_loss_mlp": 1.01995707, "epoch": 0.5530136780399819, "flos": 43658002558080.0, "grad_norm": 1.6850679441105894, "language_loss": 0.76054031, "learning_rate": 1.754287837093407e-06, "loss": 0.78184652, "num_input_tokens_seen": 198148270, "step": 9198, "time_per_iteration": 2.950439453125 }, { "auxiliary_loss_clip": 0.01119177, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.04138994, "balance_loss_mlp": 1.01700497, "epoch": 0.5530738012926499, "flos": 25045502417280.0, "grad_norm": 1.499755291272354, "language_loss": 0.79495585, "learning_rate": 1.7539013304410327e-06, "loss": 0.81644565, "num_input_tokens_seen": 198168810, "step": 9199, "time_per_iteration": 2.619361162185669 }, { "auxiliary_loss_clip": 0.01078304, "auxiliary_loss_mlp": 0.01039784, "balance_loss_clip": 1.03867352, "balance_loss_mlp": 1.02552032, "epoch": 0.553133924545318, "flos": 16472081623680.0, "grad_norm": 1.9832278976810611, "language_loss": 0.63797927, "learning_rate": 1.7535148331207443e-06, "loss": 0.65916014, "num_input_tokens_seen": 198186200, "step": 9200, "time_per_iteration": 2.6335854530334473 }, { "auxiliary_loss_clip": 0.01102034, "auxiliary_loss_mlp": 0.01033407, "balance_loss_clip": 1.04273176, "balance_loss_mlp": 1.01869619, "epoch": 0.5531940477979859, "flos": 24606207083520.0, "grad_norm": 1.4982382349332672, "language_loss": 0.66065866, "learning_rate": 1.7531283451471978e-06, "loss": 0.68201303, "num_input_tokens_seen": 198207050, "step": 9201, "time_per_iteration": 2.7522671222686768 }, { "auxiliary_loss_clip": 0.01108187, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.04183888, "balance_loss_mlp": 1.02056432, "epoch": 0.5532541710506539, "flos": 22159577122560.0, "grad_norm": 1.9333851468305103, "language_loss": 0.61028016, "learning_rate": 1.7527418665350502e-06, "loss": 0.63171005, "num_input_tokens_seen": 198224565, "step": 9202, "time_per_iteration": 2.6281580924987793 }, { "auxiliary_loss_clip": 0.0110847, "auxiliary_loss_mlp": 0.00770781, "balance_loss_clip": 1.0422498, "balance_loss_mlp": 1.00029778, "epoch": 0.5533142943033218, "flos": 21397265758080.0, "grad_norm": 1.7184873612817428, "language_loss": 0.64222115, "learning_rate": 1.7523553972989548e-06, "loss": 0.66101366, "num_input_tokens_seen": 198244790, "step": 9203, "time_per_iteration": 2.6509506702423096 }, { "auxiliary_loss_clip": 0.01108951, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.04175293, "balance_loss_mlp": 1.02028739, "epoch": 0.5533744175559898, "flos": 23550541344000.0, "grad_norm": 1.4819756399271273, "language_loss": 0.63615203, "learning_rate": 1.7519689374535683e-06, "loss": 0.65757859, "num_input_tokens_seen": 198264375, "step": 9204, "time_per_iteration": 2.7008473873138428 }, { "auxiliary_loss_clip": 0.01106611, "auxiliary_loss_mlp": 0.01030715, "balance_loss_clip": 1.04070532, "balance_loss_mlp": 1.0184958, "epoch": 0.5534345408086577, "flos": 24061514267520.0, "grad_norm": 1.5985992235632864, "language_loss": 0.77158082, "learning_rate": 1.7515824870135445e-06, "loss": 0.79295409, "num_input_tokens_seen": 198283895, "step": 9205, "time_per_iteration": 2.6544225215911865 }, { "auxiliary_loss_clip": 0.01059768, "auxiliary_loss_mlp": 0.01039078, "balance_loss_clip": 1.03511405, "balance_loss_mlp": 1.02576268, "epoch": 0.5534946640613257, "flos": 33771831408000.0, "grad_norm": 1.4391383519913163, "language_loss": 0.72826385, "learning_rate": 1.751196045993537e-06, "loss": 0.74925232, "num_input_tokens_seen": 198310035, "step": 9206, "time_per_iteration": 2.832268476486206 }, { "auxiliary_loss_clip": 0.01073531, "auxiliary_loss_mlp": 0.01034139, "balance_loss_clip": 1.03840923, "balance_loss_mlp": 1.0208354, "epoch": 0.5535547873139937, "flos": 15159223526400.0, "grad_norm": 2.230271879861814, "language_loss": 0.75639313, "learning_rate": 1.7508096144082012e-06, "loss": 0.77746987, "num_input_tokens_seen": 198327810, "step": 9207, "time_per_iteration": 2.7088775634765625 }, { "auxiliary_loss_clip": 0.01088202, "auxiliary_loss_mlp": 0.01033804, "balance_loss_clip": 1.0419991, "balance_loss_mlp": 1.02010703, "epoch": 0.5536149105666617, "flos": 16980863817600.0, "grad_norm": 71.24671792095333, "language_loss": 0.61898887, "learning_rate": 1.750423192272189e-06, "loss": 0.6402089, "num_input_tokens_seen": 198343150, "step": 9208, "time_per_iteration": 2.749739646911621 }, { "auxiliary_loss_clip": 0.01123136, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.04367232, "balance_loss_mlp": 1.02285004, "epoch": 0.5536750338193296, "flos": 18149935772160.0, "grad_norm": 2.006267106077657, "language_loss": 0.64258868, "learning_rate": 1.7500367796001547e-06, "loss": 0.66417855, "num_input_tokens_seen": 198360925, "step": 9209, "time_per_iteration": 2.6854724884033203 }, { "auxiliary_loss_clip": 0.01084442, "auxiliary_loss_mlp": 0.0104196, "balance_loss_clip": 1.03969955, "balance_loss_mlp": 1.02729774, "epoch": 0.5537351570719976, "flos": 22747794243840.0, "grad_norm": 1.8841222831412607, "language_loss": 0.82470959, "learning_rate": 1.7496503764067513e-06, "loss": 0.84597361, "num_input_tokens_seen": 198379265, "step": 9210, "time_per_iteration": 2.746532917022705 }, { "auxiliary_loss_clip": 0.01098481, "auxiliary_loss_mlp": 0.01029278, "balance_loss_clip": 1.04068804, "balance_loss_mlp": 1.016523, "epoch": 0.5537952803246655, "flos": 26356026130560.0, "grad_norm": 1.6369703268884894, "language_loss": 0.72731483, "learning_rate": 1.74926398270663e-06, "loss": 0.74859238, "num_input_tokens_seen": 198399490, "step": 9211, "time_per_iteration": 2.767152786254883 }, { "auxiliary_loss_clip": 0.01089972, "auxiliary_loss_mlp": 0.01037477, "balance_loss_clip": 1.03941226, "balance_loss_mlp": 1.02259946, "epoch": 0.5538554035773335, "flos": 18037427397120.0, "grad_norm": 1.965979716525238, "language_loss": 0.6684767, "learning_rate": 1.7488775985144437e-06, "loss": 0.68975115, "num_input_tokens_seen": 198419110, "step": 9212, "time_per_iteration": 2.6946139335632324 }, { "auxiliary_loss_clip": 0.01092654, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.04305434, "balance_loss_mlp": 1.01557696, "epoch": 0.5539155268300014, "flos": 31686247002240.0, "grad_norm": 1.403594998374367, "language_loss": 0.51636183, "learning_rate": 1.7484912238448443e-06, "loss": 0.53758979, "num_input_tokens_seen": 198441360, "step": 9213, "time_per_iteration": 2.7821476459503174 }, { "auxiliary_loss_clip": 0.01092111, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.04350758, "balance_loss_mlp": 1.01752245, "epoch": 0.5539756500826695, "flos": 15193769431680.0, "grad_norm": 3.6308307245288214, "language_loss": 0.86044586, "learning_rate": 1.7481048587124827e-06, "loss": 0.88167566, "num_input_tokens_seen": 198459835, "step": 9214, "time_per_iteration": 2.7264554500579834 }, { "auxiliary_loss_clip": 0.01110148, "auxiliary_loss_mlp": 0.01032811, "balance_loss_clip": 1.04324055, "balance_loss_mlp": 1.02003813, "epoch": 0.5540357733353375, "flos": 26353117128960.0, "grad_norm": 2.235553679927881, "language_loss": 0.70002753, "learning_rate": 1.7477185031320108e-06, "loss": 0.72145712, "num_input_tokens_seen": 198478955, "step": 9215, "time_per_iteration": 2.684901714324951 }, { "auxiliary_loss_clip": 0.01093255, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.03972387, "balance_loss_mlp": 1.01641822, "epoch": 0.5540958965880054, "flos": 21323684747520.0, "grad_norm": 1.5213166138329088, "language_loss": 0.73443544, "learning_rate": 1.7473321571180773e-06, "loss": 0.75567436, "num_input_tokens_seen": 198499030, "step": 9216, "time_per_iteration": 2.6930174827575684 }, { "auxiliary_loss_clip": 0.01095704, "auxiliary_loss_mlp": 0.0103884, "balance_loss_clip": 1.04206526, "balance_loss_mlp": 1.02541757, "epoch": 0.5541560198406734, "flos": 25666828899840.0, "grad_norm": 1.8909182551573178, "language_loss": 0.71728694, "learning_rate": 1.7469458206853345e-06, "loss": 0.73863238, "num_input_tokens_seen": 198520265, "step": 9217, "time_per_iteration": 2.705566644668579 }, { "auxiliary_loss_clip": 0.01102416, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.04219627, "balance_loss_mlp": 1.01496446, "epoch": 0.5542161430933413, "flos": 21939624190080.0, "grad_norm": 1.8150794810366015, "language_loss": 0.78261054, "learning_rate": 1.7465594938484315e-06, "loss": 0.80391365, "num_input_tokens_seen": 198539645, "step": 9218, "time_per_iteration": 2.6569690704345703 }, { "auxiliary_loss_clip": 0.01077956, "auxiliary_loss_mlp": 0.01037085, "balance_loss_clip": 1.03790164, "balance_loss_mlp": 1.02161169, "epoch": 0.5542762663460093, "flos": 19571459489280.0, "grad_norm": 1.6224660724744044, "language_loss": 0.72173905, "learning_rate": 1.7461731766220176e-06, "loss": 0.74288952, "num_input_tokens_seen": 198558710, "step": 9219, "time_per_iteration": 2.685511350631714 }, { "auxiliary_loss_clip": 0.01108862, "auxiliary_loss_mlp": 0.01039965, "balance_loss_clip": 1.04482341, "balance_loss_mlp": 1.0262028, "epoch": 0.5543363895986773, "flos": 19499063627520.0, "grad_norm": 1.5105706382424104, "language_loss": 0.71297967, "learning_rate": 1.7457868690207426e-06, "loss": 0.73446798, "num_input_tokens_seen": 198577050, "step": 9220, "time_per_iteration": 2.6306073665618896 }, { "auxiliary_loss_clip": 0.01120811, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 1.04381871, "balance_loss_mlp": 1.01429546, "epoch": 0.5543965128513453, "flos": 22635609091200.0, "grad_norm": 1.6307293256223026, "language_loss": 0.79449409, "learning_rate": 1.7454005710592547e-06, "loss": 0.81597066, "num_input_tokens_seen": 198595290, "step": 9221, "time_per_iteration": 2.664358139038086 }, { "auxiliary_loss_clip": 0.01090389, "auxiliary_loss_mlp": 0.01034475, "balance_loss_clip": 1.04653525, "balance_loss_mlp": 1.02108812, "epoch": 0.5544566361040132, "flos": 25989952671360.0, "grad_norm": 1.9685503329730023, "language_loss": 0.83722961, "learning_rate": 1.7450142827522027e-06, "loss": 0.85847831, "num_input_tokens_seen": 198614110, "step": 9222, "time_per_iteration": 2.770050048828125 }, { "auxiliary_loss_clip": 0.01100221, "auxiliary_loss_mlp": 0.00771629, "balance_loss_clip": 1.04789209, "balance_loss_mlp": 1.00036037, "epoch": 0.5545167593566812, "flos": 28257568225920.0, "grad_norm": 1.9185335813275248, "language_loss": 0.75431746, "learning_rate": 1.7446280041142344e-06, "loss": 0.773036, "num_input_tokens_seen": 198633880, "step": 9223, "time_per_iteration": 2.794182062149048 }, { "auxiliary_loss_clip": 0.01091289, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.04017019, "balance_loss_mlp": 1.0201149, "epoch": 0.5545768826093491, "flos": 28476551491200.0, "grad_norm": 1.614917501509061, "language_loss": 0.82090491, "learning_rate": 1.7442417351599986e-06, "loss": 0.84215945, "num_input_tokens_seen": 198653505, "step": 9224, "time_per_iteration": 2.7137935161590576 }, { "auxiliary_loss_clip": 0.01108448, "auxiliary_loss_mlp": 0.01043106, "balance_loss_clip": 1.04417324, "balance_loss_mlp": 1.02924204, "epoch": 0.5546370058620171, "flos": 18478051534080.0, "grad_norm": 1.7607532408743478, "language_loss": 0.57043874, "learning_rate": 1.743855475904141e-06, "loss": 0.59195429, "num_input_tokens_seen": 198671890, "step": 9225, "time_per_iteration": 2.616447687149048 }, { "auxiliary_loss_clip": 0.01112997, "auxiliary_loss_mlp": 0.01038411, "balance_loss_clip": 1.04317498, "balance_loss_mlp": 1.02444005, "epoch": 0.554697129114685, "flos": 22930507751040.0, "grad_norm": 1.6222452828903178, "language_loss": 0.67458808, "learning_rate": 1.7434692263613098e-06, "loss": 0.69610214, "num_input_tokens_seen": 198691995, "step": 9226, "time_per_iteration": 2.663339138031006 }, { "auxiliary_loss_clip": 0.0108551, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 1.03901601, "balance_loss_mlp": 1.02121162, "epoch": 0.5547572523673531, "flos": 21797166850560.0, "grad_norm": 1.6061917148762987, "language_loss": 0.74387592, "learning_rate": 1.7430829865461518e-06, "loss": 0.76507771, "num_input_tokens_seen": 198712440, "step": 9227, "time_per_iteration": 2.762258529663086 }, { "auxiliary_loss_clip": 0.01087938, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.04223549, "balance_loss_mlp": 1.02071249, "epoch": 0.5548173756200211, "flos": 22342829333760.0, "grad_norm": 1.8589261758591291, "language_loss": 0.73263627, "learning_rate": 1.7426967564733118e-06, "loss": 0.7538631, "num_input_tokens_seen": 198731515, "step": 9228, "time_per_iteration": 2.762092113494873 }, { "auxiliary_loss_clip": 0.01122414, "auxiliary_loss_mlp": 0.01031991, "balance_loss_clip": 1.04351175, "balance_loss_mlp": 1.01886559, "epoch": 0.554877498872689, "flos": 17858736213120.0, "grad_norm": 1.672332446894358, "language_loss": 0.75519872, "learning_rate": 1.7423105361574373e-06, "loss": 0.77674282, "num_input_tokens_seen": 198749750, "step": 9229, "time_per_iteration": 2.6003267765045166 }, { "auxiliary_loss_clip": 0.01110807, "auxiliary_loss_mlp": 0.00772253, "balance_loss_clip": 1.0439682, "balance_loss_mlp": 1.00026536, "epoch": 0.554937622125357, "flos": 17238343484160.0, "grad_norm": 1.7587828151966396, "language_loss": 0.68663722, "learning_rate": 1.741924325613172e-06, "loss": 0.70546782, "num_input_tokens_seen": 198768320, "step": 9230, "time_per_iteration": 2.6502435207366943 }, { "auxiliary_loss_clip": 0.01078746, "auxiliary_loss_mlp": 0.01039366, "balance_loss_clip": 1.04407859, "balance_loss_mlp": 1.02506709, "epoch": 0.5549977453780249, "flos": 25368087484800.0, "grad_norm": 2.162588573655947, "language_loss": 0.6800701, "learning_rate": 1.741538124855163e-06, "loss": 0.70125121, "num_input_tokens_seen": 198787230, "step": 9231, "time_per_iteration": 4.46450400352478 }, { "auxiliary_loss_clip": 0.01125233, "auxiliary_loss_mlp": 0.01040313, "balance_loss_clip": 1.04339528, "balance_loss_mlp": 1.02537608, "epoch": 0.555057868630693, "flos": 25079114568960.0, "grad_norm": 1.7058695185820383, "language_loss": 0.78623915, "learning_rate": 1.7411519338980548e-06, "loss": 0.80789459, "num_input_tokens_seen": 198806720, "step": 9232, "time_per_iteration": 4.17819356918335 }, { "auxiliary_loss_clip": 0.01077674, "auxiliary_loss_mlp": 0.01038155, "balance_loss_clip": 1.03794336, "balance_loss_mlp": 1.02523899, "epoch": 0.5551179918833609, "flos": 26104220812800.0, "grad_norm": 1.530027860156435, "language_loss": 0.82512534, "learning_rate": 1.7407657527564898e-06, "loss": 0.84628367, "num_input_tokens_seen": 198826235, "step": 9233, "time_per_iteration": 2.7746078968048096 }, { "auxiliary_loss_clip": 0.01108881, "auxiliary_loss_mlp": 0.01040385, "balance_loss_clip": 1.04062366, "balance_loss_mlp": 1.02632475, "epoch": 0.5551781151360289, "flos": 19384759572480.0, "grad_norm": 8.113354085779601, "language_loss": 0.74638891, "learning_rate": 1.7403795814451142e-06, "loss": 0.76788163, "num_input_tokens_seen": 198842655, "step": 9234, "time_per_iteration": 2.6174590587615967 }, { "auxiliary_loss_clip": 0.01094953, "auxiliary_loss_mlp": 0.01029345, "balance_loss_clip": 1.03896558, "balance_loss_mlp": 1.01647031, "epoch": 0.5552382383886968, "flos": 21725956137600.0, "grad_norm": 4.639125305136136, "language_loss": 0.64988184, "learning_rate": 1.7399934199785706e-06, "loss": 0.67112482, "num_input_tokens_seen": 198861210, "step": 9235, "time_per_iteration": 2.6820857524871826 }, { "auxiliary_loss_clip": 0.0106692, "auxiliary_loss_mlp": 0.01042767, "balance_loss_clip": 1.03562975, "balance_loss_mlp": 1.02793705, "epoch": 0.5552983616413648, "flos": 14356189117440.0, "grad_norm": 1.66240052317675, "language_loss": 0.67842531, "learning_rate": 1.7396072683715029e-06, "loss": 0.69952214, "num_input_tokens_seen": 198880045, "step": 9236, "time_per_iteration": 4.265462160110474 }, { "auxiliary_loss_clip": 0.01116825, "auxiliary_loss_mlp": 0.01028362, "balance_loss_clip": 1.04261172, "balance_loss_mlp": 1.01549888, "epoch": 0.5553584848940327, "flos": 25478548784640.0, "grad_norm": 1.8489707966449562, "language_loss": 0.86189765, "learning_rate": 1.7392211266385536e-06, "loss": 0.88334954, "num_input_tokens_seen": 198900210, "step": 9237, "time_per_iteration": 2.662736654281616 }, { "auxiliary_loss_clip": 0.01108193, "auxiliary_loss_mlp": 0.01037757, "balance_loss_clip": 1.04178131, "balance_loss_mlp": 1.02388716, "epoch": 0.5554186081467007, "flos": 22163850840960.0, "grad_norm": 2.008755703666539, "language_loss": 0.73663169, "learning_rate": 1.7388349947943652e-06, "loss": 0.75809121, "num_input_tokens_seen": 198919055, "step": 9238, "time_per_iteration": 2.6842122077941895 }, { "auxiliary_loss_clip": 0.01105716, "auxiliary_loss_mlp": 0.01031608, "balance_loss_clip": 1.0387727, "balance_loss_mlp": 1.01777411, "epoch": 0.5554787313993687, "flos": 49746656125440.0, "grad_norm": 1.8187915692087442, "language_loss": 0.78551757, "learning_rate": 1.73844887285358e-06, "loss": 0.80689085, "num_input_tokens_seen": 198943505, "step": 9239, "time_per_iteration": 2.887911558151245 }, { "auxiliary_loss_clip": 0.01106485, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.04819751, "balance_loss_mlp": 1.01699483, "epoch": 0.5555388546520367, "flos": 22127365601280.0, "grad_norm": 1.7617963791060023, "language_loss": 0.8016845, "learning_rate": 1.7380627608308393e-06, "loss": 0.82305664, "num_input_tokens_seen": 198963590, "step": 9240, "time_per_iteration": 2.759277582168579 }, { "auxiliary_loss_clip": 0.0109666, "auxiliary_loss_mlp": 0.01034491, "balance_loss_clip": 1.04089236, "balance_loss_mlp": 1.02099013, "epoch": 0.5555989779047047, "flos": 24682122478080.0, "grad_norm": 2.168471057936508, "language_loss": 0.65255535, "learning_rate": 1.737676658740786e-06, "loss": 0.67386687, "num_input_tokens_seen": 198982680, "step": 9241, "time_per_iteration": 2.7321317195892334 }, { "auxiliary_loss_clip": 0.01110689, "auxiliary_loss_mlp": 0.0077113, "balance_loss_clip": 1.04320502, "balance_loss_mlp": 1.00029731, "epoch": 0.5556591011573726, "flos": 16106510954880.0, "grad_norm": 1.885035131778914, "language_loss": 0.72406638, "learning_rate": 1.7372905665980594e-06, "loss": 0.74288458, "num_input_tokens_seen": 199000185, "step": 9242, "time_per_iteration": 2.6891591548919678 }, { "auxiliary_loss_clip": 0.01106836, "auxiliary_loss_mlp": 0.01034566, "balance_loss_clip": 1.04584861, "balance_loss_mlp": 1.02024293, "epoch": 0.5557192244100406, "flos": 12933695733120.0, "grad_norm": 1.6675932055368092, "language_loss": 0.64065903, "learning_rate": 1.7369044844173012e-06, "loss": 0.66207308, "num_input_tokens_seen": 199018380, "step": 9243, "time_per_iteration": 3.1710290908813477 }, { "auxiliary_loss_clip": 0.01094198, "auxiliary_loss_mlp": 0.00771105, "balance_loss_clip": 1.04436445, "balance_loss_mlp": 1.00027966, "epoch": 0.5557793476627085, "flos": 23111712887040.0, "grad_norm": 2.6865994829235333, "language_loss": 0.75548631, "learning_rate": 1.7365184122131509e-06, "loss": 0.77413929, "num_input_tokens_seen": 199037115, "step": 9244, "time_per_iteration": 2.686121940612793 }, { "auxiliary_loss_clip": 0.01091692, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.03900838, "balance_loss_mlp": 1.02352512, "epoch": 0.5558394709153766, "flos": 21428040735360.0, "grad_norm": 2.0505810415857506, "language_loss": 0.75051856, "learning_rate": 1.7361323500002486e-06, "loss": 0.77179724, "num_input_tokens_seen": 199053375, "step": 9245, "time_per_iteration": 2.6561057567596436 }, { "auxiliary_loss_clip": 0.01099057, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.04262114, "balance_loss_mlp": 1.02087283, "epoch": 0.5558995941680445, "flos": 25078324469760.0, "grad_norm": 2.0581034442408055, "language_loss": 0.79967058, "learning_rate": 1.7357462977932348e-06, "loss": 0.82100856, "num_input_tokens_seen": 199070930, "step": 9246, "time_per_iteration": 2.6968653202056885 }, { "auxiliary_loss_clip": 0.01120892, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.0435034, "balance_loss_mlp": 1.0241977, "epoch": 0.5559597174207125, "flos": 20011149872640.0, "grad_norm": 1.8340386723611697, "language_loss": 0.73825908, "learning_rate": 1.7353602556067471e-06, "loss": 0.75984728, "num_input_tokens_seen": 199088675, "step": 9247, "time_per_iteration": 2.5861082077026367 }, { "auxiliary_loss_clip": 0.01091731, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.04089963, "balance_loss_mlp": 1.0214448, "epoch": 0.5560198406733804, "flos": 16835677044480.0, "grad_norm": 2.6765383510534324, "language_loss": 0.74975288, "learning_rate": 1.7349742234554254e-06, "loss": 0.77102304, "num_input_tokens_seen": 199103075, "step": 9248, "time_per_iteration": 2.634092092514038 }, { "auxiliary_loss_clip": 0.00999886, "auxiliary_loss_mlp": 0.01011469, "balance_loss_clip": 1.01177704, "balance_loss_mlp": 1.00989556, "epoch": 0.5560799639260484, "flos": 70697051758080.0, "grad_norm": 0.8462101410465201, "language_loss": 0.59490269, "learning_rate": 1.7345882013539081e-06, "loss": 0.61501622, "num_input_tokens_seen": 199160325, "step": 9249, "time_per_iteration": 3.389267683029175 }, { "auxiliary_loss_clip": 0.01118078, "auxiliary_loss_mlp": 0.01029785, "balance_loss_clip": 1.04007614, "balance_loss_mlp": 1.01592088, "epoch": 0.5561400871787163, "flos": 23148593176320.0, "grad_norm": 2.8767161081984427, "language_loss": 0.79950154, "learning_rate": 1.734202189316832e-06, "loss": 0.82098025, "num_input_tokens_seen": 199179760, "step": 9250, "time_per_iteration": 2.578690528869629 }, { "auxiliary_loss_clip": 0.01098469, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.04169929, "balance_loss_mlp": 1.02075529, "epoch": 0.5562002104313843, "flos": 17566423332480.0, "grad_norm": 3.104352444179477, "language_loss": 0.68685251, "learning_rate": 1.733816187358836e-06, "loss": 0.7081852, "num_input_tokens_seen": 199196695, "step": 9251, "time_per_iteration": 2.7810349464416504 }, { "auxiliary_loss_clip": 0.01109089, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.04200792, "balance_loss_mlp": 1.02018476, "epoch": 0.5562603336840523, "flos": 25045430590080.0, "grad_norm": 1.5038625186154766, "language_loss": 0.75750792, "learning_rate": 1.7334301954945569e-06, "loss": 0.77893281, "num_input_tokens_seen": 199217845, "step": 9252, "time_per_iteration": 2.663238286972046 }, { "auxiliary_loss_clip": 0.01107916, "auxiliary_loss_mlp": 0.01039535, "balance_loss_clip": 1.04108679, "balance_loss_mlp": 1.02441943, "epoch": 0.5563204569367203, "flos": 29059022436480.0, "grad_norm": 1.5228616100256118, "language_loss": 0.72854966, "learning_rate": 1.7330442137386313e-06, "loss": 0.7500242, "num_input_tokens_seen": 199239250, "step": 9253, "time_per_iteration": 2.6020450592041016 }, { "auxiliary_loss_clip": 0.01093689, "auxiliary_loss_mlp": 0.01032948, "balance_loss_clip": 1.04451489, "balance_loss_mlp": 1.02043748, "epoch": 0.5563805801893883, "flos": 22090449398400.0, "grad_norm": 1.6703038143704756, "language_loss": 0.83143723, "learning_rate": 1.7326582421056965e-06, "loss": 0.85270357, "num_input_tokens_seen": 199258320, "step": 9254, "time_per_iteration": 2.701199531555176 }, { "auxiliary_loss_clip": 0.01012318, "auxiliary_loss_mlp": 0.01004464, "balance_loss_clip": 1.01460981, "balance_loss_mlp": 1.0030154, "epoch": 0.5564407034420562, "flos": 58636128689280.0, "grad_norm": 0.8693463823650434, "language_loss": 0.64875168, "learning_rate": 1.732272280610387e-06, "loss": 0.6689195, "num_input_tokens_seen": 199314840, "step": 9255, "time_per_iteration": 3.1222445964813232 }, { "auxiliary_loss_clip": 0.01111592, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.04527521, "balance_loss_mlp": 1.02035666, "epoch": 0.5565008266947242, "flos": 23112323418240.0, "grad_norm": 2.147539486852423, "language_loss": 0.69487607, "learning_rate": 1.7318863292673399e-06, "loss": 0.7163254, "num_input_tokens_seen": 199335405, "step": 9256, "time_per_iteration": 2.642542600631714 }, { "auxiliary_loss_clip": 0.01085774, "auxiliary_loss_mlp": 0.01031728, "balance_loss_clip": 1.04269767, "balance_loss_mlp": 1.01939559, "epoch": 0.5565609499473921, "flos": 21578399066880.0, "grad_norm": 1.6171582584602333, "language_loss": 0.75981283, "learning_rate": 1.73150038809119e-06, "loss": 0.78098786, "num_input_tokens_seen": 199354345, "step": 9257, "time_per_iteration": 2.712520122528076 }, { "auxiliary_loss_clip": 0.01074562, "auxiliary_loss_mlp": 0.01036038, "balance_loss_clip": 1.04019046, "balance_loss_mlp": 1.0233897, "epoch": 0.5566210732000602, "flos": 18369637309440.0, "grad_norm": 3.6499733263034746, "language_loss": 0.60697454, "learning_rate": 1.7311144570965724e-06, "loss": 0.62808049, "num_input_tokens_seen": 199372250, "step": 9258, "time_per_iteration": 2.751559257507324 }, { "auxiliary_loss_clip": 0.01084702, "auxiliary_loss_mlp": 0.01035032, "balance_loss_clip": 1.03922486, "balance_loss_mlp": 1.02042937, "epoch": 0.5566811964527281, "flos": 25703350053120.0, "grad_norm": 1.5966024354647115, "language_loss": 0.79111505, "learning_rate": 1.7307285362981215e-06, "loss": 0.81231236, "num_input_tokens_seen": 199392815, "step": 9259, "time_per_iteration": 2.7664895057678223 }, { "auxiliary_loss_clip": 0.01088989, "auxiliary_loss_mlp": 0.0103733, "balance_loss_clip": 1.04242945, "balance_loss_mlp": 1.02328086, "epoch": 0.5567413197053961, "flos": 26943991856640.0, "grad_norm": 1.7833081696281723, "language_loss": 0.81253225, "learning_rate": 1.7303426257104712e-06, "loss": 0.83379543, "num_input_tokens_seen": 199412375, "step": 9260, "time_per_iteration": 2.79059100151062 }, { "auxiliary_loss_clip": 0.01120889, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.04265976, "balance_loss_mlp": 1.02585721, "epoch": 0.556801442958064, "flos": 20850597694080.0, "grad_norm": 1.513133023380305, "language_loss": 0.69277883, "learning_rate": 1.729956725348256e-06, "loss": 0.71438575, "num_input_tokens_seen": 199431490, "step": 9261, "time_per_iteration": 2.5942957401275635 }, { "auxiliary_loss_clip": 0.01009344, "auxiliary_loss_mlp": 0.01005985, "balance_loss_clip": 1.01376081, "balance_loss_mlp": 1.00455499, "epoch": 0.556861566210732, "flos": 70498213044480.0, "grad_norm": 0.7654306967564637, "language_loss": 0.61116695, "learning_rate": 1.729570835226108e-06, "loss": 0.63132024, "num_input_tokens_seen": 199495855, "step": 9262, "time_per_iteration": 3.2477405071258545 }, { "auxiliary_loss_clip": 0.01109024, "auxiliary_loss_mlp": 0.0103923, "balance_loss_clip": 1.03991163, "balance_loss_mlp": 1.02594411, "epoch": 0.5569216894633999, "flos": 25337276593920.0, "grad_norm": 1.6344264149627976, "language_loss": 0.64423072, "learning_rate": 1.7291849553586622e-06, "loss": 0.66571325, "num_input_tokens_seen": 199515870, "step": 9263, "time_per_iteration": 2.658576488494873 }, { "auxiliary_loss_clip": 0.01095378, "auxiliary_loss_mlp": 0.010346, "balance_loss_clip": 1.03873014, "balance_loss_mlp": 1.02134418, "epoch": 0.556981812716068, "flos": 22638733574400.0, "grad_norm": 1.867976542015905, "language_loss": 0.73368537, "learning_rate": 1.7287990857605497e-06, "loss": 0.75498509, "num_input_tokens_seen": 199535745, "step": 9264, "time_per_iteration": 2.7003254890441895 }, { "auxiliary_loss_clip": 0.01095238, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.04636014, "balance_loss_mlp": 1.01672268, "epoch": 0.5570419359687359, "flos": 11035852738560.0, "grad_norm": 2.2771016341265526, "language_loss": 0.76178783, "learning_rate": 1.7284132264464022e-06, "loss": 0.78303552, "num_input_tokens_seen": 199554035, "step": 9265, "time_per_iteration": 2.7386014461517334 }, { "auxiliary_loss_clip": 0.01090389, "auxiliary_loss_mlp": 0.01034586, "balance_loss_clip": 1.04179025, "balance_loss_mlp": 1.02249825, "epoch": 0.5571020592214039, "flos": 22823135020800.0, "grad_norm": 1.339030652191656, "language_loss": 0.70789158, "learning_rate": 1.7280273774308536e-06, "loss": 0.72914135, "num_input_tokens_seen": 199576120, "step": 9266, "time_per_iteration": 2.741800546646118 }, { "auxiliary_loss_clip": 0.01094155, "auxiliary_loss_mlp": 0.0103873, "balance_loss_clip": 1.03911209, "balance_loss_mlp": 1.0255034, "epoch": 0.5571621824740719, "flos": 22927778317440.0, "grad_norm": 2.0031056980063506, "language_loss": 0.68157613, "learning_rate": 1.727641538728533e-06, "loss": 0.70290494, "num_input_tokens_seen": 199593780, "step": 9267, "time_per_iteration": 2.7874062061309814 }, { "auxiliary_loss_clip": 0.01104037, "auxiliary_loss_mlp": 0.01038856, "balance_loss_clip": 1.03991306, "balance_loss_mlp": 1.02653575, "epoch": 0.5572223057267398, "flos": 22966705681920.0, "grad_norm": 1.918660534651482, "language_loss": 0.74570519, "learning_rate": 1.7272557103540736e-06, "loss": 0.76713407, "num_input_tokens_seen": 199613220, "step": 9268, "time_per_iteration": 2.7008538246154785 }, { "auxiliary_loss_clip": 0.01103292, "auxiliary_loss_mlp": 0.00770403, "balance_loss_clip": 1.04299617, "balance_loss_mlp": 1.00017905, "epoch": 0.5572824289794078, "flos": 20960053413120.0, "grad_norm": 1.8745085493520866, "language_loss": 0.75087655, "learning_rate": 1.726869892322104e-06, "loss": 0.76961344, "num_input_tokens_seen": 199632085, "step": 9269, "time_per_iteration": 2.653756856918335 }, { "auxiliary_loss_clip": 0.01081519, "auxiliary_loss_mlp": 0.01046232, "balance_loss_clip": 1.03722787, "balance_loss_mlp": 1.03201032, "epoch": 0.5573425522320757, "flos": 25042413847680.0, "grad_norm": 1.688879717720704, "language_loss": 0.82588089, "learning_rate": 1.726484084647256e-06, "loss": 0.84715831, "num_input_tokens_seen": 199649295, "step": 9270, "time_per_iteration": 4.278396844863892 }, { "auxiliary_loss_clip": 0.01079257, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.04120445, "balance_loss_mlp": 1.02594197, "epoch": 0.5574026754847438, "flos": 23659637927040.0, "grad_norm": 2.0078243728297167, "language_loss": 0.79825968, "learning_rate": 1.7260982873441591e-06, "loss": 0.81944454, "num_input_tokens_seen": 199668870, "step": 9271, "time_per_iteration": 6.1330788135528564 }, { "auxiliary_loss_clip": 0.01099668, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.04303491, "balance_loss_mlp": 1.01848447, "epoch": 0.5574627987374117, "flos": 24782240661120.0, "grad_norm": 2.2903855544483394, "language_loss": 0.90515852, "learning_rate": 1.725712500427442e-06, "loss": 0.92647034, "num_input_tokens_seen": 199684870, "step": 9272, "time_per_iteration": 2.6802456378936768 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01032973, "balance_loss_clip": 1.04199028, "balance_loss_mlp": 1.02049148, "epoch": 0.5575229219900797, "flos": 21834944979840.0, "grad_norm": 2.009692341926254, "language_loss": 0.83817393, "learning_rate": 1.7253267239117347e-06, "loss": 0.85936373, "num_input_tokens_seen": 199701975, "step": 9273, "time_per_iteration": 2.714702606201172 }, { "auxiliary_loss_clip": 0.01111871, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.0435437, "balance_loss_mlp": 1.0286727, "epoch": 0.5575830452427476, "flos": 27815148408960.0, "grad_norm": 2.029727061879287, "language_loss": 0.74000418, "learning_rate": 1.7249409578116655e-06, "loss": 0.76155877, "num_input_tokens_seen": 199721865, "step": 9274, "time_per_iteration": 2.6897573471069336 }, { "auxiliary_loss_clip": 0.01102598, "auxiliary_loss_mlp": 0.01036471, "balance_loss_clip": 1.04597545, "balance_loss_mlp": 1.02202296, "epoch": 0.5576431684954156, "flos": 17812805696640.0, "grad_norm": 2.7929550344218885, "language_loss": 0.7749905, "learning_rate": 1.7245552021418629e-06, "loss": 0.79638124, "num_input_tokens_seen": 199736455, "step": 9275, "time_per_iteration": 2.6423583030700684 }, { "auxiliary_loss_clip": 0.01093646, "auxiliary_loss_mlp": 0.01035097, "balance_loss_clip": 1.04310751, "balance_loss_mlp": 1.02178109, "epoch": 0.5577032917480835, "flos": 15486872411520.0, "grad_norm": 1.5365384810156146, "language_loss": 0.75059974, "learning_rate": 1.7241694569169546e-06, "loss": 0.77188718, "num_input_tokens_seen": 199753125, "step": 9276, "time_per_iteration": 4.227986812591553 }, { "auxiliary_loss_clip": 0.01098066, "auxiliary_loss_mlp": 0.01035646, "balance_loss_clip": 1.04026711, "balance_loss_mlp": 1.02219296, "epoch": 0.5577634150007516, "flos": 21579763783680.0, "grad_norm": 1.8156811956405543, "language_loss": 0.75730252, "learning_rate": 1.7237837221515678e-06, "loss": 0.77863955, "num_input_tokens_seen": 199771365, "step": 9277, "time_per_iteration": 2.651348114013672 }, { "auxiliary_loss_clip": 0.01117192, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.04269838, "balance_loss_mlp": 1.02087963, "epoch": 0.5578235382534195, "flos": 21139750177920.0, "grad_norm": 1.871466977383403, "language_loss": 0.71828836, "learning_rate": 1.7233979978603304e-06, "loss": 0.73979771, "num_input_tokens_seen": 199790035, "step": 9278, "time_per_iteration": 2.657386302947998 }, { "auxiliary_loss_clip": 0.0108587, "auxiliary_loss_mlp": 0.01037056, "balance_loss_clip": 1.04430723, "balance_loss_mlp": 1.02232218, "epoch": 0.5578836615060875, "flos": 26505199313280.0, "grad_norm": 1.586228481919935, "language_loss": 0.75729156, "learning_rate": 1.723012284057868e-06, "loss": 0.77852082, "num_input_tokens_seen": 199811125, "step": 9279, "time_per_iteration": 2.751840353012085 }, { "auxiliary_loss_clip": 0.01093934, "auxiliary_loss_mlp": 0.01037128, "balance_loss_clip": 1.03794658, "balance_loss_mlp": 1.02376509, "epoch": 0.5579437847587555, "flos": 20153786780160.0, "grad_norm": 1.6097529730476008, "language_loss": 0.67559254, "learning_rate": 1.7226265807588082e-06, "loss": 0.69690311, "num_input_tokens_seen": 199829915, "step": 9280, "time_per_iteration": 2.6563684940338135 }, { "auxiliary_loss_clip": 0.01106752, "auxiliary_loss_mlp": 0.01041709, "balance_loss_clip": 1.0392946, "balance_loss_mlp": 1.02810693, "epoch": 0.5580039080114234, "flos": 26102281478400.0, "grad_norm": 1.6056594505621422, "language_loss": 0.73215401, "learning_rate": 1.7222408879777763e-06, "loss": 0.75363857, "num_input_tokens_seen": 199850670, "step": 9281, "time_per_iteration": 2.6871986389160156 }, { "auxiliary_loss_clip": 0.01086628, "auxiliary_loss_mlp": 0.00770991, "balance_loss_clip": 1.04039741, "balance_loss_mlp": 1.0002861, "epoch": 0.5580640312640914, "flos": 13771671096960.0, "grad_norm": 3.0582981113882317, "language_loss": 0.75378543, "learning_rate": 1.7218552057293974e-06, "loss": 0.77236158, "num_input_tokens_seen": 199867645, "step": 9282, "time_per_iteration": 2.680744171142578 }, { "auxiliary_loss_clip": 0.01055422, "auxiliary_loss_mlp": 0.01036854, "balance_loss_clip": 1.03532624, "balance_loss_mlp": 1.02328229, "epoch": 0.5581241545167593, "flos": 17675986792320.0, "grad_norm": 2.212590462669887, "language_loss": 0.6592958, "learning_rate": 1.721469534028297e-06, "loss": 0.68021852, "num_input_tokens_seen": 199886320, "step": 9283, "time_per_iteration": 2.7523255348205566 }, { "auxiliary_loss_clip": 0.01087506, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.04440904, "balance_loss_mlp": 1.01841235, "epoch": 0.5581842777694274, "flos": 19569161018880.0, "grad_norm": 1.7248818916670352, "language_loss": 0.82969356, "learning_rate": 1.7210838728890994e-06, "loss": 0.85087025, "num_input_tokens_seen": 199904895, "step": 9284, "time_per_iteration": 2.6912968158721924 }, { "auxiliary_loss_clip": 0.01097795, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.04244661, "balance_loss_mlp": 1.02261066, "epoch": 0.5582444010220953, "flos": 20595165102720.0, "grad_norm": 2.3068151709488736, "language_loss": 0.85949606, "learning_rate": 1.7206982223264304e-06, "loss": 0.88083011, "num_input_tokens_seen": 199921090, "step": 9285, "time_per_iteration": 2.6835310459136963 }, { "auxiliary_loss_clip": 0.01095995, "auxiliary_loss_mlp": 0.01037997, "balance_loss_clip": 1.0437417, "balance_loss_mlp": 1.02543855, "epoch": 0.5583045242747633, "flos": 19135504120320.0, "grad_norm": 2.6758058324476024, "language_loss": 0.73497176, "learning_rate": 1.720312582354912e-06, "loss": 0.75631171, "num_input_tokens_seen": 199939925, "step": 9286, "time_per_iteration": 2.7510128021240234 }, { "auxiliary_loss_clip": 0.01119969, "auxiliary_loss_mlp": 0.01032279, "balance_loss_clip": 1.04193521, "balance_loss_mlp": 1.01924896, "epoch": 0.5583646475274312, "flos": 27454569730560.0, "grad_norm": 2.5542622351497104, "language_loss": 0.7366401, "learning_rate": 1.7199269529891684e-06, "loss": 0.7581625, "num_input_tokens_seen": 199960015, "step": 9287, "time_per_iteration": 2.7764368057250977 }, { "auxiliary_loss_clip": 0.01087822, "auxiliary_loss_mlp": 0.01038543, "balance_loss_clip": 1.04215682, "balance_loss_mlp": 1.0240171, "epoch": 0.5584247707800992, "flos": 23653784010240.0, "grad_norm": 1.5995445525462566, "language_loss": 0.75250727, "learning_rate": 1.7195413342438233e-06, "loss": 0.77377093, "num_input_tokens_seen": 199980505, "step": 9288, "time_per_iteration": 2.711667060852051 }, { "auxiliary_loss_clip": 0.01101347, "auxiliary_loss_mlp": 0.01045442, "balance_loss_clip": 1.04461765, "balance_loss_mlp": 1.03062999, "epoch": 0.5584848940327671, "flos": 13698880185600.0, "grad_norm": 2.3847574468541075, "language_loss": 0.77486145, "learning_rate": 1.7191557261334984e-06, "loss": 0.79632932, "num_input_tokens_seen": 199999020, "step": 9289, "time_per_iteration": 2.726365566253662 }, { "auxiliary_loss_clip": 0.01092544, "auxiliary_loss_mlp": 0.01034807, "balance_loss_clip": 1.04270971, "balance_loss_mlp": 1.02084172, "epoch": 0.5585450172854352, "flos": 27016208150400.0, "grad_norm": 1.8546991944448898, "language_loss": 0.61392409, "learning_rate": 1.718770128672817e-06, "loss": 0.63519758, "num_input_tokens_seen": 200019020, "step": 9290, "time_per_iteration": 2.7546441555023193 }, { "auxiliary_loss_clip": 0.01071377, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.03871763, "balance_loss_mlp": 1.01945531, "epoch": 0.5586051405381031, "flos": 23185653033600.0, "grad_norm": 2.64639974160875, "language_loss": 0.68249333, "learning_rate": 1.7183845418764e-06, "loss": 0.70353258, "num_input_tokens_seen": 200038110, "step": 9291, "time_per_iteration": 3.030916452407837 }, { "auxiliary_loss_clip": 0.01091279, "auxiliary_loss_mlp": 0.01045913, "balance_loss_clip": 1.04114079, "balance_loss_mlp": 1.03218007, "epoch": 0.5586652637907711, "flos": 20775544225920.0, "grad_norm": 1.7635760067758424, "language_loss": 0.84269536, "learning_rate": 1.7179989657588698e-06, "loss": 0.86406732, "num_input_tokens_seen": 200056210, "step": 9292, "time_per_iteration": 2.6990363597869873 }, { "auxiliary_loss_clip": 0.01090195, "auxiliary_loss_mlp": 0.01046206, "balance_loss_clip": 1.03904271, "balance_loss_mlp": 1.03265166, "epoch": 0.5587253870434391, "flos": 28219897837440.0, "grad_norm": 2.3637237833932687, "language_loss": 0.73976684, "learning_rate": 1.7176134003348476e-06, "loss": 0.76113087, "num_input_tokens_seen": 200075620, "step": 9293, "time_per_iteration": 2.7066195011138916 }, { "auxiliary_loss_clip": 0.0108672, "auxiliary_loss_mlp": 0.01044291, "balance_loss_clip": 1.04188502, "balance_loss_mlp": 1.03185785, "epoch": 0.558785510296107, "flos": 26615732440320.0, "grad_norm": 1.7291294273759894, "language_loss": 0.72083485, "learning_rate": 1.7172278456189523e-06, "loss": 0.74214494, "num_input_tokens_seen": 200095945, "step": 9294, "time_per_iteration": 2.7188310623168945 }, { "auxiliary_loss_clip": 0.01098814, "auxiliary_loss_mlp": 0.00770939, "balance_loss_clip": 1.04345989, "balance_loss_mlp": 1.0002197, "epoch": 0.558845633548775, "flos": 20156767608960.0, "grad_norm": 2.0034844848738995, "language_loss": 0.68573147, "learning_rate": 1.716842301625806e-06, "loss": 0.70442897, "num_input_tokens_seen": 200114185, "step": 9295, "time_per_iteration": 2.645157814025879 }, { "auxiliary_loss_clip": 0.01120796, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.04437232, "balance_loss_mlp": 1.02404976, "epoch": 0.5589057568014429, "flos": 24350774492160.0, "grad_norm": 1.451861251832641, "language_loss": 0.81153715, "learning_rate": 1.7164567683700281e-06, "loss": 0.83312207, "num_input_tokens_seen": 200135030, "step": 9296, "time_per_iteration": 2.638831853866577 }, { "auxiliary_loss_clip": 0.01109007, "auxiliary_loss_mlp": 0.01036287, "balance_loss_clip": 1.0433023, "balance_loss_mlp": 1.02302504, "epoch": 0.558965880054111, "flos": 21105168359040.0, "grad_norm": 2.39482931377815, "language_loss": 0.65407717, "learning_rate": 1.7160712458662379e-06, "loss": 0.67553014, "num_input_tokens_seen": 200154290, "step": 9297, "time_per_iteration": 2.6714565753936768 }, { "auxiliary_loss_clip": 0.01088452, "auxiliary_loss_mlp": 0.01039165, "balance_loss_clip": 1.04224098, "balance_loss_mlp": 1.024997, "epoch": 0.5590260033067789, "flos": 18436071513600.0, "grad_norm": 1.768502931317098, "language_loss": 0.75242859, "learning_rate": 1.7156857341290544e-06, "loss": 0.77370477, "num_input_tokens_seen": 200171555, "step": 9298, "time_per_iteration": 2.7061312198638916 }, { "auxiliary_loss_clip": 0.01019627, "auxiliary_loss_mlp": 0.01016507, "balance_loss_clip": 1.01274395, "balance_loss_mlp": 1.01488543, "epoch": 0.5590861265594469, "flos": 70577432490240.0, "grad_norm": 0.6867151105979278, "language_loss": 0.52393436, "learning_rate": 1.7153002331730967e-06, "loss": 0.54429573, "num_input_tokens_seen": 200237010, "step": 9299, "time_per_iteration": 3.2783946990966797 }, { "auxiliary_loss_clip": 0.01104521, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.04119837, "balance_loss_mlp": 1.02390957, "epoch": 0.5591462498121148, "flos": 30664408896000.0, "grad_norm": 1.9265460961114051, "language_loss": 0.69143355, "learning_rate": 1.7149147430129824e-06, "loss": 0.7128489, "num_input_tokens_seen": 200260820, "step": 9300, "time_per_iteration": 2.716351270675659 }, { "auxiliary_loss_clip": 0.01065458, "auxiliary_loss_mlp": 0.01057284, "balance_loss_clip": 1.03432143, "balance_loss_mlp": 1.04067802, "epoch": 0.5592063730647828, "flos": 18150438562560.0, "grad_norm": 2.0948179426753164, "language_loss": 0.81994128, "learning_rate": 1.7145292636633293e-06, "loss": 0.84116876, "num_input_tokens_seen": 200278035, "step": 9301, "time_per_iteration": 2.6983389854431152 }, { "auxiliary_loss_clip": 0.01117535, "auxiliary_loss_mlp": 0.01032183, "balance_loss_clip": 1.04067254, "balance_loss_mlp": 1.0186348, "epoch": 0.5592664963174507, "flos": 24060400945920.0, "grad_norm": 3.1722185850775553, "language_loss": 0.68140459, "learning_rate": 1.714143795138756e-06, "loss": 0.70290172, "num_input_tokens_seen": 200297255, "step": 9302, "time_per_iteration": 2.5997016429901123 }, { "auxiliary_loss_clip": 0.01088292, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.04123783, "balance_loss_mlp": 1.01426911, "epoch": 0.5593266195701188, "flos": 19827897661440.0, "grad_norm": 1.7171276141981482, "language_loss": 0.70894414, "learning_rate": 1.713758337453878e-06, "loss": 0.7301147, "num_input_tokens_seen": 200317505, "step": 9303, "time_per_iteration": 2.720726728439331 }, { "auxiliary_loss_clip": 0.01045978, "auxiliary_loss_mlp": 0.01043666, "balance_loss_clip": 1.03466618, "balance_loss_mlp": 1.02934885, "epoch": 0.5593867428227867, "flos": 25300755440640.0, "grad_norm": 3.8871936508431606, "language_loss": 0.72614998, "learning_rate": 1.7133728906233124e-06, "loss": 0.74704641, "num_input_tokens_seen": 200338350, "step": 9304, "time_per_iteration": 2.7727861404418945 }, { "auxiliary_loss_clip": 0.01107464, "auxiliary_loss_mlp": 0.01030237, "balance_loss_clip": 1.04120493, "balance_loss_mlp": 1.0174104, "epoch": 0.5594468660754547, "flos": 12933013374720.0, "grad_norm": 2.306388303475261, "language_loss": 0.77981883, "learning_rate": 1.7129874546616763e-06, "loss": 0.80119586, "num_input_tokens_seen": 200353965, "step": 9305, "time_per_iteration": 2.5945067405700684 }, { "auxiliary_loss_clip": 0.01069392, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.04184294, "balance_loss_mlp": 1.01778793, "epoch": 0.5595069893281227, "flos": 19062713208960.0, "grad_norm": 1.7491845938042618, "language_loss": 0.69805098, "learning_rate": 1.7126020295835836e-06, "loss": 0.71904755, "num_input_tokens_seen": 200373595, "step": 9306, "time_per_iteration": 2.8083784580230713 }, { "auxiliary_loss_clip": 0.01018297, "auxiliary_loss_mlp": 0.01002442, "balance_loss_clip": 1.015836, "balance_loss_mlp": 1.00099397, "epoch": 0.5595671125807906, "flos": 70273375862400.0, "grad_norm": 0.9194279331367995, "language_loss": 0.60304606, "learning_rate": 1.7122166154036518e-06, "loss": 0.62325346, "num_input_tokens_seen": 200429155, "step": 9307, "time_per_iteration": 3.301408052444458 }, { "auxiliary_loss_clip": 0.01104522, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.0423522, "balance_loss_mlp": 1.02234626, "epoch": 0.5596272358334586, "flos": 20665513889280.0, "grad_norm": 1.8556565203900444, "language_loss": 0.73943615, "learning_rate": 1.7118312121364943e-06, "loss": 0.76083553, "num_input_tokens_seen": 200448290, "step": 9308, "time_per_iteration": 2.6449387073516846 }, { "auxiliary_loss_clip": 0.01051886, "auxiliary_loss_mlp": 0.01038908, "balance_loss_clip": 1.03424501, "balance_loss_mlp": 1.02397084, "epoch": 0.5596873590861265, "flos": 25041013217280.0, "grad_norm": 2.1877402567653808, "language_loss": 0.69691569, "learning_rate": 1.7114458197967257e-06, "loss": 0.71782362, "num_input_tokens_seen": 200466555, "step": 9309, "time_per_iteration": 4.464626312255859 }, { "auxiliary_loss_clip": 0.01093684, "auxiliary_loss_mlp": 0.01037862, "balance_loss_clip": 1.04161119, "balance_loss_mlp": 1.02288949, "epoch": 0.5597474823387946, "flos": 25958387594880.0, "grad_norm": 1.9102617963629012, "language_loss": 0.75523353, "learning_rate": 1.7110604383989613e-06, "loss": 0.77654898, "num_input_tokens_seen": 200485980, "step": 9310, "time_per_iteration": 4.4445412158966064 }, { "auxiliary_loss_clip": 0.01112006, "auxiliary_loss_mlp": 0.01037268, "balance_loss_clip": 1.04378152, "balance_loss_mlp": 1.02286768, "epoch": 0.5598076055914625, "flos": 26177442687360.0, "grad_norm": 2.0703892527912813, "language_loss": 0.69657761, "learning_rate": 1.7106750679578133e-06, "loss": 0.71807039, "num_input_tokens_seen": 200504555, "step": 9311, "time_per_iteration": 4.303341865539551 }, { "auxiliary_loss_clip": 0.01105172, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.04042637, "balance_loss_mlp": 1.02103674, "epoch": 0.5598677288441305, "flos": 11655778590720.0, "grad_norm": 1.8932120118757645, "language_loss": 0.71856189, "learning_rate": 1.7102897084878962e-06, "loss": 0.73995423, "num_input_tokens_seen": 200522700, "step": 9312, "time_per_iteration": 2.610438823699951 }, { "auxiliary_loss_clip": 0.01080705, "auxiliary_loss_mlp": 0.01033643, "balance_loss_clip": 1.04290187, "balance_loss_mlp": 1.02023816, "epoch": 0.5599278520967984, "flos": 22966597941120.0, "grad_norm": 2.1557841469459746, "language_loss": 0.89152771, "learning_rate": 1.709904360003822e-06, "loss": 0.91267115, "num_input_tokens_seen": 200541910, "step": 9313, "time_per_iteration": 2.6854610443115234 }, { "auxiliary_loss_clip": 0.01081962, "auxiliary_loss_mlp": 0.01044977, "balance_loss_clip": 1.0415206, "balance_loss_mlp": 1.03109467, "epoch": 0.5599879753494664, "flos": 21215557831680.0, "grad_norm": 1.521477055933408, "language_loss": 0.77815449, "learning_rate": 1.709519022520204e-06, "loss": 0.79942387, "num_input_tokens_seen": 200562600, "step": 9314, "time_per_iteration": 4.262527942657471 }, { "auxiliary_loss_clip": 0.01082652, "auxiliary_loss_mlp": 0.01031612, "balance_loss_clip": 1.0416466, "balance_loss_mlp": 1.01851654, "epoch": 0.5600480986021343, "flos": 31903219105920.0, "grad_norm": 1.6753660628338782, "language_loss": 0.70509619, "learning_rate": 1.7091336960516537e-06, "loss": 0.72623885, "num_input_tokens_seen": 200584795, "step": 9315, "time_per_iteration": 2.7611892223358154 }, { "auxiliary_loss_clip": 0.0110321, "auxiliary_loss_mlp": 0.01041043, "balance_loss_clip": 1.04375148, "balance_loss_mlp": 1.02726793, "epoch": 0.5601082218548024, "flos": 28476048700800.0, "grad_norm": 1.7587170023253702, "language_loss": 0.66601861, "learning_rate": 1.7087483806127824e-06, "loss": 0.68746114, "num_input_tokens_seen": 200606945, "step": 9316, "time_per_iteration": 2.675050973892212 }, { "auxiliary_loss_clip": 0.0108131, "auxiliary_loss_mlp": 0.01037022, "balance_loss_clip": 1.037871, "balance_loss_mlp": 1.0214529, "epoch": 0.5601683451074703, "flos": 24097173494400.0, "grad_norm": 2.414777902457845, "language_loss": 0.87209964, "learning_rate": 1.7083630762182022e-06, "loss": 0.89328289, "num_input_tokens_seen": 200626340, "step": 9317, "time_per_iteration": 2.7405858039855957 }, { "auxiliary_loss_clip": 0.01115616, "auxiliary_loss_mlp": 0.01038233, "balance_loss_clip": 1.04544759, "balance_loss_mlp": 1.02290869, "epoch": 0.5602284683601383, "flos": 26356205698560.0, "grad_norm": 1.8555836482261492, "language_loss": 0.76961493, "learning_rate": 1.7079777828825233e-06, "loss": 0.79115343, "num_input_tokens_seen": 200644520, "step": 9318, "time_per_iteration": 2.683375597000122 }, { "auxiliary_loss_clip": 0.0110569, "auxiliary_loss_mlp": 0.01040718, "balance_loss_clip": 1.04080641, "balance_loss_mlp": 1.02822459, "epoch": 0.5602885916128063, "flos": 24496392228480.0, "grad_norm": 1.6342768124534643, "language_loss": 0.76235765, "learning_rate": 1.7075925006203558e-06, "loss": 0.7838217, "num_input_tokens_seen": 200664845, "step": 9319, "time_per_iteration": 2.6256465911865234 }, { "auxiliary_loss_clip": 0.01107325, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.04242063, "balance_loss_mlp": 1.02393723, "epoch": 0.5603487148654742, "flos": 27345006270720.0, "grad_norm": 1.4761895802927258, "language_loss": 0.85648036, "learning_rate": 1.7072072294463101e-06, "loss": 0.87791771, "num_input_tokens_seen": 200686535, "step": 9320, "time_per_iteration": 2.7295455932617188 }, { "auxiliary_loss_clip": 0.0103543, "auxiliary_loss_mlp": 0.01003142, "balance_loss_clip": 1.01980209, "balance_loss_mlp": 1.00181246, "epoch": 0.5604088381181422, "flos": 54087756180480.0, "grad_norm": 0.7528149861495326, "language_loss": 0.52530909, "learning_rate": 1.706821969374996e-06, "loss": 0.54569471, "num_input_tokens_seen": 200736965, "step": 9321, "time_per_iteration": 3.0199856758117676 }, { "auxiliary_loss_clip": 0.01097468, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.04187417, "balance_loss_mlp": 1.02274311, "epoch": 0.5604689613708101, "flos": 22236390357120.0, "grad_norm": 1.366292846882571, "language_loss": 0.74232858, "learning_rate": 1.7064367204210216e-06, "loss": 0.7636596, "num_input_tokens_seen": 200757420, "step": 9322, "time_per_iteration": 2.7239301204681396 }, { "auxiliary_loss_clip": 0.01120105, "auxiliary_loss_mlp": 0.01033893, "balance_loss_clip": 1.04226124, "balance_loss_mlp": 1.01925397, "epoch": 0.5605290846234782, "flos": 35297782940160.0, "grad_norm": 1.6268223998146492, "language_loss": 0.74119061, "learning_rate": 1.7060514825989963e-06, "loss": 0.7627306, "num_input_tokens_seen": 200779520, "step": 9323, "time_per_iteration": 2.7277660369873047 }, { "auxiliary_loss_clip": 0.01097354, "auxiliary_loss_mlp": 0.01034103, "balance_loss_clip": 1.04408789, "balance_loss_mlp": 1.01961303, "epoch": 0.5605892078761461, "flos": 20263314326400.0, "grad_norm": 2.353968750169446, "language_loss": 0.61679977, "learning_rate": 1.7056662559235286e-06, "loss": 0.63811433, "num_input_tokens_seen": 200799485, "step": 9324, "time_per_iteration": 2.681330442428589 }, { "auxiliary_loss_clip": 0.01068442, "auxiliary_loss_mlp": 0.0103778, "balance_loss_clip": 1.03685164, "balance_loss_mlp": 1.02353454, "epoch": 0.5606493311288141, "flos": 17308333134720.0, "grad_norm": 1.7599111661375368, "language_loss": 0.87798876, "learning_rate": 1.705281040409226e-06, "loss": 0.89905095, "num_input_tokens_seen": 200817540, "step": 9325, "time_per_iteration": 2.73244571685791 }, { "auxiliary_loss_clip": 0.01098073, "auxiliary_loss_mlp": 0.01034138, "balance_loss_clip": 1.04064608, "balance_loss_mlp": 1.01970756, "epoch": 0.560709454381482, "flos": 21652985658240.0, "grad_norm": 1.5582793995716135, "language_loss": 0.7359941, "learning_rate": 1.7048958360706952e-06, "loss": 0.75731623, "num_input_tokens_seen": 200838380, "step": 9326, "time_per_iteration": 2.685098886489868 }, { "auxiliary_loss_clip": 0.01099795, "auxiliary_loss_mlp": 0.01027968, "balance_loss_clip": 1.04008412, "balance_loss_mlp": 1.01316798, "epoch": 0.56076957763415, "flos": 20303355012480.0, "grad_norm": 1.8644433543241015, "language_loss": 0.78216934, "learning_rate": 1.7045106429225447e-06, "loss": 0.80344701, "num_input_tokens_seen": 200855640, "step": 9327, "time_per_iteration": 2.7206430435180664 }, { "auxiliary_loss_clip": 0.01106989, "auxiliary_loss_mlp": 0.01034784, "balance_loss_clip": 1.04609513, "balance_loss_mlp": 1.02029371, "epoch": 0.5608297008868179, "flos": 25045897466880.0, "grad_norm": 1.6309153070460434, "language_loss": 0.78084052, "learning_rate": 1.7041254609793795e-06, "loss": 0.80225813, "num_input_tokens_seen": 200876585, "step": 9328, "time_per_iteration": 2.6724750995635986 }, { "auxiliary_loss_clip": 0.01119639, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.04266322, "balance_loss_mlp": 1.01832008, "epoch": 0.560889824139486, "flos": 19866825025920.0, "grad_norm": 1.4710158195252034, "language_loss": 0.73393631, "learning_rate": 1.7037402902558066e-06, "loss": 0.75544858, "num_input_tokens_seen": 200898175, "step": 9329, "time_per_iteration": 2.610711097717285 }, { "auxiliary_loss_clip": 0.01100007, "auxiliary_loss_mlp": 0.00773419, "balance_loss_clip": 1.04148126, "balance_loss_mlp": 1.00026274, "epoch": 0.5609499473921539, "flos": 22929394429440.0, "grad_norm": 1.5539142345159989, "language_loss": 0.83609939, "learning_rate": 1.7033551307664324e-06, "loss": 0.85483366, "num_input_tokens_seen": 200917515, "step": 9330, "time_per_iteration": 2.7287333011627197 }, { "auxiliary_loss_clip": 0.01042257, "auxiliary_loss_mlp": 0.01001028, "balance_loss_clip": 1.01692343, "balance_loss_mlp": 0.99974674, "epoch": 0.5610100706448219, "flos": 53035825455360.0, "grad_norm": 0.7095685041475404, "language_loss": 0.57797414, "learning_rate": 1.7029699825258603e-06, "loss": 0.59840697, "num_input_tokens_seen": 200978615, "step": 9331, "time_per_iteration": 3.197101354598999 }, { "auxiliary_loss_clip": 0.01082146, "auxiliary_loss_mlp": 0.01038466, "balance_loss_clip": 1.0445832, "balance_loss_mlp": 1.02405381, "epoch": 0.5610701938974898, "flos": 21834944979840.0, "grad_norm": 1.957386899067858, "language_loss": 0.82066166, "learning_rate": 1.7025848455486971e-06, "loss": 0.8418678, "num_input_tokens_seen": 200997745, "step": 9332, "time_per_iteration": 2.706125497817993 }, { "auxiliary_loss_clip": 0.01106958, "auxiliary_loss_mlp": 0.01043073, "balance_loss_clip": 1.04060066, "balance_loss_mlp": 1.02800488, "epoch": 0.5611303171501578, "flos": 17457183095040.0, "grad_norm": 1.7807099110593088, "language_loss": 0.81912845, "learning_rate": 1.7021997198495454e-06, "loss": 0.8406288, "num_input_tokens_seen": 201016370, "step": 9333, "time_per_iteration": 2.6288132667541504 }, { "auxiliary_loss_clip": 0.01119893, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.04119062, "balance_loss_mlp": 1.01676321, "epoch": 0.5611904404028258, "flos": 22637799820800.0, "grad_norm": 1.6112092331225492, "language_loss": 0.72989404, "learning_rate": 1.7018146054430108e-06, "loss": 0.75139362, "num_input_tokens_seen": 201034310, "step": 9334, "time_per_iteration": 2.6088995933532715 }, { "auxiliary_loss_clip": 0.01098453, "auxiliary_loss_mlp": 0.01040678, "balance_loss_clip": 1.0453335, "balance_loss_mlp": 1.02690315, "epoch": 0.5612505636554938, "flos": 14316327999360.0, "grad_norm": 2.5253764454191416, "language_loss": 0.71248639, "learning_rate": 1.7014295023436961e-06, "loss": 0.73387766, "num_input_tokens_seen": 201052030, "step": 9335, "time_per_iteration": 2.633389949798584 }, { "auxiliary_loss_clip": 0.0109857, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.03983665, "balance_loss_mlp": 1.02066469, "epoch": 0.5613106869081618, "flos": 16508279554560.0, "grad_norm": 1.8386426637696407, "language_loss": 0.77176088, "learning_rate": 1.701044410566205e-06, "loss": 0.79309118, "num_input_tokens_seen": 201068445, "step": 9336, "time_per_iteration": 2.681753158569336 }, { "auxiliary_loss_clip": 0.01108773, "auxiliary_loss_mlp": 0.01033965, "balance_loss_clip": 1.0423466, "balance_loss_mlp": 1.02086353, "epoch": 0.5613708101608297, "flos": 24058569352320.0, "grad_norm": 2.6196694346701817, "language_loss": 0.64508319, "learning_rate": 1.7006593301251393e-06, "loss": 0.66651058, "num_input_tokens_seen": 201082140, "step": 9337, "time_per_iteration": 2.629194498062134 }, { "auxiliary_loss_clip": 0.01025154, "auxiliary_loss_mlp": 0.01003147, "balance_loss_clip": 1.01963842, "balance_loss_mlp": 1.00190687, "epoch": 0.5614309334134977, "flos": 64905735997440.0, "grad_norm": 0.8917713489246797, "language_loss": 0.62551695, "learning_rate": 1.700274261035102e-06, "loss": 0.64579999, "num_input_tokens_seen": 201137245, "step": 9338, "time_per_iteration": 3.1740610599517822 }, { "auxiliary_loss_clip": 0.01091363, "auxiliary_loss_mlp": 0.01035931, "balance_loss_clip": 1.04291368, "balance_loss_mlp": 1.02275264, "epoch": 0.5614910566661656, "flos": 32919849740160.0, "grad_norm": 1.9155240319962232, "language_loss": 0.65588379, "learning_rate": 1.6998892033106946e-06, "loss": 0.67715669, "num_input_tokens_seen": 201157270, "step": 9339, "time_per_iteration": 2.795539617538452 }, { "auxiliary_loss_clip": 0.0110324, "auxiliary_loss_mlp": 0.01043787, "balance_loss_clip": 1.04000616, "balance_loss_mlp": 1.0283432, "epoch": 0.5615511799188336, "flos": 18588871969920.0, "grad_norm": 1.9415000376687095, "language_loss": 0.69498181, "learning_rate": 1.6995041569665184e-06, "loss": 0.716452, "num_input_tokens_seen": 201174530, "step": 9340, "time_per_iteration": 2.6073222160339355 }, { "auxiliary_loss_clip": 0.01076412, "auxiliary_loss_mlp": 0.0103814, "balance_loss_clip": 1.04082394, "balance_loss_mlp": 1.02536726, "epoch": 0.5616113031715015, "flos": 22820010537600.0, "grad_norm": 1.461608284307224, "language_loss": 0.77235413, "learning_rate": 1.6991191220171756e-06, "loss": 0.79349971, "num_input_tokens_seen": 201194905, "step": 9341, "time_per_iteration": 2.712812662124634 }, { "auxiliary_loss_clip": 0.01069621, "auxiliary_loss_mlp": 0.01037705, "balance_loss_clip": 1.03758025, "balance_loss_mlp": 1.0230068, "epoch": 0.5616714264241696, "flos": 22345702421760.0, "grad_norm": 1.556156421929591, "language_loss": 0.79645002, "learning_rate": 1.6987340984772653e-06, "loss": 0.81752324, "num_input_tokens_seen": 201213715, "step": 9342, "time_per_iteration": 2.774918556213379 }, { "auxiliary_loss_clip": 0.01091015, "auxiliary_loss_mlp": 0.01035282, "balance_loss_clip": 1.03911448, "balance_loss_mlp": 1.02109551, "epoch": 0.5617315496768375, "flos": 18807783408000.0, "grad_norm": 2.3711889370259907, "language_loss": 0.76042008, "learning_rate": 1.6983490863613882e-06, "loss": 0.78168309, "num_input_tokens_seen": 201231415, "step": 9343, "time_per_iteration": 2.7124969959259033 }, { "auxiliary_loss_clip": 0.01080837, "auxiliary_loss_mlp": 0.01044577, "balance_loss_clip": 1.04475522, "balance_loss_mlp": 1.03011727, "epoch": 0.5617916729295055, "flos": 18369314087040.0, "grad_norm": 2.196794276196035, "language_loss": 0.69644189, "learning_rate": 1.6979640856841442e-06, "loss": 0.71769607, "num_input_tokens_seen": 201249625, "step": 9344, "time_per_iteration": 2.7265472412109375 }, { "auxiliary_loss_clip": 0.01121229, "auxiliary_loss_mlp": 0.01038625, "balance_loss_clip": 1.04347157, "balance_loss_mlp": 1.02447486, "epoch": 0.5618517961821734, "flos": 28179964892160.0, "grad_norm": 3.2350770637683106, "language_loss": 0.6636014, "learning_rate": 1.6975790964601318e-06, "loss": 0.68519998, "num_input_tokens_seen": 201271205, "step": 9345, "time_per_iteration": 2.686527729034424 }, { "auxiliary_loss_clip": 0.01098571, "auxiliary_loss_mlp": 0.01032052, "balance_loss_clip": 1.04279995, "balance_loss_mlp": 1.0190227, "epoch": 0.5619119194348414, "flos": 15486872411520.0, "grad_norm": 1.9772946469645978, "language_loss": 0.87311339, "learning_rate": 1.6971941187039512e-06, "loss": 0.89441955, "num_input_tokens_seen": 201287700, "step": 9346, "time_per_iteration": 2.6551971435546875 }, { "auxiliary_loss_clip": 0.0109764, "auxiliary_loss_mlp": 0.01036969, "balance_loss_clip": 1.04373372, "balance_loss_mlp": 1.02243173, "epoch": 0.5619720426875094, "flos": 29128652951040.0, "grad_norm": 2.320939151148892, "language_loss": 0.59135818, "learning_rate": 1.6968091524301993e-06, "loss": 0.61270428, "num_input_tokens_seen": 201307530, "step": 9347, "time_per_iteration": 2.701704263687134 }, { "auxiliary_loss_clip": 0.01113798, "auxiliary_loss_mlp": 0.01039809, "balance_loss_clip": 1.0449301, "balance_loss_mlp": 1.02461553, "epoch": 0.5620321659401774, "flos": 18003743418240.0, "grad_norm": 3.390094180858037, "language_loss": 0.69345069, "learning_rate": 1.6964241976534745e-06, "loss": 0.7149868, "num_input_tokens_seen": 201326210, "step": 9348, "time_per_iteration": 2.6152281761169434 }, { "auxiliary_loss_clip": 0.01072866, "auxiliary_loss_mlp": 0.01035332, "balance_loss_clip": 1.03694952, "balance_loss_mlp": 1.02000761, "epoch": 0.5620922891928454, "flos": 20594518657920.0, "grad_norm": 12.292181580280033, "language_loss": 0.79008943, "learning_rate": 1.6960392543883754e-06, "loss": 0.81117141, "num_input_tokens_seen": 201346120, "step": 9349, "time_per_iteration": 5.937277793884277 }, { "auxiliary_loss_clip": 0.01068645, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.04074883, "balance_loss_mlp": 1.02314854, "epoch": 0.5621524124455133, "flos": 26287006147200.0, "grad_norm": 2.217082199318971, "language_loss": 0.67245173, "learning_rate": 1.6956543226494975e-06, "loss": 0.6935125, "num_input_tokens_seen": 201365700, "step": 9350, "time_per_iteration": 4.385211229324341 }, { "auxiliary_loss_clip": 0.01069908, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.03964508, "balance_loss_mlp": 1.02451682, "epoch": 0.5622125356981813, "flos": 12750299867520.0, "grad_norm": 2.668539433171336, "language_loss": 0.78305924, "learning_rate": 1.6952694024514381e-06, "loss": 0.80415249, "num_input_tokens_seen": 201382795, "step": 9351, "time_per_iteration": 2.6691691875457764 }, { "auxiliary_loss_clip": 0.01099605, "auxiliary_loss_mlp": 0.00772893, "balance_loss_clip": 1.03920138, "balance_loss_mlp": 1.00020838, "epoch": 0.5622726589508492, "flos": 23805327490560.0, "grad_norm": 1.4861648044093183, "language_loss": 0.59128547, "learning_rate": 1.6948844938087945e-06, "loss": 0.61001039, "num_input_tokens_seen": 201402780, "step": 9352, "time_per_iteration": 2.753941297531128 }, { "auxiliary_loss_clip": 0.01105703, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.0406158, "balance_loss_mlp": 1.02476466, "epoch": 0.5623327822035172, "flos": 24718212668160.0, "grad_norm": 1.334754568183942, "language_loss": 0.71630079, "learning_rate": 1.6944995967361604e-06, "loss": 0.73773241, "num_input_tokens_seen": 201424140, "step": 9353, "time_per_iteration": 4.249570369720459 }, { "auxiliary_loss_clip": 0.01098184, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.04213238, "balance_loss_mlp": 1.01918769, "epoch": 0.5623929054561851, "flos": 14019274523520.0, "grad_norm": 2.376274628807619, "language_loss": 0.7593621, "learning_rate": 1.6941147112481327e-06, "loss": 0.78066975, "num_input_tokens_seen": 201439645, "step": 9354, "time_per_iteration": 2.689899206161499 }, { "auxiliary_loss_clip": 0.01089457, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.04167855, "balance_loss_mlp": 1.02183056, "epoch": 0.5624530287088532, "flos": 20704405340160.0, "grad_norm": 1.8223711210662343, "language_loss": 0.72909653, "learning_rate": 1.6937298373593056e-06, "loss": 0.75034714, "num_input_tokens_seen": 201459970, "step": 9355, "time_per_iteration": 2.755100965499878 }, { "auxiliary_loss_clip": 0.01104288, "auxiliary_loss_mlp": 0.01032084, "balance_loss_clip": 1.04146492, "balance_loss_mlp": 1.01845825, "epoch": 0.5625131519615211, "flos": 21470918595840.0, "grad_norm": 1.4719507883232867, "language_loss": 0.7346037, "learning_rate": 1.693344975084274e-06, "loss": 0.75596744, "num_input_tokens_seen": 201480055, "step": 9356, "time_per_iteration": 2.641638994216919 }, { "auxiliary_loss_clip": 0.01119375, "auxiliary_loss_mlp": 0.0103593, "balance_loss_clip": 1.04301476, "balance_loss_mlp": 1.02204823, "epoch": 0.5625732752141891, "flos": 18698004466560.0, "grad_norm": 2.3002614331876687, "language_loss": 0.83191347, "learning_rate": 1.6929601244376318e-06, "loss": 0.85346651, "num_input_tokens_seen": 201497645, "step": 9357, "time_per_iteration": 2.6374433040618896 }, { "auxiliary_loss_clip": 0.01108702, "auxiliary_loss_mlp": 0.01033262, "balance_loss_clip": 1.04158151, "balance_loss_mlp": 1.02019668, "epoch": 0.562633398466857, "flos": 16216900427520.0, "grad_norm": 2.42238754199954, "language_loss": 0.72483993, "learning_rate": 1.6925752854339722e-06, "loss": 0.74625957, "num_input_tokens_seen": 201515455, "step": 9358, "time_per_iteration": 2.6288702487945557 }, { "auxiliary_loss_clip": 0.01118085, "auxiliary_loss_mlp": 0.01042212, "balance_loss_clip": 1.04183221, "balance_loss_mlp": 1.02859807, "epoch": 0.562693521719525, "flos": 22491930689280.0, "grad_norm": 2.2438292834488838, "language_loss": 0.7763263, "learning_rate": 1.6921904580878885e-06, "loss": 0.79792929, "num_input_tokens_seen": 201534500, "step": 9359, "time_per_iteration": 2.6272196769714355 }, { "auxiliary_loss_clip": 0.0109706, "auxiliary_loss_mlp": 0.01033721, "balance_loss_clip": 1.04087317, "balance_loss_mlp": 1.0212934, "epoch": 0.562753644972193, "flos": 25331171281920.0, "grad_norm": 1.8703344042445116, "language_loss": 0.70466304, "learning_rate": 1.6918056424139736e-06, "loss": 0.72597086, "num_input_tokens_seen": 201553280, "step": 9360, "time_per_iteration": 2.6694719791412354 }, { "auxiliary_loss_clip": 0.00993761, "auxiliary_loss_mlp": 0.00999248, "balance_loss_clip": 1.01494741, "balance_loss_mlp": 0.99799061, "epoch": 0.562813768224861, "flos": 67392622126080.0, "grad_norm": 0.7735600550199924, "language_loss": 0.5555625, "learning_rate": 1.6914208384268197e-06, "loss": 0.57549262, "num_input_tokens_seen": 201610030, "step": 9361, "time_per_iteration": 3.2061593532562256 }, { "auxiliary_loss_clip": 0.01093709, "auxiliary_loss_mlp": 0.01035172, "balance_loss_clip": 1.04106104, "balance_loss_mlp": 1.02236927, "epoch": 0.562873891477529, "flos": 23331163029120.0, "grad_norm": 1.4272041180912485, "language_loss": 0.8169086, "learning_rate": 1.691036046141018e-06, "loss": 0.83819747, "num_input_tokens_seen": 201628370, "step": 9362, "time_per_iteration": 2.648585319519043 }, { "auxiliary_loss_clip": 0.01084349, "auxiliary_loss_mlp": 0.00771085, "balance_loss_clip": 1.03982627, "balance_loss_mlp": 1.00021708, "epoch": 0.5629340147301969, "flos": 38472824805120.0, "grad_norm": 1.5810217639510977, "language_loss": 0.7460767, "learning_rate": 1.6906512655711614e-06, "loss": 0.76463103, "num_input_tokens_seen": 201649790, "step": 9363, "time_per_iteration": 2.8376948833465576 }, { "auxiliary_loss_clip": 0.01114455, "auxiliary_loss_mlp": 0.01034672, "balance_loss_clip": 1.04345608, "balance_loss_mlp": 1.02068281, "epoch": 0.5629941379828649, "flos": 29242023252480.0, "grad_norm": 1.625625465741998, "language_loss": 0.82640725, "learning_rate": 1.690266496731839e-06, "loss": 0.84789848, "num_input_tokens_seen": 201669175, "step": 9364, "time_per_iteration": 2.6790480613708496 }, { "auxiliary_loss_clip": 0.0107898, "auxiliary_loss_mlp": 0.0103866, "balance_loss_clip": 1.03860497, "balance_loss_mlp": 1.02573752, "epoch": 0.5630542612355328, "flos": 19420885676160.0, "grad_norm": 2.0942443962927513, "language_loss": 0.65238589, "learning_rate": 1.689881739637642e-06, "loss": 0.67356229, "num_input_tokens_seen": 201687000, "step": 9365, "time_per_iteration": 2.6504223346710205 }, { "auxiliary_loss_clip": 0.01099908, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.0423665, "balance_loss_mlp": 1.0259583, "epoch": 0.5631143844882008, "flos": 22266303408000.0, "grad_norm": 5.761173374312871, "language_loss": 0.8185727, "learning_rate": 1.6894969943031611e-06, "loss": 0.83997548, "num_input_tokens_seen": 201703335, "step": 9366, "time_per_iteration": 2.6865267753601074 }, { "auxiliary_loss_clip": 0.01118809, "auxiliary_loss_mlp": 0.01033751, "balance_loss_clip": 1.04305601, "balance_loss_mlp": 1.02106667, "epoch": 0.5631745077408687, "flos": 22965305051520.0, "grad_norm": 1.4687745386206819, "language_loss": 0.73388821, "learning_rate": 1.6891122607429845e-06, "loss": 0.75541377, "num_input_tokens_seen": 201723495, "step": 9367, "time_per_iteration": 2.6309821605682373 }, { "auxiliary_loss_clip": 0.01020057, "auxiliary_loss_mlp": 0.01004541, "balance_loss_clip": 1.01475585, "balance_loss_mlp": 1.0032177, "epoch": 0.5632346309935368, "flos": 65080515576960.0, "grad_norm": 0.6203732228424765, "language_loss": 0.53471267, "learning_rate": 1.6887275389717028e-06, "loss": 0.5549587, "num_input_tokens_seen": 201792615, "step": 9368, "time_per_iteration": 3.285132884979248 }, { "auxiliary_loss_clip": 0.01119712, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.04367208, "balance_loss_mlp": 1.02514756, "epoch": 0.5632947542462047, "flos": 23002903612800.0, "grad_norm": 1.6032046035258145, "language_loss": 0.69323123, "learning_rate": 1.6883428290039046e-06, "loss": 0.71481466, "num_input_tokens_seen": 201812520, "step": 9369, "time_per_iteration": 2.5828912258148193 }, { "auxiliary_loss_clip": 0.01081861, "auxiliary_loss_mlp": 0.01036769, "balance_loss_clip": 1.03560948, "balance_loss_mlp": 1.02258897, "epoch": 0.5633548774988727, "flos": 30482593228800.0, "grad_norm": 1.8644770946275213, "language_loss": 0.75840139, "learning_rate": 1.6879581308541763e-06, "loss": 0.77958775, "num_input_tokens_seen": 201834185, "step": 9370, "time_per_iteration": 2.7649481296539307 }, { "auxiliary_loss_clip": 0.01095504, "auxiliary_loss_mlp": 0.01038896, "balance_loss_clip": 1.04126322, "balance_loss_mlp": 1.02440023, "epoch": 0.5634150007515406, "flos": 18515039564160.0, "grad_norm": 2.2895815027179864, "language_loss": 0.755108, "learning_rate": 1.687573444537108e-06, "loss": 0.776452, "num_input_tokens_seen": 201851305, "step": 9371, "time_per_iteration": 2.591031312942505 }, { "auxiliary_loss_clip": 0.01106226, "auxiliary_loss_mlp": 0.01040784, "balance_loss_clip": 1.04110384, "balance_loss_mlp": 1.02787304, "epoch": 0.5634751240042086, "flos": 19244672530560.0, "grad_norm": 1.7615457998604214, "language_loss": 0.76489764, "learning_rate": 1.687188770067285e-06, "loss": 0.78636777, "num_input_tokens_seen": 201870350, "step": 9372, "time_per_iteration": 2.619053840637207 }, { "auxiliary_loss_clip": 0.01090528, "auxiliary_loss_mlp": 0.01030605, "balance_loss_clip": 1.03906, "balance_loss_mlp": 1.01705718, "epoch": 0.5635352472568766, "flos": 12020630987520.0, "grad_norm": 2.266062441891877, "language_loss": 0.71336401, "learning_rate": 1.6868041074592956e-06, "loss": 0.73457533, "num_input_tokens_seen": 201886800, "step": 9373, "time_per_iteration": 2.624600887298584 }, { "auxiliary_loss_clip": 0.01090554, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.04418933, "balance_loss_mlp": 1.0168401, "epoch": 0.5635953705095446, "flos": 21871645701120.0, "grad_norm": 2.1043627154333797, "language_loss": 0.82543874, "learning_rate": 1.6864194567277264e-06, "loss": 0.84665811, "num_input_tokens_seen": 201904730, "step": 9374, "time_per_iteration": 2.644887924194336 }, { "auxiliary_loss_clip": 0.01104117, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.03739262, "balance_loss_mlp": 1.01734459, "epoch": 0.5636554937622126, "flos": 27126166659840.0, "grad_norm": 1.7268514389800265, "language_loss": 0.66357785, "learning_rate": 1.6860348178871618e-06, "loss": 0.68492401, "num_input_tokens_seen": 201924850, "step": 9375, "time_per_iteration": 2.65166974067688 }, { "auxiliary_loss_clip": 0.01084894, "auxiliary_loss_mlp": 0.00770652, "balance_loss_clip": 1.04238153, "balance_loss_mlp": 1.00019169, "epoch": 0.5637156170148805, "flos": 12926405272320.0, "grad_norm": 2.3049359861127696, "language_loss": 0.81049269, "learning_rate": 1.6856501909521889e-06, "loss": 0.82904816, "num_input_tokens_seen": 201939500, "step": 9376, "time_per_iteration": 2.766364336013794 }, { "auxiliary_loss_clip": 0.01101359, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.04133999, "balance_loss_mlp": 1.02115881, "epoch": 0.5637757402675485, "flos": 45551033130240.0, "grad_norm": 1.6449694311006493, "language_loss": 0.6926713, "learning_rate": 1.6852655759373925e-06, "loss": 0.71403265, "num_input_tokens_seen": 201963000, "step": 9377, "time_per_iteration": 2.870060443878174 }, { "auxiliary_loss_clip": 0.01074381, "auxiliary_loss_mlp": 0.01032969, "balance_loss_clip": 1.03875685, "balance_loss_mlp": 1.01979017, "epoch": 0.5638358635202164, "flos": 20886041439360.0, "grad_norm": 1.3919625147372467, "language_loss": 0.74771237, "learning_rate": 1.6848809728573565e-06, "loss": 0.76878589, "num_input_tokens_seen": 201983145, "step": 9378, "time_per_iteration": 2.749613046646118 }, { "auxiliary_loss_clip": 0.01122728, "auxiliary_loss_mlp": 0.01035934, "balance_loss_clip": 1.04050553, "balance_loss_mlp": 1.02154493, "epoch": 0.5638959867728844, "flos": 18806562345600.0, "grad_norm": 2.63873718495401, "language_loss": 0.81853002, "learning_rate": 1.6844963817266656e-06, "loss": 0.84011662, "num_input_tokens_seen": 202000335, "step": 9379, "time_per_iteration": 2.625277280807495 }, { "auxiliary_loss_clip": 0.01093031, "auxiliary_loss_mlp": 0.01036774, "balance_loss_clip": 1.03674948, "balance_loss_mlp": 1.02336287, "epoch": 0.5639561100255523, "flos": 27490336698240.0, "grad_norm": 2.218934810530396, "language_loss": 0.7167027, "learning_rate": 1.6841118025599042e-06, "loss": 0.73800081, "num_input_tokens_seen": 202018275, "step": 9380, "time_per_iteration": 2.715791940689087 }, { "auxiliary_loss_clip": 0.01086194, "auxiliary_loss_mlp": 0.01039984, "balance_loss_clip": 1.0455358, "balance_loss_mlp": 1.02485633, "epoch": 0.5640162332782204, "flos": 18076570243200.0, "grad_norm": 2.0069687855649234, "language_loss": 0.74178547, "learning_rate": 1.6837272353716542e-06, "loss": 0.76304728, "num_input_tokens_seen": 202034330, "step": 9381, "time_per_iteration": 2.8091652393341064 }, { "auxiliary_loss_clip": 0.01068257, "auxiliary_loss_mlp": 0.01041209, "balance_loss_clip": 1.03590226, "balance_loss_mlp": 1.02741027, "epoch": 0.5640763565308883, "flos": 20884856290560.0, "grad_norm": 2.008212488841835, "language_loss": 0.72358, "learning_rate": 1.683342680176499e-06, "loss": 0.74467456, "num_input_tokens_seen": 202053100, "step": 9382, "time_per_iteration": 2.750049114227295 }, { "auxiliary_loss_clip": 0.0103983, "auxiliary_loss_mlp": 0.01012073, "balance_loss_clip": 1.01468074, "balance_loss_mlp": 1.01088643, "epoch": 0.5641364797835563, "flos": 64447912224000.0, "grad_norm": 0.7132903418918451, "language_loss": 0.54439944, "learning_rate": 1.682958136989022e-06, "loss": 0.56491846, "num_input_tokens_seen": 202120125, "step": 9383, "time_per_iteration": 3.308600425720215 }, { "auxiliary_loss_clip": 0.01106116, "auxiliary_loss_mlp": 0.01030643, "balance_loss_clip": 1.04080617, "balance_loss_mlp": 1.01664162, "epoch": 0.5641966030362242, "flos": 18660944609280.0, "grad_norm": 1.7587549687902173, "language_loss": 0.71036148, "learning_rate": 1.6825736058238033e-06, "loss": 0.73172909, "num_input_tokens_seen": 202138030, "step": 9384, "time_per_iteration": 2.705378532409668 }, { "auxiliary_loss_clip": 0.01098378, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.04193604, "balance_loss_mlp": 1.02113533, "epoch": 0.5642567262888922, "flos": 22492325738880.0, "grad_norm": 2.5060474723218724, "language_loss": 0.75891483, "learning_rate": 1.6821890866954263e-06, "loss": 0.78025091, "num_input_tokens_seen": 202155580, "step": 9385, "time_per_iteration": 2.648486375808716 }, { "auxiliary_loss_clip": 0.01102679, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.03705001, "balance_loss_mlp": 1.02121449, "epoch": 0.5643168495415603, "flos": 13003972692480.0, "grad_norm": 1.9370694733196534, "language_loss": 0.82360542, "learning_rate": 1.6818045796184703e-06, "loss": 0.84497941, "num_input_tokens_seen": 202170365, "step": 9386, "time_per_iteration": 2.6014211177825928 }, { "auxiliary_loss_clip": 0.01108433, "auxiliary_loss_mlp": 0.01035233, "balance_loss_clip": 1.04246962, "balance_loss_mlp": 1.02117205, "epoch": 0.5643769727942282, "flos": 18588297352320.0, "grad_norm": 2.256739627854675, "language_loss": 0.69928676, "learning_rate": 1.681420084607516e-06, "loss": 0.72072339, "num_input_tokens_seen": 202189095, "step": 9387, "time_per_iteration": 2.6225178241729736 }, { "auxiliary_loss_clip": 0.01110032, "auxiliary_loss_mlp": 0.01036058, "balance_loss_clip": 1.04169261, "balance_loss_mlp": 1.02292085, "epoch": 0.5644370960468962, "flos": 33806269572480.0, "grad_norm": 1.4294069994917775, "language_loss": 0.74616826, "learning_rate": 1.6810356016771452e-06, "loss": 0.76762915, "num_input_tokens_seen": 202213500, "step": 9388, "time_per_iteration": 4.3489909172058105 }, { "auxiliary_loss_clip": 0.01103005, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.04041004, "balance_loss_mlp": 1.02143562, "epoch": 0.5644972192995641, "flos": 21214911386880.0, "grad_norm": 1.5515532198665989, "language_loss": 0.81965339, "learning_rate": 1.6806511308419353e-06, "loss": 0.84101695, "num_input_tokens_seen": 202231920, "step": 9389, "time_per_iteration": 5.713036060333252 }, { "auxiliary_loss_clip": 0.01083726, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.03770804, "balance_loss_mlp": 1.02090037, "epoch": 0.5645573425522321, "flos": 18587722734720.0, "grad_norm": 2.017294292301613, "language_loss": 0.63844502, "learning_rate": 1.680266672116467e-06, "loss": 0.65964001, "num_input_tokens_seen": 202247600, "step": 9390, "time_per_iteration": 2.718738079071045 }, { "auxiliary_loss_clip": 0.01096947, "auxiliary_loss_mlp": 0.01030588, "balance_loss_clip": 1.04229331, "balance_loss_mlp": 1.01875103, "epoch": 0.5646174658049, "flos": 18113809668480.0, "grad_norm": 1.8385345725956297, "language_loss": 0.92190915, "learning_rate": 1.6798822255153192e-06, "loss": 0.94318449, "num_input_tokens_seen": 202265350, "step": 9391, "time_per_iteration": 2.6871705055236816 }, { "auxiliary_loss_clip": 0.01118295, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.04650784, "balance_loss_mlp": 1.02288282, "epoch": 0.564677589057568, "flos": 28329964087680.0, "grad_norm": 2.30014312113224, "language_loss": 0.60238105, "learning_rate": 1.6794977910530684e-06, "loss": 0.62393618, "num_input_tokens_seen": 202284285, "step": 9392, "time_per_iteration": 2.6965878009796143 }, { "auxiliary_loss_clip": 0.01068376, "auxiliary_loss_mlp": 0.01027584, "balance_loss_clip": 1.03531122, "balance_loss_mlp": 1.01367223, "epoch": 0.564737712310236, "flos": 22163743100160.0, "grad_norm": 2.2381091213593924, "language_loss": 0.81505215, "learning_rate": 1.6791133687442937e-06, "loss": 0.83601177, "num_input_tokens_seen": 202303450, "step": 9393, "time_per_iteration": 4.253687620162964 }, { "auxiliary_loss_clip": 0.01095131, "auxiliary_loss_mlp": 0.01031195, "balance_loss_clip": 1.03995085, "balance_loss_mlp": 1.01804614, "epoch": 0.564797835562904, "flos": 20959011918720.0, "grad_norm": 1.6857006339700658, "language_loss": 0.87381589, "learning_rate": 1.6787289586035725e-06, "loss": 0.89507914, "num_input_tokens_seen": 202322315, "step": 9394, "time_per_iteration": 2.6733334064483643 }, { "auxiliary_loss_clip": 0.0110875, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.04296374, "balance_loss_mlp": 1.02065587, "epoch": 0.5648579588155719, "flos": 17420302805760.0, "grad_norm": 1.9505278392416294, "language_loss": 0.84685338, "learning_rate": 1.6783445606454814e-06, "loss": 0.86827838, "num_input_tokens_seen": 202339905, "step": 9395, "time_per_iteration": 2.6754062175750732 }, { "auxiliary_loss_clip": 0.0102964, "auxiliary_loss_mlp": 0.01000117, "balance_loss_clip": 1.01416993, "balance_loss_mlp": 0.99888915, "epoch": 0.5649180820682399, "flos": 69929568835200.0, "grad_norm": 0.7966393150311729, "language_loss": 0.58260763, "learning_rate": 1.677960174884597e-06, "loss": 0.60290521, "num_input_tokens_seen": 202397320, "step": 9396, "time_per_iteration": 3.176486015319824 }, { "auxiliary_loss_clip": 0.01099184, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.04099381, "balance_loss_mlp": 1.01762295, "epoch": 0.5649782053209078, "flos": 24973070641920.0, "grad_norm": 1.8659420980935195, "language_loss": 0.70408708, "learning_rate": 1.6775758013354943e-06, "loss": 0.72538739, "num_input_tokens_seen": 202416865, "step": 9397, "time_per_iteration": 2.76436710357666 }, { "auxiliary_loss_clip": 0.01087737, "auxiliary_loss_mlp": 0.01036875, "balance_loss_clip": 1.0412184, "balance_loss_mlp": 1.02305877, "epoch": 0.5650383285735758, "flos": 21726602582400.0, "grad_norm": 1.7242630837852022, "language_loss": 0.66510224, "learning_rate": 1.67719144001275e-06, "loss": 0.68634838, "num_input_tokens_seen": 202436210, "step": 9398, "time_per_iteration": 2.8452060222625732 }, { "auxiliary_loss_clip": 0.0102199, "auxiliary_loss_mlp": 0.01002651, "balance_loss_clip": 1.01533413, "balance_loss_mlp": 1.00157201, "epoch": 0.5650984518262439, "flos": 65904484636800.0, "grad_norm": 0.7636877487193632, "language_loss": 0.58165693, "learning_rate": 1.6768070909309386e-06, "loss": 0.60190332, "num_input_tokens_seen": 202492925, "step": 9399, "time_per_iteration": 3.1523597240448 }, { "auxiliary_loss_clip": 0.01076045, "auxiliary_loss_mlp": 0.01036845, "balance_loss_clip": 1.03608418, "balance_loss_mlp": 1.02109778, "epoch": 0.5651585750789118, "flos": 21032592929280.0, "grad_norm": 2.707299355352823, "language_loss": 0.7311101, "learning_rate": 1.6764227541046347e-06, "loss": 0.75223899, "num_input_tokens_seen": 202511905, "step": 9400, "time_per_iteration": 2.778313636779785 }, { "auxiliary_loss_clip": 0.01093566, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.04261565, "balance_loss_mlp": 1.02349663, "epoch": 0.5652186983315798, "flos": 18551919853440.0, "grad_norm": 1.7896331589473868, "language_loss": 0.6111843, "learning_rate": 1.676038429548412e-06, "loss": 0.63250542, "num_input_tokens_seen": 202529815, "step": 9401, "time_per_iteration": 2.7110683917999268 }, { "auxiliary_loss_clip": 0.01077473, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.03607464, "balance_loss_mlp": 1.01735282, "epoch": 0.5652788215842477, "flos": 18478662065280.0, "grad_norm": 3.6521869515488405, "language_loss": 0.81323993, "learning_rate": 1.6756541172768453e-06, "loss": 0.83432162, "num_input_tokens_seen": 202547710, "step": 9402, "time_per_iteration": 2.8134961128234863 }, { "auxiliary_loss_clip": 0.0106172, "auxiliary_loss_mlp": 0.01043189, "balance_loss_clip": 1.03186333, "balance_loss_mlp": 1.02785897, "epoch": 0.5653389448369157, "flos": 30044052080640.0, "grad_norm": 1.434807389128129, "language_loss": 0.77711642, "learning_rate": 1.6752698173045068e-06, "loss": 0.79816544, "num_input_tokens_seen": 202568835, "step": 9403, "time_per_iteration": 2.9176833629608154 }, { "auxiliary_loss_clip": 0.01064861, "auxiliary_loss_mlp": 0.01036876, "balance_loss_clip": 1.03543758, "balance_loss_mlp": 1.02137828, "epoch": 0.5653990680895836, "flos": 16727550128640.0, "grad_norm": 1.6891349615397695, "language_loss": 0.69381618, "learning_rate": 1.6748855296459685e-06, "loss": 0.71483362, "num_input_tokens_seen": 202587385, "step": 9404, "time_per_iteration": 2.8122291564941406 }, { "auxiliary_loss_clip": 0.01081972, "auxiliary_loss_mlp": 0.01035091, "balance_loss_clip": 1.03926969, "balance_loss_mlp": 1.02245533, "epoch": 0.5654591913422516, "flos": 14538256179840.0, "grad_norm": 1.8707097320787585, "language_loss": 0.66802347, "learning_rate": 1.6745012543158045e-06, "loss": 0.68919408, "num_input_tokens_seen": 202604815, "step": 9405, "time_per_iteration": 2.6256675720214844 }, { "auxiliary_loss_clip": 0.01087827, "auxiliary_loss_mlp": 0.01038368, "balance_loss_clip": 1.03976154, "balance_loss_mlp": 1.02543378, "epoch": 0.5655193145949196, "flos": 26209905603840.0, "grad_norm": 1.7731068900459501, "language_loss": 0.74520212, "learning_rate": 1.6741169913285852e-06, "loss": 0.76646411, "num_input_tokens_seen": 202623775, "step": 9406, "time_per_iteration": 2.7220685482025146 }, { "auxiliary_loss_clip": 0.01061139, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.03829598, "balance_loss_mlp": 1.02655435, "epoch": 0.5655794378475876, "flos": 25046579825280.0, "grad_norm": 1.7152353741974506, "language_loss": 0.7952764, "learning_rate": 1.673732740698882e-06, "loss": 0.81632137, "num_input_tokens_seen": 202643375, "step": 9407, "time_per_iteration": 2.785325765609741 }, { "auxiliary_loss_clip": 0.01077703, "auxiliary_loss_mlp": 0.01039246, "balance_loss_clip": 1.03728688, "balance_loss_mlp": 1.02510178, "epoch": 0.5656395611002555, "flos": 31032852652800.0, "grad_norm": 1.3619251628826352, "language_loss": 0.71023029, "learning_rate": 1.6733485024412666e-06, "loss": 0.73139971, "num_input_tokens_seen": 202668400, "step": 9408, "time_per_iteration": 2.8171489238739014 }, { "auxiliary_loss_clip": 0.01061658, "auxiliary_loss_mlp": 0.01035867, "balance_loss_clip": 1.03865576, "balance_loss_mlp": 1.02198541, "epoch": 0.5656996843529235, "flos": 20229522606720.0, "grad_norm": 1.9952093590252573, "language_loss": 0.81203496, "learning_rate": 1.672964276570308e-06, "loss": 0.8330102, "num_input_tokens_seen": 202685125, "step": 9409, "time_per_iteration": 2.770899772644043 }, { "auxiliary_loss_clip": 0.01076156, "auxiliary_loss_mlp": 0.01030595, "balance_loss_clip": 1.03786421, "balance_loss_mlp": 1.01730919, "epoch": 0.5657598076055914, "flos": 20996251344000.0, "grad_norm": 1.8859201816541107, "language_loss": 0.78039193, "learning_rate": 1.6725800631005776e-06, "loss": 0.80145949, "num_input_tokens_seen": 202703830, "step": 9410, "time_per_iteration": 2.6944680213928223 }, { "auxiliary_loss_clip": 0.01121778, "auxiliary_loss_mlp": 0.01042462, "balance_loss_clip": 1.04339719, "balance_loss_mlp": 1.02865767, "epoch": 0.5658199308582594, "flos": 11545999649280.0, "grad_norm": 2.199230863577756, "language_loss": 0.83460367, "learning_rate": 1.6721958620466432e-06, "loss": 0.85624611, "num_input_tokens_seen": 202719835, "step": 9411, "time_per_iteration": 2.576122760772705 }, { "auxiliary_loss_clip": 0.01112938, "auxiliary_loss_mlp": 0.01033542, "balance_loss_clip": 1.04195237, "balance_loss_mlp": 1.01830769, "epoch": 0.5658800541109275, "flos": 14172146807040.0, "grad_norm": 3.221148840875553, "language_loss": 0.67855954, "learning_rate": 1.6718116734230749e-06, "loss": 0.70002437, "num_input_tokens_seen": 202736795, "step": 9412, "time_per_iteration": 2.6416120529174805 }, { "auxiliary_loss_clip": 0.01104164, "auxiliary_loss_mlp": 0.01032428, "balance_loss_clip": 1.04040003, "balance_loss_mlp": 1.02026224, "epoch": 0.5659401773635954, "flos": 27305073325440.0, "grad_norm": 1.6585263288332466, "language_loss": 0.58582389, "learning_rate": 1.6714274972444413e-06, "loss": 0.60718977, "num_input_tokens_seen": 202756900, "step": 9413, "time_per_iteration": 2.678048610687256 }, { "auxiliary_loss_clip": 0.01039217, "auxiliary_loss_mlp": 0.01044241, "balance_loss_clip": 1.03433728, "balance_loss_mlp": 1.02943516, "epoch": 0.5660003006162634, "flos": 16728196573440.0, "grad_norm": 1.5449777270978375, "language_loss": 0.69369984, "learning_rate": 1.6710433335253092e-06, "loss": 0.71453446, "num_input_tokens_seen": 202775145, "step": 9414, "time_per_iteration": 2.7721176147460938 }, { "auxiliary_loss_clip": 0.01048825, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.04257154, "balance_loss_mlp": 1.02139449, "epoch": 0.5660604238689313, "flos": 21653452535040.0, "grad_norm": 1.812121190686056, "language_loss": 0.78028589, "learning_rate": 1.670659182280247e-06, "loss": 0.80111009, "num_input_tokens_seen": 202794505, "step": 9415, "time_per_iteration": 3.0027029514312744 }, { "auxiliary_loss_clip": 0.01020707, "auxiliary_loss_mlp": 0.01005189, "balance_loss_clip": 1.01482093, "balance_loss_mlp": 1.00411057, "epoch": 0.5661205471215993, "flos": 68824022083200.0, "grad_norm": 0.6894107195855314, "language_loss": 0.4917945, "learning_rate": 1.670275043523822e-06, "loss": 0.51205349, "num_input_tokens_seen": 202858580, "step": 9416, "time_per_iteration": 3.564145565032959 }, { "auxiliary_loss_clip": 0.01107627, "auxiliary_loss_mlp": 0.00770936, "balance_loss_clip": 1.04195189, "balance_loss_mlp": 1.00020862, "epoch": 0.5661806703742672, "flos": 28621774177920.0, "grad_norm": 1.657672708695628, "language_loss": 0.62541103, "learning_rate": 1.6698909172706e-06, "loss": 0.64419663, "num_input_tokens_seen": 202878565, "step": 9417, "time_per_iteration": 2.6624128818511963 }, { "auxiliary_loss_clip": 0.01098355, "auxiliary_loss_mlp": 0.01033838, "balance_loss_clip": 1.03992152, "balance_loss_mlp": 1.02003968, "epoch": 0.5662407936269352, "flos": 21397948116480.0, "grad_norm": 1.9219049023075434, "language_loss": 0.68760461, "learning_rate": 1.6695068035351479e-06, "loss": 0.7089265, "num_input_tokens_seen": 202897350, "step": 9418, "time_per_iteration": 2.686701774597168 }, { "auxiliary_loss_clip": 0.0110608, "auxiliary_loss_mlp": 0.01034957, "balance_loss_clip": 1.03848708, "balance_loss_mlp": 1.01997232, "epoch": 0.5663009168796032, "flos": 25660005315840.0, "grad_norm": 1.8426385136450754, "language_loss": 0.65225303, "learning_rate": 1.6691227023320304e-06, "loss": 0.67366338, "num_input_tokens_seen": 202916745, "step": 9419, "time_per_iteration": 2.7483572959899902 }, { "auxiliary_loss_clip": 0.00978175, "auxiliary_loss_mlp": 0.01018666, "balance_loss_clip": 1.01932096, "balance_loss_mlp": 1.01722336, "epoch": 0.5663610401322712, "flos": 67930458422400.0, "grad_norm": 0.7448874820638522, "language_loss": 0.59677726, "learning_rate": 1.6687386136758135e-06, "loss": 0.61674571, "num_input_tokens_seen": 202982375, "step": 9420, "time_per_iteration": 3.422990083694458 }, { "auxiliary_loss_clip": 0.01098663, "auxiliary_loss_mlp": 0.00770427, "balance_loss_clip": 1.0412631, "balance_loss_mlp": 1.00017929, "epoch": 0.5664211633849391, "flos": 24609367480320.0, "grad_norm": 1.5681535851968893, "language_loss": 0.74130625, "learning_rate": 1.6683545375810618e-06, "loss": 0.75999713, "num_input_tokens_seen": 203002430, "step": 9421, "time_per_iteration": 2.8006680011749268 }, { "auxiliary_loss_clip": 0.0108426, "auxiliary_loss_mlp": 0.01035979, "balance_loss_clip": 1.03777134, "balance_loss_mlp": 1.02212119, "epoch": 0.5664812866376071, "flos": 11648811352320.0, "grad_norm": 2.1577016458252567, "language_loss": 0.72988069, "learning_rate": 1.6679704740623389e-06, "loss": 0.75108308, "num_input_tokens_seen": 203019425, "step": 9422, "time_per_iteration": 2.6400234699249268 }, { "auxiliary_loss_clip": 0.01105093, "auxiliary_loss_mlp": 0.01037861, "balance_loss_clip": 1.04141676, "balance_loss_mlp": 1.02530825, "epoch": 0.566541409890275, "flos": 24643985212800.0, "grad_norm": 1.7654112494568213, "language_loss": 0.81893075, "learning_rate": 1.6675864231342085e-06, "loss": 0.84036028, "num_input_tokens_seen": 203039035, "step": 9423, "time_per_iteration": 2.673105239868164 }, { "auxiliary_loss_clip": 0.01090689, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.03944159, "balance_loss_mlp": 1.02356339, "epoch": 0.566601533142943, "flos": 22270577126400.0, "grad_norm": 1.4934148877619189, "language_loss": 0.8075555, "learning_rate": 1.6672023848112353e-06, "loss": 0.82883728, "num_input_tokens_seen": 203059320, "step": 9424, "time_per_iteration": 2.6597039699554443 }, { "auxiliary_loss_clip": 0.01124321, "auxiliary_loss_mlp": 0.00771519, "balance_loss_clip": 1.04382432, "balance_loss_mlp": 1.00018978, "epoch": 0.5666616563956111, "flos": 29971656218880.0, "grad_norm": 2.0092362269175297, "language_loss": 0.78882873, "learning_rate": 1.6668183591079805e-06, "loss": 0.80778712, "num_input_tokens_seen": 203078490, "step": 9425, "time_per_iteration": 2.6688153743743896 }, { "auxiliary_loss_clip": 0.01090837, "auxiliary_loss_mlp": 0.01034858, "balance_loss_clip": 1.0417583, "balance_loss_mlp": 1.02170324, "epoch": 0.566721779648279, "flos": 17781456101760.0, "grad_norm": 1.976091068193849, "language_loss": 0.5920769, "learning_rate": 1.6664343460390064e-06, "loss": 0.61333382, "num_input_tokens_seen": 203096065, "step": 9426, "time_per_iteration": 2.6646664142608643 }, { "auxiliary_loss_clip": 0.01110034, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.04102027, "balance_loss_mlp": 1.01922381, "epoch": 0.566781902900947, "flos": 21033490769280.0, "grad_norm": 2.110311025280775, "language_loss": 0.81678975, "learning_rate": 1.6660503456188764e-06, "loss": 0.83821344, "num_input_tokens_seen": 203115270, "step": 9427, "time_per_iteration": 5.8222620487213135 }, { "auxiliary_loss_clip": 0.01117064, "auxiliary_loss_mlp": 0.01038278, "balance_loss_clip": 1.04323864, "balance_loss_mlp": 1.02506411, "epoch": 0.5668420261536149, "flos": 23148593176320.0, "grad_norm": 1.814267468057716, "language_loss": 0.86105633, "learning_rate": 1.6656663578621498e-06, "loss": 0.88260972, "num_input_tokens_seen": 203134290, "step": 9428, "time_per_iteration": 4.0940985679626465 }, { "auxiliary_loss_clip": 0.01102233, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.04397511, "balance_loss_mlp": 1.01996648, "epoch": 0.5669021494062829, "flos": 22601601889920.0, "grad_norm": 2.604927880391597, "language_loss": 0.73541754, "learning_rate": 1.6652823827833886e-06, "loss": 0.75678086, "num_input_tokens_seen": 203152935, "step": 9429, "time_per_iteration": 2.711982011795044 }, { "auxiliary_loss_clip": 0.01100688, "auxiliary_loss_mlp": 0.00772268, "balance_loss_clip": 1.04164147, "balance_loss_mlp": 1.00020123, "epoch": 0.5669622726589508, "flos": 17381231786880.0, "grad_norm": 3.499205688936759, "language_loss": 0.75380534, "learning_rate": 1.6648984203971538e-06, "loss": 0.77253491, "num_input_tokens_seen": 203170110, "step": 9430, "time_per_iteration": 2.775536060333252 }, { "auxiliary_loss_clip": 0.0111876, "auxiliary_loss_mlp": 0.01036284, "balance_loss_clip": 1.04125142, "balance_loss_mlp": 1.02263451, "epoch": 0.5670223959116188, "flos": 18763253521920.0, "grad_norm": 1.7932678929965582, "language_loss": 0.72862244, "learning_rate": 1.6645144707180032e-06, "loss": 0.75017291, "num_input_tokens_seen": 203188825, "step": 9431, "time_per_iteration": 2.7299160957336426 }, { "auxiliary_loss_clip": 0.01068382, "auxiliary_loss_mlp": 0.01037407, "balance_loss_clip": 1.03856969, "balance_loss_mlp": 1.02459264, "epoch": 0.5670825191642868, "flos": 13553334276480.0, "grad_norm": 1.899230938499918, "language_loss": 0.73544705, "learning_rate": 1.6641305337604984e-06, "loss": 0.75650489, "num_input_tokens_seen": 203206860, "step": 9432, "time_per_iteration": 2.68713641166687 }, { "auxiliary_loss_clip": 0.01066627, "auxiliary_loss_mlp": 0.01032044, "balance_loss_clip": 1.03716183, "balance_loss_mlp": 1.01875782, "epoch": 0.5671426424169548, "flos": 22054035985920.0, "grad_norm": 1.4657818599236931, "language_loss": 0.78099382, "learning_rate": 1.663746609539197e-06, "loss": 0.80198044, "num_input_tokens_seen": 203225625, "step": 9433, "time_per_iteration": 4.3982954025268555 }, { "auxiliary_loss_clip": 0.01123451, "auxiliary_loss_mlp": 0.01038623, "balance_loss_clip": 1.04226542, "balance_loss_mlp": 1.02239299, "epoch": 0.5672027656696227, "flos": 21323972056320.0, "grad_norm": 1.9415050552486373, "language_loss": 0.6311425, "learning_rate": 1.6633626980686582e-06, "loss": 0.65276325, "num_input_tokens_seen": 203242920, "step": 9434, "time_per_iteration": 2.6829726696014404 }, { "auxiliary_loss_clip": 0.01106985, "auxiliary_loss_mlp": 0.01029655, "balance_loss_clip": 1.04066229, "balance_loss_mlp": 1.01654196, "epoch": 0.5672628889222907, "flos": 23514056104320.0, "grad_norm": 2.0456781967901025, "language_loss": 0.66337132, "learning_rate": 1.6629787993634399e-06, "loss": 0.68473774, "num_input_tokens_seen": 203261995, "step": 9435, "time_per_iteration": 2.7055511474609375 }, { "auxiliary_loss_clip": 0.01092568, "auxiliary_loss_mlp": 0.00770808, "balance_loss_clip": 1.03747869, "balance_loss_mlp": 1.00008333, "epoch": 0.5673230121749586, "flos": 27121928855040.0, "grad_norm": 1.9714061310868114, "language_loss": 0.71574509, "learning_rate": 1.6625949134380984e-06, "loss": 0.73437893, "num_input_tokens_seen": 203280670, "step": 9436, "time_per_iteration": 2.7314302921295166 }, { "auxiliary_loss_clip": 0.01119804, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.041466, "balance_loss_mlp": 1.02099752, "epoch": 0.5673831354276266, "flos": 31141985149440.0, "grad_norm": 1.474374193730658, "language_loss": 0.7411499, "learning_rate": 1.6622110403071921e-06, "loss": 0.76269662, "num_input_tokens_seen": 203304800, "step": 9437, "time_per_iteration": 2.6829545497894287 }, { "auxiliary_loss_clip": 0.01115825, "auxiliary_loss_mlp": 0.01036618, "balance_loss_clip": 1.04766893, "balance_loss_mlp": 1.02231264, "epoch": 0.5674432586802945, "flos": 27673193859840.0, "grad_norm": 2.0226289672132096, "language_loss": 0.6118415, "learning_rate": 1.661827179985277e-06, "loss": 0.63336593, "num_input_tokens_seen": 203324060, "step": 9438, "time_per_iteration": 2.6840946674346924 }, { "auxiliary_loss_clip": 0.01097885, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.03924835, "balance_loss_mlp": 1.0185318, "epoch": 0.5675033819329626, "flos": 26615157822720.0, "grad_norm": 1.5530482991602657, "language_loss": 0.75020033, "learning_rate": 1.661443332486909e-06, "loss": 0.77150226, "num_input_tokens_seen": 203344360, "step": 9439, "time_per_iteration": 2.6898789405822754 }, { "auxiliary_loss_clip": 0.01092055, "auxiliary_loss_mlp": 0.01036149, "balance_loss_clip": 1.04008341, "balance_loss_mlp": 1.02168322, "epoch": 0.5675635051856306, "flos": 19098372435840.0, "grad_norm": 1.924986803502997, "language_loss": 0.83848387, "learning_rate": 1.6610594978266438e-06, "loss": 0.85976589, "num_input_tokens_seen": 203362115, "step": 9440, "time_per_iteration": 2.7438228130340576 }, { "auxiliary_loss_clip": 0.01087383, "auxiliary_loss_mlp": 0.01036961, "balance_loss_clip": 1.0389899, "balance_loss_mlp": 1.02264404, "epoch": 0.5676236284382985, "flos": 17566315591680.0, "grad_norm": 3.3538120018942843, "language_loss": 0.75190175, "learning_rate": 1.6606756760190365e-06, "loss": 0.7731452, "num_input_tokens_seen": 203380550, "step": 9441, "time_per_iteration": 2.6487948894500732 }, { "auxiliary_loss_clip": 0.01066366, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.03523147, "balance_loss_mlp": 1.02376556, "epoch": 0.5676837516909665, "flos": 15954069634560.0, "grad_norm": 1.8078445069287523, "language_loss": 0.83109975, "learning_rate": 1.6602918670786413e-06, "loss": 0.85213792, "num_input_tokens_seen": 203396590, "step": 9442, "time_per_iteration": 2.692474842071533 }, { "auxiliary_loss_clip": 0.01083606, "auxiliary_loss_mlp": 0.0103585, "balance_loss_clip": 1.04210138, "balance_loss_mlp": 1.02311242, "epoch": 0.5677438749436344, "flos": 18295912644480.0, "grad_norm": 2.0214699890453414, "language_loss": 0.74567246, "learning_rate": 1.6599080710200126e-06, "loss": 0.76686704, "num_input_tokens_seen": 203414280, "step": 9443, "time_per_iteration": 2.742173433303833 }, { "auxiliary_loss_clip": 0.01093942, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.04245853, "balance_loss_mlp": 1.02310669, "epoch": 0.5678039981963025, "flos": 17931311642880.0, "grad_norm": 2.2236359492875817, "language_loss": 0.77068752, "learning_rate": 1.6595242878577046e-06, "loss": 0.79199237, "num_input_tokens_seen": 203433280, "step": 9444, "time_per_iteration": 2.65165376663208 }, { "auxiliary_loss_clip": 0.01083168, "auxiliary_loss_mlp": 0.01042977, "balance_loss_clip": 1.04132152, "balance_loss_mlp": 1.02910697, "epoch": 0.5678641214489704, "flos": 19316350120320.0, "grad_norm": 1.9769562357276376, "language_loss": 0.80988097, "learning_rate": 1.6591405176062687e-06, "loss": 0.83114243, "num_input_tokens_seen": 203449935, "step": 9445, "time_per_iteration": 2.692103147506714 }, { "auxiliary_loss_clip": 0.01115981, "auxiliary_loss_mlp": 0.01030041, "balance_loss_clip": 1.03910589, "balance_loss_mlp": 1.01635599, "epoch": 0.5679242447016384, "flos": 27751084502400.0, "grad_norm": 1.8145653139656197, "language_loss": 0.71126974, "learning_rate": 1.658756760280259e-06, "loss": 0.73272997, "num_input_tokens_seen": 203473025, "step": 9446, "time_per_iteration": 2.6656479835510254 }, { "auxiliary_loss_clip": 0.01084809, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.03896558, "balance_loss_mlp": 1.01640046, "epoch": 0.5679843679543063, "flos": 23769093646080.0, "grad_norm": 1.9173533022587075, "language_loss": 0.73434311, "learning_rate": 1.6583730158942276e-06, "loss": 0.75548959, "num_input_tokens_seen": 203492895, "step": 9447, "time_per_iteration": 2.7948012351989746 }, { "auxiliary_loss_clip": 0.01099661, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.04186499, "balance_loss_mlp": 1.02139819, "epoch": 0.5680444912069743, "flos": 25591883172480.0, "grad_norm": 3.5475375147623294, "language_loss": 0.7504915, "learning_rate": 1.657989284462725e-06, "loss": 0.77183461, "num_input_tokens_seen": 203513710, "step": 9448, "time_per_iteration": 2.700333595275879 }, { "auxiliary_loss_clip": 0.01079167, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.04264426, "balance_loss_mlp": 1.0336951, "epoch": 0.5681046144596422, "flos": 23695799944320.0, "grad_norm": 2.3399913967333865, "language_loss": 0.76352537, "learning_rate": 1.6576055660003038e-06, "loss": 0.78480804, "num_input_tokens_seen": 203531630, "step": 9449, "time_per_iteration": 2.7736854553222656 }, { "auxiliary_loss_clip": 0.01096359, "auxiliary_loss_mlp": 0.01042326, "balance_loss_clip": 1.04059768, "balance_loss_mlp": 1.02729404, "epoch": 0.5681647377123102, "flos": 28000770917760.0, "grad_norm": 1.7507923980752478, "language_loss": 0.74660265, "learning_rate": 1.6572218605215128e-06, "loss": 0.76798952, "num_input_tokens_seen": 203551885, "step": 9450, "time_per_iteration": 2.749420642852783 }, { "auxiliary_loss_clip": 0.01102012, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.04193068, "balance_loss_mlp": 1.02674794, "epoch": 0.5682248609649782, "flos": 22747758330240.0, "grad_norm": 2.689223250754005, "language_loss": 0.66906244, "learning_rate": 1.6568381680409038e-06, "loss": 0.69047868, "num_input_tokens_seen": 203572250, "step": 9451, "time_per_iteration": 2.753199338912964 }, { "auxiliary_loss_clip": 0.01096067, "auxiliary_loss_mlp": 0.01038718, "balance_loss_clip": 1.03942561, "balance_loss_mlp": 1.02265501, "epoch": 0.5682849842176462, "flos": 21288600138240.0, "grad_norm": 3.0838986562683557, "language_loss": 0.71882987, "learning_rate": 1.656454488573026e-06, "loss": 0.74017769, "num_input_tokens_seen": 203590605, "step": 9452, "time_per_iteration": 2.6950924396514893 }, { "auxiliary_loss_clip": 0.01076417, "auxiliary_loss_mlp": 0.01030065, "balance_loss_clip": 1.03938448, "balance_loss_mlp": 1.01734543, "epoch": 0.5683451074703142, "flos": 21141689512320.0, "grad_norm": 1.8642874843773423, "language_loss": 0.70013601, "learning_rate": 1.656070822132428e-06, "loss": 0.72120082, "num_input_tokens_seen": 203610080, "step": 9453, "time_per_iteration": 2.7006165981292725 }, { "auxiliary_loss_clip": 0.01076829, "auxiliary_loss_mlp": 0.00769854, "balance_loss_clip": 1.04066825, "balance_loss_mlp": 1.00014949, "epoch": 0.5684052307229821, "flos": 22344481359360.0, "grad_norm": 2.037972918051024, "language_loss": 0.70139372, "learning_rate": 1.6556871687336592e-06, "loss": 0.71986055, "num_input_tokens_seen": 203630060, "step": 9454, "time_per_iteration": 2.759376287460327 }, { "auxiliary_loss_clip": 0.01095428, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.03987896, "balance_loss_mlp": 1.01938248, "epoch": 0.5684653539756501, "flos": 21798639308160.0, "grad_norm": 1.989743078970872, "language_loss": 0.6078186, "learning_rate": 1.6553035283912671e-06, "loss": 0.62909198, "num_input_tokens_seen": 203649065, "step": 9455, "time_per_iteration": 2.678152322769165 }, { "auxiliary_loss_clip": 0.01082741, "auxiliary_loss_mlp": 0.0103652, "balance_loss_clip": 1.0447154, "balance_loss_mlp": 1.02253652, "epoch": 0.568525477228318, "flos": 22999635475200.0, "grad_norm": 4.296474832454859, "language_loss": 0.73108375, "learning_rate": 1.6549199011198e-06, "loss": 0.75227636, "num_input_tokens_seen": 203667545, "step": 9456, "time_per_iteration": 2.7307004928588867 }, { "auxiliary_loss_clip": 0.01099598, "auxiliary_loss_mlp": 0.01031688, "balance_loss_clip": 1.04188192, "balance_loss_mlp": 1.01902199, "epoch": 0.568585600480986, "flos": 21392489249280.0, "grad_norm": 1.662795047431792, "language_loss": 0.77013254, "learning_rate": 1.6545362869338048e-06, "loss": 0.79144537, "num_input_tokens_seen": 203686025, "step": 9457, "time_per_iteration": 2.665708303451538 }, { "auxiliary_loss_clip": 0.01111194, "auxiliary_loss_mlp": 0.01036842, "balance_loss_clip": 1.0429163, "balance_loss_mlp": 1.02280521, "epoch": 0.568645723733654, "flos": 30007351359360.0, "grad_norm": 2.0672888051412817, "language_loss": 0.66191971, "learning_rate": 1.6541526858478285e-06, "loss": 0.68340003, "num_input_tokens_seen": 203705540, "step": 9458, "time_per_iteration": 2.780771017074585 }, { "auxiliary_loss_clip": 0.01110997, "auxiliary_loss_mlp": 0.01031454, "balance_loss_clip": 1.04201722, "balance_loss_mlp": 1.01742291, "epoch": 0.568705846986322, "flos": 20412667077120.0, "grad_norm": 2.504426538314312, "language_loss": 0.68920743, "learning_rate": 1.6537690978764167e-06, "loss": 0.71063197, "num_input_tokens_seen": 203723670, "step": 9459, "time_per_iteration": 2.637176513671875 }, { "auxiliary_loss_clip": 0.01095236, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.0442152, "balance_loss_mlp": 1.01929152, "epoch": 0.5687659702389899, "flos": 17456752131840.0, "grad_norm": 2.127788828908428, "language_loss": 0.76758575, "learning_rate": 1.6533855230341155e-06, "loss": 0.788867, "num_input_tokens_seen": 203739705, "step": 9460, "time_per_iteration": 2.7338075637817383 }, { "auxiliary_loss_clip": 0.01066336, "auxiliary_loss_mlp": 0.0103936, "balance_loss_clip": 1.04204893, "balance_loss_mlp": 1.02563262, "epoch": 0.5688260934916579, "flos": 25406081095680.0, "grad_norm": 1.8378075196350074, "language_loss": 0.71994978, "learning_rate": 1.65300196133547e-06, "loss": 0.74100673, "num_input_tokens_seen": 203759000, "step": 9461, "time_per_iteration": 2.9295692443847656 }, { "auxiliary_loss_clip": 0.01110974, "auxiliary_loss_mlp": 0.01036974, "balance_loss_clip": 1.04267561, "balance_loss_mlp": 1.02314544, "epoch": 0.5688862167443258, "flos": 21608024808960.0, "grad_norm": 2.3363777583338794, "language_loss": 0.73092425, "learning_rate": 1.6526184127950249e-06, "loss": 0.75240374, "num_input_tokens_seen": 203774295, "step": 9462, "time_per_iteration": 2.639132022857666 }, { "auxiliary_loss_clip": 0.01105415, "auxiliary_loss_mlp": 0.01026496, "balance_loss_clip": 1.03986573, "balance_loss_mlp": 1.01507592, "epoch": 0.5689463399969938, "flos": 22418996123520.0, "grad_norm": 1.9966058203681178, "language_loss": 0.72878397, "learning_rate": 1.6522348774273246e-06, "loss": 0.75010306, "num_input_tokens_seen": 203792710, "step": 9463, "time_per_iteration": 2.687623977661133 }, { "auxiliary_loss_clip": 0.01108157, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.04214895, "balance_loss_mlp": 1.02012718, "epoch": 0.5690064632496618, "flos": 18296810484480.0, "grad_norm": 2.136514167684146, "language_loss": 0.73800778, "learning_rate": 1.6518513552469123e-06, "loss": 0.75941622, "num_input_tokens_seen": 203811645, "step": 9464, "time_per_iteration": 2.6446449756622314 }, { "auxiliary_loss_clip": 0.01110623, "auxiliary_loss_mlp": 0.0077176, "balance_loss_clip": 1.04163098, "balance_loss_mlp": 1.00012827, "epoch": 0.5690665865023298, "flos": 21579260993280.0, "grad_norm": 2.0135063282733108, "language_loss": 0.84068149, "learning_rate": 1.6514678462683312e-06, "loss": 0.85950536, "num_input_tokens_seen": 203830040, "step": 9465, "time_per_iteration": 2.6243364810943604 }, { "auxiliary_loss_clip": 0.01092541, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.03678536, "balance_loss_mlp": 1.0195086, "epoch": 0.5691267097549978, "flos": 24421446501120.0, "grad_norm": 1.6434295280058835, "language_loss": 0.72125626, "learning_rate": 1.651084350506125e-06, "loss": 0.74250448, "num_input_tokens_seen": 203851245, "step": 9466, "time_per_iteration": 5.837533712387085 }, { "auxiliary_loss_clip": 0.01016007, "auxiliary_loss_mlp": 0.01001581, "balance_loss_clip": 1.01873374, "balance_loss_mlp": 1.00037718, "epoch": 0.5691868330076657, "flos": 61657906199040.0, "grad_norm": 0.7155703714304625, "language_loss": 0.55334294, "learning_rate": 1.6507008679748343e-06, "loss": 0.57351875, "num_input_tokens_seen": 203916400, "step": 9467, "time_per_iteration": 4.8396992683410645 }, { "auxiliary_loss_clip": 0.01107605, "auxiliary_loss_mlp": 0.0103869, "balance_loss_clip": 1.04263473, "balance_loss_mlp": 1.02364564, "epoch": 0.5692469562603337, "flos": 21325193118720.0, "grad_norm": 16.186384536861027, "language_loss": 0.6343258, "learning_rate": 1.6503173986890023e-06, "loss": 0.65578872, "num_input_tokens_seen": 203935870, "step": 9468, "time_per_iteration": 2.6212332248687744 }, { "auxiliary_loss_clip": 0.01066902, "auxiliary_loss_mlp": 0.01038069, "balance_loss_clip": 1.03614831, "balance_loss_mlp": 1.02334094, "epoch": 0.5693070795130016, "flos": 23367899664000.0, "grad_norm": 2.927691999708818, "language_loss": 0.78902012, "learning_rate": 1.64993394266317e-06, "loss": 0.81006986, "num_input_tokens_seen": 203954950, "step": 9469, "time_per_iteration": 2.745016098022461 }, { "auxiliary_loss_clip": 0.01085393, "auxiliary_loss_mlp": 0.01053274, "balance_loss_clip": 1.04159784, "balance_loss_mlp": 1.03830147, "epoch": 0.5693672027656697, "flos": 18697250280960.0, "grad_norm": 2.217720738619104, "language_loss": 0.69655335, "learning_rate": 1.6495504999118769e-06, "loss": 0.71793997, "num_input_tokens_seen": 203972715, "step": 9470, "time_per_iteration": 2.6895534992218018 }, { "auxiliary_loss_clip": 0.01097198, "auxiliary_loss_mlp": 0.01036868, "balance_loss_clip": 1.04529762, "balance_loss_mlp": 1.02352285, "epoch": 0.5694273260183376, "flos": 20449188230400.0, "grad_norm": 1.6026966116267123, "language_loss": 0.74473977, "learning_rate": 1.6491670704496644e-06, "loss": 0.76608038, "num_input_tokens_seen": 203990775, "step": 9471, "time_per_iteration": 2.6734213829040527 }, { "auxiliary_loss_clip": 0.01077759, "auxiliary_loss_mlp": 0.01040388, "balance_loss_clip": 1.0421195, "balance_loss_mlp": 1.02579701, "epoch": 0.5694874492710056, "flos": 17603195880960.0, "grad_norm": 1.75714793559233, "language_loss": 0.57588744, "learning_rate": 1.6487836542910716e-06, "loss": 0.59706891, "num_input_tokens_seen": 204008845, "step": 9472, "time_per_iteration": 4.335491180419922 }, { "auxiliary_loss_clip": 0.01082559, "auxiliary_loss_mlp": 0.01032344, "balance_loss_clip": 1.03902221, "balance_loss_mlp": 1.01946378, "epoch": 0.5695475725236735, "flos": 13370836250880.0, "grad_norm": 1.9281443896441626, "language_loss": 0.73845899, "learning_rate": 1.648400251450638e-06, "loss": 0.75960797, "num_input_tokens_seen": 204023755, "step": 9473, "time_per_iteration": 2.706148147583008 }, { "auxiliary_loss_clip": 0.01017729, "auxiliary_loss_mlp": 0.01007582, "balance_loss_clip": 1.02078795, "balance_loss_mlp": 1.00631857, "epoch": 0.5696076957763415, "flos": 68174398661760.0, "grad_norm": 0.6469732305814715, "language_loss": 0.57547617, "learning_rate": 1.6480168619429023e-06, "loss": 0.59572935, "num_input_tokens_seen": 204091255, "step": 9474, "time_per_iteration": 3.2811825275421143 }, { "auxiliary_loss_clip": 0.01106855, "auxiliary_loss_mlp": 0.01038889, "balance_loss_clip": 1.04254341, "balance_loss_mlp": 1.02532923, "epoch": 0.5696678190290094, "flos": 33838301525760.0, "grad_norm": 2.207374996280549, "language_loss": 0.53488398, "learning_rate": 1.6476334857824017e-06, "loss": 0.55634141, "num_input_tokens_seen": 204113285, "step": 9475, "time_per_iteration": 2.701791524887085 }, { "auxiliary_loss_clip": 0.01122912, "auxiliary_loss_mlp": 0.01039618, "balance_loss_clip": 1.04524517, "balance_loss_mlp": 1.0262965, "epoch": 0.5697279422816774, "flos": 26356600748160.0, "grad_norm": 1.6070261580589493, "language_loss": 0.79622197, "learning_rate": 1.647250122983675e-06, "loss": 0.81784725, "num_input_tokens_seen": 204133045, "step": 9476, "time_per_iteration": 2.695966958999634 }, { "auxiliary_loss_clip": 0.01101607, "auxiliary_loss_mlp": 0.01038712, "balance_loss_clip": 1.04603529, "balance_loss_mlp": 1.0258019, "epoch": 0.5697880655343454, "flos": 22930507751040.0, "grad_norm": 1.9576279407758228, "language_loss": 0.66811013, "learning_rate": 1.6468667735612592e-06, "loss": 0.68951333, "num_input_tokens_seen": 204152590, "step": 9477, "time_per_iteration": 2.6981940269470215 }, { "auxiliary_loss_clip": 0.0108821, "auxiliary_loss_mlp": 0.01037709, "balance_loss_clip": 1.04286826, "balance_loss_mlp": 1.02403569, "epoch": 0.5698481887870134, "flos": 26761314263040.0, "grad_norm": 1.587062911340377, "language_loss": 0.70738614, "learning_rate": 1.6464834375296906e-06, "loss": 0.72864532, "num_input_tokens_seen": 204171815, "step": 9478, "time_per_iteration": 2.779813766479492 }, { "auxiliary_loss_clip": 0.01084042, "auxiliary_loss_mlp": 0.01031832, "balance_loss_clip": 1.03916287, "balance_loss_mlp": 1.0200479, "epoch": 0.5699083120396814, "flos": 15742269089280.0, "grad_norm": 4.484039953055517, "language_loss": 0.6938777, "learning_rate": 1.6461001149035055e-06, "loss": 0.71503651, "num_input_tokens_seen": 204188535, "step": 9479, "time_per_iteration": 2.712655782699585 }, { "auxiliary_loss_clip": 0.01078443, "auxiliary_loss_mlp": 0.01033369, "balance_loss_clip": 1.04121661, "balance_loss_mlp": 1.02166843, "epoch": 0.5699684352923493, "flos": 19537272720000.0, "grad_norm": 2.2062311419155205, "language_loss": 0.71329868, "learning_rate": 1.6457168056972392e-06, "loss": 0.73441678, "num_input_tokens_seen": 204208365, "step": 9480, "time_per_iteration": 2.727628469467163 }, { "auxiliary_loss_clip": 0.01089043, "auxiliary_loss_mlp": 0.00769268, "balance_loss_clip": 1.04188204, "balance_loss_mlp": 1.00015211, "epoch": 0.5700285585450173, "flos": 16253349753600.0, "grad_norm": 2.49302312393396, "language_loss": 0.7201618, "learning_rate": 1.6453335099254276e-06, "loss": 0.73874491, "num_input_tokens_seen": 204226560, "step": 9481, "time_per_iteration": 2.6870779991149902 }, { "auxiliary_loss_clip": 0.01111632, "auxiliary_loss_mlp": 0.01037308, "balance_loss_clip": 1.04494166, "balance_loss_mlp": 1.02441525, "epoch": 0.5700886817976852, "flos": 19864993432320.0, "grad_norm": 2.3265371075794046, "language_loss": 0.78086042, "learning_rate": 1.6449502276026041e-06, "loss": 0.80234993, "num_input_tokens_seen": 204245410, "step": 9482, "time_per_iteration": 2.648545742034912 }, { "auxiliary_loss_clip": 0.01099058, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.04446602, "balance_loss_mlp": 1.01918221, "epoch": 0.5701488050503533, "flos": 23841704989440.0, "grad_norm": 1.4982420423731841, "language_loss": 0.77999502, "learning_rate": 1.6445669587433043e-06, "loss": 0.80129617, "num_input_tokens_seen": 204264840, "step": 9483, "time_per_iteration": 2.716085910797119 }, { "auxiliary_loss_clip": 0.01098634, "auxiliary_loss_mlp": 0.01043773, "balance_loss_clip": 1.04435062, "balance_loss_mlp": 1.03189337, "epoch": 0.5702089283030212, "flos": 23659673840640.0, "grad_norm": 1.773078274148673, "language_loss": 0.81291378, "learning_rate": 1.6441837033620612e-06, "loss": 0.83433783, "num_input_tokens_seen": 204284335, "step": 9484, "time_per_iteration": 2.7283802032470703 }, { "auxiliary_loss_clip": 0.01120809, "auxiliary_loss_mlp": 0.0077026, "balance_loss_clip": 1.04378128, "balance_loss_mlp": 1.00009394, "epoch": 0.5702690515556892, "flos": 27891171544320.0, "grad_norm": 294.9687469035841, "language_loss": 0.60670495, "learning_rate": 1.6438004614734073e-06, "loss": 0.6256156, "num_input_tokens_seen": 204302590, "step": 9485, "time_per_iteration": 2.7182137966156006 }, { "auxiliary_loss_clip": 0.01107456, "auxiliary_loss_mlp": 0.01033766, "balance_loss_clip": 1.04291701, "balance_loss_mlp": 1.02048063, "epoch": 0.5703291748083571, "flos": 24023951619840.0, "grad_norm": 2.0199937842049676, "language_loss": 0.65740418, "learning_rate": 1.6434172330918757e-06, "loss": 0.67881644, "num_input_tokens_seen": 204323055, "step": 9486, "time_per_iteration": 2.7076590061187744 }, { "auxiliary_loss_clip": 0.01026531, "auxiliary_loss_mlp": 0.01001416, "balance_loss_clip": 1.02014589, "balance_loss_mlp": 1.00029588, "epoch": 0.5703892980610251, "flos": 57023382919680.0, "grad_norm": 0.6682653451732087, "language_loss": 0.47990364, "learning_rate": 1.6430340182319978e-06, "loss": 0.50018317, "num_input_tokens_seen": 204386160, "step": 9487, "time_per_iteration": 3.3227086067199707 }, { "auxiliary_loss_clip": 0.0108502, "auxiliary_loss_mlp": 0.00770885, "balance_loss_clip": 1.04171848, "balance_loss_mlp": 1.00012314, "epoch": 0.570449421313693, "flos": 24351025887360.0, "grad_norm": 1.5998860502141972, "language_loss": 0.85676056, "learning_rate": 1.6426508169083067e-06, "loss": 0.87531954, "num_input_tokens_seen": 204406315, "step": 9488, "time_per_iteration": 2.7443041801452637 }, { "auxiliary_loss_clip": 0.01084932, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.04087424, "balance_loss_mlp": 1.02245951, "epoch": 0.570509544566361, "flos": 24828566227200.0, "grad_norm": 1.4382001019160457, "language_loss": 0.78847331, "learning_rate": 1.6422676291353314e-06, "loss": 0.80968434, "num_input_tokens_seen": 204427645, "step": 9489, "time_per_iteration": 2.7456719875335693 }, { "auxiliary_loss_clip": 0.01099206, "auxiliary_loss_mlp": 0.01028445, "balance_loss_clip": 1.04345155, "balance_loss_mlp": 1.01655364, "epoch": 0.570569667819029, "flos": 21397301671680.0, "grad_norm": 1.7750907148912565, "language_loss": 0.70044166, "learning_rate": 1.641884454927604e-06, "loss": 0.72171819, "num_input_tokens_seen": 204445910, "step": 9490, "time_per_iteration": 2.646172046661377 }, { "auxiliary_loss_clip": 0.01085076, "auxiliary_loss_mlp": 0.0103304, "balance_loss_clip": 1.04102945, "balance_loss_mlp": 1.02055264, "epoch": 0.570629791071697, "flos": 23216751233280.0, "grad_norm": 1.5662629922292932, "language_loss": 0.76374, "learning_rate": 1.6415012942996548e-06, "loss": 0.78492117, "num_input_tokens_seen": 204464680, "step": 9491, "time_per_iteration": 2.686228036880493 }, { "auxiliary_loss_clip": 0.01010704, "auxiliary_loss_mlp": 0.0075136, "balance_loss_clip": 1.01657176, "balance_loss_mlp": 0.99964297, "epoch": 0.570689914324365, "flos": 65284666525440.0, "grad_norm": 0.7940313966382696, "language_loss": 0.57365447, "learning_rate": 1.641118147266011e-06, "loss": 0.5912751, "num_input_tokens_seen": 204525580, "step": 9492, "time_per_iteration": 3.275951623916626 }, { "auxiliary_loss_clip": 0.01091927, "auxiliary_loss_mlp": 0.00770164, "balance_loss_clip": 1.0425539, "balance_loss_mlp": 1.00009966, "epoch": 0.5707500375770329, "flos": 21141904993920.0, "grad_norm": 1.811585397599456, "language_loss": 0.71563506, "learning_rate": 1.6407350138412035e-06, "loss": 0.73425597, "num_input_tokens_seen": 204541320, "step": 9493, "time_per_iteration": 2.6741974353790283 }, { "auxiliary_loss_clip": 0.0112282, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.0450213, "balance_loss_mlp": 1.01957655, "epoch": 0.5708101608297009, "flos": 20812747737600.0, "grad_norm": 1.647557383472974, "language_loss": 0.7782768, "learning_rate": 1.6403518940397606e-06, "loss": 0.79982895, "num_input_tokens_seen": 204560275, "step": 9494, "time_per_iteration": 2.6302967071533203 }, { "auxiliary_loss_clip": 0.01124725, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.04463601, "balance_loss_mlp": 1.01786041, "epoch": 0.5708702840823688, "flos": 25812338895360.0, "grad_norm": 2.0991801198395166, "language_loss": 0.80634642, "learning_rate": 1.6399687878762096e-06, "loss": 0.82790697, "num_input_tokens_seen": 204579430, "step": 9495, "time_per_iteration": 2.628124237060547 }, { "auxiliary_loss_clip": 0.01077213, "auxiliary_loss_mlp": 0.01041189, "balance_loss_clip": 1.03985035, "balance_loss_mlp": 1.02567959, "epoch": 0.5709304073350369, "flos": 23651916503040.0, "grad_norm": 2.1559343585674067, "language_loss": 0.66669941, "learning_rate": 1.6395856953650784e-06, "loss": 0.68788344, "num_input_tokens_seen": 204597710, "step": 9496, "time_per_iteration": 2.7877724170684814 }, { "auxiliary_loss_clip": 0.01125369, "auxiliary_loss_mlp": 0.01038193, "balance_loss_clip": 1.04586279, "balance_loss_mlp": 1.02479351, "epoch": 0.5709905305877048, "flos": 16107552449280.0, "grad_norm": 2.6392695697640387, "language_loss": 0.69406897, "learning_rate": 1.6392026165208938e-06, "loss": 0.71570456, "num_input_tokens_seen": 204616140, "step": 9497, "time_per_iteration": 2.5715434551239014 }, { "auxiliary_loss_clip": 0.01107343, "auxiliary_loss_mlp": 0.00770833, "balance_loss_clip": 1.04470205, "balance_loss_mlp": 1.00010204, "epoch": 0.5710506538403728, "flos": 24750819239040.0, "grad_norm": 2.381002532737965, "language_loss": 0.81296104, "learning_rate": 1.638819551358182e-06, "loss": 0.83174282, "num_input_tokens_seen": 204636470, "step": 9498, "time_per_iteration": 2.7146875858306885 }, { "auxiliary_loss_clip": 0.01122241, "auxiliary_loss_mlp": 0.01039082, "balance_loss_clip": 1.04371977, "balance_loss_mlp": 1.02453244, "epoch": 0.5711107770930407, "flos": 21982250655360.0, "grad_norm": 1.8640767096069095, "language_loss": 0.66366005, "learning_rate": 1.638436499891469e-06, "loss": 0.68527335, "num_input_tokens_seen": 204656640, "step": 9499, "time_per_iteration": 2.59460711479187 }, { "auxiliary_loss_clip": 0.01090983, "auxiliary_loss_mlp": 0.01034376, "balance_loss_clip": 1.04218864, "balance_loss_mlp": 1.02126861, "epoch": 0.5711709003457087, "flos": 19574009354880.0, "grad_norm": 1.5439081268362653, "language_loss": 0.71755552, "learning_rate": 1.6380534621352805e-06, "loss": 0.73880911, "num_input_tokens_seen": 204675475, "step": 9500, "time_per_iteration": 2.6723949909210205 }, { "auxiliary_loss_clip": 0.01092856, "auxiliary_loss_mlp": 0.01032614, "balance_loss_clip": 1.04149878, "balance_loss_mlp": 1.01973963, "epoch": 0.5712310235983766, "flos": 24242683489920.0, "grad_norm": 1.9336466751975971, "language_loss": 0.76224887, "learning_rate": 1.6376704381041407e-06, "loss": 0.78350353, "num_input_tokens_seen": 204695385, "step": 9501, "time_per_iteration": 2.7653119564056396 }, { "auxiliary_loss_clip": 0.01101056, "auxiliary_loss_mlp": 0.01035695, "balance_loss_clip": 1.04289281, "balance_loss_mlp": 1.02269506, "epoch": 0.5712911468510447, "flos": 20996143603200.0, "grad_norm": 1.6146609274124086, "language_loss": 0.75141633, "learning_rate": 1.6372874278125742e-06, "loss": 0.77278382, "num_input_tokens_seen": 204714730, "step": 9502, "time_per_iteration": 2.6820828914642334 }, { "auxiliary_loss_clip": 0.01088314, "auxiliary_loss_mlp": 0.01027948, "balance_loss_clip": 1.04387522, "balance_loss_mlp": 1.01492405, "epoch": 0.5713512701037126, "flos": 18916987731840.0, "grad_norm": 1.5621825440350152, "language_loss": 0.82271576, "learning_rate": 1.636904431275105e-06, "loss": 0.84387839, "num_input_tokens_seen": 204735025, "step": 9503, "time_per_iteration": 2.663109302520752 }, { "auxiliary_loss_clip": 0.01085944, "auxiliary_loss_mlp": 0.01033945, "balance_loss_clip": 1.04204583, "balance_loss_mlp": 1.02192843, "epoch": 0.5714113933563806, "flos": 17413443308160.0, "grad_norm": 2.684901451113001, "language_loss": 0.86263931, "learning_rate": 1.6365214485062553e-06, "loss": 0.88383818, "num_input_tokens_seen": 204751365, "step": 9504, "time_per_iteration": 2.763122320175171 }, { "auxiliary_loss_clip": 0.01075538, "auxiliary_loss_mlp": 0.01028568, "balance_loss_clip": 1.04011607, "balance_loss_mlp": 1.01565766, "epoch": 0.5714715166090486, "flos": 20193360589440.0, "grad_norm": 1.7486163539852246, "language_loss": 0.75459665, "learning_rate": 1.6361384795205496e-06, "loss": 0.77563769, "num_input_tokens_seen": 204768980, "step": 9505, "time_per_iteration": 4.519685506820679 }, { "auxiliary_loss_clip": 0.0111822, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.04235733, "balance_loss_mlp": 1.02002621, "epoch": 0.5715316398617165, "flos": 18551668458240.0, "grad_norm": 1.4826686830874622, "language_loss": 0.81888402, "learning_rate": 1.635755524332509e-06, "loss": 0.84038711, "num_input_tokens_seen": 204788110, "step": 9506, "time_per_iteration": 5.6948935985565186 }, { "auxiliary_loss_clip": 0.01080083, "auxiliary_loss_mlp": 0.00770857, "balance_loss_clip": 1.03905082, "balance_loss_mlp": 1.00010204, "epoch": 0.5715917631143845, "flos": 18478195188480.0, "grad_norm": 1.7330193393772828, "language_loss": 0.77595812, "learning_rate": 1.6353725829566552e-06, "loss": 0.79446745, "num_input_tokens_seen": 204807240, "step": 9507, "time_per_iteration": 2.7299420833587646 }, { "auxiliary_loss_clip": 0.01098783, "auxiliary_loss_mlp": 0.01037694, "balance_loss_clip": 1.04040074, "balance_loss_mlp": 1.02350807, "epoch": 0.5716518863670524, "flos": 24020037037440.0, "grad_norm": 1.9478835056583133, "language_loss": 0.6852861, "learning_rate": 1.63498965540751e-06, "loss": 0.70665085, "num_input_tokens_seen": 204826415, "step": 9508, "time_per_iteration": 2.7023262977600098 }, { "auxiliary_loss_clip": 0.01121987, "auxiliary_loss_mlp": 0.01031189, "balance_loss_clip": 1.04333735, "balance_loss_mlp": 1.01777184, "epoch": 0.5717120096197205, "flos": 17819485626240.0, "grad_norm": 2.087333212498838, "language_loss": 0.80104595, "learning_rate": 1.634606741699593e-06, "loss": 0.82257771, "num_input_tokens_seen": 204844305, "step": 9509, "time_per_iteration": 2.6331591606140137 }, { "auxiliary_loss_clip": 0.01104906, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.04276729, "balance_loss_mlp": 1.02071953, "epoch": 0.5717721328723884, "flos": 21866043179520.0, "grad_norm": 1.9468766397229225, "language_loss": 0.71857727, "learning_rate": 1.6342238418474255e-06, "loss": 0.73996317, "num_input_tokens_seen": 204861765, "step": 9510, "time_per_iteration": 2.6763837337493896 }, { "auxiliary_loss_clip": 0.01096671, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.04109251, "balance_loss_mlp": 1.01920152, "epoch": 0.5718322561250564, "flos": 28437624126720.0, "grad_norm": 1.5755083758344817, "language_loss": 0.69395983, "learning_rate": 1.6338409558655264e-06, "loss": 0.71524119, "num_input_tokens_seen": 204882505, "step": 9511, "time_per_iteration": 4.320638418197632 }, { "auxiliary_loss_clip": 0.01097735, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.04172611, "balance_loss_mlp": 1.02338552, "epoch": 0.5718923793777243, "flos": 13551825905280.0, "grad_norm": 2.0067389560068047, "language_loss": 0.6147874, "learning_rate": 1.6334580837684152e-06, "loss": 0.63611984, "num_input_tokens_seen": 204899830, "step": 9512, "time_per_iteration": 2.759669065475464 }, { "auxiliary_loss_clip": 0.01095927, "auxiliary_loss_mlp": 0.01029716, "balance_loss_clip": 1.04188657, "balance_loss_mlp": 1.01700234, "epoch": 0.5719525026303923, "flos": 17822035491840.0, "grad_norm": 2.401258082797128, "language_loss": 0.76018667, "learning_rate": 1.6330752255706104e-06, "loss": 0.78144312, "num_input_tokens_seen": 204918100, "step": 9513, "time_per_iteration": 2.7117698192596436 }, { "auxiliary_loss_clip": 0.01030995, "auxiliary_loss_mlp": 0.00999994, "balance_loss_clip": 1.01519012, "balance_loss_mlp": 0.99881381, "epoch": 0.5720126258830602, "flos": 61298042814720.0, "grad_norm": 0.8987559536316853, "language_loss": 0.66807652, "learning_rate": 1.6326923812866288e-06, "loss": 0.68838638, "num_input_tokens_seen": 204972925, "step": 9514, "time_per_iteration": 3.1701343059539795 }, { "auxiliary_loss_clip": 0.01114643, "auxiliary_loss_mlp": 0.01042868, "balance_loss_clip": 1.0446943, "balance_loss_mlp": 1.02930832, "epoch": 0.5720727491357283, "flos": 23988040997760.0, "grad_norm": 2.0869347470902704, "language_loss": 0.81355566, "learning_rate": 1.63230955093099e-06, "loss": 0.83513075, "num_input_tokens_seen": 204990910, "step": 9515, "time_per_iteration": 2.668982744216919 }, { "auxiliary_loss_clip": 0.01098965, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.04036427, "balance_loss_mlp": 1.01993597, "epoch": 0.5721328723883962, "flos": 23405426398080.0, "grad_norm": 3.1746972716468664, "language_loss": 0.85928082, "learning_rate": 1.6319267345182092e-06, "loss": 0.88059723, "num_input_tokens_seen": 205010500, "step": 9516, "time_per_iteration": 2.6741178035736084 }, { "auxiliary_loss_clip": 0.01083742, "auxiliary_loss_mlp": 0.01031013, "balance_loss_clip": 1.04019785, "balance_loss_mlp": 1.01784658, "epoch": 0.5721929956410642, "flos": 18804910320000.0, "grad_norm": 1.8608727945257042, "language_loss": 0.87884629, "learning_rate": 1.6315439320628038e-06, "loss": 0.8999939, "num_input_tokens_seen": 205028560, "step": 9517, "time_per_iteration": 2.699981451034546 }, { "auxiliary_loss_clip": 0.01066403, "auxiliary_loss_mlp": 0.01031636, "balance_loss_clip": 1.03665698, "balance_loss_mlp": 1.01866579, "epoch": 0.5722531188937322, "flos": 27196659100800.0, "grad_norm": 1.632945668541975, "language_loss": 0.85146403, "learning_rate": 1.6311611435792893e-06, "loss": 0.87244439, "num_input_tokens_seen": 205048650, "step": 9518, "time_per_iteration": 2.8667659759521484 }, { "auxiliary_loss_clip": 0.01104733, "auxiliary_loss_mlp": 0.01033736, "balance_loss_clip": 1.04255366, "balance_loss_mlp": 1.02131414, "epoch": 0.5723132421464001, "flos": 15195672852480.0, "grad_norm": 1.838676422571758, "language_loss": 0.7901606, "learning_rate": 1.6307783690821812e-06, "loss": 0.81154531, "num_input_tokens_seen": 205066480, "step": 9519, "time_per_iteration": 2.593822479248047 }, { "auxiliary_loss_clip": 0.01117664, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.04276991, "balance_loss_mlp": 1.01755762, "epoch": 0.5723733653990681, "flos": 27599433281280.0, "grad_norm": 1.4978137038182386, "language_loss": 0.83191645, "learning_rate": 1.6303956085859944e-06, "loss": 0.85339302, "num_input_tokens_seen": 205087475, "step": 9520, "time_per_iteration": 2.664851427078247 }, { "auxiliary_loss_clip": 0.01098568, "auxiliary_loss_mlp": 0.01044625, "balance_loss_clip": 1.04248536, "balance_loss_mlp": 1.03115487, "epoch": 0.572433488651736, "flos": 18222870337920.0, "grad_norm": 2.1952309591015267, "language_loss": 0.72542965, "learning_rate": 1.630012862105243e-06, "loss": 0.74686158, "num_input_tokens_seen": 205106495, "step": 9521, "time_per_iteration": 2.7253611087799072 }, { "auxiliary_loss_clip": 0.011175, "auxiliary_loss_mlp": 0.00769564, "balance_loss_clip": 1.04164016, "balance_loss_mlp": 1.00010264, "epoch": 0.5724936119044041, "flos": 31249106484480.0, "grad_norm": 2.153094973040902, "language_loss": 0.78315163, "learning_rate": 1.6296301296544415e-06, "loss": 0.80202222, "num_input_tokens_seen": 205128285, "step": 9522, "time_per_iteration": 2.6890037059783936 }, { "auxiliary_loss_clip": 0.01088616, "auxiliary_loss_mlp": 0.01034098, "balance_loss_clip": 1.04117084, "balance_loss_mlp": 1.02251649, "epoch": 0.572553735157072, "flos": 19202189719680.0, "grad_norm": 1.511112661891623, "language_loss": 0.71476662, "learning_rate": 1.629247411248102e-06, "loss": 0.73599374, "num_input_tokens_seen": 205146595, "step": 9523, "time_per_iteration": 2.6567182540893555 }, { "auxiliary_loss_clip": 0.01092274, "auxiliary_loss_mlp": 0.01033734, "balance_loss_clip": 1.03921247, "balance_loss_mlp": 1.02187228, "epoch": 0.57261385840974, "flos": 21214911386880.0, "grad_norm": 2.2130630300856207, "language_loss": 0.70017171, "learning_rate": 1.628864706900738e-06, "loss": 0.72143173, "num_input_tokens_seen": 205164295, "step": 9524, "time_per_iteration": 2.700518846511841 }, { "auxiliary_loss_clip": 0.01107505, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.04225564, "balance_loss_mlp": 1.01971316, "epoch": 0.5726739816624079, "flos": 33984529793280.0, "grad_norm": 1.461112152817653, "language_loss": 0.65126455, "learning_rate": 1.6284820166268615e-06, "loss": 0.67265761, "num_input_tokens_seen": 205185380, "step": 9525, "time_per_iteration": 2.7389535903930664 }, { "auxiliary_loss_clip": 0.01091158, "auxiliary_loss_mlp": 0.01035018, "balance_loss_clip": 1.03928351, "balance_loss_mlp": 1.023139, "epoch": 0.5727341049150759, "flos": 24275972419200.0, "grad_norm": 1.930578654391071, "language_loss": 0.72484279, "learning_rate": 1.628099340440984e-06, "loss": 0.7461046, "num_input_tokens_seen": 205204895, "step": 9526, "time_per_iteration": 2.702472448348999 }, { "auxiliary_loss_clip": 0.01103623, "auxiliary_loss_mlp": 0.01038123, "balance_loss_clip": 1.03998101, "balance_loss_mlp": 1.02604759, "epoch": 0.5727942281677438, "flos": 28400564269440.0, "grad_norm": 2.0565235980515206, "language_loss": 0.8007257, "learning_rate": 1.6277166783576176e-06, "loss": 0.8221432, "num_input_tokens_seen": 205223440, "step": 9527, "time_per_iteration": 2.7238149642944336 }, { "auxiliary_loss_clip": 0.01101882, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.03860235, "balance_loss_mlp": 1.02360809, "epoch": 0.5728543514204119, "flos": 19536769929600.0, "grad_norm": 1.770832454252008, "language_loss": 0.72136271, "learning_rate": 1.6273340303912713e-06, "loss": 0.74274695, "num_input_tokens_seen": 205242800, "step": 9528, "time_per_iteration": 2.593954086303711 }, { "auxiliary_loss_clip": 0.01117957, "auxiliary_loss_mlp": 0.01036459, "balance_loss_clip": 1.04303622, "balance_loss_mlp": 1.02363753, "epoch": 0.5729144746730798, "flos": 21506757390720.0, "grad_norm": 2.0200513223103846, "language_loss": 0.86137569, "learning_rate": 1.6269513965564557e-06, "loss": 0.88291985, "num_input_tokens_seen": 205259465, "step": 9529, "time_per_iteration": 2.6399447917938232 }, { "auxiliary_loss_clip": 0.01022279, "auxiliary_loss_mlp": 0.00999796, "balance_loss_clip": 1.01659954, "balance_loss_mlp": 0.99862826, "epoch": 0.5729745979257478, "flos": 58681628242560.0, "grad_norm": 0.7634342678167043, "language_loss": 0.56170225, "learning_rate": 1.6265687768676813e-06, "loss": 0.58192301, "num_input_tokens_seen": 205314100, "step": 9530, "time_per_iteration": 3.081955671310425 }, { "auxiliary_loss_clip": 0.01096881, "auxiliary_loss_mlp": 0.01030649, "balance_loss_clip": 1.04126835, "balance_loss_mlp": 1.01860929, "epoch": 0.5730347211784158, "flos": 18552099421440.0, "grad_norm": 1.8014631294656338, "language_loss": 0.66785836, "learning_rate": 1.6261861713394553e-06, "loss": 0.6891337, "num_input_tokens_seen": 205333420, "step": 9531, "time_per_iteration": 2.650801658630371 }, { "auxiliary_loss_clip": 0.01102348, "auxiliary_loss_mlp": 0.01042246, "balance_loss_clip": 1.03970659, "balance_loss_mlp": 1.02834046, "epoch": 0.5730948444310837, "flos": 38031482396160.0, "grad_norm": 2.1479743871986314, "language_loss": 0.75923574, "learning_rate": 1.6258035799862876e-06, "loss": 0.78068173, "num_input_tokens_seen": 205350995, "step": 9532, "time_per_iteration": 2.7268972396850586 }, { "auxiliary_loss_clip": 0.01117449, "auxiliary_loss_mlp": 0.01031067, "balance_loss_clip": 1.0426352, "balance_loss_mlp": 1.01828206, "epoch": 0.5731549676837517, "flos": 25227066689280.0, "grad_norm": 1.3324145118640112, "language_loss": 0.78908527, "learning_rate": 1.625421002822686e-06, "loss": 0.81057048, "num_input_tokens_seen": 205372675, "step": 9533, "time_per_iteration": 2.6636223793029785 }, { "auxiliary_loss_clip": 0.01105019, "auxiliary_loss_mlp": 0.01029773, "balance_loss_clip": 1.04237115, "balance_loss_mlp": 1.01806676, "epoch": 0.5732150909364196, "flos": 23368222886400.0, "grad_norm": 1.7921135162563215, "language_loss": 0.85584033, "learning_rate": 1.6250384398631574e-06, "loss": 0.87718827, "num_input_tokens_seen": 205392590, "step": 9534, "time_per_iteration": 2.6173202991485596 }, { "auxiliary_loss_clip": 0.01098044, "auxiliary_loss_mlp": 0.01038668, "balance_loss_clip": 1.0421629, "balance_loss_mlp": 1.02537584, "epoch": 0.5732752141890877, "flos": 23079357711360.0, "grad_norm": 1.8285457434330181, "language_loss": 0.7536543, "learning_rate": 1.6246558911222085e-06, "loss": 0.77502143, "num_input_tokens_seen": 205414885, "step": 9535, "time_per_iteration": 2.6797807216644287 }, { "auxiliary_loss_clip": 0.0110163, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.04250264, "balance_loss_mlp": 1.01984715, "epoch": 0.5733353374417556, "flos": 24352282863360.0, "grad_norm": 1.4660219442049842, "language_loss": 0.71041429, "learning_rate": 1.624273356614346e-06, "loss": 0.73175883, "num_input_tokens_seen": 205434440, "step": 9536, "time_per_iteration": 2.6927666664123535 }, { "auxiliary_loss_clip": 0.0107587, "auxiliary_loss_mlp": 0.01034692, "balance_loss_clip": 1.03728056, "balance_loss_mlp": 1.02034533, "epoch": 0.5733954606944236, "flos": 27198849830400.0, "grad_norm": 1.9779932456354445, "language_loss": 0.69794559, "learning_rate": 1.6238908363540755e-06, "loss": 0.71905118, "num_input_tokens_seen": 205454225, "step": 9537, "time_per_iteration": 2.758420944213867 }, { "auxiliary_loss_clip": 0.01119262, "auxiliary_loss_mlp": 0.01036385, "balance_loss_clip": 1.04359508, "balance_loss_mlp": 1.02364206, "epoch": 0.5734555839470915, "flos": 28765129357440.0, "grad_norm": 1.8277858348507134, "language_loss": 0.62517941, "learning_rate": 1.623508330355902e-06, "loss": 0.64673591, "num_input_tokens_seen": 205474750, "step": 9538, "time_per_iteration": 2.6978628635406494 }, { "auxiliary_loss_clip": 0.01105121, "auxiliary_loss_mlp": 0.0103457, "balance_loss_clip": 1.04219174, "balance_loss_mlp": 1.02135563, "epoch": 0.5735157071997595, "flos": 22966813422720.0, "grad_norm": 1.6582870130678489, "language_loss": 0.83564949, "learning_rate": 1.6231258386343306e-06, "loss": 0.85704643, "num_input_tokens_seen": 205495495, "step": 9539, "time_per_iteration": 2.7695393562316895 }, { "auxiliary_loss_clip": 0.01086088, "auxiliary_loss_mlp": 0.01038955, "balance_loss_clip": 1.04798675, "balance_loss_mlp": 1.02566326, "epoch": 0.5735758304524274, "flos": 18989455420800.0, "grad_norm": 2.207302017109072, "language_loss": 0.73048598, "learning_rate": 1.6227433612038647e-06, "loss": 0.75173634, "num_input_tokens_seen": 205510070, "step": 9540, "time_per_iteration": 2.760653018951416 }, { "auxiliary_loss_clip": 0.01101303, "auxiliary_loss_mlp": 0.00769854, "balance_loss_clip": 1.03920221, "balance_loss_mlp": 1.00004601, "epoch": 0.5736359537050955, "flos": 28397942576640.0, "grad_norm": 2.4125489920069074, "language_loss": 0.79765099, "learning_rate": 1.6223608980790089e-06, "loss": 0.81636256, "num_input_tokens_seen": 205530190, "step": 9541, "time_per_iteration": 2.789978504180908 }, { "auxiliary_loss_clip": 0.01096764, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.040447, "balance_loss_mlp": 1.02054572, "epoch": 0.5736960769577634, "flos": 15627210848640.0, "grad_norm": 2.579963788523863, "language_loss": 0.6497947, "learning_rate": 1.6219784492742654e-06, "loss": 0.67109919, "num_input_tokens_seen": 205547380, "step": 9542, "time_per_iteration": 2.684465169906616 }, { "auxiliary_loss_clip": 0.01094703, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 1.03985989, "balance_loss_mlp": 1.01992106, "epoch": 0.5737562002104314, "flos": 18003994813440.0, "grad_norm": 2.1591412151518625, "language_loss": 0.82844281, "learning_rate": 1.6215960148041365e-06, "loss": 0.84971344, "num_input_tokens_seen": 205566540, "step": 9543, "time_per_iteration": 2.724700450897217 }, { "auxiliary_loss_clip": 0.01078135, "auxiliary_loss_mlp": 0.01034179, "balance_loss_clip": 1.03842759, "balance_loss_mlp": 1.01990938, "epoch": 0.5738163234630994, "flos": 20698192287360.0, "grad_norm": 2.0892075264702616, "language_loss": 0.73500836, "learning_rate": 1.6212135946831257e-06, "loss": 0.75613153, "num_input_tokens_seen": 205584200, "step": 9544, "time_per_iteration": 2.7072341442108154 }, { "auxiliary_loss_clip": 0.01063343, "auxiliary_loss_mlp": 0.01034841, "balance_loss_clip": 1.03527069, "balance_loss_mlp": 1.02173972, "epoch": 0.5738764467157673, "flos": 23149311448320.0, "grad_norm": 1.791719724630014, "language_loss": 0.76021409, "learning_rate": 1.620831188925733e-06, "loss": 0.78119594, "num_input_tokens_seen": 205604675, "step": 9545, "time_per_iteration": 4.402756690979004 }, { "auxiliary_loss_clip": 0.0109842, "auxiliary_loss_mlp": 0.0103679, "balance_loss_clip": 1.04495752, "balance_loss_mlp": 1.02345061, "epoch": 0.5739365699684353, "flos": 29492930730240.0, "grad_norm": 1.94712066693327, "language_loss": 0.56656086, "learning_rate": 1.620448797546459e-06, "loss": 0.58791304, "num_input_tokens_seen": 205624680, "step": 9546, "time_per_iteration": 6.025787115097046 }, { "auxiliary_loss_clip": 0.01091236, "auxiliary_loss_mlp": 0.01033391, "balance_loss_clip": 1.03923881, "balance_loss_mlp": 1.02023625, "epoch": 0.5739966932211032, "flos": 14027247342720.0, "grad_norm": 2.369322585416499, "language_loss": 0.7595309, "learning_rate": 1.6200664205598055e-06, "loss": 0.78077716, "num_input_tokens_seen": 205641950, "step": 9547, "time_per_iteration": 2.71240496635437 }, { "auxiliary_loss_clip": 0.01104111, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.03877449, "balance_loss_mlp": 1.01709485, "epoch": 0.5740568164737713, "flos": 19062030850560.0, "grad_norm": 5.307379698295213, "language_loss": 0.74525601, "learning_rate": 1.6196840579802704e-06, "loss": 0.76660264, "num_input_tokens_seen": 205660130, "step": 9548, "time_per_iteration": 2.651829957962036 }, { "auxiliary_loss_clip": 0.01085909, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.03760338, "balance_loss_mlp": 1.02268577, "epoch": 0.5741169397264392, "flos": 22127832478080.0, "grad_norm": 4.02154100378115, "language_loss": 0.69476151, "learning_rate": 1.619301709822355e-06, "loss": 0.71597928, "num_input_tokens_seen": 205678895, "step": 9549, "time_per_iteration": 2.7304623126983643 }, { "auxiliary_loss_clip": 0.01068231, "auxiliary_loss_mlp": 0.01031011, "balance_loss_clip": 1.04319942, "balance_loss_mlp": 1.01907182, "epoch": 0.5741770629791072, "flos": 24936836797440.0, "grad_norm": 1.4366767261825364, "language_loss": 0.79742229, "learning_rate": 1.6189193761005564e-06, "loss": 0.81841469, "num_input_tokens_seen": 205698450, "step": 9550, "time_per_iteration": 2.759152889251709 }, { "auxiliary_loss_clip": 0.01091678, "auxiliary_loss_mlp": 0.01036065, "balance_loss_clip": 1.04081261, "balance_loss_mlp": 1.0213902, "epoch": 0.5742371862317751, "flos": 18801462614400.0, "grad_norm": 1.889418417446442, "language_loss": 0.67791235, "learning_rate": 1.6185370568293727e-06, "loss": 0.69918978, "num_input_tokens_seen": 205714870, "step": 9551, "time_per_iteration": 4.226199150085449 }, { "auxiliary_loss_clip": 0.01082087, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.04173434, "balance_loss_mlp": 1.02287543, "epoch": 0.5742973094844431, "flos": 24460661174400.0, "grad_norm": 2.3194402923297157, "language_loss": 0.7223655, "learning_rate": 1.6181547520233031e-06, "loss": 0.74354362, "num_input_tokens_seen": 205736045, "step": 9552, "time_per_iteration": 2.736600160598755 }, { "auxiliary_loss_clip": 0.01103832, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.04454732, "balance_loss_mlp": 1.02040219, "epoch": 0.574357432737111, "flos": 21652770176640.0, "grad_norm": 2.128940953023755, "language_loss": 0.79823256, "learning_rate": 1.617772461696843e-06, "loss": 0.81960428, "num_input_tokens_seen": 205754445, "step": 9553, "time_per_iteration": 2.6895127296447754 }, { "auxiliary_loss_clip": 0.01111471, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.04313147, "balance_loss_mlp": 1.02050185, "epoch": 0.5744175559897791, "flos": 16544728880640.0, "grad_norm": 1.880148698667659, "language_loss": 0.8353495, "learning_rate": 1.6173901858644895e-06, "loss": 0.85679281, "num_input_tokens_seen": 205770595, "step": 9554, "time_per_iteration": 2.615577220916748 }, { "auxiliary_loss_clip": 0.01115074, "auxiliary_loss_mlp": 0.0077091, "balance_loss_clip": 1.04545319, "balance_loss_mlp": 1.0001241, "epoch": 0.574477679242447, "flos": 24207598880640.0, "grad_norm": 1.4793540146055872, "language_loss": 0.71076667, "learning_rate": 1.6170079245407385e-06, "loss": 0.72962654, "num_input_tokens_seen": 205791935, "step": 9555, "time_per_iteration": 2.7411417961120605 }, { "auxiliary_loss_clip": 0.01093974, "auxiliary_loss_mlp": 0.0103121, "balance_loss_clip": 1.04077876, "balance_loss_mlp": 1.01763785, "epoch": 0.574537802495115, "flos": 14903000835840.0, "grad_norm": 2.2805548015379755, "language_loss": 0.72663784, "learning_rate": 1.6166256777400853e-06, "loss": 0.7478897, "num_input_tokens_seen": 205807260, "step": 9556, "time_per_iteration": 2.6720690727233887 }, { "auxiliary_loss_clip": 0.01111378, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.04576373, "balance_loss_mlp": 1.02015448, "epoch": 0.5745979257477829, "flos": 24934969290240.0, "grad_norm": 1.744837604754053, "language_loss": 0.74087226, "learning_rate": 1.6162434454770248e-06, "loss": 0.76231742, "num_input_tokens_seen": 205826885, "step": 9557, "time_per_iteration": 2.7899231910705566 }, { "auxiliary_loss_clip": 0.01108542, "auxiliary_loss_mlp": 0.01034037, "balance_loss_clip": 1.04274464, "balance_loss_mlp": 1.02157927, "epoch": 0.5746580490004509, "flos": 17235757704960.0, "grad_norm": 1.5016834383596844, "language_loss": 0.67902005, "learning_rate": 1.6158612277660514e-06, "loss": 0.70044577, "num_input_tokens_seen": 205844630, "step": 9558, "time_per_iteration": 2.762430429458618 }, { "auxiliary_loss_clip": 0.01094279, "auxiliary_loss_mlp": 0.01052047, "balance_loss_clip": 1.04277229, "balance_loss_mlp": 1.03471398, "epoch": 0.5747181722531189, "flos": 13187871348480.0, "grad_norm": 2.4192829019987148, "language_loss": 0.72013688, "learning_rate": 1.615479024621659e-06, "loss": 0.74160016, "num_input_tokens_seen": 205860960, "step": 9559, "time_per_iteration": 2.757319688796997 }, { "auxiliary_loss_clip": 0.01097547, "auxiliary_loss_mlp": 0.00769026, "balance_loss_clip": 1.04342794, "balance_loss_mlp": 1.00012159, "epoch": 0.5747782955057869, "flos": 22963006581120.0, "grad_norm": 1.6274858947785595, "language_loss": 0.78883743, "learning_rate": 1.6150968360583398e-06, "loss": 0.8075031, "num_input_tokens_seen": 205880675, "step": 9560, "time_per_iteration": 2.746260166168213 }, { "auxiliary_loss_clip": 0.01052934, "auxiliary_loss_mlp": 0.01029841, "balance_loss_clip": 1.03918111, "balance_loss_mlp": 1.0164957, "epoch": 0.5748384187584549, "flos": 23403235668480.0, "grad_norm": 2.1977539095196903, "language_loss": 0.64321613, "learning_rate": 1.614714662090588e-06, "loss": 0.6640439, "num_input_tokens_seen": 205900050, "step": 9561, "time_per_iteration": 2.8124732971191406 }, { "auxiliary_loss_clip": 0.01116845, "auxiliary_loss_mlp": 0.01039625, "balance_loss_clip": 1.04539895, "balance_loss_mlp": 1.02567124, "epoch": 0.5748985420111228, "flos": 17785514338560.0, "grad_norm": 2.0210299953328414, "language_loss": 0.7193495, "learning_rate": 1.6143325027328945e-06, "loss": 0.74091417, "num_input_tokens_seen": 205918855, "step": 9562, "time_per_iteration": 2.7868704795837402 }, { "auxiliary_loss_clip": 0.01067199, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.03979492, "balance_loss_mlp": 1.02039778, "epoch": 0.5749586652637908, "flos": 19866250408320.0, "grad_norm": 1.4806264841650407, "language_loss": 0.84100068, "learning_rate": 1.613950357999751e-06, "loss": 0.86199754, "num_input_tokens_seen": 205936970, "step": 9563, "time_per_iteration": 2.7772703170776367 }, { "auxiliary_loss_clip": 0.01073481, "auxiliary_loss_mlp": 0.01039774, "balance_loss_clip": 1.0434773, "balance_loss_mlp": 1.02635074, "epoch": 0.5750187885164587, "flos": 21287235421440.0, "grad_norm": 2.0689431633426802, "language_loss": 0.5717746, "learning_rate": 1.6135682279056488e-06, "loss": 0.59290713, "num_input_tokens_seen": 205954630, "step": 9564, "time_per_iteration": 2.8411808013916016 }, { "auxiliary_loss_clip": 0.01092301, "auxiliary_loss_mlp": 0.01036175, "balance_loss_clip": 1.04144359, "balance_loss_mlp": 1.0226326, "epoch": 0.5750789117691267, "flos": 18804658924800.0, "grad_norm": 1.7191674250507119, "language_loss": 0.76114881, "learning_rate": 1.613186112465078e-06, "loss": 0.78243363, "num_input_tokens_seen": 205971510, "step": 9565, "time_per_iteration": 2.822044610977173 }, { "auxiliary_loss_clip": 0.01002918, "auxiliary_loss_mlp": 0.01012299, "balance_loss_clip": 1.01532471, "balance_loss_mlp": 1.01098824, "epoch": 0.5751390350217946, "flos": 70663224124800.0, "grad_norm": 0.74248986424084, "language_loss": 0.60725588, "learning_rate": 1.6128040116925287e-06, "loss": 0.62740809, "num_input_tokens_seen": 206035125, "step": 9566, "time_per_iteration": 3.427154064178467 }, { "auxiliary_loss_clip": 0.01093716, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.04347396, "balance_loss_mlp": 1.02224672, "epoch": 0.5751991582744627, "flos": 14246338348800.0, "grad_norm": 2.3384715191144214, "language_loss": 0.75378191, "learning_rate": 1.6124219256024901e-06, "loss": 0.77506685, "num_input_tokens_seen": 206052075, "step": 9567, "time_per_iteration": 2.8895022869110107 }, { "auxiliary_loss_clip": 0.0110852, "auxiliary_loss_mlp": 0.0103381, "balance_loss_clip": 1.04461062, "balance_loss_mlp": 1.02136469, "epoch": 0.5752592815271306, "flos": 18328160079360.0, "grad_norm": 1.398692478003959, "language_loss": 0.74487442, "learning_rate": 1.6120398542094504e-06, "loss": 0.7662977, "num_input_tokens_seen": 206069970, "step": 9568, "time_per_iteration": 2.745008945465088 }, { "auxiliary_loss_clip": 0.01122376, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.04557085, "balance_loss_mlp": 1.01852036, "epoch": 0.5753194047797986, "flos": 20922742160640.0, "grad_norm": 1.8288224744161317, "language_loss": 0.71572077, "learning_rate": 1.6116577975278994e-06, "loss": 0.73725533, "num_input_tokens_seen": 206088950, "step": 9569, "time_per_iteration": 2.9613218307495117 }, { "auxiliary_loss_clip": 0.01113684, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.04693925, "balance_loss_mlp": 1.02399325, "epoch": 0.5753795280324665, "flos": 19281804215040.0, "grad_norm": 2.1991270484780916, "language_loss": 0.55975366, "learning_rate": 1.6112757555723223e-06, "loss": 0.58126599, "num_input_tokens_seen": 206107780, "step": 9570, "time_per_iteration": 2.6928811073303223 }, { "auxiliary_loss_clip": 0.01118829, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.04458117, "balance_loss_mlp": 1.02252328, "epoch": 0.5754396512851345, "flos": 21652877917440.0, "grad_norm": 1.4030574698632734, "language_loss": 0.64338309, "learning_rate": 1.6108937283572082e-06, "loss": 0.66491854, "num_input_tokens_seen": 206127445, "step": 9571, "time_per_iteration": 2.635603427886963 }, { "auxiliary_loss_clip": 0.01111717, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.04484558, "balance_loss_mlp": 1.01890385, "epoch": 0.5754997745378025, "flos": 51021700179840.0, "grad_norm": 1.5230879857727748, "language_loss": 0.67137802, "learning_rate": 1.6105117158970434e-06, "loss": 0.69281137, "num_input_tokens_seen": 206152005, "step": 9572, "time_per_iteration": 2.9080519676208496 }, { "auxiliary_loss_clip": 0.01101219, "auxiliary_loss_mlp": 0.01032315, "balance_loss_clip": 1.04746473, "balance_loss_mlp": 1.01870155, "epoch": 0.5755598977904705, "flos": 22856890826880.0, "grad_norm": 1.7883651828614429, "language_loss": 0.72390687, "learning_rate": 1.6101297182063123e-06, "loss": 0.74524224, "num_input_tokens_seen": 206169875, "step": 9573, "time_per_iteration": 2.815703868865967 }, { "auxiliary_loss_clip": 0.01118198, "auxiliary_loss_mlp": 0.01031966, "balance_loss_clip": 1.04730046, "balance_loss_mlp": 1.0202539, "epoch": 0.5756200210431385, "flos": 38472824805120.0, "grad_norm": 1.8637575754568128, "language_loss": 0.76394922, "learning_rate": 1.6097477352995022e-06, "loss": 0.78545088, "num_input_tokens_seen": 206192635, "step": 9574, "time_per_iteration": 2.778196096420288 }, { "auxiliary_loss_clip": 0.01068081, "auxiliary_loss_mlp": 0.01036908, "balance_loss_clip": 1.03836775, "balance_loss_mlp": 1.02201867, "epoch": 0.5756801442958064, "flos": 23910006700800.0, "grad_norm": 2.572143968399992, "language_loss": 0.66373074, "learning_rate": 1.6093657671910968e-06, "loss": 0.68478066, "num_input_tokens_seen": 206211485, "step": 9575, "time_per_iteration": 2.780195951461792 }, { "auxiliary_loss_clip": 0.01097887, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.04497039, "balance_loss_mlp": 1.01917517, "epoch": 0.5757402675484744, "flos": 21105276099840.0, "grad_norm": 1.5189421087528554, "language_loss": 0.79787755, "learning_rate": 1.6089838138955804e-06, "loss": 0.81916952, "num_input_tokens_seen": 206231740, "step": 9576, "time_per_iteration": 2.7809135913848877 }, { "auxiliary_loss_clip": 0.01096091, "auxiliary_loss_mlp": 0.0102674, "balance_loss_clip": 1.0435828, "balance_loss_mlp": 1.01512265, "epoch": 0.5758003908011423, "flos": 20559110826240.0, "grad_norm": 1.7619408585744085, "language_loss": 0.69726396, "learning_rate": 1.6086018754274372e-06, "loss": 0.71849227, "num_input_tokens_seen": 206250975, "step": 9577, "time_per_iteration": 2.732150077819824 }, { "auxiliary_loss_clip": 0.01111358, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.04446626, "balance_loss_mlp": 1.02306843, "epoch": 0.5758605140538103, "flos": 16473015377280.0, "grad_norm": 2.216832845639703, "language_loss": 0.66558278, "learning_rate": 1.6082199518011504e-06, "loss": 0.6870482, "num_input_tokens_seen": 206268800, "step": 9578, "time_per_iteration": 2.639571189880371 }, { "auxiliary_loss_clip": 0.01091288, "auxiliary_loss_mlp": 0.01032209, "balance_loss_clip": 1.04414392, "balance_loss_mlp": 1.01997256, "epoch": 0.5759206373064782, "flos": 21287558643840.0, "grad_norm": 1.7735647320590846, "language_loss": 0.72313404, "learning_rate": 1.6078380430312016e-06, "loss": 0.74436903, "num_input_tokens_seen": 206287190, "step": 9579, "time_per_iteration": 2.6910343170166016 }, { "auxiliary_loss_clip": 0.0110168, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.04436874, "balance_loss_mlp": 1.02170634, "epoch": 0.5759807605591463, "flos": 26067879227520.0, "grad_norm": 4.803146579630836, "language_loss": 0.65395081, "learning_rate": 1.6074561491320742e-06, "loss": 0.67532551, "num_input_tokens_seen": 206307020, "step": 9580, "time_per_iteration": 2.7227509021759033 }, { "auxiliary_loss_clip": 0.01092842, "auxiliary_loss_mlp": 0.01034767, "balance_loss_clip": 1.04106581, "balance_loss_mlp": 1.0212729, "epoch": 0.5760408838118142, "flos": 18873068376960.0, "grad_norm": 1.9154940218320493, "language_loss": 0.85214174, "learning_rate": 1.6070742701182486e-06, "loss": 0.87341785, "num_input_tokens_seen": 206324095, "step": 9581, "time_per_iteration": 2.699432849884033 }, { "auxiliary_loss_clip": 0.0113104, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.05060983, "balance_loss_mlp": 1.02360821, "epoch": 0.5761010070644822, "flos": 15378134964480.0, "grad_norm": 2.109676550381332, "language_loss": 0.67354548, "learning_rate": 1.6066924060042057e-06, "loss": 0.69522369, "num_input_tokens_seen": 206343210, "step": 9582, "time_per_iteration": 2.6381587982177734 }, { "auxiliary_loss_clip": 0.01026383, "auxiliary_loss_mlp": 0.01001724, "balance_loss_clip": 1.01951599, "balance_loss_mlp": 1.00040722, "epoch": 0.5761611303171501, "flos": 71471932882560.0, "grad_norm": 0.6463341323488921, "language_loss": 0.57134479, "learning_rate": 1.6063105568044271e-06, "loss": 0.59162581, "num_input_tokens_seen": 206415935, "step": 9583, "time_per_iteration": 3.52109694480896 }, { "auxiliary_loss_clip": 0.01090801, "auxiliary_loss_mlp": 0.01030991, "balance_loss_clip": 1.04208195, "balance_loss_mlp": 1.01818216, "epoch": 0.5762212535698181, "flos": 16246167033600.0, "grad_norm": 1.791358766979404, "language_loss": 0.82729411, "learning_rate": 1.6059287225333912e-06, "loss": 0.84851205, "num_input_tokens_seen": 206431900, "step": 9584, "time_per_iteration": 2.7258176803588867 }, { "auxiliary_loss_clip": 0.0104221, "auxiliary_loss_mlp": 0.01002028, "balance_loss_clip": 1.0174526, "balance_loss_mlp": 1.00080013, "epoch": 0.5762813768224861, "flos": 70185504216960.0, "grad_norm": 0.623568426409687, "language_loss": 0.49559212, "learning_rate": 1.6055469032055773e-06, "loss": 0.51603448, "num_input_tokens_seen": 206501200, "step": 9585, "time_per_iteration": 7.823396682739258 }, { "auxiliary_loss_clip": 0.01092491, "auxiliary_loss_mlp": 0.01027016, "balance_loss_clip": 1.04217815, "balance_loss_mlp": 1.01516044, "epoch": 0.5763415000751541, "flos": 20518028645760.0, "grad_norm": 1.574762209284147, "language_loss": 0.85150623, "learning_rate": 1.605165098835465e-06, "loss": 0.87270141, "num_input_tokens_seen": 206520575, "step": 9586, "time_per_iteration": 2.6869027614593506 }, { "auxiliary_loss_clip": 0.0110803, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.04531455, "balance_loss_mlp": 1.02091956, "epoch": 0.5764016233278221, "flos": 15815526877440.0, "grad_norm": 2.1680790738732796, "language_loss": 0.80101568, "learning_rate": 1.6047833094375308e-06, "loss": 0.8224445, "num_input_tokens_seen": 206538060, "step": 9587, "time_per_iteration": 2.664121627807617 }, { "auxiliary_loss_clip": 0.01091421, "auxiliary_loss_mlp": 0.01037732, "balance_loss_clip": 1.04280019, "balance_loss_mlp": 1.02400517, "epoch": 0.57646174658049, "flos": 20772312001920.0, "grad_norm": 1.6197519148440016, "language_loss": 0.66023791, "learning_rate": 1.6044015350262542e-06, "loss": 0.68152946, "num_input_tokens_seen": 206557320, "step": 9588, "time_per_iteration": 2.6596546173095703 }, { "auxiliary_loss_clip": 0.01095166, "auxiliary_loss_mlp": 0.01039726, "balance_loss_clip": 1.04326534, "balance_loss_mlp": 1.02583766, "epoch": 0.576521869833158, "flos": 23549930812800.0, "grad_norm": 2.4954533064787383, "language_loss": 0.78688884, "learning_rate": 1.6040197756161104e-06, "loss": 0.80823773, "num_input_tokens_seen": 206575780, "step": 9589, "time_per_iteration": 2.799503803253174 }, { "auxiliary_loss_clip": 0.01114482, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.041682, "balance_loss_mlp": 1.01353538, "epoch": 0.5765819930858259, "flos": 20266582464000.0, "grad_norm": 2.2193599120856304, "language_loss": 0.79450285, "learning_rate": 1.6036380312215762e-06, "loss": 0.81590021, "num_input_tokens_seen": 206594100, "step": 9590, "time_per_iteration": 4.355879545211792 }, { "auxiliary_loss_clip": 0.01052935, "auxiliary_loss_mlp": 0.00769289, "balance_loss_clip": 1.03650951, "balance_loss_mlp": 1.00013447, "epoch": 0.5766421163384939, "flos": 23148772744320.0, "grad_norm": 1.8083193654510727, "language_loss": 0.63346255, "learning_rate": 1.6032563018571283e-06, "loss": 0.65168482, "num_input_tokens_seen": 206613325, "step": 9591, "time_per_iteration": 2.8449039459228516 }, { "auxiliary_loss_clip": 0.01122211, "auxiliary_loss_mlp": 0.00769941, "balance_loss_clip": 1.04640627, "balance_loss_mlp": 1.00013709, "epoch": 0.5767022395911618, "flos": 25848895962240.0, "grad_norm": 2.331025746602298, "language_loss": 0.78112143, "learning_rate": 1.6028745875372406e-06, "loss": 0.80004299, "num_input_tokens_seen": 206634265, "step": 9592, "time_per_iteration": 2.7304346561431885 }, { "auxiliary_loss_clip": 0.01004052, "auxiliary_loss_mlp": 0.01021446, "balance_loss_clip": 1.02547979, "balance_loss_mlp": 1.01965749, "epoch": 0.5767623628438299, "flos": 68293299657600.0, "grad_norm": 0.7436002967471621, "language_loss": 0.59609032, "learning_rate": 1.6024928882763885e-06, "loss": 0.61634529, "num_input_tokens_seen": 206696990, "step": 9593, "time_per_iteration": 3.461658477783203 }, { "auxiliary_loss_clip": 0.01110844, "auxiliary_loss_mlp": 0.01041399, "balance_loss_clip": 1.042449, "balance_loss_mlp": 1.02810097, "epoch": 0.5768224860964978, "flos": 30188448754560.0, "grad_norm": 1.9449888897854992, "language_loss": 0.71144432, "learning_rate": 1.6021112040890463e-06, "loss": 0.73296678, "num_input_tokens_seen": 206717815, "step": 9594, "time_per_iteration": 2.8465657234191895 }, { "auxiliary_loss_clip": 0.01085879, "auxiliary_loss_mlp": 0.01033309, "balance_loss_clip": 1.04293251, "balance_loss_mlp": 1.02196598, "epoch": 0.5768826093491658, "flos": 17895041884800.0, "grad_norm": 2.485745999068748, "language_loss": 0.70693135, "learning_rate": 1.6017295349896863e-06, "loss": 0.72812331, "num_input_tokens_seen": 206735985, "step": 9595, "time_per_iteration": 2.724013566970825 }, { "auxiliary_loss_clip": 0.01120342, "auxiliary_loss_mlp": 0.01030885, "balance_loss_clip": 1.04522467, "balance_loss_mlp": 1.01821947, "epoch": 0.5769427326018337, "flos": 17457183095040.0, "grad_norm": 2.28937358102888, "language_loss": 0.69969249, "learning_rate": 1.6013478809927828e-06, "loss": 0.72120476, "num_input_tokens_seen": 206753370, "step": 9596, "time_per_iteration": 2.602410316467285 }, { "auxiliary_loss_clip": 0.01097835, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.04560232, "balance_loss_mlp": 1.01944959, "epoch": 0.5770028558545017, "flos": 39421728345600.0, "grad_norm": 1.7463690567151626, "language_loss": 0.67612261, "learning_rate": 1.6009662421128074e-06, "loss": 0.69743955, "num_input_tokens_seen": 206777645, "step": 9597, "time_per_iteration": 2.9427249431610107 }, { "auxiliary_loss_clip": 0.01096299, "auxiliary_loss_mlp": 0.01033961, "balance_loss_clip": 1.04274464, "balance_loss_mlp": 1.02137804, "epoch": 0.5770629791071697, "flos": 21536383132800.0, "grad_norm": 1.8692422611288704, "language_loss": 0.81584179, "learning_rate": 1.6005846183642323e-06, "loss": 0.83714437, "num_input_tokens_seen": 206794865, "step": 9598, "time_per_iteration": 2.748018503189087 }, { "auxiliary_loss_clip": 0.01073806, "auxiliary_loss_mlp": 0.01042323, "balance_loss_clip": 1.03563309, "balance_loss_mlp": 1.0270164, "epoch": 0.5771231023598377, "flos": 20886795624960.0, "grad_norm": 1.6175391320992503, "language_loss": 0.7306143, "learning_rate": 1.6002030097615277e-06, "loss": 0.7517755, "num_input_tokens_seen": 206814095, "step": 9599, "time_per_iteration": 2.7712650299072266 }, { "auxiliary_loss_clip": 0.01115679, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.04342914, "balance_loss_mlp": 1.0211184, "epoch": 0.5771832256125057, "flos": 18077216688000.0, "grad_norm": 3.919070780783451, "language_loss": 0.78193593, "learning_rate": 1.5998214163191663e-06, "loss": 0.80342484, "num_input_tokens_seen": 206832245, "step": 9600, "time_per_iteration": 2.6597604751586914 }, { "auxiliary_loss_clip": 0.01113425, "auxiliary_loss_mlp": 0.0077084, "balance_loss_clip": 1.04604816, "balance_loss_mlp": 1.00016284, "epoch": 0.5772433488651736, "flos": 26359078786560.0, "grad_norm": 1.665079650983798, "language_loss": 0.72689855, "learning_rate": 1.5994398380516163e-06, "loss": 0.74574125, "num_input_tokens_seen": 206851535, "step": 9601, "time_per_iteration": 2.7263121604919434 }, { "auxiliary_loss_clip": 0.01064473, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.04480124, "balance_loss_mlp": 1.02311611, "epoch": 0.5773034721178416, "flos": 19680987035520.0, "grad_norm": 2.0948856363437534, "language_loss": 0.68606448, "learning_rate": 1.599058274973348e-06, "loss": 0.70706952, "num_input_tokens_seen": 206870595, "step": 9602, "time_per_iteration": 2.8572375774383545 }, { "auxiliary_loss_clip": 0.01088049, "auxiliary_loss_mlp": 0.01035522, "balance_loss_clip": 1.03997481, "balance_loss_mlp": 1.02274275, "epoch": 0.5773635953705095, "flos": 25082885496960.0, "grad_norm": 1.4139424352201144, "language_loss": 0.73376763, "learning_rate": 1.5986767270988297e-06, "loss": 0.75500333, "num_input_tokens_seen": 206892320, "step": 9603, "time_per_iteration": 2.816098928451538 }, { "auxiliary_loss_clip": 0.01108536, "auxiliary_loss_mlp": 0.01029532, "balance_loss_clip": 1.0450983, "balance_loss_mlp": 1.01732492, "epoch": 0.5774237186231775, "flos": 21032987978880.0, "grad_norm": 1.7349679186761677, "language_loss": 0.76407522, "learning_rate": 1.5982951944425298e-06, "loss": 0.78545588, "num_input_tokens_seen": 206912485, "step": 9604, "time_per_iteration": 2.718163013458252 }, { "auxiliary_loss_clip": 0.01086662, "auxiliary_loss_mlp": 0.0103562, "balance_loss_clip": 1.04304457, "balance_loss_mlp": 1.02200651, "epoch": 0.5774838418758454, "flos": 15231727128960.0, "grad_norm": 2.5247859182247026, "language_loss": 0.83387136, "learning_rate": 1.5979136770189174e-06, "loss": 0.85509419, "num_input_tokens_seen": 206929100, "step": 9605, "time_per_iteration": 2.8076066970825195 }, { "auxiliary_loss_clip": 0.01096142, "auxiliary_loss_mlp": 0.01031724, "balance_loss_clip": 1.04626584, "balance_loss_mlp": 1.01667333, "epoch": 0.5775439651285135, "flos": 23582609210880.0, "grad_norm": 1.8595500746131972, "language_loss": 0.77926147, "learning_rate": 1.5975321748424581e-06, "loss": 0.80054009, "num_input_tokens_seen": 206947020, "step": 9606, "time_per_iteration": 2.7766621112823486 }, { "auxiliary_loss_clip": 0.01117345, "auxiliary_loss_mlp": 0.01035757, "balance_loss_clip": 1.04331446, "balance_loss_mlp": 1.02362752, "epoch": 0.5776040883811814, "flos": 18040515966720.0, "grad_norm": 1.672602422897938, "language_loss": 0.73896575, "learning_rate": 1.597150687927619e-06, "loss": 0.76049674, "num_input_tokens_seen": 206964065, "step": 9607, "time_per_iteration": 2.6057968139648438 }, { "auxiliary_loss_clip": 0.01076534, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.04220486, "balance_loss_mlp": 1.02155876, "epoch": 0.5776642116338494, "flos": 18624638937600.0, "grad_norm": 1.6326461875987317, "language_loss": 0.69385672, "learning_rate": 1.5967692162888664e-06, "loss": 0.71496868, "num_input_tokens_seen": 206981940, "step": 9608, "time_per_iteration": 2.784708023071289 }, { "auxiliary_loss_clip": 0.01084539, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.03977787, "balance_loss_mlp": 1.01979709, "epoch": 0.5777243348865173, "flos": 28402539517440.0, "grad_norm": 1.6850838728782904, "language_loss": 0.76766187, "learning_rate": 1.596387759940665e-06, "loss": 0.78883779, "num_input_tokens_seen": 207002365, "step": 9609, "time_per_iteration": 2.7439122200012207 }, { "auxiliary_loss_clip": 0.01090565, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.04297495, "balance_loss_mlp": 1.02154744, "epoch": 0.5777844581391853, "flos": 24024705805440.0, "grad_norm": 1.7626877282975804, "language_loss": 0.76948774, "learning_rate": 1.5960063188974808e-06, "loss": 0.79072988, "num_input_tokens_seen": 207021195, "step": 9610, "time_per_iteration": 2.748898506164551 }, { "auxiliary_loss_clip": 0.0108266, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 1.03885353, "balance_loss_mlp": 1.01625562, "epoch": 0.5778445813918534, "flos": 17777361951360.0, "grad_norm": 2.997373910278609, "language_loss": 0.68867594, "learning_rate": 1.5956248931737777e-06, "loss": 0.70980155, "num_input_tokens_seen": 207037465, "step": 9611, "time_per_iteration": 2.7037806510925293 }, { "auxiliary_loss_clip": 0.01103482, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.03957248, "balance_loss_mlp": 1.01607609, "epoch": 0.5779047046445213, "flos": 22233194046720.0, "grad_norm": 1.7435127822918648, "language_loss": 0.83207917, "learning_rate": 1.5952434827840185e-06, "loss": 0.85340309, "num_input_tokens_seen": 207054230, "step": 9612, "time_per_iteration": 2.6507790088653564 }, { "auxiliary_loss_clip": 0.01119736, "auxiliary_loss_mlp": 0.01030573, "balance_loss_clip": 1.04522681, "balance_loss_mlp": 1.01779914, "epoch": 0.5779648278971893, "flos": 21434361528960.0, "grad_norm": 1.6430153650030166, "language_loss": 0.79567391, "learning_rate": 1.594862087742667e-06, "loss": 0.81717706, "num_input_tokens_seen": 207073150, "step": 9613, "time_per_iteration": 2.679202079772949 }, { "auxiliary_loss_clip": 0.01107, "auxiliary_loss_mlp": 0.01032074, "balance_loss_clip": 1.04167032, "balance_loss_mlp": 1.02013552, "epoch": 0.5780249511498572, "flos": 19026120228480.0, "grad_norm": 1.7764623177151277, "language_loss": 0.77572, "learning_rate": 1.5944807080641863e-06, "loss": 0.7971108, "num_input_tokens_seen": 207090375, "step": 9614, "time_per_iteration": 2.6978790760040283 }, { "auxiliary_loss_clip": 0.01086413, "auxiliary_loss_mlp": 0.01033429, "balance_loss_clip": 1.04169321, "balance_loss_mlp": 1.020715, "epoch": 0.5780850744025252, "flos": 12124663752960.0, "grad_norm": 2.2008207091737093, "language_loss": 0.81598818, "learning_rate": 1.5940993437630375e-06, "loss": 0.83718669, "num_input_tokens_seen": 207106030, "step": 9615, "time_per_iteration": 2.7248473167419434 }, { "auxiliary_loss_clip": 0.01104516, "auxiliary_loss_mlp": 0.01032639, "balance_loss_clip": 1.03926682, "balance_loss_mlp": 1.01978278, "epoch": 0.5781451976551931, "flos": 25044425009280.0, "grad_norm": 1.4596798757523364, "language_loss": 0.67086244, "learning_rate": 1.5937179948536825e-06, "loss": 0.69223398, "num_input_tokens_seen": 207125435, "step": 9616, "time_per_iteration": 2.7597362995147705 }, { "auxiliary_loss_clip": 0.01106834, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.04345763, "balance_loss_mlp": 1.01935697, "epoch": 0.5782053209078611, "flos": 19245606284160.0, "grad_norm": 1.6175721800228267, "language_loss": 0.77521074, "learning_rate": 1.5933366613505812e-06, "loss": 0.79660165, "num_input_tokens_seen": 207145095, "step": 9617, "time_per_iteration": 2.8377323150634766 }, { "auxiliary_loss_clip": 0.01094943, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.04236281, "balance_loss_mlp": 1.02231812, "epoch": 0.578265444160529, "flos": 25993831340160.0, "grad_norm": 1.5155731004031996, "language_loss": 0.75113726, "learning_rate": 1.5929553432681947e-06, "loss": 0.77243888, "num_input_tokens_seen": 207166045, "step": 9618, "time_per_iteration": 2.665472984313965 }, { "auxiliary_loss_clip": 0.0111694, "auxiliary_loss_mlp": 0.01028064, "balance_loss_clip": 1.04336691, "balance_loss_mlp": 1.01594067, "epoch": 0.5783255674131971, "flos": 21798603394560.0, "grad_norm": 2.8083861615500445, "language_loss": 0.81775922, "learning_rate": 1.5925740406209826e-06, "loss": 0.83920932, "num_input_tokens_seen": 207185290, "step": 9619, "time_per_iteration": 2.6156482696533203 }, { "auxiliary_loss_clip": 0.01099184, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.04264188, "balance_loss_mlp": 1.0207603, "epoch": 0.578385690665865, "flos": 24789746603520.0, "grad_norm": 1.7083869874707343, "language_loss": 0.72963226, "learning_rate": 1.5921927534234039e-06, "loss": 0.75094968, "num_input_tokens_seen": 207205505, "step": 9620, "time_per_iteration": 2.7066376209259033 }, { "auxiliary_loss_clip": 0.01096891, "auxiliary_loss_mlp": 0.01030675, "balance_loss_clip": 1.04079533, "balance_loss_mlp": 1.01831877, "epoch": 0.578445813918533, "flos": 21212864311680.0, "grad_norm": 8.221069459540734, "language_loss": 0.76836628, "learning_rate": 1.591811481689916e-06, "loss": 0.78964192, "num_input_tokens_seen": 207225315, "step": 9621, "time_per_iteration": 2.746229887008667 }, { "auxiliary_loss_clip": 0.01054178, "auxiliary_loss_mlp": 0.0104303, "balance_loss_clip": 1.03465438, "balance_loss_mlp": 1.02871835, "epoch": 0.5785059371712009, "flos": 25046795306880.0, "grad_norm": 1.8397649270084009, "language_loss": 0.70646143, "learning_rate": 1.5914302254349787e-06, "loss": 0.72743344, "num_input_tokens_seen": 207247690, "step": 9622, "time_per_iteration": 2.7708969116210938 }, { "auxiliary_loss_clip": 0.01024027, "auxiliary_loss_mlp": 0.01003845, "balance_loss_clip": 1.01965523, "balance_loss_mlp": 1.00259304, "epoch": 0.5785660604238689, "flos": 70843172284800.0, "grad_norm": 0.7693139889423115, "language_loss": 0.55946988, "learning_rate": 1.5910489846730476e-06, "loss": 0.57974857, "num_input_tokens_seen": 207301735, "step": 9623, "time_per_iteration": 3.2743892669677734 }, { "auxiliary_loss_clip": 0.01084844, "auxiliary_loss_mlp": 0.01037987, "balance_loss_clip": 1.04244125, "balance_loss_mlp": 1.02392614, "epoch": 0.578626183676537, "flos": 31649977244160.0, "grad_norm": 2.0494784145389677, "language_loss": 0.71381462, "learning_rate": 1.5906677594185799e-06, "loss": 0.73504293, "num_input_tokens_seen": 207321240, "step": 9624, "time_per_iteration": 2.761348247528076 }, { "auxiliary_loss_clip": 0.01084192, "auxiliary_loss_mlp": 0.01039308, "balance_loss_clip": 1.03928137, "balance_loss_mlp": 1.02572453, "epoch": 0.5786863069292049, "flos": 21865181253120.0, "grad_norm": 2.0143803075104687, "language_loss": 0.82421607, "learning_rate": 1.5902865496860322e-06, "loss": 0.845451, "num_input_tokens_seen": 207339540, "step": 9625, "time_per_iteration": 4.566919326782227 }, { "auxiliary_loss_clip": 0.01116336, "auxiliary_loss_mlp": 0.01033709, "balance_loss_clip": 1.042328, "balance_loss_mlp": 1.02037549, "epoch": 0.5787464301818729, "flos": 23364954748800.0, "grad_norm": 1.438878240234706, "language_loss": 0.70356315, "learning_rate": 1.5899053554898591e-06, "loss": 0.72506356, "num_input_tokens_seen": 207360470, "step": 9626, "time_per_iteration": 2.6495361328125 }, { "auxiliary_loss_clip": 0.01095761, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.0427779, "balance_loss_mlp": 1.02442503, "epoch": 0.5788065534345408, "flos": 30004011394560.0, "grad_norm": 1.470476031522724, "language_loss": 0.72111934, "learning_rate": 1.5895241768445166e-06, "loss": 0.74244475, "num_input_tokens_seen": 207383080, "step": 9627, "time_per_iteration": 2.8884880542755127 }, { "auxiliary_loss_clip": 0.01104923, "auxiliary_loss_mlp": 0.0103066, "balance_loss_clip": 1.04045546, "balance_loss_mlp": 1.01872754, "epoch": 0.5788666766872088, "flos": 24527849564160.0, "grad_norm": 5.936898137308074, "language_loss": 0.83902895, "learning_rate": 1.589143013764458e-06, "loss": 0.8603847, "num_input_tokens_seen": 207401000, "step": 9628, "time_per_iteration": 2.746950626373291 }, { "auxiliary_loss_clip": 0.01093971, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.03782499, "balance_loss_mlp": 1.01856256, "epoch": 0.5789267999398767, "flos": 23732823888000.0, "grad_norm": 1.5735702827765405, "language_loss": 0.72260225, "learning_rate": 1.5887618662641376e-06, "loss": 0.74385989, "num_input_tokens_seen": 207419230, "step": 9629, "time_per_iteration": 4.194722652435303 }, { "auxiliary_loss_clip": 0.01096902, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.043715, "balance_loss_mlp": 1.02154994, "epoch": 0.5789869231925447, "flos": 21135045496320.0, "grad_norm": 2.2622526010485062, "language_loss": 0.74250948, "learning_rate": 1.5883807343580087e-06, "loss": 0.76382619, "num_input_tokens_seen": 207437615, "step": 9630, "time_per_iteration": 2.754213571548462 }, { "auxiliary_loss_clip": 0.01083141, "auxiliary_loss_mlp": 0.00770695, "balance_loss_clip": 1.0400362, "balance_loss_mlp": 1.00009274, "epoch": 0.5790470464452127, "flos": 21209632087680.0, "grad_norm": 1.6843723839781237, "language_loss": 0.78927267, "learning_rate": 1.587999618060523e-06, "loss": 0.8078109, "num_input_tokens_seen": 207457270, "step": 9631, "time_per_iteration": 2.757955551147461 }, { "auxiliary_loss_clip": 0.01116603, "auxiliary_loss_mlp": 0.01029207, "balance_loss_clip": 1.04169166, "balance_loss_mlp": 1.01674962, "epoch": 0.5791071696978807, "flos": 23404384903680.0, "grad_norm": 1.5220400196762927, "language_loss": 0.75543463, "learning_rate": 1.5876185173861333e-06, "loss": 0.77689266, "num_input_tokens_seen": 207477890, "step": 9632, "time_per_iteration": 2.5955679416656494 }, { "auxiliary_loss_clip": 0.01090291, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.04132521, "balance_loss_mlp": 1.01704419, "epoch": 0.5791672929505486, "flos": 24206521472640.0, "grad_norm": 2.166079097569446, "language_loss": 0.79483461, "learning_rate": 1.5872374323492915e-06, "loss": 0.81604362, "num_input_tokens_seen": 207497670, "step": 9633, "time_per_iteration": 3.0309832096099854 }, { "auxiliary_loss_clip": 0.01090489, "auxiliary_loss_mlp": 0.0104029, "balance_loss_clip": 1.04247785, "balance_loss_mlp": 1.02621174, "epoch": 0.5792274162032166, "flos": 24348871071360.0, "grad_norm": 1.6628345099755575, "language_loss": 0.77489352, "learning_rate": 1.5868563629644464e-06, "loss": 0.79620135, "num_input_tokens_seen": 207516105, "step": 9634, "time_per_iteration": 2.742804765701294 }, { "auxiliary_loss_clip": 0.01103303, "auxiliary_loss_mlp": 0.01039877, "balance_loss_clip": 1.04325557, "balance_loss_mlp": 1.0265131, "epoch": 0.5792875394558845, "flos": 20449403712000.0, "grad_norm": 2.0206641079359695, "language_loss": 0.63376474, "learning_rate": 1.5864753092460502e-06, "loss": 0.65519655, "num_input_tokens_seen": 207533685, "step": 9635, "time_per_iteration": 2.758554220199585 }, { "auxiliary_loss_clip": 0.01090702, "auxiliary_loss_mlp": 0.01040857, "balance_loss_clip": 1.0402782, "balance_loss_mlp": 1.02797055, "epoch": 0.5793476627085525, "flos": 24060329118720.0, "grad_norm": 1.4022803042470642, "language_loss": 0.77229643, "learning_rate": 1.5860942712085516e-06, "loss": 0.793612, "num_input_tokens_seen": 207552840, "step": 9636, "time_per_iteration": 2.6893904209136963 }, { "auxiliary_loss_clip": 0.01087778, "auxiliary_loss_mlp": 0.01033423, "balance_loss_clip": 1.03770018, "balance_loss_mlp": 1.02124608, "epoch": 0.5794077859612206, "flos": 22054287381120.0, "grad_norm": 1.6516741793622702, "language_loss": 0.68164212, "learning_rate": 1.5857132488663998e-06, "loss": 0.70285416, "num_input_tokens_seen": 207572095, "step": 9637, "time_per_iteration": 2.7232043743133545 }, { "auxiliary_loss_clip": 0.01076767, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.04049063, "balance_loss_mlp": 1.02214098, "epoch": 0.5794679092138885, "flos": 11434855991040.0, "grad_norm": 2.739438707467598, "language_loss": 0.72531378, "learning_rate": 1.585332242234043e-06, "loss": 0.74643862, "num_input_tokens_seen": 207587495, "step": 9638, "time_per_iteration": 2.819202423095703 }, { "auxiliary_loss_clip": 0.01107966, "auxiliary_loss_mlp": 0.0103288, "balance_loss_clip": 1.04470587, "balance_loss_mlp": 1.02056587, "epoch": 0.5795280324665565, "flos": 18880215183360.0, "grad_norm": 1.716063507275685, "language_loss": 0.72309893, "learning_rate": 1.5849512513259291e-06, "loss": 0.74450737, "num_input_tokens_seen": 207606795, "step": 9639, "time_per_iteration": 2.683488130569458 }, { "auxiliary_loss_clip": 0.01094721, "auxiliary_loss_mlp": 0.01039725, "balance_loss_clip": 1.0399698, "balance_loss_mlp": 1.02682686, "epoch": 0.5795881557192244, "flos": 13005947940480.0, "grad_norm": 1.8567608995858262, "language_loss": 0.70044529, "learning_rate": 1.5845702761565054e-06, "loss": 0.72178972, "num_input_tokens_seen": 207623620, "step": 9640, "time_per_iteration": 2.672945737838745 }, { "auxiliary_loss_clip": 0.01096614, "auxiliary_loss_mlp": 0.01042841, "balance_loss_clip": 1.0413754, "balance_loss_mlp": 1.02858996, "epoch": 0.5796482789718924, "flos": 19932397303680.0, "grad_norm": 2.4123450370287958, "language_loss": 0.7753675, "learning_rate": 1.5841893167402183e-06, "loss": 0.79676205, "num_input_tokens_seen": 207639380, "step": 9641, "time_per_iteration": 2.688164472579956 }, { "auxiliary_loss_clip": 0.01119399, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.04407382, "balance_loss_mlp": 1.02385926, "epoch": 0.5797084022245603, "flos": 21650794928640.0, "grad_norm": 1.8311937480298248, "language_loss": 0.73798597, "learning_rate": 1.5838083730915143e-06, "loss": 0.75954694, "num_input_tokens_seen": 207657915, "step": 9642, "time_per_iteration": 2.624521017074585 }, { "auxiliary_loss_clip": 0.01102536, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.04526544, "balance_loss_mlp": 1.02577972, "epoch": 0.5797685254772283, "flos": 26031573555840.0, "grad_norm": 5.942363913556237, "language_loss": 0.73259425, "learning_rate": 1.5834274452248378e-06, "loss": 0.75400496, "num_input_tokens_seen": 207678620, "step": 9643, "time_per_iteration": 2.715672254562378 }, { "auxiliary_loss_clip": 0.01121691, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.04416251, "balance_loss_mlp": 1.02062845, "epoch": 0.5798286487298963, "flos": 22705167778560.0, "grad_norm": 1.8659489070776951, "language_loss": 0.67181957, "learning_rate": 1.5830465331546352e-06, "loss": 0.69337404, "num_input_tokens_seen": 207696980, "step": 9644, "time_per_iteration": 2.6038551330566406 }, { "auxiliary_loss_clip": 0.01116177, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.04553771, "balance_loss_mlp": 1.02103531, "epoch": 0.5798887719825643, "flos": 23148988225920.0, "grad_norm": 2.1679759651263044, "language_loss": 0.85346615, "learning_rate": 1.5826656368953496e-06, "loss": 0.8749733, "num_input_tokens_seen": 207714065, "step": 9645, "time_per_iteration": 2.667259931564331 }, { "auxiliary_loss_clip": 0.01122251, "auxiliary_loss_mlp": 0.01030168, "balance_loss_clip": 1.04620934, "balance_loss_mlp": 1.01735902, "epoch": 0.5799488952352322, "flos": 24426043441920.0, "grad_norm": 2.1123906469300935, "language_loss": 0.75605559, "learning_rate": 1.5822847564614244e-06, "loss": 0.77757978, "num_input_tokens_seen": 207734720, "step": 9646, "time_per_iteration": 2.559659481048584 }, { "auxiliary_loss_clip": 0.01099999, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.04342473, "balance_loss_mlp": 1.02371335, "epoch": 0.5800090184879002, "flos": 38395903829760.0, "grad_norm": 1.698650252646941, "language_loss": 0.59495735, "learning_rate": 1.5819038918673038e-06, "loss": 0.61633444, "num_input_tokens_seen": 207755435, "step": 9647, "time_per_iteration": 2.7939651012420654 }, { "auxiliary_loss_clip": 0.0107788, "auxiliary_loss_mlp": 0.0105249, "balance_loss_clip": 1.04142165, "balance_loss_mlp": 1.03642702, "epoch": 0.5800691417405681, "flos": 19784840232960.0, "grad_norm": 1.6988187353884752, "language_loss": 0.84499681, "learning_rate": 1.5815230431274288e-06, "loss": 0.86630046, "num_input_tokens_seen": 207773570, "step": 9648, "time_per_iteration": 2.7750449180603027 }, { "auxiliary_loss_clip": 0.01032269, "auxiliary_loss_mlp": 0.01003411, "balance_loss_clip": 1.01776171, "balance_loss_mlp": 1.0021714, "epoch": 0.5801292649932361, "flos": 70314565783680.0, "grad_norm": 0.8432525659417933, "language_loss": 0.62929457, "learning_rate": 1.581142210256242e-06, "loss": 0.64965135, "num_input_tokens_seen": 207830095, "step": 9649, "time_per_iteration": 3.21219801902771 }, { "auxiliary_loss_clip": 0.01078275, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.03673697, "balance_loss_mlp": 1.02525127, "epoch": 0.5801893882459042, "flos": 18734812928640.0, "grad_norm": 1.587591091557097, "language_loss": 0.82462633, "learning_rate": 1.5807613932681857e-06, "loss": 0.84579957, "num_input_tokens_seen": 207848555, "step": 9650, "time_per_iteration": 2.8374016284942627 }, { "auxiliary_loss_clip": 0.0108491, "auxiliary_loss_mlp": 0.01036427, "balance_loss_clip": 1.03912425, "balance_loss_mlp": 1.0230515, "epoch": 0.5802495114985721, "flos": 15596507698560.0, "grad_norm": 3.679017793776146, "language_loss": 0.7786057, "learning_rate": 1.580380592177698e-06, "loss": 0.79981905, "num_input_tokens_seen": 207867060, "step": 9651, "time_per_iteration": 2.728508949279785 }, { "auxiliary_loss_clip": 0.01103104, "auxiliary_loss_mlp": 0.01039294, "balance_loss_clip": 1.04429924, "balance_loss_mlp": 1.02555537, "epoch": 0.5803096347512401, "flos": 18255405081600.0, "grad_norm": 1.8929228958840072, "language_loss": 0.74471784, "learning_rate": 1.5799998069992213e-06, "loss": 0.76614177, "num_input_tokens_seen": 207884520, "step": 9652, "time_per_iteration": 2.6977131366729736 }, { "auxiliary_loss_clip": 0.01092621, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.04145324, "balance_loss_mlp": 1.0150857, "epoch": 0.580369758003908, "flos": 22893160584960.0, "grad_norm": 2.031010770866024, "language_loss": 0.7703613, "learning_rate": 1.579619037747193e-06, "loss": 0.79157287, "num_input_tokens_seen": 207905370, "step": 9653, "time_per_iteration": 2.7233431339263916 }, { "auxiliary_loss_clip": 0.01121993, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.04465187, "balance_loss_mlp": 1.02035964, "epoch": 0.580429881256576, "flos": 18697681244160.0, "grad_norm": 1.9204408515131524, "language_loss": 0.74248046, "learning_rate": 1.5792382844360534e-06, "loss": 0.76404566, "num_input_tokens_seen": 207923790, "step": 9654, "time_per_iteration": 2.595330238342285 }, { "auxiliary_loss_clip": 0.01054131, "auxiliary_loss_mlp": 0.01037747, "balance_loss_clip": 1.04102838, "balance_loss_mlp": 1.02466965, "epoch": 0.5804900045092439, "flos": 24681978823680.0, "grad_norm": 1.627345886244452, "language_loss": 0.70138443, "learning_rate": 1.5788575470802408e-06, "loss": 0.72230321, "num_input_tokens_seen": 207942335, "step": 9655, "time_per_iteration": 2.8097565174102783 }, { "auxiliary_loss_clip": 0.01125048, "auxiliary_loss_mlp": 0.01038459, "balance_loss_clip": 1.04366922, "balance_loss_mlp": 1.02495217, "epoch": 0.580550127761912, "flos": 23112790295040.0, "grad_norm": 1.8908787804935243, "language_loss": 0.69673449, "learning_rate": 1.5784768256941915e-06, "loss": 0.71836954, "num_input_tokens_seen": 207961975, "step": 9656, "time_per_iteration": 2.6233110427856445 }, { "auxiliary_loss_clip": 0.01107455, "auxiliary_loss_mlp": 0.01034723, "balance_loss_clip": 1.04619503, "balance_loss_mlp": 1.02208686, "epoch": 0.5806102510145799, "flos": 18475681236480.0, "grad_norm": 1.5577317145380594, "language_loss": 0.71972537, "learning_rate": 1.5780961202923433e-06, "loss": 0.7411471, "num_input_tokens_seen": 207979520, "step": 9657, "time_per_iteration": 2.616337537765503 }, { "auxiliary_loss_clip": 0.01111294, "auxiliary_loss_mlp": 0.01037621, "balance_loss_clip": 1.04370785, "balance_loss_mlp": 1.0237869, "epoch": 0.5806703742672479, "flos": 23915645136000.0, "grad_norm": 1.9819747060784367, "language_loss": 0.70975304, "learning_rate": 1.5777154308891328e-06, "loss": 0.73124212, "num_input_tokens_seen": 207998375, "step": 9658, "time_per_iteration": 2.6383109092712402 }, { "auxiliary_loss_clip": 0.01031383, "auxiliary_loss_mlp": 0.01001283, "balance_loss_clip": 1.01641989, "balance_loss_mlp": 1.00009727, "epoch": 0.5807304975199158, "flos": 66311999412480.0, "grad_norm": 0.7167527277810166, "language_loss": 0.5357672, "learning_rate": 1.5773347574989953e-06, "loss": 0.55609381, "num_input_tokens_seen": 208060605, "step": 9659, "time_per_iteration": 3.1848106384277344 }, { "auxiliary_loss_clip": 0.0111162, "auxiliary_loss_mlp": 0.01040087, "balance_loss_clip": 1.04272866, "balance_loss_mlp": 1.02638984, "epoch": 0.5807906207725838, "flos": 31722444933120.0, "grad_norm": 1.8377682291636406, "language_loss": 0.61835778, "learning_rate": 1.576954100136366e-06, "loss": 0.63987488, "num_input_tokens_seen": 208080320, "step": 9660, "time_per_iteration": 2.7875893115997314 }, { "auxiliary_loss_clip": 0.01108259, "auxiliary_loss_mlp": 0.01035512, "balance_loss_clip": 1.03933334, "balance_loss_mlp": 1.02131391, "epoch": 0.5808507440252517, "flos": 23801161512960.0, "grad_norm": 1.4582842247400174, "language_loss": 0.65268373, "learning_rate": 1.5765734588156797e-06, "loss": 0.6741215, "num_input_tokens_seen": 208099305, "step": 9661, "time_per_iteration": 2.640033721923828 }, { "auxiliary_loss_clip": 0.01060469, "auxiliary_loss_mlp": 0.01027812, "balance_loss_clip": 1.03416336, "balance_loss_mlp": 1.01562285, "epoch": 0.5809108672779197, "flos": 13698449222400.0, "grad_norm": 13.818010552074016, "language_loss": 0.74664855, "learning_rate": 1.5761928335513704e-06, "loss": 0.76753139, "num_input_tokens_seen": 208116960, "step": 9662, "time_per_iteration": 2.78912091255188 }, { "auxiliary_loss_clip": 0.0103935, "auxiliary_loss_mlp": 0.01000149, "balance_loss_clip": 1.01472378, "balance_loss_mlp": 0.99883789, "epoch": 0.5809709905305876, "flos": 69134866381440.0, "grad_norm": 0.8720581464390529, "language_loss": 0.58341724, "learning_rate": 1.5758122243578709e-06, "loss": 0.60381216, "num_input_tokens_seen": 208182190, "step": 9663, "time_per_iteration": 3.2206766605377197 }, { "auxiliary_loss_clip": 0.01099545, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.04324317, "balance_loss_mlp": 1.02127123, "epoch": 0.5810311137832557, "flos": 19827538525440.0, "grad_norm": 2.2012699158511073, "language_loss": 0.82044816, "learning_rate": 1.5754316312496152e-06, "loss": 0.84178805, "num_input_tokens_seen": 208197015, "step": 9664, "time_per_iteration": 5.9192726612091064 }, { "auxiliary_loss_clip": 0.01089768, "auxiliary_loss_mlp": 0.00771212, "balance_loss_clip": 1.03780138, "balance_loss_mlp": 1.0000962, "epoch": 0.5810912370359237, "flos": 29238503719680.0, "grad_norm": 4.331316838714664, "language_loss": 0.81583905, "learning_rate": 1.5750510542410337e-06, "loss": 0.83444887, "num_input_tokens_seen": 208215795, "step": 9665, "time_per_iteration": 2.7813103199005127 }, { "auxiliary_loss_clip": 0.01104588, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.0461179, "balance_loss_mlp": 1.02123475, "epoch": 0.5811513602885916, "flos": 22785572373120.0, "grad_norm": 1.7229241789226792, "language_loss": 0.81392443, "learning_rate": 1.5746704933465599e-06, "loss": 0.83532941, "num_input_tokens_seen": 208234655, "step": 9666, "time_per_iteration": 2.7249464988708496 }, { "auxiliary_loss_clip": 0.01101961, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 1.04181623, "balance_loss_mlp": 1.02339292, "epoch": 0.5812114835412596, "flos": 18734346051840.0, "grad_norm": 1.7975787773576042, "language_loss": 0.80100554, "learning_rate": 1.5742899485806227e-06, "loss": 0.82238424, "num_input_tokens_seen": 208251300, "step": 9667, "time_per_iteration": 2.600576639175415 }, { "auxiliary_loss_clip": 0.01117108, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.04451418, "balance_loss_mlp": 1.02237177, "epoch": 0.5812716067939275, "flos": 26431295080320.0, "grad_norm": 1.4400303722288619, "language_loss": 0.78809667, "learning_rate": 1.573909419957653e-06, "loss": 0.80964047, "num_input_tokens_seen": 208272685, "step": 9668, "time_per_iteration": 4.22690486907959 }, { "auxiliary_loss_clip": 0.01098312, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.04209864, "balance_loss_mlp": 1.02148795, "epoch": 0.5813317300465956, "flos": 43397865285120.0, "grad_norm": 1.8465293320084986, "language_loss": 0.64245093, "learning_rate": 1.5735289074920819e-06, "loss": 0.66377068, "num_input_tokens_seen": 208294315, "step": 9669, "time_per_iteration": 2.8652687072753906 }, { "auxiliary_loss_clip": 0.01069091, "auxiliary_loss_mlp": 0.01041038, "balance_loss_clip": 1.03997946, "balance_loss_mlp": 1.02672672, "epoch": 0.5813918532992635, "flos": 24785472885120.0, "grad_norm": 1.4411692985545548, "language_loss": 0.7307651, "learning_rate": 1.5731484111983363e-06, "loss": 0.75186646, "num_input_tokens_seen": 208315610, "step": 9670, "time_per_iteration": 2.829456329345703 }, { "auxiliary_loss_clip": 0.01086705, "auxiliary_loss_mlp": 0.01034661, "balance_loss_clip": 1.03999424, "balance_loss_mlp": 1.02194691, "epoch": 0.5814519765519315, "flos": 22857357703680.0, "grad_norm": 2.0479138475359844, "language_loss": 0.7874738, "learning_rate": 1.5727679310908464e-06, "loss": 0.80868745, "num_input_tokens_seen": 208334725, "step": 9671, "time_per_iteration": 2.7991318702697754 }, { "auxiliary_loss_clip": 0.0107985, "auxiliary_loss_mlp": 0.01044541, "balance_loss_clip": 1.0416975, "balance_loss_mlp": 1.02910936, "epoch": 0.5815120998045994, "flos": 24060831909120.0, "grad_norm": 1.9838213735263186, "language_loss": 0.61369407, "learning_rate": 1.5723874671840399e-06, "loss": 0.634938, "num_input_tokens_seen": 208353825, "step": 9672, "time_per_iteration": 2.8498592376708984 }, { "auxiliary_loss_clip": 0.01065855, "auxiliary_loss_mlp": 0.01038617, "balance_loss_clip": 1.04000103, "balance_loss_mlp": 1.02496195, "epoch": 0.5815722230572674, "flos": 24279491952000.0, "grad_norm": 2.0691966635939365, "language_loss": 0.81397313, "learning_rate": 1.572007019492342e-06, "loss": 0.83501786, "num_input_tokens_seen": 208374160, "step": 9673, "time_per_iteration": 2.8208439350128174 }, { "auxiliary_loss_clip": 0.0108779, "auxiliary_loss_mlp": 0.01038429, "balance_loss_clip": 1.04342866, "balance_loss_mlp": 1.0242784, "epoch": 0.5816323463099353, "flos": 22200371994240.0, "grad_norm": 1.86389400550988, "language_loss": 0.88404083, "learning_rate": 1.5716265880301817e-06, "loss": 0.905303, "num_input_tokens_seen": 208392105, "step": 9674, "time_per_iteration": 2.7522170543670654 }, { "auxiliary_loss_clip": 0.01120808, "auxiliary_loss_mlp": 0.00770234, "balance_loss_clip": 1.04347241, "balance_loss_mlp": 1.00026846, "epoch": 0.5816924695626033, "flos": 24134448833280.0, "grad_norm": 1.4106486697266074, "language_loss": 0.78974068, "learning_rate": 1.571246172811984e-06, "loss": 0.80865109, "num_input_tokens_seen": 208411755, "step": 9675, "time_per_iteration": 2.6588079929351807 }, { "auxiliary_loss_clip": 0.01106314, "auxiliary_loss_mlp": 0.01035578, "balance_loss_clip": 1.04066849, "balance_loss_mlp": 1.02178526, "epoch": 0.5817525928152713, "flos": 21324223451520.0, "grad_norm": 1.863415006013356, "language_loss": 0.70507479, "learning_rate": 1.5708657738521748e-06, "loss": 0.72649372, "num_input_tokens_seen": 208429995, "step": 9676, "time_per_iteration": 2.64201283454895 }, { "auxiliary_loss_clip": 0.01058756, "auxiliary_loss_mlp": 0.01033649, "balance_loss_clip": 1.0396111, "balance_loss_mlp": 1.02030993, "epoch": 0.5818127160679393, "flos": 26934510666240.0, "grad_norm": 2.6670948708651636, "language_loss": 0.63821483, "learning_rate": 1.5704853911651779e-06, "loss": 0.65913892, "num_input_tokens_seen": 208443655, "step": 9677, "time_per_iteration": 2.818047523498535 }, { "auxiliary_loss_clip": 0.01020823, "auxiliary_loss_mlp": 0.01010612, "balance_loss_clip": 1.02114296, "balance_loss_mlp": 1.00937831, "epoch": 0.5818728393206073, "flos": 63918626342400.0, "grad_norm": 0.8047469836092298, "language_loss": 0.54188442, "learning_rate": 1.5701050247654182e-06, "loss": 0.56219876, "num_input_tokens_seen": 208498405, "step": 9678, "time_per_iteration": 3.2669215202331543 }, { "auxiliary_loss_clip": 0.01019281, "auxiliary_loss_mlp": 0.0100911, "balance_loss_clip": 1.01330447, "balance_loss_mlp": 1.00782299, "epoch": 0.5819329625732752, "flos": 64954108638720.0, "grad_norm": 0.7377482843760589, "language_loss": 0.56218177, "learning_rate": 1.569724674667319e-06, "loss": 0.58246571, "num_input_tokens_seen": 208559075, "step": 9679, "time_per_iteration": 3.130009174346924 }, { "auxiliary_loss_clip": 0.01118656, "auxiliary_loss_mlp": 0.01031808, "balance_loss_clip": 1.04236495, "balance_loss_mlp": 1.01982164, "epoch": 0.5819930858259432, "flos": 21215270522880.0, "grad_norm": 1.65967573029577, "language_loss": 0.65638047, "learning_rate": 1.5693443408853032e-06, "loss": 0.67788512, "num_input_tokens_seen": 208577770, "step": 9680, "time_per_iteration": 2.63765811920166 }, { "auxiliary_loss_clip": 0.01095966, "auxiliary_loss_mlp": 0.01030097, "balance_loss_clip": 1.04104781, "balance_loss_mlp": 1.01797342, "epoch": 0.5820532090786111, "flos": 19458520151040.0, "grad_norm": 1.9145859585775957, "language_loss": 0.83394265, "learning_rate": 1.5689640234337933e-06, "loss": 0.85520327, "num_input_tokens_seen": 208595110, "step": 9681, "time_per_iteration": 2.6886913776397705 }, { "auxiliary_loss_clip": 0.0112012, "auxiliary_loss_mlp": 0.01033373, "balance_loss_clip": 1.04263687, "balance_loss_mlp": 1.02064157, "epoch": 0.5821133323312792, "flos": 17712615686400.0, "grad_norm": 1.6180763493056738, "language_loss": 0.76095504, "learning_rate": 1.5685837223272109e-06, "loss": 0.78248996, "num_input_tokens_seen": 208612080, "step": 9682, "time_per_iteration": 2.616946220397949 }, { "auxiliary_loss_clip": 0.01054825, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.03545356, "balance_loss_mlp": 1.0205251, "epoch": 0.5821734555839471, "flos": 24571804832640.0, "grad_norm": 1.897202579717977, "language_loss": 0.7534517, "learning_rate": 1.568203437579977e-06, "loss": 0.77435744, "num_input_tokens_seen": 208630235, "step": 9683, "time_per_iteration": 2.7519571781158447 }, { "auxiliary_loss_clip": 0.01098515, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.04482961, "balance_loss_mlp": 1.0191133, "epoch": 0.5822335788366151, "flos": 22382259488640.0, "grad_norm": 1.7304603651050097, "language_loss": 0.73967683, "learning_rate": 1.5678231692065116e-06, "loss": 0.76098949, "num_input_tokens_seen": 208647925, "step": 9684, "time_per_iteration": 2.585839033126831 }, { "auxiliary_loss_clip": 0.01095398, "auxiliary_loss_mlp": 0.01040225, "balance_loss_clip": 1.04306865, "balance_loss_mlp": 1.02714145, "epoch": 0.582293702089283, "flos": 26722494639360.0, "grad_norm": 1.9911340281622987, "language_loss": 0.78017914, "learning_rate": 1.5674429172212348e-06, "loss": 0.80153537, "num_input_tokens_seen": 208666180, "step": 9685, "time_per_iteration": 2.6262004375457764 }, { "auxiliary_loss_clip": 0.01119541, "auxiliary_loss_mlp": 0.01037721, "balance_loss_clip": 1.04301238, "balance_loss_mlp": 1.02463138, "epoch": 0.582353825341951, "flos": 17348661129600.0, "grad_norm": 1.534499166945951, "language_loss": 0.75514185, "learning_rate": 1.5670626816385667e-06, "loss": 0.7767145, "num_input_tokens_seen": 208684240, "step": 9686, "time_per_iteration": 2.4799644947052 }, { "auxiliary_loss_clip": 0.01029752, "auxiliary_loss_mlp": 0.00999968, "balance_loss_clip": 1.01506877, "balance_loss_mlp": 0.99893057, "epoch": 0.5824139485946189, "flos": 55473261534720.0, "grad_norm": 0.8130045203422185, "language_loss": 0.57394326, "learning_rate": 1.5666824624729244e-06, "loss": 0.59424043, "num_input_tokens_seen": 208736090, "step": 9687, "time_per_iteration": 2.9722440242767334 }, { "auxiliary_loss_clip": 0.01079028, "auxiliary_loss_mlp": 0.01038321, "balance_loss_clip": 1.03950655, "balance_loss_mlp": 1.02262747, "epoch": 0.582474071847287, "flos": 20303031790080.0, "grad_norm": 1.7516030258378996, "language_loss": 0.70063931, "learning_rate": 1.566302259738727e-06, "loss": 0.72181278, "num_input_tokens_seen": 208754600, "step": 9688, "time_per_iteration": 2.802976369857788 }, { "auxiliary_loss_clip": 0.01110989, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.04311526, "balance_loss_mlp": 1.02075768, "epoch": 0.5825341950999549, "flos": 23878010661120.0, "grad_norm": 2.126323858989827, "language_loss": 0.65013343, "learning_rate": 1.5659220734503918e-06, "loss": 0.67157751, "num_input_tokens_seen": 208773140, "step": 9689, "time_per_iteration": 2.6299288272857666 }, { "auxiliary_loss_clip": 0.01095981, "auxiliary_loss_mlp": 0.00770437, "balance_loss_clip": 1.04142618, "balance_loss_mlp": 1.00009274, "epoch": 0.5825943183526229, "flos": 23113041690240.0, "grad_norm": 1.599269729220552, "language_loss": 0.7352339, "learning_rate": 1.5655419036223341e-06, "loss": 0.75389808, "num_input_tokens_seen": 208793410, "step": 9690, "time_per_iteration": 2.6903798580169678 }, { "auxiliary_loss_clip": 0.01096107, "auxiliary_loss_mlp": 0.01038726, "balance_loss_clip": 1.03903055, "balance_loss_mlp": 1.02372348, "epoch": 0.5826544416052909, "flos": 22857429530880.0, "grad_norm": 1.61399606195473, "language_loss": 0.75654376, "learning_rate": 1.5651617502689717e-06, "loss": 0.77789205, "num_input_tokens_seen": 208811920, "step": 9691, "time_per_iteration": 2.7056210041046143 }, { "auxiliary_loss_clip": 0.01109061, "auxiliary_loss_mlp": 0.01032641, "balance_loss_clip": 1.04082966, "balance_loss_mlp": 1.01972461, "epoch": 0.5827145648579588, "flos": 31501845555840.0, "grad_norm": 2.2562223304416755, "language_loss": 0.80682158, "learning_rate": 1.5647816134047184e-06, "loss": 0.82823855, "num_input_tokens_seen": 208834720, "step": 9692, "time_per_iteration": 2.7577641010284424 }, { "auxiliary_loss_clip": 0.01028968, "auxiliary_loss_mlp": 0.01002786, "balance_loss_clip": 1.01420581, "balance_loss_mlp": 1.00161159, "epoch": 0.5827746881106268, "flos": 69811817074560.0, "grad_norm": 0.7560919402716259, "language_loss": 0.5693723, "learning_rate": 1.5644014930439907e-06, "loss": 0.58968985, "num_input_tokens_seen": 208898415, "step": 9693, "time_per_iteration": 3.145176887512207 }, { "auxiliary_loss_clip": 0.01105496, "auxiliary_loss_mlp": 0.0076985, "balance_loss_clip": 1.04020321, "balance_loss_mlp": 1.00010538, "epoch": 0.5828348113632947, "flos": 23112395245440.0, "grad_norm": 2.61225629767126, "language_loss": 0.79375291, "learning_rate": 1.5640213892012025e-06, "loss": 0.81250644, "num_input_tokens_seen": 208919045, "step": 9694, "time_per_iteration": 2.7443995475769043 }, { "auxiliary_loss_clip": 0.01083069, "auxiliary_loss_mlp": 0.01042673, "balance_loss_clip": 1.03822398, "balance_loss_mlp": 1.02909541, "epoch": 0.5828949346159628, "flos": 21873082245120.0, "grad_norm": 1.4254101237523094, "language_loss": 0.76205015, "learning_rate": 1.5636413018907656e-06, "loss": 0.78330755, "num_input_tokens_seen": 208939375, "step": 9695, "time_per_iteration": 2.688107490539551 }, { "auxiliary_loss_clip": 0.01027446, "auxiliary_loss_mlp": 0.01003052, "balance_loss_clip": 1.01271224, "balance_loss_mlp": 1.00191391, "epoch": 0.5829550578686307, "flos": 65962553950080.0, "grad_norm": 0.7742487055111029, "language_loss": 0.54982823, "learning_rate": 1.563261231127095e-06, "loss": 0.57013327, "num_input_tokens_seen": 209004760, "step": 9696, "time_per_iteration": 3.239593029022217 }, { "auxiliary_loss_clip": 0.0108245, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.04170382, "balance_loss_mlp": 1.01751041, "epoch": 0.5830151811212987, "flos": 16289799079680.0, "grad_norm": 2.124266497676036, "language_loss": 0.76664579, "learning_rate": 1.5628811769246021e-06, "loss": 0.78777242, "num_input_tokens_seen": 209022930, "step": 9697, "time_per_iteration": 2.6790308952331543 }, { "auxiliary_loss_clip": 0.01121339, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.04233479, "balance_loss_mlp": 1.02154899, "epoch": 0.5830753043739666, "flos": 24168851084160.0, "grad_norm": 1.5579611092820027, "language_loss": 0.77714729, "learning_rate": 1.5625011392976991e-06, "loss": 0.79871726, "num_input_tokens_seen": 209043740, "step": 9698, "time_per_iteration": 2.635885715484619 }, { "auxiliary_loss_clip": 0.01079274, "auxiliary_loss_mlp": 0.01038337, "balance_loss_clip": 1.0413661, "balance_loss_mlp": 1.02498519, "epoch": 0.5831354276266346, "flos": 27059050097280.0, "grad_norm": 1.5784163010462595, "language_loss": 0.84167337, "learning_rate": 1.5621211182607966e-06, "loss": 0.86284947, "num_input_tokens_seen": 209068885, "step": 9699, "time_per_iteration": 2.8312487602233887 }, { "auxiliary_loss_clip": 0.01095092, "auxiliary_loss_mlp": 0.010366, "balance_loss_clip": 1.03954756, "balance_loss_mlp": 1.02281952, "epoch": 0.5831955508793025, "flos": 23623475909760.0, "grad_norm": 2.065302984121428, "language_loss": 0.65489984, "learning_rate": 1.561741113828305e-06, "loss": 0.67621672, "num_input_tokens_seen": 209087340, "step": 9700, "time_per_iteration": 2.784442901611328 }, { "auxiliary_loss_clip": 0.01108875, "auxiliary_loss_mlp": 0.01034575, "balance_loss_clip": 1.04089403, "balance_loss_mlp": 1.02150953, "epoch": 0.5832556741319705, "flos": 24973250209920.0, "grad_norm": 1.5991522353668115, "language_loss": 0.71547067, "learning_rate": 1.5613611260146344e-06, "loss": 0.73690522, "num_input_tokens_seen": 209108840, "step": 9701, "time_per_iteration": 2.6895313262939453 }, { "auxiliary_loss_clip": 0.01096283, "auxiliary_loss_mlp": 0.01041435, "balance_loss_clip": 1.04180253, "balance_loss_mlp": 1.02841139, "epoch": 0.5833157973846385, "flos": 23221563655680.0, "grad_norm": 1.6635802287235106, "language_loss": 0.85541105, "learning_rate": 1.5609811548341936e-06, "loss": 0.87678826, "num_input_tokens_seen": 209127985, "step": 9702, "time_per_iteration": 2.6746225357055664 }, { "auxiliary_loss_clip": 0.01102319, "auxiliary_loss_mlp": 0.01033634, "balance_loss_clip": 1.04071856, "balance_loss_mlp": 1.02131367, "epoch": 0.5833759206373065, "flos": 21977941023360.0, "grad_norm": 1.4183987857502756, "language_loss": 0.77847046, "learning_rate": 1.560601200301392e-06, "loss": 0.79983002, "num_input_tokens_seen": 209146885, "step": 9703, "time_per_iteration": 4.3035502433776855 }, { "auxiliary_loss_clip": 0.01122779, "auxiliary_loss_mlp": 0.01034804, "balance_loss_clip": 1.04359257, "balance_loss_mlp": 1.0208385, "epoch": 0.5834360438899745, "flos": 21762405463680.0, "grad_norm": 1.8064531110729998, "language_loss": 0.71067387, "learning_rate": 1.5602212624306366e-06, "loss": 0.73224974, "num_input_tokens_seen": 209166130, "step": 9704, "time_per_iteration": 4.107022762298584 }, { "auxiliary_loss_clip": 0.01094563, "auxiliary_loss_mlp": 0.01038062, "balance_loss_clip": 1.04187346, "balance_loss_mlp": 1.02561641, "epoch": 0.5834961671426424, "flos": 15992566035840.0, "grad_norm": 1.6675564380890735, "language_loss": 0.81363106, "learning_rate": 1.559841341236335e-06, "loss": 0.8349573, "num_input_tokens_seen": 209183350, "step": 9705, "time_per_iteration": 2.7058465480804443 }, { "auxiliary_loss_clip": 0.010702, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.03672004, "balance_loss_mlp": 1.02125466, "epoch": 0.5835562903953104, "flos": 22818322598400.0, "grad_norm": 1.7137147806220967, "language_loss": 0.80614948, "learning_rate": 1.5594614367328937e-06, "loss": 0.82719278, "num_input_tokens_seen": 209203945, "step": 9706, "time_per_iteration": 2.776280164718628 }, { "auxiliary_loss_clip": 0.01105997, "auxiliary_loss_mlp": 0.0103669, "balance_loss_clip": 1.04129124, "balance_loss_mlp": 1.02315402, "epoch": 0.5836164136479783, "flos": 48468056624640.0, "grad_norm": 2.0771057832537414, "language_loss": 0.74647468, "learning_rate": 1.5590815489347187e-06, "loss": 0.76790154, "num_input_tokens_seen": 209227080, "step": 9707, "time_per_iteration": 2.857609272003174 }, { "auxiliary_loss_clip": 0.01081909, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.03649998, "balance_loss_mlp": 1.01878548, "epoch": 0.5836765369006464, "flos": 26905998245760.0, "grad_norm": 2.7159127892637067, "language_loss": 0.81819087, "learning_rate": 1.5587016778562163e-06, "loss": 0.83932543, "num_input_tokens_seen": 209248170, "step": 9708, "time_per_iteration": 4.28432822227478 }, { "auxiliary_loss_clip": 0.01102304, "auxiliary_loss_mlp": 0.01032201, "balance_loss_clip": 1.0439347, "balance_loss_mlp": 1.01914191, "epoch": 0.5837366601533143, "flos": 20084048524800.0, "grad_norm": 1.4146539482815383, "language_loss": 0.78367102, "learning_rate": 1.5583218235117896e-06, "loss": 0.80501604, "num_input_tokens_seen": 209267730, "step": 9709, "time_per_iteration": 2.6337647438049316 }, { "auxiliary_loss_clip": 0.01017869, "auxiliary_loss_mlp": 0.00999553, "balance_loss_clip": 1.01163578, "balance_loss_mlp": 0.99844998, "epoch": 0.5837967834059823, "flos": 65363885971200.0, "grad_norm": 0.7723563596720286, "language_loss": 0.5654794, "learning_rate": 1.557941985915844e-06, "loss": 0.58565366, "num_input_tokens_seen": 209332510, "step": 9710, "time_per_iteration": 3.255643844604492 }, { "auxiliary_loss_clip": 0.01084064, "auxiliary_loss_mlp": 0.01035883, "balance_loss_clip": 1.03939962, "balance_loss_mlp": 1.02429581, "epoch": 0.5838569066586502, "flos": 25338641310720.0, "grad_norm": 1.5220841159249796, "language_loss": 0.6560964, "learning_rate": 1.5575621650827833e-06, "loss": 0.67729586, "num_input_tokens_seen": 209353355, "step": 9711, "time_per_iteration": 2.7771286964416504 }, { "auxiliary_loss_clip": 0.01124372, "auxiliary_loss_mlp": 0.01037032, "balance_loss_clip": 1.04342008, "balance_loss_mlp": 1.02279854, "epoch": 0.5839170299113182, "flos": 22229243550720.0, "grad_norm": 1.6457925309868888, "language_loss": 0.78601259, "learning_rate": 1.5571823610270085e-06, "loss": 0.80762661, "num_input_tokens_seen": 209370960, "step": 9712, "time_per_iteration": 2.6130564212799072 }, { "auxiliary_loss_clip": 0.01079932, "auxiliary_loss_mlp": 0.0077171, "balance_loss_clip": 1.03610897, "balance_loss_mlp": 1.00007439, "epoch": 0.5839771531639861, "flos": 22200012858240.0, "grad_norm": 1.6123088749448828, "language_loss": 0.73624194, "learning_rate": 1.5568025737629234e-06, "loss": 0.75475836, "num_input_tokens_seen": 209390955, "step": 9713, "time_per_iteration": 2.752688407897949 }, { "auxiliary_loss_clip": 0.01098855, "auxiliary_loss_mlp": 0.0103448, "balance_loss_clip": 1.03949571, "balance_loss_mlp": 1.02000761, "epoch": 0.5840372764166541, "flos": 22419355259520.0, "grad_norm": 2.057640389539287, "language_loss": 0.69393289, "learning_rate": 1.5564228033049292e-06, "loss": 0.71526623, "num_input_tokens_seen": 209410260, "step": 9714, "time_per_iteration": 2.697676181793213 }, { "auxiliary_loss_clip": 0.01118564, "auxiliary_loss_mlp": 0.01037008, "balance_loss_clip": 1.04040492, "balance_loss_mlp": 1.02368677, "epoch": 0.5840973996693221, "flos": 19828256797440.0, "grad_norm": 1.733937894535342, "language_loss": 0.80418617, "learning_rate": 1.5560430496674268e-06, "loss": 0.82574189, "num_input_tokens_seen": 209429920, "step": 9715, "time_per_iteration": 2.5865848064422607 }, { "auxiliary_loss_clip": 0.01094879, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.03690863, "balance_loss_mlp": 1.02182388, "epoch": 0.5841575229219901, "flos": 21142982401920.0, "grad_norm": 2.4772648960449586, "language_loss": 0.72541732, "learning_rate": 1.5556633128648167e-06, "loss": 0.74672222, "num_input_tokens_seen": 209449470, "step": 9716, "time_per_iteration": 2.760240077972412 }, { "auxiliary_loss_clip": 0.01088946, "auxiliary_loss_mlp": 0.01033627, "balance_loss_clip": 1.03793585, "balance_loss_mlp": 1.02124131, "epoch": 0.5842176461746581, "flos": 24640322025600.0, "grad_norm": 1.7815945401286815, "language_loss": 0.75058079, "learning_rate": 1.5552835929114976e-06, "loss": 0.7718066, "num_input_tokens_seen": 209467695, "step": 9717, "time_per_iteration": 2.7470862865448 }, { "auxiliary_loss_clip": 0.01109202, "auxiliary_loss_mlp": 0.01038785, "balance_loss_clip": 1.04155052, "balance_loss_mlp": 1.02575004, "epoch": 0.584277769427326, "flos": 19131158574720.0, "grad_norm": 3.2108802254609827, "language_loss": 0.79614913, "learning_rate": 1.5549038898218697e-06, "loss": 0.81762898, "num_input_tokens_seen": 209484250, "step": 9718, "time_per_iteration": 2.6843111515045166 }, { "auxiliary_loss_clip": 0.01094695, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.03992128, "balance_loss_mlp": 1.01880288, "epoch": 0.584337892679994, "flos": 22675111073280.0, "grad_norm": 1.6948464280827684, "language_loss": 0.67670137, "learning_rate": 1.5545242036103306e-06, "loss": 0.69797808, "num_input_tokens_seen": 209502830, "step": 9719, "time_per_iteration": 2.658722400665283 }, { "auxiliary_loss_clip": 0.01119777, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.04168653, "balance_loss_mlp": 1.02466464, "epoch": 0.5843980159326619, "flos": 31284083352960.0, "grad_norm": 1.997670996956063, "language_loss": 0.75795102, "learning_rate": 1.5541445342912786e-06, "loss": 0.77952886, "num_input_tokens_seen": 209525995, "step": 9720, "time_per_iteration": 2.6901891231536865 }, { "auxiliary_loss_clip": 0.01082891, "auxiliary_loss_mlp": 0.01039482, "balance_loss_clip": 1.04280281, "balance_loss_mlp": 1.02657783, "epoch": 0.58445813918533, "flos": 22748117466240.0, "grad_norm": 1.7155190503214905, "language_loss": 0.83123529, "learning_rate": 1.5537648818791105e-06, "loss": 0.85245907, "num_input_tokens_seen": 209545895, "step": 9721, "time_per_iteration": 2.71907639503479 }, { "auxiliary_loss_clip": 0.01037273, "auxiliary_loss_mlp": 0.01006637, "balance_loss_clip": 1.01290512, "balance_loss_mlp": 1.00543344, "epoch": 0.5845182624379979, "flos": 60686556658560.0, "grad_norm": 0.9400176499911559, "language_loss": 0.7134223, "learning_rate": 1.5533852463882226e-06, "loss": 0.73386145, "num_input_tokens_seen": 209602315, "step": 9722, "time_per_iteration": 3.1959645748138428 }, { "auxiliary_loss_clip": 0.01099534, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.03890538, "balance_loss_mlp": 1.02751184, "epoch": 0.5845783856906659, "flos": 16362446336640.0, "grad_norm": 1.9834511811038693, "language_loss": 0.89731622, "learning_rate": 1.5530056278330113e-06, "loss": 0.91871929, "num_input_tokens_seen": 209617615, "step": 9723, "time_per_iteration": 2.592627763748169 }, { "auxiliary_loss_clip": 0.01094383, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.04275918, "balance_loss_mlp": 1.02554142, "epoch": 0.5846385089433338, "flos": 20083402080000.0, "grad_norm": 1.398468813522248, "language_loss": 0.68486446, "learning_rate": 1.5526260262278709e-06, "loss": 0.70619082, "num_input_tokens_seen": 209637005, "step": 9724, "time_per_iteration": 2.655640125274658 }, { "auxiliary_loss_clip": 0.01110347, "auxiliary_loss_mlp": 0.01036604, "balance_loss_clip": 1.04291487, "balance_loss_mlp": 1.02341366, "epoch": 0.5846986321960018, "flos": 17311062568320.0, "grad_norm": 1.717409456716096, "language_loss": 0.86049938, "learning_rate": 1.552246441587197e-06, "loss": 0.88196886, "num_input_tokens_seen": 209653170, "step": 9725, "time_per_iteration": 2.6035261154174805 }, { "auxiliary_loss_clip": 0.01095255, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.04249406, "balance_loss_mlp": 1.02926588, "epoch": 0.5847587554486697, "flos": 17197907748480.0, "grad_norm": 1.6193535846243259, "language_loss": 0.82923484, "learning_rate": 1.5518668739253821e-06, "loss": 0.85060942, "num_input_tokens_seen": 209671275, "step": 9726, "time_per_iteration": 2.655017137527466 }, { "auxiliary_loss_clip": 0.01055108, "auxiliary_loss_mlp": 0.00770936, "balance_loss_clip": 1.03983736, "balance_loss_mlp": 1.00008965, "epoch": 0.5848188787013378, "flos": 24529106540160.0, "grad_norm": 1.736262693329601, "language_loss": 0.66609311, "learning_rate": 1.5514873232568206e-06, "loss": 0.68435353, "num_input_tokens_seen": 209690380, "step": 9727, "time_per_iteration": 2.820906639099121 }, { "auxiliary_loss_clip": 0.01083507, "auxiliary_loss_mlp": 0.01045274, "balance_loss_clip": 1.03799105, "balance_loss_mlp": 1.03056347, "epoch": 0.5848790019540057, "flos": 20628382204800.0, "grad_norm": 1.7999573427153348, "language_loss": 0.81628853, "learning_rate": 1.5511077895959055e-06, "loss": 0.83757633, "num_input_tokens_seen": 209708845, "step": 9728, "time_per_iteration": 2.7597923278808594 }, { "auxiliary_loss_clip": 0.01103874, "auxiliary_loss_mlp": 0.01042076, "balance_loss_clip": 1.03965843, "balance_loss_mlp": 1.0296309, "epoch": 0.5849391252066737, "flos": 22418852469120.0, "grad_norm": 2.078641796720901, "language_loss": 0.77696002, "learning_rate": 1.550728272957027e-06, "loss": 0.79841954, "num_input_tokens_seen": 209729000, "step": 9729, "time_per_iteration": 2.663864850997925 }, { "auxiliary_loss_clip": 0.01102359, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.03954148, "balance_loss_mlp": 1.022475, "epoch": 0.5849992484593417, "flos": 25410929431680.0, "grad_norm": 1.8450519403802392, "language_loss": 0.70192915, "learning_rate": 1.5503487733545782e-06, "loss": 0.72332394, "num_input_tokens_seen": 209747435, "step": 9730, "time_per_iteration": 2.6668407917022705 }, { "auxiliary_loss_clip": 0.01124849, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.04504502, "balance_loss_mlp": 1.02224803, "epoch": 0.5850593717120096, "flos": 21065163586560.0, "grad_norm": 1.6923527463370078, "language_loss": 0.78973091, "learning_rate": 1.5499692908029482e-06, "loss": 0.81134546, "num_input_tokens_seen": 209764910, "step": 9731, "time_per_iteration": 2.6093108654022217 }, { "auxiliary_loss_clip": 0.01103256, "auxiliary_loss_mlp": 0.01046113, "balance_loss_clip": 1.04004776, "balance_loss_mlp": 1.03114593, "epoch": 0.5851194949646776, "flos": 25301545539840.0, "grad_norm": 2.322897025480009, "language_loss": 0.70276213, "learning_rate": 1.549589825316528e-06, "loss": 0.7242558, "num_input_tokens_seen": 209786115, "step": 9732, "time_per_iteration": 2.6483914852142334 }, { "auxiliary_loss_clip": 0.01068434, "auxiliary_loss_mlp": 0.01041994, "balance_loss_clip": 1.03862739, "balance_loss_mlp": 1.02584136, "epoch": 0.5851796182173455, "flos": 23587242065280.0, "grad_norm": 1.8361177860467572, "language_loss": 0.53096974, "learning_rate": 1.5492103769097075e-06, "loss": 0.55207402, "num_input_tokens_seen": 209806095, "step": 9733, "time_per_iteration": 2.7837493419647217 }, { "auxiliary_loss_clip": 0.0110623, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.04327631, "balance_loss_mlp": 1.023206, "epoch": 0.5852397414700136, "flos": 24822712310400.0, "grad_norm": 2.1555850580582945, "language_loss": 0.87172639, "learning_rate": 1.5488309455968739e-06, "loss": 0.89316678, "num_input_tokens_seen": 209823650, "step": 9734, "time_per_iteration": 2.647822618484497 }, { "auxiliary_loss_clip": 0.0109023, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.03915906, "balance_loss_mlp": 1.02305174, "epoch": 0.5852998647226815, "flos": 19937784343680.0, "grad_norm": 1.6523754491187739, "language_loss": 0.72117126, "learning_rate": 1.5484515313924163e-06, "loss": 0.74242795, "num_input_tokens_seen": 209843220, "step": 9735, "time_per_iteration": 2.6707499027252197 }, { "auxiliary_loss_clip": 0.01111823, "auxiliary_loss_mlp": 0.01038537, "balance_loss_clip": 1.04385519, "balance_loss_mlp": 1.02448797, "epoch": 0.5853599879753495, "flos": 16720367408640.0, "grad_norm": 5.660280505854459, "language_loss": 0.74303764, "learning_rate": 1.5480721343107217e-06, "loss": 0.76454127, "num_input_tokens_seen": 209854880, "step": 9736, "time_per_iteration": 2.6474769115448 }, { "auxiliary_loss_clip": 0.01084732, "auxiliary_loss_mlp": 0.01038896, "balance_loss_clip": 1.03950977, "balance_loss_mlp": 1.0241437, "epoch": 0.5854201112280174, "flos": 44456583680640.0, "grad_norm": 1.705724680337342, "language_loss": 0.7066859, "learning_rate": 1.5476927543661772e-06, "loss": 0.72792208, "num_input_tokens_seen": 209877870, "step": 9737, "time_per_iteration": 2.8703529834747314 }, { "auxiliary_loss_clip": 0.01079098, "auxiliary_loss_mlp": 0.01042352, "balance_loss_clip": 1.03875983, "balance_loss_mlp": 1.02830887, "epoch": 0.5854802344806854, "flos": 20339193807360.0, "grad_norm": 1.7465210824086157, "language_loss": 0.82571793, "learning_rate": 1.547313391573169e-06, "loss": 0.84693247, "num_input_tokens_seen": 209896690, "step": 9738, "time_per_iteration": 2.6930525302886963 }, { "auxiliary_loss_clip": 0.01123353, "auxiliary_loss_mlp": 0.00771973, "balance_loss_clip": 1.04294574, "balance_loss_mlp": 1.00014758, "epoch": 0.5855403577333533, "flos": 20921054221440.0, "grad_norm": 1.6403149295747592, "language_loss": 0.68084544, "learning_rate": 1.546934045946082e-06, "loss": 0.6997987, "num_input_tokens_seen": 209914640, "step": 9739, "time_per_iteration": 2.6120223999023438 }, { "auxiliary_loss_clip": 0.01122823, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.04343581, "balance_loss_mlp": 1.01383555, "epoch": 0.5856004809860214, "flos": 20448649526400.0, "grad_norm": 2.346965983276941, "language_loss": 0.5878849, "learning_rate": 1.5465547174993017e-06, "loss": 0.60939384, "num_input_tokens_seen": 209933375, "step": 9740, "time_per_iteration": 2.6393442153930664 }, { "auxiliary_loss_clip": 0.01091861, "auxiliary_loss_mlp": 0.01034284, "balance_loss_clip": 1.03964174, "balance_loss_mlp": 1.01996112, "epoch": 0.5856606042386893, "flos": 19640766781440.0, "grad_norm": 1.8171598434150709, "language_loss": 0.75508714, "learning_rate": 1.5461754062472113e-06, "loss": 0.77634859, "num_input_tokens_seen": 209952055, "step": 9741, "time_per_iteration": 2.6550915241241455 }, { "auxiliary_loss_clip": 0.01085436, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.03900838, "balance_loss_mlp": 1.02109587, "epoch": 0.5857207274913573, "flos": 21686166846720.0, "grad_norm": 1.6487285096737663, "language_loss": 0.75935274, "learning_rate": 1.5457961122041959e-06, "loss": 0.78055418, "num_input_tokens_seen": 209971190, "step": 9742, "time_per_iteration": 4.381955146789551 }, { "auxiliary_loss_clip": 0.01098042, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.04340363, "balance_loss_mlp": 1.0209775, "epoch": 0.5857808507440253, "flos": 23182708118400.0, "grad_norm": 1.6035533638401356, "language_loss": 0.74864548, "learning_rate": 1.5454168353846369e-06, "loss": 0.76996386, "num_input_tokens_seen": 209990695, "step": 9743, "time_per_iteration": 5.72803258895874 }, { "auxiliary_loss_clip": 0.01098389, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.04424453, "balance_loss_mlp": 1.01949835, "epoch": 0.5858409739966932, "flos": 27235299156480.0, "grad_norm": 1.98808093933083, "language_loss": 0.81046313, "learning_rate": 1.5450375758029172e-06, "loss": 0.83176875, "num_input_tokens_seen": 210010210, "step": 9744, "time_per_iteration": 2.7265267372131348 }, { "auxiliary_loss_clip": 0.01094798, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.04087067, "balance_loss_mlp": 1.01669562, "epoch": 0.5859010972493612, "flos": 27855512317440.0, "grad_norm": 1.7065591540492446, "language_loss": 0.71426034, "learning_rate": 1.5446583334734183e-06, "loss": 0.73551434, "num_input_tokens_seen": 210030030, "step": 9745, "time_per_iteration": 2.737842082977295 }, { "auxiliary_loss_clip": 0.01023206, "auxiliary_loss_mlp": 0.01004158, "balance_loss_clip": 1.01973987, "balance_loss_mlp": 1.00301957, "epoch": 0.5859612205020291, "flos": 70007064428160.0, "grad_norm": 0.7272764484566879, "language_loss": 0.53267932, "learning_rate": 1.5442791084105204e-06, "loss": 0.552953, "num_input_tokens_seen": 210094840, "step": 9746, "time_per_iteration": 3.3027215003967285 }, { "auxiliary_loss_clip": 0.01094571, "auxiliary_loss_mlp": 0.01035687, "balance_loss_clip": 1.04237437, "balance_loss_mlp": 1.02163196, "epoch": 0.5860213437546972, "flos": 24056019486720.0, "grad_norm": 2.0261235602549466, "language_loss": 0.73138428, "learning_rate": 1.5438999006286054e-06, "loss": 0.75268686, "num_input_tokens_seen": 210114660, "step": 9747, "time_per_iteration": 4.224852085113525 }, { "auxiliary_loss_clip": 0.01092652, "auxiliary_loss_mlp": 0.01046673, "balance_loss_clip": 1.03909874, "balance_loss_mlp": 1.03123569, "epoch": 0.5860814670073651, "flos": 18947583141120.0, "grad_norm": 1.867050340664373, "language_loss": 0.81183696, "learning_rate": 1.543520710142051e-06, "loss": 0.83323026, "num_input_tokens_seen": 210132770, "step": 9748, "time_per_iteration": 2.6568126678466797 }, { "auxiliary_loss_clip": 0.01111974, "auxiliary_loss_mlp": 0.01038317, "balance_loss_clip": 1.04387689, "balance_loss_mlp": 1.0241785, "epoch": 0.5861415902600331, "flos": 22561848512640.0, "grad_norm": 1.7272716772059427, "language_loss": 0.72221619, "learning_rate": 1.5431415369652375e-06, "loss": 0.7437191, "num_input_tokens_seen": 210151895, "step": 9749, "time_per_iteration": 2.6895384788513184 }, { "auxiliary_loss_clip": 0.01101508, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.04664361, "balance_loss_mlp": 1.02205098, "epoch": 0.586201713512701, "flos": 14392027912320.0, "grad_norm": 2.592210537631562, "language_loss": 0.75040287, "learning_rate": 1.5427623811125428e-06, "loss": 0.77177632, "num_input_tokens_seen": 210168040, "step": 9750, "time_per_iteration": 2.737083911895752 }, { "auxiliary_loss_clip": 0.0108729, "auxiliary_loss_mlp": 0.01036704, "balance_loss_clip": 1.04378581, "balance_loss_mlp": 1.02202928, "epoch": 0.586261836765369, "flos": 19498560837120.0, "grad_norm": 1.8612157402372733, "language_loss": 0.70927167, "learning_rate": 1.542383242598344e-06, "loss": 0.73051161, "num_input_tokens_seen": 210187720, "step": 9751, "time_per_iteration": 2.7111241817474365 }, { "auxiliary_loss_clip": 0.01125805, "auxiliary_loss_mlp": 0.01043313, "balance_loss_clip": 1.04531717, "balance_loss_mlp": 1.02769637, "epoch": 0.5863219600180369, "flos": 20701819560960.0, "grad_norm": 1.7129799601344229, "language_loss": 0.74548101, "learning_rate": 1.5420041214370184e-06, "loss": 0.76717222, "num_input_tokens_seen": 210206080, "step": 9752, "time_per_iteration": 2.626716136932373 }, { "auxiliary_loss_clip": 0.01108046, "auxiliary_loss_mlp": 0.01031989, "balance_loss_clip": 1.04339004, "balance_loss_mlp": 1.01842308, "epoch": 0.586382083270705, "flos": 19792130693760.0, "grad_norm": 1.767262069370236, "language_loss": 0.77331054, "learning_rate": 1.541625017642943e-06, "loss": 0.79471087, "num_input_tokens_seen": 210225660, "step": 9753, "time_per_iteration": 2.6093239784240723 }, { "auxiliary_loss_clip": 0.01116295, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.04288065, "balance_loss_mlp": 1.01651943, "epoch": 0.5864422065233729, "flos": 16500558130560.0, "grad_norm": 1.6790243104766265, "language_loss": 0.70988512, "learning_rate": 1.5412459312304927e-06, "loss": 0.73133945, "num_input_tokens_seen": 210242725, "step": 9754, "time_per_iteration": 2.5604028701782227 }, { "auxiliary_loss_clip": 0.01095441, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.0401392, "balance_loss_mlp": 1.0194732, "epoch": 0.5865023297760409, "flos": 20413277608320.0, "grad_norm": 2.0857561604768065, "language_loss": 0.72379315, "learning_rate": 1.540866862214043e-06, "loss": 0.7450884, "num_input_tokens_seen": 210263225, "step": 9755, "time_per_iteration": 2.656785011291504 }, { "auxiliary_loss_clip": 0.01012678, "auxiliary_loss_mlp": 0.01004177, "balance_loss_clip": 1.01731849, "balance_loss_mlp": 1.00294328, "epoch": 0.5865624530287089, "flos": 63350769254400.0, "grad_norm": 0.7450356800362308, "language_loss": 0.56920898, "learning_rate": 1.540487810607967e-06, "loss": 0.58937752, "num_input_tokens_seen": 210322310, "step": 9756, "time_per_iteration": 3.2905054092407227 }, { "auxiliary_loss_clip": 0.01115752, "auxiliary_loss_mlp": 0.01031709, "balance_loss_clip": 1.04039788, "balance_loss_mlp": 1.01922202, "epoch": 0.5866225762813768, "flos": 27016279977600.0, "grad_norm": 11.015446509800649, "language_loss": 0.76104087, "learning_rate": 1.5401087764266396e-06, "loss": 0.78251553, "num_input_tokens_seen": 210340845, "step": 9757, "time_per_iteration": 2.6325418949127197 }, { "auxiliary_loss_clip": 0.01021435, "auxiliary_loss_mlp": 0.01009977, "balance_loss_clip": 1.01624918, "balance_loss_mlp": 1.00884426, "epoch": 0.5866826995340448, "flos": 72987038507520.0, "grad_norm": 0.8546616305193999, "language_loss": 0.60420328, "learning_rate": 1.5397297596844337e-06, "loss": 0.62451738, "num_input_tokens_seen": 210397815, "step": 9758, "time_per_iteration": 3.227780342102051 }, { "auxiliary_loss_clip": 0.0112535, "auxiliary_loss_mlp": 0.01036264, "balance_loss_clip": 1.0447619, "balance_loss_mlp": 1.02245307, "epoch": 0.5867428227867127, "flos": 21285727050240.0, "grad_norm": 2.191365428773927, "language_loss": 0.71787071, "learning_rate": 1.5393507603957212e-06, "loss": 0.73948681, "num_input_tokens_seen": 210413900, "step": 9759, "time_per_iteration": 2.593574047088623 }, { "auxiliary_loss_clip": 0.01096792, "auxiliary_loss_mlp": 0.0103787, "balance_loss_clip": 1.04106188, "balance_loss_mlp": 1.02525759, "epoch": 0.5868029460393808, "flos": 33468852188160.0, "grad_norm": 1.6194048366561686, "language_loss": 0.72730052, "learning_rate": 1.5389717785748742e-06, "loss": 0.74864709, "num_input_tokens_seen": 210434110, "step": 9760, "time_per_iteration": 2.7872965335845947 }, { "auxiliary_loss_clip": 0.01107006, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.04269731, "balance_loss_mlp": 1.01910627, "epoch": 0.5868630692920487, "flos": 17889475276800.0, "grad_norm": 1.9662195987833622, "language_loss": 0.72611898, "learning_rate": 1.5385928142362637e-06, "loss": 0.74751425, "num_input_tokens_seen": 210451685, "step": 9761, "time_per_iteration": 2.701533317565918 }, { "auxiliary_loss_clip": 0.01106159, "auxiliary_loss_mlp": 0.01036709, "balance_loss_clip": 1.04491735, "balance_loss_mlp": 1.02211809, "epoch": 0.5869231925447167, "flos": 21035035054080.0, "grad_norm": 1.7395731063260564, "language_loss": 0.75217378, "learning_rate": 1.5382138673942597e-06, "loss": 0.77360249, "num_input_tokens_seen": 210470825, "step": 9762, "time_per_iteration": 2.721714496612549 }, { "auxiliary_loss_clip": 0.01082216, "auxiliary_loss_mlp": 0.01036155, "balance_loss_clip": 1.03985929, "balance_loss_mlp": 1.02164149, "epoch": 0.5869833157973846, "flos": 74738219293440.0, "grad_norm": 4.660992958273475, "language_loss": 0.72322762, "learning_rate": 1.5378349380632317e-06, "loss": 0.74441129, "num_input_tokens_seen": 210500075, "step": 9763, "time_per_iteration": 3.1116628646850586 }, { "auxiliary_loss_clip": 0.01101878, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.03773355, "balance_loss_mlp": 1.02203679, "epoch": 0.5870434390500526, "flos": 17638998762240.0, "grad_norm": 1.815727939349207, "language_loss": 0.80352604, "learning_rate": 1.53745602625755e-06, "loss": 0.82489097, "num_input_tokens_seen": 210518150, "step": 9764, "time_per_iteration": 2.682579278945923 }, { "auxiliary_loss_clip": 0.01091583, "auxiliary_loss_mlp": 0.01034941, "balance_loss_clip": 1.04217017, "balance_loss_mlp": 1.02132726, "epoch": 0.5871035623027205, "flos": 21506146859520.0, "grad_norm": 1.83004906571999, "language_loss": 0.79265928, "learning_rate": 1.5370771319915819e-06, "loss": 0.81392443, "num_input_tokens_seen": 210537760, "step": 9765, "time_per_iteration": 2.6972546577453613 }, { "auxiliary_loss_clip": 0.01088979, "auxiliary_loss_mlp": 0.01039927, "balance_loss_clip": 1.04256606, "balance_loss_mlp": 1.02595556, "epoch": 0.5871636855553886, "flos": 13551861818880.0, "grad_norm": 1.76294195099967, "language_loss": 0.83693898, "learning_rate": 1.5366982552796947e-06, "loss": 0.85822797, "num_input_tokens_seen": 210555515, "step": 9766, "time_per_iteration": 2.7466630935668945 }, { "auxiliary_loss_clip": 0.01111118, "auxiliary_loss_mlp": 0.01037087, "balance_loss_clip": 1.04195547, "balance_loss_mlp": 1.02393794, "epoch": 0.5872238088080565, "flos": 26212922346240.0, "grad_norm": 1.5937380342892973, "language_loss": 0.6981988, "learning_rate": 1.536319396136257e-06, "loss": 0.71968091, "num_input_tokens_seen": 210575000, "step": 9767, "time_per_iteration": 2.6740965843200684 }, { "auxiliary_loss_clip": 0.0110439, "auxiliary_loss_mlp": 0.0077267, "balance_loss_clip": 1.04049277, "balance_loss_mlp": 1.00008368, "epoch": 0.5872839320607245, "flos": 30665198995200.0, "grad_norm": 2.1136221747138095, "language_loss": 0.6360091, "learning_rate": 1.5359405545756336e-06, "loss": 0.65477967, "num_input_tokens_seen": 210595185, "step": 9768, "time_per_iteration": 2.7575178146362305 }, { "auxiliary_loss_clip": 0.01037412, "auxiliary_loss_mlp": 0.00751529, "balance_loss_clip": 1.01318574, "balance_loss_mlp": 0.99987358, "epoch": 0.5873440553133924, "flos": 60303570871680.0, "grad_norm": 0.7223687744232398, "language_loss": 0.53866827, "learning_rate": 1.5355617306121914e-06, "loss": 0.55655766, "num_input_tokens_seen": 210653210, "step": 9769, "time_per_iteration": 3.1609816551208496 }, { "auxiliary_loss_clip": 0.01084812, "auxiliary_loss_mlp": 0.01042021, "balance_loss_clip": 1.03922772, "balance_loss_mlp": 1.02880073, "epoch": 0.5874041785660604, "flos": 21539292134400.0, "grad_norm": 1.4066762666706196, "language_loss": 0.70984697, "learning_rate": 1.5351829242602945e-06, "loss": 0.73111528, "num_input_tokens_seen": 210673750, "step": 9770, "time_per_iteration": 2.7312963008880615 }, { "auxiliary_loss_clip": 0.01073411, "auxiliary_loss_mlp": 0.01035898, "balance_loss_clip": 1.0386194, "balance_loss_mlp": 1.02226591, "epoch": 0.5874643018187284, "flos": 24388947671040.0, "grad_norm": 1.7359405395861034, "language_loss": 0.681171, "learning_rate": 1.5348041355343077e-06, "loss": 0.70226407, "num_input_tokens_seen": 210692960, "step": 9771, "time_per_iteration": 2.7748193740844727 }, { "auxiliary_loss_clip": 0.01072231, "auxiliary_loss_mlp": 0.01041976, "balance_loss_clip": 1.03671551, "balance_loss_mlp": 1.02564466, "epoch": 0.5875244250713964, "flos": 28147717457280.0, "grad_norm": 1.5217173137024316, "language_loss": 0.661672, "learning_rate": 1.5344253644485954e-06, "loss": 0.68281412, "num_input_tokens_seen": 210714040, "step": 9772, "time_per_iteration": 2.841942071914673 }, { "auxiliary_loss_clip": 0.01124952, "auxiliary_loss_mlp": 0.01044932, "balance_loss_clip": 1.045434, "balance_loss_mlp": 1.03047216, "epoch": 0.5875845483240644, "flos": 25812410722560.0, "grad_norm": 1.4922365157265927, "language_loss": 0.74535245, "learning_rate": 1.534046611017519e-06, "loss": 0.76705128, "num_input_tokens_seen": 210733710, "step": 9773, "time_per_iteration": 2.6284871101379395 }, { "auxiliary_loss_clip": 0.01087977, "auxiliary_loss_mlp": 0.0104147, "balance_loss_clip": 1.04292727, "balance_loss_mlp": 1.02706945, "epoch": 0.5876446715767323, "flos": 26906572863360.0, "grad_norm": 1.947316209295704, "language_loss": 0.52915788, "learning_rate": 1.5336678752554421e-06, "loss": 0.55045235, "num_input_tokens_seen": 210753580, "step": 9774, "time_per_iteration": 2.7891509532928467 }, { "auxiliary_loss_clip": 0.01113387, "auxiliary_loss_mlp": 0.01039669, "balance_loss_clip": 1.04437912, "balance_loss_mlp": 1.02526808, "epoch": 0.5877047948294003, "flos": 36684832579200.0, "grad_norm": 2.3607783176851824, "language_loss": 0.64713901, "learning_rate": 1.5332891571767264e-06, "loss": 0.66866958, "num_input_tokens_seen": 210773495, "step": 9775, "time_per_iteration": 2.771148920059204 }, { "auxiliary_loss_clip": 0.01105141, "auxiliary_loss_mlp": 0.01036995, "balance_loss_clip": 1.04033184, "balance_loss_mlp": 1.02344131, "epoch": 0.5877649180820682, "flos": 26724721282560.0, "grad_norm": 1.636403069820384, "language_loss": 0.73844278, "learning_rate": 1.5329104567957326e-06, "loss": 0.75986409, "num_input_tokens_seen": 210793645, "step": 9776, "time_per_iteration": 2.690695285797119 }, { "auxiliary_loss_clip": 0.01119488, "auxiliary_loss_mlp": 0.01039689, "balance_loss_clip": 1.0420121, "balance_loss_mlp": 1.0264504, "epoch": 0.5878250413347362, "flos": 21032197879680.0, "grad_norm": 1.5421458331894318, "language_loss": 0.73914766, "learning_rate": 1.532531774126821e-06, "loss": 0.76073945, "num_input_tokens_seen": 210813415, "step": 9777, "time_per_iteration": 2.6284945011138916 }, { "auxiliary_loss_clip": 0.01083567, "auxiliary_loss_mlp": 0.01038914, "balance_loss_clip": 1.04067087, "balance_loss_mlp": 1.02573574, "epoch": 0.5878851645874041, "flos": 25484259047040.0, "grad_norm": 1.8412101918270336, "language_loss": 0.74325955, "learning_rate": 1.5321531091843512e-06, "loss": 0.76448435, "num_input_tokens_seen": 210833850, "step": 9778, "time_per_iteration": 2.7255308628082275 }, { "auxiliary_loss_clip": 0.01072977, "auxiliary_loss_mlp": 0.01040231, "balance_loss_clip": 1.03567362, "balance_loss_mlp": 1.0246737, "epoch": 0.5879452878400722, "flos": 23769129559680.0, "grad_norm": 1.8337946976743424, "language_loss": 0.70162809, "learning_rate": 1.5317744619826824e-06, "loss": 0.72276014, "num_input_tokens_seen": 210853115, "step": 9779, "time_per_iteration": 2.715529680252075 }, { "auxiliary_loss_clip": 0.01121839, "auxiliary_loss_mlp": 0.00771635, "balance_loss_clip": 1.04201186, "balance_loss_mlp": 1.00009024, "epoch": 0.5880054110927401, "flos": 17824513530240.0, "grad_norm": 2.202026224542238, "language_loss": 0.66388619, "learning_rate": 1.5313958325361727e-06, "loss": 0.68282098, "num_input_tokens_seen": 210872090, "step": 9780, "time_per_iteration": 2.628286361694336 }, { "auxiliary_loss_clip": 0.01091434, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.04466867, "balance_loss_mlp": 1.02406991, "epoch": 0.5880655343454081, "flos": 19463404400640.0, "grad_norm": 1.8753551233884636, "language_loss": 0.72474289, "learning_rate": 1.5310172208591807e-06, "loss": 0.74604088, "num_input_tokens_seen": 210888490, "step": 9781, "time_per_iteration": 4.2804930210113525 }, { "auxiliary_loss_clip": 0.01092565, "auxiliary_loss_mlp": 0.00771373, "balance_loss_clip": 1.04225159, "balance_loss_mlp": 1.00005984, "epoch": 0.588125657598076, "flos": 21397588980480.0, "grad_norm": 1.5003005055277707, "language_loss": 0.70744377, "learning_rate": 1.5306386269660622e-06, "loss": 0.72608316, "num_input_tokens_seen": 210908220, "step": 9782, "time_per_iteration": 4.278367519378662 }, { "auxiliary_loss_clip": 0.01105689, "auxiliary_loss_mlp": 0.01041859, "balance_loss_clip": 1.03929675, "balance_loss_mlp": 1.02716005, "epoch": 0.588185780850744, "flos": 16034653797120.0, "grad_norm": 2.093864455539888, "language_loss": 0.70450729, "learning_rate": 1.5302600508711741e-06, "loss": 0.72598279, "num_input_tokens_seen": 210923945, "step": 9783, "time_per_iteration": 4.194809436798096 }, { "auxiliary_loss_clip": 0.01085302, "auxiliary_loss_mlp": 0.01036158, "balance_loss_clip": 1.04440248, "balance_loss_mlp": 1.02117932, "epoch": 0.588245904103412, "flos": 23728226947200.0, "grad_norm": 2.1947417455944653, "language_loss": 0.69071788, "learning_rate": 1.5298814925888719e-06, "loss": 0.71193242, "num_input_tokens_seen": 210941955, "step": 9784, "time_per_iteration": 2.7187066078186035 }, { "auxiliary_loss_clip": 0.01072816, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.03863633, "balance_loss_mlp": 1.02094078, "epoch": 0.58830602735608, "flos": 33802534558080.0, "grad_norm": 24.973572945721454, "language_loss": 0.69460654, "learning_rate": 1.5295029521335102e-06, "loss": 0.71568, "num_input_tokens_seen": 210963105, "step": 9785, "time_per_iteration": 2.878143548965454 }, { "auxiliary_loss_clip": 0.01107899, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.04268789, "balance_loss_mlp": 1.01706553, "epoch": 0.588366150608748, "flos": 17090714586240.0, "grad_norm": 1.9508012380203874, "language_loss": 0.77078086, "learning_rate": 1.5291244295194448e-06, "loss": 0.79215527, "num_input_tokens_seen": 210978720, "step": 9786, "time_per_iteration": 2.6095898151397705 }, { "auxiliary_loss_clip": 0.01101968, "auxiliary_loss_mlp": 0.01029534, "balance_loss_clip": 1.04132032, "balance_loss_mlp": 1.01609302, "epoch": 0.5884262738614159, "flos": 22127186033280.0, "grad_norm": 1.4529797212559594, "language_loss": 0.79197991, "learning_rate": 1.5287459247610276e-06, "loss": 0.81329501, "num_input_tokens_seen": 210998750, "step": 9787, "time_per_iteration": 4.223788261413574 }, { "auxiliary_loss_clip": 0.01081001, "auxiliary_loss_mlp": 0.01036004, "balance_loss_clip": 1.04142892, "balance_loss_mlp": 1.02382052, "epoch": 0.5884863971140839, "flos": 21031838743680.0, "grad_norm": 2.5032495709629186, "language_loss": 0.6604932, "learning_rate": 1.5283674378726116e-06, "loss": 0.68166327, "num_input_tokens_seen": 211017550, "step": 9788, "time_per_iteration": 2.770289659500122 }, { "auxiliary_loss_clip": 0.01089935, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.04031539, "balance_loss_mlp": 1.02356613, "epoch": 0.5885465203667518, "flos": 23805112008960.0, "grad_norm": 2.4491161231159495, "language_loss": 0.80353689, "learning_rate": 1.5279889688685506e-06, "loss": 0.82482433, "num_input_tokens_seen": 211034135, "step": 9789, "time_per_iteration": 2.7129344940185547 }, { "auxiliary_loss_clip": 0.01088956, "auxiliary_loss_mlp": 0.00771498, "balance_loss_clip": 1.04013371, "balance_loss_mlp": 0.99999416, "epoch": 0.5886066436194198, "flos": 18880574319360.0, "grad_norm": 1.8752240370073765, "language_loss": 0.7074194, "learning_rate": 1.5276105177631944e-06, "loss": 0.72602391, "num_input_tokens_seen": 211053850, "step": 9790, "time_per_iteration": 2.7234628200531006 }, { "auxiliary_loss_clip": 0.01082257, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.04143536, "balance_loss_mlp": 1.02096915, "epoch": 0.5886667668720877, "flos": 24790141653120.0, "grad_norm": 1.7147674530197825, "language_loss": 0.83315635, "learning_rate": 1.527232084570895e-06, "loss": 0.85432208, "num_input_tokens_seen": 211072165, "step": 9791, "time_per_iteration": 2.711566686630249 }, { "auxiliary_loss_clip": 0.0110606, "auxiliary_loss_mlp": 0.01044469, "balance_loss_clip": 1.04232645, "balance_loss_mlp": 1.0296278, "epoch": 0.5887268901247558, "flos": 21614381516160.0, "grad_norm": 1.5737373299770356, "language_loss": 0.7653091, "learning_rate": 1.5268536693060026e-06, "loss": 0.78681433, "num_input_tokens_seen": 211089630, "step": 9792, "time_per_iteration": 2.634300947189331 }, { "auxiliary_loss_clip": 0.0105802, "auxiliary_loss_mlp": 0.01047083, "balance_loss_clip": 1.03111851, "balance_loss_mlp": 1.03123975, "epoch": 0.5887870133774237, "flos": 20481722974080.0, "grad_norm": 2.6665803472381935, "language_loss": 0.68956935, "learning_rate": 1.5264752719828662e-06, "loss": 0.7106204, "num_input_tokens_seen": 211106120, "step": 9793, "time_per_iteration": 2.7154650688171387 }, { "auxiliary_loss_clip": 0.01116924, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.04252207, "balance_loss_mlp": 1.01923692, "epoch": 0.5888471366300917, "flos": 19206283870080.0, "grad_norm": 1.9062241907170245, "language_loss": 0.60218275, "learning_rate": 1.5260968926158353e-06, "loss": 0.62368208, "num_input_tokens_seen": 211122450, "step": 9794, "time_per_iteration": 2.584721088409424 }, { "auxiliary_loss_clip": 0.01087928, "auxiliary_loss_mlp": 0.01038963, "balance_loss_clip": 1.04045248, "balance_loss_mlp": 1.0248251, "epoch": 0.5889072598827596, "flos": 19972904866560.0, "grad_norm": 1.5367259931320274, "language_loss": 0.65087652, "learning_rate": 1.525718531219257e-06, "loss": 0.67214543, "num_input_tokens_seen": 211141765, "step": 9795, "time_per_iteration": 2.6578221321105957 }, { "auxiliary_loss_clip": 0.01080946, "auxiliary_loss_mlp": 0.01041808, "balance_loss_clip": 1.03947282, "balance_loss_mlp": 1.02942848, "epoch": 0.5889673831354276, "flos": 20741249715840.0, "grad_norm": 1.5439612087123358, "language_loss": 0.74185097, "learning_rate": 1.5253401878074801e-06, "loss": 0.76307845, "num_input_tokens_seen": 211160475, "step": 9796, "time_per_iteration": 2.7106168270111084 }, { "auxiliary_loss_clip": 0.01094109, "auxiliary_loss_mlp": 0.01035258, "balance_loss_clip": 1.03922713, "balance_loss_mlp": 1.02194858, "epoch": 0.5890275063880956, "flos": 25300935008640.0, "grad_norm": 1.398085740010997, "language_loss": 0.82796204, "learning_rate": 1.5249618623948507e-06, "loss": 0.84925568, "num_input_tokens_seen": 211180480, "step": 9797, "time_per_iteration": 2.7226924896240234 }, { "auxiliary_loss_clip": 0.01089451, "auxiliary_loss_mlp": 0.01032137, "balance_loss_clip": 1.03643203, "balance_loss_mlp": 1.01857686, "epoch": 0.5890876296407636, "flos": 11765377964160.0, "grad_norm": 2.441249596431382, "language_loss": 0.792216, "learning_rate": 1.5245835549957152e-06, "loss": 0.81343186, "num_input_tokens_seen": 211198000, "step": 9798, "time_per_iteration": 2.661177396774292 }, { "auxiliary_loss_clip": 0.01116784, "auxiliary_loss_mlp": 0.01033567, "balance_loss_clip": 1.04251814, "balance_loss_mlp": 1.02085924, "epoch": 0.5891477528934316, "flos": 13589460380160.0, "grad_norm": 4.031600606780585, "language_loss": 0.74594498, "learning_rate": 1.5242052656244186e-06, "loss": 0.76744843, "num_input_tokens_seen": 211214765, "step": 9799, "time_per_iteration": 2.597598075866699 }, { "auxiliary_loss_clip": 0.0108372, "auxiliary_loss_mlp": 0.01033117, "balance_loss_clip": 1.03822446, "balance_loss_mlp": 1.01848447, "epoch": 0.5892078761460995, "flos": 15049193189760.0, "grad_norm": 1.9844034954522878, "language_loss": 0.7639305, "learning_rate": 1.5238269942953064e-06, "loss": 0.78509891, "num_input_tokens_seen": 211232335, "step": 9800, "time_per_iteration": 2.6959407329559326 }, { "auxiliary_loss_clip": 0.01068975, "auxiliary_loss_mlp": 0.01043567, "balance_loss_clip": 1.03649104, "balance_loss_mlp": 1.02863002, "epoch": 0.5892679993987675, "flos": 15778215624960.0, "grad_norm": 2.091540130493628, "language_loss": 0.78984964, "learning_rate": 1.523448741022722e-06, "loss": 0.81097507, "num_input_tokens_seen": 211249985, "step": 9801, "time_per_iteration": 2.7329885959625244 }, { "auxiliary_loss_clip": 0.01084752, "auxiliary_loss_mlp": 0.01033442, "balance_loss_clip": 1.04138374, "balance_loss_mlp": 1.01958394, "epoch": 0.5893281226514354, "flos": 25265203954560.0, "grad_norm": 1.6724920210450809, "language_loss": 0.66076094, "learning_rate": 1.5230705058210088e-06, "loss": 0.68194282, "num_input_tokens_seen": 211268425, "step": 9802, "time_per_iteration": 2.9191880226135254 }, { "auxiliary_loss_clip": 0.01106682, "auxiliary_loss_mlp": 0.01030935, "balance_loss_clip": 1.04172468, "balance_loss_mlp": 1.01782823, "epoch": 0.5893882459041034, "flos": 19458232842240.0, "grad_norm": 1.576394450599596, "language_loss": 0.78281248, "learning_rate": 1.5226922887045108e-06, "loss": 0.80418861, "num_input_tokens_seen": 211286680, "step": 9803, "time_per_iteration": 2.6395671367645264 }, { "auxiliary_loss_clip": 0.01110111, "auxiliary_loss_mlp": 0.01036458, "balance_loss_clip": 1.04354095, "balance_loss_mlp": 1.0227071, "epoch": 0.5894483691567713, "flos": 20634056553600.0, "grad_norm": 1.421228889325947, "language_loss": 0.73083454, "learning_rate": 1.5223140896875686e-06, "loss": 0.75230026, "num_input_tokens_seen": 211307700, "step": 9804, "time_per_iteration": 2.7451324462890625 }, { "auxiliary_loss_clip": 0.01091882, "auxiliary_loss_mlp": 0.01030959, "balance_loss_clip": 1.04156733, "balance_loss_mlp": 1.01769745, "epoch": 0.5895084924094394, "flos": 17778223877760.0, "grad_norm": 1.6374324136970364, "language_loss": 0.74669635, "learning_rate": 1.5219359087845234e-06, "loss": 0.76792479, "num_input_tokens_seen": 211324835, "step": 9805, "time_per_iteration": 2.6853296756744385 }, { "auxiliary_loss_clip": 0.01113863, "auxiliary_loss_mlp": 0.00772031, "balance_loss_clip": 1.04102564, "balance_loss_mlp": 1.00008976, "epoch": 0.5895686156621073, "flos": 20121072468480.0, "grad_norm": 1.677515475610003, "language_loss": 0.77973545, "learning_rate": 1.5215577460097174e-06, "loss": 0.79859436, "num_input_tokens_seen": 211344130, "step": 9806, "time_per_iteration": 2.6450774669647217 }, { "auxiliary_loss_clip": 0.01117555, "auxiliary_loss_mlp": 0.01031595, "balance_loss_clip": 1.0410825, "balance_loss_mlp": 1.01801682, "epoch": 0.5896287389147753, "flos": 20850058990080.0, "grad_norm": 1.7162663032269994, "language_loss": 0.76973009, "learning_rate": 1.5211796013774887e-06, "loss": 0.79122162, "num_input_tokens_seen": 211362915, "step": 9807, "time_per_iteration": 2.5557191371917725 }, { "auxiliary_loss_clip": 0.01115136, "auxiliary_loss_mlp": 0.01029659, "balance_loss_clip": 1.04593015, "balance_loss_mlp": 1.01563966, "epoch": 0.5896888621674432, "flos": 14537897043840.0, "grad_norm": 1.9630689597763404, "language_loss": 0.74407029, "learning_rate": 1.5208014749021786e-06, "loss": 0.76551819, "num_input_tokens_seen": 211380700, "step": 9808, "time_per_iteration": 2.649773359298706 }, { "auxiliary_loss_clip": 0.01069554, "auxiliary_loss_mlp": 0.01030007, "balance_loss_clip": 1.03687882, "balance_loss_mlp": 1.01540375, "epoch": 0.5897489854201112, "flos": 20886759711360.0, "grad_norm": 2.8224307817464194, "language_loss": 0.72173887, "learning_rate": 1.5204233665981236e-06, "loss": 0.74273449, "num_input_tokens_seen": 211400095, "step": 9809, "time_per_iteration": 2.8795154094696045 }, { "auxiliary_loss_clip": 0.01097105, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.03962231, "balance_loss_mlp": 1.02155459, "epoch": 0.5898091086727792, "flos": 20011149872640.0, "grad_norm": 1.9654509433248524, "language_loss": 0.82251418, "learning_rate": 1.5200452764796627e-06, "loss": 0.84384131, "num_input_tokens_seen": 211417810, "step": 9810, "time_per_iteration": 2.7300972938537598 }, { "auxiliary_loss_clip": 0.01108515, "auxiliary_loss_mlp": 0.01035386, "balance_loss_clip": 1.04266787, "balance_loss_mlp": 1.02213001, "epoch": 0.5898692319254472, "flos": 16253242012800.0, "grad_norm": 2.8325616643541043, "language_loss": 0.80945516, "learning_rate": 1.5196672045611336e-06, "loss": 0.83089411, "num_input_tokens_seen": 211436020, "step": 9811, "time_per_iteration": 2.6033973693847656 }, { "auxiliary_loss_clip": 0.01114433, "auxiliary_loss_mlp": 0.01031742, "balance_loss_clip": 1.04528522, "balance_loss_mlp": 1.01666236, "epoch": 0.5899293551781152, "flos": 20448541785600.0, "grad_norm": 2.9067717634400174, "language_loss": 0.77026772, "learning_rate": 1.5192891508568715e-06, "loss": 0.79172945, "num_input_tokens_seen": 211454335, "step": 9812, "time_per_iteration": 2.6283788681030273 }, { "auxiliary_loss_clip": 0.01085179, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.04126084, "balance_loss_mlp": 1.01832533, "epoch": 0.5899894784307831, "flos": 13881701433600.0, "grad_norm": 2.0160065726104426, "language_loss": 0.70596051, "learning_rate": 1.5189111153812133e-06, "loss": 0.72711378, "num_input_tokens_seen": 211472775, "step": 9813, "time_per_iteration": 2.7235190868377686 }, { "auxiliary_loss_clip": 0.01094818, "auxiliary_loss_mlp": 0.01038761, "balance_loss_clip": 1.04338694, "balance_loss_mlp": 1.02489126, "epoch": 0.5900496016834511, "flos": 20083797129600.0, "grad_norm": 10.075807478503481, "language_loss": 0.72172022, "learning_rate": 1.518533098148494e-06, "loss": 0.74305606, "num_input_tokens_seen": 211492195, "step": 9814, "time_per_iteration": 2.7245450019836426 }, { "auxiliary_loss_clip": 0.01093647, "auxiliary_loss_mlp": 0.01037117, "balance_loss_clip": 1.04272461, "balance_loss_mlp": 1.02348518, "epoch": 0.590109724936119, "flos": 20259148348800.0, "grad_norm": 1.7959189057174523, "language_loss": 0.78608483, "learning_rate": 1.5181550991730476e-06, "loss": 0.80739248, "num_input_tokens_seen": 211510220, "step": 9815, "time_per_iteration": 2.624587297439575 }, { "auxiliary_loss_clip": 0.0109595, "auxiliary_loss_mlp": 0.0077231, "balance_loss_clip": 1.04222631, "balance_loss_mlp": 1.00011277, "epoch": 0.590169848188787, "flos": 24235069806720.0, "grad_norm": 1.934955250523914, "language_loss": 0.75605524, "learning_rate": 1.5177771184692083e-06, "loss": 0.77473778, "num_input_tokens_seen": 211526260, "step": 9816, "time_per_iteration": 2.805889844894409 }, { "auxiliary_loss_clip": 0.01120987, "auxiliary_loss_mlp": 0.01039982, "balance_loss_clip": 1.04457593, "balance_loss_mlp": 1.02636814, "epoch": 0.590229971441455, "flos": 17784724239360.0, "grad_norm": 1.761702620923252, "language_loss": 0.81330854, "learning_rate": 1.517399156051309e-06, "loss": 0.8349182, "num_input_tokens_seen": 211542890, "step": 9817, "time_per_iteration": 2.5694470405578613 }, { "auxiliary_loss_clip": 0.0106411, "auxiliary_loss_mlp": 0.01046757, "balance_loss_clip": 1.03651428, "balance_loss_mlp": 1.03204691, "epoch": 0.590290094694123, "flos": 22236893147520.0, "grad_norm": 1.6227389463072333, "language_loss": 0.7634322, "learning_rate": 1.517021211933682e-06, "loss": 0.78454089, "num_input_tokens_seen": 211562685, "step": 9818, "time_per_iteration": 2.7369279861450195 }, { "auxiliary_loss_clip": 0.0108334, "auxiliary_loss_mlp": 0.01037737, "balance_loss_clip": 1.04248178, "balance_loss_mlp": 1.02451682, "epoch": 0.5903502179467909, "flos": 19098623831040.0, "grad_norm": 2.2508579930127333, "language_loss": 0.66751575, "learning_rate": 1.5166432861306592e-06, "loss": 0.68872648, "num_input_tokens_seen": 211579960, "step": 9819, "time_per_iteration": 2.683518648147583 }, { "auxiliary_loss_clip": 0.01121974, "auxiliary_loss_mlp": 0.01034215, "balance_loss_clip": 1.04451931, "balance_loss_mlp": 1.02100039, "epoch": 0.5904103411994589, "flos": 24235500769920.0, "grad_norm": 1.5861802995785013, "language_loss": 0.78221858, "learning_rate": 1.5162653786565714e-06, "loss": 0.80378044, "num_input_tokens_seen": 211599310, "step": 9820, "time_per_iteration": 2.67228627204895 }, { "auxiliary_loss_clip": 0.01010393, "auxiliary_loss_mlp": 0.01023264, "balance_loss_clip": 1.01880515, "balance_loss_mlp": 1.02123773, "epoch": 0.5904704644521268, "flos": 64876613045760.0, "grad_norm": 0.9671648573222682, "language_loss": 0.65189892, "learning_rate": 1.5158874895257487e-06, "loss": 0.67223543, "num_input_tokens_seen": 211658790, "step": 9821, "time_per_iteration": 4.79486083984375 }, { "auxiliary_loss_clip": 0.01079974, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.04072082, "balance_loss_mlp": 1.0247488, "epoch": 0.5905305877047948, "flos": 19609991804160.0, "grad_norm": 1.8549459171527238, "language_loss": 0.61307114, "learning_rate": 1.515509618752521e-06, "loss": 0.63425475, "num_input_tokens_seen": 211677240, "step": 9822, "time_per_iteration": 5.756153345108032 }, { "auxiliary_loss_clip": 0.01122858, "auxiliary_loss_mlp": 0.01041517, "balance_loss_clip": 1.04382062, "balance_loss_mlp": 1.02788556, "epoch": 0.5905907109574628, "flos": 18989634988800.0, "grad_norm": 2.151764899841445, "language_loss": 0.82442653, "learning_rate": 1.5151317663512173e-06, "loss": 0.84607029, "num_input_tokens_seen": 211695485, "step": 9823, "time_per_iteration": 2.6660759449005127 }, { "auxiliary_loss_clip": 0.01098497, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.04229546, "balance_loss_mlp": 1.0183413, "epoch": 0.5906508342101308, "flos": 22200407907840.0, "grad_norm": 1.984006151976339, "language_loss": 0.72755098, "learning_rate": 1.514753932336165e-06, "loss": 0.74885976, "num_input_tokens_seen": 211713090, "step": 9824, "time_per_iteration": 2.679081439971924 }, { "auxiliary_loss_clip": 0.01095276, "auxiliary_loss_mlp": 0.00772718, "balance_loss_clip": 1.04067087, "balance_loss_mlp": 1.00008035, "epoch": 0.5907109574627988, "flos": 20886687884160.0, "grad_norm": 2.158910240340413, "language_loss": 0.82870126, "learning_rate": 1.514376116721693e-06, "loss": 0.84738123, "num_input_tokens_seen": 211732510, "step": 9825, "time_per_iteration": 2.719106674194336 }, { "auxiliary_loss_clip": 0.0110445, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.04120886, "balance_loss_mlp": 1.02252591, "epoch": 0.5907710807154667, "flos": 21506649649920.0, "grad_norm": 1.7542204465206233, "language_loss": 0.76779485, "learning_rate": 1.5139983195221272e-06, "loss": 0.78918207, "num_input_tokens_seen": 211748695, "step": 9826, "time_per_iteration": 4.231219291687012 }, { "auxiliary_loss_clip": 0.01094981, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.04213846, "balance_loss_mlp": 1.01828933, "epoch": 0.5908312039681347, "flos": 22018376759040.0, "grad_norm": 1.9593281360323977, "language_loss": 0.72049826, "learning_rate": 1.513620540751793e-06, "loss": 0.74176061, "num_input_tokens_seen": 211768545, "step": 9827, "time_per_iteration": 2.654449462890625 }, { "auxiliary_loss_clip": 0.01073518, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.03849053, "balance_loss_mlp": 1.02111387, "epoch": 0.5908913272208026, "flos": 18479523991680.0, "grad_norm": 1.6640399072146284, "language_loss": 0.79552126, "learning_rate": 1.5132427804250178e-06, "loss": 0.8165915, "num_input_tokens_seen": 211786665, "step": 9828, "time_per_iteration": 2.8060965538024902 }, { "auxiliary_loss_clip": 0.01065495, "auxiliary_loss_mlp": 0.01038324, "balance_loss_clip": 1.04091191, "balance_loss_mlp": 1.02510321, "epoch": 0.5909514504734706, "flos": 12312189682560.0, "grad_norm": 1.8739746775685384, "language_loss": 0.88231647, "learning_rate": 1.5128650385561241e-06, "loss": 0.90335464, "num_input_tokens_seen": 211801215, "step": 9829, "time_per_iteration": 2.819425106048584 }, { "auxiliary_loss_clip": 0.01023107, "auxiliary_loss_mlp": 0.01007549, "balance_loss_clip": 1.01821566, "balance_loss_mlp": 1.00632119, "epoch": 0.5910115737261386, "flos": 70213262451840.0, "grad_norm": 0.7698473487867592, "language_loss": 0.57849222, "learning_rate": 1.5124873151594376e-06, "loss": 0.59879881, "num_input_tokens_seen": 211857005, "step": 9830, "time_per_iteration": 3.1567955017089844 }, { "auxiliary_loss_clip": 0.01114755, "auxiliary_loss_mlp": 0.00772402, "balance_loss_clip": 1.04340577, "balance_loss_mlp": 1.0002377, "epoch": 0.5910716969788066, "flos": 22017766227840.0, "grad_norm": 2.1363303387386723, "language_loss": 0.75768107, "learning_rate": 1.5121096102492812e-06, "loss": 0.77655268, "num_input_tokens_seen": 211876675, "step": 9831, "time_per_iteration": 2.7048380374908447 }, { "auxiliary_loss_clip": 0.01089263, "auxiliary_loss_mlp": 0.01034604, "balance_loss_clip": 1.04322839, "balance_loss_mlp": 1.02142549, "epoch": 0.5911318202314745, "flos": 21251648021760.0, "grad_norm": 1.6552693507472749, "language_loss": 0.77847427, "learning_rate": 1.5117319238399767e-06, "loss": 0.79971302, "num_input_tokens_seen": 211895725, "step": 9832, "time_per_iteration": 2.716529369354248 }, { "auxiliary_loss_clip": 0.01105775, "auxiliary_loss_mlp": 0.01031029, "balance_loss_clip": 1.04159164, "balance_loss_mlp": 1.01780295, "epoch": 0.5911919434841425, "flos": 17821604528640.0, "grad_norm": 1.9563179904860062, "language_loss": 0.83245647, "learning_rate": 1.511354255945847e-06, "loss": 0.8538245, "num_input_tokens_seen": 211913860, "step": 9833, "time_per_iteration": 2.641958236694336 }, { "auxiliary_loss_clip": 0.0110771, "auxiliary_loss_mlp": 0.01038041, "balance_loss_clip": 1.04046118, "balance_loss_mlp": 1.02435589, "epoch": 0.5912520667368104, "flos": 20374781207040.0, "grad_norm": 1.5336556134798032, "language_loss": 0.74267918, "learning_rate": 1.5109766065812123e-06, "loss": 0.76413667, "num_input_tokens_seen": 211932880, "step": 9834, "time_per_iteration": 2.628453016281128 }, { "auxiliary_loss_clip": 0.01119479, "auxiliary_loss_mlp": 0.01034016, "balance_loss_clip": 1.04244208, "balance_loss_mlp": 1.02121329, "epoch": 0.5913121899894784, "flos": 17930557457280.0, "grad_norm": 2.771797648904754, "language_loss": 0.78298235, "learning_rate": 1.5105989757603942e-06, "loss": 0.80451727, "num_input_tokens_seen": 211948625, "step": 9835, "time_per_iteration": 2.5689404010772705 }, { "auxiliary_loss_clip": 0.01095655, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.03806067, "balance_loss_mlp": 1.0237323, "epoch": 0.5913723132421465, "flos": 22126934638080.0, "grad_norm": 1.8733256786117318, "language_loss": 0.73799431, "learning_rate": 1.5102213634977117e-06, "loss": 0.75931853, "num_input_tokens_seen": 211965355, "step": 9836, "time_per_iteration": 2.695117712020874 }, { "auxiliary_loss_clip": 0.01083057, "auxiliary_loss_mlp": 0.01035766, "balance_loss_clip": 1.03943884, "balance_loss_mlp": 1.02149653, "epoch": 0.5914324364948144, "flos": 15697918771200.0, "grad_norm": 1.9392468028622023, "language_loss": 0.82138634, "learning_rate": 1.5098437698074841e-06, "loss": 0.84257448, "num_input_tokens_seen": 211982245, "step": 9837, "time_per_iteration": 2.6912343502044678 }, { "auxiliary_loss_clip": 0.01078463, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.03632522, "balance_loss_mlp": 1.02026534, "epoch": 0.5914925597474824, "flos": 22747327367040.0, "grad_norm": 2.27741138864597, "language_loss": 0.79637218, "learning_rate": 1.5094661947040304e-06, "loss": 0.81750751, "num_input_tokens_seen": 212000250, "step": 9838, "time_per_iteration": 2.6449244022369385 }, { "auxiliary_loss_clip": 0.010718, "auxiliary_loss_mlp": 0.01039396, "balance_loss_clip": 1.04010475, "balance_loss_mlp": 1.02605057, "epoch": 0.5915526830001503, "flos": 18292788161280.0, "grad_norm": 1.9685283368258655, "language_loss": 0.69672906, "learning_rate": 1.5090886382016673e-06, "loss": 0.71784103, "num_input_tokens_seen": 212017505, "step": 9839, "time_per_iteration": 2.76196026802063 }, { "auxiliary_loss_clip": 0.01093291, "auxiliary_loss_mlp": 0.01043789, "balance_loss_clip": 1.04008913, "balance_loss_mlp": 1.0308131, "epoch": 0.5916128062528183, "flos": 17019072910080.0, "grad_norm": 2.7566603972322943, "language_loss": 0.65802211, "learning_rate": 1.5087111003147124e-06, "loss": 0.67939293, "num_input_tokens_seen": 212034595, "step": 9840, "time_per_iteration": 2.647179365158081 }, { "auxiliary_loss_clip": 0.01095524, "auxiliary_loss_mlp": 0.01030956, "balance_loss_clip": 1.04105091, "balance_loss_mlp": 1.0170027, "epoch": 0.5916729295054862, "flos": 24754231031040.0, "grad_norm": 1.7835451737672352, "language_loss": 0.81441593, "learning_rate": 1.5083335810574813e-06, "loss": 0.83568072, "num_input_tokens_seen": 212055775, "step": 9841, "time_per_iteration": 2.693742036819458 }, { "auxiliary_loss_clip": 0.01090733, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.04020691, "balance_loss_mlp": 1.01772296, "epoch": 0.5917330527581542, "flos": 15958199698560.0, "grad_norm": 1.7111294758223268, "language_loss": 0.69152761, "learning_rate": 1.507956080444291e-06, "loss": 0.71273863, "num_input_tokens_seen": 212074000, "step": 9842, "time_per_iteration": 2.6797986030578613 }, { "auxiliary_loss_clip": 0.01093141, "auxiliary_loss_mlp": 0.0103715, "balance_loss_clip": 1.03811431, "balance_loss_mlp": 1.02367949, "epoch": 0.5917931760108222, "flos": 23800730549760.0, "grad_norm": 3.159007391867861, "language_loss": 0.83409858, "learning_rate": 1.5075785984894549e-06, "loss": 0.85540152, "num_input_tokens_seen": 212091415, "step": 9843, "time_per_iteration": 2.7194371223449707 }, { "auxiliary_loss_clip": 0.01090728, "auxiliary_loss_mlp": 0.01031987, "balance_loss_clip": 1.03646731, "balance_loss_mlp": 1.01762211, "epoch": 0.5918532992634902, "flos": 23249609199360.0, "grad_norm": 5.395713728013965, "language_loss": 0.81329596, "learning_rate": 1.5072011352072875e-06, "loss": 0.83452308, "num_input_tokens_seen": 212105255, "step": 9844, "time_per_iteration": 2.7136270999908447 }, { "auxiliary_loss_clip": 0.01068008, "auxiliary_loss_mlp": 0.01030142, "balance_loss_clip": 1.03874016, "balance_loss_mlp": 1.01633775, "epoch": 0.5919134225161581, "flos": 19499853726720.0, "grad_norm": 1.8542895008446525, "language_loss": 0.74591327, "learning_rate": 1.5068236906121032e-06, "loss": 0.7668947, "num_input_tokens_seen": 212122765, "step": 9845, "time_per_iteration": 2.781914710998535 }, { "auxiliary_loss_clip": 0.01077949, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.03821266, "balance_loss_mlp": 1.01837575, "epoch": 0.5919735457688261, "flos": 38800940567040.0, "grad_norm": 1.69458434045341, "language_loss": 0.63799906, "learning_rate": 1.506446264718213e-06, "loss": 0.65912199, "num_input_tokens_seen": 212143960, "step": 9846, "time_per_iteration": 2.8427982330322266 }, { "auxiliary_loss_clip": 0.01076538, "auxiliary_loss_mlp": 0.00769552, "balance_loss_clip": 1.03801441, "balance_loss_mlp": 1.00004482, "epoch": 0.592033669021494, "flos": 22163994495360.0, "grad_norm": 1.809865828874733, "language_loss": 0.76013452, "learning_rate": 1.506068857539931e-06, "loss": 0.77859539, "num_input_tokens_seen": 212162005, "step": 9847, "time_per_iteration": 2.737806797027588 }, { "auxiliary_loss_clip": 0.01092495, "auxiliary_loss_mlp": 0.01031315, "balance_loss_clip": 1.03829622, "balance_loss_mlp": 1.01720047, "epoch": 0.592093792274162, "flos": 22710985781760.0, "grad_norm": 1.7217593328479819, "language_loss": 0.62444723, "learning_rate": 1.5056914690915667e-06, "loss": 0.64568532, "num_input_tokens_seen": 212181635, "step": 9848, "time_per_iteration": 2.768158197402954 }, { "auxiliary_loss_clip": 0.01108627, "auxiliary_loss_mlp": 0.01039243, "balance_loss_clip": 1.04256344, "balance_loss_mlp": 1.02609384, "epoch": 0.59215391552683, "flos": 22528954632960.0, "grad_norm": 1.7269094299177161, "language_loss": 0.75832105, "learning_rate": 1.5053140993874312e-06, "loss": 0.7797997, "num_input_tokens_seen": 212201615, "step": 9849, "time_per_iteration": 2.6506807804107666 }, { "auxiliary_loss_clip": 0.01095576, "auxiliary_loss_mlp": 0.01036342, "balance_loss_clip": 1.04088306, "balance_loss_mlp": 1.02223945, "epoch": 0.592214038779498, "flos": 24499013921280.0, "grad_norm": 2.077646783474588, "language_loss": 0.75440395, "learning_rate": 1.5049367484418353e-06, "loss": 0.7757231, "num_input_tokens_seen": 212219355, "step": 9850, "time_per_iteration": 2.738163471221924 }, { "auxiliary_loss_clip": 0.01079223, "auxiliary_loss_mlp": 0.01038556, "balance_loss_clip": 1.0389607, "balance_loss_mlp": 1.02532411, "epoch": 0.592274162032166, "flos": 21831353619840.0, "grad_norm": 2.0657919494048094, "language_loss": 0.75485742, "learning_rate": 1.5045594162690868e-06, "loss": 0.77603519, "num_input_tokens_seen": 212236710, "step": 9851, "time_per_iteration": 2.7006642818450928 }, { "auxiliary_loss_clip": 0.0109594, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.04149699, "balance_loss_mlp": 1.01846635, "epoch": 0.5923342852848339, "flos": 24608146417920.0, "grad_norm": 1.9468749498411155, "language_loss": 0.7089386, "learning_rate": 1.5041821028834954e-06, "loss": 0.73021513, "num_input_tokens_seen": 212256195, "step": 9852, "time_per_iteration": 2.706106424331665 }, { "auxiliary_loss_clip": 0.01104361, "auxiliary_loss_mlp": 0.0077249, "balance_loss_clip": 1.04451549, "balance_loss_mlp": 1.00008225, "epoch": 0.5923944085375019, "flos": 19938143479680.0, "grad_norm": 1.600717143056076, "language_loss": 0.80555183, "learning_rate": 1.5038048082993685e-06, "loss": 0.82432032, "num_input_tokens_seen": 212274085, "step": 9853, "time_per_iteration": 2.7119646072387695 }, { "auxiliary_loss_clip": 0.01088586, "auxiliary_loss_mlp": 0.01028953, "balance_loss_clip": 1.03719842, "balance_loss_mlp": 1.01654959, "epoch": 0.5924545317901698, "flos": 28658510812800.0, "grad_norm": 1.9598293021275044, "language_loss": 0.67597294, "learning_rate": 1.5034275325310124e-06, "loss": 0.69714832, "num_input_tokens_seen": 212295530, "step": 9854, "time_per_iteration": 2.7060039043426514 }, { "auxiliary_loss_clip": 0.01081304, "auxiliary_loss_mlp": 0.01029538, "balance_loss_clip": 1.03990042, "balance_loss_mlp": 1.01680636, "epoch": 0.5925146550428378, "flos": 19864885691520.0, "grad_norm": 1.7821900938554989, "language_loss": 0.88811159, "learning_rate": 1.5030502755927344e-06, "loss": 0.90921998, "num_input_tokens_seen": 212313770, "step": 9855, "time_per_iteration": 2.749842882156372 }, { "auxiliary_loss_clip": 0.01097397, "auxiliary_loss_mlp": 0.01031382, "balance_loss_clip": 1.04023433, "balance_loss_mlp": 1.01912177, "epoch": 0.5925747782955058, "flos": 15122989681920.0, "grad_norm": 1.7553886735756365, "language_loss": 0.86097872, "learning_rate": 1.5026730374988397e-06, "loss": 0.8822664, "num_input_tokens_seen": 212331525, "step": 9856, "time_per_iteration": 2.8213181495666504 }, { "auxiliary_loss_clip": 0.0110594, "auxiliary_loss_mlp": 0.01036211, "balance_loss_clip": 1.03984308, "balance_loss_mlp": 1.02389097, "epoch": 0.5926349015481738, "flos": 18405440190720.0, "grad_norm": 3.6746631679389536, "language_loss": 0.77349007, "learning_rate": 1.5022958182636332e-06, "loss": 0.79491156, "num_input_tokens_seen": 212347295, "step": 9857, "time_per_iteration": 2.6580264568328857 }, { "auxiliary_loss_clip": 0.0107388, "auxiliary_loss_mlp": 0.01051977, "balance_loss_clip": 1.03587079, "balance_loss_mlp": 1.03689682, "epoch": 0.5926950248008417, "flos": 23111138269440.0, "grad_norm": 2.383524132494838, "language_loss": 0.64598405, "learning_rate": 1.501918617901419e-06, "loss": 0.66724265, "num_input_tokens_seen": 212365750, "step": 9858, "time_per_iteration": 2.7002615928649902 }, { "auxiliary_loss_clip": 0.01103608, "auxiliary_loss_mlp": 0.01033595, "balance_loss_clip": 1.04055738, "balance_loss_mlp": 1.02088773, "epoch": 0.5927551480535097, "flos": 28033916192640.0, "grad_norm": 1.88700094462338, "language_loss": 0.77598989, "learning_rate": 1.501541436426501e-06, "loss": 0.79736185, "num_input_tokens_seen": 212385300, "step": 9859, "time_per_iteration": 4.434144496917725 }, { "auxiliary_loss_clip": 0.01078779, "auxiliary_loss_mlp": 0.00771508, "balance_loss_clip": 1.04448819, "balance_loss_mlp": 1.00007796, "epoch": 0.5928152713061776, "flos": 21798675221760.0, "grad_norm": 4.274702781757113, "language_loss": 0.74740881, "learning_rate": 1.5011642738531818e-06, "loss": 0.7659117, "num_input_tokens_seen": 212402140, "step": 9860, "time_per_iteration": 2.8576431274414062 }, { "auxiliary_loss_clip": 0.01080315, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.04223692, "balance_loss_mlp": 1.02289104, "epoch": 0.5928753945588456, "flos": 24316839118080.0, "grad_norm": 1.6207851458155365, "language_loss": 0.7622723, "learning_rate": 1.500787130195763e-06, "loss": 0.7834208, "num_input_tokens_seen": 212421790, "step": 9861, "time_per_iteration": 5.779749393463135 }, { "auxiliary_loss_clip": 0.01079641, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.03737628, "balance_loss_mlp": 1.0201298, "epoch": 0.5929355178115137, "flos": 26464619923200.0, "grad_norm": 2.31911103307255, "language_loss": 0.70733476, "learning_rate": 1.5004100054685465e-06, "loss": 0.72845423, "num_input_tokens_seen": 212442115, "step": 9862, "time_per_iteration": 2.7879045009613037 }, { "auxiliary_loss_clip": 0.01057596, "auxiliary_loss_mlp": 0.01034108, "balance_loss_clip": 1.03278732, "balance_loss_mlp": 1.02148342, "epoch": 0.5929956410641816, "flos": 24965995662720.0, "grad_norm": 1.7884457502004503, "language_loss": 0.78123254, "learning_rate": 1.500032899685832e-06, "loss": 0.80214959, "num_input_tokens_seen": 212459535, "step": 9863, "time_per_iteration": 2.7296791076660156 }, { "auxiliary_loss_clip": 0.01089944, "auxiliary_loss_mlp": 0.01040962, "balance_loss_clip": 1.03986549, "balance_loss_mlp": 1.02770567, "epoch": 0.5930557643168496, "flos": 26208325405440.0, "grad_norm": 2.4622472815237506, "language_loss": 0.70487082, "learning_rate": 1.499655812861921e-06, "loss": 0.72617984, "num_input_tokens_seen": 212479385, "step": 9864, "time_per_iteration": 2.6773011684417725 }, { "auxiliary_loss_clip": 0.01089195, "auxiliary_loss_mlp": 0.01036172, "balance_loss_clip": 1.03835356, "balance_loss_mlp": 1.0226891, "epoch": 0.5931158875695175, "flos": 27854937699840.0, "grad_norm": 1.4468399758370936, "language_loss": 0.67205417, "learning_rate": 1.4992787450111112e-06, "loss": 0.69330788, "num_input_tokens_seen": 212500060, "step": 9865, "time_per_iteration": 4.260905981063843 }, { "auxiliary_loss_clip": 0.01098878, "auxiliary_loss_mlp": 0.0103771, "balance_loss_clip": 1.04014802, "balance_loss_mlp": 1.02411962, "epoch": 0.5931760108221855, "flos": 15413650536960.0, "grad_norm": 1.9702875461989908, "language_loss": 0.77913535, "learning_rate": 1.4989016961477015e-06, "loss": 0.80050123, "num_input_tokens_seen": 212518590, "step": 9866, "time_per_iteration": 2.6692967414855957 }, { "auxiliary_loss_clip": 0.01090663, "auxiliary_loss_mlp": 0.01031022, "balance_loss_clip": 1.04043937, "balance_loss_mlp": 1.01891649, "epoch": 0.5932361340748534, "flos": 30188520581760.0, "grad_norm": 2.3223854732809364, "language_loss": 0.71955562, "learning_rate": 1.4985246662859903e-06, "loss": 0.74077249, "num_input_tokens_seen": 212538190, "step": 9867, "time_per_iteration": 2.73850679397583 }, { "auxiliary_loss_clip": 0.01094459, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.04182947, "balance_loss_mlp": 1.01644969, "epoch": 0.5932962573275214, "flos": 20157557708160.0, "grad_norm": 1.577108097655746, "language_loss": 0.66789985, "learning_rate": 1.4981476554402732e-06, "loss": 0.68914956, "num_input_tokens_seen": 212557820, "step": 9868, "time_per_iteration": 2.776890277862549 }, { "auxiliary_loss_clip": 0.01060162, "auxiliary_loss_mlp": 0.00771363, "balance_loss_clip": 1.03597963, "balance_loss_mlp": 1.00004768, "epoch": 0.5933563805801894, "flos": 25445906300160.0, "grad_norm": 1.613226423561444, "language_loss": 0.75353992, "learning_rate": 1.4977706636248478e-06, "loss": 0.77185524, "num_input_tokens_seen": 212577645, "step": 9869, "time_per_iteration": 2.8630988597869873 }, { "auxiliary_loss_clip": 0.010636, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.03897762, "balance_loss_mlp": 1.02469635, "epoch": 0.5934165038328574, "flos": 59995740337920.0, "grad_norm": 1.8583969258808255, "language_loss": 0.74005115, "learning_rate": 1.4973936908540091e-06, "loss": 0.76106727, "num_input_tokens_seen": 212603430, "step": 9870, "time_per_iteration": 3.0915732383728027 }, { "auxiliary_loss_clip": 0.01071863, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.03705025, "balance_loss_mlp": 1.01810646, "epoch": 0.5934766270855253, "flos": 24420548661120.0, "grad_norm": 2.145127507644007, "language_loss": 0.7232281, "learning_rate": 1.4970167371420517e-06, "loss": 0.7442562, "num_input_tokens_seen": 212620730, "step": 9871, "time_per_iteration": 2.7629406452178955 }, { "auxiliary_loss_clip": 0.01086004, "auxiliary_loss_mlp": 0.01031261, "balance_loss_clip": 1.04104018, "balance_loss_mlp": 1.01764774, "epoch": 0.5935367503381933, "flos": 23513158264320.0, "grad_norm": 2.0164353140130835, "language_loss": 0.74587923, "learning_rate": 1.496639802503271e-06, "loss": 0.76705188, "num_input_tokens_seen": 212639745, "step": 9872, "time_per_iteration": 2.74772310256958 }, { "auxiliary_loss_clip": 0.01111382, "auxiliary_loss_mlp": 0.01038485, "balance_loss_clip": 1.04180598, "balance_loss_mlp": 1.02414966, "epoch": 0.5935968735908612, "flos": 18948337326720.0, "grad_norm": 2.3277369002939388, "language_loss": 0.79620034, "learning_rate": 1.4962628869519583e-06, "loss": 0.81769902, "num_input_tokens_seen": 212655915, "step": 9873, "time_per_iteration": 2.663547992706299 }, { "auxiliary_loss_clip": 0.01108216, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.04269648, "balance_loss_mlp": 1.021523, "epoch": 0.5936569968435292, "flos": 25483433034240.0, "grad_norm": 1.6892324145577737, "language_loss": 0.8490203, "learning_rate": 1.4958859905024078e-06, "loss": 0.87045169, "num_input_tokens_seen": 212676115, "step": 9874, "time_per_iteration": 2.654606580734253 }, { "auxiliary_loss_clip": 0.01019729, "auxiliary_loss_mlp": 0.01001192, "balance_loss_clip": 1.01379979, "balance_loss_mlp": 0.99991626, "epoch": 0.5937171200961973, "flos": 66378361789440.0, "grad_norm": 0.7079839888277836, "language_loss": 0.59980857, "learning_rate": 1.4955091131689115e-06, "loss": 0.62001777, "num_input_tokens_seen": 212737560, "step": 9875, "time_per_iteration": 3.3108227252960205 }, { "auxiliary_loss_clip": 0.01094208, "auxiliary_loss_mlp": 0.01033507, "balance_loss_clip": 1.03624558, "balance_loss_mlp": 1.01859426, "epoch": 0.5937772433488652, "flos": 14903467712640.0, "grad_norm": 5.919714877847386, "language_loss": 0.7768054, "learning_rate": 1.4951322549657594e-06, "loss": 0.79808253, "num_input_tokens_seen": 212755365, "step": 9876, "time_per_iteration": 2.6835005283355713 }, { "auxiliary_loss_clip": 0.01097876, "auxiliary_loss_mlp": 0.01028372, "balance_loss_clip": 1.03590453, "balance_loss_mlp": 1.01630843, "epoch": 0.5938373666015332, "flos": 22561489376640.0, "grad_norm": 1.528829961767438, "language_loss": 0.75805295, "learning_rate": 1.494755415907243e-06, "loss": 0.77931547, "num_input_tokens_seen": 212773875, "step": 9877, "time_per_iteration": 2.703756332397461 }, { "auxiliary_loss_clip": 0.0110632, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.03964424, "balance_loss_mlp": 1.01493096, "epoch": 0.5938974898542011, "flos": 18440883936000.0, "grad_norm": 2.6694319382348666, "language_loss": 0.81408948, "learning_rate": 1.4943785960076522e-06, "loss": 0.83543718, "num_input_tokens_seen": 212790590, "step": 9878, "time_per_iteration": 2.6299495697021484 }, { "auxiliary_loss_clip": 0.01090649, "auxiliary_loss_mlp": 0.00772164, "balance_loss_clip": 1.03885496, "balance_loss_mlp": 1.00006008, "epoch": 0.5939576131068691, "flos": 45586728270720.0, "grad_norm": 1.7408999007224344, "language_loss": 0.71310401, "learning_rate": 1.4940017952812754e-06, "loss": 0.73173207, "num_input_tokens_seen": 212812265, "step": 9879, "time_per_iteration": 2.9403438568115234 }, { "auxiliary_loss_clip": 0.01107517, "auxiliary_loss_mlp": 0.01037191, "balance_loss_clip": 1.04333889, "balance_loss_mlp": 1.02471602, "epoch": 0.594017736359537, "flos": 23587708942080.0, "grad_norm": 1.6220417937962182, "language_loss": 0.5754692, "learning_rate": 1.493625013742401e-06, "loss": 0.59691632, "num_input_tokens_seen": 212831915, "step": 9880, "time_per_iteration": 2.722222089767456 }, { "auxiliary_loss_clip": 0.01108825, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.04171181, "balance_loss_mlp": 1.02144003, "epoch": 0.594077859612205, "flos": 29457235589760.0, "grad_norm": 1.8505883622927, "language_loss": 0.77141905, "learning_rate": 1.4932482514053177e-06, "loss": 0.79285634, "num_input_tokens_seen": 212851350, "step": 9881, "time_per_iteration": 2.7424824237823486 }, { "auxiliary_loss_clip": 0.01104617, "auxiliary_loss_mlp": 0.01027481, "balance_loss_clip": 1.0387702, "balance_loss_mlp": 1.01456428, "epoch": 0.594137982864873, "flos": 16800089644800.0, "grad_norm": 2.611625845648677, "language_loss": 0.82625538, "learning_rate": 1.4928715082843112e-06, "loss": 0.84757638, "num_input_tokens_seen": 212867995, "step": 9882, "time_per_iteration": 2.6125638484954834 }, { "auxiliary_loss_clip": 0.01108328, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.04283643, "balance_loss_mlp": 1.02419496, "epoch": 0.594198106117541, "flos": 12750263953920.0, "grad_norm": 2.4545417723722434, "language_loss": 0.79556072, "learning_rate": 1.492494784393667e-06, "loss": 0.81700939, "num_input_tokens_seen": 212885220, "step": 9883, "time_per_iteration": 2.6739277839660645 }, { "auxiliary_loss_clip": 0.01090609, "auxiliary_loss_mlp": 0.00770805, "balance_loss_clip": 1.04405499, "balance_loss_mlp": 1.00010085, "epoch": 0.5942582293702089, "flos": 20996538652800.0, "grad_norm": 2.530798381383893, "language_loss": 0.7459439, "learning_rate": 1.4921180797476725e-06, "loss": 0.76455808, "num_input_tokens_seen": 212903195, "step": 9884, "time_per_iteration": 2.720139503479004 }, { "auxiliary_loss_clip": 0.01118755, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.04366493, "balance_loss_mlp": 1.01757646, "epoch": 0.5943183526228769, "flos": 28291431772800.0, "grad_norm": 2.040352336443274, "language_loss": 0.66608262, "learning_rate": 1.4917413943606106e-06, "loss": 0.68757325, "num_input_tokens_seen": 212923340, "step": 9885, "time_per_iteration": 2.6618847846984863 }, { "auxiliary_loss_clip": 0.01093907, "auxiliary_loss_mlp": 0.01041351, "balance_loss_clip": 1.04138327, "balance_loss_mlp": 1.02835155, "epoch": 0.5943784758755448, "flos": 26614619118720.0, "grad_norm": 2.630158617128694, "language_loss": 0.77534634, "learning_rate": 1.4913647282467667e-06, "loss": 0.79669893, "num_input_tokens_seen": 212942755, "step": 9886, "time_per_iteration": 2.7532429695129395 }, { "auxiliary_loss_clip": 0.01025813, "auxiliary_loss_mlp": 0.01001276, "balance_loss_clip": 1.01382208, "balance_loss_mlp": 0.99997658, "epoch": 0.5944385991282128, "flos": 64190935347840.0, "grad_norm": 0.9149518659336237, "language_loss": 0.64530778, "learning_rate": 1.490988081420423e-06, "loss": 0.66557866, "num_input_tokens_seen": 212999355, "step": 9887, "time_per_iteration": 3.060612440109253 }, { "auxiliary_loss_clip": 0.01097622, "auxiliary_loss_mlp": 0.01032109, "balance_loss_clip": 1.03770781, "balance_loss_mlp": 1.01940084, "epoch": 0.5944987223808808, "flos": 19571998193280.0, "grad_norm": 1.6915419105373903, "language_loss": 0.69181025, "learning_rate": 1.4906114538958615e-06, "loss": 0.71310759, "num_input_tokens_seen": 213018570, "step": 9888, "time_per_iteration": 2.617629051208496 }, { "auxiliary_loss_clip": 0.01088883, "auxiliary_loss_mlp": 0.01034911, "balance_loss_clip": 1.03844309, "balance_loss_mlp": 1.02113008, "epoch": 0.5945588456335488, "flos": 26177586341760.0, "grad_norm": 2.5005305893435685, "language_loss": 0.79495192, "learning_rate": 1.490234845687366e-06, "loss": 0.81618989, "num_input_tokens_seen": 213037735, "step": 9889, "time_per_iteration": 2.685150146484375 }, { "auxiliary_loss_clip": 0.01080162, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.03793621, "balance_loss_mlp": 1.01496744, "epoch": 0.5946189688862168, "flos": 20446494710400.0, "grad_norm": 1.6110540672551508, "language_loss": 0.70713383, "learning_rate": 1.4898582568092154e-06, "loss": 0.72820497, "num_input_tokens_seen": 213057160, "step": 9890, "time_per_iteration": 2.7299606800079346 }, { "auxiliary_loss_clip": 0.01088716, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.04451787, "balance_loss_mlp": 1.01896358, "epoch": 0.5946790921388847, "flos": 13437521850240.0, "grad_norm": 1.9451498476517268, "language_loss": 0.69461864, "learning_rate": 1.489481687275691e-06, "loss": 0.71583426, "num_input_tokens_seen": 213073630, "step": 9891, "time_per_iteration": 2.7253577709198 }, { "auxiliary_loss_clip": 0.01104108, "auxiliary_loss_mlp": 0.01040464, "balance_loss_clip": 1.04076028, "balance_loss_mlp": 1.02784514, "epoch": 0.5947392153915527, "flos": 20412272027520.0, "grad_norm": 1.8738043279095635, "language_loss": 0.53252602, "learning_rate": 1.4891051371010726e-06, "loss": 0.55397171, "num_input_tokens_seen": 213092450, "step": 9892, "time_per_iteration": 2.630176067352295 }, { "auxiliary_loss_clip": 0.01007775, "auxiliary_loss_mlp": 0.01004642, "balance_loss_clip": 1.01469183, "balance_loss_mlp": 1.00331867, "epoch": 0.5947993386442206, "flos": 65619138994560.0, "grad_norm": 0.662438980473289, "language_loss": 0.54533142, "learning_rate": 1.4887286062996375e-06, "loss": 0.56545562, "num_input_tokens_seen": 213155465, "step": 9893, "time_per_iteration": 3.3319764137268066 }, { "auxiliary_loss_clip": 0.01079474, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.04197478, "balance_loss_mlp": 1.02362406, "epoch": 0.5948594618968887, "flos": 23183103168000.0, "grad_norm": 1.5803116085974762, "language_loss": 0.74965519, "learning_rate": 1.4883520948856658e-06, "loss": 0.77080828, "num_input_tokens_seen": 213174875, "step": 9894, "time_per_iteration": 2.708012104034424 }, { "auxiliary_loss_clip": 0.01084394, "auxiliary_loss_mlp": 0.01031071, "balance_loss_clip": 1.04066491, "balance_loss_mlp": 1.01860142, "epoch": 0.5949195851495566, "flos": 13626771632640.0, "grad_norm": 1.7370359553625463, "language_loss": 0.77732074, "learning_rate": 1.487975602873434e-06, "loss": 0.79847538, "num_input_tokens_seen": 213192695, "step": 9895, "time_per_iteration": 2.6831347942352295 }, { "auxiliary_loss_clip": 0.01067508, "auxiliary_loss_mlp": 0.01037328, "balance_loss_clip": 1.03781974, "balance_loss_mlp": 1.0233922, "epoch": 0.5949797084022246, "flos": 19751012599680.0, "grad_norm": 1.6095460497638086, "language_loss": 0.79347014, "learning_rate": 1.4875991302772182e-06, "loss": 0.81451851, "num_input_tokens_seen": 213211195, "step": 9896, "time_per_iteration": 2.7621328830718994 }, { "auxiliary_loss_clip": 0.01106477, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.04062951, "balance_loss_mlp": 1.02315736, "epoch": 0.5950398316548925, "flos": 25773878407680.0, "grad_norm": 1.5421424712505716, "language_loss": 0.83955193, "learning_rate": 1.4872226771112954e-06, "loss": 0.86097461, "num_input_tokens_seen": 213231975, "step": 9897, "time_per_iteration": 2.7152647972106934 }, { "auxiliary_loss_clip": 0.01092695, "auxiliary_loss_mlp": 0.01037147, "balance_loss_clip": 1.04191113, "balance_loss_mlp": 1.02490425, "epoch": 0.5950999549075605, "flos": 23039029716480.0, "grad_norm": 1.9245000057416703, "language_loss": 0.70950294, "learning_rate": 1.486846243389939e-06, "loss": 0.73080134, "num_input_tokens_seen": 213249760, "step": 9898, "time_per_iteration": 4.332275867462158 }, { "auxiliary_loss_clip": 0.01105674, "auxiliary_loss_mlp": 0.01044981, "balance_loss_clip": 1.03863168, "balance_loss_mlp": 1.02892375, "epoch": 0.5951600781602284, "flos": 32446367637120.0, "grad_norm": 2.443382879492767, "language_loss": 0.64050412, "learning_rate": 1.4864698291274251e-06, "loss": 0.66201067, "num_input_tokens_seen": 213269890, "step": 9899, "time_per_iteration": 2.747209072113037 }, { "auxiliary_loss_clip": 0.01117539, "auxiliary_loss_mlp": 0.01028742, "balance_loss_clip": 1.04378319, "balance_loss_mlp": 1.01740563, "epoch": 0.5952202014128964, "flos": 23800874204160.0, "grad_norm": 1.865552618204713, "language_loss": 0.71956146, "learning_rate": 1.4860934343380267e-06, "loss": 0.74102432, "num_input_tokens_seen": 213289400, "step": 9900, "time_per_iteration": 5.790768146514893 }, { "auxiliary_loss_clip": 0.01114892, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.04192626, "balance_loss_mlp": 1.01949835, "epoch": 0.5952803246655644, "flos": 22492182084480.0, "grad_norm": 1.7457638078039162, "language_loss": 0.84428406, "learning_rate": 1.4857170590360169e-06, "loss": 0.86575621, "num_input_tokens_seen": 213308040, "step": 9901, "time_per_iteration": 2.7782936096191406 }, { "auxiliary_loss_clip": 0.00993307, "auxiliary_loss_mlp": 0.01008976, "balance_loss_clip": 1.01768923, "balance_loss_mlp": 1.00779581, "epoch": 0.5953404479182324, "flos": 51234688851840.0, "grad_norm": 0.8002603783256921, "language_loss": 0.58178693, "learning_rate": 1.4853407032356674e-06, "loss": 0.60180974, "num_input_tokens_seen": 213358585, "step": 9902, "time_per_iteration": 3.245389699935913 }, { "auxiliary_loss_clip": 0.01059574, "auxiliary_loss_mlp": 0.01029206, "balance_loss_clip": 1.03823233, "balance_loss_mlp": 1.01596808, "epoch": 0.5954005711709004, "flos": 23112682554240.0, "grad_norm": 2.326170730098328, "language_loss": 0.77513373, "learning_rate": 1.4849643669512503e-06, "loss": 0.79602152, "num_input_tokens_seen": 213379585, "step": 9903, "time_per_iteration": 2.938472032546997 }, { "auxiliary_loss_clip": 0.01080471, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 1.04236233, "balance_loss_mlp": 1.02275109, "epoch": 0.5954606944235683, "flos": 35954732736000.0, "grad_norm": 3.664262182530453, "language_loss": 0.7767508, "learning_rate": 1.4845880501970362e-06, "loss": 0.79791057, "num_input_tokens_seen": 213401465, "step": 9904, "time_per_iteration": 4.397410869598389 }, { "auxiliary_loss_clip": 0.01102001, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.04016399, "balance_loss_mlp": 1.02507877, "epoch": 0.5955208176762363, "flos": 30443665864320.0, "grad_norm": 1.9431813333035064, "language_loss": 0.72943354, "learning_rate": 1.4842117529872942e-06, "loss": 0.7508347, "num_input_tokens_seen": 213422720, "step": 9905, "time_per_iteration": 2.7936177253723145 }, { "auxiliary_loss_clip": 0.01109363, "auxiliary_loss_mlp": 0.01030507, "balance_loss_clip": 1.04223228, "balance_loss_mlp": 1.01717925, "epoch": 0.5955809409289042, "flos": 17640112083840.0, "grad_norm": 1.9824269605474862, "language_loss": 0.70172507, "learning_rate": 1.483835475336295e-06, "loss": 0.72312379, "num_input_tokens_seen": 213439480, "step": 9906, "time_per_iteration": 2.6985738277435303 }, { "auxiliary_loss_clip": 0.01106299, "auxiliary_loss_mlp": 0.01032912, "balance_loss_clip": 1.04149914, "balance_loss_mlp": 1.01987052, "epoch": 0.5956410641815723, "flos": 24279887001600.0, "grad_norm": 1.8692952809001842, "language_loss": 0.75197554, "learning_rate": 1.4834592172583057e-06, "loss": 0.77336764, "num_input_tokens_seen": 213458895, "step": 9907, "time_per_iteration": 2.6980481147766113 }, { "auxiliary_loss_clip": 0.01088924, "auxiliary_loss_mlp": 0.01032034, "balance_loss_clip": 1.03741193, "balance_loss_mlp": 1.0194813, "epoch": 0.5957011874342402, "flos": 35734277013120.0, "grad_norm": 1.635771489703633, "language_loss": 0.67245162, "learning_rate": 1.483082978767595e-06, "loss": 0.69366121, "num_input_tokens_seen": 213481730, "step": 9908, "time_per_iteration": 2.7698655128479004 }, { "auxiliary_loss_clip": 0.01040116, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 1.03187275, "balance_loss_mlp": 1.0195055, "epoch": 0.5957613106869082, "flos": 21245004005760.0, "grad_norm": 1.9181869047737456, "language_loss": 0.76516539, "learning_rate": 1.4827067598784298e-06, "loss": 0.78588629, "num_input_tokens_seen": 213497225, "step": 9909, "time_per_iteration": 2.8098058700561523 }, { "auxiliary_loss_clip": 0.0103764, "auxiliary_loss_mlp": 0.01004774, "balance_loss_clip": 1.01340699, "balance_loss_mlp": 1.00373673, "epoch": 0.5958214339395761, "flos": 65940969876480.0, "grad_norm": 0.9280508663350204, "language_loss": 0.73383075, "learning_rate": 1.4823305606050753e-06, "loss": 0.75425494, "num_input_tokens_seen": 213556890, "step": 9910, "time_per_iteration": 3.228283166885376 }, { "auxiliary_loss_clip": 0.0109102, "auxiliary_loss_mlp": 0.01035168, "balance_loss_clip": 1.03882253, "balance_loss_mlp": 1.02188206, "epoch": 0.5958815571922441, "flos": 23218690567680.0, "grad_norm": 2.4798653486938544, "language_loss": 0.69676727, "learning_rate": 1.481954380961799e-06, "loss": 0.71802914, "num_input_tokens_seen": 213575800, "step": 9911, "time_per_iteration": 2.6699378490448 }, { "auxiliary_loss_clip": 0.01116036, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.04485154, "balance_loss_mlp": 1.01942098, "epoch": 0.595941680444912, "flos": 16538623568640.0, "grad_norm": 1.9669774674890577, "language_loss": 0.65873277, "learning_rate": 1.4815782209628631e-06, "loss": 0.68022704, "num_input_tokens_seen": 213592740, "step": 9912, "time_per_iteration": 2.642876386642456 }, { "auxiliary_loss_clip": 0.0108881, "auxiliary_loss_mlp": 0.01037146, "balance_loss_clip": 1.04177618, "balance_loss_mlp": 1.02360988, "epoch": 0.59600180369758, "flos": 27818883423360.0, "grad_norm": 1.9028573243158677, "language_loss": 0.73863906, "learning_rate": 1.4812020806225337e-06, "loss": 0.7598986, "num_input_tokens_seen": 213611970, "step": 9913, "time_per_iteration": 2.860369920730591 }, { "auxiliary_loss_clip": 0.01083137, "auxiliary_loss_mlp": 0.00770309, "balance_loss_clip": 1.03919995, "balance_loss_mlp": 1.0000217, "epoch": 0.596061926950248, "flos": 29491566013440.0, "grad_norm": 2.1966155200103907, "language_loss": 0.79778421, "learning_rate": 1.4808259599550738e-06, "loss": 0.81631863, "num_input_tokens_seen": 213632230, "step": 9914, "time_per_iteration": 2.790907382965088 }, { "auxiliary_loss_clip": 0.01079867, "auxiliary_loss_mlp": 0.01029281, "balance_loss_clip": 1.03796613, "balance_loss_mlp": 1.01610804, "epoch": 0.596122050202916, "flos": 16836790366080.0, "grad_norm": 1.724717360749454, "language_loss": 0.67540228, "learning_rate": 1.4804498589747448e-06, "loss": 0.69649374, "num_input_tokens_seen": 213649645, "step": 9915, "time_per_iteration": 2.701197385787964 }, { "auxiliary_loss_clip": 0.01088406, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.03837395, "balance_loss_mlp": 1.0187242, "epoch": 0.596182173455584, "flos": 20996646393600.0, "grad_norm": 1.462048268018942, "language_loss": 0.78788066, "learning_rate": 1.4800737776958095e-06, "loss": 0.8090741, "num_input_tokens_seen": 213668850, "step": 9916, "time_per_iteration": 2.7466511726379395 }, { "auxiliary_loss_clip": 0.01093274, "auxiliary_loss_mlp": 0.01031597, "balance_loss_clip": 1.03742838, "balance_loss_mlp": 1.01851332, "epoch": 0.5962422967082519, "flos": 16065680169600.0, "grad_norm": 1.8319257164110343, "language_loss": 0.8272475, "learning_rate": 1.4796977161325286e-06, "loss": 0.84849626, "num_input_tokens_seen": 213685695, "step": 9917, "time_per_iteration": 2.6762564182281494 }, { "auxiliary_loss_clip": 0.01090404, "auxiliary_loss_mlp": 0.01034476, "balance_loss_clip": 1.04083288, "balance_loss_mlp": 1.02195954, "epoch": 0.5963024199609199, "flos": 12166966995840.0, "grad_norm": 1.8036319058685593, "language_loss": 0.76979315, "learning_rate": 1.4793216742991625e-06, "loss": 0.79104197, "num_input_tokens_seen": 213703515, "step": 9918, "time_per_iteration": 2.707718849182129 }, { "auxiliary_loss_clip": 0.01108865, "auxiliary_loss_mlp": 0.01038575, "balance_loss_clip": 1.04414129, "balance_loss_mlp": 1.02538431, "epoch": 0.5963625432135878, "flos": 28074280101120.0, "grad_norm": 2.6956936924639012, "language_loss": 0.78955698, "learning_rate": 1.4789456522099707e-06, "loss": 0.8110314, "num_input_tokens_seen": 213724170, "step": 9919, "time_per_iteration": 2.732933759689331 }, { "auxiliary_loss_clip": 0.01091105, "auxiliary_loss_mlp": 0.01037217, "balance_loss_clip": 1.04111147, "balance_loss_mlp": 1.02323401, "epoch": 0.5964226664662559, "flos": 19860324664320.0, "grad_norm": 1.8773735409019414, "language_loss": 0.77863061, "learning_rate": 1.4785696498792122e-06, "loss": 0.79991376, "num_input_tokens_seen": 213740620, "step": 9920, "time_per_iteration": 2.6758365631103516 }, { "auxiliary_loss_clip": 0.01105504, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.04226005, "balance_loss_mlp": 1.02303123, "epoch": 0.5964827897189238, "flos": 12932618325120.0, "grad_norm": 2.199993791667526, "language_loss": 0.82559252, "learning_rate": 1.4781936673211446e-06, "loss": 0.84701777, "num_input_tokens_seen": 213755390, "step": 9921, "time_per_iteration": 2.631972312927246 }, { "auxiliary_loss_clip": 0.0110339, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.0396421, "balance_loss_mlp": 1.01888192, "epoch": 0.5965429129715918, "flos": 18150797698560.0, "grad_norm": 3.5044992121063103, "language_loss": 0.80699342, "learning_rate": 1.4778177045500252e-06, "loss": 0.82835329, "num_input_tokens_seen": 213773225, "step": 9922, "time_per_iteration": 2.646479606628418 }, { "auxiliary_loss_clip": 0.01107944, "auxiliary_loss_mlp": 0.00770214, "balance_loss_clip": 1.04096532, "balance_loss_mlp": 1.000036, "epoch": 0.5966030362242597, "flos": 21763231476480.0, "grad_norm": 1.7423002236659255, "language_loss": 0.77125442, "learning_rate": 1.477441761580111e-06, "loss": 0.79003608, "num_input_tokens_seen": 213791860, "step": 9923, "time_per_iteration": 2.646597385406494 }, { "auxiliary_loss_clip": 0.01105997, "auxiliary_loss_mlp": 0.01038842, "balance_loss_clip": 1.04343677, "balance_loss_mlp": 1.02382815, "epoch": 0.5966631594769277, "flos": 18807208790400.0, "grad_norm": 1.7872252192325138, "language_loss": 0.76111019, "learning_rate": 1.4770658384256573e-06, "loss": 0.78255856, "num_input_tokens_seen": 213809455, "step": 9924, "time_per_iteration": 2.784302234649658 }, { "auxiliary_loss_clip": 0.01098024, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.03841281, "balance_loss_mlp": 1.02270854, "epoch": 0.5967232827295956, "flos": 14064163545600.0, "grad_norm": 2.5918588496554222, "language_loss": 0.66627729, "learning_rate": 1.4766899351009204e-06, "loss": 0.6876303, "num_input_tokens_seen": 213826615, "step": 9925, "time_per_iteration": 2.6964471340179443 }, { "auxiliary_loss_clip": 0.01088743, "auxiliary_loss_mlp": 0.01035345, "balance_loss_clip": 1.04202008, "balance_loss_mlp": 1.0219934, "epoch": 0.5967834059822636, "flos": 17238235743360.0, "grad_norm": 2.607968523577736, "language_loss": 0.71629661, "learning_rate": 1.4763140516201528e-06, "loss": 0.7375375, "num_input_tokens_seen": 213844495, "step": 9926, "time_per_iteration": 2.739656448364258 }, { "auxiliary_loss_clip": 0.01076071, "auxiliary_loss_mlp": 0.00771823, "balance_loss_clip": 1.04067254, "balance_loss_mlp": 1.0001483, "epoch": 0.5968435292349316, "flos": 42520244284800.0, "grad_norm": 1.798681806501109, "language_loss": 0.70456839, "learning_rate": 1.4759381879976088e-06, "loss": 0.72304738, "num_input_tokens_seen": 213869125, "step": 9927, "time_per_iteration": 2.9877870082855225 }, { "auxiliary_loss_clip": 0.01071922, "auxiliary_loss_mlp": 0.01029018, "balance_loss_clip": 1.03775859, "balance_loss_mlp": 1.01547647, "epoch": 0.5969036524875996, "flos": 37630898945280.0, "grad_norm": 1.7276883821850428, "language_loss": 0.63847625, "learning_rate": 1.4755623442475415e-06, "loss": 0.6594857, "num_input_tokens_seen": 213891115, "step": 9928, "time_per_iteration": 2.889533042907715 }, { "auxiliary_loss_clip": 0.01115406, "auxiliary_loss_mlp": 0.0103325, "balance_loss_clip": 1.04134023, "balance_loss_mlp": 1.02103138, "epoch": 0.5969637757402676, "flos": 23148377694720.0, "grad_norm": 1.6663701476220254, "language_loss": 0.69803309, "learning_rate": 1.4751865203842022e-06, "loss": 0.71951973, "num_input_tokens_seen": 213911925, "step": 9929, "time_per_iteration": 2.6571357250213623 }, { "auxiliary_loss_clip": 0.01073832, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.04385591, "balance_loss_mlp": 1.02244925, "epoch": 0.5970238989929355, "flos": 24020934877440.0, "grad_norm": 1.7972325287685906, "language_loss": 0.76839757, "learning_rate": 1.4748107164218431e-06, "loss": 0.78948194, "num_input_tokens_seen": 213930715, "step": 9930, "time_per_iteration": 2.7475857734680176 }, { "auxiliary_loss_clip": 0.0109514, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.04357862, "balance_loss_mlp": 1.02017856, "epoch": 0.5970840222456035, "flos": 19426883247360.0, "grad_norm": 1.7574249474808616, "language_loss": 0.68748617, "learning_rate": 1.4744349323747146e-06, "loss": 0.70878506, "num_input_tokens_seen": 213950015, "step": 9931, "time_per_iteration": 2.713695526123047 }, { "auxiliary_loss_clip": 0.01025314, "auxiliary_loss_mlp": 0.01000381, "balance_loss_clip": 1.01468325, "balance_loss_mlp": 0.99920666, "epoch": 0.5971441454982714, "flos": 62976615235200.0, "grad_norm": 0.8553027537300191, "language_loss": 0.64182514, "learning_rate": 1.474059168257065e-06, "loss": 0.66208208, "num_input_tokens_seen": 214003330, "step": 9932, "time_per_iteration": 3.106821060180664 }, { "auxiliary_loss_clip": 0.01084112, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.03818321, "balance_loss_mlp": 1.01869833, "epoch": 0.5972042687509395, "flos": 20266223328000.0, "grad_norm": 2.9993889514324463, "language_loss": 0.73966062, "learning_rate": 1.4736834240831454e-06, "loss": 0.76082295, "num_input_tokens_seen": 214021680, "step": 9933, "time_per_iteration": 2.718324899673462 }, { "auxiliary_loss_clip": 0.01028586, "auxiliary_loss_mlp": 0.01004687, "balance_loss_clip": 1.02009809, "balance_loss_mlp": 1.00334597, "epoch": 0.5972643920036074, "flos": 71652383832960.0, "grad_norm": 0.6592973095113355, "language_loss": 0.52000248, "learning_rate": 1.473307699867203e-06, "loss": 0.54033524, "num_input_tokens_seen": 214090265, "step": 9934, "time_per_iteration": 3.265408515930176 }, { "auxiliary_loss_clip": 0.01038691, "auxiliary_loss_mlp": 0.01008472, "balance_loss_clip": 1.01466894, "balance_loss_mlp": 1.00733399, "epoch": 0.5973245152562754, "flos": 56892702263040.0, "grad_norm": 0.8334850866606021, "language_loss": 0.54153717, "learning_rate": 1.4729319956234849e-06, "loss": 0.5620088, "num_input_tokens_seen": 214146375, "step": 9935, "time_per_iteration": 3.07120680809021 }, { "auxiliary_loss_clip": 0.01095451, "auxiliary_loss_mlp": 0.01033243, "balance_loss_clip": 1.04008901, "balance_loss_mlp": 1.01956391, "epoch": 0.5973846385089433, "flos": 24164361884160.0, "grad_norm": 1.5706852760220016, "language_loss": 0.66061485, "learning_rate": 1.4725563113662394e-06, "loss": 0.68190181, "num_input_tokens_seen": 214165340, "step": 9936, "time_per_iteration": 2.724457263946533 }, { "auxiliary_loss_clip": 0.01060903, "auxiliary_loss_mlp": 0.01035654, "balance_loss_clip": 1.03609622, "balance_loss_mlp": 1.02246332, "epoch": 0.5974447617616113, "flos": 17670599752320.0, "grad_norm": 1.9876387260879245, "language_loss": 0.6771605, "learning_rate": 1.4721806471097103e-06, "loss": 0.69812608, "num_input_tokens_seen": 214181360, "step": 9937, "time_per_iteration": 2.75978422164917 }, { "auxiliary_loss_clip": 0.0111018, "auxiliary_loss_mlp": 0.01032313, "balance_loss_clip": 1.04208851, "balance_loss_mlp": 1.01846123, "epoch": 0.5975048850142792, "flos": 22892514140160.0, "grad_norm": 2.408863368051578, "language_loss": 0.77660179, "learning_rate": 1.4718050028681442e-06, "loss": 0.79802668, "num_input_tokens_seen": 214198525, "step": 9938, "time_per_iteration": 4.499311447143555 }, { "auxiliary_loss_clip": 0.01105785, "auxiliary_loss_mlp": 0.01034855, "balance_loss_clip": 1.03925014, "balance_loss_mlp": 1.02100301, "epoch": 0.5975650082669473, "flos": 24353108876160.0, "grad_norm": 1.4410606641316148, "language_loss": 0.75726342, "learning_rate": 1.4714293786557855e-06, "loss": 0.77866983, "num_input_tokens_seen": 214218710, "step": 9939, "time_per_iteration": 4.202291011810303 }, { "auxiliary_loss_clip": 0.01073866, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.04116249, "balance_loss_mlp": 1.01718175, "epoch": 0.5976251315196152, "flos": 20923352691840.0, "grad_norm": 4.812638761028828, "language_loss": 0.68618965, "learning_rate": 1.471053774486878e-06, "loss": 0.70725775, "num_input_tokens_seen": 214237800, "step": 9940, "time_per_iteration": 4.418368339538574 }, { "auxiliary_loss_clip": 0.01090139, "auxiliary_loss_mlp": 0.01036739, "balance_loss_clip": 1.04158998, "balance_loss_mlp": 1.02415049, "epoch": 0.5976852547722832, "flos": 35844594658560.0, "grad_norm": 1.3494600203677949, "language_loss": 0.70370513, "learning_rate": 1.470678190375664e-06, "loss": 0.72497392, "num_input_tokens_seen": 214260355, "step": 9941, "time_per_iteration": 2.7807397842407227 }, { "auxiliary_loss_clip": 0.01092498, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.03824401, "balance_loss_mlp": 1.02123034, "epoch": 0.5977453780249512, "flos": 12855948744960.0, "grad_norm": 1.9808022638780955, "language_loss": 0.77407408, "learning_rate": 1.470302626336386e-06, "loss": 0.79534429, "num_input_tokens_seen": 214277120, "step": 9942, "time_per_iteration": 2.6881802082061768 }, { "auxiliary_loss_clip": 0.01071168, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.03963232, "balance_loss_mlp": 1.02418923, "epoch": 0.5978055012776191, "flos": 20959155573120.0, "grad_norm": 1.9541019064521015, "language_loss": 0.76172185, "learning_rate": 1.4699270823832857e-06, "loss": 0.78280699, "num_input_tokens_seen": 214295300, "step": 9943, "time_per_iteration": 4.4215734004974365 }, { "auxiliary_loss_clip": 0.0105205, "auxiliary_loss_mlp": 0.01034121, "balance_loss_clip": 1.03876281, "balance_loss_mlp": 1.02149105, "epoch": 0.5978656245302871, "flos": 34058003063040.0, "grad_norm": 1.735048648764757, "language_loss": 0.62473679, "learning_rate": 1.4695515585306032e-06, "loss": 0.64559853, "num_input_tokens_seen": 214317050, "step": 9944, "time_per_iteration": 2.8701138496398926 }, { "auxiliary_loss_clip": 0.0109987, "auxiliary_loss_mlp": 0.0103879, "balance_loss_clip": 1.04420114, "balance_loss_mlp": 1.02530718, "epoch": 0.597925747782955, "flos": 37373275624320.0, "grad_norm": 1.7121148929704375, "language_loss": 0.72442955, "learning_rate": 1.4691760547925795e-06, "loss": 0.74581611, "num_input_tokens_seen": 214337470, "step": 9945, "time_per_iteration": 2.7868094444274902 }, { "auxiliary_loss_clip": 0.01063078, "auxiliary_loss_mlp": 0.01035839, "balance_loss_clip": 1.03817308, "balance_loss_mlp": 1.02280903, "epoch": 0.5979858710356231, "flos": 25374803328000.0, "grad_norm": 2.215747344961558, "language_loss": 0.66905904, "learning_rate": 1.4688005711834522e-06, "loss": 0.6900481, "num_input_tokens_seen": 214357975, "step": 9946, "time_per_iteration": 2.83195161819458 }, { "auxiliary_loss_clip": 0.01104512, "auxiliary_loss_mlp": 0.01042624, "balance_loss_clip": 1.03969336, "balance_loss_mlp": 1.0275619, "epoch": 0.598045994288291, "flos": 13698413308800.0, "grad_norm": 1.928704516420183, "language_loss": 0.88898396, "learning_rate": 1.468425107717461e-06, "loss": 0.91045535, "num_input_tokens_seen": 214374125, "step": 9947, "time_per_iteration": 2.5993123054504395 }, { "auxiliary_loss_clip": 0.01112155, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.04039431, "balance_loss_mlp": 1.02080822, "epoch": 0.598106117540959, "flos": 21981352815360.0, "grad_norm": 1.8699586676771087, "language_loss": 0.72236538, "learning_rate": 1.4680496644088432e-06, "loss": 0.74381137, "num_input_tokens_seen": 214393395, "step": 9948, "time_per_iteration": 2.6766860485076904 }, { "auxiliary_loss_clip": 0.01093809, "auxiliary_loss_mlp": 0.01035478, "balance_loss_clip": 1.03969812, "balance_loss_mlp": 1.02129257, "epoch": 0.5981662407936269, "flos": 20559362221440.0, "grad_norm": 1.8848269321833362, "language_loss": 0.89223683, "learning_rate": 1.4676742412718347e-06, "loss": 0.91352975, "num_input_tokens_seen": 214411550, "step": 9949, "time_per_iteration": 2.731804370880127 }, { "auxiliary_loss_clip": 0.01105698, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.0420059, "balance_loss_mlp": 1.01814604, "epoch": 0.5982263640462949, "flos": 14063840323200.0, "grad_norm": 2.0634992965968917, "language_loss": 0.70250058, "learning_rate": 1.467298838320673e-06, "loss": 0.72386169, "num_input_tokens_seen": 214429780, "step": 9950, "time_per_iteration": 2.666879415512085 }, { "auxiliary_loss_clip": 0.01103442, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.0392406, "balance_loss_mlp": 1.01904809, "epoch": 0.5982864872989628, "flos": 17707228646400.0, "grad_norm": 1.610292824709656, "language_loss": 0.78345191, "learning_rate": 1.4669234555695921e-06, "loss": 0.80480444, "num_input_tokens_seen": 214447775, "step": 9951, "time_per_iteration": 2.624361753463745 }, { "auxiliary_loss_clip": 0.01096152, "auxiliary_loss_mlp": 0.01038536, "balance_loss_clip": 1.0411104, "balance_loss_mlp": 1.02471995, "epoch": 0.5983466105516309, "flos": 16764789553920.0, "grad_norm": 1.4677439185999286, "language_loss": 0.73951542, "learning_rate": 1.4665480930328275e-06, "loss": 0.76086229, "num_input_tokens_seen": 214467245, "step": 9952, "time_per_iteration": 2.780212640762329 }, { "auxiliary_loss_clip": 0.01097597, "auxiliary_loss_mlp": 0.00771764, "balance_loss_clip": 1.04058945, "balance_loss_mlp": 1.0000577, "epoch": 0.5984067338042988, "flos": 20042714949120.0, "grad_norm": 2.0876696722134493, "language_loss": 0.79496032, "learning_rate": 1.466172750724613e-06, "loss": 0.81365395, "num_input_tokens_seen": 214484385, "step": 9953, "time_per_iteration": 2.6629557609558105 }, { "auxiliary_loss_clip": 0.01088175, "auxiliary_loss_mlp": 0.01034264, "balance_loss_clip": 1.04368794, "balance_loss_mlp": 1.02172363, "epoch": 0.5984668570569668, "flos": 26319900026880.0, "grad_norm": 1.571611875852805, "language_loss": 0.69577867, "learning_rate": 1.4657974286591807e-06, "loss": 0.71700311, "num_input_tokens_seen": 214503465, "step": 9954, "time_per_iteration": 2.772745132446289 }, { "auxiliary_loss_clip": 0.01092663, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.03927422, "balance_loss_mlp": 1.02299023, "epoch": 0.5985269803096348, "flos": 20593728558720.0, "grad_norm": 1.8709505635033254, "language_loss": 0.73055756, "learning_rate": 1.4654221268507637e-06, "loss": 0.75183785, "num_input_tokens_seen": 214520725, "step": 9955, "time_per_iteration": 2.6827971935272217 }, { "auxiliary_loss_clip": 0.01118308, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.04205883, "balance_loss_mlp": 1.0209837, "epoch": 0.5985871035623027, "flos": 26865382942080.0, "grad_norm": 1.5476020192092728, "language_loss": 0.68627518, "learning_rate": 1.4650468453135934e-06, "loss": 0.70780075, "num_input_tokens_seen": 214540675, "step": 9956, "time_per_iteration": 2.6055126190185547 }, { "auxiliary_loss_clip": 0.01120333, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.0435667, "balance_loss_mlp": 1.02041864, "epoch": 0.5986472268149707, "flos": 19609704495360.0, "grad_norm": 5.767015828905461, "language_loss": 0.74026513, "learning_rate": 1.4646715840618999e-06, "loss": 0.76180387, "num_input_tokens_seen": 214559910, "step": 9957, "time_per_iteration": 2.670759677886963 }, { "auxiliary_loss_clip": 0.01082315, "auxiliary_loss_mlp": 0.01029692, "balance_loss_clip": 1.04125023, "balance_loss_mlp": 1.01696002, "epoch": 0.5987073500676386, "flos": 21794616984960.0, "grad_norm": 2.0517993540808157, "language_loss": 0.84612942, "learning_rate": 1.4642963431099138e-06, "loss": 0.86724949, "num_input_tokens_seen": 214575960, "step": 9958, "time_per_iteration": 2.710693597793579 }, { "auxiliary_loss_clip": 0.01088695, "auxiliary_loss_mlp": 0.00771117, "balance_loss_clip": 1.04130435, "balance_loss_mlp": 1.00005364, "epoch": 0.5987674733203067, "flos": 24314361079680.0, "grad_norm": 1.9589439151063424, "language_loss": 0.6649909, "learning_rate": 1.463921122471864e-06, "loss": 0.68358904, "num_input_tokens_seen": 214594230, "step": 9959, "time_per_iteration": 2.7052528858184814 }, { "auxiliary_loss_clip": 0.0110604, "auxiliary_loss_mlp": 0.01031714, "balance_loss_clip": 1.04048181, "balance_loss_mlp": 1.01915514, "epoch": 0.5988275965729746, "flos": 21320201128320.0, "grad_norm": 1.6803724665796522, "language_loss": 0.83453488, "learning_rate": 1.4635459221619796e-06, "loss": 0.85591239, "num_input_tokens_seen": 214613130, "step": 9960, "time_per_iteration": 2.698373317718506 }, { "auxiliary_loss_clip": 0.0110105, "auxiliary_loss_mlp": 0.01026917, "balance_loss_clip": 1.04384398, "balance_loss_mlp": 1.01451361, "epoch": 0.5988877198256426, "flos": 25118041933440.0, "grad_norm": 1.4637618649833892, "language_loss": 0.79449862, "learning_rate": 1.4631707421944868e-06, "loss": 0.81577832, "num_input_tokens_seen": 214634470, "step": 9961, "time_per_iteration": 2.763143539428711 }, { "auxiliary_loss_clip": 0.01115923, "auxiliary_loss_mlp": 0.01034214, "balance_loss_clip": 1.04150534, "balance_loss_mlp": 1.02107751, "epoch": 0.5989478430783105, "flos": 26429104350720.0, "grad_norm": 1.7720947984672266, "language_loss": 0.66938126, "learning_rate": 1.4627955825836136e-06, "loss": 0.69088268, "num_input_tokens_seen": 214654030, "step": 9962, "time_per_iteration": 2.6398210525512695 }, { "auxiliary_loss_clip": 0.01100963, "auxiliary_loss_mlp": 0.01040353, "balance_loss_clip": 1.03867447, "balance_loss_mlp": 1.02583313, "epoch": 0.5990079663309785, "flos": 25778439434880.0, "grad_norm": 1.3371562951805418, "language_loss": 0.74043596, "learning_rate": 1.4624204433435857e-06, "loss": 0.76184916, "num_input_tokens_seen": 214676985, "step": 9963, "time_per_iteration": 2.716456651687622 }, { "auxiliary_loss_clip": 0.01105789, "auxiliary_loss_mlp": 0.010335, "balance_loss_clip": 1.04120398, "balance_loss_mlp": 1.02003562, "epoch": 0.5990680895836464, "flos": 36831779118720.0, "grad_norm": 1.8119605341465645, "language_loss": 0.68010569, "learning_rate": 1.4620453244886281e-06, "loss": 0.70149863, "num_input_tokens_seen": 214700105, "step": 9964, "time_per_iteration": 2.764112710952759 }, { "auxiliary_loss_clip": 0.01082495, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.04189765, "balance_loss_mlp": 1.0158987, "epoch": 0.5991282128363145, "flos": 24133550993280.0, "grad_norm": 1.838028773427246, "language_loss": 0.76536453, "learning_rate": 1.4616702260329662e-06, "loss": 0.78648221, "num_input_tokens_seen": 214717885, "step": 9965, "time_per_iteration": 2.6872916221618652 }, { "auxiliary_loss_clip": 0.01100107, "auxiliary_loss_mlp": 0.01029644, "balance_loss_clip": 1.03997707, "balance_loss_mlp": 1.01664448, "epoch": 0.5991883360889824, "flos": 10304064956160.0, "grad_norm": 1.881941118756219, "language_loss": 0.77352554, "learning_rate": 1.4612951479908229e-06, "loss": 0.79482305, "num_input_tokens_seen": 214733680, "step": 9966, "time_per_iteration": 2.645473003387451 }, { "auxiliary_loss_clip": 0.01080024, "auxiliary_loss_mlp": 0.01029432, "balance_loss_clip": 1.04003799, "balance_loss_mlp": 1.01742721, "epoch": 0.5992484593416504, "flos": 23951196622080.0, "grad_norm": 1.4675663731632993, "language_loss": 0.73089266, "learning_rate": 1.460920090376422e-06, "loss": 0.75198722, "num_input_tokens_seen": 214753285, "step": 9967, "time_per_iteration": 2.7043392658233643 }, { "auxiliary_loss_clip": 0.0111042, "auxiliary_loss_mlp": 0.01035802, "balance_loss_clip": 1.04168642, "balance_loss_mlp": 1.02200305, "epoch": 0.5993085825943184, "flos": 11944105061760.0, "grad_norm": 2.0432757361111724, "language_loss": 0.68492925, "learning_rate": 1.4605450532039847e-06, "loss": 0.70639145, "num_input_tokens_seen": 214767810, "step": 9968, "time_per_iteration": 2.618802070617676 }, { "auxiliary_loss_clip": 0.01104497, "auxiliary_loss_mlp": 0.01037187, "balance_loss_clip": 1.03805614, "balance_loss_mlp": 1.02315605, "epoch": 0.5993687058469863, "flos": 19026838500480.0, "grad_norm": 1.5933947258371375, "language_loss": 0.79251635, "learning_rate": 1.4601700364877334e-06, "loss": 0.81393319, "num_input_tokens_seen": 214786040, "step": 9969, "time_per_iteration": 2.6758008003234863 }, { "auxiliary_loss_clip": 0.01100647, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.03998137, "balance_loss_mlp": 1.0176506, "epoch": 0.5994288290996543, "flos": 14282967242880.0, "grad_norm": 1.6601112189929519, "language_loss": 0.80936122, "learning_rate": 1.4597950402418889e-06, "loss": 0.83067989, "num_input_tokens_seen": 214803110, "step": 9970, "time_per_iteration": 2.7434401512145996 }, { "auxiliary_loss_clip": 0.01064445, "auxiliary_loss_mlp": 0.01044271, "balance_loss_clip": 1.0378437, "balance_loss_mlp": 1.02879751, "epoch": 0.5994889523523222, "flos": 19206643006080.0, "grad_norm": 2.015109530583561, "language_loss": 0.61666113, "learning_rate": 1.4594200644806697e-06, "loss": 0.6377483, "num_input_tokens_seen": 214819945, "step": 9971, "time_per_iteration": 2.6593470573425293 }, { "auxiliary_loss_clip": 0.01112816, "auxiliary_loss_mlp": 0.01033245, "balance_loss_clip": 1.04096997, "balance_loss_mlp": 1.02065659, "epoch": 0.5995490756049903, "flos": 28037040675840.0, "grad_norm": 1.7466561522631148, "language_loss": 0.79054534, "learning_rate": 1.4590451092182962e-06, "loss": 0.81200594, "num_input_tokens_seen": 214838810, "step": 9972, "time_per_iteration": 2.657733917236328 }, { "auxiliary_loss_clip": 0.01077287, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.03948355, "balance_loss_mlp": 1.0220139, "epoch": 0.5996091988576582, "flos": 29052953038080.0, "grad_norm": 2.7295276371688657, "language_loss": 0.76414442, "learning_rate": 1.4586701744689864e-06, "loss": 0.78527337, "num_input_tokens_seen": 214857040, "step": 9973, "time_per_iteration": 2.804370880126953 }, { "auxiliary_loss_clip": 0.01080222, "auxiliary_loss_mlp": 0.01031483, "balance_loss_clip": 1.03798461, "balance_loss_mlp": 1.01820862, "epoch": 0.5996693221103262, "flos": 20813968800000.0, "grad_norm": 2.687412315258338, "language_loss": 0.65429473, "learning_rate": 1.4582952602469578e-06, "loss": 0.6754117, "num_input_tokens_seen": 214873375, "step": 9974, "time_per_iteration": 2.7193095684051514 }, { "auxiliary_loss_clip": 0.01106109, "auxiliary_loss_mlp": 0.01032556, "balance_loss_clip": 1.0399034, "balance_loss_mlp": 1.01984227, "epoch": 0.5997294453629941, "flos": 23768914078080.0, "grad_norm": 1.3699302504221633, "language_loss": 0.74378854, "learning_rate": 1.457920366566428e-06, "loss": 0.76517522, "num_input_tokens_seen": 214893900, "step": 9975, "time_per_iteration": 2.6727962493896484 }, { "auxiliary_loss_clip": 0.01117306, "auxiliary_loss_mlp": 0.01031631, "balance_loss_clip": 1.04184341, "balance_loss_mlp": 1.01771951, "epoch": 0.5997895686156621, "flos": 20960017499520.0, "grad_norm": 1.8689128111534072, "language_loss": 0.77081978, "learning_rate": 1.457545493441611e-06, "loss": 0.79230917, "num_input_tokens_seen": 214912110, "step": 9976, "time_per_iteration": 2.5855295658111572 }, { "auxiliary_loss_clip": 0.01101132, "auxiliary_loss_mlp": 0.01036343, "balance_loss_clip": 1.04325271, "balance_loss_mlp": 1.0225029, "epoch": 0.59984969186833, "flos": 28365443746560.0, "grad_norm": 2.489782776024688, "language_loss": 0.74998355, "learning_rate": 1.4571706408867237e-06, "loss": 0.77135837, "num_input_tokens_seen": 214930140, "step": 9977, "time_per_iteration": 4.355423212051392 }, { "auxiliary_loss_clip": 0.01081083, "auxiliary_loss_mlp": 0.01029688, "balance_loss_clip": 1.03771675, "balance_loss_mlp": 1.01639032, "epoch": 0.5999098151209981, "flos": 22565906749440.0, "grad_norm": 1.7961745328309484, "language_loss": 0.69053113, "learning_rate": 1.4567958089159802e-06, "loss": 0.71163881, "num_input_tokens_seen": 214949200, "step": 9978, "time_per_iteration": 2.687735080718994 }, { "auxiliary_loss_clip": 0.01124045, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.04541636, "balance_loss_mlp": 1.02081037, "epoch": 0.599969938373666, "flos": 18768712389120.0, "grad_norm": 1.9378111201967976, "language_loss": 0.81427479, "learning_rate": 1.456420997543594e-06, "loss": 0.8358658, "num_input_tokens_seen": 214965775, "step": 9979, "time_per_iteration": 5.60455322265625 }, { "auxiliary_loss_clip": 0.01113469, "auxiliary_loss_mlp": 0.0103294, "balance_loss_clip": 1.04139137, "balance_loss_mlp": 1.02011895, "epoch": 0.600030061626334, "flos": 11327231865600.0, "grad_norm": 2.0199004568827577, "language_loss": 0.70054936, "learning_rate": 1.4560462067837782e-06, "loss": 0.72201335, "num_input_tokens_seen": 214982480, "step": 9980, "time_per_iteration": 2.5815303325653076 }, { "auxiliary_loss_clip": 0.01105293, "auxiliary_loss_mlp": 0.01032543, "balance_loss_clip": 1.03971553, "balance_loss_mlp": 1.01786244, "epoch": 0.600090184879002, "flos": 16578664254720.0, "grad_norm": 2.2746227330860327, "language_loss": 0.686566, "learning_rate": 1.4556714366507445e-06, "loss": 0.70794439, "num_input_tokens_seen": 214998110, "step": 9981, "time_per_iteration": 2.635133743286133 }, { "auxiliary_loss_clip": 0.01106547, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.04316497, "balance_loss_mlp": 1.02458215, "epoch": 0.6001503081316699, "flos": 23618627573760.0, "grad_norm": 1.8281310539755133, "language_loss": 0.78525096, "learning_rate": 1.4552966871587048e-06, "loss": 0.80668187, "num_input_tokens_seen": 215017995, "step": 9982, "time_per_iteration": 4.6227052211761475 }, { "auxiliary_loss_clip": 0.01066865, "auxiliary_loss_mlp": 0.01043371, "balance_loss_clip": 1.03895831, "balance_loss_mlp": 1.02730179, "epoch": 0.6002104313843379, "flos": 20667668705280.0, "grad_norm": 1.558592797835216, "language_loss": 0.73127562, "learning_rate": 1.4549219583218686e-06, "loss": 0.75237799, "num_input_tokens_seen": 215038285, "step": 9983, "time_per_iteration": 2.851017951965332 }, { "auxiliary_loss_clip": 0.01075266, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.03699243, "balance_loss_mlp": 1.01962018, "epoch": 0.6002705546370058, "flos": 22455229968000.0, "grad_norm": 4.7484025968689325, "language_loss": 0.78227878, "learning_rate": 1.454547250154447e-06, "loss": 0.80336481, "num_input_tokens_seen": 215057825, "step": 9984, "time_per_iteration": 2.6935315132141113 }, { "auxiliary_loss_clip": 0.01109117, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.04397178, "balance_loss_mlp": 1.02215743, "epoch": 0.6003306778896739, "flos": 25191982080000.0, "grad_norm": 1.729800567101094, "language_loss": 0.83458543, "learning_rate": 1.4541725626706485e-06, "loss": 0.85603082, "num_input_tokens_seen": 215077790, "step": 9985, "time_per_iteration": 2.7772903442382812 }, { "auxiliary_loss_clip": 0.01106318, "auxiliary_loss_mlp": 0.01039651, "balance_loss_clip": 1.04176068, "balance_loss_mlp": 1.02729487, "epoch": 0.6003908011423418, "flos": 26687733252480.0, "grad_norm": 2.2153021552569956, "language_loss": 0.71093589, "learning_rate": 1.4537978958846809e-06, "loss": 0.73239559, "num_input_tokens_seen": 215097650, "step": 9986, "time_per_iteration": 2.794067859649658 }, { "auxiliary_loss_clip": 0.0112089, "auxiliary_loss_mlp": 0.00771497, "balance_loss_clip": 1.04465151, "balance_loss_mlp": 1.00010371, "epoch": 0.6004509243950098, "flos": 22565080736640.0, "grad_norm": 1.3997582574427474, "language_loss": 0.71425599, "learning_rate": 1.4534232498107514e-06, "loss": 0.73317981, "num_input_tokens_seen": 215118235, "step": 9987, "time_per_iteration": 2.689911365509033 }, { "auxiliary_loss_clip": 0.01096945, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.04330432, "balance_loss_mlp": 1.02589071, "epoch": 0.6005110476476777, "flos": 19719303868800.0, "grad_norm": 1.7371829608849618, "language_loss": 0.84939432, "learning_rate": 1.4530486244630673e-06, "loss": 0.87075484, "num_input_tokens_seen": 215136755, "step": 9988, "time_per_iteration": 2.7220449447631836 }, { "auxiliary_loss_clip": 0.01108518, "auxiliary_loss_mlp": 0.01035377, "balance_loss_clip": 1.04211533, "balance_loss_mlp": 1.02187085, "epoch": 0.6005711709003457, "flos": 17712543859200.0, "grad_norm": 1.6453818743399957, "language_loss": 0.65595025, "learning_rate": 1.4526740198558346e-06, "loss": 0.6773892, "num_input_tokens_seen": 215155225, "step": 9989, "time_per_iteration": 2.708707809448242 }, { "auxiliary_loss_clip": 0.0110487, "auxiliary_loss_mlp": 0.01034775, "balance_loss_clip": 1.04078543, "balance_loss_mlp": 1.02239513, "epoch": 0.6006312941530136, "flos": 18514464946560.0, "grad_norm": 1.5276583445435046, "language_loss": 0.8036738, "learning_rate": 1.452299436003257e-06, "loss": 0.82507026, "num_input_tokens_seen": 215174815, "step": 9990, "time_per_iteration": 2.6760056018829346 }, { "auxiliary_loss_clip": 0.0107479, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.03909266, "balance_loss_mlp": 1.01817632, "epoch": 0.6006914174056817, "flos": 21390837223680.0, "grad_norm": 2.0016484487093833, "language_loss": 0.8290872, "learning_rate": 1.4519248729195403e-06, "loss": 0.85015059, "num_input_tokens_seen": 215192045, "step": 9991, "time_per_iteration": 2.6902015209198 }, { "auxiliary_loss_clip": 0.01062355, "auxiliary_loss_mlp": 0.01042556, "balance_loss_clip": 1.03686535, "balance_loss_mlp": 1.02867436, "epoch": 0.6007515406583496, "flos": 12750515349120.0, "grad_norm": 1.9693626562875086, "language_loss": 0.82834661, "learning_rate": 1.4515503306188878e-06, "loss": 0.84939575, "num_input_tokens_seen": 215209885, "step": 9992, "time_per_iteration": 2.750401496887207 }, { "auxiliary_loss_clip": 0.01095422, "auxiliary_loss_mlp": 0.00771119, "balance_loss_clip": 1.04209352, "balance_loss_mlp": 1.0001328, "epoch": 0.6008116639110176, "flos": 19206894401280.0, "grad_norm": 1.855753675619843, "language_loss": 0.66424763, "learning_rate": 1.4511758091155008e-06, "loss": 0.68291306, "num_input_tokens_seen": 215228150, "step": 9993, "time_per_iteration": 2.664606809616089 }, { "auxiliary_loss_clip": 0.01080718, "auxiliary_loss_mlp": 0.01034631, "balance_loss_clip": 1.03863966, "balance_loss_mlp": 1.02051032, "epoch": 0.6008717871636855, "flos": 17055342668160.0, "grad_norm": 2.4957386160129182, "language_loss": 0.80870563, "learning_rate": 1.4508013084235826e-06, "loss": 0.82985908, "num_input_tokens_seen": 215243755, "step": 9994, "time_per_iteration": 2.640841007232666 }, { "auxiliary_loss_clip": 0.01071985, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.03745914, "balance_loss_mlp": 1.01653457, "epoch": 0.6009319104163535, "flos": 20298686244480.0, "grad_norm": 1.874968253383489, "language_loss": 0.72665036, "learning_rate": 1.4504268285573337e-06, "loss": 0.7476638, "num_input_tokens_seen": 215262130, "step": 9995, "time_per_iteration": 2.694720506668091 }, { "auxiliary_loss_clip": 0.01094635, "auxiliary_loss_mlp": 0.01038473, "balance_loss_clip": 1.03786469, "balance_loss_mlp": 1.02479935, "epoch": 0.6009920336690215, "flos": 21836776573440.0, "grad_norm": 1.6925252532660184, "language_loss": 0.80807674, "learning_rate": 1.4500523695309546e-06, "loss": 0.82940787, "num_input_tokens_seen": 215281785, "step": 9996, "time_per_iteration": 2.6821236610412598 }, { "auxiliary_loss_clip": 0.01056059, "auxiliary_loss_mlp": 0.01045573, "balance_loss_clip": 1.0363729, "balance_loss_mlp": 1.03094554, "epoch": 0.6010521569216895, "flos": 22596107109120.0, "grad_norm": 2.5847377804090548, "language_loss": 0.78435457, "learning_rate": 1.4496779313586447e-06, "loss": 0.80537087, "num_input_tokens_seen": 215297550, "step": 9997, "time_per_iteration": 2.763819694519043 }, { "auxiliary_loss_clip": 0.01106886, "auxiliary_loss_mlp": 0.0103365, "balance_loss_clip": 1.0403868, "balance_loss_mlp": 1.01968443, "epoch": 0.6011122801743575, "flos": 19171702051200.0, "grad_norm": 1.6202780199081332, "language_loss": 0.73208427, "learning_rate": 1.4493035140546028e-06, "loss": 0.75348961, "num_input_tokens_seen": 215316360, "step": 9998, "time_per_iteration": 2.642061471939087 }, { "auxiliary_loss_clip": 0.01085494, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.03910601, "balance_loss_mlp": 1.01992536, "epoch": 0.6011724034270254, "flos": 25010022758400.0, "grad_norm": 1.482726748062067, "language_loss": 0.72144544, "learning_rate": 1.448929117633027e-06, "loss": 0.74263275, "num_input_tokens_seen": 215336405, "step": 9999, "time_per_iteration": 2.726409673690796 }, { "auxiliary_loss_clip": 0.01067323, "auxiliary_loss_mlp": 0.0103545, "balance_loss_clip": 1.03762555, "balance_loss_mlp": 1.02221787, "epoch": 0.6012325266796934, "flos": 21797669640960.0, "grad_norm": 1.6696696942731026, "language_loss": 0.78647506, "learning_rate": 1.4485547421081142e-06, "loss": 0.80750275, "num_input_tokens_seen": 215356590, "step": 10000, "time_per_iteration": 2.8357326984405518 }, { "auxiliary_loss_clip": 0.01121882, "auxiliary_loss_mlp": 0.0103934, "balance_loss_clip": 1.04357147, "balance_loss_mlp": 1.02509475, "epoch": 0.6012926499323613, "flos": 19573003774080.0, "grad_norm": 1.8951447876838274, "language_loss": 0.7747916, "learning_rate": 1.4481803874940608e-06, "loss": 0.79640388, "num_input_tokens_seen": 215374295, "step": 10001, "time_per_iteration": 2.623619556427002 }, { "auxiliary_loss_clip": 0.01110485, "auxiliary_loss_mlp": 0.01029908, "balance_loss_clip": 1.04319382, "balance_loss_mlp": 1.01584136, "epoch": 0.6013527731850293, "flos": 34860786076800.0, "grad_norm": 1.8091026033907125, "language_loss": 0.5879162, "learning_rate": 1.4478060538050624e-06, "loss": 0.60932016, "num_input_tokens_seen": 215394535, "step": 10002, "time_per_iteration": 2.7854535579681396 }, { "auxiliary_loss_clip": 0.01101715, "auxiliary_loss_mlp": 0.01040915, "balance_loss_clip": 1.04363275, "balance_loss_mlp": 1.02503633, "epoch": 0.6014128964376972, "flos": 23291948355840.0, "grad_norm": 1.7477200306776974, "language_loss": 0.7803607, "learning_rate": 1.447431741055314e-06, "loss": 0.80178702, "num_input_tokens_seen": 215414355, "step": 10003, "time_per_iteration": 2.717195987701416 }, { "auxiliary_loss_clip": 0.01119246, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.04236484, "balance_loss_mlp": 1.02104282, "epoch": 0.6014730196903653, "flos": 24820916630400.0, "grad_norm": 2.556614535664238, "language_loss": 0.77315271, "learning_rate": 1.4470574492590091e-06, "loss": 0.79469103, "num_input_tokens_seen": 215428280, "step": 10004, "time_per_iteration": 2.7323880195617676 }, { "auxiliary_loss_clip": 0.01103784, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.04035211, "balance_loss_mlp": 1.01653695, "epoch": 0.6015331429430332, "flos": 23112359331840.0, "grad_norm": 1.5669492652896668, "language_loss": 0.72698373, "learning_rate": 1.4466831784303408e-06, "loss": 0.74832577, "num_input_tokens_seen": 215448970, "step": 10005, "time_per_iteration": 2.6966609954833984 }, { "auxiliary_loss_clip": 0.0111171, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.03977418, "balance_loss_mlp": 1.01611972, "epoch": 0.6015932661957012, "flos": 19201363706880.0, "grad_norm": 2.133433515954987, "language_loss": 0.7512781, "learning_rate": 1.4463089285835026e-06, "loss": 0.77268308, "num_input_tokens_seen": 215465260, "step": 10006, "time_per_iteration": 2.5414936542510986 }, { "auxiliary_loss_clip": 0.01089042, "auxiliary_loss_mlp": 0.01034372, "balance_loss_clip": 1.03682578, "balance_loss_mlp": 1.0206567, "epoch": 0.6016533894483691, "flos": 18113630100480.0, "grad_norm": 2.222329085457676, "language_loss": 0.73606133, "learning_rate": 1.445934699732685e-06, "loss": 0.75729549, "num_input_tokens_seen": 215482725, "step": 10007, "time_per_iteration": 2.7956955432891846 }, { "auxiliary_loss_clip": 0.0109466, "auxiliary_loss_mlp": 0.01027479, "balance_loss_clip": 1.04082942, "balance_loss_mlp": 1.0153439, "epoch": 0.6017135127010371, "flos": 16216900427520.0, "grad_norm": 1.6405140373840412, "language_loss": 0.69996077, "learning_rate": 1.4455604918920785e-06, "loss": 0.72118211, "num_input_tokens_seen": 215500420, "step": 10008, "time_per_iteration": 2.740049362182617 }, { "auxiliary_loss_clip": 0.01104877, "auxiliary_loss_mlp": 0.01024718, "balance_loss_clip": 1.03994632, "balance_loss_mlp": 1.01252937, "epoch": 0.6017736359537051, "flos": 23444246021760.0, "grad_norm": 1.594791938839471, "language_loss": 0.76377881, "learning_rate": 1.4451863050758748e-06, "loss": 0.78507471, "num_input_tokens_seen": 215522260, "step": 10009, "time_per_iteration": 2.6797382831573486 }, { "auxiliary_loss_clip": 0.0109029, "auxiliary_loss_mlp": 0.00770516, "balance_loss_clip": 1.03898764, "balance_loss_mlp": 1.00010157, "epoch": 0.601833759206373, "flos": 23514056104320.0, "grad_norm": 1.9797273750165876, "language_loss": 0.74202949, "learning_rate": 1.4448121392982608e-06, "loss": 0.76063752, "num_input_tokens_seen": 215541715, "step": 10010, "time_per_iteration": 2.7184016704559326 }, { "auxiliary_loss_clip": 0.01028511, "auxiliary_loss_mlp": 0.00998357, "balance_loss_clip": 1.01324391, "balance_loss_mlp": 0.99717093, "epoch": 0.6018938824590411, "flos": 63991668648960.0, "grad_norm": 0.8045921055289736, "language_loss": 0.55051792, "learning_rate": 1.4444379945734268e-06, "loss": 0.57078665, "num_input_tokens_seen": 215603020, "step": 10011, "time_per_iteration": 3.2024238109588623 }, { "auxiliary_loss_clip": 0.01107806, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.04110157, "balance_loss_mlp": 1.02186561, "epoch": 0.601954005711709, "flos": 34640007131520.0, "grad_norm": 1.3534958886387711, "language_loss": 0.62085426, "learning_rate": 1.44406387091556e-06, "loss": 0.64227581, "num_input_tokens_seen": 215625115, "step": 10012, "time_per_iteration": 2.756197452545166 }, { "auxiliary_loss_clip": 0.01074106, "auxiliary_loss_mlp": 0.01028149, "balance_loss_clip": 1.03729844, "balance_loss_mlp": 1.01547122, "epoch": 0.602014128964377, "flos": 19427062815360.0, "grad_norm": 2.02443112791839, "language_loss": 0.74996275, "learning_rate": 1.4436897683388462e-06, "loss": 0.77098525, "num_input_tokens_seen": 215643730, "step": 10013, "time_per_iteration": 2.718114137649536 }, { "auxiliary_loss_clip": 0.01109921, "auxiliary_loss_mlp": 0.01028766, "balance_loss_clip": 1.03983474, "balance_loss_mlp": 1.01669037, "epoch": 0.6020742522170449, "flos": 28329389470080.0, "grad_norm": 1.6563944160673858, "language_loss": 0.81454921, "learning_rate": 1.4433156868574732e-06, "loss": 0.83593607, "num_input_tokens_seen": 215664425, "step": 10014, "time_per_iteration": 2.6359105110168457 }, { "auxiliary_loss_clip": 0.01089157, "auxiliary_loss_mlp": 0.01030481, "balance_loss_clip": 1.037884, "balance_loss_mlp": 1.01777339, "epoch": 0.6021343754697129, "flos": 22747040058240.0, "grad_norm": 1.540720048754759, "language_loss": 0.72213233, "learning_rate": 1.442941626485624e-06, "loss": 0.74332869, "num_input_tokens_seen": 215684280, "step": 10015, "time_per_iteration": 2.7502388954162598 }, { "auxiliary_loss_clip": 0.01020446, "auxiliary_loss_mlp": 0.01001943, "balance_loss_clip": 1.01667976, "balance_loss_mlp": 1.00080478, "epoch": 0.6021944987223808, "flos": 65752007402880.0, "grad_norm": 0.8150448703202539, "language_loss": 0.5473066, "learning_rate": 1.4425675872374848e-06, "loss": 0.56753051, "num_input_tokens_seen": 215739780, "step": 10016, "time_per_iteration": 4.701697826385498 }, { "auxiliary_loss_clip": 0.01094661, "auxiliary_loss_mlp": 0.01029515, "balance_loss_clip": 1.04152966, "balance_loss_mlp": 1.01637208, "epoch": 0.6022546219750489, "flos": 16105182151680.0, "grad_norm": 1.5504792190081969, "language_loss": 0.82899499, "learning_rate": 1.4421935691272381e-06, "loss": 0.85023677, "num_input_tokens_seen": 215757885, "step": 10017, "time_per_iteration": 2.636793851852417 }, { "auxiliary_loss_clip": 0.01091797, "auxiliary_loss_mlp": 0.01031972, "balance_loss_clip": 1.0407809, "balance_loss_mlp": 1.01946664, "epoch": 0.6023147452277168, "flos": 25512555985920.0, "grad_norm": 1.7715837391634046, "language_loss": 0.83621204, "learning_rate": 1.4418195721690677e-06, "loss": 0.85744977, "num_input_tokens_seen": 215776415, "step": 10018, "time_per_iteration": 6.060548543930054 }, { "auxiliary_loss_clip": 0.01093456, "auxiliary_loss_mlp": 0.01038236, "balance_loss_clip": 1.03801382, "balance_loss_mlp": 1.02431202, "epoch": 0.6023748684803848, "flos": 22636075968000.0, "grad_norm": 1.733441285539822, "language_loss": 0.78400528, "learning_rate": 1.4414455963771549e-06, "loss": 0.80532229, "num_input_tokens_seen": 215794865, "step": 10019, "time_per_iteration": 2.6781299114227295 }, { "auxiliary_loss_clip": 0.01075209, "auxiliary_loss_mlp": 0.00770827, "balance_loss_clip": 1.03914475, "balance_loss_mlp": 1.00017881, "epoch": 0.6024349917330527, "flos": 26210444307840.0, "grad_norm": 2.381543125857722, "language_loss": 0.73964417, "learning_rate": 1.441071641765681e-06, "loss": 0.7581045, "num_input_tokens_seen": 215816840, "step": 10020, "time_per_iteration": 2.7956390380859375 }, { "auxiliary_loss_clip": 0.01095191, "auxiliary_loss_mlp": 0.01033486, "balance_loss_clip": 1.04020286, "balance_loss_mlp": 1.0205282, "epoch": 0.6024951149857207, "flos": 21251755762560.0, "grad_norm": 2.1093873668761765, "language_loss": 0.64171422, "learning_rate": 1.4406977083488264e-06, "loss": 0.663001, "num_input_tokens_seen": 215836100, "step": 10021, "time_per_iteration": 4.23021388053894 }, { "auxiliary_loss_clip": 0.01102751, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.03910637, "balance_loss_mlp": 1.01996362, "epoch": 0.6025552382383887, "flos": 26943453152640.0, "grad_norm": 1.41151849166176, "language_loss": 0.80664903, "learning_rate": 1.4403237961407704e-06, "loss": 0.82801056, "num_input_tokens_seen": 215858480, "step": 10022, "time_per_iteration": 2.6966497898101807 }, { "auxiliary_loss_clip": 0.0110378, "auxiliary_loss_mlp": 0.01030649, "balance_loss_clip": 1.04190755, "balance_loss_mlp": 1.0179832, "epoch": 0.6026153614910567, "flos": 31684379495040.0, "grad_norm": 1.480979872703277, "language_loss": 0.66483712, "learning_rate": 1.439949905155693e-06, "loss": 0.68618143, "num_input_tokens_seen": 215879950, "step": 10023, "time_per_iteration": 2.691399574279785 }, { "auxiliary_loss_clip": 0.01104501, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.03789723, "balance_loss_mlp": 1.02022552, "epoch": 0.6026754847437247, "flos": 29312731175040.0, "grad_norm": 2.162444553659901, "language_loss": 0.74503481, "learning_rate": 1.4395760354077707e-06, "loss": 0.76640868, "num_input_tokens_seen": 215899830, "step": 10024, "time_per_iteration": 2.7364046573638916 }, { "auxiliary_loss_clip": 0.01104535, "auxiliary_loss_mlp": 0.01036059, "balance_loss_clip": 1.04094052, "balance_loss_mlp": 1.02257693, "epoch": 0.6027356079963926, "flos": 23586775188480.0, "grad_norm": 1.6406938647308078, "language_loss": 0.72738647, "learning_rate": 1.4392021869111815e-06, "loss": 0.74879241, "num_input_tokens_seen": 215920440, "step": 10025, "time_per_iteration": 2.6431972980499268 }, { "auxiliary_loss_clip": 0.01119748, "auxiliary_loss_mlp": 0.01037727, "balance_loss_clip": 1.04081619, "balance_loss_mlp": 1.02376747, "epoch": 0.6027957312490606, "flos": 20813753318400.0, "grad_norm": 2.306954455043105, "language_loss": 0.6677472, "learning_rate": 1.4388283596801016e-06, "loss": 0.68932194, "num_input_tokens_seen": 215940535, "step": 10026, "time_per_iteration": 2.6187641620635986 }, { "auxiliary_loss_clip": 0.0110922, "auxiliary_loss_mlp": 0.01036818, "balance_loss_clip": 1.03789234, "balance_loss_mlp": 1.02471268, "epoch": 0.6028558545017285, "flos": 19935773182080.0, "grad_norm": 1.830391575126131, "language_loss": 0.80050242, "learning_rate": 1.4384545537287061e-06, "loss": 0.82196277, "num_input_tokens_seen": 215958045, "step": 10027, "time_per_iteration": 2.576110601425171 }, { "auxiliary_loss_clip": 0.01081954, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.03722823, "balance_loss_mlp": 1.02301311, "epoch": 0.6029159777543965, "flos": 22820836550400.0, "grad_norm": 2.0223053255723236, "language_loss": 0.70934105, "learning_rate": 1.438080769071171e-06, "loss": 0.73052478, "num_input_tokens_seen": 215977330, "step": 10028, "time_per_iteration": 2.7288432121276855 }, { "auxiliary_loss_clip": 0.01084702, "auxiliary_loss_mlp": 0.01035574, "balance_loss_clip": 1.04540849, "balance_loss_mlp": 1.02254987, "epoch": 0.6029761010070644, "flos": 23587242065280.0, "grad_norm": 2.1142238314038595, "language_loss": 0.84057522, "learning_rate": 1.437707005721669e-06, "loss": 0.86177796, "num_input_tokens_seen": 215997865, "step": 10029, "time_per_iteration": 2.7901382446289062 }, { "auxiliary_loss_clip": 0.0109278, "auxiliary_loss_mlp": 0.01032236, "balance_loss_clip": 1.0393126, "balance_loss_mlp": 1.0201664, "epoch": 0.6030362242597325, "flos": 13662430859520.0, "grad_norm": 2.2431865033670744, "language_loss": 0.79994917, "learning_rate": 1.437333263694373e-06, "loss": 0.82119942, "num_input_tokens_seen": 216016230, "step": 10030, "time_per_iteration": 2.780527114868164 }, { "auxiliary_loss_clip": 0.01048723, "auxiliary_loss_mlp": 0.01042121, "balance_loss_clip": 1.03655624, "balance_loss_mlp": 1.02806032, "epoch": 0.6030963475124004, "flos": 24422883045120.0, "grad_norm": 1.9455803489075072, "language_loss": 0.71241331, "learning_rate": 1.4369595430034572e-06, "loss": 0.73332179, "num_input_tokens_seen": 216035785, "step": 10031, "time_per_iteration": 2.8193559646606445 }, { "auxiliary_loss_clip": 0.0107322, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.0378077, "balance_loss_mlp": 1.02281427, "epoch": 0.6031564707650684, "flos": 29644043247360.0, "grad_norm": 2.2622695973651834, "language_loss": 0.72744608, "learning_rate": 1.4365858436630912e-06, "loss": 0.74854881, "num_input_tokens_seen": 216059555, "step": 10032, "time_per_iteration": 2.8426249027252197 }, { "auxiliary_loss_clip": 0.0110112, "auxiliary_loss_mlp": 0.01034912, "balance_loss_clip": 1.04412532, "balance_loss_mlp": 1.02163815, "epoch": 0.6032165940177363, "flos": 16618776768000.0, "grad_norm": 1.8175959049184216, "language_loss": 0.68774295, "learning_rate": 1.4362121656874465e-06, "loss": 0.70910323, "num_input_tokens_seen": 216077235, "step": 10033, "time_per_iteration": 2.700209379196167 }, { "auxiliary_loss_clip": 0.01089272, "auxiliary_loss_mlp": 0.01037723, "balance_loss_clip": 1.04015613, "balance_loss_mlp": 1.02396595, "epoch": 0.6032767172704043, "flos": 17488173553920.0, "grad_norm": 2.115327938975923, "language_loss": 0.7568332, "learning_rate": 1.4358385090906934e-06, "loss": 0.77810311, "num_input_tokens_seen": 216094985, "step": 10034, "time_per_iteration": 2.6627981662750244 }, { "auxiliary_loss_clip": 0.01095189, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.04141998, "balance_loss_mlp": 1.01710701, "epoch": 0.6033368405230723, "flos": 26832955939200.0, "grad_norm": 3.2723425599009026, "language_loss": 0.74862671, "learning_rate": 1.4354648738870004e-06, "loss": 0.7698825, "num_input_tokens_seen": 216115905, "step": 10035, "time_per_iteration": 2.8429391384124756 }, { "auxiliary_loss_clip": 0.01082466, "auxiliary_loss_mlp": 0.01027098, "balance_loss_clip": 1.03635907, "balance_loss_mlp": 1.0147779, "epoch": 0.6033969637757403, "flos": 16909904499840.0, "grad_norm": 1.7778569727517832, "language_loss": 0.8656829, "learning_rate": 1.435091260090536e-06, "loss": 0.88677853, "num_input_tokens_seen": 216132420, "step": 10036, "time_per_iteration": 2.7539496421813965 }, { "auxiliary_loss_clip": 0.0107738, "auxiliary_loss_mlp": 0.01034344, "balance_loss_clip": 1.03851438, "balance_loss_mlp": 1.02084994, "epoch": 0.6034570870284083, "flos": 22930076787840.0, "grad_norm": 1.8216360444833892, "language_loss": 0.70128858, "learning_rate": 1.4347176677154676e-06, "loss": 0.72240573, "num_input_tokens_seen": 216149800, "step": 10037, "time_per_iteration": 2.6496496200561523 }, { "auxiliary_loss_clip": 0.0109976, "auxiliary_loss_mlp": 0.01037189, "balance_loss_clip": 1.03967977, "balance_loss_mlp": 1.02270496, "epoch": 0.6035172102810762, "flos": 23366319465600.0, "grad_norm": 1.570748886934951, "language_loss": 0.8512125, "learning_rate": 1.4343440967759616e-06, "loss": 0.87258202, "num_input_tokens_seen": 216168200, "step": 10038, "time_per_iteration": 2.6828958988189697 }, { "auxiliary_loss_clip": 0.01098827, "auxiliary_loss_mlp": 0.01034673, "balance_loss_clip": 1.04050255, "balance_loss_mlp": 1.02128005, "epoch": 0.6035773335337442, "flos": 20887082933760.0, "grad_norm": 2.3203593434406242, "language_loss": 0.76504898, "learning_rate": 1.4339705472861846e-06, "loss": 0.78638399, "num_input_tokens_seen": 216187105, "step": 10039, "time_per_iteration": 2.6590511798858643 }, { "auxiliary_loss_clip": 0.01102907, "auxiliary_loss_mlp": 0.01031911, "balance_loss_clip": 1.03922081, "balance_loss_mlp": 1.019382, "epoch": 0.6036374567864121, "flos": 24936298093440.0, "grad_norm": 1.8339871345285923, "language_loss": 0.71111763, "learning_rate": 1.433597019260301e-06, "loss": 0.73246586, "num_input_tokens_seen": 216205440, "step": 10040, "time_per_iteration": 2.6712801456451416 }, { "auxiliary_loss_clip": 0.01109688, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.04312241, "balance_loss_mlp": 1.01598454, "epoch": 0.6036975800390801, "flos": 23148269953920.0, "grad_norm": 2.0137364812654166, "language_loss": 0.78602934, "learning_rate": 1.433223512712475e-06, "loss": 0.80743772, "num_input_tokens_seen": 216223130, "step": 10041, "time_per_iteration": 2.670166015625 }, { "auxiliary_loss_clip": 0.01096185, "auxiliary_loss_mlp": 0.01029552, "balance_loss_clip": 1.04166305, "balance_loss_mlp": 1.01649821, "epoch": 0.603757703291748, "flos": 18660729127680.0, "grad_norm": 1.7274066455029002, "language_loss": 0.75525141, "learning_rate": 1.4328500276568704e-06, "loss": 0.77650881, "num_input_tokens_seen": 216240260, "step": 10042, "time_per_iteration": 2.6106081008911133 }, { "auxiliary_loss_clip": 0.0106962, "auxiliary_loss_mlp": 0.01029029, "balance_loss_clip": 1.03727007, "balance_loss_mlp": 1.01701236, "epoch": 0.6038178265444161, "flos": 19682603147520.0, "grad_norm": 1.9258503206144206, "language_loss": 0.84721899, "learning_rate": 1.4324765641076498e-06, "loss": 0.86820555, "num_input_tokens_seen": 216258510, "step": 10043, "time_per_iteration": 2.71673846244812 }, { "auxiliary_loss_clip": 0.01081507, "auxiliary_loss_mlp": 0.01040859, "balance_loss_clip": 1.03832972, "balance_loss_mlp": 1.02579701, "epoch": 0.603877949797084, "flos": 22638230784000.0, "grad_norm": 1.8215258720973655, "language_loss": 0.70104671, "learning_rate": 1.432103122078974e-06, "loss": 0.72227025, "num_input_tokens_seen": 216277550, "step": 10044, "time_per_iteration": 2.7252089977264404 }, { "auxiliary_loss_clip": 0.01106435, "auxiliary_loss_mlp": 0.01032617, "balance_loss_clip": 1.04218245, "balance_loss_mlp": 1.01826382, "epoch": 0.603938073049752, "flos": 25447881548160.0, "grad_norm": 1.9233339181851183, "language_loss": 0.78067368, "learning_rate": 1.4317297015850057e-06, "loss": 0.80206418, "num_input_tokens_seen": 216296690, "step": 10045, "time_per_iteration": 2.6885697841644287 }, { "auxiliary_loss_clip": 0.01071663, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.04522324, "balance_loss_mlp": 1.02084029, "epoch": 0.6039981963024199, "flos": 22340135813760.0, "grad_norm": 1.7431481861658145, "language_loss": 0.77048129, "learning_rate": 1.4313563026399036e-06, "loss": 0.79154372, "num_input_tokens_seen": 216316110, "step": 10046, "time_per_iteration": 2.762124538421631 }, { "auxiliary_loss_clip": 0.01061952, "auxiliary_loss_mlp": 0.01040938, "balance_loss_clip": 1.03495252, "balance_loss_mlp": 1.02685905, "epoch": 0.6040583195550879, "flos": 20703148364160.0, "grad_norm": 1.791420221750128, "language_loss": 0.87246406, "learning_rate": 1.430982925257827e-06, "loss": 0.893493, "num_input_tokens_seen": 216333855, "step": 10047, "time_per_iteration": 2.7445127964019775 }, { "auxiliary_loss_clip": 0.01104302, "auxiliary_loss_mlp": 0.01030965, "balance_loss_clip": 1.04149449, "balance_loss_mlp": 1.01879954, "epoch": 0.604118442807756, "flos": 27163118776320.0, "grad_norm": 1.4945345993269403, "language_loss": 0.75776327, "learning_rate": 1.4306095694529358e-06, "loss": 0.77911592, "num_input_tokens_seen": 216354890, "step": 10048, "time_per_iteration": 2.730748414993286 }, { "auxiliary_loss_clip": 0.01108329, "auxiliary_loss_mlp": 0.01044251, "balance_loss_clip": 1.04174399, "balance_loss_mlp": 1.02869403, "epoch": 0.6041785660604239, "flos": 30881524654080.0, "grad_norm": 2.2243998349441183, "language_loss": 0.66556633, "learning_rate": 1.430236235239386e-06, "loss": 0.68709219, "num_input_tokens_seen": 216376055, "step": 10049, "time_per_iteration": 2.6866142749786377 }, { "auxiliary_loss_clip": 0.01089915, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.03830862, "balance_loss_mlp": 1.02849865, "epoch": 0.6042386893130919, "flos": 19938215306880.0, "grad_norm": 1.639569270992707, "language_loss": 0.66928005, "learning_rate": 1.429862922631336e-06, "loss": 0.69060636, "num_input_tokens_seen": 216396295, "step": 10050, "time_per_iteration": 2.744527816772461 }, { "auxiliary_loss_clip": 0.01083354, "auxiliary_loss_mlp": 0.01036031, "balance_loss_clip": 1.03962123, "balance_loss_mlp": 1.02269185, "epoch": 0.6042988125657598, "flos": 32415915882240.0, "grad_norm": 1.7210161547813447, "language_loss": 0.6963383, "learning_rate": 1.4294896316429408e-06, "loss": 0.71753216, "num_input_tokens_seen": 216416605, "step": 10051, "time_per_iteration": 2.820204734802246 }, { "auxiliary_loss_clip": 0.01100825, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.03741777, "balance_loss_mlp": 1.01908135, "epoch": 0.6043589358184278, "flos": 17420805596160.0, "grad_norm": 2.3541607325849987, "language_loss": 0.64901161, "learning_rate": 1.4291163622883553e-06, "loss": 0.67034107, "num_input_tokens_seen": 216435130, "step": 10052, "time_per_iteration": 2.682201385498047 }, { "auxiliary_loss_clip": 0.01094174, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.0397222, "balance_loss_mlp": 1.0204432, "epoch": 0.6044190590710957, "flos": 27672834723840.0, "grad_norm": 1.5756389367941481, "language_loss": 0.69104528, "learning_rate": 1.4287431145817358e-06, "loss": 0.71233022, "num_input_tokens_seen": 216455640, "step": 10053, "time_per_iteration": 2.8296010494232178 }, { "auxiliary_loss_clip": 0.01018297, "auxiliary_loss_mlp": 0.01003475, "balance_loss_clip": 1.01298642, "balance_loss_mlp": 1.0022707, "epoch": 0.6044791823237637, "flos": 65316267515520.0, "grad_norm": 0.7275681454160189, "language_loss": 0.60339212, "learning_rate": 1.4283698885372336e-06, "loss": 0.62360984, "num_input_tokens_seen": 216518130, "step": 10054, "time_per_iteration": 3.3135299682617188 }, { "auxiliary_loss_clip": 0.01055185, "auxiliary_loss_mlp": 0.01033215, "balance_loss_clip": 1.03634906, "balance_loss_mlp": 1.019768, "epoch": 0.6045393055764317, "flos": 24492369905280.0, "grad_norm": 1.5749604097549974, "language_loss": 0.8565892, "learning_rate": 1.4279966841690027e-06, "loss": 0.87747318, "num_input_tokens_seen": 216536845, "step": 10055, "time_per_iteration": 2.803851842880249 }, { "auxiliary_loss_clip": 0.0109594, "auxiliary_loss_mlp": 0.01048723, "balance_loss_clip": 1.04159987, "balance_loss_mlp": 1.03321385, "epoch": 0.6045994288290997, "flos": 19054345340160.0, "grad_norm": 2.24817202299257, "language_loss": 0.74068117, "learning_rate": 1.4276235014911952e-06, "loss": 0.76212776, "num_input_tokens_seen": 216551860, "step": 10056, "time_per_iteration": 4.305849313735962 }, { "auxiliary_loss_clip": 0.01073635, "auxiliary_loss_mlp": 0.01035962, "balance_loss_clip": 1.03811693, "balance_loss_mlp": 1.02309358, "epoch": 0.6046595520817676, "flos": 26576697335040.0, "grad_norm": 1.7955377697616153, "language_loss": 0.80028808, "learning_rate": 1.4272503405179616e-06, "loss": 0.82138407, "num_input_tokens_seen": 216574775, "step": 10057, "time_per_iteration": 5.891208648681641 }, { "auxiliary_loss_clip": 0.0111396, "auxiliary_loss_mlp": 0.00770338, "balance_loss_clip": 1.04094028, "balance_loss_mlp": 1.00008702, "epoch": 0.6047196753344356, "flos": 13582277660160.0, "grad_norm": 2.047185386836812, "language_loss": 0.75578213, "learning_rate": 1.4268772012634527e-06, "loss": 0.77462518, "num_input_tokens_seen": 216590100, "step": 10058, "time_per_iteration": 2.6869444847106934 }, { "auxiliary_loss_clip": 0.0110179, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.03934133, "balance_loss_mlp": 1.01811314, "epoch": 0.6047797985871035, "flos": 25520456977920.0, "grad_norm": 1.9889135378311975, "language_loss": 0.70937455, "learning_rate": 1.4265040837418176e-06, "loss": 0.73070109, "num_input_tokens_seen": 216610145, "step": 10059, "time_per_iteration": 2.7275924682617188 }, { "auxiliary_loss_clip": 0.01092569, "auxiliary_loss_mlp": 0.0103084, "balance_loss_clip": 1.03944898, "balance_loss_mlp": 1.01753664, "epoch": 0.6048399218397715, "flos": 20520147548160.0, "grad_norm": 1.7704655084920065, "language_loss": 0.76338398, "learning_rate": 1.4261309879672054e-06, "loss": 0.78461802, "num_input_tokens_seen": 216630625, "step": 10060, "time_per_iteration": 4.274925470352173 }, { "auxiliary_loss_clip": 0.01104515, "auxiliary_loss_mlp": 0.01034248, "balance_loss_clip": 1.03981733, "balance_loss_mlp": 1.02105165, "epoch": 0.6049000450924396, "flos": 20408788408320.0, "grad_norm": 1.9626551853189032, "language_loss": 0.73588789, "learning_rate": 1.4257579139537628e-06, "loss": 0.75727558, "num_input_tokens_seen": 216649255, "step": 10061, "time_per_iteration": 2.6950912475585938 }, { "auxiliary_loss_clip": 0.01076727, "auxiliary_loss_mlp": 0.00771397, "balance_loss_clip": 1.04075074, "balance_loss_mlp": 1.00014019, "epoch": 0.6049601683451075, "flos": 20741357456640.0, "grad_norm": 2.92695225177956, "language_loss": 0.67823231, "learning_rate": 1.425384861715639e-06, "loss": 0.69671357, "num_input_tokens_seen": 216668100, "step": 10062, "time_per_iteration": 2.7427420616149902 }, { "auxiliary_loss_clip": 0.01099001, "auxiliary_loss_mlp": 0.010396, "balance_loss_clip": 1.03907073, "balance_loss_mlp": 1.02500868, "epoch": 0.6050202915977755, "flos": 20083114771200.0, "grad_norm": 2.0011992400768173, "language_loss": 0.71559471, "learning_rate": 1.425011831266978e-06, "loss": 0.73698068, "num_input_tokens_seen": 216686125, "step": 10063, "time_per_iteration": 2.652628183364868 }, { "auxiliary_loss_clip": 0.01111808, "auxiliary_loss_mlp": 0.01037973, "balance_loss_clip": 1.03926516, "balance_loss_mlp": 1.02516413, "epoch": 0.6050804148504434, "flos": 15960821391360.0, "grad_norm": 1.8208827458989, "language_loss": 0.84698188, "learning_rate": 1.424638822621926e-06, "loss": 0.86847973, "num_input_tokens_seen": 216704265, "step": 10064, "time_per_iteration": 2.6407761573791504 }, { "auxiliary_loss_clip": 0.01105098, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.04044116, "balance_loss_mlp": 1.01974392, "epoch": 0.6051405381031114, "flos": 17456644391040.0, "grad_norm": 2.095191883416591, "language_loss": 0.79596299, "learning_rate": 1.4242658357946278e-06, "loss": 0.81734389, "num_input_tokens_seen": 216721765, "step": 10065, "time_per_iteration": 2.633913040161133 }, { "auxiliary_loss_clip": 0.01067386, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.03866124, "balance_loss_mlp": 1.0181725, "epoch": 0.6052006613557793, "flos": 11400130517760.0, "grad_norm": 2.398871193370657, "language_loss": 0.78276229, "learning_rate": 1.423892870799226e-06, "loss": 0.80376744, "num_input_tokens_seen": 216738295, "step": 10066, "time_per_iteration": 2.729074001312256 }, { "auxiliary_loss_clip": 0.01059487, "auxiliary_loss_mlp": 0.01033515, "balance_loss_clip": 1.03963447, "balance_loss_mlp": 1.01981831, "epoch": 0.6052607846084473, "flos": 24750998807040.0, "grad_norm": 1.7528217462877862, "language_loss": 0.7308799, "learning_rate": 1.4235199276498655e-06, "loss": 0.75180995, "num_input_tokens_seen": 216759875, "step": 10067, "time_per_iteration": 2.81003999710083 }, { "auxiliary_loss_clip": 0.01094022, "auxiliary_loss_mlp": 0.00770796, "balance_loss_clip": 1.04127932, "balance_loss_mlp": 1.00018191, "epoch": 0.6053209078611153, "flos": 20741141975040.0, "grad_norm": 1.357631083448857, "language_loss": 0.68994391, "learning_rate": 1.4231470063606863e-06, "loss": 0.70859212, "num_input_tokens_seen": 216780705, "step": 10068, "time_per_iteration": 2.7258529663085938 }, { "auxiliary_loss_clip": 0.010988, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.03859472, "balance_loss_mlp": 1.01821876, "epoch": 0.6053810311137833, "flos": 18953149749120.0, "grad_norm": 3.7091992376991096, "language_loss": 0.870857, "learning_rate": 1.4227741069458303e-06, "loss": 0.89215624, "num_input_tokens_seen": 216797625, "step": 10069, "time_per_iteration": 2.57892107963562 }, { "auxiliary_loss_clip": 0.01081389, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.03757524, "balance_loss_mlp": 1.01611388, "epoch": 0.6054411543664512, "flos": 23951124794880.0, "grad_norm": 1.6595378120531261, "language_loss": 0.83174849, "learning_rate": 1.4224012294194387e-06, "loss": 0.85285282, "num_input_tokens_seen": 216817610, "step": 10070, "time_per_iteration": 2.7172200679779053 }, { "auxiliary_loss_clip": 0.01100339, "auxiliary_loss_mlp": 0.01034986, "balance_loss_clip": 1.04162169, "balance_loss_mlp": 1.02189064, "epoch": 0.6055012776191192, "flos": 20593979953920.0, "grad_norm": 1.9849870448156475, "language_loss": 0.85964417, "learning_rate": 1.4220283737956496e-06, "loss": 0.88099742, "num_input_tokens_seen": 216836835, "step": 10071, "time_per_iteration": 2.677682638168335 }, { "auxiliary_loss_clip": 0.01109082, "auxiliary_loss_mlp": 0.01035663, "balance_loss_clip": 1.04172432, "balance_loss_mlp": 1.02102959, "epoch": 0.6055614008717871, "flos": 30298191782400.0, "grad_norm": 1.8218197035918635, "language_loss": 0.77151179, "learning_rate": 1.421655540088603e-06, "loss": 0.79295927, "num_input_tokens_seen": 216856760, "step": 10072, "time_per_iteration": 2.806692123413086 }, { "auxiliary_loss_clip": 0.01094577, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.0381639, "balance_loss_mlp": 1.01447272, "epoch": 0.6056215241244551, "flos": 27125017424640.0, "grad_norm": 1.5487316274587832, "language_loss": 0.74428165, "learning_rate": 1.4212827283124367e-06, "loss": 0.76551342, "num_input_tokens_seen": 216878795, "step": 10073, "time_per_iteration": 2.746279239654541 }, { "auxiliary_loss_clip": 0.00997245, "auxiliary_loss_mlp": 0.01001533, "balance_loss_clip": 1.01025248, "balance_loss_mlp": 1.00035894, "epoch": 0.6056816473771232, "flos": 56007323925120.0, "grad_norm": 0.7538510449367495, "language_loss": 0.55113828, "learning_rate": 1.4209099384812863e-06, "loss": 0.57112598, "num_input_tokens_seen": 216937800, "step": 10074, "time_per_iteration": 3.3036320209503174 }, { "auxiliary_loss_clip": 0.01075201, "auxiliary_loss_mlp": 0.01042355, "balance_loss_clip": 1.03847015, "balance_loss_mlp": 1.02714372, "epoch": 0.6057417706297911, "flos": 23549499849600.0, "grad_norm": 1.7766669021243995, "language_loss": 0.81689596, "learning_rate": 1.4205371706092894e-06, "loss": 0.83807153, "num_input_tokens_seen": 216955280, "step": 10075, "time_per_iteration": 2.731048583984375 }, { "auxiliary_loss_clip": 0.01107881, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04031885, "balance_loss_mlp": 1.01165175, "epoch": 0.6058018938824591, "flos": 27744296832000.0, "grad_norm": 1.740054911914685, "language_loss": 0.77907681, "learning_rate": 1.4201644247105813e-06, "loss": 0.80041134, "num_input_tokens_seen": 216976950, "step": 10076, "time_per_iteration": 2.6934380531311035 }, { "auxiliary_loss_clip": 0.01106108, "auxiliary_loss_mlp": 0.01036344, "balance_loss_clip": 1.03907084, "balance_loss_mlp": 1.02240217, "epoch": 0.605862017135127, "flos": 22783381643520.0, "grad_norm": 1.6512555736365901, "language_loss": 0.72421932, "learning_rate": 1.4197917007992964e-06, "loss": 0.74564385, "num_input_tokens_seen": 216996945, "step": 10077, "time_per_iteration": 2.6461181640625 }, { "auxiliary_loss_clip": 0.01117207, "auxiliary_loss_mlp": 0.0103146, "balance_loss_clip": 1.04170644, "balance_loss_mlp": 1.01762605, "epoch": 0.605922140387795, "flos": 21215019127680.0, "grad_norm": 1.9059777517343863, "language_loss": 0.55426162, "learning_rate": 1.4194189988895682e-06, "loss": 0.57574832, "num_input_tokens_seen": 217016580, "step": 10078, "time_per_iteration": 2.6261439323425293 }, { "auxiliary_loss_clip": 0.01073319, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.03767908, "balance_loss_mlp": 1.0181284, "epoch": 0.6059822636404629, "flos": 27268372604160.0, "grad_norm": 1.6895659179757812, "language_loss": 0.70538819, "learning_rate": 1.4190463189955297e-06, "loss": 0.72643465, "num_input_tokens_seen": 217037300, "step": 10079, "time_per_iteration": 2.830202102661133 }, { "auxiliary_loss_clip": 0.01092187, "auxiliary_loss_mlp": 0.01039196, "balance_loss_clip": 1.03862, "balance_loss_mlp": 1.02637529, "epoch": 0.606042386893131, "flos": 20631327120000.0, "grad_norm": 1.6859252666783793, "language_loss": 0.6267547, "learning_rate": 1.4186736611313131e-06, "loss": 0.64806855, "num_input_tokens_seen": 217055805, "step": 10080, "time_per_iteration": 2.6813855171203613 }, { "auxiliary_loss_clip": 0.01094103, "auxiliary_loss_mlp": 0.01031366, "balance_loss_clip": 1.03858209, "balance_loss_mlp": 1.01722753, "epoch": 0.6061025101457989, "flos": 23002293081600.0, "grad_norm": 2.6314265017345613, "language_loss": 0.71340102, "learning_rate": 1.4183010253110492e-06, "loss": 0.73465574, "num_input_tokens_seen": 217074175, "step": 10081, "time_per_iteration": 2.750216007232666 }, { "auxiliary_loss_clip": 0.01091896, "auxiliary_loss_mlp": 0.01029512, "balance_loss_clip": 1.03969479, "balance_loss_mlp": 1.01624978, "epoch": 0.6061626333984669, "flos": 29898937134720.0, "grad_norm": 1.724175069330151, "language_loss": 0.69190812, "learning_rate": 1.4179284115488691e-06, "loss": 0.71312225, "num_input_tokens_seen": 217095695, "step": 10082, "time_per_iteration": 2.7279422283172607 }, { "auxiliary_loss_clip": 0.01117243, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 1.04338622, "balance_loss_mlp": 1.01974726, "epoch": 0.6062227566511348, "flos": 25009196745600.0, "grad_norm": 1.3736157370589637, "language_loss": 0.65741009, "learning_rate": 1.4175558198589015e-06, "loss": 0.67890906, "num_input_tokens_seen": 217116260, "step": 10083, "time_per_iteration": 2.6431922912597656 }, { "auxiliary_loss_clip": 0.01104697, "auxiliary_loss_mlp": 0.01033772, "balance_loss_clip": 1.03986526, "balance_loss_mlp": 1.02053976, "epoch": 0.6062828799038028, "flos": 19463943104640.0, "grad_norm": 1.8569136538666067, "language_loss": 0.74291378, "learning_rate": 1.4171832502552764e-06, "loss": 0.7642985, "num_input_tokens_seen": 217134465, "step": 10084, "time_per_iteration": 2.693331003189087 }, { "auxiliary_loss_clip": 0.01089491, "auxiliary_loss_mlp": 0.01040114, "balance_loss_clip": 1.03806448, "balance_loss_mlp": 1.02654219, "epoch": 0.6063430031564707, "flos": 13589568120960.0, "grad_norm": 14.01820477469797, "language_loss": 0.72177935, "learning_rate": 1.4168107027521204e-06, "loss": 0.74307537, "num_input_tokens_seen": 217149920, "step": 10085, "time_per_iteration": 2.6207504272460938 }, { "auxiliary_loss_clip": 0.01115179, "auxiliary_loss_mlp": 0.0103546, "balance_loss_clip": 1.04101026, "balance_loss_mlp": 1.02325344, "epoch": 0.6064031264091387, "flos": 23255499029760.0, "grad_norm": 1.9650382613535748, "language_loss": 0.76113385, "learning_rate": 1.4164381773635605e-06, "loss": 0.78264022, "num_input_tokens_seen": 217168165, "step": 10086, "time_per_iteration": 2.6350982189178467 }, { "auxiliary_loss_clip": 0.01079834, "auxiliary_loss_mlp": 0.01033915, "balance_loss_clip": 1.03654695, "balance_loss_mlp": 1.02082586, "epoch": 0.6064632496618068, "flos": 22458462192000.0, "grad_norm": 1.6281495100420569, "language_loss": 0.72623181, "learning_rate": 1.4160656741037246e-06, "loss": 0.74736929, "num_input_tokens_seen": 217190070, "step": 10087, "time_per_iteration": 2.7133493423461914 }, { "auxiliary_loss_clip": 0.01101404, "auxiliary_loss_mlp": 0.0103704, "balance_loss_clip": 1.03922224, "balance_loss_mlp": 1.02555394, "epoch": 0.6065233729144747, "flos": 25118652464640.0, "grad_norm": 1.8336458297983596, "language_loss": 0.83669853, "learning_rate": 1.4156931929867355e-06, "loss": 0.85808301, "num_input_tokens_seen": 217209370, "step": 10088, "time_per_iteration": 2.6913206577301025 }, { "auxiliary_loss_clip": 0.01058404, "auxiliary_loss_mlp": 0.00771924, "balance_loss_clip": 1.03367972, "balance_loss_mlp": 1.00013125, "epoch": 0.6065834961671427, "flos": 23477355383040.0, "grad_norm": 2.41510818695702, "language_loss": 0.7150932, "learning_rate": 1.4153207340267201e-06, "loss": 0.73339653, "num_input_tokens_seen": 217226990, "step": 10089, "time_per_iteration": 2.71724271774292 }, { "auxiliary_loss_clip": 0.01104996, "auxiliary_loss_mlp": 0.01039267, "balance_loss_clip": 1.04092312, "balance_loss_mlp": 1.02694106, "epoch": 0.6066436194198106, "flos": 17019396132480.0, "grad_norm": 3.755310304725579, "language_loss": 0.82807851, "learning_rate": 1.4149482972378009e-06, "loss": 0.84952104, "num_input_tokens_seen": 217244585, "step": 10090, "time_per_iteration": 2.600306510925293 }, { "auxiliary_loss_clip": 0.01082916, "auxiliary_loss_mlp": 0.01036874, "balance_loss_clip": 1.04005432, "balance_loss_mlp": 1.02280176, "epoch": 0.6067037426724786, "flos": 18514752255360.0, "grad_norm": 2.395523786898732, "language_loss": 0.75284386, "learning_rate": 1.4145758826341e-06, "loss": 0.77404171, "num_input_tokens_seen": 217263435, "step": 10091, "time_per_iteration": 2.7555627822875977 }, { "auxiliary_loss_clip": 0.0111346, "auxiliary_loss_mlp": 0.01037619, "balance_loss_clip": 1.04098213, "balance_loss_mlp": 1.02436924, "epoch": 0.6067638659251465, "flos": 22345989730560.0, "grad_norm": 1.5349996815844518, "language_loss": 0.79607046, "learning_rate": 1.4142034902297415e-06, "loss": 0.81758124, "num_input_tokens_seen": 217283725, "step": 10092, "time_per_iteration": 2.607757568359375 }, { "auxiliary_loss_clip": 0.01094482, "auxiliary_loss_mlp": 0.01037242, "balance_loss_clip": 1.03954625, "balance_loss_mlp": 1.02349734, "epoch": 0.6068239891778145, "flos": 12451019748480.0, "grad_norm": 1.7756923536136626, "language_loss": 0.7618677, "learning_rate": 1.4138311200388444e-06, "loss": 0.78318495, "num_input_tokens_seen": 217301120, "step": 10093, "time_per_iteration": 2.730297327041626 }, { "auxiliary_loss_clip": 0.01088328, "auxiliary_loss_mlp": 0.01043446, "balance_loss_clip": 1.0393225, "balance_loss_mlp": 1.02897358, "epoch": 0.6068841124304825, "flos": 23185868515200.0, "grad_norm": 1.8396370870528131, "language_loss": 0.87565696, "learning_rate": 1.4134587720755304e-06, "loss": 0.89697462, "num_input_tokens_seen": 217319585, "step": 10094, "time_per_iteration": 2.7664146423339844 }, { "auxiliary_loss_clip": 0.01107836, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.04203224, "balance_loss_mlp": 1.01675439, "epoch": 0.6069442356831505, "flos": 18587902302720.0, "grad_norm": 1.805883260375072, "language_loss": 0.71895981, "learning_rate": 1.413086446353919e-06, "loss": 0.74033689, "num_input_tokens_seen": 217338880, "step": 10095, "time_per_iteration": 2.610901355743408 }, { "auxiliary_loss_clip": 0.01089454, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.03730071, "balance_loss_mlp": 1.02213049, "epoch": 0.6070043589358184, "flos": 20960340721920.0, "grad_norm": 1.8353844932279613, "language_loss": 0.76935136, "learning_rate": 1.4127141428881273e-06, "loss": 0.79059333, "num_input_tokens_seen": 217357480, "step": 10096, "time_per_iteration": 5.823329925537109 }, { "auxiliary_loss_clip": 0.01119601, "auxiliary_loss_mlp": 0.01041655, "balance_loss_clip": 1.04269695, "balance_loss_mlp": 1.02889967, "epoch": 0.6070644821884864, "flos": 11692443398400.0, "grad_norm": 2.030764189672632, "language_loss": 0.80070782, "learning_rate": 1.4123418616922749e-06, "loss": 0.82232034, "num_input_tokens_seen": 217374575, "step": 10097, "time_per_iteration": 2.63212513923645 }, { "auxiliary_loss_clip": 0.01090335, "auxiliary_loss_mlp": 0.01032018, "balance_loss_clip": 1.04231095, "balance_loss_mlp": 1.01897645, "epoch": 0.6071246054411543, "flos": 19310568030720.0, "grad_norm": 1.5236568833124404, "language_loss": 0.67320025, "learning_rate": 1.411969602780478e-06, "loss": 0.69442379, "num_input_tokens_seen": 217392950, "step": 10098, "time_per_iteration": 2.6840009689331055 }, { "auxiliary_loss_clip": 0.01114691, "auxiliary_loss_mlp": 0.01029516, "balance_loss_clip": 1.04036307, "balance_loss_mlp": 1.0169934, "epoch": 0.6071847286938223, "flos": 17749029098880.0, "grad_norm": 2.4274073378556125, "language_loss": 0.80730307, "learning_rate": 1.4115973661668523e-06, "loss": 0.82874513, "num_input_tokens_seen": 217412145, "step": 10099, "time_per_iteration": 2.5781733989715576 }, { "auxiliary_loss_clip": 0.01085094, "auxiliary_loss_mlp": 0.01039748, "balance_loss_clip": 1.03784657, "balance_loss_mlp": 1.02517462, "epoch": 0.6072448519464904, "flos": 22637512512000.0, "grad_norm": 2.246118750219277, "language_loss": 0.70420504, "learning_rate": 1.4112251518655133e-06, "loss": 0.7254535, "num_input_tokens_seen": 217432080, "step": 10100, "time_per_iteration": 4.310024738311768 }, { "auxiliary_loss_clip": 0.01077866, "auxiliary_loss_mlp": 0.01036569, "balance_loss_clip": 1.03830409, "balance_loss_mlp": 1.02207279, "epoch": 0.6073049751991583, "flos": 19537308633600.0, "grad_norm": 1.6047311801163284, "language_loss": 0.70821762, "learning_rate": 1.4108529598905764e-06, "loss": 0.72936189, "num_input_tokens_seen": 217450945, "step": 10101, "time_per_iteration": 2.726445198059082 }, { "auxiliary_loss_clip": 0.01084441, "auxiliary_loss_mlp": 0.01034143, "balance_loss_clip": 1.03571582, "balance_loss_mlp": 1.02082181, "epoch": 0.6073650984518263, "flos": 28294233033600.0, "grad_norm": 2.197032989023165, "language_loss": 0.69728243, "learning_rate": 1.410480790256154e-06, "loss": 0.71846825, "num_input_tokens_seen": 217473105, "step": 10102, "time_per_iteration": 2.7282192707061768 }, { "auxiliary_loss_clip": 0.0111817, "auxiliary_loss_mlp": 0.01035861, "balance_loss_clip": 1.04134989, "balance_loss_mlp": 1.0230341, "epoch": 0.6074252217044942, "flos": 25664422688640.0, "grad_norm": 1.8635985471124068, "language_loss": 0.73704481, "learning_rate": 1.4101086429763589e-06, "loss": 0.7585851, "num_input_tokens_seen": 217491780, "step": 10103, "time_per_iteration": 2.6332626342773438 }, { "auxiliary_loss_clip": 0.01077723, "auxiliary_loss_mlp": 0.01037617, "balance_loss_clip": 1.04122865, "balance_loss_mlp": 1.02333558, "epoch": 0.6074853449571622, "flos": 22857106308480.0, "grad_norm": 1.5666292395017738, "language_loss": 0.76782012, "learning_rate": 1.4097365180653032e-06, "loss": 0.78897351, "num_input_tokens_seen": 217510605, "step": 10104, "time_per_iteration": 2.7046008110046387 }, { "auxiliary_loss_clip": 0.01012823, "auxiliary_loss_mlp": 0.01009652, "balance_loss_clip": 1.01738811, "balance_loss_mlp": 1.00849557, "epoch": 0.6075454682098301, "flos": 67111406547840.0, "grad_norm": 0.7409971394494129, "language_loss": 0.55891275, "learning_rate": 1.4093644155370977e-06, "loss": 0.57913756, "num_input_tokens_seen": 217574815, "step": 10105, "time_per_iteration": 3.2526538372039795 }, { "auxiliary_loss_clip": 0.01030607, "auxiliary_loss_mlp": 0.01011283, "balance_loss_clip": 1.01659429, "balance_loss_mlp": 1.01022172, "epoch": 0.6076055914624982, "flos": 70712024751360.0, "grad_norm": 0.768019180696257, "language_loss": 0.56802553, "learning_rate": 1.4089923354058533e-06, "loss": 0.58844441, "num_input_tokens_seen": 217632375, "step": 10106, "time_per_iteration": 3.158289909362793 }, { "auxiliary_loss_clip": 0.01063356, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.03482223, "balance_loss_mlp": 1.02204537, "epoch": 0.6076657147151661, "flos": 28364545906560.0, "grad_norm": 1.5438087958158528, "language_loss": 0.68604589, "learning_rate": 1.4086202776856784e-06, "loss": 0.7070322, "num_input_tokens_seen": 217653055, "step": 10107, "time_per_iteration": 2.922015905380249 }, { "auxiliary_loss_clip": 0.01104951, "auxiliary_loss_mlp": 0.01029821, "balance_loss_clip": 1.03881001, "balance_loss_mlp": 1.01635098, "epoch": 0.6077258379678341, "flos": 15049767807360.0, "grad_norm": 1.8478390173687478, "language_loss": 0.81575567, "learning_rate": 1.4082482423906815e-06, "loss": 0.83710343, "num_input_tokens_seen": 217671520, "step": 10108, "time_per_iteration": 2.6345651149749756 }, { "auxiliary_loss_clip": 0.01090498, "auxiliary_loss_mlp": 0.01037826, "balance_loss_clip": 1.03763413, "balance_loss_mlp": 1.02306151, "epoch": 0.607785961220502, "flos": 36167251553280.0, "grad_norm": 2.15332165440763, "language_loss": 0.71337903, "learning_rate": 1.4078762295349714e-06, "loss": 0.73466218, "num_input_tokens_seen": 217691880, "step": 10109, "time_per_iteration": 2.874757766723633 }, { "auxiliary_loss_clip": 0.01090295, "auxiliary_loss_mlp": 0.01033773, "balance_loss_clip": 1.03903341, "balance_loss_mlp": 1.02175713, "epoch": 0.60784608447317, "flos": 22524249951360.0, "grad_norm": 1.6052444437933584, "language_loss": 0.79990447, "learning_rate": 1.407504239132653e-06, "loss": 0.82114512, "num_input_tokens_seen": 217710530, "step": 10110, "time_per_iteration": 2.6963181495666504 }, { "auxiliary_loss_clip": 0.01089001, "auxiliary_loss_mlp": 0.01029745, "balance_loss_clip": 1.03760231, "balance_loss_mlp": 1.01529717, "epoch": 0.6079062077258379, "flos": 23841166285440.0, "grad_norm": 2.270664246588292, "language_loss": 0.70269084, "learning_rate": 1.4071322711978338e-06, "loss": 0.72387832, "num_input_tokens_seen": 217728650, "step": 10111, "time_per_iteration": 2.6903553009033203 }, { "auxiliary_loss_clip": 0.01085414, "auxiliary_loss_mlp": 0.010291, "balance_loss_clip": 1.04066074, "balance_loss_mlp": 1.01539087, "epoch": 0.6079663309785059, "flos": 23367037737600.0, "grad_norm": 1.6556748056408641, "language_loss": 0.65621054, "learning_rate": 1.4067603257446186e-06, "loss": 0.67735571, "num_input_tokens_seen": 217747135, "step": 10112, "time_per_iteration": 2.7705774307250977 }, { "auxiliary_loss_clip": 0.01029897, "auxiliary_loss_mlp": 0.00999602, "balance_loss_clip": 1.01457083, "balance_loss_mlp": 0.99854136, "epoch": 0.6080264542311739, "flos": 71382873110400.0, "grad_norm": 0.6359208638260742, "language_loss": 0.49526292, "learning_rate": 1.4063884027871105e-06, "loss": 0.51555794, "num_input_tokens_seen": 217811860, "step": 10113, "time_per_iteration": 3.2169973850250244 }, { "auxiliary_loss_clip": 0.01030037, "auxiliary_loss_mlp": 0.01000401, "balance_loss_clip": 1.01493645, "balance_loss_mlp": 0.99929249, "epoch": 0.6080865774838419, "flos": 66529833442560.0, "grad_norm": 0.8386978497659568, "language_loss": 0.56947362, "learning_rate": 1.4060165023394147e-06, "loss": 0.58977795, "num_input_tokens_seen": 217866510, "step": 10114, "time_per_iteration": 3.1260786056518555 }, { "auxiliary_loss_clip": 0.01118489, "auxiliary_loss_mlp": 0.01029714, "balance_loss_clip": 1.04061675, "balance_loss_mlp": 1.01540279, "epoch": 0.6081467007365099, "flos": 19207935895680.0, "grad_norm": 2.0279729583270405, "language_loss": 0.70046329, "learning_rate": 1.4056446244156317e-06, "loss": 0.72194529, "num_input_tokens_seen": 217885650, "step": 10115, "time_per_iteration": 2.627066135406494 }, { "auxiliary_loss_clip": 0.01076474, "auxiliary_loss_mlp": 0.01030702, "balance_loss_clip": 1.03560662, "balance_loss_mlp": 1.01668298, "epoch": 0.6082068239891778, "flos": 24167737762560.0, "grad_norm": 1.5787360311779992, "language_loss": 0.72676456, "learning_rate": 1.4052727690298642e-06, "loss": 0.74783635, "num_input_tokens_seen": 217905300, "step": 10116, "time_per_iteration": 2.713207721710205 }, { "auxiliary_loss_clip": 0.01090032, "auxiliary_loss_mlp": 0.01036221, "balance_loss_clip": 1.03843713, "balance_loss_mlp": 1.02108169, "epoch": 0.6082669472418458, "flos": 37413316310400.0, "grad_norm": 1.6151215779769803, "language_loss": 0.53940326, "learning_rate": 1.4049009361962138e-06, "loss": 0.56066579, "num_input_tokens_seen": 217927845, "step": 10117, "time_per_iteration": 2.809150218963623 }, { "auxiliary_loss_clip": 0.01097513, "auxiliary_loss_mlp": 0.01030118, "balance_loss_clip": 1.04143286, "balance_loss_mlp": 1.01718414, "epoch": 0.6083270704945137, "flos": 15085534775040.0, "grad_norm": 1.724080776440041, "language_loss": 0.70168173, "learning_rate": 1.4045291259287786e-06, "loss": 0.72295797, "num_input_tokens_seen": 217946145, "step": 10118, "time_per_iteration": 2.6340367794036865 }, { "auxiliary_loss_clip": 0.01051915, "auxiliary_loss_mlp": 0.01030313, "balance_loss_clip": 1.03519964, "balance_loss_mlp": 1.01717043, "epoch": 0.6083871937471818, "flos": 20668458804480.0, "grad_norm": 1.7207126950990799, "language_loss": 0.74843824, "learning_rate": 1.4041573382416588e-06, "loss": 0.76926053, "num_input_tokens_seen": 217965190, "step": 10119, "time_per_iteration": 2.7610390186309814 }, { "auxiliary_loss_clip": 0.01102909, "auxiliary_loss_mlp": 0.01034672, "balance_loss_clip": 1.04056787, "balance_loss_mlp": 1.02195883, "epoch": 0.6084473169998497, "flos": 21506901045120.0, "grad_norm": 1.7294665557102438, "language_loss": 0.67426908, "learning_rate": 1.4037855731489525e-06, "loss": 0.69564486, "num_input_tokens_seen": 217983625, "step": 10120, "time_per_iteration": 2.6205523014068604 }, { "auxiliary_loss_clip": 0.01108129, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.04188108, "balance_loss_mlp": 1.02035594, "epoch": 0.6085074402525177, "flos": 26870051710080.0, "grad_norm": 1.6306465435700652, "language_loss": 0.74561995, "learning_rate": 1.4034138306647571e-06, "loss": 0.76703954, "num_input_tokens_seen": 218006005, "step": 10121, "time_per_iteration": 2.6655447483062744 }, { "auxiliary_loss_clip": 0.01103879, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.03920245, "balance_loss_mlp": 1.02181315, "epoch": 0.6085675635051856, "flos": 10889839952640.0, "grad_norm": 1.8102237735068374, "language_loss": 0.80563319, "learning_rate": 1.4030421108031685e-06, "loss": 0.8270191, "num_input_tokens_seen": 218024195, "step": 10122, "time_per_iteration": 2.5725269317626953 }, { "auxiliary_loss_clip": 0.011003, "auxiliary_loss_mlp": 0.01033892, "balance_loss_clip": 1.03930187, "balance_loss_mlp": 1.01991475, "epoch": 0.6086276867578536, "flos": 34862186707200.0, "grad_norm": 2.5216051585049994, "language_loss": 0.55656278, "learning_rate": 1.402670413578284e-06, "loss": 0.5779047, "num_input_tokens_seen": 218047190, "step": 10123, "time_per_iteration": 2.7452590465545654 }, { "auxiliary_loss_clip": 0.01107373, "auxiliary_loss_mlp": 0.01041375, "balance_loss_clip": 1.0430057, "balance_loss_mlp": 1.02773786, "epoch": 0.6086878100105215, "flos": 20047706939520.0, "grad_norm": 2.4791520044019526, "language_loss": 0.73864502, "learning_rate": 1.4022987390041965e-06, "loss": 0.76013255, "num_input_tokens_seen": 218065945, "step": 10124, "time_per_iteration": 2.6622564792633057 }, { "auxiliary_loss_clip": 0.01089528, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.03544164, "balance_loss_mlp": 1.0215143, "epoch": 0.6087479332631895, "flos": 18332469711360.0, "grad_norm": 2.9318658727845577, "language_loss": 0.65483487, "learning_rate": 1.4019270870950006e-06, "loss": 0.67608917, "num_input_tokens_seen": 218085285, "step": 10125, "time_per_iteration": 2.677290439605713 }, { "auxiliary_loss_clip": 0.01116071, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.04222536, "balance_loss_mlp": 1.0202589, "epoch": 0.6088080565158575, "flos": 24493411399680.0, "grad_norm": 1.769901084210043, "language_loss": 0.76367819, "learning_rate": 1.40155545786479e-06, "loss": 0.785173, "num_input_tokens_seen": 218104735, "step": 10126, "time_per_iteration": 2.6574339866638184 }, { "auxiliary_loss_clip": 0.01079175, "auxiliary_loss_mlp": 0.01032387, "balance_loss_clip": 1.04002953, "balance_loss_mlp": 1.01883876, "epoch": 0.6088681797685255, "flos": 10269016260480.0, "grad_norm": 2.3378560015936705, "language_loss": 0.70790273, "learning_rate": 1.4011838513276558e-06, "loss": 0.72901833, "num_input_tokens_seen": 218121855, "step": 10127, "time_per_iteration": 2.6849265098571777 }, { "auxiliary_loss_clip": 0.01121141, "auxiliary_loss_mlp": 0.01035296, "balance_loss_clip": 1.04394543, "balance_loss_mlp": 1.02121782, "epoch": 0.6089283030211935, "flos": 21973703218560.0, "grad_norm": 2.1716351382875874, "language_loss": 0.72938377, "learning_rate": 1.400812267497691e-06, "loss": 0.75094813, "num_input_tokens_seen": 218137325, "step": 10128, "time_per_iteration": 2.5779154300689697 }, { "auxiliary_loss_clip": 0.01065888, "auxiliary_loss_mlp": 0.01033014, "balance_loss_clip": 1.03911877, "balance_loss_mlp": 1.02046144, "epoch": 0.6089884262738614, "flos": 17785191116160.0, "grad_norm": 2.2560816683992075, "language_loss": 0.7314086, "learning_rate": 1.4004407063889842e-06, "loss": 0.7523976, "num_input_tokens_seen": 218155530, "step": 10129, "time_per_iteration": 2.765955924987793 }, { "auxiliary_loss_clip": 0.01113573, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.03910589, "balance_loss_mlp": 1.02067268, "epoch": 0.6090485495265294, "flos": 36910423946880.0, "grad_norm": 1.6122727527780822, "language_loss": 0.65641886, "learning_rate": 1.400069168015626e-06, "loss": 0.67788941, "num_input_tokens_seen": 218182535, "step": 10130, "time_per_iteration": 2.78676438331604 }, { "auxiliary_loss_clip": 0.01086426, "auxiliary_loss_mlp": 0.0103051, "balance_loss_clip": 1.03784049, "balance_loss_mlp": 1.01903617, "epoch": 0.6091086727791973, "flos": 19899036547200.0, "grad_norm": 1.8425930589011128, "language_loss": 0.76978183, "learning_rate": 1.3996976523917054e-06, "loss": 0.79095113, "num_input_tokens_seen": 218201740, "step": 10131, "time_per_iteration": 2.5955772399902344 }, { "auxiliary_loss_clip": 0.0108451, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.04026079, "balance_loss_mlp": 1.02387881, "epoch": 0.6091687960318654, "flos": 22163635359360.0, "grad_norm": 1.697349608419957, "language_loss": 0.76859689, "learning_rate": 1.3993261595313093e-06, "loss": 0.78979683, "num_input_tokens_seen": 218219800, "step": 10132, "time_per_iteration": 2.7611875534057617 }, { "auxiliary_loss_clip": 0.01112693, "auxiliary_loss_mlp": 0.01033853, "balance_loss_clip": 1.04171348, "balance_loss_mlp": 1.02192605, "epoch": 0.6092289192845333, "flos": 21465280160640.0, "grad_norm": 1.734329950775569, "language_loss": 0.75766826, "learning_rate": 1.3989546894485261e-06, "loss": 0.77913374, "num_input_tokens_seen": 218237585, "step": 10133, "time_per_iteration": 2.5794837474823 }, { "auxiliary_loss_clip": 0.0110335, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.03942025, "balance_loss_mlp": 1.0217123, "epoch": 0.6092890425372013, "flos": 28694924225280.0, "grad_norm": 1.7908978482931064, "language_loss": 0.63917655, "learning_rate": 1.3985832421574414e-06, "loss": 0.66055977, "num_input_tokens_seen": 218258700, "step": 10134, "time_per_iteration": 2.7197823524475098 }, { "auxiliary_loss_clip": 0.01091736, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.04008186, "balance_loss_mlp": 1.02060866, "epoch": 0.6093491657898692, "flos": 20813178700800.0, "grad_norm": 1.9213179565189793, "language_loss": 0.7841872, "learning_rate": 1.3982118176721397e-06, "loss": 0.80543816, "num_input_tokens_seen": 218275655, "step": 10135, "time_per_iteration": 4.243841171264648 }, { "auxiliary_loss_clip": 0.01093049, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.04067171, "balance_loss_mlp": 1.0183785, "epoch": 0.6094092890425372, "flos": 25446983708160.0, "grad_norm": 1.9609713951304055, "language_loss": 0.72346425, "learning_rate": 1.3978404160067069e-06, "loss": 0.7446959, "num_input_tokens_seen": 218295720, "step": 10136, "time_per_iteration": 4.175207853317261 }, { "auxiliary_loss_clip": 0.01118097, "auxiliary_loss_mlp": 0.01029914, "balance_loss_clip": 1.04258895, "balance_loss_mlp": 1.01715255, "epoch": 0.6094694122952051, "flos": 35621265847680.0, "grad_norm": 1.7802525821484743, "language_loss": 0.74853754, "learning_rate": 1.3974690371752253e-06, "loss": 0.77001762, "num_input_tokens_seen": 218316745, "step": 10137, "time_per_iteration": 2.7007157802581787 }, { "auxiliary_loss_clip": 0.01100831, "auxiliary_loss_mlp": 0.0104803, "balance_loss_clip": 1.0380677, "balance_loss_mlp": 1.03291392, "epoch": 0.6095295355478731, "flos": 24456962073600.0, "grad_norm": 2.07495429210998, "language_loss": 0.80021697, "learning_rate": 1.3970976811917785e-06, "loss": 0.82170558, "num_input_tokens_seen": 218335385, "step": 10138, "time_per_iteration": 2.642719268798828 }, { "auxiliary_loss_clip": 0.01085336, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.03812051, "balance_loss_mlp": 1.02355671, "epoch": 0.6095896588005411, "flos": 15633208419840.0, "grad_norm": 2.0335546536806746, "language_loss": 0.81230104, "learning_rate": 1.3967263480704481e-06, "loss": 0.83350921, "num_input_tokens_seen": 218353320, "step": 10139, "time_per_iteration": 4.268277645111084 }, { "auxiliary_loss_clip": 0.01077185, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.03831828, "balance_loss_mlp": 1.02411103, "epoch": 0.6096497820532091, "flos": 15550577182080.0, "grad_norm": 2.12947943365166, "language_loss": 0.83466005, "learning_rate": 1.396355037825315e-06, "loss": 0.85580993, "num_input_tokens_seen": 218365620, "step": 10140, "time_per_iteration": 2.8175792694091797 }, { "auxiliary_loss_clip": 0.01105576, "auxiliary_loss_mlp": 0.01033997, "balance_loss_clip": 1.04053175, "balance_loss_mlp": 1.02132499, "epoch": 0.6097099053058771, "flos": 24204474397440.0, "grad_norm": 1.6865480520512064, "language_loss": 0.7552228, "learning_rate": 1.3959837504704592e-06, "loss": 0.77661854, "num_input_tokens_seen": 218383785, "step": 10141, "time_per_iteration": 2.6393468379974365 }, { "auxiliary_loss_clip": 0.01087905, "auxiliary_loss_mlp": 0.01037932, "balance_loss_clip": 1.03879404, "balance_loss_mlp": 1.02400196, "epoch": 0.609770028558545, "flos": 19570238426880.0, "grad_norm": 2.2429886109126955, "language_loss": 0.76329374, "learning_rate": 1.3956124860199603e-06, "loss": 0.7845521, "num_input_tokens_seen": 218399055, "step": 10142, "time_per_iteration": 2.6924803256988525 }, { "auxiliary_loss_clip": 0.01117011, "auxiliary_loss_mlp": 0.01034845, "balance_loss_clip": 1.04226887, "balance_loss_mlp": 1.02116001, "epoch": 0.609830151811213, "flos": 23949185460480.0, "grad_norm": 1.9503172342998385, "language_loss": 0.77012557, "learning_rate": 1.3952412444878964e-06, "loss": 0.7916441, "num_input_tokens_seen": 218419120, "step": 10143, "time_per_iteration": 2.8441388607025146 }, { "auxiliary_loss_clip": 0.01100288, "auxiliary_loss_mlp": 0.01040669, "balance_loss_clip": 1.0388006, "balance_loss_mlp": 1.02585077, "epoch": 0.6098902750638809, "flos": 16179732829440.0, "grad_norm": 1.761002506839972, "language_loss": 0.75323224, "learning_rate": 1.3948700258883448e-06, "loss": 0.77464181, "num_input_tokens_seen": 218435290, "step": 10144, "time_per_iteration": 2.6133413314819336 }, { "auxiliary_loss_clip": 0.01087547, "auxiliary_loss_mlp": 0.01034684, "balance_loss_clip": 1.03644156, "balance_loss_mlp": 1.02106476, "epoch": 0.609950398316549, "flos": 44526393763200.0, "grad_norm": 2.2237996959363566, "language_loss": 0.72757131, "learning_rate": 1.394498830235383e-06, "loss": 0.7487936, "num_input_tokens_seen": 218457880, "step": 10145, "time_per_iteration": 2.939194679260254 }, { "auxiliary_loss_clip": 0.01090456, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.03637707, "balance_loss_mlp": 1.02223277, "epoch": 0.6100105215692169, "flos": 23221743223680.0, "grad_norm": 1.7269520496185313, "language_loss": 0.69230616, "learning_rate": 1.3941276575430862e-06, "loss": 0.7135601, "num_input_tokens_seen": 218475930, "step": 10146, "time_per_iteration": 2.6565699577331543 }, { "auxiliary_loss_clip": 0.01068091, "auxiliary_loss_mlp": 0.00769179, "balance_loss_clip": 1.03681684, "balance_loss_mlp": 1.00011373, "epoch": 0.6100706448218849, "flos": 15012564295680.0, "grad_norm": 1.635331048644393, "language_loss": 0.77205098, "learning_rate": 1.3937565078255289e-06, "loss": 0.79042363, "num_input_tokens_seen": 218493675, "step": 10147, "time_per_iteration": 2.7440903186798096 }, { "auxiliary_loss_clip": 0.01093041, "auxiliary_loss_mlp": 0.01032822, "balance_loss_clip": 1.03794551, "balance_loss_mlp": 1.01953053, "epoch": 0.6101307680745528, "flos": 19639976682240.0, "grad_norm": 2.1080947760938895, "language_loss": 0.78184944, "learning_rate": 1.393385381096786e-06, "loss": 0.80310804, "num_input_tokens_seen": 218511780, "step": 10148, "time_per_iteration": 2.638685703277588 }, { "auxiliary_loss_clip": 0.01080447, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.0334245, "balance_loss_mlp": 1.02181888, "epoch": 0.6101908913272208, "flos": 29935566028800.0, "grad_norm": 2.0657205711801776, "language_loss": 0.54227436, "learning_rate": 1.39301427737093e-06, "loss": 0.56344879, "num_input_tokens_seen": 218531850, "step": 10149, "time_per_iteration": 2.800041437149048 }, { "auxiliary_loss_clip": 0.01092603, "auxiliary_loss_mlp": 0.01036713, "balance_loss_clip": 1.04295909, "balance_loss_mlp": 1.02440453, "epoch": 0.6102510145798887, "flos": 21798639308160.0, "grad_norm": 1.8291736341547842, "language_loss": 0.8044911, "learning_rate": 1.3926431966620333e-06, "loss": 0.82578421, "num_input_tokens_seen": 218551245, "step": 10150, "time_per_iteration": 2.725109577178955 }, { "auxiliary_loss_clip": 0.01091495, "auxiliary_loss_mlp": 0.01041201, "balance_loss_clip": 1.04189467, "balance_loss_mlp": 1.02752137, "epoch": 0.6103111378325567, "flos": 20706129192960.0, "grad_norm": 3.373563414576853, "language_loss": 0.68982595, "learning_rate": 1.3922721389841684e-06, "loss": 0.71115291, "num_input_tokens_seen": 218571365, "step": 10151, "time_per_iteration": 2.672344923019409 }, { "auxiliary_loss_clip": 0.01114149, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.03988385, "balance_loss_mlp": 1.01889241, "epoch": 0.6103712610852247, "flos": 29381643417600.0, "grad_norm": 1.7910960351729457, "language_loss": 0.7080698, "learning_rate": 1.3919011043514036e-06, "loss": 0.72952056, "num_input_tokens_seen": 218588315, "step": 10152, "time_per_iteration": 2.687704086303711 }, { "auxiliary_loss_clip": 0.01081357, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.04208827, "balance_loss_mlp": 1.02176535, "epoch": 0.6104313843378927, "flos": 20813035046400.0, "grad_norm": 1.9308044180202404, "language_loss": 0.77972472, "learning_rate": 1.391530092777811e-06, "loss": 0.80088544, "num_input_tokens_seen": 218605940, "step": 10153, "time_per_iteration": 2.737981081008911 }, { "auxiliary_loss_clip": 0.01090347, "auxiliary_loss_mlp": 0.01032715, "balance_loss_clip": 1.03806591, "balance_loss_mlp": 1.01951218, "epoch": 0.6104915075905607, "flos": 26578457101440.0, "grad_norm": 3.840439775750313, "language_loss": 0.79736745, "learning_rate": 1.3911591042774573e-06, "loss": 0.81859809, "num_input_tokens_seen": 218626100, "step": 10154, "time_per_iteration": 2.737769365310669 }, { "auxiliary_loss_clip": 0.01105395, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.04265189, "balance_loss_mlp": 1.01911139, "epoch": 0.6105516308432286, "flos": 23915788790400.0, "grad_norm": 1.6402345774234983, "language_loss": 0.70273185, "learning_rate": 1.3907881388644116e-06, "loss": 0.72410041, "num_input_tokens_seen": 218645060, "step": 10155, "time_per_iteration": 2.710547924041748 }, { "auxiliary_loss_clip": 0.0110624, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.04239929, "balance_loss_mlp": 1.01990473, "epoch": 0.6106117540958966, "flos": 31577365900800.0, "grad_norm": 1.4885347094481511, "language_loss": 0.71531796, "learning_rate": 1.3904171965527413e-06, "loss": 0.73671484, "num_input_tokens_seen": 218667690, "step": 10156, "time_per_iteration": 2.7332398891448975 }, { "auxiliary_loss_clip": 0.0109286, "auxiliary_loss_mlp": 0.01032773, "balance_loss_clip": 1.04169869, "balance_loss_mlp": 1.01951134, "epoch": 0.6106718773485645, "flos": 19608160210560.0, "grad_norm": 1.5588894396348068, "language_loss": 0.6765914, "learning_rate": 1.3900462773565114e-06, "loss": 0.69784772, "num_input_tokens_seen": 218687505, "step": 10157, "time_per_iteration": 2.7539916038513184 }, { "auxiliary_loss_clip": 0.01075332, "auxiliary_loss_mlp": 0.01028524, "balance_loss_clip": 1.03566861, "balance_loss_mlp": 1.01551235, "epoch": 0.6107320006012326, "flos": 17123895774720.0, "grad_norm": 1.7948221929891892, "language_loss": 0.72670758, "learning_rate": 1.3896753812897877e-06, "loss": 0.74774617, "num_input_tokens_seen": 218705315, "step": 10158, "time_per_iteration": 2.7469441890716553 }, { "auxiliary_loss_clip": 0.01103253, "auxiliary_loss_mlp": 0.01033152, "balance_loss_clip": 1.0429678, "balance_loss_mlp": 1.02017009, "epoch": 0.6107921238539005, "flos": 30148228500480.0, "grad_norm": 1.507227050717275, "language_loss": 0.69370097, "learning_rate": 1.389304508366635e-06, "loss": 0.715065, "num_input_tokens_seen": 218725735, "step": 10159, "time_per_iteration": 2.7083382606506348 }, { "auxiliary_loss_clip": 0.01118821, "auxiliary_loss_mlp": 0.01031902, "balance_loss_clip": 1.04300821, "balance_loss_mlp": 1.01859236, "epoch": 0.6108522471065685, "flos": 18440273404800.0, "grad_norm": 1.9516164322769225, "language_loss": 0.78660917, "learning_rate": 1.3889336586011167e-06, "loss": 0.80811644, "num_input_tokens_seen": 218743215, "step": 10160, "time_per_iteration": 2.5400774478912354 }, { "auxiliary_loss_clip": 0.01029498, "auxiliary_loss_mlp": 0.01003038, "balance_loss_clip": 1.01565576, "balance_loss_mlp": 1.00198889, "epoch": 0.6109123703592364, "flos": 64135454791680.0, "grad_norm": 0.8179177002663486, "language_loss": 0.61458665, "learning_rate": 1.388562832007295e-06, "loss": 0.63491201, "num_input_tokens_seen": 218806440, "step": 10161, "time_per_iteration": 3.3134469985961914 }, { "auxiliary_loss_clip": 0.01099659, "auxiliary_loss_mlp": 0.00772317, "balance_loss_clip": 1.04388893, "balance_loss_mlp": 1.00015724, "epoch": 0.6109724936119044, "flos": 20667848273280.0, "grad_norm": 4.3292370915840666, "language_loss": 0.76713967, "learning_rate": 1.3881920285992324e-06, "loss": 0.78585941, "num_input_tokens_seen": 218825720, "step": 10162, "time_per_iteration": 2.666212797164917 }, { "auxiliary_loss_clip": 0.01115754, "auxiliary_loss_mlp": 0.01032204, "balance_loss_clip": 1.04164445, "balance_loss_mlp": 1.0187993, "epoch": 0.6110326168645723, "flos": 31351882273920.0, "grad_norm": 1.703540773348326, "language_loss": 0.71334386, "learning_rate": 1.3878212483909888e-06, "loss": 0.73482347, "num_input_tokens_seen": 218847735, "step": 10163, "time_per_iteration": 2.65462327003479 }, { "auxiliary_loss_clip": 0.01112689, "auxiliary_loss_mlp": 0.0102818, "balance_loss_clip": 1.03985834, "balance_loss_mlp": 1.01618207, "epoch": 0.6110927401172404, "flos": 25003378742400.0, "grad_norm": 1.8985707771161122, "language_loss": 0.59787023, "learning_rate": 1.387450491396625e-06, "loss": 0.61927891, "num_input_tokens_seen": 218866585, "step": 10164, "time_per_iteration": 2.5967462062835693 }, { "auxiliary_loss_clip": 0.01098803, "auxiliary_loss_mlp": 0.01031481, "balance_loss_clip": 1.04045308, "balance_loss_mlp": 1.01886845, "epoch": 0.6111528633699083, "flos": 26248078782720.0, "grad_norm": 1.6376390692210252, "language_loss": 0.75717723, "learning_rate": 1.3870797576302003e-06, "loss": 0.77848011, "num_input_tokens_seen": 218885560, "step": 10165, "time_per_iteration": 2.706014633178711 }, { "auxiliary_loss_clip": 0.01092416, "auxiliary_loss_mlp": 0.01029472, "balance_loss_clip": 1.04052818, "balance_loss_mlp": 1.01629317, "epoch": 0.6112129866225763, "flos": 22382474970240.0, "grad_norm": 1.6019347929518222, "language_loss": 0.79179376, "learning_rate": 1.3867090471057719e-06, "loss": 0.81301266, "num_input_tokens_seen": 218905055, "step": 10166, "time_per_iteration": 2.648865222930908 }, { "auxiliary_loss_clip": 0.01089634, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.03888917, "balance_loss_mlp": 1.01634979, "epoch": 0.6112731098752443, "flos": 25227892702080.0, "grad_norm": 1.8171337399867642, "language_loss": 0.67561293, "learning_rate": 1.3863383598373987e-06, "loss": 0.69680506, "num_input_tokens_seen": 218924030, "step": 10167, "time_per_iteration": 2.700876474380493 }, { "auxiliary_loss_clip": 0.01114313, "auxiliary_loss_mlp": 0.01035637, "balance_loss_clip": 1.04177189, "balance_loss_mlp": 1.02360249, "epoch": 0.6113332331279122, "flos": 22893160584960.0, "grad_norm": 1.916507906954157, "language_loss": 0.79281151, "learning_rate": 1.3859676958391364e-06, "loss": 0.81431103, "num_input_tokens_seen": 218943750, "step": 10168, "time_per_iteration": 2.7144253253936768 }, { "auxiliary_loss_clip": 0.01121355, "auxiliary_loss_mlp": 0.01040045, "balance_loss_clip": 1.04141057, "balance_loss_mlp": 1.02516127, "epoch": 0.6113933563805802, "flos": 18620329305600.0, "grad_norm": 5.812497502727784, "language_loss": 0.85299641, "learning_rate": 1.3855970551250398e-06, "loss": 0.87461042, "num_input_tokens_seen": 218957585, "step": 10169, "time_per_iteration": 2.5470833778381348 }, { "auxiliary_loss_clip": 0.01112463, "auxiliary_loss_mlp": 0.01031208, "balance_loss_clip": 1.03939247, "balance_loss_mlp": 1.01953125, "epoch": 0.6114534796332481, "flos": 41866275317760.0, "grad_norm": 1.6486085762416796, "language_loss": 0.78718483, "learning_rate": 1.3852264377091652e-06, "loss": 0.80862153, "num_input_tokens_seen": 218980025, "step": 10170, "time_per_iteration": 2.729773998260498 }, { "auxiliary_loss_clip": 0.01098191, "auxiliary_loss_mlp": 0.01039397, "balance_loss_clip": 1.03988242, "balance_loss_mlp": 1.02454388, "epoch": 0.6115136028859162, "flos": 21908454163200.0, "grad_norm": 2.373771480482072, "language_loss": 0.68857706, "learning_rate": 1.3848558436055651e-06, "loss": 0.70995295, "num_input_tokens_seen": 218998200, "step": 10171, "time_per_iteration": 2.8418185710906982 }, { "auxiliary_loss_clip": 0.01084168, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.03639293, "balance_loss_mlp": 1.0224179, "epoch": 0.6115737261385841, "flos": 28804846821120.0, "grad_norm": 1.5357621569118813, "language_loss": 0.79195881, "learning_rate": 1.3844852728282934e-06, "loss": 0.81317377, "num_input_tokens_seen": 219017910, "step": 10172, "time_per_iteration": 2.7578020095825195 }, { "auxiliary_loss_clip": 0.01083831, "auxiliary_loss_mlp": 0.0103911, "balance_loss_clip": 1.03985405, "balance_loss_mlp": 1.02511525, "epoch": 0.6116338493912521, "flos": 21251468453760.0, "grad_norm": 2.0161581722139252, "language_loss": 0.67301053, "learning_rate": 1.3841147253914022e-06, "loss": 0.69423997, "num_input_tokens_seen": 219037730, "step": 10173, "time_per_iteration": 2.767425298690796 }, { "auxiliary_loss_clip": 0.01093328, "auxiliary_loss_mlp": 0.01039514, "balance_loss_clip": 1.0412842, "balance_loss_mlp": 1.02572155, "epoch": 0.61169397264392, "flos": 17530189488000.0, "grad_norm": 1.7178312614749116, "language_loss": 0.55863279, "learning_rate": 1.3837442013089416e-06, "loss": 0.57996118, "num_input_tokens_seen": 219056755, "step": 10174, "time_per_iteration": 4.367191314697266 }, { "auxiliary_loss_clip": 0.01098909, "auxiliary_loss_mlp": 0.0103905, "balance_loss_clip": 1.0425694, "balance_loss_mlp": 1.02503705, "epoch": 0.611754095896588, "flos": 23951555758080.0, "grad_norm": 1.9312072143196408, "language_loss": 0.66054702, "learning_rate": 1.3833737005949628e-06, "loss": 0.68192655, "num_input_tokens_seen": 219076985, "step": 10175, "time_per_iteration": 4.3369996547698975 }, { "auxiliary_loss_clip": 0.0110119, "auxiliary_loss_mlp": 0.00770739, "balance_loss_clip": 1.03765738, "balance_loss_mlp": 1.00009918, "epoch": 0.6118142191492559, "flos": 25994872834560.0, "grad_norm": 2.243694545574497, "language_loss": 0.83143312, "learning_rate": 1.3830032232635154e-06, "loss": 0.85015237, "num_input_tokens_seen": 219096050, "step": 10176, "time_per_iteration": 2.6386196613311768 }, { "auxiliary_loss_clip": 0.01097242, "auxiliary_loss_mlp": 0.01040776, "balance_loss_clip": 1.04172039, "balance_loss_mlp": 1.02604187, "epoch": 0.611874342401924, "flos": 24603190341120.0, "grad_norm": 1.9428160095597935, "language_loss": 0.77491206, "learning_rate": 1.3826327693286474e-06, "loss": 0.79629225, "num_input_tokens_seen": 219112665, "step": 10177, "time_per_iteration": 2.68098521232605 }, { "auxiliary_loss_clip": 0.01100764, "auxiliary_loss_mlp": 0.00771744, "balance_loss_clip": 1.03818965, "balance_loss_mlp": 1.00019312, "epoch": 0.6119344656545919, "flos": 15887132640000.0, "grad_norm": 3.7169342629070505, "language_loss": 0.75467336, "learning_rate": 1.3822623388044065e-06, "loss": 0.77339846, "num_input_tokens_seen": 219129120, "step": 10178, "time_per_iteration": 2.600816011428833 }, { "auxiliary_loss_clip": 0.01088953, "auxiliary_loss_mlp": 0.01045893, "balance_loss_clip": 1.03788781, "balance_loss_mlp": 1.03069353, "epoch": 0.6119945889072599, "flos": 21652877917440.0, "grad_norm": 1.6054240792862575, "language_loss": 0.67197716, "learning_rate": 1.3818919317048402e-06, "loss": 0.69332558, "num_input_tokens_seen": 219148950, "step": 10179, "time_per_iteration": 4.199966669082642 }, { "auxiliary_loss_clip": 0.0109746, "auxiliary_loss_mlp": 0.01034998, "balance_loss_clip": 1.0424819, "balance_loss_mlp": 1.02241588, "epoch": 0.6120547121599279, "flos": 13772533023360.0, "grad_norm": 1.7918927990683708, "language_loss": 0.83621407, "learning_rate": 1.3815215480439933e-06, "loss": 0.85753864, "num_input_tokens_seen": 219165585, "step": 10180, "time_per_iteration": 2.617266893386841 }, { "auxiliary_loss_clip": 0.01117181, "auxiliary_loss_mlp": 0.01032718, "balance_loss_clip": 1.04272151, "balance_loss_mlp": 1.01881814, "epoch": 0.6121148354125958, "flos": 20079164275200.0, "grad_norm": 1.5733186243311148, "language_loss": 0.7745713, "learning_rate": 1.3811511878359113e-06, "loss": 0.79607022, "num_input_tokens_seen": 219183280, "step": 10181, "time_per_iteration": 2.542682409286499 }, { "auxiliary_loss_clip": 0.01117399, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 1.041821, "balance_loss_mlp": 1.02228367, "epoch": 0.6121749586652638, "flos": 13471313569920.0, "grad_norm": 15.811946001131306, "language_loss": 0.80652797, "learning_rate": 1.3807808510946384e-06, "loss": 0.82805324, "num_input_tokens_seen": 219197200, "step": 10182, "time_per_iteration": 2.6980040073394775 }, { "auxiliary_loss_clip": 0.01077836, "auxiliary_loss_mlp": 0.01037108, "balance_loss_clip": 1.03717065, "balance_loss_mlp": 1.02501428, "epoch": 0.6122350819179317, "flos": 20120533764480.0, "grad_norm": 1.5906642026710172, "language_loss": 0.82815677, "learning_rate": 1.3804105378342177e-06, "loss": 0.84930623, "num_input_tokens_seen": 219216825, "step": 10183, "time_per_iteration": 2.808246612548828 }, { "auxiliary_loss_clip": 0.01025033, "auxiliary_loss_mlp": 0.01005337, "balance_loss_clip": 1.01312232, "balance_loss_mlp": 1.00417471, "epoch": 0.6122952051705998, "flos": 65429242767360.0, "grad_norm": 0.7045561187177276, "language_loss": 0.62833804, "learning_rate": 1.3800402480686914e-06, "loss": 0.64864177, "num_input_tokens_seen": 219283795, "step": 10184, "time_per_iteration": 3.2871408462524414 }, { "auxiliary_loss_clip": 0.01108097, "auxiliary_loss_mlp": 0.01037833, "balance_loss_clip": 1.042454, "balance_loss_mlp": 1.02517307, "epoch": 0.6123553284232677, "flos": 20376253664640.0, "grad_norm": 1.792613488461195, "language_loss": 0.82103658, "learning_rate": 1.379669981812101e-06, "loss": 0.8424958, "num_input_tokens_seen": 219302385, "step": 10185, "time_per_iteration": 2.623692750930786 }, { "auxiliary_loss_clip": 0.0109256, "auxiliary_loss_mlp": 0.01038333, "balance_loss_clip": 1.04070401, "balance_loss_mlp": 1.02442169, "epoch": 0.6124154516759357, "flos": 23987645948160.0, "grad_norm": 1.7206353448570937, "language_loss": 0.74358237, "learning_rate": 1.3792997390784868e-06, "loss": 0.76489139, "num_input_tokens_seen": 219319765, "step": 10186, "time_per_iteration": 2.657557725906372 }, { "auxiliary_loss_clip": 0.01099771, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.03756428, "balance_loss_mlp": 1.021294, "epoch": 0.6124755749286036, "flos": 21468799693440.0, "grad_norm": 1.5881045533275502, "language_loss": 0.7818836, "learning_rate": 1.3789295198818895e-06, "loss": 0.80321753, "num_input_tokens_seen": 219337440, "step": 10187, "time_per_iteration": 2.625558376312256 }, { "auxiliary_loss_clip": 0.01113087, "auxiliary_loss_mlp": 0.01033487, "balance_loss_clip": 1.03851271, "balance_loss_mlp": 1.02038562, "epoch": 0.6125356981812716, "flos": 23879195809920.0, "grad_norm": 1.8256616870215527, "language_loss": 0.83049744, "learning_rate": 1.3785593242363462e-06, "loss": 0.85196316, "num_input_tokens_seen": 219357525, "step": 10188, "time_per_iteration": 2.6045219898223877 }, { "auxiliary_loss_clip": 0.0108702, "auxiliary_loss_mlp": 0.0103141, "balance_loss_clip": 1.04232693, "balance_loss_mlp": 1.01822519, "epoch": 0.6125958214339395, "flos": 14425604150400.0, "grad_norm": 1.7058207723590004, "language_loss": 0.7547375, "learning_rate": 1.378189152155896e-06, "loss": 0.77592176, "num_input_tokens_seen": 219374855, "step": 10189, "time_per_iteration": 2.7627220153808594 }, { "auxiliary_loss_clip": 0.01101171, "auxiliary_loss_mlp": 0.0104025, "balance_loss_clip": 1.03780556, "balance_loss_mlp": 1.02642107, "epoch": 0.6126559446866076, "flos": 23259090389760.0, "grad_norm": 1.513309715943079, "language_loss": 0.74214786, "learning_rate": 1.3778190036545758e-06, "loss": 0.76356208, "num_input_tokens_seen": 219394740, "step": 10190, "time_per_iteration": 2.617075204849243 }, { "auxiliary_loss_clip": 0.01104454, "auxiliary_loss_mlp": 0.0103683, "balance_loss_clip": 1.04099751, "balance_loss_mlp": 1.02338362, "epoch": 0.6127160679392755, "flos": 26864808324480.0, "grad_norm": 1.7858486096662998, "language_loss": 0.68623936, "learning_rate": 1.3774488787464207e-06, "loss": 0.70765221, "num_input_tokens_seen": 219413755, "step": 10191, "time_per_iteration": 2.681180477142334 }, { "auxiliary_loss_clip": 0.0110296, "auxiliary_loss_mlp": 0.0103819, "balance_loss_clip": 1.0385741, "balance_loss_mlp": 1.02425456, "epoch": 0.6127761911919435, "flos": 26396425952640.0, "grad_norm": 2.13769200790618, "language_loss": 0.73452723, "learning_rate": 1.377078777445467e-06, "loss": 0.75593865, "num_input_tokens_seen": 219433560, "step": 10192, "time_per_iteration": 2.6742324829101562 }, { "auxiliary_loss_clip": 0.01075917, "auxiliary_loss_mlp": 0.01033242, "balance_loss_clip": 1.03988755, "balance_loss_mlp": 1.02090943, "epoch": 0.6128363144446115, "flos": 22634747164800.0, "grad_norm": 2.0088299944144636, "language_loss": 0.83632165, "learning_rate": 1.3767086997657478e-06, "loss": 0.85741329, "num_input_tokens_seen": 219452640, "step": 10193, "time_per_iteration": 2.701087474822998 }, { "auxiliary_loss_clip": 0.01082703, "auxiliary_loss_mlp": 0.01035438, "balance_loss_clip": 1.03853893, "balance_loss_mlp": 1.02231348, "epoch": 0.6128964376972794, "flos": 26759051706240.0, "grad_norm": 2.1771802645074105, "language_loss": 0.6991539, "learning_rate": 1.3763386457212979e-06, "loss": 0.72033525, "num_input_tokens_seen": 219468585, "step": 10194, "time_per_iteration": 2.6878440380096436 }, { "auxiliary_loss_clip": 0.01010189, "auxiliary_loss_mlp": 0.01003845, "balance_loss_clip": 1.01538479, "balance_loss_mlp": 1.002653, "epoch": 0.6129565609499474, "flos": 65567929178880.0, "grad_norm": 0.8185640373649049, "language_loss": 0.58629549, "learning_rate": 1.375968615326149e-06, "loss": 0.60643584, "num_input_tokens_seen": 219523015, "step": 10195, "time_per_iteration": 3.05383038520813 }, { "auxiliary_loss_clip": 0.01095455, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.04045796, "balance_loss_mlp": 1.02244508, "epoch": 0.6130166842026153, "flos": 16362087200640.0, "grad_norm": 2.135532256863793, "language_loss": 0.69762802, "learning_rate": 1.3755986085943324e-06, "loss": 0.71894026, "num_input_tokens_seen": 219539980, "step": 10196, "time_per_iteration": 2.6125218868255615 }, { "auxiliary_loss_clip": 0.01089403, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.03711545, "balance_loss_mlp": 1.02356637, "epoch": 0.6130768074552834, "flos": 23652455207040.0, "grad_norm": 1.7041901113683988, "language_loss": 0.71497107, "learning_rate": 1.3752286255398788e-06, "loss": 0.73623163, "num_input_tokens_seen": 219556980, "step": 10197, "time_per_iteration": 2.687622547149658 }, { "auxiliary_loss_clip": 0.01102107, "auxiliary_loss_mlp": 0.01046474, "balance_loss_clip": 1.03892088, "balance_loss_mlp": 1.03226423, "epoch": 0.6131369307079513, "flos": 20047455544320.0, "grad_norm": 2.1425841144655533, "language_loss": 0.79149073, "learning_rate": 1.3748586661768191e-06, "loss": 0.81297648, "num_input_tokens_seen": 219576410, "step": 10198, "time_per_iteration": 2.6697170734405518 }, { "auxiliary_loss_clip": 0.01092328, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.04398417, "balance_loss_mlp": 1.01794744, "epoch": 0.6131970539606193, "flos": 22672166158080.0, "grad_norm": 1.4352269101197792, "language_loss": 0.74505019, "learning_rate": 1.374488730519181e-06, "loss": 0.76628667, "num_input_tokens_seen": 219597180, "step": 10199, "time_per_iteration": 2.789501905441284 }, { "auxiliary_loss_clip": 0.01092976, "auxiliary_loss_mlp": 0.01040283, "balance_loss_clip": 1.03864002, "balance_loss_mlp": 1.02596581, "epoch": 0.6132571772132872, "flos": 26870913636480.0, "grad_norm": 2.276152956596312, "language_loss": 0.62111485, "learning_rate": 1.374118818580993e-06, "loss": 0.64244747, "num_input_tokens_seen": 219617630, "step": 10200, "time_per_iteration": 2.7012946605682373 }, { "auxiliary_loss_clip": 0.01092122, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.04091394, "balance_loss_mlp": 1.01772022, "epoch": 0.6133173004659552, "flos": 22892657794560.0, "grad_norm": 2.1392566641947464, "language_loss": 0.6911571, "learning_rate": 1.3737489303762822e-06, "loss": 0.71238619, "num_input_tokens_seen": 219637025, "step": 10201, "time_per_iteration": 2.7815003395080566 }, { "auxiliary_loss_clip": 0.01091125, "auxiliary_loss_mlp": 0.01031281, "balance_loss_clip": 1.03879607, "balance_loss_mlp": 1.018466, "epoch": 0.6133774237186231, "flos": 20485098852480.0, "grad_norm": 1.7663162719665984, "language_loss": 0.83417988, "learning_rate": 1.3733790659190746e-06, "loss": 0.85540396, "num_input_tokens_seen": 219656625, "step": 10202, "time_per_iteration": 2.6809394359588623 }, { "auxiliary_loss_clip": 0.01037873, "auxiliary_loss_mlp": 0.0100084, "balance_loss_clip": 1.01421046, "balance_loss_mlp": 0.99977362, "epoch": 0.6134375469712912, "flos": 69413065217280.0, "grad_norm": 0.8928245444729744, "language_loss": 0.67083746, "learning_rate": 1.3730092252233953e-06, "loss": 0.69122458, "num_input_tokens_seen": 219718090, "step": 10203, "time_per_iteration": 3.153150796890259 }, { "auxiliary_loss_clip": 0.01107329, "auxiliary_loss_mlp": 0.01030437, "balance_loss_clip": 1.04093874, "balance_loss_mlp": 1.01783061, "epoch": 0.6134976702239591, "flos": 41281541815680.0, "grad_norm": 1.5881826993460113, "language_loss": 0.61211205, "learning_rate": 1.37263940830327e-06, "loss": 0.63348967, "num_input_tokens_seen": 219740100, "step": 10204, "time_per_iteration": 2.8730733394622803 }, { "auxiliary_loss_clip": 0.01079745, "auxiliary_loss_mlp": 0.0102996, "balance_loss_clip": 1.03856349, "balance_loss_mlp": 1.0171572, "epoch": 0.6135577934766271, "flos": 22346600261760.0, "grad_norm": 1.8494988248574857, "language_loss": 0.72484612, "learning_rate": 1.3722696151727204e-06, "loss": 0.74594319, "num_input_tokens_seen": 219761225, "step": 10205, "time_per_iteration": 2.789635419845581 }, { "auxiliary_loss_clip": 0.0110225, "auxiliary_loss_mlp": 0.01027505, "balance_loss_clip": 1.04025602, "balance_loss_mlp": 1.01416492, "epoch": 0.6136179167292951, "flos": 23728155120000.0, "grad_norm": 1.726684216662775, "language_loss": 0.76188898, "learning_rate": 1.3718998458457701e-06, "loss": 0.78318655, "num_input_tokens_seen": 219780085, "step": 10206, "time_per_iteration": 2.6312029361724854 }, { "auxiliary_loss_clip": 0.01085288, "auxiliary_loss_mlp": 0.01031734, "balance_loss_clip": 1.04444122, "balance_loss_mlp": 1.0182395, "epoch": 0.613678039981963, "flos": 26024678144640.0, "grad_norm": 2.2620215533288013, "language_loss": 0.7565456, "learning_rate": 1.3715301003364407e-06, "loss": 0.7777158, "num_input_tokens_seen": 219797895, "step": 10207, "time_per_iteration": 2.768277645111084 }, { "auxiliary_loss_clip": 0.01103864, "auxiliary_loss_mlp": 0.01034203, "balance_loss_clip": 1.04067349, "balance_loss_mlp": 1.02212703, "epoch": 0.613738163234631, "flos": 9859957200000.0, "grad_norm": 2.399068150435751, "language_loss": 0.83005822, "learning_rate": 1.3711603786587525e-06, "loss": 0.85143888, "num_input_tokens_seen": 219811295, "step": 10208, "time_per_iteration": 2.5925726890563965 }, { "auxiliary_loss_clip": 0.0109897, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.04265046, "balance_loss_mlp": 1.01999795, "epoch": 0.613798286487299, "flos": 33182070001920.0, "grad_norm": 1.8170662176874706, "language_loss": 0.72382063, "learning_rate": 1.3707906808267265e-06, "loss": 0.74515176, "num_input_tokens_seen": 219832735, "step": 10209, "time_per_iteration": 2.7966833114624023 }, { "auxiliary_loss_clip": 0.01115107, "auxiliary_loss_mlp": 0.01038487, "balance_loss_clip": 1.04209638, "balance_loss_mlp": 1.02545118, "epoch": 0.613858409739967, "flos": 25627901535360.0, "grad_norm": 1.6402518788547593, "language_loss": 0.74474829, "learning_rate": 1.37042100685438e-06, "loss": 0.76628423, "num_input_tokens_seen": 219852755, "step": 10210, "time_per_iteration": 2.615272045135498 }, { "auxiliary_loss_clip": 0.01010153, "auxiliary_loss_mlp": 0.01001177, "balance_loss_clip": 1.01308346, "balance_loss_mlp": 0.99999064, "epoch": 0.6139185329926349, "flos": 67192313932800.0, "grad_norm": 0.8597503962544338, "language_loss": 0.64958251, "learning_rate": 1.3700513567557325e-06, "loss": 0.66969585, "num_input_tokens_seen": 219922785, "step": 10211, "time_per_iteration": 3.410182476043701 }, { "auxiliary_loss_clip": 0.01093321, "auxiliary_loss_mlp": 0.00771551, "balance_loss_clip": 1.03993869, "balance_loss_mlp": 1.00015092, "epoch": 0.6139786562453029, "flos": 21543637680000.0, "grad_norm": 2.0754424248675893, "language_loss": 0.7585628, "learning_rate": 1.369681730544801e-06, "loss": 0.77721149, "num_input_tokens_seen": 219942215, "step": 10212, "time_per_iteration": 3.0132839679718018 }, { "auxiliary_loss_clip": 0.01087691, "auxiliary_loss_mlp": 0.01041947, "balance_loss_clip": 1.03709769, "balance_loss_mlp": 1.02745092, "epoch": 0.6140387794979708, "flos": 26068489758720.0, "grad_norm": 1.8964126815157365, "language_loss": 0.74028683, "learning_rate": 1.3693121282356009e-06, "loss": 0.76158321, "num_input_tokens_seen": 219963830, "step": 10213, "time_per_iteration": 2.757840871810913 }, { "auxiliary_loss_clip": 0.01100654, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.04215121, "balance_loss_mlp": 1.02341056, "epoch": 0.6140989027506388, "flos": 23694614795520.0, "grad_norm": 1.4673821315924696, "language_loss": 0.73059738, "learning_rate": 1.3689425498421483e-06, "loss": 0.7519778, "num_input_tokens_seen": 219983815, "step": 10214, "time_per_iteration": 5.874944686889648 }, { "auxiliary_loss_clip": 0.01119065, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.04233837, "balance_loss_mlp": 1.01859856, "epoch": 0.6141590260033067, "flos": 22231721589120.0, "grad_norm": 1.979046579810642, "language_loss": 0.74642611, "learning_rate": 1.3685729953784572e-06, "loss": 0.76794291, "num_input_tokens_seen": 220003165, "step": 10215, "time_per_iteration": 4.103458404541016 }, { "auxiliary_loss_clip": 0.0110334, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.04110682, "balance_loss_mlp": 1.02308035, "epoch": 0.6142191492559748, "flos": 23871653953920.0, "grad_norm": 3.0132083118300526, "language_loss": 0.78161263, "learning_rate": 1.368203464858542e-06, "loss": 0.80300874, "num_input_tokens_seen": 220021015, "step": 10216, "time_per_iteration": 2.6554577350616455 }, { "auxiliary_loss_clip": 0.01116166, "auxiliary_loss_mlp": 0.01038341, "balance_loss_clip": 1.04212427, "balance_loss_mlp": 1.02428079, "epoch": 0.6142792725086427, "flos": 15042513260160.0, "grad_norm": 2.385226690327553, "language_loss": 0.80102211, "learning_rate": 1.3678339582964147e-06, "loss": 0.82256722, "num_input_tokens_seen": 220035780, "step": 10217, "time_per_iteration": 2.5665090084075928 }, { "auxiliary_loss_clip": 0.01096361, "auxiliary_loss_mlp": 0.01032764, "balance_loss_clip": 1.04036403, "balance_loss_mlp": 1.0193646, "epoch": 0.6143393957613107, "flos": 23330947547520.0, "grad_norm": 2.363056906877031, "language_loss": 0.7822212, "learning_rate": 1.3674644757060865e-06, "loss": 0.80351239, "num_input_tokens_seen": 220054280, "step": 10218, "time_per_iteration": 2.659820795059204 }, { "auxiliary_loss_clip": 0.01108038, "auxiliary_loss_mlp": 0.01034908, "balance_loss_clip": 1.04290485, "balance_loss_mlp": 1.02203321, "epoch": 0.6143995190139786, "flos": 20117086058880.0, "grad_norm": 1.5950804577882065, "language_loss": 0.8189528, "learning_rate": 1.367095017101569e-06, "loss": 0.84038228, "num_input_tokens_seen": 220074120, "step": 10219, "time_per_iteration": 4.207094192504883 }, { "auxiliary_loss_clip": 0.01098839, "auxiliary_loss_mlp": 0.01035321, "balance_loss_clip": 1.0370295, "balance_loss_mlp": 1.02146316, "epoch": 0.6144596422666466, "flos": 42303559489920.0, "grad_norm": 2.5627103076938424, "language_loss": 0.66738832, "learning_rate": 1.3667255824968717e-06, "loss": 0.68872988, "num_input_tokens_seen": 220096320, "step": 10220, "time_per_iteration": 2.7829878330230713 }, { "auxiliary_loss_clip": 0.01103534, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.03913307, "balance_loss_mlp": 1.01669455, "epoch": 0.6145197655193146, "flos": 21573622558080.0, "grad_norm": 1.8637833274966709, "language_loss": 0.71766376, "learning_rate": 1.3663561719060041e-06, "loss": 0.73899317, "num_input_tokens_seen": 220114850, "step": 10221, "time_per_iteration": 2.621060609817505 }, { "auxiliary_loss_clip": 0.01066987, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.03472996, "balance_loss_mlp": 1.01779163, "epoch": 0.6145798887719826, "flos": 21471098163840.0, "grad_norm": 1.725179067455254, "language_loss": 0.79747754, "learning_rate": 1.3659867853429735e-06, "loss": 0.81845343, "num_input_tokens_seen": 220133395, "step": 10222, "time_per_iteration": 2.7557356357574463 }, { "auxiliary_loss_clip": 0.01092387, "auxiliary_loss_mlp": 0.01042666, "balance_loss_clip": 1.04074025, "balance_loss_mlp": 1.02842045, "epoch": 0.6146400120246506, "flos": 20777016683520.0, "grad_norm": 1.8633750333173091, "language_loss": 0.76163048, "learning_rate": 1.365617422821788e-06, "loss": 0.78298092, "num_input_tokens_seen": 220152790, "step": 10223, "time_per_iteration": 2.649580717086792 }, { "auxiliary_loss_clip": 0.01093219, "auxiliary_loss_mlp": 0.01034751, "balance_loss_clip": 1.04123545, "balance_loss_mlp": 1.02193058, "epoch": 0.6147001352773185, "flos": 13881306384000.0, "grad_norm": 1.8872928812493461, "language_loss": 0.78260607, "learning_rate": 1.3652480843564535e-06, "loss": 0.80388576, "num_input_tokens_seen": 220169535, "step": 10224, "time_per_iteration": 2.6781771183013916 }, { "auxiliary_loss_clip": 0.01076582, "auxiliary_loss_mlp": 0.01032552, "balance_loss_clip": 1.03500175, "balance_loss_mlp": 1.02076793, "epoch": 0.6147602585299865, "flos": 56641791807360.0, "grad_norm": 1.4371349679447827, "language_loss": 0.66419935, "learning_rate": 1.3648787699609746e-06, "loss": 0.68529069, "num_input_tokens_seen": 220195305, "step": 10225, "time_per_iteration": 3.0390100479125977 }, { "auxiliary_loss_clip": 0.01103654, "auxiliary_loss_mlp": 0.00771954, "balance_loss_clip": 1.04197466, "balance_loss_mlp": 1.00015104, "epoch": 0.6148203817826544, "flos": 32817217605120.0, "grad_norm": 2.067542776960223, "language_loss": 0.6355052, "learning_rate": 1.364509479649357e-06, "loss": 0.65426129, "num_input_tokens_seen": 220215040, "step": 10226, "time_per_iteration": 2.744330644607544 }, { "auxiliary_loss_clip": 0.01090925, "auxiliary_loss_mlp": 0.01037806, "balance_loss_clip": 1.03825569, "balance_loss_mlp": 1.02304804, "epoch": 0.6148805050353224, "flos": 18332038748160.0, "grad_norm": 1.7718988021259403, "language_loss": 0.75872779, "learning_rate": 1.3641402134356037e-06, "loss": 0.78001511, "num_input_tokens_seen": 220234205, "step": 10227, "time_per_iteration": 2.7481887340545654 }, { "auxiliary_loss_clip": 0.01054701, "auxiliary_loss_mlp": 0.01043082, "balance_loss_clip": 1.03239739, "balance_loss_mlp": 1.02689981, "epoch": 0.6149406282879903, "flos": 14063983977600.0, "grad_norm": 2.209409032208413, "language_loss": 0.62177163, "learning_rate": 1.3637709713337164e-06, "loss": 0.64274943, "num_input_tokens_seen": 220252730, "step": 10228, "time_per_iteration": 2.797832489013672 }, { "auxiliary_loss_clip": 0.0109079, "auxiliary_loss_mlp": 0.01033221, "balance_loss_clip": 1.03737903, "balance_loss_mlp": 1.0200839, "epoch": 0.6150007515406584, "flos": 25190186400000.0, "grad_norm": 2.3158396173840683, "language_loss": 0.74483359, "learning_rate": 1.3634017533576985e-06, "loss": 0.7660737, "num_input_tokens_seen": 220273345, "step": 10229, "time_per_iteration": 2.7949423789978027 }, { "auxiliary_loss_clip": 0.01118363, "auxiliary_loss_mlp": 0.01039286, "balance_loss_clip": 1.04305434, "balance_loss_mlp": 1.02533805, "epoch": 0.6150608747933263, "flos": 21945262625280.0, "grad_norm": 1.6423781673268174, "language_loss": 0.7801019, "learning_rate": 1.3630325595215493e-06, "loss": 0.80167842, "num_input_tokens_seen": 220293845, "step": 10230, "time_per_iteration": 2.666316509246826 }, { "auxiliary_loss_clip": 0.01086667, "auxiliary_loss_mlp": 0.01029888, "balance_loss_clip": 1.03686535, "balance_loss_mlp": 1.01674509, "epoch": 0.6151209980459943, "flos": 30117453523200.0, "grad_norm": 1.4482184431954421, "language_loss": 0.73085076, "learning_rate": 1.36266338983927e-06, "loss": 0.75201631, "num_input_tokens_seen": 220316070, "step": 10231, "time_per_iteration": 2.7693657875061035 }, { "auxiliary_loss_clip": 0.01095915, "auxiliary_loss_mlp": 0.01033904, "balance_loss_clip": 1.04084241, "balance_loss_mlp": 1.02099395, "epoch": 0.6151811212986622, "flos": 30008356940160.0, "grad_norm": 1.525819080770735, "language_loss": 0.69824755, "learning_rate": 1.362294244324858e-06, "loss": 0.71954578, "num_input_tokens_seen": 220335695, "step": 10232, "time_per_iteration": 2.682452917098999 }, { "auxiliary_loss_clip": 0.01099274, "auxiliary_loss_mlp": 0.00770809, "balance_loss_clip": 1.03777719, "balance_loss_mlp": 1.00007868, "epoch": 0.6152412445513302, "flos": 18872888808960.0, "grad_norm": 2.3038424014240215, "language_loss": 0.91654289, "learning_rate": 1.3619251229923126e-06, "loss": 0.93524379, "num_input_tokens_seen": 220353720, "step": 10233, "time_per_iteration": 2.6199569702148438 }, { "auxiliary_loss_clip": 0.01083051, "auxiliary_loss_mlp": 0.01033569, "balance_loss_clip": 1.04041195, "balance_loss_mlp": 1.02191687, "epoch": 0.6153013678039982, "flos": 25703601448320.0, "grad_norm": 1.8226041312601646, "language_loss": 0.71622181, "learning_rate": 1.3615560258556306e-06, "loss": 0.73738801, "num_input_tokens_seen": 220372515, "step": 10234, "time_per_iteration": 2.6806395053863525 }, { "auxiliary_loss_clip": 0.01107194, "auxiliary_loss_mlp": 0.00771951, "balance_loss_clip": 1.04099405, "balance_loss_mlp": 1.0002284, "epoch": 0.6153614910566662, "flos": 28510271383680.0, "grad_norm": 2.918285420802953, "language_loss": 0.66839552, "learning_rate": 1.3611869529288077e-06, "loss": 0.68718696, "num_input_tokens_seen": 220393490, "step": 10235, "time_per_iteration": 2.896367073059082 }, { "auxiliary_loss_clip": 0.01102816, "auxiliary_loss_mlp": 0.0103213, "balance_loss_clip": 1.04112911, "balance_loss_mlp": 1.01878452, "epoch": 0.6154216143093342, "flos": 23549787158400.0, "grad_norm": 1.534901762766115, "language_loss": 0.81011724, "learning_rate": 1.3608179042258398e-06, "loss": 0.83146667, "num_input_tokens_seen": 220412855, "step": 10236, "time_per_iteration": 2.679506301879883 }, { "auxiliary_loss_clip": 0.01117813, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.04047644, "balance_loss_mlp": 1.01949906, "epoch": 0.6154817375620021, "flos": 22748081552640.0, "grad_norm": 1.522081781804378, "language_loss": 0.80553526, "learning_rate": 1.360448879760721e-06, "loss": 0.82703608, "num_input_tokens_seen": 220433440, "step": 10237, "time_per_iteration": 2.6127498149871826 }, { "auxiliary_loss_clip": 0.011004, "auxiliary_loss_mlp": 0.01042204, "balance_loss_clip": 1.04215753, "balance_loss_mlp": 1.02890038, "epoch": 0.6155418608146701, "flos": 27162975121920.0, "grad_norm": 1.7653521660078044, "language_loss": 0.75694555, "learning_rate": 1.3600798795474449e-06, "loss": 0.77837157, "num_input_tokens_seen": 220453445, "step": 10238, "time_per_iteration": 2.7021820545196533 }, { "auxiliary_loss_clip": 0.00990356, "auxiliary_loss_mlp": 0.01013988, "balance_loss_clip": 1.01446486, "balance_loss_mlp": 1.01235473, "epoch": 0.615601984067338, "flos": 68811165014400.0, "grad_norm": 0.760761219232036, "language_loss": 0.57602662, "learning_rate": 1.3597109036000036e-06, "loss": 0.59607005, "num_input_tokens_seen": 220509730, "step": 10239, "time_per_iteration": 3.3009963035583496 }, { "auxiliary_loss_clip": 0.01096252, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.03823948, "balance_loss_mlp": 1.01997805, "epoch": 0.615662107320006, "flos": 15517144598400.0, "grad_norm": 1.7796767695280624, "language_loss": 0.77439094, "learning_rate": 1.3593419519323892e-06, "loss": 0.79568958, "num_input_tokens_seen": 220527295, "step": 10240, "time_per_iteration": 2.7327582836151123 }, { "auxiliary_loss_clip": 0.011174, "auxiliary_loss_mlp": 0.01036437, "balance_loss_clip": 1.04190874, "balance_loss_mlp": 1.02288342, "epoch": 0.615722230572674, "flos": 21063691128960.0, "grad_norm": 3.4544456151934315, "language_loss": 0.73013711, "learning_rate": 1.3589730245585922e-06, "loss": 0.75167549, "num_input_tokens_seen": 220542730, "step": 10241, "time_per_iteration": 2.6023552417755127 }, { "auxiliary_loss_clip": 0.01112719, "auxiliary_loss_mlp": 0.01028871, "balance_loss_clip": 1.03958392, "balance_loss_mlp": 1.01619887, "epoch": 0.615782353825342, "flos": 23256791919360.0, "grad_norm": 1.6070807308789545, "language_loss": 0.72045815, "learning_rate": 1.3586041214926018e-06, "loss": 0.7418741, "num_input_tokens_seen": 220562995, "step": 10242, "time_per_iteration": 2.6226117610931396 }, { "auxiliary_loss_clip": 0.0110498, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 1.04025722, "balance_loss_mlp": 1.01723933, "epoch": 0.6158424770780099, "flos": 21103911383040.0, "grad_norm": 3.2758585328662693, "language_loss": 0.72332186, "learning_rate": 1.3582352427484086e-06, "loss": 0.74466866, "num_input_tokens_seen": 220581775, "step": 10243, "time_per_iteration": 2.6781527996063232 }, { "auxiliary_loss_clip": 0.01030422, "auxiliary_loss_mlp": 0.01003075, "balance_loss_clip": 1.01600218, "balance_loss_mlp": 1.00200224, "epoch": 0.6159026003306779, "flos": 70333276769280.0, "grad_norm": 0.7877801586989086, "language_loss": 0.56873554, "learning_rate": 1.3578663883399984e-06, "loss": 0.5890705, "num_input_tokens_seen": 220646395, "step": 10244, "time_per_iteration": 3.2125418186187744 }, { "auxiliary_loss_clip": 0.01114981, "auxiliary_loss_mlp": 0.01034292, "balance_loss_clip": 1.03982329, "balance_loss_mlp": 1.02022541, "epoch": 0.6159627235833458, "flos": 33874355802240.0, "grad_norm": 1.5742269245602847, "language_loss": 0.63524461, "learning_rate": 1.3574975582813593e-06, "loss": 0.65673733, "num_input_tokens_seen": 220668335, "step": 10245, "time_per_iteration": 2.7619571685791016 }, { "auxiliary_loss_clip": 0.01065921, "auxiliary_loss_mlp": 0.01029863, "balance_loss_clip": 1.03640854, "balance_loss_mlp": 1.01676226, "epoch": 0.6160228468360138, "flos": 26575440359040.0, "grad_norm": 2.04251017264565, "language_loss": 0.79142463, "learning_rate": 1.3571287525864771e-06, "loss": 0.81238246, "num_input_tokens_seen": 220688915, "step": 10246, "time_per_iteration": 2.799443483352661 }, { "auxiliary_loss_clip": 0.01079892, "auxiliary_loss_mlp": 0.00772846, "balance_loss_clip": 1.03852773, "balance_loss_mlp": 1.00013709, "epoch": 0.6160829700886818, "flos": 17193274894080.0, "grad_norm": 3.4946061818357115, "language_loss": 0.87453389, "learning_rate": 1.3567599712693368e-06, "loss": 0.89306134, "num_input_tokens_seen": 220703465, "step": 10247, "time_per_iteration": 2.652655839920044 }, { "auxiliary_loss_clip": 0.01044965, "auxiliary_loss_mlp": 0.01035448, "balance_loss_clip": 1.03624761, "balance_loss_mlp": 1.02157784, "epoch": 0.6161430933413498, "flos": 23623547736960.0, "grad_norm": 1.6669970799602325, "language_loss": 0.79791045, "learning_rate": 1.3563912143439235e-06, "loss": 0.81871456, "num_input_tokens_seen": 220722090, "step": 10248, "time_per_iteration": 2.742093563079834 }, { "auxiliary_loss_clip": 0.01068661, "auxiliary_loss_mlp": 0.010344, "balance_loss_clip": 1.03618228, "balance_loss_mlp": 1.02193117, "epoch": 0.6162032165940178, "flos": 23002436736000.0, "grad_norm": 3.2255403010195884, "language_loss": 0.87085855, "learning_rate": 1.3560224818242191e-06, "loss": 0.89188921, "num_input_tokens_seen": 220741075, "step": 10249, "time_per_iteration": 2.7385706901550293 }, { "auxiliary_loss_clip": 0.01115811, "auxiliary_loss_mlp": 0.01026714, "balance_loss_clip": 1.04125154, "balance_loss_mlp": 1.01251006, "epoch": 0.6162633398466857, "flos": 39421979740800.0, "grad_norm": 2.234106446125174, "language_loss": 0.69080746, "learning_rate": 1.3556537737242072e-06, "loss": 0.71223265, "num_input_tokens_seen": 220763395, "step": 10250, "time_per_iteration": 2.736942768096924 }, { "auxiliary_loss_clip": 0.0108508, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.03718221, "balance_loss_mlp": 1.01533055, "epoch": 0.6163234630993537, "flos": 19244672530560.0, "grad_norm": 1.84490130709099, "language_loss": 0.74013072, "learning_rate": 1.3552850900578692e-06, "loss": 0.76125979, "num_input_tokens_seen": 220780640, "step": 10251, "time_per_iteration": 2.736994504928589 }, { "auxiliary_loss_clip": 0.01098297, "auxiliary_loss_mlp": 0.01035781, "balance_loss_clip": 1.03710103, "balance_loss_mlp": 1.02119529, "epoch": 0.6163835863520216, "flos": 15961791058560.0, "grad_norm": 2.3749552307580615, "language_loss": 0.68138051, "learning_rate": 1.3549164308391844e-06, "loss": 0.7027213, "num_input_tokens_seen": 220797960, "step": 10252, "time_per_iteration": 2.5879385471343994 }, { "auxiliary_loss_clip": 0.00977001, "auxiliary_loss_mlp": 0.01000711, "balance_loss_clip": 1.01395059, "balance_loss_mlp": 0.9993996, "epoch": 0.6164437096046896, "flos": 68103834393600.0, "grad_norm": 0.8911370167619598, "language_loss": 0.57833099, "learning_rate": 1.3545477960821333e-06, "loss": 0.59810811, "num_input_tokens_seen": 220856930, "step": 10253, "time_per_iteration": 6.5962769985198975 }, { "auxiliary_loss_clip": 0.0109176, "auxiliary_loss_mlp": 0.01033043, "balance_loss_clip": 1.03666162, "balance_loss_mlp": 1.01960826, "epoch": 0.6165038328573575, "flos": 21361211481600.0, "grad_norm": 1.506953433371801, "language_loss": 0.80028725, "learning_rate": 1.3541791858006946e-06, "loss": 0.82153523, "num_input_tokens_seen": 220877595, "step": 10254, "time_per_iteration": 4.457768678665161 }, { "auxiliary_loss_clip": 0.01092373, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.04135227, "balance_loss_mlp": 1.01689541, "epoch": 0.6165639561100256, "flos": 21101972048640.0, "grad_norm": 2.217497401179692, "language_loss": 0.80495244, "learning_rate": 1.353810600008846e-06, "loss": 0.82617688, "num_input_tokens_seen": 220896880, "step": 10255, "time_per_iteration": 2.730621814727783 }, { "auxiliary_loss_clip": 0.010977, "auxiliary_loss_mlp": 0.0103255, "balance_loss_clip": 1.04147696, "balance_loss_mlp": 1.01882291, "epoch": 0.6166240793626935, "flos": 25338533569920.0, "grad_norm": 2.145694668444534, "language_loss": 0.65628386, "learning_rate": 1.3534420387205646e-06, "loss": 0.67758632, "num_input_tokens_seen": 220916425, "step": 10256, "time_per_iteration": 2.7114098072052 }, { "auxiliary_loss_clip": 0.01103834, "auxiliary_loss_mlp": 0.01031477, "balance_loss_clip": 1.04223847, "balance_loss_mlp": 1.01924038, "epoch": 0.6166842026153615, "flos": 19682639061120.0, "grad_norm": 1.5926214774863399, "language_loss": 0.7198689, "learning_rate": 1.353073501949825e-06, "loss": 0.74122202, "num_input_tokens_seen": 220935050, "step": 10257, "time_per_iteration": 2.633733034133911 }, { "auxiliary_loss_clip": 0.01096088, "auxiliary_loss_mlp": 0.01034844, "balance_loss_clip": 1.04075146, "balance_loss_mlp": 1.02102792, "epoch": 0.6167443258680294, "flos": 19318361281920.0, "grad_norm": 1.5727725354833466, "language_loss": 0.72232676, "learning_rate": 1.3527049897106034e-06, "loss": 0.74363607, "num_input_tokens_seen": 220953085, "step": 10258, "time_per_iteration": 4.227793455123901 }, { "auxiliary_loss_clip": 0.010877, "auxiliary_loss_mlp": 0.01041882, "balance_loss_clip": 1.03643775, "balance_loss_mlp": 1.02724326, "epoch": 0.6168044491206974, "flos": 25265239868160.0, "grad_norm": 2.5764422484709026, "language_loss": 0.63939095, "learning_rate": 1.3523365020168735e-06, "loss": 0.66068673, "num_input_tokens_seen": 220969050, "step": 10259, "time_per_iteration": 2.66133713722229 }, { "auxiliary_loss_clip": 0.01079598, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.04043519, "balance_loss_mlp": 1.01882792, "epoch": 0.6168645723733654, "flos": 13219903301760.0, "grad_norm": 1.7806797732317314, "language_loss": 0.71367824, "learning_rate": 1.3519680388826084e-06, "loss": 0.73479903, "num_input_tokens_seen": 220985825, "step": 10260, "time_per_iteration": 2.7046947479248047 }, { "auxiliary_loss_clip": 0.01112627, "auxiliary_loss_mlp": 0.010363, "balance_loss_clip": 1.04544723, "balance_loss_mlp": 1.02161956, "epoch": 0.6169246956260334, "flos": 26652038112000.0, "grad_norm": 2.1654324787038366, "language_loss": 0.68724519, "learning_rate": 1.3515996003217803e-06, "loss": 0.70873445, "num_input_tokens_seen": 221004465, "step": 10261, "time_per_iteration": 2.6891751289367676 }, { "auxiliary_loss_clip": 0.01077329, "auxiliary_loss_mlp": 0.01039226, "balance_loss_clip": 1.03780389, "balance_loss_mlp": 1.02766895, "epoch": 0.6169848188787014, "flos": 23148413608320.0, "grad_norm": 2.004758584780846, "language_loss": 0.71780062, "learning_rate": 1.3512311863483602e-06, "loss": 0.73896611, "num_input_tokens_seen": 221023260, "step": 10262, "time_per_iteration": 2.7089951038360596 }, { "auxiliary_loss_clip": 0.01096265, "auxiliary_loss_mlp": 0.01036729, "balance_loss_clip": 1.0397017, "balance_loss_mlp": 1.02370548, "epoch": 0.6170449421313693, "flos": 23331917214720.0, "grad_norm": 1.9399509227658047, "language_loss": 0.70199084, "learning_rate": 1.3508627969763188e-06, "loss": 0.72332084, "num_input_tokens_seen": 221043090, "step": 10263, "time_per_iteration": 2.750321865081787 }, { "auxiliary_loss_clip": 0.01051355, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.03560829, "balance_loss_mlp": 1.01777899, "epoch": 0.6171050653840373, "flos": 15851617067520.0, "grad_norm": 2.2572438712768217, "language_loss": 0.75942671, "learning_rate": 1.3504944322196244e-06, "loss": 0.78024954, "num_input_tokens_seen": 221061435, "step": 10264, "time_per_iteration": 2.868535041809082 }, { "auxiliary_loss_clip": 0.0111535, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.04105282, "balance_loss_mlp": 1.0207653, "epoch": 0.6171651886367052, "flos": 20045516209920.0, "grad_norm": 2.372576687009926, "language_loss": 0.85552394, "learning_rate": 1.350126092092247e-06, "loss": 0.87702072, "num_input_tokens_seen": 221078705, "step": 10265, "time_per_iteration": 2.8565142154693604 }, { "auxiliary_loss_clip": 0.01067477, "auxiliary_loss_mlp": 0.01039322, "balance_loss_clip": 1.04373622, "balance_loss_mlp": 1.0262332, "epoch": 0.6172253118893732, "flos": 26432695710720.0, "grad_norm": 1.8305416019911092, "language_loss": 0.64584678, "learning_rate": 1.349757776608153e-06, "loss": 0.66691476, "num_input_tokens_seen": 221099245, "step": 10266, "time_per_iteration": 2.8642327785491943 }, { "auxiliary_loss_clip": 0.01077105, "auxiliary_loss_mlp": 0.01033494, "balance_loss_clip": 1.03542173, "balance_loss_mlp": 1.02038074, "epoch": 0.6172854351420412, "flos": 22632879657600.0, "grad_norm": 1.5801224931645446, "language_loss": 0.75690526, "learning_rate": 1.3493894857813094e-06, "loss": 0.77801126, "num_input_tokens_seen": 221116930, "step": 10267, "time_per_iteration": 2.6700358390808105 }, { "auxiliary_loss_clip": 0.01085691, "auxiliary_loss_mlp": 0.01032961, "balance_loss_clip": 1.03821349, "balance_loss_mlp": 1.01927543, "epoch": 0.6173455583947092, "flos": 21212936138880.0, "grad_norm": 1.8670913933452218, "language_loss": 0.75156605, "learning_rate": 1.3490212196256818e-06, "loss": 0.77275252, "num_input_tokens_seen": 221137660, "step": 10268, "time_per_iteration": 2.696876287460327 }, { "auxiliary_loss_clip": 0.01094834, "auxiliary_loss_mlp": 0.01028833, "balance_loss_clip": 1.03917122, "balance_loss_mlp": 1.01574397, "epoch": 0.6174056816473771, "flos": 19500284689920.0, "grad_norm": 1.6535000846549075, "language_loss": 0.75516117, "learning_rate": 1.3486529781552342e-06, "loss": 0.77639782, "num_input_tokens_seen": 221156225, "step": 10269, "time_per_iteration": 2.602811098098755 }, { "auxiliary_loss_clip": 0.01112983, "auxiliary_loss_mlp": 0.01032291, "balance_loss_clip": 1.03888416, "balance_loss_mlp": 1.01934433, "epoch": 0.6174658049000451, "flos": 15997342544640.0, "grad_norm": 2.0658565412775864, "language_loss": 0.76633871, "learning_rate": 1.3482847613839318e-06, "loss": 0.78779137, "num_input_tokens_seen": 221173820, "step": 10270, "time_per_iteration": 2.4974937438964844 }, { "auxiliary_loss_clip": 0.01094367, "auxiliary_loss_mlp": 0.01029373, "balance_loss_clip": 1.03897905, "balance_loss_mlp": 1.01614046, "epoch": 0.617525928152713, "flos": 21903893136000.0, "grad_norm": 1.7132984501088018, "language_loss": 0.82571089, "learning_rate": 1.347916569325736e-06, "loss": 0.84694827, "num_input_tokens_seen": 221191815, "step": 10271, "time_per_iteration": 2.5579023361206055 }, { "auxiliary_loss_clip": 0.01117578, "auxiliary_loss_mlp": 0.00770278, "balance_loss_clip": 1.04181647, "balance_loss_mlp": 1.00026119, "epoch": 0.617586051405381, "flos": 21105958458240.0, "grad_norm": 1.7753710890796277, "language_loss": 0.77119303, "learning_rate": 1.3475484019946093e-06, "loss": 0.79007161, "num_input_tokens_seen": 221211205, "step": 10272, "time_per_iteration": 2.5040929317474365 }, { "auxiliary_loss_clip": 0.01010193, "auxiliary_loss_mlp": 0.01008445, "balance_loss_clip": 1.01500225, "balance_loss_mlp": 1.00734258, "epoch": 0.617646174658049, "flos": 58610776665600.0, "grad_norm": 0.8102559733678494, "language_loss": 0.59036177, "learning_rate": 1.347180259404513e-06, "loss": 0.61054814, "num_input_tokens_seen": 221268430, "step": 10273, "time_per_iteration": 3.0667202472686768 }, { "auxiliary_loss_clip": 0.0108364, "auxiliary_loss_mlp": 0.01039906, "balance_loss_clip": 1.03496802, "balance_loss_mlp": 1.02545786, "epoch": 0.617706297910717, "flos": 13878684691200.0, "grad_norm": 2.411144915020525, "language_loss": 0.73045421, "learning_rate": 1.3468121415694059e-06, "loss": 0.75168967, "num_input_tokens_seen": 221281930, "step": 10274, "time_per_iteration": 2.608651638031006 }, { "auxiliary_loss_clip": 0.0110423, "auxiliary_loss_mlp": 0.00770133, "balance_loss_clip": 1.04004967, "balance_loss_mlp": 1.00015223, "epoch": 0.617766421163385, "flos": 19208438686080.0, "grad_norm": 2.134780547516878, "language_loss": 0.77694172, "learning_rate": 1.3464440485032484e-06, "loss": 0.79568529, "num_input_tokens_seen": 221301605, "step": 10275, "time_per_iteration": 2.588878631591797 }, { "auxiliary_loss_clip": 0.01073523, "auxiliary_loss_mlp": 0.01029844, "balance_loss_clip": 1.03674793, "balance_loss_mlp": 1.01733303, "epoch": 0.6178265444160529, "flos": 22565978576640.0, "grad_norm": 2.554653383498776, "language_loss": 0.79304695, "learning_rate": 1.346075980219998e-06, "loss": 0.8140806, "num_input_tokens_seen": 221320105, "step": 10276, "time_per_iteration": 2.704596757888794 }, { "auxiliary_loss_clip": 0.0104785, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.03442883, "balance_loss_mlp": 1.02518225, "epoch": 0.6178866676687209, "flos": 11984289402240.0, "grad_norm": 1.984181156670454, "language_loss": 0.80967486, "learning_rate": 1.345707936733612e-06, "loss": 0.83054686, "num_input_tokens_seen": 221335915, "step": 10277, "time_per_iteration": 2.7356364727020264 }, { "auxiliary_loss_clip": 0.01088845, "auxiliary_loss_mlp": 0.01030881, "balance_loss_clip": 1.04154968, "balance_loss_mlp": 1.01682067, "epoch": 0.6179467909213888, "flos": 20991510748800.0, "grad_norm": 1.5775634797191704, "language_loss": 0.81279171, "learning_rate": 1.3453399180580466e-06, "loss": 0.83398896, "num_input_tokens_seen": 221353965, "step": 10278, "time_per_iteration": 2.703054666519165 }, { "auxiliary_loss_clip": 0.0106686, "auxiliary_loss_mlp": 0.00769812, "balance_loss_clip": 1.03503084, "balance_loss_mlp": 1.00006652, "epoch": 0.6180069141740568, "flos": 25338102606720.0, "grad_norm": 1.5156506321196916, "language_loss": 0.74347699, "learning_rate": 1.3449719242072567e-06, "loss": 0.76184368, "num_input_tokens_seen": 221374080, "step": 10279, "time_per_iteration": 2.777080774307251 }, { "auxiliary_loss_clip": 0.01096628, "auxiliary_loss_mlp": 0.010318, "balance_loss_clip": 1.03583622, "balance_loss_mlp": 1.01950932, "epoch": 0.6180670374267248, "flos": 19645722858240.0, "grad_norm": 1.5230976022896776, "language_loss": 0.70880997, "learning_rate": 1.3446039551951975e-06, "loss": 0.73009425, "num_input_tokens_seen": 221392910, "step": 10280, "time_per_iteration": 2.682345151901245 }, { "auxiliary_loss_clip": 0.01116485, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.04136443, "balance_loss_mlp": 1.02197635, "epoch": 0.6181271606793928, "flos": 19464876858240.0, "grad_norm": 1.5388475151652443, "language_loss": 0.72637439, "learning_rate": 1.3442360110358215e-06, "loss": 0.74789023, "num_input_tokens_seen": 221410990, "step": 10281, "time_per_iteration": 2.546891927719116 }, { "auxiliary_loss_clip": 0.01091569, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.04059482, "balance_loss_mlp": 1.01733923, "epoch": 0.6181872839320607, "flos": 25594289383680.0, "grad_norm": 1.5263826245103997, "language_loss": 0.76680994, "learning_rate": 1.3438680917430827e-06, "loss": 0.78801513, "num_input_tokens_seen": 221431020, "step": 10282, "time_per_iteration": 2.6794841289520264 }, { "auxiliary_loss_clip": 0.0108706, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.03559065, "balance_loss_mlp": 1.01857102, "epoch": 0.6182474071847287, "flos": 25551806572800.0, "grad_norm": 1.675077981875324, "language_loss": 0.69088876, "learning_rate": 1.343500197330931e-06, "loss": 0.71210587, "num_input_tokens_seen": 221453235, "step": 10283, "time_per_iteration": 2.704653263092041 }, { "auxiliary_loss_clip": 0.01110364, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.03980327, "balance_loss_mlp": 1.01613414, "epoch": 0.6183075304373966, "flos": 22123738327680.0, "grad_norm": 1.6796519430341141, "language_loss": 0.75191927, "learning_rate": 1.3431323278133176e-06, "loss": 0.77332163, "num_input_tokens_seen": 221472560, "step": 10284, "time_per_iteration": 2.613283395767212 }, { "auxiliary_loss_clip": 0.010977, "auxiliary_loss_mlp": 0.01036897, "balance_loss_clip": 1.04041815, "balance_loss_mlp": 1.02422476, "epoch": 0.6183676536900646, "flos": 22455589104000.0, "grad_norm": 1.4535785054838537, "language_loss": 0.75249875, "learning_rate": 1.3427644832041922e-06, "loss": 0.77384472, "num_input_tokens_seen": 221492835, "step": 10285, "time_per_iteration": 2.661404848098755 }, { "auxiliary_loss_clip": 0.01076492, "auxiliary_loss_mlp": 0.01032852, "balance_loss_clip": 1.03464127, "balance_loss_mlp": 1.02047253, "epoch": 0.6184277769427327, "flos": 23364128736000.0, "grad_norm": 1.9348516071602069, "language_loss": 0.72801822, "learning_rate": 1.342396663517503e-06, "loss": 0.74911165, "num_input_tokens_seen": 221511870, "step": 10286, "time_per_iteration": 2.7692575454711914 }, { "auxiliary_loss_clip": 0.01112181, "auxiliary_loss_mlp": 0.01029502, "balance_loss_clip": 1.03996992, "balance_loss_mlp": 1.01705098, "epoch": 0.6184879001954006, "flos": 22711057608960.0, "grad_norm": 1.6994058202973141, "language_loss": 0.76147521, "learning_rate": 1.342028868767199e-06, "loss": 0.78289199, "num_input_tokens_seen": 221529915, "step": 10287, "time_per_iteration": 2.737244129180908 }, { "auxiliary_loss_clip": 0.01075986, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.038939, "balance_loss_mlp": 1.02116728, "epoch": 0.6185480234480686, "flos": 23841920471040.0, "grad_norm": 1.661792493637227, "language_loss": 0.73342609, "learning_rate": 1.3416610989672262e-06, "loss": 0.75452453, "num_input_tokens_seen": 221549745, "step": 10288, "time_per_iteration": 2.738234281539917 }, { "auxiliary_loss_clip": 0.01099888, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.03925002, "balance_loss_mlp": 1.0233885, "epoch": 0.6186081467007365, "flos": 45477595774080.0, "grad_norm": 1.4788464659042324, "language_loss": 0.72843671, "learning_rate": 1.3412933541315296e-06, "loss": 0.7497896, "num_input_tokens_seen": 221572455, "step": 10289, "time_per_iteration": 2.870210886001587 }, { "auxiliary_loss_clip": 0.01088106, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.0376215, "balance_loss_mlp": 1.01749849, "epoch": 0.6186682699534045, "flos": 23550864566400.0, "grad_norm": 1.4742798847115595, "language_loss": 0.79430723, "learning_rate": 1.340925634274056e-06, "loss": 0.81549788, "num_input_tokens_seen": 221591325, "step": 10290, "time_per_iteration": 2.7061526775360107 }, { "auxiliary_loss_clip": 0.01104029, "auxiliary_loss_mlp": 0.01033504, "balance_loss_clip": 1.03934646, "balance_loss_mlp": 1.02068937, "epoch": 0.6187283932060724, "flos": 25774201630080.0, "grad_norm": 1.6274786697127714, "language_loss": 0.81492877, "learning_rate": 1.3405579394087475e-06, "loss": 0.83630407, "num_input_tokens_seen": 221611640, "step": 10291, "time_per_iteration": 2.664706230163574 }, { "auxiliary_loss_clip": 0.01114199, "auxiliary_loss_mlp": 0.01034165, "balance_loss_clip": 1.04050338, "balance_loss_mlp": 1.02185655, "epoch": 0.6187885164587404, "flos": 25265203954560.0, "grad_norm": 1.5926453232151345, "language_loss": 0.77492392, "learning_rate": 1.3401902695495487e-06, "loss": 0.79640758, "num_input_tokens_seen": 221631225, "step": 10292, "time_per_iteration": 4.222437381744385 }, { "auxiliary_loss_clip": 0.01085532, "auxiliary_loss_mlp": 0.01041109, "balance_loss_clip": 1.03610599, "balance_loss_mlp": 1.02526617, "epoch": 0.6188486397114084, "flos": 26250772302720.0, "grad_norm": 2.004631291368857, "language_loss": 0.7354871, "learning_rate": 1.339822624710401e-06, "loss": 0.75675344, "num_input_tokens_seen": 221651035, "step": 10293, "time_per_iteration": 4.283612251281738 }, { "auxiliary_loss_clip": 0.01083695, "auxiliary_loss_mlp": 0.0077033, "balance_loss_clip": 1.03986382, "balance_loss_mlp": 1.00014317, "epoch": 0.6189087629640764, "flos": 20923388605440.0, "grad_norm": 1.9118403389506524, "language_loss": 0.8346625, "learning_rate": 1.3394550049052454e-06, "loss": 0.85320276, "num_input_tokens_seen": 221671300, "step": 10294, "time_per_iteration": 4.339020013809204 }, { "auxiliary_loss_clip": 0.01097661, "auxiliary_loss_mlp": 0.01034696, "balance_loss_clip": 1.04166722, "balance_loss_mlp": 1.02219725, "epoch": 0.6189688862167443, "flos": 14829814874880.0, "grad_norm": 2.141454584748579, "language_loss": 0.706837, "learning_rate": 1.3390874101480225e-06, "loss": 0.72816062, "num_input_tokens_seen": 221687320, "step": 10295, "time_per_iteration": 2.631901264190674 }, { "auxiliary_loss_clip": 0.01115282, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.04228771, "balance_loss_mlp": 1.02599859, "epoch": 0.6190290094694123, "flos": 24285058560000.0, "grad_norm": 1.7512650583676883, "language_loss": 0.70329851, "learning_rate": 1.3387198404526705e-06, "loss": 0.72484434, "num_input_tokens_seen": 221710175, "step": 10296, "time_per_iteration": 2.689392566680908 }, { "auxiliary_loss_clip": 0.01081279, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.03957784, "balance_loss_mlp": 1.02048767, "epoch": 0.6190891327220802, "flos": 22529457423360.0, "grad_norm": 1.9634695381003797, "language_loss": 0.71536231, "learning_rate": 1.3383522958331287e-06, "loss": 0.73652285, "num_input_tokens_seen": 221728145, "step": 10297, "time_per_iteration": 2.7065582275390625 }, { "auxiliary_loss_clip": 0.01036404, "auxiliary_loss_mlp": 0.01000643, "balance_loss_clip": 1.01235867, "balance_loss_mlp": 0.99964732, "epoch": 0.6191492559747482, "flos": 67729357152000.0, "grad_norm": 0.8790158844538737, "language_loss": 0.64109659, "learning_rate": 1.3379847763033345e-06, "loss": 0.66146708, "num_input_tokens_seen": 221786100, "step": 10298, "time_per_iteration": 4.634017467498779 }, { "auxiliary_loss_clip": 0.01116645, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.04158056, "balance_loss_mlp": 1.02121425, "epoch": 0.6192093792274163, "flos": 22346672088960.0, "grad_norm": 1.7807348336033566, "language_loss": 0.74117303, "learning_rate": 1.3376172818772236e-06, "loss": 0.762676, "num_input_tokens_seen": 221806450, "step": 10299, "time_per_iteration": 2.6040680408477783 }, { "auxiliary_loss_clip": 0.01108454, "auxiliary_loss_mlp": 0.01030477, "balance_loss_clip": 1.0418222, "balance_loss_mlp": 1.01792383, "epoch": 0.6192695024800842, "flos": 13553944807680.0, "grad_norm": 1.8290075775776669, "language_loss": 0.68678868, "learning_rate": 1.337249812568732e-06, "loss": 0.70817792, "num_input_tokens_seen": 221823330, "step": 10300, "time_per_iteration": 2.641167163848877 }, { "auxiliary_loss_clip": 0.01101551, "auxiliary_loss_mlp": 0.00770748, "balance_loss_clip": 1.04044676, "balance_loss_mlp": 1.00015926, "epoch": 0.6193296257327522, "flos": 17415310815360.0, "grad_norm": 1.7786248978132038, "language_loss": 0.66813135, "learning_rate": 1.3368823683917939e-06, "loss": 0.68685436, "num_input_tokens_seen": 221839360, "step": 10301, "time_per_iteration": 2.639004945755005 }, { "auxiliary_loss_clip": 0.01072819, "auxiliary_loss_mlp": 0.01035838, "balance_loss_clip": 1.03622746, "balance_loss_mlp": 1.02365446, "epoch": 0.6193897489854201, "flos": 31101118450560.0, "grad_norm": 1.5932766793388897, "language_loss": 0.72753853, "learning_rate": 1.3365149493603424e-06, "loss": 0.74862504, "num_input_tokens_seen": 221859465, "step": 10302, "time_per_iteration": 2.7263267040252686 }, { "auxiliary_loss_clip": 0.01090931, "auxiliary_loss_mlp": 0.01030697, "balance_loss_clip": 1.0426929, "balance_loss_mlp": 1.01734614, "epoch": 0.6194498722380881, "flos": 19134031662720.0, "grad_norm": 1.7635802463343486, "language_loss": 0.80626869, "learning_rate": 1.3361475554883107e-06, "loss": 0.82748497, "num_input_tokens_seen": 221878555, "step": 10303, "time_per_iteration": 2.674865961074829 }, { "auxiliary_loss_clip": 0.01117513, "auxiliary_loss_mlp": 0.01032597, "balance_loss_clip": 1.04101253, "balance_loss_mlp": 1.01882231, "epoch": 0.619509995490756, "flos": 21835088634240.0, "grad_norm": 4.546006861231834, "language_loss": 0.76722652, "learning_rate": 1.3357801867896307e-06, "loss": 0.78872764, "num_input_tokens_seen": 221898790, "step": 10304, "time_per_iteration": 2.578068256378174 }, { "auxiliary_loss_clip": 0.01085456, "auxiliary_loss_mlp": 0.0103497, "balance_loss_clip": 1.04078317, "balance_loss_mlp": 1.02160096, "epoch": 0.619570118743424, "flos": 23806548552960.0, "grad_norm": 2.037308303130727, "language_loss": 0.77085918, "learning_rate": 1.3354128432782324e-06, "loss": 0.79206347, "num_input_tokens_seen": 221918875, "step": 10305, "time_per_iteration": 2.6557652950286865 }, { "auxiliary_loss_clip": 0.01112573, "auxiliary_loss_mlp": 0.01033009, "balance_loss_clip": 1.04317331, "balance_loss_mlp": 1.01832271, "epoch": 0.619630241996092, "flos": 21101612912640.0, "grad_norm": 1.5905815409224004, "language_loss": 0.7876581, "learning_rate": 1.335045524968045e-06, "loss": 0.80911398, "num_input_tokens_seen": 221937895, "step": 10306, "time_per_iteration": 2.58312726020813 }, { "auxiliary_loss_clip": 0.01056494, "auxiliary_loss_mlp": 0.01030878, "balance_loss_clip": 1.03859866, "balance_loss_mlp": 1.0192728, "epoch": 0.61969036524876, "flos": 27308269635840.0, "grad_norm": 1.649742748314876, "language_loss": 0.80246294, "learning_rate": 1.3346782318729988e-06, "loss": 0.82333666, "num_input_tokens_seen": 221955920, "step": 10307, "time_per_iteration": 2.7693941593170166 }, { "auxiliary_loss_clip": 0.01001046, "auxiliary_loss_mlp": 0.01015241, "balance_loss_clip": 1.01444507, "balance_loss_mlp": 1.0141207, "epoch": 0.6197504885014279, "flos": 51648955384320.0, "grad_norm": 0.8068090756771118, "language_loss": 0.59387553, "learning_rate": 1.3343109640070203e-06, "loss": 0.61403841, "num_input_tokens_seen": 222011405, "step": 10308, "time_per_iteration": 3.2183339595794678 }, { "auxiliary_loss_clip": 0.01087174, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.03852654, "balance_loss_mlp": 1.01956522, "epoch": 0.6198106117540959, "flos": 30557107992960.0, "grad_norm": 1.7201847601109612, "language_loss": 0.67907512, "learning_rate": 1.333943721384037e-06, "loss": 0.70025527, "num_input_tokens_seen": 222034545, "step": 10309, "time_per_iteration": 2.728565216064453 }, { "auxiliary_loss_clip": 0.01083478, "auxiliary_loss_mlp": 0.0103687, "balance_loss_clip": 1.03543091, "balance_loss_mlp": 1.02430511, "epoch": 0.6198707350067638, "flos": 18909733184640.0, "grad_norm": 1.5362872726536445, "language_loss": 0.72323126, "learning_rate": 1.3335765040179746e-06, "loss": 0.74443471, "num_input_tokens_seen": 222052690, "step": 10310, "time_per_iteration": 2.7349348068237305 }, { "auxiliary_loss_clip": 0.01098291, "auxiliary_loss_mlp": 0.01037346, "balance_loss_clip": 1.04345024, "balance_loss_mlp": 1.02295148, "epoch": 0.6199308582594318, "flos": 21433858738560.0, "grad_norm": 2.3493071886977948, "language_loss": 0.79078376, "learning_rate": 1.3332093119227573e-06, "loss": 0.81214017, "num_input_tokens_seen": 222069095, "step": 10311, "time_per_iteration": 2.682654857635498 }, { "auxiliary_loss_clip": 0.01081352, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 1.0394609, "balance_loss_mlp": 1.02252364, "epoch": 0.6199909815120999, "flos": 18407379525120.0, "grad_norm": 1.7307569913604643, "language_loss": 0.72513938, "learning_rate": 1.3328421451123105e-06, "loss": 0.74630797, "num_input_tokens_seen": 222087360, "step": 10312, "time_per_iteration": 2.677211284637451 }, { "auxiliary_loss_clip": 0.01071298, "auxiliary_loss_mlp": 0.01034778, "balance_loss_clip": 1.04210687, "balance_loss_mlp": 1.02137268, "epoch": 0.6200511047647678, "flos": 21466860359040.0, "grad_norm": 3.7235217852030926, "language_loss": 0.72115338, "learning_rate": 1.3324750036005557e-06, "loss": 0.74221408, "num_input_tokens_seen": 222106130, "step": 10313, "time_per_iteration": 2.7689011096954346 }, { "auxiliary_loss_clip": 0.01108898, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.04191053, "balance_loss_mlp": 1.01971102, "epoch": 0.6201112280174358, "flos": 18215903099520.0, "grad_norm": 1.7819666620639945, "language_loss": 0.78249431, "learning_rate": 1.332107887401416e-06, "loss": 0.80391562, "num_input_tokens_seen": 222123125, "step": 10314, "time_per_iteration": 2.618197441101074 }, { "auxiliary_loss_clip": 0.01102699, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.03891587, "balance_loss_mlp": 1.01907969, "epoch": 0.6201713512701037, "flos": 20011185786240.0, "grad_norm": 1.747606387674539, "language_loss": 0.78019774, "learning_rate": 1.331740796528812e-06, "loss": 0.80154061, "num_input_tokens_seen": 222140655, "step": 10315, "time_per_iteration": 2.6219210624694824 }, { "auxiliary_loss_clip": 0.01081861, "auxiliary_loss_mlp": 0.01033596, "balance_loss_clip": 1.0434972, "balance_loss_mlp": 1.02088857, "epoch": 0.6202314745227717, "flos": 22487692884480.0, "grad_norm": 2.153515207542012, "language_loss": 0.76050055, "learning_rate": 1.3313737309966641e-06, "loss": 0.78165507, "num_input_tokens_seen": 222160450, "step": 10316, "time_per_iteration": 2.766108989715576 }, { "auxiliary_loss_clip": 0.01115322, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.03810644, "balance_loss_mlp": 1.01903796, "epoch": 0.6202915977754396, "flos": 26828682220800.0, "grad_norm": 2.0292313073024366, "language_loss": 0.77797258, "learning_rate": 1.3310066908188915e-06, "loss": 0.79944646, "num_input_tokens_seen": 222179170, "step": 10317, "time_per_iteration": 2.66479754447937 }, { "auxiliary_loss_clip": 0.01017104, "auxiliary_loss_mlp": 0.01000773, "balance_loss_clip": 1.01230764, "balance_loss_mlp": 0.99964064, "epoch": 0.6203517210281076, "flos": 62742694890240.0, "grad_norm": 0.6983272901342329, "language_loss": 0.59043646, "learning_rate": 1.3306396760094122e-06, "loss": 0.61061525, "num_input_tokens_seen": 222242660, "step": 10318, "time_per_iteration": 3.26334547996521 }, { "auxiliary_loss_clip": 0.01087685, "auxiliary_loss_mlp": 0.01036361, "balance_loss_clip": 1.04098892, "balance_loss_mlp": 1.02262819, "epoch": 0.6204118442807756, "flos": 23404277162880.0, "grad_norm": 1.7402353399266621, "language_loss": 0.77895933, "learning_rate": 1.330272686582143e-06, "loss": 0.80019981, "num_input_tokens_seen": 222262170, "step": 10319, "time_per_iteration": 2.729206085205078 }, { "auxiliary_loss_clip": 0.01095977, "auxiliary_loss_mlp": 0.01036689, "balance_loss_clip": 1.04197454, "balance_loss_mlp": 1.02473831, "epoch": 0.6204719675334436, "flos": 20193647898240.0, "grad_norm": 1.990293472142164, "language_loss": 0.66651958, "learning_rate": 1.3299057225510013e-06, "loss": 0.6878463, "num_input_tokens_seen": 222280375, "step": 10320, "time_per_iteration": 2.6254241466522217 }, { "auxiliary_loss_clip": 0.0107265, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.03743291, "balance_loss_mlp": 1.02023411, "epoch": 0.6205320907861115, "flos": 13188050916480.0, "grad_norm": 1.82656973457559, "language_loss": 0.76147729, "learning_rate": 1.3295387839299013e-06, "loss": 0.78253013, "num_input_tokens_seen": 222297325, "step": 10321, "time_per_iteration": 2.7273271083831787 }, { "auxiliary_loss_clip": 0.01086085, "auxiliary_loss_mlp": 0.01026791, "balance_loss_clip": 1.03792763, "balance_loss_mlp": 1.01485252, "epoch": 0.6205922140387795, "flos": 20668386977280.0, "grad_norm": 1.806601811467465, "language_loss": 0.73700678, "learning_rate": 1.329171870732758e-06, "loss": 0.75813556, "num_input_tokens_seen": 222317095, "step": 10322, "time_per_iteration": 2.699514627456665 }, { "auxiliary_loss_clip": 0.01074398, "auxiliary_loss_mlp": 0.01028622, "balance_loss_clip": 1.03568387, "balance_loss_mlp": 1.01665354, "epoch": 0.6206523372914474, "flos": 23877831093120.0, "grad_norm": 1.7201277094728098, "language_loss": 0.72919118, "learning_rate": 1.3288049829734845e-06, "loss": 0.75022137, "num_input_tokens_seen": 222337055, "step": 10323, "time_per_iteration": 2.743650436401367 }, { "auxiliary_loss_clip": 0.01111352, "auxiliary_loss_mlp": 0.0103222, "balance_loss_clip": 1.04181314, "balance_loss_mlp": 1.0182364, "epoch": 0.6207124605441154, "flos": 13406603218560.0, "grad_norm": 2.6397698912445495, "language_loss": 0.58581293, "learning_rate": 1.3284381206659933e-06, "loss": 0.60724854, "num_input_tokens_seen": 222354515, "step": 10324, "time_per_iteration": 2.624112129211426 }, { "auxiliary_loss_clip": 0.0107635, "auxiliary_loss_mlp": 0.01039987, "balance_loss_clip": 1.03851843, "balance_loss_mlp": 1.02483535, "epoch": 0.6207725837967835, "flos": 18916341287040.0, "grad_norm": 1.9960731674186785, "language_loss": 0.77214384, "learning_rate": 1.3280712838241956e-06, "loss": 0.79330719, "num_input_tokens_seen": 222372755, "step": 10325, "time_per_iteration": 2.7152631282806396 }, { "auxiliary_loss_clip": 0.01106149, "auxiliary_loss_mlp": 0.01030383, "balance_loss_clip": 1.03993106, "balance_loss_mlp": 1.01689494, "epoch": 0.6208327070494514, "flos": 23980211832960.0, "grad_norm": 1.8479718801200147, "language_loss": 0.72421134, "learning_rate": 1.327704472462003e-06, "loss": 0.74557668, "num_input_tokens_seen": 222391380, "step": 10326, "time_per_iteration": 2.7786142826080322 }, { "auxiliary_loss_clip": 0.01108733, "auxiliary_loss_mlp": 0.01040278, "balance_loss_clip": 1.04103386, "balance_loss_mlp": 1.02686155, "epoch": 0.6208928303021194, "flos": 22820405587200.0, "grad_norm": 2.631988552550178, "language_loss": 0.74086714, "learning_rate": 1.3273376865933234e-06, "loss": 0.76235723, "num_input_tokens_seen": 222411165, "step": 10327, "time_per_iteration": 2.6204168796539307 }, { "auxiliary_loss_clip": 0.01090969, "auxiliary_loss_mlp": 0.01032322, "balance_loss_clip": 1.03982306, "balance_loss_mlp": 1.01871443, "epoch": 0.6209529535547873, "flos": 17564519911680.0, "grad_norm": 1.9488386802913455, "language_loss": 0.79213655, "learning_rate": 1.326970926232066e-06, "loss": 0.81336939, "num_input_tokens_seen": 222428110, "step": 10328, "time_per_iteration": 2.678966522216797 }, { "auxiliary_loss_clip": 0.01080917, "auxiliary_loss_mlp": 0.01040936, "balance_loss_clip": 1.03594792, "balance_loss_mlp": 1.02738202, "epoch": 0.6210130768074553, "flos": 22011912311040.0, "grad_norm": 1.6747137440925206, "language_loss": 0.77850568, "learning_rate": 1.3266041913921396e-06, "loss": 0.79972422, "num_input_tokens_seen": 222446385, "step": 10329, "time_per_iteration": 2.7247962951660156 }, { "auxiliary_loss_clip": 0.01022383, "auxiliary_loss_mlp": 0.01002444, "balance_loss_clip": 1.00971746, "balance_loss_mlp": 1.00120986, "epoch": 0.6210732000601232, "flos": 63676873854720.0, "grad_norm": 0.8323168859834922, "language_loss": 0.62231028, "learning_rate": 1.3262374820874484e-06, "loss": 0.64255857, "num_input_tokens_seen": 222502150, "step": 10330, "time_per_iteration": 3.1397132873535156 }, { "auxiliary_loss_clip": 0.01109711, "auxiliary_loss_mlp": 0.01039515, "balance_loss_clip": 1.04052687, "balance_loss_mlp": 1.02538919, "epoch": 0.6211333233127913, "flos": 24243365848320.0, "grad_norm": 1.916638297562339, "language_loss": 0.77865416, "learning_rate": 1.3258707983319002e-06, "loss": 0.80014634, "num_input_tokens_seen": 222519880, "step": 10331, "time_per_iteration": 4.165555715560913 }, { "auxiliary_loss_clip": 0.01119225, "auxiliary_loss_mlp": 0.01036042, "balance_loss_clip": 1.04211998, "balance_loss_mlp": 1.0226016, "epoch": 0.6211934465654592, "flos": 16943803960320.0, "grad_norm": 2.274669690788456, "language_loss": 0.67796123, "learning_rate": 1.3255041401393992e-06, "loss": 0.69951391, "num_input_tokens_seen": 222538545, "step": 10332, "time_per_iteration": 4.209641933441162 }, { "auxiliary_loss_clip": 0.01082735, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.03757524, "balance_loss_mlp": 1.0202266, "epoch": 0.6212535698181272, "flos": 15267386355840.0, "grad_norm": 1.6414227257739276, "language_loss": 0.76285797, "learning_rate": 1.3251375075238476e-06, "loss": 0.78401732, "num_input_tokens_seen": 222556935, "step": 10333, "time_per_iteration": 4.338353157043457 }, { "auxiliary_loss_clip": 0.01086354, "auxiliary_loss_mlp": 0.01035943, "balance_loss_clip": 1.03819084, "balance_loss_mlp": 1.02344966, "epoch": 0.6213136930707951, "flos": 13443950384640.0, "grad_norm": 2.217560323857708, "language_loss": 0.69773704, "learning_rate": 1.3247709004991507e-06, "loss": 0.71896005, "num_input_tokens_seen": 222574035, "step": 10334, "time_per_iteration": 2.6839816570281982 }, { "auxiliary_loss_clip": 0.01092709, "auxiliary_loss_mlp": 0.00770618, "balance_loss_clip": 1.03960049, "balance_loss_mlp": 1.00011337, "epoch": 0.6213738163234631, "flos": 18111223889280.0, "grad_norm": 1.6672758368774196, "language_loss": 0.69724143, "learning_rate": 1.3244043190792078e-06, "loss": 0.71587467, "num_input_tokens_seen": 222592290, "step": 10335, "time_per_iteration": 2.6737349033355713 }, { "auxiliary_loss_clip": 0.01059124, "auxiliary_loss_mlp": 0.01035916, "balance_loss_clip": 1.03123188, "balance_loss_mlp": 1.02301764, "epoch": 0.621433939576131, "flos": 25337348421120.0, "grad_norm": 1.5976161024349493, "language_loss": 0.79976332, "learning_rate": 1.3240377632779213e-06, "loss": 0.82071376, "num_input_tokens_seen": 222612805, "step": 10336, "time_per_iteration": 2.747412919998169 }, { "auxiliary_loss_clip": 0.01113717, "auxiliary_loss_mlp": 0.01036201, "balance_loss_clip": 1.04143834, "balance_loss_mlp": 1.02375555, "epoch": 0.621494062828799, "flos": 22565619440640.0, "grad_norm": 1.7008650000144097, "language_loss": 0.73422229, "learning_rate": 1.3236712331091907e-06, "loss": 0.75572157, "num_input_tokens_seen": 222632260, "step": 10337, "time_per_iteration": 4.168013334274292 }, { "auxiliary_loss_clip": 0.01118051, "auxiliary_loss_mlp": 0.01039175, "balance_loss_clip": 1.04091513, "balance_loss_mlp": 1.0258832, "epoch": 0.621554186081467, "flos": 27417976750080.0, "grad_norm": 4.811980339506567, "language_loss": 0.63192534, "learning_rate": 1.3233047285869145e-06, "loss": 0.65349758, "num_input_tokens_seen": 222653570, "step": 10338, "time_per_iteration": 2.640453815460205 }, { "auxiliary_loss_clip": 0.01103195, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.0407145, "balance_loss_mlp": 1.0245744, "epoch": 0.621614309334135, "flos": 22346815743360.0, "grad_norm": 1.5973259219647309, "language_loss": 0.71490097, "learning_rate": 1.322938249724991e-06, "loss": 0.73630404, "num_input_tokens_seen": 222672480, "step": 10339, "time_per_iteration": 2.6346054077148438 }, { "auxiliary_loss_clip": 0.01062852, "auxiliary_loss_mlp": 0.01037431, "balance_loss_clip": 1.03612769, "balance_loss_mlp": 1.02370453, "epoch": 0.621674432586803, "flos": 19281229597440.0, "grad_norm": 1.7281695006377986, "language_loss": 0.69872439, "learning_rate": 1.3225717965373166e-06, "loss": 0.71972716, "num_input_tokens_seen": 222691200, "step": 10340, "time_per_iteration": 2.7176573276519775 }, { "auxiliary_loss_clip": 0.01067449, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.03537023, "balance_loss_mlp": 1.01955473, "epoch": 0.6217345558394709, "flos": 21609533180160.0, "grad_norm": 2.160368660473176, "language_loss": 0.68745732, "learning_rate": 1.322205369037788e-06, "loss": 0.70845366, "num_input_tokens_seen": 222709975, "step": 10341, "time_per_iteration": 2.667415142059326 }, { "auxiliary_loss_clip": 0.01105428, "auxiliary_loss_mlp": 0.01033663, "balance_loss_clip": 1.04163766, "balance_loss_mlp": 1.01951921, "epoch": 0.6217946790921389, "flos": 18004102554240.0, "grad_norm": 1.857842108735868, "language_loss": 0.8084417, "learning_rate": 1.321838967240299e-06, "loss": 0.82983261, "num_input_tokens_seen": 222729005, "step": 10342, "time_per_iteration": 2.6358642578125 }, { "auxiliary_loss_clip": 0.01016012, "auxiliary_loss_mlp": 0.01001969, "balance_loss_clip": 1.01067889, "balance_loss_mlp": 1.00081241, "epoch": 0.6218548023448068, "flos": 61973631768960.0, "grad_norm": 0.7777664565041693, "language_loss": 0.57339287, "learning_rate": 1.3214725911587452e-06, "loss": 0.59357268, "num_input_tokens_seen": 222786090, "step": 10343, "time_per_iteration": 3.105703830718994 }, { "auxiliary_loss_clip": 0.01071779, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.03384042, "balance_loss_mlp": 1.01972461, "epoch": 0.6219149255974749, "flos": 25739152934400.0, "grad_norm": 1.873733183159078, "language_loss": 0.7244643, "learning_rate": 1.3211062408070184e-06, "loss": 0.74549764, "num_input_tokens_seen": 222806100, "step": 10344, "time_per_iteration": 2.7128279209136963 }, { "auxiliary_loss_clip": 0.01106863, "auxiliary_loss_mlp": 0.01045674, "balance_loss_clip": 1.04245842, "balance_loss_mlp": 1.03368115, "epoch": 0.6219750488501428, "flos": 25411073086080.0, "grad_norm": 3.095022336982982, "language_loss": 0.60327411, "learning_rate": 1.3207399161990105e-06, "loss": 0.62479943, "num_input_tokens_seen": 222826575, "step": 10345, "time_per_iteration": 2.741757392883301 }, { "auxiliary_loss_clip": 0.01048609, "auxiliary_loss_mlp": 0.01041234, "balance_loss_clip": 1.03204262, "balance_loss_mlp": 1.02753103, "epoch": 0.6220351721028108, "flos": 20047383717120.0, "grad_norm": 1.8310337674001005, "language_loss": 0.77749038, "learning_rate": 1.320373617348614e-06, "loss": 0.79838884, "num_input_tokens_seen": 222845285, "step": 10346, "time_per_iteration": 2.770772695541382 }, { "auxiliary_loss_clip": 0.01080995, "auxiliary_loss_mlp": 0.01037279, "balance_loss_clip": 1.03780663, "balance_loss_mlp": 1.02326, "epoch": 0.6220952953554787, "flos": 27488397363840.0, "grad_norm": 1.684158236808197, "language_loss": 0.71739966, "learning_rate": 1.3200073442697171e-06, "loss": 0.73858243, "num_input_tokens_seen": 222864575, "step": 10347, "time_per_iteration": 2.708918333053589 }, { "auxiliary_loss_clip": 0.01099172, "auxiliary_loss_mlp": 0.01031988, "balance_loss_clip": 1.03707337, "balance_loss_mlp": 1.01956046, "epoch": 0.6221554186081467, "flos": 19207612673280.0, "grad_norm": 1.7479247707562864, "language_loss": 0.71972638, "learning_rate": 1.3196410969762108e-06, "loss": 0.74103796, "num_input_tokens_seen": 222884420, "step": 10348, "time_per_iteration": 2.7594058513641357 }, { "auxiliary_loss_clip": 0.01001862, "auxiliary_loss_mlp": 0.01006112, "balance_loss_clip": 1.01154137, "balance_loss_mlp": 1.00479472, "epoch": 0.6222155418608146, "flos": 62950939989120.0, "grad_norm": 0.816855091094188, "language_loss": 0.54121429, "learning_rate": 1.3192748754819815e-06, "loss": 0.56129414, "num_input_tokens_seen": 222944690, "step": 10349, "time_per_iteration": 3.2531776428222656 }, { "auxiliary_loss_clip": 0.0107704, "auxiliary_loss_mlp": 0.01030722, "balance_loss_clip": 1.03621447, "balance_loss_mlp": 1.01792502, "epoch": 0.6222756651134826, "flos": 22601099099520.0, "grad_norm": 2.4846967665996234, "language_loss": 0.69486421, "learning_rate": 1.3189086798009173e-06, "loss": 0.71594191, "num_input_tokens_seen": 222962990, "step": 10350, "time_per_iteration": 2.7475686073303223 }, { "auxiliary_loss_clip": 0.01116919, "auxiliary_loss_mlp": 0.01038055, "balance_loss_clip": 1.04172456, "balance_loss_mlp": 1.02536559, "epoch": 0.6223357883661506, "flos": 21142228216320.0, "grad_norm": 1.8297714166368801, "language_loss": 0.5704937, "learning_rate": 1.3185425099469046e-06, "loss": 0.59204346, "num_input_tokens_seen": 222980715, "step": 10351, "time_per_iteration": 2.675811290740967 }, { "auxiliary_loss_clip": 0.01024035, "auxiliary_loss_mlp": 0.01004222, "balance_loss_clip": 1.01215839, "balance_loss_mlp": 1.00262439, "epoch": 0.6223959116188186, "flos": 63765071700480.0, "grad_norm": 0.8048031710876978, "language_loss": 0.61121249, "learning_rate": 1.3181763659338276e-06, "loss": 0.63149512, "num_input_tokens_seen": 223040685, "step": 10352, "time_per_iteration": 3.2121970653533936 }, { "auxiliary_loss_clip": 0.01111121, "auxiliary_loss_mlp": 0.01037194, "balance_loss_clip": 1.03907847, "balance_loss_mlp": 1.02456367, "epoch": 0.6224560348714866, "flos": 22565727181440.0, "grad_norm": 2.8594267132643267, "language_loss": 0.82211882, "learning_rate": 1.3178102477755714e-06, "loss": 0.84360194, "num_input_tokens_seen": 223059000, "step": 10353, "time_per_iteration": 2.6481454372406006 }, { "auxiliary_loss_clip": 0.01097506, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.03879428, "balance_loss_mlp": 1.02166736, "epoch": 0.6225161581241545, "flos": 24097748112000.0, "grad_norm": 1.6101266746131675, "language_loss": 0.75329089, "learning_rate": 1.3174441554860195e-06, "loss": 0.77459884, "num_input_tokens_seen": 223079345, "step": 10354, "time_per_iteration": 2.672100067138672 }, { "auxiliary_loss_clip": 0.01071329, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.03829408, "balance_loss_mlp": 1.02011561, "epoch": 0.6225762813768225, "flos": 20443513881600.0, "grad_norm": 1.4917034506382563, "language_loss": 0.78818482, "learning_rate": 1.3170780890790528e-06, "loss": 0.80922878, "num_input_tokens_seen": 223097880, "step": 10355, "time_per_iteration": 2.6894590854644775 }, { "auxiliary_loss_clip": 0.0110748, "auxiliary_loss_mlp": 0.01038653, "balance_loss_clip": 1.04353356, "balance_loss_mlp": 1.0261302, "epoch": 0.6226364046294904, "flos": 27198131558400.0, "grad_norm": 1.5243384390478247, "language_loss": 0.7810744, "learning_rate": 1.3167120485685538e-06, "loss": 0.80253577, "num_input_tokens_seen": 223118185, "step": 10356, "time_per_iteration": 2.662597417831421 }, { "auxiliary_loss_clip": 0.01095206, "auxiliary_loss_mlp": 0.00771022, "balance_loss_clip": 1.03841674, "balance_loss_mlp": 1.0001657, "epoch": 0.6226965278821585, "flos": 20445776438400.0, "grad_norm": 1.8782312562736863, "language_loss": 0.6801585, "learning_rate": 1.3163460339684024e-06, "loss": 0.69882077, "num_input_tokens_seen": 223137600, "step": 10357, "time_per_iteration": 2.630401611328125 }, { "auxiliary_loss_clip": 0.01095487, "auxiliary_loss_mlp": 0.01037985, "balance_loss_clip": 1.03887713, "balance_loss_mlp": 1.02341211, "epoch": 0.6227566511348264, "flos": 22162737519360.0, "grad_norm": 2.8474094143077453, "language_loss": 0.76153404, "learning_rate": 1.3159800452924778e-06, "loss": 0.78286874, "num_input_tokens_seen": 223154360, "step": 10358, "time_per_iteration": 2.661013126373291 }, { "auxiliary_loss_clip": 0.01092746, "auxiliary_loss_mlp": 0.01033714, "balance_loss_clip": 1.03905225, "balance_loss_mlp": 1.02091646, "epoch": 0.6228167743874944, "flos": 18040875102720.0, "grad_norm": 2.1492109037016287, "language_loss": 0.82438827, "learning_rate": 1.3156140825546588e-06, "loss": 0.84565282, "num_input_tokens_seen": 223172255, "step": 10359, "time_per_iteration": 2.75612211227417 }, { "auxiliary_loss_clip": 0.01084816, "auxiliary_loss_mlp": 0.0105208, "balance_loss_clip": 1.03617096, "balance_loss_mlp": 1.0374589, "epoch": 0.6228768976401623, "flos": 17742851959680.0, "grad_norm": 3.2541550800674046, "language_loss": 0.73383337, "learning_rate": 1.315248145768822e-06, "loss": 0.75520235, "num_input_tokens_seen": 223186965, "step": 10360, "time_per_iteration": 2.761385440826416 }, { "auxiliary_loss_clip": 0.01103199, "auxiliary_loss_mlp": 0.01038017, "balance_loss_clip": 1.03837395, "balance_loss_mlp": 1.025244, "epoch": 0.6229370208928303, "flos": 17894934144000.0, "grad_norm": 1.937323368007563, "language_loss": 0.77496618, "learning_rate": 1.3148822349488442e-06, "loss": 0.79637837, "num_input_tokens_seen": 223206045, "step": 10361, "time_per_iteration": 2.7078726291656494 }, { "auxiliary_loss_clip": 0.0107034, "auxiliary_loss_mlp": 0.01029878, "balance_loss_clip": 1.0354948, "balance_loss_mlp": 1.01774836, "epoch": 0.6229971441454982, "flos": 17347763289600.0, "grad_norm": 2.088996135555703, "language_loss": 0.6762352, "learning_rate": 1.3145163501086005e-06, "loss": 0.69723737, "num_input_tokens_seen": 223224820, "step": 10362, "time_per_iteration": 2.693016529083252 }, { "auxiliary_loss_clip": 0.01095554, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.03886461, "balance_loss_mlp": 1.02005267, "epoch": 0.6230572673981662, "flos": 29241376807680.0, "grad_norm": 2.450509773967882, "language_loss": 0.67575699, "learning_rate": 1.3141504912619658e-06, "loss": 0.6970489, "num_input_tokens_seen": 223243205, "step": 10363, "time_per_iteration": 2.7115700244903564 }, { "auxiliary_loss_clip": 0.01068138, "auxiliary_loss_mlp": 0.01032085, "balance_loss_clip": 1.03868961, "balance_loss_mlp": 1.01858449, "epoch": 0.6231173906508342, "flos": 16325961096960.0, "grad_norm": 1.7878512911378444, "language_loss": 0.86638474, "learning_rate": 1.3137846584228127e-06, "loss": 0.88738704, "num_input_tokens_seen": 223261370, "step": 10364, "time_per_iteration": 2.6732850074768066 }, { "auxiliary_loss_clip": 0.01017483, "auxiliary_loss_mlp": 0.01010257, "balance_loss_clip": 1.01340818, "balance_loss_mlp": 1.00900543, "epoch": 0.6231775139035022, "flos": 68702032517760.0, "grad_norm": 0.8935233084209503, "language_loss": 0.60708529, "learning_rate": 1.313418851605015e-06, "loss": 0.62736267, "num_input_tokens_seen": 223315050, "step": 10365, "time_per_iteration": 3.2580301761627197 }, { "auxiliary_loss_clip": 0.01085426, "auxiliary_loss_mlp": 0.007721, "balance_loss_clip": 1.04356837, "balance_loss_mlp": 1.00019813, "epoch": 0.6232376371561702, "flos": 19821038163840.0, "grad_norm": 1.9797808373338666, "language_loss": 0.75283766, "learning_rate": 1.3130530708224427e-06, "loss": 0.77141291, "num_input_tokens_seen": 223332130, "step": 10366, "time_per_iteration": 2.695686101913452 }, { "auxiliary_loss_clip": 0.01107257, "auxiliary_loss_mlp": 0.01040192, "balance_loss_clip": 1.04238236, "balance_loss_mlp": 1.0269959, "epoch": 0.6232977604088381, "flos": 23258264376960.0, "grad_norm": 3.5413788647782978, "language_loss": 0.76049531, "learning_rate": 1.3126873160889665e-06, "loss": 0.78196979, "num_input_tokens_seen": 223351605, "step": 10367, "time_per_iteration": 2.6170830726623535 }, { "auxiliary_loss_clip": 0.01102139, "auxiliary_loss_mlp": 0.01034539, "balance_loss_clip": 1.04015589, "balance_loss_mlp": 1.02192068, "epoch": 0.6233578836615061, "flos": 21106425335040.0, "grad_norm": 1.5257334476599056, "language_loss": 0.78428042, "learning_rate": 1.312321587418457e-06, "loss": 0.80564719, "num_input_tokens_seen": 223372090, "step": 10368, "time_per_iteration": 2.625438928604126 }, { "auxiliary_loss_clip": 0.01052163, "auxiliary_loss_mlp": 0.01035756, "balance_loss_clip": 1.03783369, "balance_loss_mlp": 1.02115321, "epoch": 0.623418006914174, "flos": 23769416868480.0, "grad_norm": 2.0197111691245735, "language_loss": 0.68460292, "learning_rate": 1.3119558848247811e-06, "loss": 0.70548213, "num_input_tokens_seen": 223390110, "step": 10369, "time_per_iteration": 2.808359146118164 }, { "auxiliary_loss_clip": 0.01117993, "auxiliary_loss_mlp": 0.0103678, "balance_loss_clip": 1.04215741, "balance_loss_mlp": 1.02325583, "epoch": 0.6234781301668421, "flos": 17890480857600.0, "grad_norm": 2.044462972771541, "language_loss": 0.88031048, "learning_rate": 1.3115902083218072e-06, "loss": 0.90185821, "num_input_tokens_seen": 223404205, "step": 10370, "time_per_iteration": 4.117987155914307 }, { "auxiliary_loss_clip": 0.0111332, "auxiliary_loss_mlp": 0.01029208, "balance_loss_clip": 1.039994, "balance_loss_mlp": 1.01634502, "epoch": 0.62353825341951, "flos": 26175503352960.0, "grad_norm": 1.608857921427384, "language_loss": 0.66079128, "learning_rate": 1.311224557923402e-06, "loss": 0.68221653, "num_input_tokens_seen": 223424855, "step": 10371, "time_per_iteration": 4.359363079071045 }, { "auxiliary_loss_clip": 0.01098316, "auxiliary_loss_mlp": 0.01029741, "balance_loss_clip": 1.03844571, "balance_loss_mlp": 1.01849937, "epoch": 0.623598376672178, "flos": 31139902160640.0, "grad_norm": 1.3363320294252738, "language_loss": 0.77749312, "learning_rate": 1.3108589336434298e-06, "loss": 0.79877365, "num_input_tokens_seen": 223447225, "step": 10372, "time_per_iteration": 4.2803263664245605 }, { "auxiliary_loss_clip": 0.01105747, "auxiliary_loss_mlp": 0.01033888, "balance_loss_clip": 1.0399971, "balance_loss_mlp": 1.02063167, "epoch": 0.6236584999248459, "flos": 23730202195200.0, "grad_norm": 1.86692873433382, "language_loss": 0.77388912, "learning_rate": 1.3104933354957568e-06, "loss": 0.79528546, "num_input_tokens_seen": 223467520, "step": 10373, "time_per_iteration": 2.6164214611053467 }, { "auxiliary_loss_clip": 0.01099988, "auxiliary_loss_mlp": 0.01029626, "balance_loss_clip": 1.03910232, "balance_loss_mlp": 1.0177052, "epoch": 0.6237186231775139, "flos": 21762764599680.0, "grad_norm": 1.5661441130214229, "language_loss": 0.69628543, "learning_rate": 1.3101277634942448e-06, "loss": 0.71758157, "num_input_tokens_seen": 223488130, "step": 10374, "time_per_iteration": 2.620152711868286 }, { "auxiliary_loss_clip": 0.0109877, "auxiliary_loss_mlp": 0.01027687, "balance_loss_clip": 1.04083633, "balance_loss_mlp": 1.01481199, "epoch": 0.6237787464301818, "flos": 14939486075520.0, "grad_norm": 1.8629467261116164, "language_loss": 0.77406085, "learning_rate": 1.3097622176527577e-06, "loss": 0.7953254, "num_input_tokens_seen": 223505105, "step": 10375, "time_per_iteration": 2.662888526916504 }, { "auxiliary_loss_clip": 0.0108805, "auxiliary_loss_mlp": 0.01027669, "balance_loss_clip": 1.03999758, "balance_loss_mlp": 1.01531863, "epoch": 0.6238388696828499, "flos": 35590311302400.0, "grad_norm": 1.5320519249858895, "language_loss": 0.70062512, "learning_rate": 1.3093966979851566e-06, "loss": 0.72178227, "num_input_tokens_seen": 223528065, "step": 10376, "time_per_iteration": 2.7455239295959473 }, { "auxiliary_loss_clip": 0.01087005, "auxiliary_loss_mlp": 0.01030618, "balance_loss_clip": 1.04036319, "balance_loss_mlp": 1.01622987, "epoch": 0.6238989929355178, "flos": 23623511823360.0, "grad_norm": 1.5317768555363875, "language_loss": 0.76383424, "learning_rate": 1.309031204505301e-06, "loss": 0.78501046, "num_input_tokens_seen": 223547305, "step": 10377, "time_per_iteration": 4.217595338821411 }, { "auxiliary_loss_clip": 0.01095365, "auxiliary_loss_mlp": 0.01032947, "balance_loss_clip": 1.04230881, "balance_loss_mlp": 1.02149701, "epoch": 0.6239591161881858, "flos": 22087468569600.0, "grad_norm": 1.9922863755635154, "language_loss": 0.68561447, "learning_rate": 1.308665737227052e-06, "loss": 0.70689762, "num_input_tokens_seen": 223567205, "step": 10378, "time_per_iteration": 2.668548822402954 }, { "auxiliary_loss_clip": 0.01089219, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.03835332, "balance_loss_mlp": 1.01904845, "epoch": 0.6240192394408538, "flos": 24535930124160.0, "grad_norm": 1.8244104489721222, "language_loss": 0.76516432, "learning_rate": 1.3083002961642675e-06, "loss": 0.78637671, "num_input_tokens_seen": 223586560, "step": 10379, "time_per_iteration": 2.636387825012207 }, { "auxiliary_loss_clip": 0.01091775, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.03986144, "balance_loss_mlp": 1.01667941, "epoch": 0.6240793626935217, "flos": 27931930502400.0, "grad_norm": 1.3063592721987374, "language_loss": 0.79515195, "learning_rate": 1.3079348813308051e-06, "loss": 0.81636459, "num_input_tokens_seen": 223610595, "step": 10380, "time_per_iteration": 2.7264626026153564 }, { "auxiliary_loss_clip": 0.01098611, "auxiliary_loss_mlp": 0.01032438, "balance_loss_clip": 1.04053771, "balance_loss_mlp": 1.02064252, "epoch": 0.6241394859461897, "flos": 22892514140160.0, "grad_norm": 1.5486607590861352, "language_loss": 0.80008709, "learning_rate": 1.3075694927405207e-06, "loss": 0.82139754, "num_input_tokens_seen": 223630230, "step": 10381, "time_per_iteration": 2.6646101474761963 }, { "auxiliary_loss_clip": 0.01089557, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.03794694, "balance_loss_mlp": 1.0213902, "epoch": 0.6241996091988576, "flos": 12750766744320.0, "grad_norm": 2.2250038803258256, "language_loss": 0.74777293, "learning_rate": 1.3072041304072718e-06, "loss": 0.76901984, "num_input_tokens_seen": 223648360, "step": 10382, "time_per_iteration": 2.7230777740478516 }, { "auxiliary_loss_clip": 0.01101818, "auxiliary_loss_mlp": 0.01025191, "balance_loss_clip": 1.03977156, "balance_loss_mlp": 1.01332331, "epoch": 0.6242597324515257, "flos": 25851302173440.0, "grad_norm": 1.6487787752646939, "language_loss": 0.78440118, "learning_rate": 1.306838794344911e-06, "loss": 0.80567122, "num_input_tokens_seen": 223671255, "step": 10383, "time_per_iteration": 2.7347943782806396 }, { "auxiliary_loss_clip": 0.01078794, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.03457248, "balance_loss_mlp": 1.01803493, "epoch": 0.6243198557041936, "flos": 19937712516480.0, "grad_norm": 1.7448881929287328, "language_loss": 0.74959773, "learning_rate": 1.3064734845672925e-06, "loss": 0.77069044, "num_input_tokens_seen": 223689860, "step": 10384, "time_per_iteration": 2.715670347213745 }, { "auxiliary_loss_clip": 0.01090865, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.03685331, "balance_loss_mlp": 1.02441823, "epoch": 0.6243799789568616, "flos": 18406194376320.0, "grad_norm": 1.703443113253697, "language_loss": 0.66354865, "learning_rate": 1.3061082010882694e-06, "loss": 0.68485022, "num_input_tokens_seen": 223707835, "step": 10385, "time_per_iteration": 2.6395132541656494 }, { "auxiliary_loss_clip": 0.01017413, "auxiliary_loss_mlp": 0.00999729, "balance_loss_clip": 1.01207745, "balance_loss_mlp": 0.998501, "epoch": 0.6244401022095295, "flos": 66027587523840.0, "grad_norm": 0.7616108367019777, "language_loss": 0.6200667, "learning_rate": 1.305742943921692e-06, "loss": 0.64023811, "num_input_tokens_seen": 223771875, "step": 10386, "time_per_iteration": 3.2555150985717773 }, { "auxiliary_loss_clip": 0.01103744, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.03913903, "balance_loss_mlp": 1.01928985, "epoch": 0.6245002254621975, "flos": 24571266128640.0, "grad_norm": 2.488369520959267, "language_loss": 0.7205711, "learning_rate": 1.3053777130814128e-06, "loss": 0.74193048, "num_input_tokens_seen": 223788895, "step": 10387, "time_per_iteration": 2.6242222785949707 }, { "auxiliary_loss_clip": 0.01111553, "auxiliary_loss_mlp": 0.01040034, "balance_loss_clip": 1.04189062, "balance_loss_mlp": 1.0254066, "epoch": 0.6245603487148654, "flos": 29168837291520.0, "grad_norm": 2.3305483255195787, "language_loss": 0.65657806, "learning_rate": 1.3050125085812798e-06, "loss": 0.67809391, "num_input_tokens_seen": 223810385, "step": 10388, "time_per_iteration": 2.659313440322876 }, { "auxiliary_loss_clip": 0.0107602, "auxiliary_loss_mlp": 0.01029774, "balance_loss_clip": 1.03905761, "balance_loss_mlp": 1.01803207, "epoch": 0.6246204719675335, "flos": 14790097411200.0, "grad_norm": 1.9152677822128796, "language_loss": 0.79151481, "learning_rate": 1.3046473304351417e-06, "loss": 0.81257272, "num_input_tokens_seen": 223826040, "step": 10389, "time_per_iteration": 2.6531307697296143 }, { "auxiliary_loss_clip": 0.0108775, "auxiliary_loss_mlp": 0.0103435, "balance_loss_clip": 1.036906, "balance_loss_mlp": 1.02176762, "epoch": 0.6246805952202014, "flos": 12493538472960.0, "grad_norm": 4.707823306169989, "language_loss": 0.60542148, "learning_rate": 1.3042821786568475e-06, "loss": 0.62664247, "num_input_tokens_seen": 223842300, "step": 10390, "time_per_iteration": 2.6380884647369385 }, { "auxiliary_loss_clip": 0.01095689, "auxiliary_loss_mlp": 0.01032943, "balance_loss_clip": 1.03998685, "balance_loss_mlp": 1.02047336, "epoch": 0.6247407184728694, "flos": 12786677366400.0, "grad_norm": 1.9478919515008288, "language_loss": 0.76811498, "learning_rate": 1.3039170532602416e-06, "loss": 0.78940129, "num_input_tokens_seen": 223858320, "step": 10391, "time_per_iteration": 2.6485612392425537 }, { "auxiliary_loss_clip": 0.01095815, "auxiliary_loss_mlp": 0.01034177, "balance_loss_clip": 1.0409584, "balance_loss_mlp": 1.02074265, "epoch": 0.6248008417255374, "flos": 40629188960640.0, "grad_norm": 1.4703588614112992, "language_loss": 0.64372337, "learning_rate": 1.3035519542591718e-06, "loss": 0.66502333, "num_input_tokens_seen": 223883545, "step": 10392, "time_per_iteration": 2.8461811542510986 }, { "auxiliary_loss_clip": 0.01096988, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.04133046, "balance_loss_mlp": 1.02083135, "epoch": 0.6248609649782053, "flos": 19902017376000.0, "grad_norm": 1.871769291735266, "language_loss": 0.76746744, "learning_rate": 1.3031868816674819e-06, "loss": 0.78878057, "num_input_tokens_seen": 223901445, "step": 10393, "time_per_iteration": 2.637190818786621 }, { "auxiliary_loss_clip": 0.01078713, "auxiliary_loss_mlp": 0.00772119, "balance_loss_clip": 1.03866291, "balance_loss_mlp": 1.00009847, "epoch": 0.6249210882308733, "flos": 19682746801920.0, "grad_norm": 1.7077234803990555, "language_loss": 0.82370424, "learning_rate": 1.3028218354990142e-06, "loss": 0.84221256, "num_input_tokens_seen": 223920170, "step": 10394, "time_per_iteration": 2.6997132301330566 }, { "auxiliary_loss_clip": 0.01095186, "auxiliary_loss_mlp": 0.01037496, "balance_loss_clip": 1.03878772, "balance_loss_mlp": 1.02421618, "epoch": 0.6249812114835412, "flos": 13990726189440.0, "grad_norm": 1.9873009659143388, "language_loss": 0.75021064, "learning_rate": 1.3024568157676128e-06, "loss": 0.77153742, "num_input_tokens_seen": 223936495, "step": 10395, "time_per_iteration": 2.6623713970184326 }, { "auxiliary_loss_clip": 0.01095635, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.03662229, "balance_loss_mlp": 1.0203023, "epoch": 0.6250413347362093, "flos": 14530031965440.0, "grad_norm": 3.229511376831138, "language_loss": 0.72134733, "learning_rate": 1.302091822487119e-06, "loss": 0.74264228, "num_input_tokens_seen": 223950070, "step": 10396, "time_per_iteration": 2.677992820739746 }, { "auxiliary_loss_clip": 0.01075755, "auxiliary_loss_mlp": 0.0103795, "balance_loss_clip": 1.04014516, "balance_loss_mlp": 1.0248127, "epoch": 0.6251014579888772, "flos": 22963006581120.0, "grad_norm": 1.7904379273274065, "language_loss": 0.75906593, "learning_rate": 1.3017268556713732e-06, "loss": 0.78020298, "num_input_tokens_seen": 223970065, "step": 10397, "time_per_iteration": 2.722014904022217 }, { "auxiliary_loss_clip": 0.01092491, "auxiliary_loss_mlp": 0.0103722, "balance_loss_clip": 1.04022741, "balance_loss_mlp": 1.02372003, "epoch": 0.6251615812415452, "flos": 28111232217600.0, "grad_norm": 4.888327827010162, "language_loss": 0.74880314, "learning_rate": 1.3013619153342154e-06, "loss": 0.77010036, "num_input_tokens_seen": 223990315, "step": 10398, "time_per_iteration": 2.7456398010253906 }, { "auxiliary_loss_clip": 0.01117793, "auxiliary_loss_mlp": 0.01031154, "balance_loss_clip": 1.03983879, "balance_loss_mlp": 1.01699233, "epoch": 0.6252217044942131, "flos": 26724469887360.0, "grad_norm": 1.9767586095703997, "language_loss": 0.73813987, "learning_rate": 1.300997001489483e-06, "loss": 0.75962937, "num_input_tokens_seen": 224009960, "step": 10399, "time_per_iteration": 2.6542532444000244 }, { "auxiliary_loss_clip": 0.01077509, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.03692555, "balance_loss_mlp": 1.02285028, "epoch": 0.6252818277468811, "flos": 20006768413440.0, "grad_norm": 1.7877165034058586, "language_loss": 0.74266648, "learning_rate": 1.3006321141510147e-06, "loss": 0.76380795, "num_input_tokens_seen": 224028870, "step": 10400, "time_per_iteration": 2.6837148666381836 }, { "auxiliary_loss_clip": 0.0101245, "auxiliary_loss_mlp": 0.01001226, "balance_loss_clip": 1.01475704, "balance_loss_mlp": 0.99997389, "epoch": 0.625341950999549, "flos": 59278285059840.0, "grad_norm": 0.8429284892848663, "language_loss": 0.56419927, "learning_rate": 1.3002672533326465e-06, "loss": 0.58433604, "num_input_tokens_seen": 224094140, "step": 10401, "time_per_iteration": 3.3155579566955566 }, { "auxiliary_loss_clip": 0.01107517, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.04071486, "balance_loss_mlp": 1.02067709, "epoch": 0.625402074252217, "flos": 20157090831360.0, "grad_norm": 2.04205601235836, "language_loss": 0.83276439, "learning_rate": 1.2999024190482146e-06, "loss": 0.85418153, "num_input_tokens_seen": 224113235, "step": 10402, "time_per_iteration": 2.691084146499634 }, { "auxiliary_loss_clip": 0.01036621, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.036587, "balance_loss_mlp": 1.02084088, "epoch": 0.625462197504885, "flos": 29132531619840.0, "grad_norm": 2.64185876470146, "language_loss": 0.69291663, "learning_rate": 1.2995376113115527e-06, "loss": 0.71362293, "num_input_tokens_seen": 224134530, "step": 10403, "time_per_iteration": 2.9650638103485107 }, { "auxiliary_loss_clip": 0.01081288, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.03741455, "balance_loss_mlp": 1.01692796, "epoch": 0.625522320757553, "flos": 26104436294400.0, "grad_norm": 1.773424214610222, "language_loss": 0.71938539, "learning_rate": 1.2991728301364954e-06, "loss": 0.74051046, "num_input_tokens_seen": 224154170, "step": 10404, "time_per_iteration": 3.032392978668213 }, { "auxiliary_loss_clip": 0.01071553, "auxiliary_loss_mlp": 0.01037364, "balance_loss_clip": 1.03673673, "balance_loss_mlp": 1.02419138, "epoch": 0.625582444010221, "flos": 20630967984000.0, "grad_norm": 1.988268046568807, "language_loss": 0.69859874, "learning_rate": 1.2988080755368742e-06, "loss": 0.71968794, "num_input_tokens_seen": 224172730, "step": 10405, "time_per_iteration": 2.752593994140625 }, { "auxiliary_loss_clip": 0.01088298, "auxiliary_loss_mlp": 0.01038031, "balance_loss_clip": 1.03901088, "balance_loss_mlp": 1.02447712, "epoch": 0.6256425672628889, "flos": 20521512264960.0, "grad_norm": 1.8903848634840759, "language_loss": 0.7935456, "learning_rate": 1.2984433475265207e-06, "loss": 0.81480896, "num_input_tokens_seen": 224192620, "step": 10406, "time_per_iteration": 2.6944150924682617 }, { "auxiliary_loss_clip": 0.01078593, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.0391295, "balance_loss_mlp": 1.02321792, "epoch": 0.6257026905155569, "flos": 29529200488320.0, "grad_norm": 1.7747095604461551, "language_loss": 0.68853474, "learning_rate": 1.2980786461192666e-06, "loss": 0.70967722, "num_input_tokens_seen": 224214660, "step": 10407, "time_per_iteration": 2.7394134998321533 }, { "auxiliary_loss_clip": 0.01101618, "auxiliary_loss_mlp": 0.00769457, "balance_loss_clip": 1.03912544, "balance_loss_mlp": 1.00006318, "epoch": 0.6257628137682248, "flos": 24024885373440.0, "grad_norm": 1.6542698687790116, "language_loss": 0.8580991, "learning_rate": 1.2977139713289398e-06, "loss": 0.87680984, "num_input_tokens_seen": 224234170, "step": 10408, "time_per_iteration": 2.647240400314331 }, { "auxiliary_loss_clip": 0.01090915, "auxiliary_loss_mlp": 0.00769522, "balance_loss_clip": 1.03742266, "balance_loss_mlp": 1.00007892, "epoch": 0.6258229370208929, "flos": 20850956830080.0, "grad_norm": 1.8769352919555562, "language_loss": 0.79664773, "learning_rate": 1.2973493231693699e-06, "loss": 0.81525207, "num_input_tokens_seen": 224253115, "step": 10409, "time_per_iteration": 5.298889636993408 }, { "auxiliary_loss_clip": 0.01091226, "auxiliary_loss_mlp": 0.01033686, "balance_loss_clip": 1.03762126, "balance_loss_mlp": 1.02168143, "epoch": 0.6258830602735608, "flos": 22231542021120.0, "grad_norm": 2.146507314015339, "language_loss": 0.69629455, "learning_rate": 1.2969847016543845e-06, "loss": 0.71754372, "num_input_tokens_seen": 224271375, "step": 10410, "time_per_iteration": 2.7642364501953125 }, { "auxiliary_loss_clip": 0.01066453, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.03571606, "balance_loss_mlp": 1.01986265, "epoch": 0.6259431835262288, "flos": 25076887925760.0, "grad_norm": 2.4453810502825153, "language_loss": 0.67605823, "learning_rate": 1.2966201067978086e-06, "loss": 0.6970436, "num_input_tokens_seen": 224290315, "step": 10411, "time_per_iteration": 4.3257997035980225 }, { "auxiliary_loss_clip": 0.0106799, "auxiliary_loss_mlp": 0.01040597, "balance_loss_clip": 1.03715658, "balance_loss_mlp": 1.02818179, "epoch": 0.6260033067788967, "flos": 28252288926720.0, "grad_norm": 1.954494979108325, "language_loss": 0.69357151, "learning_rate": 1.2962555386134702e-06, "loss": 0.71465743, "num_input_tokens_seen": 224310545, "step": 10412, "time_per_iteration": 4.512540578842163 }, { "auxiliary_loss_clip": 0.01080692, "auxiliary_loss_mlp": 0.0104025, "balance_loss_clip": 1.03551555, "balance_loss_mlp": 1.02700531, "epoch": 0.6260634300315647, "flos": 23367432787200.0, "grad_norm": 1.4726479761814617, "language_loss": 0.6975283, "learning_rate": 1.2958909971151908e-06, "loss": 0.71873772, "num_input_tokens_seen": 224331115, "step": 10413, "time_per_iteration": 2.715327262878418 }, { "auxiliary_loss_clip": 0.01083008, "auxiliary_loss_mlp": 0.01034077, "balance_loss_clip": 1.03659189, "balance_loss_mlp": 1.01976025, "epoch": 0.6261235532842326, "flos": 18035308494720.0, "grad_norm": 2.5748151630879277, "language_loss": 0.80629605, "learning_rate": 1.295526482316796e-06, "loss": 0.82746685, "num_input_tokens_seen": 224347525, "step": 10414, "time_per_iteration": 2.7809388637542725 }, { "auxiliary_loss_clip": 0.0110639, "auxiliary_loss_mlp": 0.01037432, "balance_loss_clip": 1.04208875, "balance_loss_mlp": 1.0249393, "epoch": 0.6261836765369007, "flos": 22011265866240.0, "grad_norm": 1.7429212772998885, "language_loss": 0.74786866, "learning_rate": 1.2951619942321083e-06, "loss": 0.7693069, "num_input_tokens_seen": 224367045, "step": 10415, "time_per_iteration": 2.790271282196045 }, { "auxiliary_loss_clip": 0.01062067, "auxiliary_loss_mlp": 0.01034612, "balance_loss_clip": 1.03746879, "balance_loss_mlp": 1.0215826, "epoch": 0.6262437997895686, "flos": 24936010784640.0, "grad_norm": 1.5794864494822807, "language_loss": 0.74193609, "learning_rate": 1.2947975328749472e-06, "loss": 0.76290286, "num_input_tokens_seen": 224388860, "step": 10416, "time_per_iteration": 2.7647581100463867 }, { "auxiliary_loss_clip": 0.01086432, "auxiliary_loss_mlp": 0.01033307, "balance_loss_clip": 1.04012477, "balance_loss_mlp": 1.02101088, "epoch": 0.6263039230422366, "flos": 31608428186880.0, "grad_norm": 1.6472166500534797, "language_loss": 0.84573495, "learning_rate": 1.2944330982591352e-06, "loss": 0.86693239, "num_input_tokens_seen": 224409645, "step": 10417, "time_per_iteration": 4.274592638015747 }, { "auxiliary_loss_clip": 0.01105981, "auxiliary_loss_mlp": 0.0103493, "balance_loss_clip": 1.04019403, "balance_loss_mlp": 1.02186441, "epoch": 0.6263640462949046, "flos": 17639465639040.0, "grad_norm": 2.0790985994239066, "language_loss": 0.56728101, "learning_rate": 1.2940686903984904e-06, "loss": 0.58869016, "num_input_tokens_seen": 224428530, "step": 10418, "time_per_iteration": 2.691500186920166 }, { "auxiliary_loss_clip": 0.01110622, "auxiliary_loss_mlp": 0.0104313, "balance_loss_clip": 1.04013753, "balance_loss_mlp": 1.0293498, "epoch": 0.6264241695475725, "flos": 19974951941760.0, "grad_norm": 1.8736530467564598, "language_loss": 0.8455261, "learning_rate": 1.2937043093068316e-06, "loss": 0.86706358, "num_input_tokens_seen": 224447175, "step": 10419, "time_per_iteration": 2.739027261734009 }, { "auxiliary_loss_clip": 0.01119559, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.04406238, "balance_loss_mlp": 1.01907599, "epoch": 0.6264842928002405, "flos": 27344323912320.0, "grad_norm": 1.509247263381085, "language_loss": 0.6426456, "learning_rate": 1.2933399549979762e-06, "loss": 0.66415787, "num_input_tokens_seen": 224469445, "step": 10420, "time_per_iteration": 2.7180798053741455 }, { "auxiliary_loss_clip": 0.01076087, "auxiliary_loss_mlp": 0.01035851, "balance_loss_clip": 1.03824723, "balance_loss_mlp": 1.02204061, "epoch": 0.6265444160529084, "flos": 22997265177600.0, "grad_norm": 2.1707304020443527, "language_loss": 0.86138391, "learning_rate": 1.292975627485741e-06, "loss": 0.88250327, "num_input_tokens_seen": 224486590, "step": 10421, "time_per_iteration": 2.7487831115722656 }, { "auxiliary_loss_clip": 0.01078665, "auxiliary_loss_mlp": 0.01036628, "balance_loss_clip": 1.03799725, "balance_loss_mlp": 1.02374697, "epoch": 0.6266045393055765, "flos": 19938323047680.0, "grad_norm": 2.422674057917065, "language_loss": 0.79407763, "learning_rate": 1.2926113267839403e-06, "loss": 0.81523055, "num_input_tokens_seen": 224502795, "step": 10422, "time_per_iteration": 2.8828704357147217 }, { "auxiliary_loss_clip": 0.01104293, "auxiliary_loss_mlp": 0.01027022, "balance_loss_clip": 1.04006767, "balance_loss_mlp": 1.01370621, "epoch": 0.6266646625582444, "flos": 24389091325440.0, "grad_norm": 2.2930026415354368, "language_loss": 0.74455339, "learning_rate": 1.292247052906389e-06, "loss": 0.76586652, "num_input_tokens_seen": 224522300, "step": 10423, "time_per_iteration": 2.7208752632141113 }, { "auxiliary_loss_clip": 0.01114032, "auxiliary_loss_mlp": 0.01028546, "balance_loss_clip": 1.04019392, "balance_loss_mlp": 1.01625562, "epoch": 0.6267247858109124, "flos": 14683802088960.0, "grad_norm": 1.9557551713522223, "language_loss": 0.7775594, "learning_rate": 1.2918828058669004e-06, "loss": 0.79898518, "num_input_tokens_seen": 224538260, "step": 10424, "time_per_iteration": 2.592926263809204 }, { "auxiliary_loss_clip": 0.01113819, "auxiliary_loss_mlp": 0.01032907, "balance_loss_clip": 1.04032254, "balance_loss_mlp": 1.01879287, "epoch": 0.6267849090635803, "flos": 24929977299840.0, "grad_norm": 2.1677847187028605, "language_loss": 0.6903978, "learning_rate": 1.2915185856792868e-06, "loss": 0.71186507, "num_input_tokens_seen": 224559155, "step": 10425, "time_per_iteration": 2.668877363204956 }, { "auxiliary_loss_clip": 0.01089804, "auxiliary_loss_mlp": 0.01029639, "balance_loss_clip": 1.03939557, "balance_loss_mlp": 1.01808131, "epoch": 0.6268450323162483, "flos": 25337851211520.0, "grad_norm": 1.4857408938723873, "language_loss": 0.74492955, "learning_rate": 1.2911543923573598e-06, "loss": 0.76612389, "num_input_tokens_seen": 224578660, "step": 10426, "time_per_iteration": 2.720566987991333 }, { "auxiliary_loss_clip": 0.01106657, "auxiliary_loss_mlp": 0.00770492, "balance_loss_clip": 1.04118848, "balance_loss_mlp": 1.00016105, "epoch": 0.6269051555689162, "flos": 26177299032960.0, "grad_norm": 2.445291482107416, "language_loss": 0.80835652, "learning_rate": 1.290790225914929e-06, "loss": 0.82712793, "num_input_tokens_seen": 224599080, "step": 10427, "time_per_iteration": 2.6930294036865234 }, { "auxiliary_loss_clip": 0.01083192, "auxiliary_loss_mlp": 0.01039458, "balance_loss_clip": 1.03919089, "balance_loss_mlp": 1.02608228, "epoch": 0.6269652788215843, "flos": 18256877539200.0, "grad_norm": 2.002033794251086, "language_loss": 0.68361104, "learning_rate": 1.2904260863658034e-06, "loss": 0.70483756, "num_input_tokens_seen": 224614225, "step": 10428, "time_per_iteration": 2.750072717666626 }, { "auxiliary_loss_clip": 0.01070825, "auxiliary_loss_mlp": 0.01048713, "balance_loss_clip": 1.03721058, "balance_loss_mlp": 1.03428292, "epoch": 0.6270254020742522, "flos": 11765413877760.0, "grad_norm": 1.948024958379765, "language_loss": 0.71860063, "learning_rate": 1.2900619737237928e-06, "loss": 0.73979598, "num_input_tokens_seen": 224632365, "step": 10429, "time_per_iteration": 2.746628761291504 }, { "auxiliary_loss_clip": 0.01109377, "auxiliary_loss_mlp": 0.01032535, "balance_loss_clip": 1.04220653, "balance_loss_mlp": 1.01867652, "epoch": 0.6270855253269202, "flos": 23475631530240.0, "grad_norm": 1.6097875593140534, "language_loss": 0.79522586, "learning_rate": 1.2896978880027023e-06, "loss": 0.81664503, "num_input_tokens_seen": 224651125, "step": 10430, "time_per_iteration": 2.7708442211151123 }, { "auxiliary_loss_clip": 0.01033801, "auxiliary_loss_mlp": 0.01002127, "balance_loss_clip": 1.01011229, "balance_loss_mlp": 1.00103593, "epoch": 0.6271456485795882, "flos": 70064520232320.0, "grad_norm": 1.3411395578732954, "language_loss": 0.59105575, "learning_rate": 1.2893338292163393e-06, "loss": 0.61141503, "num_input_tokens_seen": 224716115, "step": 10431, "time_per_iteration": 3.284141778945923 }, { "auxiliary_loss_clip": 0.01016087, "auxiliary_loss_mlp": 0.01003696, "balance_loss_clip": 1.01267934, "balance_loss_mlp": 1.00251579, "epoch": 0.6272057718322561, "flos": 65156718280320.0, "grad_norm": 0.8756941222650257, "language_loss": 0.63814843, "learning_rate": 1.2889697973785095e-06, "loss": 0.65834618, "num_input_tokens_seen": 224782930, "step": 10432, "time_per_iteration": 3.315559148788452 }, { "auxiliary_loss_clip": 0.0109102, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.03992772, "balance_loss_mlp": 1.02161813, "epoch": 0.6272658950849241, "flos": 24389342720640.0, "grad_norm": 1.881228339897183, "language_loss": 0.64901084, "learning_rate": 1.2886057925030153e-06, "loss": 0.67025411, "num_input_tokens_seen": 224802010, "step": 10433, "time_per_iteration": 2.7182137966156006 }, { "auxiliary_loss_clip": 0.01108511, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.04193711, "balance_loss_mlp": 1.01966476, "epoch": 0.627326018337592, "flos": 17966001202560.0, "grad_norm": 2.029162826422426, "language_loss": 0.61656857, "learning_rate": 1.2882418146036612e-06, "loss": 0.63798386, "num_input_tokens_seen": 224818875, "step": 10434, "time_per_iteration": 2.698272228240967 }, { "auxiliary_loss_clip": 0.0107895, "auxiliary_loss_mlp": 0.01026455, "balance_loss_clip": 1.03706336, "balance_loss_mlp": 1.01392627, "epoch": 0.6273861415902601, "flos": 20230097224320.0, "grad_norm": 1.7060876035395582, "language_loss": 0.84624016, "learning_rate": 1.2878778636942484e-06, "loss": 0.86729419, "num_input_tokens_seen": 224837790, "step": 10435, "time_per_iteration": 2.7053635120391846 }, { "auxiliary_loss_clip": 0.01033575, "auxiliary_loss_mlp": 0.01005985, "balance_loss_clip": 1.00981998, "balance_loss_mlp": 1.00484645, "epoch": 0.627446264842928, "flos": 64953210798720.0, "grad_norm": 0.7308695189229724, "language_loss": 0.61571616, "learning_rate": 1.2875139397885786e-06, "loss": 0.63611174, "num_input_tokens_seen": 224899685, "step": 10436, "time_per_iteration": 3.1732895374298096 }, { "auxiliary_loss_clip": 0.01099296, "auxiliary_loss_mlp": 0.01040375, "balance_loss_clip": 1.04577446, "balance_loss_mlp": 1.02651119, "epoch": 0.627506388095596, "flos": 23584261236480.0, "grad_norm": 1.4615085745823022, "language_loss": 0.77539217, "learning_rate": 1.2871500429004523e-06, "loss": 0.79678893, "num_input_tokens_seen": 224918650, "step": 10437, "time_per_iteration": 2.8112289905548096 }, { "auxiliary_loss_clip": 0.0102524, "auxiliary_loss_mlp": 0.01007069, "balance_loss_clip": 1.01128411, "balance_loss_mlp": 1.00595462, "epoch": 0.6275665113482639, "flos": 67583631674880.0, "grad_norm": 0.7245410806399479, "language_loss": 0.54275799, "learning_rate": 1.2867861730436667e-06, "loss": 0.56308109, "num_input_tokens_seen": 224981575, "step": 10438, "time_per_iteration": 3.1365692615509033 }, { "auxiliary_loss_clip": 0.01063228, "auxiliary_loss_mlp": 0.01041641, "balance_loss_clip": 1.03674674, "balance_loss_mlp": 1.02898097, "epoch": 0.6276266346009319, "flos": 27636924101760.0, "grad_norm": 1.7255538562739963, "language_loss": 0.84122932, "learning_rate": 1.2864223302320214e-06, "loss": 0.86227804, "num_input_tokens_seen": 225000820, "step": 10439, "time_per_iteration": 2.909126043319702 }, { "auxiliary_loss_clip": 0.01077398, "auxiliary_loss_mlp": 0.01044046, "balance_loss_clip": 1.04187262, "balance_loss_mlp": 1.03006864, "epoch": 0.6276867578535998, "flos": 22746142218240.0, "grad_norm": 2.0752652164499783, "language_loss": 0.80063027, "learning_rate": 1.2860585144793128e-06, "loss": 0.8218447, "num_input_tokens_seen": 225017585, "step": 10440, "time_per_iteration": 2.7793238162994385 }, { "auxiliary_loss_clip": 0.01059905, "auxiliary_loss_mlp": 0.01030462, "balance_loss_clip": 1.03476882, "balance_loss_mlp": 1.01888728, "epoch": 0.6277468811062679, "flos": 24644200694400.0, "grad_norm": 1.357982638723412, "language_loss": 0.74566025, "learning_rate": 1.285694725799337e-06, "loss": 0.76656389, "num_input_tokens_seen": 225039085, "step": 10441, "time_per_iteration": 2.9267096519470215 }, { "auxiliary_loss_clip": 0.01095865, "auxiliary_loss_mlp": 0.0103067, "balance_loss_clip": 1.03701901, "balance_loss_mlp": 1.01759267, "epoch": 0.6278070043589358, "flos": 19678975873920.0, "grad_norm": 2.0708219033723316, "language_loss": 0.72098005, "learning_rate": 1.2853309642058884e-06, "loss": 0.74224538, "num_input_tokens_seen": 225058105, "step": 10442, "time_per_iteration": 2.6998653411865234 }, { "auxiliary_loss_clip": 0.01081918, "auxiliary_loss_mlp": 0.01030205, "balance_loss_clip": 1.03865194, "balance_loss_mlp": 1.01750898, "epoch": 0.6278671276116038, "flos": 22121834906880.0, "grad_norm": 1.6030154795021492, "language_loss": 0.7134285, "learning_rate": 1.284967229712762e-06, "loss": 0.73454976, "num_input_tokens_seen": 225077605, "step": 10443, "time_per_iteration": 2.8322415351867676 }, { "auxiliary_loss_clip": 0.0111667, "auxiliary_loss_mlp": 0.01031963, "balance_loss_clip": 1.04252923, "balance_loss_mlp": 1.01954722, "epoch": 0.6279272508642717, "flos": 23038562839680.0, "grad_norm": 2.1504215551644523, "language_loss": 0.73254573, "learning_rate": 1.2846035223337492e-06, "loss": 0.75403202, "num_input_tokens_seen": 225097775, "step": 10444, "time_per_iteration": 2.6936285495758057 }, { "auxiliary_loss_clip": 0.01085082, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.04689062, "balance_loss_mlp": 1.01936126, "epoch": 0.6279873741169397, "flos": 19824090819840.0, "grad_norm": 2.0098765769795697, "language_loss": 0.724576, "learning_rate": 1.2842398420826423e-06, "loss": 0.74574882, "num_input_tokens_seen": 225115585, "step": 10445, "time_per_iteration": 2.7513034343719482 }, { "auxiliary_loss_clip": 0.01101735, "auxiliary_loss_mlp": 0.01029744, "balance_loss_clip": 1.03916216, "balance_loss_mlp": 1.0170486, "epoch": 0.6280474973696077, "flos": 23915393740800.0, "grad_norm": 1.5354377153299141, "language_loss": 0.692366, "learning_rate": 1.2838761889732331e-06, "loss": 0.71368074, "num_input_tokens_seen": 225135575, "step": 10446, "time_per_iteration": 2.7197511196136475 }, { "auxiliary_loss_clip": 0.01075612, "auxiliary_loss_mlp": 0.0103331, "balance_loss_clip": 1.03858328, "balance_loss_mlp": 1.01901674, "epoch": 0.6281076206222757, "flos": 17967976450560.0, "grad_norm": 2.0624649000071638, "language_loss": 0.73082191, "learning_rate": 1.2835125630193102e-06, "loss": 0.75191116, "num_input_tokens_seen": 225154230, "step": 10447, "time_per_iteration": 2.8416759967803955 }, { "auxiliary_loss_clip": 0.01024228, "auxiliary_loss_mlp": 0.00999654, "balance_loss_clip": 1.00985765, "balance_loss_mlp": 0.99855727, "epoch": 0.6281677438749437, "flos": 66778370622720.0, "grad_norm": 0.6739953142314802, "language_loss": 0.52296638, "learning_rate": 1.2831489642346626e-06, "loss": 0.54320526, "num_input_tokens_seen": 225213650, "step": 10448, "time_per_iteration": 5.136569976806641 }, { "auxiliary_loss_clip": 0.01089733, "auxiliary_loss_mlp": 0.01050472, "balance_loss_clip": 1.0385865, "balance_loss_mlp": 1.03579164, "epoch": 0.6282278671276116, "flos": 11656173640320.0, "grad_norm": 2.2865528324647744, "language_loss": 0.91361725, "learning_rate": 1.282785392633079e-06, "loss": 0.93501937, "num_input_tokens_seen": 225230135, "step": 10449, "time_per_iteration": 2.7638633251190186 }, { "auxiliary_loss_clip": 0.01112884, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.03918815, "balance_loss_mlp": 1.02023697, "epoch": 0.6282879903802796, "flos": 42741597847680.0, "grad_norm": 1.5879286033336677, "language_loss": 0.60231853, "learning_rate": 1.2824218482283438e-06, "loss": 0.6237675, "num_input_tokens_seen": 225253520, "step": 10450, "time_per_iteration": 4.464092493057251 }, { "auxiliary_loss_clip": 0.01089139, "auxiliary_loss_mlp": 0.01032278, "balance_loss_clip": 1.04133666, "balance_loss_mlp": 1.01986873, "epoch": 0.6283481136329475, "flos": 20009210538240.0, "grad_norm": 1.522481037470791, "language_loss": 0.76846904, "learning_rate": 1.2820583310342452e-06, "loss": 0.78968322, "num_input_tokens_seen": 225272460, "step": 10451, "time_per_iteration": 4.40496563911438 }, { "auxiliary_loss_clip": 0.01090661, "auxiliary_loss_mlp": 0.01030764, "balance_loss_clip": 1.03676105, "balance_loss_mlp": 1.01773453, "epoch": 0.6284082368856155, "flos": 21904431840000.0, "grad_norm": 1.614739235308552, "language_loss": 0.77571416, "learning_rate": 1.281694841064566e-06, "loss": 0.79692847, "num_input_tokens_seen": 225291700, "step": 10452, "time_per_iteration": 2.7239017486572266 }, { "auxiliary_loss_clip": 0.01088221, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.04302955, "balance_loss_mlp": 1.02150226, "epoch": 0.6284683601382834, "flos": 25484187219840.0, "grad_norm": 1.7878849951641813, "language_loss": 0.72469395, "learning_rate": 1.2813313783330904e-06, "loss": 0.74592441, "num_input_tokens_seen": 225311470, "step": 10453, "time_per_iteration": 2.9393930435180664 }, { "auxiliary_loss_clip": 0.01053587, "auxiliary_loss_mlp": 0.01040763, "balance_loss_clip": 1.03172648, "balance_loss_mlp": 1.02527809, "epoch": 0.6285284833909515, "flos": 16538695395840.0, "grad_norm": 1.709886822132608, "language_loss": 0.80723816, "learning_rate": 1.2809679428536013e-06, "loss": 0.82818168, "num_input_tokens_seen": 225328385, "step": 10454, "time_per_iteration": 2.8191676139831543 }, { "auxiliary_loss_clip": 0.01086328, "auxiliary_loss_mlp": 0.01036988, "balance_loss_clip": 1.04401016, "balance_loss_mlp": 1.02476287, "epoch": 0.6285886066436194, "flos": 22820692896000.0, "grad_norm": 1.9883426544542775, "language_loss": 0.82205665, "learning_rate": 1.2806045346398792e-06, "loss": 0.84328985, "num_input_tokens_seen": 225348415, "step": 10455, "time_per_iteration": 2.778773784637451 }, { "auxiliary_loss_clip": 0.01066143, "auxiliary_loss_mlp": 0.00771548, "balance_loss_clip": 1.03564739, "balance_loss_mlp": 1.00019312, "epoch": 0.6286487298962874, "flos": 24715734629760.0, "grad_norm": 1.5354473458638056, "language_loss": 0.81757617, "learning_rate": 1.280241153705706e-06, "loss": 0.83595306, "num_input_tokens_seen": 225367740, "step": 10456, "time_per_iteration": 4.4299633502960205 }, { "auxiliary_loss_clip": 0.0108958, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.04148746, "balance_loss_mlp": 1.01731229, "epoch": 0.6287088531489553, "flos": 20740818752640.0, "grad_norm": 1.6813486630133685, "language_loss": 0.71938455, "learning_rate": 1.27987780006486e-06, "loss": 0.74059272, "num_input_tokens_seen": 225388405, "step": 10457, "time_per_iteration": 2.7010886669158936 }, { "auxiliary_loss_clip": 0.0110824, "auxiliary_loss_mlp": 0.01035135, "balance_loss_clip": 1.03882265, "balance_loss_mlp": 1.02124166, "epoch": 0.6287689764016233, "flos": 23070630706560.0, "grad_norm": 1.8855739678870833, "language_loss": 0.79754472, "learning_rate": 1.2795144737311202e-06, "loss": 0.81897843, "num_input_tokens_seen": 225408360, "step": 10458, "time_per_iteration": 2.826195478439331 }, { "auxiliary_loss_clip": 0.01110415, "auxiliary_loss_mlp": 0.01033556, "balance_loss_clip": 1.0434413, "balance_loss_mlp": 1.02032971, "epoch": 0.6288290996542913, "flos": 32233669251840.0, "grad_norm": 1.613153759988395, "language_loss": 0.61056519, "learning_rate": 1.2791511747182635e-06, "loss": 0.63200486, "num_input_tokens_seen": 225431310, "step": 10459, "time_per_iteration": 2.8198750019073486 }, { "auxiliary_loss_clip": 0.01090967, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.03930306, "balance_loss_mlp": 1.02109075, "epoch": 0.6288892229069593, "flos": 24641327606400.0, "grad_norm": 1.6884168463635612, "language_loss": 0.78966278, "learning_rate": 1.2787879030400666e-06, "loss": 0.81090778, "num_input_tokens_seen": 225450385, "step": 10460, "time_per_iteration": 2.8095743656158447 }, { "auxiliary_loss_clip": 0.01074125, "auxiliary_loss_mlp": 0.01031631, "balance_loss_clip": 1.0369761, "balance_loss_mlp": 1.01822627, "epoch": 0.6289493461596273, "flos": 17858341163520.0, "grad_norm": 1.6519482013468527, "language_loss": 0.73814094, "learning_rate": 1.2784246587103047e-06, "loss": 0.75919855, "num_input_tokens_seen": 225467325, "step": 10461, "time_per_iteration": 2.754106044769287 }, { "auxiliary_loss_clip": 0.01093245, "auxiliary_loss_mlp": 0.01040397, "balance_loss_clip": 1.03983331, "balance_loss_mlp": 1.02764726, "epoch": 0.6290094694122952, "flos": 22345379199360.0, "grad_norm": 1.7440118950274472, "language_loss": 0.69962513, "learning_rate": 1.2780614417427523e-06, "loss": 0.72096151, "num_input_tokens_seen": 225487370, "step": 10462, "time_per_iteration": 2.721280574798584 }, { "auxiliary_loss_clip": 0.01109582, "auxiliary_loss_mlp": 0.01030961, "balance_loss_clip": 1.04013419, "balance_loss_mlp": 1.01948082, "epoch": 0.6290695926649632, "flos": 28402431776640.0, "grad_norm": 2.4371122708038846, "language_loss": 0.7249735, "learning_rate": 1.2776982521511821e-06, "loss": 0.74637896, "num_input_tokens_seen": 225506915, "step": 10463, "time_per_iteration": 2.7322490215301514 }, { "auxiliary_loss_clip": 0.01094633, "auxiliary_loss_mlp": 0.0104, "balance_loss_clip": 1.04333925, "balance_loss_mlp": 1.02713692, "epoch": 0.6291297159176311, "flos": 21505464501120.0, "grad_norm": 1.7167597419504528, "language_loss": 0.72533494, "learning_rate": 1.2773350899493665e-06, "loss": 0.74668121, "num_input_tokens_seen": 225525670, "step": 10464, "time_per_iteration": 2.7556610107421875 }, { "auxiliary_loss_clip": 0.01086904, "auxiliary_loss_mlp": 0.01034283, "balance_loss_clip": 1.04166722, "balance_loss_mlp": 1.02168906, "epoch": 0.6291898391702991, "flos": 12203308581120.0, "grad_norm": 1.750105989459617, "language_loss": 0.69012117, "learning_rate": 1.2769719551510768e-06, "loss": 0.71133304, "num_input_tokens_seen": 225542235, "step": 10465, "time_per_iteration": 2.6720523834228516 }, { "auxiliary_loss_clip": 0.01026598, "auxiliary_loss_mlp": 0.01001492, "balance_loss_clip": 1.0124836, "balance_loss_mlp": 1.00023413, "epoch": 0.629249962422967, "flos": 69299479434240.0, "grad_norm": 0.6784608705879751, "language_loss": 0.59741104, "learning_rate": 1.2766088477700832e-06, "loss": 0.61769187, "num_input_tokens_seen": 225607185, "step": 10466, "time_per_iteration": 3.353839635848999 }, { "auxiliary_loss_clip": 0.01073177, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.03545153, "balance_loss_mlp": 1.02020311, "epoch": 0.6293100856756351, "flos": 40077888042240.0, "grad_norm": 1.835286938356158, "language_loss": 0.64667165, "learning_rate": 1.276245767820154e-06, "loss": 0.66772521, "num_input_tokens_seen": 225628785, "step": 10467, "time_per_iteration": 2.921297550201416 }, { "auxiliary_loss_clip": 0.01014455, "auxiliary_loss_mlp": 0.01000173, "balance_loss_clip": 1.01132929, "balance_loss_mlp": 0.9989695, "epoch": 0.629370208928303, "flos": 67501108177920.0, "grad_norm": 0.7915302961276658, "language_loss": 0.56811368, "learning_rate": 1.2758827153150586e-06, "loss": 0.58825994, "num_input_tokens_seen": 225678980, "step": 10468, "time_per_iteration": 3.01094126701355 }, { "auxiliary_loss_clip": 0.00999481, "auxiliary_loss_mlp": 0.00999518, "balance_loss_clip": 1.01559901, "balance_loss_mlp": 0.9980635, "epoch": 0.629430332180971, "flos": 60660450449280.0, "grad_norm": 0.7367622716998392, "language_loss": 0.57934558, "learning_rate": 1.2755196902685626e-06, "loss": 0.59933555, "num_input_tokens_seen": 225740295, "step": 10469, "time_per_iteration": 3.254342555999756 }, { "auxiliary_loss_clip": 0.01032056, "auxiliary_loss_mlp": 0.01005271, "balance_loss_clip": 1.02417684, "balance_loss_mlp": 1.00394154, "epoch": 0.6294904554336389, "flos": 66869764778880.0, "grad_norm": 0.6802993920043705, "language_loss": 0.5213244, "learning_rate": 1.2751566926944329e-06, "loss": 0.54169762, "num_input_tokens_seen": 225805615, "step": 10470, "time_per_iteration": 3.2833499908447266 }, { "auxiliary_loss_clip": 0.01099474, "auxiliary_loss_mlp": 0.0103723, "balance_loss_clip": 1.03933227, "balance_loss_mlp": 1.02434301, "epoch": 0.6295505786863069, "flos": 42522794150400.0, "grad_norm": 1.6833251005433751, "language_loss": 0.7409395, "learning_rate": 1.2747937226064342e-06, "loss": 0.76230645, "num_input_tokens_seen": 225826585, "step": 10471, "time_per_iteration": 2.839749574661255 }, { "auxiliary_loss_clip": 0.0108924, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.0421524, "balance_loss_mlp": 1.01881981, "epoch": 0.629610701938975, "flos": 17384140788480.0, "grad_norm": 1.8072062146815357, "language_loss": 0.63223195, "learning_rate": 1.2744307800183297e-06, "loss": 0.65343827, "num_input_tokens_seen": 225844095, "step": 10472, "time_per_iteration": 2.72947359085083 }, { "auxiliary_loss_clip": 0.01121891, "auxiliary_loss_mlp": 0.01039125, "balance_loss_clip": 1.04511738, "balance_loss_mlp": 1.02616739, "epoch": 0.6296708251916429, "flos": 24242934885120.0, "grad_norm": 1.6320866537592498, "language_loss": 0.69356817, "learning_rate": 1.2740678649438828e-06, "loss": 0.71517837, "num_input_tokens_seen": 225864310, "step": 10473, "time_per_iteration": 2.68420672416687 }, { "auxiliary_loss_clip": 0.01090218, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.03732657, "balance_loss_mlp": 1.02030838, "epoch": 0.6297309484443109, "flos": 19278536077440.0, "grad_norm": 1.63494515725041, "language_loss": 0.7420494, "learning_rate": 1.2737049773968554e-06, "loss": 0.7632792, "num_input_tokens_seen": 225883830, "step": 10474, "time_per_iteration": 2.7413995265960693 }, { "auxiliary_loss_clip": 0.01090194, "auxiliary_loss_mlp": 0.00769939, "balance_loss_clip": 1.03743196, "balance_loss_mlp": 1.0001384, "epoch": 0.6297910716969788, "flos": 30662685043200.0, "grad_norm": 1.4351205807606953, "language_loss": 0.66564953, "learning_rate": 1.2733421173910081e-06, "loss": 0.68425083, "num_input_tokens_seen": 225905755, "step": 10475, "time_per_iteration": 2.7660322189331055 }, { "auxiliary_loss_clip": 0.0106541, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.03863168, "balance_loss_mlp": 1.01878738, "epoch": 0.6298511949496468, "flos": 14423018371200.0, "grad_norm": 1.9836644906797416, "language_loss": 0.9036352, "learning_rate": 1.272979284940101e-06, "loss": 0.92459542, "num_input_tokens_seen": 225922155, "step": 10476, "time_per_iteration": 2.758232593536377 }, { "auxiliary_loss_clip": 0.01114316, "auxiliary_loss_mlp": 0.01035706, "balance_loss_clip": 1.04105282, "balance_loss_mlp": 1.02374947, "epoch": 0.6299113182023147, "flos": 23514163845120.0, "grad_norm": 5.4120485720423055, "language_loss": 0.75543785, "learning_rate": 1.2726164800578913e-06, "loss": 0.77693808, "num_input_tokens_seen": 225941060, "step": 10477, "time_per_iteration": 2.689332962036133 }, { "auxiliary_loss_clip": 0.01100017, "auxiliary_loss_mlp": 0.01035097, "balance_loss_clip": 1.03945518, "balance_loss_mlp": 1.02181101, "epoch": 0.6299714414549827, "flos": 22674500542080.0, "grad_norm": 1.792423931833335, "language_loss": 0.70299745, "learning_rate": 1.272253702758138e-06, "loss": 0.7243486, "num_input_tokens_seen": 225960870, "step": 10478, "time_per_iteration": 2.641702651977539 }, { "auxiliary_loss_clip": 0.011102, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.04167068, "balance_loss_mlp": 1.01943791, "epoch": 0.6300315647076506, "flos": 14501735026560.0, "grad_norm": 2.1836774795585012, "language_loss": 0.66761291, "learning_rate": 1.2718909530545974e-06, "loss": 0.68904316, "num_input_tokens_seen": 225977895, "step": 10479, "time_per_iteration": 2.6688246726989746 }, { "auxiliary_loss_clip": 0.01090005, "auxiliary_loss_mlp": 0.0077118, "balance_loss_clip": 1.03907907, "balance_loss_mlp": 1.0001682, "epoch": 0.6300916879603187, "flos": 21871681614720.0, "grad_norm": 2.512846896597075, "language_loss": 0.73645091, "learning_rate": 1.2715282309610245e-06, "loss": 0.7550627, "num_input_tokens_seen": 225997835, "step": 10480, "time_per_iteration": 2.7305657863616943 }, { "auxiliary_loss_clip": 0.011053, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.04060471, "balance_loss_mlp": 1.02189767, "epoch": 0.6301518112129866, "flos": 21834047139840.0, "grad_norm": 1.8722485238317301, "language_loss": 0.79015726, "learning_rate": 1.2711655364911744e-06, "loss": 0.81156552, "num_input_tokens_seen": 226017620, "step": 10481, "time_per_iteration": 2.687849283218384 }, { "auxiliary_loss_clip": 0.01021696, "auxiliary_loss_mlp": 0.01011899, "balance_loss_clip": 1.01580834, "balance_loss_mlp": 1.01079035, "epoch": 0.6302119344656546, "flos": 44334237957120.0, "grad_norm": 0.8976146461078123, "language_loss": 0.61833119, "learning_rate": 1.2708028696588e-06, "loss": 0.63866711, "num_input_tokens_seen": 226068755, "step": 10482, "time_per_iteration": 3.008683681488037 }, { "auxiliary_loss_clip": 0.01109585, "auxiliary_loss_mlp": 0.01034347, "balance_loss_clip": 1.04106355, "balance_loss_mlp": 1.02004182, "epoch": 0.6302720577183225, "flos": 11217919800960.0, "grad_norm": 2.2108979789482635, "language_loss": 0.8277266, "learning_rate": 1.2704402304776541e-06, "loss": 0.84916592, "num_input_tokens_seen": 226084395, "step": 10483, "time_per_iteration": 2.623480796813965 }, { "auxiliary_loss_clip": 0.01094195, "auxiliary_loss_mlp": 0.01042488, "balance_loss_clip": 1.03946197, "balance_loss_mlp": 1.03022778, "epoch": 0.6303321809709905, "flos": 27964932122880.0, "grad_norm": 1.5219185358756147, "language_loss": 0.72691327, "learning_rate": 1.270077618961487e-06, "loss": 0.74828005, "num_input_tokens_seen": 226105890, "step": 10484, "time_per_iteration": 2.7577946186065674 }, { "auxiliary_loss_clip": 0.0108643, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.04017258, "balance_loss_mlp": 1.01970792, "epoch": 0.6303923042236586, "flos": 28220759763840.0, "grad_norm": 2.7543040419083606, "language_loss": 0.74625325, "learning_rate": 1.2697150351240506e-06, "loss": 0.76744819, "num_input_tokens_seen": 226126760, "step": 10485, "time_per_iteration": 2.8124029636383057 }, { "auxiliary_loss_clip": 0.01093712, "auxiliary_loss_mlp": 0.00771476, "balance_loss_clip": 1.04156017, "balance_loss_mlp": 1.00019419, "epoch": 0.6304524274763265, "flos": 27631034271360.0, "grad_norm": 1.7508926529215563, "language_loss": 0.81359017, "learning_rate": 1.269352478979093e-06, "loss": 0.83224207, "num_input_tokens_seen": 226147315, "step": 10486, "time_per_iteration": 2.8222594261169434 }, { "auxiliary_loss_clip": 0.0109264, "auxiliary_loss_mlp": 0.01040277, "balance_loss_clip": 1.04081047, "balance_loss_mlp": 1.02773643, "epoch": 0.6305125507289945, "flos": 17311313963520.0, "grad_norm": 1.7524407832841304, "language_loss": 0.63269603, "learning_rate": 1.2689899505403628e-06, "loss": 0.6540252, "num_input_tokens_seen": 226165935, "step": 10487, "time_per_iteration": 2.629199743270874 }, { "auxiliary_loss_clip": 0.01116472, "auxiliary_loss_mlp": 0.01040106, "balance_loss_clip": 1.04161322, "balance_loss_mlp": 1.0270344, "epoch": 0.6305726739816624, "flos": 25808280658560.0, "grad_norm": 1.6120412913951392, "language_loss": 0.66997957, "learning_rate": 1.2686274498216065e-06, "loss": 0.69154537, "num_input_tokens_seen": 226186890, "step": 10488, "time_per_iteration": 4.3398730754852295 }, { "auxiliary_loss_clip": 0.01096551, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.04035902, "balance_loss_mlp": 1.02013993, "epoch": 0.6306327972343304, "flos": 21797454159360.0, "grad_norm": 1.6559636367213997, "language_loss": 0.67318177, "learning_rate": 1.2682649768365706e-06, "loss": 0.69447345, "num_input_tokens_seen": 226206710, "step": 10489, "time_per_iteration": 4.3245344161987305 }, { "auxiliary_loss_clip": 0.01079741, "auxiliary_loss_mlp": 0.01044256, "balance_loss_clip": 1.03847003, "balance_loss_mlp": 1.02838886, "epoch": 0.6306929204869983, "flos": 20777375819520.0, "grad_norm": 1.8294067402999528, "language_loss": 0.6980201, "learning_rate": 1.2679025315990007e-06, "loss": 0.7192601, "num_input_tokens_seen": 226225565, "step": 10490, "time_per_iteration": 2.7364768981933594 }, { "auxiliary_loss_clip": 0.0109348, "auxiliary_loss_mlp": 0.01037174, "balance_loss_clip": 1.03807712, "balance_loss_mlp": 1.02385783, "epoch": 0.6307530437396663, "flos": 23654214973440.0, "grad_norm": 3.3228808138384545, "language_loss": 0.78209651, "learning_rate": 1.2675401141226393e-06, "loss": 0.80340308, "num_input_tokens_seen": 226243680, "step": 10491, "time_per_iteration": 4.192841053009033 }, { "auxiliary_loss_clip": 0.01089569, "auxiliary_loss_mlp": 0.01036924, "balance_loss_clip": 1.03836989, "balance_loss_mlp": 1.02435327, "epoch": 0.6308131669923343, "flos": 24719002767360.0, "grad_norm": 2.408793546436542, "language_loss": 0.55951095, "learning_rate": 1.2671777244212308e-06, "loss": 0.58077586, "num_input_tokens_seen": 226264345, "step": 10492, "time_per_iteration": 2.7634830474853516 }, { "auxiliary_loss_clip": 0.01118182, "auxiliary_loss_mlp": 0.01040842, "balance_loss_clip": 1.04113233, "balance_loss_mlp": 1.026793, "epoch": 0.6308732902450023, "flos": 22565403959040.0, "grad_norm": 1.8001504389218699, "language_loss": 0.64376915, "learning_rate": 1.2668153625085168e-06, "loss": 0.66535938, "num_input_tokens_seen": 226283165, "step": 10493, "time_per_iteration": 2.617398977279663 }, { "auxiliary_loss_clip": 0.01079208, "auxiliary_loss_mlp": 0.01031715, "balance_loss_clip": 1.03931165, "balance_loss_mlp": 1.01834536, "epoch": 0.6309334134976702, "flos": 24644200694400.0, "grad_norm": 1.3815551057795799, "language_loss": 0.82869065, "learning_rate": 1.2664530283982367e-06, "loss": 0.84979987, "num_input_tokens_seen": 226304080, "step": 10494, "time_per_iteration": 2.9209089279174805 }, { "auxiliary_loss_clip": 0.01102712, "auxiliary_loss_mlp": 0.01035887, "balance_loss_clip": 1.04531574, "balance_loss_mlp": 1.02259517, "epoch": 0.6309935367503382, "flos": 41427949651200.0, "grad_norm": 1.8103540070682869, "language_loss": 0.79647011, "learning_rate": 1.2660907221041317e-06, "loss": 0.81785613, "num_input_tokens_seen": 226325925, "step": 10495, "time_per_iteration": 2.913984775543213 }, { "auxiliary_loss_clip": 0.0108712, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 1.03742623, "balance_loss_mlp": 1.02182817, "epoch": 0.6310536600030061, "flos": 15118931445120.0, "grad_norm": 1.9837558740535257, "language_loss": 0.70338362, "learning_rate": 1.2657284436399403e-06, "loss": 0.72461271, "num_input_tokens_seen": 226344190, "step": 10496, "time_per_iteration": 4.195697546005249 }, { "auxiliary_loss_clip": 0.01097081, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.04069757, "balance_loss_mlp": 1.02359533, "epoch": 0.6311137832556741, "flos": 15231619388160.0, "grad_norm": 2.0479454703454616, "language_loss": 0.79674435, "learning_rate": 1.2653661930193997e-06, "loss": 0.81808245, "num_input_tokens_seen": 226361520, "step": 10497, "time_per_iteration": 2.7244081497192383 }, { "auxiliary_loss_clip": 0.01080809, "auxiliary_loss_mlp": 0.01033562, "balance_loss_clip": 1.03673339, "balance_loss_mlp": 1.02134275, "epoch": 0.6311739065083422, "flos": 22018664067840.0, "grad_norm": 1.9007003272679206, "language_loss": 0.73755234, "learning_rate": 1.265003970256247e-06, "loss": 0.75869608, "num_input_tokens_seen": 226381920, "step": 10498, "time_per_iteration": 2.702826976776123 }, { "auxiliary_loss_clip": 0.01106258, "auxiliary_loss_mlp": 0.01033967, "balance_loss_clip": 1.03932881, "balance_loss_mlp": 1.02077663, "epoch": 0.6312340297610101, "flos": 22710770300160.0, "grad_norm": 2.137540621016438, "language_loss": 0.70001101, "learning_rate": 1.264641775364217e-06, "loss": 0.72141325, "num_input_tokens_seen": 226400035, "step": 10499, "time_per_iteration": 2.6359314918518066 }, { "auxiliary_loss_clip": 0.01105058, "auxiliary_loss_mlp": 0.0104466, "balance_loss_clip": 1.04247713, "balance_loss_mlp": 1.03126705, "epoch": 0.6312941530136781, "flos": 24280102483200.0, "grad_norm": 1.7496076109467864, "language_loss": 0.69836605, "learning_rate": 1.2642796083570448e-06, "loss": 0.7198633, "num_input_tokens_seen": 226418280, "step": 10500, "time_per_iteration": 2.6434264183044434 }, { "auxiliary_loss_clip": 0.01117728, "auxiliary_loss_mlp": 0.01037176, "balance_loss_clip": 1.04233432, "balance_loss_mlp": 1.02433133, "epoch": 0.631354276266346, "flos": 21725956137600.0, "grad_norm": 1.767641766149829, "language_loss": 0.74439371, "learning_rate": 1.2639174692484634e-06, "loss": 0.76594275, "num_input_tokens_seen": 226436650, "step": 10501, "time_per_iteration": 2.6442511081695557 }, { "auxiliary_loss_clip": 0.01104233, "auxiliary_loss_mlp": 0.00770378, "balance_loss_clip": 1.04097271, "balance_loss_mlp": 1.00013256, "epoch": 0.631414399519014, "flos": 24025100855040.0, "grad_norm": 2.125617189575791, "language_loss": 0.75111711, "learning_rate": 1.2635553580522053e-06, "loss": 0.76986325, "num_input_tokens_seen": 226456275, "step": 10502, "time_per_iteration": 2.6732592582702637 }, { "auxiliary_loss_clip": 0.01108933, "auxiliary_loss_mlp": 0.01052555, "balance_loss_clip": 1.04151106, "balance_loss_mlp": 1.03879273, "epoch": 0.6314745227716819, "flos": 24315797623680.0, "grad_norm": 2.013663319345679, "language_loss": 0.85323668, "learning_rate": 1.2631932747820022e-06, "loss": 0.87485158, "num_input_tokens_seen": 226473610, "step": 10503, "time_per_iteration": 2.7602460384368896 }, { "auxiliary_loss_clip": 0.01084517, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.03906107, "balance_loss_mlp": 1.02097487, "epoch": 0.6315346460243499, "flos": 23366391292800.0, "grad_norm": 1.6896389995545142, "language_loss": 0.86806571, "learning_rate": 1.2628312194515838e-06, "loss": 0.88925523, "num_input_tokens_seen": 226493665, "step": 10504, "time_per_iteration": 2.6560161113739014 }, { "auxiliary_loss_clip": 0.0108443, "auxiliary_loss_mlp": 0.0103934, "balance_loss_clip": 1.0409503, "balance_loss_mlp": 1.02557158, "epoch": 0.6315947692770179, "flos": 20260333497600.0, "grad_norm": 1.5595011849504998, "language_loss": 0.76756787, "learning_rate": 1.2624691920746793e-06, "loss": 0.78880554, "num_input_tokens_seen": 226511625, "step": 10505, "time_per_iteration": 2.7035913467407227 }, { "auxiliary_loss_clip": 0.01073251, "auxiliary_loss_mlp": 0.01035629, "balance_loss_clip": 1.03666878, "balance_loss_mlp": 1.02143097, "epoch": 0.6316548925296859, "flos": 25265850399360.0, "grad_norm": 2.3166055953098774, "language_loss": 0.81818491, "learning_rate": 1.2621071926650166e-06, "loss": 0.83927369, "num_input_tokens_seen": 226530085, "step": 10506, "time_per_iteration": 2.762647867202759 }, { "auxiliary_loss_clip": 0.01118108, "auxiliary_loss_mlp": 0.01035819, "balance_loss_clip": 1.0422647, "balance_loss_mlp": 1.02248573, "epoch": 0.6317150157823538, "flos": 22930579578240.0, "grad_norm": 1.8490757285143165, "language_loss": 0.74521178, "learning_rate": 1.2617452212363238e-06, "loss": 0.76675105, "num_input_tokens_seen": 226548115, "step": 10507, "time_per_iteration": 2.598595380783081 }, { "auxiliary_loss_clip": 0.01094729, "auxiliary_loss_mlp": 0.01038809, "balance_loss_clip": 1.04198813, "balance_loss_mlp": 1.02511764, "epoch": 0.6317751390350218, "flos": 22527051212160.0, "grad_norm": 2.137138504509131, "language_loss": 0.67884028, "learning_rate": 1.2613832778023258e-06, "loss": 0.7001757, "num_input_tokens_seen": 226567955, "step": 10508, "time_per_iteration": 2.6457536220550537 }, { "auxiliary_loss_clip": 0.01081753, "auxiliary_loss_mlp": 0.01033704, "balance_loss_clip": 1.03684628, "balance_loss_mlp": 1.02029264, "epoch": 0.6318352622876897, "flos": 23294749616640.0, "grad_norm": 1.726891076070715, "language_loss": 0.70810485, "learning_rate": 1.2610213623767478e-06, "loss": 0.72925943, "num_input_tokens_seen": 226588205, "step": 10509, "time_per_iteration": 2.7340633869171143 }, { "auxiliary_loss_clip": 0.01100032, "auxiliary_loss_mlp": 0.01030079, "balance_loss_clip": 1.0408802, "balance_loss_mlp": 1.01750255, "epoch": 0.6318953855403577, "flos": 20704082117760.0, "grad_norm": 2.059347572016265, "language_loss": 0.79585326, "learning_rate": 1.2606594749733143e-06, "loss": 0.81715441, "num_input_tokens_seen": 226606965, "step": 10510, "time_per_iteration": 2.7126991748809814 }, { "auxiliary_loss_clip": 0.01073398, "auxiliary_loss_mlp": 0.00771235, "balance_loss_clip": 1.03949821, "balance_loss_mlp": 1.00013995, "epoch": 0.6319555087930258, "flos": 22820046451200.0, "grad_norm": 2.029248251908187, "language_loss": 0.70844626, "learning_rate": 1.2602976156057469e-06, "loss": 0.72689259, "num_input_tokens_seen": 226627845, "step": 10511, "time_per_iteration": 2.862959384918213 }, { "auxiliary_loss_clip": 0.01113995, "auxiliary_loss_mlp": 0.01035402, "balance_loss_clip": 1.04076004, "balance_loss_mlp": 1.02298617, "epoch": 0.6320156320456937, "flos": 19970929618560.0, "grad_norm": 1.5814642404723724, "language_loss": 0.80147332, "learning_rate": 1.2599357842877684e-06, "loss": 0.82296729, "num_input_tokens_seen": 226645855, "step": 10512, "time_per_iteration": 2.599238872528076 }, { "auxiliary_loss_clip": 0.01104767, "auxiliary_loss_mlp": 0.01033707, "balance_loss_clip": 1.04045844, "balance_loss_mlp": 1.01971221, "epoch": 0.6320757552983617, "flos": 27013406889600.0, "grad_norm": 2.290319172186619, "language_loss": 0.70844841, "learning_rate": 1.2595739810330994e-06, "loss": 0.72983325, "num_input_tokens_seen": 226665375, "step": 10513, "time_per_iteration": 2.706372022628784 }, { "auxiliary_loss_clip": 0.01107929, "auxiliary_loss_mlp": 0.01034926, "balance_loss_clip": 1.03973472, "balance_loss_mlp": 1.02081192, "epoch": 0.6321358785510296, "flos": 23695943598720.0, "grad_norm": 2.242079914271032, "language_loss": 0.6665644, "learning_rate": 1.259212205855459e-06, "loss": 0.68799293, "num_input_tokens_seen": 226685270, "step": 10514, "time_per_iteration": 2.6768577098846436 }, { "auxiliary_loss_clip": 0.01080896, "auxiliary_loss_mlp": 0.0103395, "balance_loss_clip": 1.03646874, "balance_loss_mlp": 1.02114093, "epoch": 0.6321960018036976, "flos": 25995231970560.0, "grad_norm": 1.8993538704282873, "language_loss": 0.74367702, "learning_rate": 1.2588504587685663e-06, "loss": 0.76482546, "num_input_tokens_seen": 226705325, "step": 10515, "time_per_iteration": 2.8709843158721924 }, { "auxiliary_loss_clip": 0.01089992, "auxiliary_loss_mlp": 0.01031214, "balance_loss_clip": 1.04074252, "balance_loss_mlp": 1.01873255, "epoch": 0.6322561250563655, "flos": 22821016118400.0, "grad_norm": 1.7638160656735167, "language_loss": 0.90024698, "learning_rate": 1.2584887397861379e-06, "loss": 0.92145908, "num_input_tokens_seen": 226723815, "step": 10516, "time_per_iteration": 2.691826343536377 }, { "auxiliary_loss_clip": 0.0112538, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.04528499, "balance_loss_mlp": 1.02075589, "epoch": 0.6323162483090335, "flos": 18988413926400.0, "grad_norm": 1.6560830086526979, "language_loss": 0.81829578, "learning_rate": 1.2581270489218911e-06, "loss": 0.83990955, "num_input_tokens_seen": 226741550, "step": 10517, "time_per_iteration": 2.620199203491211 }, { "auxiliary_loss_clip": 0.01061827, "auxiliary_loss_mlp": 0.01039321, "balance_loss_clip": 1.03930223, "balance_loss_mlp": 1.02642882, "epoch": 0.6323763715617015, "flos": 19865173000320.0, "grad_norm": 1.7035542921935394, "language_loss": 0.7784009, "learning_rate": 1.257765386189541e-06, "loss": 0.79941237, "num_input_tokens_seen": 226761115, "step": 10518, "time_per_iteration": 2.91979718208313 }, { "auxiliary_loss_clip": 0.01096755, "auxiliary_loss_mlp": 0.0103349, "balance_loss_clip": 1.03876209, "balance_loss_mlp": 1.02090716, "epoch": 0.6324364948143695, "flos": 22782699285120.0, "grad_norm": 1.44276453327461, "language_loss": 0.85200572, "learning_rate": 1.2574037516028018e-06, "loss": 0.87330812, "num_input_tokens_seen": 226782225, "step": 10519, "time_per_iteration": 2.74233078956604 }, { "auxiliary_loss_clip": 0.01088566, "auxiliary_loss_mlp": 0.01039518, "balance_loss_clip": 1.03878999, "balance_loss_mlp": 1.02666724, "epoch": 0.6324966180670374, "flos": 22235923480320.0, "grad_norm": 2.1806676145694692, "language_loss": 0.71964407, "learning_rate": 1.2570421451753867e-06, "loss": 0.74092495, "num_input_tokens_seen": 226802375, "step": 10520, "time_per_iteration": 2.682180404663086 }, { "auxiliary_loss_clip": 0.01103452, "auxiliary_loss_mlp": 0.01035272, "balance_loss_clip": 1.03956473, "balance_loss_mlp": 1.02224886, "epoch": 0.6325567413197054, "flos": 21689183589120.0, "grad_norm": 1.7702779314390575, "language_loss": 0.71439731, "learning_rate": 1.2566805669210081e-06, "loss": 0.73578453, "num_input_tokens_seen": 226822165, "step": 10521, "time_per_iteration": 2.657323122024536 }, { "auxiliary_loss_clip": 0.01076504, "auxiliary_loss_mlp": 0.01041724, "balance_loss_clip": 1.03893948, "balance_loss_mlp": 1.0255115, "epoch": 0.6326168645723733, "flos": 19937137898880.0, "grad_norm": 1.7329974565509776, "language_loss": 0.721259, "learning_rate": 1.256319016853377e-06, "loss": 0.74244124, "num_input_tokens_seen": 226841645, "step": 10522, "time_per_iteration": 2.746037721633911 }, { "auxiliary_loss_clip": 0.01074288, "auxiliary_loss_mlp": 0.01034292, "balance_loss_clip": 1.04106843, "balance_loss_mlp": 1.02167988, "epoch": 0.6326769878250413, "flos": 20230348619520.0, "grad_norm": 1.8934714872441534, "language_loss": 0.81941485, "learning_rate": 1.2559574949862023e-06, "loss": 0.84050065, "num_input_tokens_seen": 226860355, "step": 10523, "time_per_iteration": 2.761061906814575 }, { "auxiliary_loss_clip": 0.01103759, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.03989744, "balance_loss_mlp": 1.01712918, "epoch": 0.6327371110777094, "flos": 20775759707520.0, "grad_norm": 2.3750030560810163, "language_loss": 0.73983908, "learning_rate": 1.255596001333195e-06, "loss": 0.76118159, "num_input_tokens_seen": 226878390, "step": 10524, "time_per_iteration": 2.677591323852539 }, { "auxiliary_loss_clip": 0.01101897, "auxiliary_loss_mlp": 0.01041422, "balance_loss_clip": 1.04099619, "balance_loss_mlp": 1.02719402, "epoch": 0.6327972343303773, "flos": 30336544529280.0, "grad_norm": 1.9503552038514373, "language_loss": 0.84243858, "learning_rate": 1.2552345359080615e-06, "loss": 0.86387181, "num_input_tokens_seen": 226898420, "step": 10525, "time_per_iteration": 2.7905821800231934 }, { "auxiliary_loss_clip": 0.0108609, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.03651416, "balance_loss_mlp": 1.01617217, "epoch": 0.6328573575830453, "flos": 17092258871040.0, "grad_norm": 1.6646724041083503, "language_loss": 0.6700424, "learning_rate": 1.2548730987245093e-06, "loss": 0.6912058, "num_input_tokens_seen": 226916305, "step": 10526, "time_per_iteration": 2.658766031265259 }, { "auxiliary_loss_clip": 0.01111357, "auxiliary_loss_mlp": 0.01036081, "balance_loss_clip": 1.04416919, "balance_loss_mlp": 1.02141845, "epoch": 0.6329174808357132, "flos": 25047154442880.0, "grad_norm": 2.0355958158409346, "language_loss": 0.73648405, "learning_rate": 1.254511689796244e-06, "loss": 0.75795841, "num_input_tokens_seen": 226937705, "step": 10527, "time_per_iteration": 5.2298712730407715 }, { "auxiliary_loss_clip": 0.01105368, "auxiliary_loss_mlp": 0.01035127, "balance_loss_clip": 1.04319382, "balance_loss_mlp": 1.02256858, "epoch": 0.6329776040883812, "flos": 16836826279680.0, "grad_norm": 2.5914253744426614, "language_loss": 0.71704459, "learning_rate": 1.2541503091369693e-06, "loss": 0.73844951, "num_input_tokens_seen": 226954880, "step": 10528, "time_per_iteration": 2.6561360359191895 }, { "auxiliary_loss_clip": 0.01104345, "auxiliary_loss_mlp": 0.01031745, "balance_loss_clip": 1.04158008, "balance_loss_mlp": 1.01763082, "epoch": 0.6330377273410491, "flos": 13516705382400.0, "grad_norm": 1.8004698597026916, "language_loss": 0.66514266, "learning_rate": 1.2537889567603905e-06, "loss": 0.68650359, "num_input_tokens_seen": 226972595, "step": 10529, "time_per_iteration": 4.169236421585083 }, { "auxiliary_loss_clip": 0.01109158, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.0410428, "balance_loss_mlp": 1.01895428, "epoch": 0.6330978505937171, "flos": 21538825257600.0, "grad_norm": 2.1634257180763545, "language_loss": 0.75199169, "learning_rate": 1.2534276326802092e-06, "loss": 0.77341741, "num_input_tokens_seen": 226991910, "step": 10530, "time_per_iteration": 4.1243627071380615 }, { "auxiliary_loss_clip": 0.0111004, "auxiliary_loss_mlp": 0.00770904, "balance_loss_clip": 1.04529655, "balance_loss_mlp": 1.00030541, "epoch": 0.6331579738463851, "flos": 25009484054400.0, "grad_norm": 1.5033967127528767, "language_loss": 0.73765004, "learning_rate": 1.2530663369101259e-06, "loss": 0.75645947, "num_input_tokens_seen": 227010175, "step": 10531, "time_per_iteration": 2.757310152053833 }, { "auxiliary_loss_clip": 0.010819, "auxiliary_loss_mlp": 0.0103456, "balance_loss_clip": 1.0428102, "balance_loss_mlp": 1.02120292, "epoch": 0.6332180970990531, "flos": 14976007228800.0, "grad_norm": 2.152892996011048, "language_loss": 0.79560679, "learning_rate": 1.2527050694638432e-06, "loss": 0.81677139, "num_input_tokens_seen": 227025540, "step": 10532, "time_per_iteration": 2.693357229232788 }, { "auxiliary_loss_clip": 0.01106096, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.04273748, "balance_loss_mlp": 1.02105284, "epoch": 0.633278220351721, "flos": 22706963458560.0, "grad_norm": 1.5569394240480623, "language_loss": 0.74720097, "learning_rate": 1.2523438303550582e-06, "loss": 0.76859295, "num_input_tokens_seen": 227045520, "step": 10533, "time_per_iteration": 2.6261446475982666 }, { "auxiliary_loss_clip": 0.01096787, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.04458022, "balance_loss_mlp": 1.02473903, "epoch": 0.633338343604389, "flos": 12602922364800.0, "grad_norm": 2.379717167307364, "language_loss": 0.77104855, "learning_rate": 1.2519826195974706e-06, "loss": 0.79240417, "num_input_tokens_seen": 227059420, "step": 10534, "time_per_iteration": 2.6211531162261963 }, { "auxiliary_loss_clip": 0.01080216, "auxiliary_loss_mlp": 0.01043157, "balance_loss_clip": 1.03751063, "balance_loss_mlp": 1.02861977, "epoch": 0.6333984668570569, "flos": 25960111447680.0, "grad_norm": 1.7545098866738538, "language_loss": 0.86108071, "learning_rate": 1.251621437204777e-06, "loss": 0.88231444, "num_input_tokens_seen": 227081310, "step": 10535, "time_per_iteration": 4.269057035446167 }, { "auxiliary_loss_clip": 0.01110282, "auxiliary_loss_mlp": 0.01037711, "balance_loss_clip": 1.04232645, "balance_loss_mlp": 1.02399635, "epoch": 0.6334585901097249, "flos": 23659242877440.0, "grad_norm": 1.7414784178378062, "language_loss": 0.76938647, "learning_rate": 1.2512602831906733e-06, "loss": 0.79086637, "num_input_tokens_seen": 227100365, "step": 10536, "time_per_iteration": 2.6666407585144043 }, { "auxiliary_loss_clip": 0.01102168, "auxiliary_loss_mlp": 0.01038535, "balance_loss_clip": 1.04189527, "balance_loss_mlp": 1.02443218, "epoch": 0.633518713362393, "flos": 28760496503040.0, "grad_norm": 2.0392502828924353, "language_loss": 0.60273743, "learning_rate": 1.250899157568855e-06, "loss": 0.62414443, "num_input_tokens_seen": 227119680, "step": 10537, "time_per_iteration": 2.7295584678649902 }, { "auxiliary_loss_clip": 0.01012372, "auxiliary_loss_mlp": 0.01000462, "balance_loss_clip": 1.01797509, "balance_loss_mlp": 0.99907935, "epoch": 0.6335788366150609, "flos": 70420322401920.0, "grad_norm": 0.7714446209447136, "language_loss": 0.52451682, "learning_rate": 1.2505380603530155e-06, "loss": 0.54464519, "num_input_tokens_seen": 227184465, "step": 10538, "time_per_iteration": 3.3442068099975586 }, { "auxiliary_loss_clip": 0.01100864, "auxiliary_loss_mlp": 0.01035126, "balance_loss_clip": 1.04384637, "balance_loss_mlp": 1.02057028, "epoch": 0.6336389598677289, "flos": 23732069702400.0, "grad_norm": 1.8384221769935791, "language_loss": 0.83274323, "learning_rate": 1.250176991556848e-06, "loss": 0.85410309, "num_input_tokens_seen": 227202185, "step": 10539, "time_per_iteration": 2.696904182434082 }, { "auxiliary_loss_clip": 0.01090255, "auxiliary_loss_mlp": 0.01032507, "balance_loss_clip": 1.03990459, "balance_loss_mlp": 1.01798737, "epoch": 0.6336990831203968, "flos": 29276676898560.0, "grad_norm": 1.6347430731245383, "language_loss": 0.86721331, "learning_rate": 1.2498159511940438e-06, "loss": 0.88844097, "num_input_tokens_seen": 227222020, "step": 10540, "time_per_iteration": 2.7495079040527344 }, { "auxiliary_loss_clip": 0.01091229, "auxiliary_loss_mlp": 0.01034107, "balance_loss_clip": 1.04014003, "balance_loss_mlp": 1.02244198, "epoch": 0.6337592063730648, "flos": 29096836479360.0, "grad_norm": 2.116079588237037, "language_loss": 0.7269882, "learning_rate": 1.2494549392782943e-06, "loss": 0.74824154, "num_input_tokens_seen": 227240885, "step": 10541, "time_per_iteration": 2.750035285949707 }, { "auxiliary_loss_clip": 0.01111525, "auxiliary_loss_mlp": 0.010355, "balance_loss_clip": 1.04309511, "balance_loss_mlp": 1.02114677, "epoch": 0.6338193296257327, "flos": 34706477249280.0, "grad_norm": 2.608261813881904, "language_loss": 0.85043848, "learning_rate": 1.2490939558232887e-06, "loss": 0.87190866, "num_input_tokens_seen": 227257880, "step": 10542, "time_per_iteration": 2.7066802978515625 }, { "auxiliary_loss_clip": 0.01107251, "auxiliary_loss_mlp": 0.01033519, "balance_loss_clip": 1.04289162, "balance_loss_mlp": 1.01898777, "epoch": 0.6338794528784008, "flos": 16687581269760.0, "grad_norm": 1.8074408618170101, "language_loss": 0.77832586, "learning_rate": 1.2487330008427153e-06, "loss": 0.79973352, "num_input_tokens_seen": 227274840, "step": 10543, "time_per_iteration": 2.6362385749816895 }, { "auxiliary_loss_clip": 0.01065317, "auxiliary_loss_mlp": 0.0104211, "balance_loss_clip": 1.04040122, "balance_loss_mlp": 1.02933073, "epoch": 0.6339395761310687, "flos": 22346600261760.0, "grad_norm": 1.5926861927585991, "language_loss": 0.73305023, "learning_rate": 1.2483720743502618e-06, "loss": 0.75412452, "num_input_tokens_seen": 227294835, "step": 10544, "time_per_iteration": 2.7428245544433594 }, { "auxiliary_loss_clip": 0.01089874, "auxiliary_loss_mlp": 0.01039428, "balance_loss_clip": 1.04020858, "balance_loss_mlp": 1.02617836, "epoch": 0.6339996993837367, "flos": 18551812112640.0, "grad_norm": 4.4072583606750895, "language_loss": 0.68668348, "learning_rate": 1.2480111763596144e-06, "loss": 0.70797652, "num_input_tokens_seen": 227314935, "step": 10545, "time_per_iteration": 2.8335583209991455 }, { "auxiliary_loss_clip": 0.01092777, "auxiliary_loss_mlp": 0.01037678, "balance_loss_clip": 1.03954399, "balance_loss_mlp": 1.02418935, "epoch": 0.6340598226364046, "flos": 12969498614400.0, "grad_norm": 1.9287987147307617, "language_loss": 0.70950794, "learning_rate": 1.2476503068844592e-06, "loss": 0.73081255, "num_input_tokens_seen": 227332905, "step": 10546, "time_per_iteration": 2.6343114376068115 }, { "auxiliary_loss_clip": 0.01103009, "auxiliary_loss_mlp": 0.01031279, "balance_loss_clip": 1.0436604, "balance_loss_mlp": 1.01867259, "epoch": 0.6341199458890726, "flos": 26687984647680.0, "grad_norm": 1.2499026086544156, "language_loss": 0.77873629, "learning_rate": 1.2472894659384792e-06, "loss": 0.80007923, "num_input_tokens_seen": 227354915, "step": 10547, "time_per_iteration": 2.704674005508423 }, { "auxiliary_loss_clip": 0.01072985, "auxiliary_loss_mlp": 0.0104046, "balance_loss_clip": 1.03441143, "balance_loss_mlp": 1.02732289, "epoch": 0.6341800691417405, "flos": 18734274224640.0, "grad_norm": 1.6184133650868997, "language_loss": 0.62827075, "learning_rate": 1.2469286535353578e-06, "loss": 0.64940524, "num_input_tokens_seen": 227372990, "step": 10548, "time_per_iteration": 2.7401933670043945 }, { "auxiliary_loss_clip": 0.01089619, "auxiliary_loss_mlp": 0.01038454, "balance_loss_clip": 1.03783989, "balance_loss_mlp": 1.02509081, "epoch": 0.6342401923944085, "flos": 26249443499520.0, "grad_norm": 2.3059628412520308, "language_loss": 0.62195736, "learning_rate": 1.2465678696887785e-06, "loss": 0.64323807, "num_input_tokens_seen": 227393270, "step": 10549, "time_per_iteration": 2.825896739959717 }, { "auxiliary_loss_clip": 0.0106782, "auxiliary_loss_mlp": 0.01035303, "balance_loss_clip": 1.0408318, "balance_loss_mlp": 1.02268422, "epoch": 0.6343003156470765, "flos": 24680937329280.0, "grad_norm": 1.71498279606421, "language_loss": 0.73401284, "learning_rate": 1.2462071144124197e-06, "loss": 0.75504404, "num_input_tokens_seen": 227413630, "step": 10550, "time_per_iteration": 2.780163049697876 }, { "auxiliary_loss_clip": 0.0100437, "auxiliary_loss_mlp": 0.01001031, "balance_loss_clip": 1.0126493, "balance_loss_mlp": 0.99974936, "epoch": 0.6343604388997445, "flos": 69805352626560.0, "grad_norm": 0.6910389749764038, "language_loss": 0.57719415, "learning_rate": 1.2458463877199638e-06, "loss": 0.59724814, "num_input_tokens_seen": 227476630, "step": 10551, "time_per_iteration": 3.286808729171753 }, { "auxiliary_loss_clip": 0.01082742, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.04196656, "balance_loss_mlp": 1.01796162, "epoch": 0.6344205621524125, "flos": 21982430223360.0, "grad_norm": 1.74505505177434, "language_loss": 0.67322063, "learning_rate": 1.2454856896250881e-06, "loss": 0.69434893, "num_input_tokens_seen": 227496060, "step": 10552, "time_per_iteration": 2.7764453887939453 }, { "auxiliary_loss_clip": 0.01080056, "auxiliary_loss_mlp": 0.01034576, "balance_loss_clip": 1.03920615, "balance_loss_mlp": 1.02086091, "epoch": 0.6344806854050804, "flos": 20448865008000.0, "grad_norm": 1.5562703117807677, "language_loss": 0.81798071, "learning_rate": 1.24512502014147e-06, "loss": 0.839127, "num_input_tokens_seen": 227513440, "step": 10553, "time_per_iteration": 2.7851717472076416 }, { "auxiliary_loss_clip": 0.01106231, "auxiliary_loss_mlp": 0.0103609, "balance_loss_clip": 1.04020214, "balance_loss_mlp": 1.02246475, "epoch": 0.6345408086577484, "flos": 40510611187200.0, "grad_norm": 1.7532654974316204, "language_loss": 0.5476743, "learning_rate": 1.2447643792827879e-06, "loss": 0.56909752, "num_input_tokens_seen": 227535395, "step": 10554, "time_per_iteration": 2.79447078704834 }, { "auxiliary_loss_clip": 0.01096611, "auxiliary_loss_mlp": 0.01034981, "balance_loss_clip": 1.0413723, "balance_loss_mlp": 1.02187991, "epoch": 0.6346009319104163, "flos": 21361319222400.0, "grad_norm": 2.4671241924977583, "language_loss": 0.70400488, "learning_rate": 1.2444037670627153e-06, "loss": 0.72532082, "num_input_tokens_seen": 227554545, "step": 10555, "time_per_iteration": 2.6849427223205566 }, { "auxiliary_loss_clip": 0.01017602, "auxiliary_loss_mlp": 0.01006112, "balance_loss_clip": 1.0127604, "balance_loss_mlp": 1.00490761, "epoch": 0.6346610551630844, "flos": 71365419100800.0, "grad_norm": 0.773594882523352, "language_loss": 0.55296588, "learning_rate": 1.2440431834949276e-06, "loss": 0.57320297, "num_input_tokens_seen": 227608575, "step": 10556, "time_per_iteration": 3.1463379859924316 }, { "auxiliary_loss_clip": 0.01095791, "auxiliary_loss_mlp": 0.01031445, "balance_loss_clip": 1.0396291, "balance_loss_mlp": 1.01756358, "epoch": 0.6347211784157523, "flos": 25411504049280.0, "grad_norm": 2.5502749141285848, "language_loss": 0.67922962, "learning_rate": 1.2436826285930985e-06, "loss": 0.70050198, "num_input_tokens_seen": 227628175, "step": 10557, "time_per_iteration": 2.693422794342041 }, { "auxiliary_loss_clip": 0.0108673, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.03953815, "balance_loss_mlp": 1.02604496, "epoch": 0.6347813016684203, "flos": 15742735966080.0, "grad_norm": 1.602709548432784, "language_loss": 0.70369065, "learning_rate": 1.2433221023709002e-06, "loss": 0.72494584, "num_input_tokens_seen": 227645330, "step": 10558, "time_per_iteration": 2.671268939971924 }, { "auxiliary_loss_clip": 0.01083073, "auxiliary_loss_mlp": 0.01034562, "balance_loss_clip": 1.03938115, "balance_loss_mlp": 1.02120471, "epoch": 0.6348414249210882, "flos": 21464777370240.0, "grad_norm": 1.4417814449763804, "language_loss": 0.78316975, "learning_rate": 1.2429616048420031e-06, "loss": 0.80434608, "num_input_tokens_seen": 227665250, "step": 10559, "time_per_iteration": 2.7575199604034424 }, { "auxiliary_loss_clip": 0.01090706, "auxiliary_loss_mlp": 0.01041541, "balance_loss_clip": 1.03786755, "balance_loss_mlp": 1.02740252, "epoch": 0.6349015481737562, "flos": 21653057485440.0, "grad_norm": 1.8349318523473441, "language_loss": 0.67984653, "learning_rate": 1.242601136020078e-06, "loss": 0.70116907, "num_input_tokens_seen": 227685070, "step": 10560, "time_per_iteration": 2.6403374671936035 }, { "auxiliary_loss_clip": 0.01089304, "auxiliary_loss_mlp": 0.01045217, "balance_loss_clip": 1.03931737, "balance_loss_mlp": 1.03085184, "epoch": 0.6349616714264241, "flos": 22194984954240.0, "grad_norm": 1.606240636171636, "language_loss": 0.76797289, "learning_rate": 1.2422406959187939e-06, "loss": 0.78931808, "num_input_tokens_seen": 227704430, "step": 10561, "time_per_iteration": 2.7372517585754395 }, { "auxiliary_loss_clip": 0.01093461, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.03962195, "balance_loss_mlp": 1.02203918, "epoch": 0.6350217946790921, "flos": 25410354814080.0, "grad_norm": 2.1365474752692966, "language_loss": 0.71962273, "learning_rate": 1.2418802845518178e-06, "loss": 0.74090809, "num_input_tokens_seen": 227724920, "step": 10562, "time_per_iteration": 2.7133450508117676 }, { "auxiliary_loss_clip": 0.01105126, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.04334474, "balance_loss_mlp": 1.02005243, "epoch": 0.63508191793176, "flos": 19718944732800.0, "grad_norm": 2.0107972952363413, "language_loss": 0.80757058, "learning_rate": 1.2415199019328185e-06, "loss": 0.8289668, "num_input_tokens_seen": 227743400, "step": 10563, "time_per_iteration": 2.6585617065429688 }, { "auxiliary_loss_clip": 0.01091086, "auxiliary_loss_mlp": 0.01038953, "balance_loss_clip": 1.04419041, "balance_loss_mlp": 1.02567887, "epoch": 0.6351420411844281, "flos": 18186923802240.0, "grad_norm": 2.444256209228289, "language_loss": 0.81206977, "learning_rate": 1.2411595480754597e-06, "loss": 0.83337021, "num_input_tokens_seen": 227759990, "step": 10564, "time_per_iteration": 2.705941915512085 }, { "auxiliary_loss_clip": 0.01087784, "auxiliary_loss_mlp": 0.01045814, "balance_loss_clip": 1.04181719, "balance_loss_mlp": 1.03100812, "epoch": 0.6352021644370961, "flos": 33726511422720.0, "grad_norm": 1.5889053443954093, "language_loss": 0.72453761, "learning_rate": 1.240799222993407e-06, "loss": 0.74587357, "num_input_tokens_seen": 227780835, "step": 10565, "time_per_iteration": 2.765345335006714 }, { "auxiliary_loss_clip": 0.01102461, "auxiliary_loss_mlp": 0.01033958, "balance_loss_clip": 1.04256928, "balance_loss_mlp": 1.01919961, "epoch": 0.635262287689764, "flos": 20374781207040.0, "grad_norm": 2.121063161403432, "language_loss": 0.69596386, "learning_rate": 1.240438926700324e-06, "loss": 0.71732807, "num_input_tokens_seen": 227798580, "step": 10566, "time_per_iteration": 4.550225496292114 }, { "auxiliary_loss_clip": 0.01103568, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.04312527, "balance_loss_mlp": 1.0210278, "epoch": 0.635322410942432, "flos": 27525421307520.0, "grad_norm": 1.5800197118440122, "language_loss": 0.69619238, "learning_rate": 1.2400786592098725e-06, "loss": 0.71755934, "num_input_tokens_seen": 227819210, "step": 10567, "time_per_iteration": 2.6888957023620605 }, { "auxiliary_loss_clip": 0.01100039, "auxiliary_loss_mlp": 0.01031317, "balance_loss_clip": 1.04216862, "balance_loss_mlp": 1.01925862, "epoch": 0.6353825341950999, "flos": 21543601766400.0, "grad_norm": 2.2757897203537976, "language_loss": 0.8449024, "learning_rate": 1.2397184205357154e-06, "loss": 0.86621594, "num_input_tokens_seen": 227838340, "step": 10568, "time_per_iteration": 4.255465030670166 }, { "auxiliary_loss_clip": 0.01056215, "auxiliary_loss_mlp": 0.01041007, "balance_loss_clip": 1.03819847, "balance_loss_mlp": 1.026559, "epoch": 0.635442657447768, "flos": 31759756185600.0, "grad_norm": 1.8323936037096342, "language_loss": 0.84063637, "learning_rate": 1.2393582106915113e-06, "loss": 0.86160862, "num_input_tokens_seen": 227859170, "step": 10569, "time_per_iteration": 4.377737760543823 }, { "auxiliary_loss_clip": 0.01104285, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.04183245, "balance_loss_mlp": 1.01939797, "epoch": 0.6355027807004359, "flos": 19828831415040.0, "grad_norm": 1.6700504081300207, "language_loss": 0.69352221, "learning_rate": 1.2389980296909198e-06, "loss": 0.71489245, "num_input_tokens_seen": 227878545, "step": 10570, "time_per_iteration": 2.6112160682678223 }, { "auxiliary_loss_clip": 0.01107497, "auxiliary_loss_mlp": 0.01036944, "balance_loss_clip": 1.04085815, "balance_loss_mlp": 1.02342606, "epoch": 0.6355629039531039, "flos": 30372383324160.0, "grad_norm": 1.7288699826037912, "language_loss": 0.65762198, "learning_rate": 1.2386378775476e-06, "loss": 0.67906642, "num_input_tokens_seen": 227898875, "step": 10571, "time_per_iteration": 2.7335216999053955 }, { "auxiliary_loss_clip": 0.01113018, "auxiliary_loss_mlp": 0.01029154, "balance_loss_clip": 1.04446983, "balance_loss_mlp": 1.01616585, "epoch": 0.6356230272057718, "flos": 17932065828480.0, "grad_norm": 1.9788287371045428, "language_loss": 0.71541518, "learning_rate": 1.2382777542752074e-06, "loss": 0.73683691, "num_input_tokens_seen": 227917130, "step": 10572, "time_per_iteration": 2.6052427291870117 }, { "auxiliary_loss_clip": 0.01084769, "auxiliary_loss_mlp": 0.01034, "balance_loss_clip": 1.04089427, "balance_loss_mlp": 1.02181661, "epoch": 0.6356831504584398, "flos": 25375844822400.0, "grad_norm": 1.6900483013767176, "language_loss": 0.81165767, "learning_rate": 1.2379176598873992e-06, "loss": 0.83284533, "num_input_tokens_seen": 227939550, "step": 10573, "time_per_iteration": 2.8153634071350098 }, { "auxiliary_loss_clip": 0.0109877, "auxiliary_loss_mlp": 0.01033009, "balance_loss_clip": 1.04272556, "balance_loss_mlp": 1.02006316, "epoch": 0.6357432737111077, "flos": 46500331720320.0, "grad_norm": 1.6632630908080246, "language_loss": 0.68936265, "learning_rate": 1.2375575943978303e-06, "loss": 0.71068037, "num_input_tokens_seen": 227962200, "step": 10574, "time_per_iteration": 4.407367467880249 }, { "auxiliary_loss_clip": 0.01116558, "auxiliary_loss_mlp": 0.01031438, "balance_loss_clip": 1.04334235, "balance_loss_mlp": 1.01825356, "epoch": 0.6358033969637757, "flos": 17274361847040.0, "grad_norm": 2.216480993085757, "language_loss": 0.86364478, "learning_rate": 1.2371975578201525e-06, "loss": 0.88512474, "num_input_tokens_seen": 227979270, "step": 10575, "time_per_iteration": 2.59047532081604 }, { "auxiliary_loss_clip": 0.01116011, "auxiliary_loss_mlp": 0.01037179, "balance_loss_clip": 1.04200649, "balance_loss_mlp": 1.02420902, "epoch": 0.6358635202164437, "flos": 27125520215040.0, "grad_norm": 1.527365029746322, "language_loss": 0.72139943, "learning_rate": 1.2368375501680204e-06, "loss": 0.74293131, "num_input_tokens_seen": 228000550, "step": 10576, "time_per_iteration": 2.6213035583496094 }, { "auxiliary_loss_clip": 0.01094385, "auxiliary_loss_mlp": 0.0103256, "balance_loss_clip": 1.0408107, "balance_loss_mlp": 1.01913691, "epoch": 0.6359236434691117, "flos": 27525205825920.0, "grad_norm": 1.587362724967965, "language_loss": 0.69232905, "learning_rate": 1.236477571455085e-06, "loss": 0.71359849, "num_input_tokens_seen": 228022005, "step": 10577, "time_per_iteration": 2.6874570846557617 }, { "auxiliary_loss_clip": 0.01076719, "auxiliary_loss_mlp": 0.01031904, "balance_loss_clip": 1.04086065, "balance_loss_mlp": 1.01938713, "epoch": 0.6359837667217797, "flos": 39348290989440.0, "grad_norm": 1.631898217557544, "language_loss": 0.71984881, "learning_rate": 1.2361176216949964e-06, "loss": 0.74093509, "num_input_tokens_seen": 228043770, "step": 10578, "time_per_iteration": 2.956587314605713 }, { "auxiliary_loss_clip": 0.01011581, "auxiliary_loss_mlp": 0.00752167, "balance_loss_clip": 1.01532173, "balance_loss_mlp": 0.99992144, "epoch": 0.6360438899744476, "flos": 56413797206400.0, "grad_norm": 0.7005664562343583, "language_loss": 0.5446803, "learning_rate": 1.2357577009014044e-06, "loss": 0.56231779, "num_input_tokens_seen": 228104985, "step": 10579, "time_per_iteration": 3.3165230751037598 }, { "auxiliary_loss_clip": 0.01090928, "auxiliary_loss_mlp": 0.01034048, "balance_loss_clip": 1.03814209, "balance_loss_mlp": 1.02082229, "epoch": 0.6361040132271156, "flos": 24973106555520.0, "grad_norm": 1.557921238837489, "language_loss": 0.77395153, "learning_rate": 1.2353978090879568e-06, "loss": 0.7952013, "num_input_tokens_seen": 228125620, "step": 10580, "time_per_iteration": 2.712324857711792 }, { "auxiliary_loss_clip": 0.01087081, "auxiliary_loss_mlp": 0.00770805, "balance_loss_clip": 1.04100418, "balance_loss_mlp": 1.00011897, "epoch": 0.6361641364797835, "flos": 23259198130560.0, "grad_norm": 2.013936086375126, "language_loss": 0.66709065, "learning_rate": 1.235037946268301e-06, "loss": 0.68566948, "num_input_tokens_seen": 228143495, "step": 10581, "time_per_iteration": 2.7856929302215576 }, { "auxiliary_loss_clip": 0.01102449, "auxiliary_loss_mlp": 0.01034551, "balance_loss_clip": 1.0404247, "balance_loss_mlp": 1.02227867, "epoch": 0.6362242597324516, "flos": 25994513698560.0, "grad_norm": 1.9398130134586062, "language_loss": 0.68718088, "learning_rate": 1.2346781124560828e-06, "loss": 0.70855093, "num_input_tokens_seen": 228166500, "step": 10582, "time_per_iteration": 2.737300395965576 }, { "auxiliary_loss_clip": 0.01089734, "auxiliary_loss_mlp": 0.01038152, "balance_loss_clip": 1.04106402, "balance_loss_mlp": 1.02545059, "epoch": 0.6362843829851195, "flos": 25703242312320.0, "grad_norm": 2.1615330133159305, "language_loss": 0.84382987, "learning_rate": 1.2343183076649473e-06, "loss": 0.86510873, "num_input_tokens_seen": 228185325, "step": 10583, "time_per_iteration": 2.736928939819336 }, { "auxiliary_loss_clip": 0.01094529, "auxiliary_loss_mlp": 0.01034443, "balance_loss_clip": 1.04331303, "balance_loss_mlp": 1.02157402, "epoch": 0.6363445062377875, "flos": 20522912895360.0, "grad_norm": 1.8294448915060182, "language_loss": 0.75581825, "learning_rate": 1.233958531908538e-06, "loss": 0.77710795, "num_input_tokens_seen": 228204050, "step": 10584, "time_per_iteration": 2.66745662689209 }, { "auxiliary_loss_clip": 0.01092434, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.04142356, "balance_loss_mlp": 1.02158976, "epoch": 0.6364046294904554, "flos": 19463799450240.0, "grad_norm": 1.8372541511316505, "language_loss": 0.72750449, "learning_rate": 1.2335987852004985e-06, "loss": 0.74878752, "num_input_tokens_seen": 228222430, "step": 10585, "time_per_iteration": 2.7207906246185303 }, { "auxiliary_loss_clip": 0.01078843, "auxiliary_loss_mlp": 0.01028745, "balance_loss_clip": 1.03947806, "balance_loss_mlp": 1.01638353, "epoch": 0.6364647527431234, "flos": 20995892208000.0, "grad_norm": 1.8754451190030996, "language_loss": 0.82982284, "learning_rate": 1.2332390675544697e-06, "loss": 0.85089874, "num_input_tokens_seen": 228241925, "step": 10586, "time_per_iteration": 2.883169174194336 }, { "auxiliary_loss_clip": 0.01104026, "auxiliary_loss_mlp": 0.01024669, "balance_loss_clip": 1.04210103, "balance_loss_mlp": 1.01253915, "epoch": 0.6365248759957913, "flos": 25770789838080.0, "grad_norm": 2.4347749012599382, "language_loss": 0.72591609, "learning_rate": 1.2328793789840918e-06, "loss": 0.74720299, "num_input_tokens_seen": 228262535, "step": 10587, "time_per_iteration": 2.696120500564575 }, { "auxiliary_loss_clip": 0.01095392, "auxiliary_loss_mlp": 0.01030465, "balance_loss_clip": 1.04264998, "balance_loss_mlp": 1.01770997, "epoch": 0.6365849992484593, "flos": 22455589104000.0, "grad_norm": 2.0432270596750395, "language_loss": 0.77210999, "learning_rate": 1.2325197195030058e-06, "loss": 0.79336858, "num_input_tokens_seen": 228281340, "step": 10588, "time_per_iteration": 2.7811734676361084 }, { "auxiliary_loss_clip": 0.0106633, "auxiliary_loss_mlp": 0.01028817, "balance_loss_clip": 1.03860903, "balance_loss_mlp": 1.0154599, "epoch": 0.6366451225011273, "flos": 19025689265280.0, "grad_norm": 1.4710865244749312, "language_loss": 0.79949176, "learning_rate": 1.2321600891248478e-06, "loss": 0.82044327, "num_input_tokens_seen": 228300865, "step": 10589, "time_per_iteration": 2.8011467456817627 }, { "auxiliary_loss_clip": 0.01093718, "auxiliary_loss_mlp": 0.01032855, "balance_loss_clip": 1.03902805, "balance_loss_mlp": 1.02014768, "epoch": 0.6367052457537953, "flos": 25228395492480.0, "grad_norm": 2.226066060883624, "language_loss": 0.67151499, "learning_rate": 1.231800487863257e-06, "loss": 0.69278073, "num_input_tokens_seen": 228320815, "step": 10590, "time_per_iteration": 2.709080934524536 }, { "auxiliary_loss_clip": 0.01111263, "auxiliary_loss_mlp": 0.01033159, "balance_loss_clip": 1.04165292, "balance_loss_mlp": 1.01980138, "epoch": 0.6367653690064633, "flos": 19208438686080.0, "grad_norm": 2.18709267526875, "language_loss": 0.78891504, "learning_rate": 1.2314409157318685e-06, "loss": 0.81035924, "num_input_tokens_seen": 228339065, "step": 10591, "time_per_iteration": 2.636992931365967 }, { "auxiliary_loss_clip": 0.01092014, "auxiliary_loss_mlp": 0.01028959, "balance_loss_clip": 1.04065537, "balance_loss_mlp": 1.01711535, "epoch": 0.6368254922591312, "flos": 23546806329600.0, "grad_norm": 1.430576733389061, "language_loss": 0.89153397, "learning_rate": 1.231081372744317e-06, "loss": 0.91274369, "num_input_tokens_seen": 228359210, "step": 10592, "time_per_iteration": 2.7107973098754883 }, { "auxiliary_loss_clip": 0.01099214, "auxiliary_loss_mlp": 0.01027902, "balance_loss_clip": 1.03750551, "balance_loss_mlp": 1.01598144, "epoch": 0.6368856155117992, "flos": 26467313443200.0, "grad_norm": 1.4034572445207882, "language_loss": 0.68212253, "learning_rate": 1.2307218589142376e-06, "loss": 0.7033937, "num_input_tokens_seen": 228379630, "step": 10593, "time_per_iteration": 2.807321786880493 }, { "auxiliary_loss_clip": 0.01061371, "auxiliary_loss_mlp": 0.01042752, "balance_loss_clip": 1.03203607, "balance_loss_mlp": 1.02891731, "epoch": 0.6369457387644671, "flos": 33692432394240.0, "grad_norm": 1.761330533007529, "language_loss": 0.63678664, "learning_rate": 1.2303623742552618e-06, "loss": 0.65782785, "num_input_tokens_seen": 228401410, "step": 10594, "time_per_iteration": 2.856600046157837 }, { "auxiliary_loss_clip": 0.01023648, "auxiliary_loss_mlp": 0.01001204, "balance_loss_clip": 1.01176047, "balance_loss_mlp": 0.99982756, "epoch": 0.6370058620171352, "flos": 70908600908160.0, "grad_norm": 0.7623002997880329, "language_loss": 0.54635006, "learning_rate": 1.230002918781022e-06, "loss": 0.56659859, "num_input_tokens_seen": 228470335, "step": 10595, "time_per_iteration": 3.2980732917785645 }, { "auxiliary_loss_clip": 0.01118729, "auxiliary_loss_mlp": 0.01042081, "balance_loss_clip": 1.04251242, "balance_loss_mlp": 1.02855635, "epoch": 0.6370659852698031, "flos": 21141940907520.0, "grad_norm": 2.0781706076151445, "language_loss": 0.67100823, "learning_rate": 1.2296434925051493e-06, "loss": 0.69261628, "num_input_tokens_seen": 228490765, "step": 10596, "time_per_iteration": 2.6011126041412354 }, { "auxiliary_loss_clip": 0.01099686, "auxiliary_loss_mlp": 0.01037794, "balance_loss_clip": 1.04006338, "balance_loss_mlp": 1.02463365, "epoch": 0.6371261085224711, "flos": 20193288762240.0, "grad_norm": 2.011756808968462, "language_loss": 0.7937991, "learning_rate": 1.2292840954412718e-06, "loss": 0.81517392, "num_input_tokens_seen": 228509700, "step": 10597, "time_per_iteration": 2.6972439289093018 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.04363835, "balance_loss_mlp": 1.02541316, "epoch": 0.637186231775139, "flos": 19683536901120.0, "grad_norm": 1.60919791295429, "language_loss": 0.74850726, "learning_rate": 1.2289247276030189e-06, "loss": 0.76996648, "num_input_tokens_seen": 228529050, "step": 10598, "time_per_iteration": 2.6332266330718994 }, { "auxiliary_loss_clip": 0.01084454, "auxiliary_loss_mlp": 0.00771297, "balance_loss_clip": 1.03999043, "balance_loss_mlp": 1.00013983, "epoch": 0.637246355027807, "flos": 13071196995840.0, "grad_norm": 1.9548116793493355, "language_loss": 0.68556929, "learning_rate": 1.2285653890040176e-06, "loss": 0.70412678, "num_input_tokens_seen": 228544665, "step": 10599, "time_per_iteration": 2.6878466606140137 }, { "auxiliary_loss_clip": 0.01077983, "auxiliary_loss_mlp": 0.01031504, "balance_loss_clip": 1.03724337, "balance_loss_mlp": 1.01745534, "epoch": 0.6373064782804749, "flos": 18222654856320.0, "grad_norm": 2.0583135447897933, "language_loss": 0.80303937, "learning_rate": 1.2282060796578942e-06, "loss": 0.82413423, "num_input_tokens_seen": 228562060, "step": 10600, "time_per_iteration": 2.653907060623169 }, { "auxiliary_loss_clip": 0.01101937, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.03776395, "balance_loss_mlp": 1.02380645, "epoch": 0.637366601533143, "flos": 24498475217280.0, "grad_norm": 1.4639641102491714, "language_loss": 0.79828721, "learning_rate": 1.2278467995782732e-06, "loss": 0.81966954, "num_input_tokens_seen": 228582550, "step": 10601, "time_per_iteration": 2.797588586807251 }, { "auxiliary_loss_clip": 0.01085997, "auxiliary_loss_mlp": 0.01032519, "balance_loss_clip": 1.04335141, "balance_loss_mlp": 1.01989436, "epoch": 0.6374267247858109, "flos": 26359042872960.0, "grad_norm": 2.3452009289064737, "language_loss": 0.6766789, "learning_rate": 1.2274875487787797e-06, "loss": 0.69786406, "num_input_tokens_seen": 228604960, "step": 10602, "time_per_iteration": 2.742664098739624 }, { "auxiliary_loss_clip": 0.01037986, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.03193176, "balance_loss_mlp": 1.02034974, "epoch": 0.6374868480384789, "flos": 20371728551040.0, "grad_norm": 2.210099504390163, "language_loss": 0.79618657, "learning_rate": 1.2271283272730354e-06, "loss": 0.81690341, "num_input_tokens_seen": 228622195, "step": 10603, "time_per_iteration": 2.8134090900421143 }, { "auxiliary_loss_clip": 0.0107315, "auxiliary_loss_mlp": 0.00770892, "balance_loss_clip": 1.03933704, "balance_loss_mlp": 1.00014615, "epoch": 0.6375469712911469, "flos": 20996251344000.0, "grad_norm": 1.8573318102619591, "language_loss": 0.76802522, "learning_rate": 1.2267691350746621e-06, "loss": 0.78646559, "num_input_tokens_seen": 228639735, "step": 10604, "time_per_iteration": 2.7761478424072266 }, { "auxiliary_loss_clip": 0.01095415, "auxiliary_loss_mlp": 0.01031172, "balance_loss_clip": 1.03836191, "balance_loss_mlp": 1.01792753, "epoch": 0.6376070945438148, "flos": 19715748422400.0, "grad_norm": 1.6662789413728705, "language_loss": 0.76640069, "learning_rate": 1.226409972197281e-06, "loss": 0.78766656, "num_input_tokens_seen": 228658195, "step": 10605, "time_per_iteration": 4.650303602218628 }, { "auxiliary_loss_clip": 0.01057897, "auxiliary_loss_mlp": 0.01038795, "balance_loss_clip": 1.03824091, "balance_loss_mlp": 1.02234411, "epoch": 0.6376672177964828, "flos": 21506757390720.0, "grad_norm": 1.7802518386545212, "language_loss": 0.65565449, "learning_rate": 1.2260508386545106e-06, "loss": 0.67662132, "num_input_tokens_seen": 228677415, "step": 10606, "time_per_iteration": 2.8175783157348633 }, { "auxiliary_loss_clip": 0.01090718, "auxiliary_loss_mlp": 0.01037026, "balance_loss_clip": 1.04083657, "balance_loss_mlp": 1.02489638, "epoch": 0.6377273410491507, "flos": 18843873598080.0, "grad_norm": 1.601218417819437, "language_loss": 0.75069982, "learning_rate": 1.225691734459971e-06, "loss": 0.77197731, "num_input_tokens_seen": 228696450, "step": 10607, "time_per_iteration": 2.6365914344787598 }, { "auxiliary_loss_clip": 0.01091801, "auxiliary_loss_mlp": 0.01037938, "balance_loss_clip": 1.04039049, "balance_loss_mlp": 1.02553403, "epoch": 0.6377874643018188, "flos": 53062970181120.0, "grad_norm": 1.5840122270167216, "language_loss": 0.65928984, "learning_rate": 1.225332659627278e-06, "loss": 0.68058717, "num_input_tokens_seen": 228721600, "step": 10608, "time_per_iteration": 4.558081150054932 }, { "auxiliary_loss_clip": 0.00982544, "auxiliary_loss_mlp": 0.01007387, "balance_loss_clip": 1.01596785, "balance_loss_mlp": 1.00617146, "epoch": 0.6378475875544867, "flos": 65135026465920.0, "grad_norm": 0.7133010996130292, "language_loss": 0.51879215, "learning_rate": 1.2249736141700475e-06, "loss": 0.53869152, "num_input_tokens_seen": 228784535, "step": 10609, "time_per_iteration": 3.3632545471191406 }, { "auxiliary_loss_clip": 0.0109935, "auxiliary_loss_mlp": 0.01025243, "balance_loss_clip": 1.03736722, "balance_loss_mlp": 1.01379943, "epoch": 0.6379077108071547, "flos": 23002759958400.0, "grad_norm": 1.6332455111471063, "language_loss": 0.74713194, "learning_rate": 1.2246145981018965e-06, "loss": 0.7683779, "num_input_tokens_seen": 228804110, "step": 10610, "time_per_iteration": 3.2196428775787354 }, { "auxiliary_loss_clip": 0.0101651, "auxiliary_loss_mlp": 0.0100476, "balance_loss_clip": 1.01297092, "balance_loss_mlp": 1.00353765, "epoch": 0.6379678340598226, "flos": 67601947610880.0, "grad_norm": 0.8493432056950548, "language_loss": 0.63061231, "learning_rate": 1.2242556114364364e-06, "loss": 0.65082502, "num_input_tokens_seen": 228867705, "step": 10611, "time_per_iteration": 3.272512435913086 }, { "auxiliary_loss_clip": 0.01103402, "auxiliary_loss_mlp": 0.01034119, "balance_loss_clip": 1.04139113, "balance_loss_mlp": 1.0207442, "epoch": 0.6380279573124906, "flos": 29680061610240.0, "grad_norm": 1.8312259315457267, "language_loss": 0.72302759, "learning_rate": 1.223896654187282e-06, "loss": 0.74440277, "num_input_tokens_seen": 228889215, "step": 10612, "time_per_iteration": 2.7299270629882812 }, { "auxiliary_loss_clip": 0.01015421, "auxiliary_loss_mlp": 0.0100432, "balance_loss_clip": 1.0106107, "balance_loss_mlp": 1.00311053, "epoch": 0.6380880805651585, "flos": 66484046580480.0, "grad_norm": 0.7098749409658618, "language_loss": 0.57844174, "learning_rate": 1.2235377263680446e-06, "loss": 0.59863913, "num_input_tokens_seen": 228948465, "step": 10613, "time_per_iteration": 4.943511009216309 }, { "auxiliary_loss_clip": 0.01071494, "auxiliary_loss_mlp": 0.01035158, "balance_loss_clip": 1.03659904, "balance_loss_mlp": 1.02168155, "epoch": 0.6381482038178266, "flos": 23914998691200.0, "grad_norm": 1.7198956941454036, "language_loss": 0.75381726, "learning_rate": 1.2231788279923334e-06, "loss": 0.77488375, "num_input_tokens_seen": 228967955, "step": 10614, "time_per_iteration": 2.8167922496795654 }, { "auxiliary_loss_clip": 0.01094834, "auxiliary_loss_mlp": 0.00770691, "balance_loss_clip": 1.04056311, "balance_loss_mlp": 1.00018597, "epoch": 0.6382083270704945, "flos": 24243042625920.0, "grad_norm": 1.8795242058434967, "language_loss": 0.79825491, "learning_rate": 1.2228199590737599e-06, "loss": 0.81691015, "num_input_tokens_seen": 228985495, "step": 10615, "time_per_iteration": 2.769399642944336 }, { "auxiliary_loss_clip": 0.01013557, "auxiliary_loss_mlp": 0.01001876, "balance_loss_clip": 1.01154137, "balance_loss_mlp": 1.00048769, "epoch": 0.6382684503231625, "flos": 70775552931840.0, "grad_norm": 0.6556730902042093, "language_loss": 0.55564505, "learning_rate": 1.2224611196259305e-06, "loss": 0.57579941, "num_input_tokens_seen": 229052995, "step": 10616, "time_per_iteration": 3.277085542678833 }, { "auxiliary_loss_clip": 0.01086789, "auxiliary_loss_mlp": 0.01036908, "balance_loss_clip": 1.0364368, "balance_loss_mlp": 1.0233475, "epoch": 0.6383285735758305, "flos": 16544836621440.0, "grad_norm": 1.9142103073146424, "language_loss": 0.83900499, "learning_rate": 1.2221023096624538e-06, "loss": 0.86024189, "num_input_tokens_seen": 229071030, "step": 10617, "time_per_iteration": 2.712834119796753 }, { "auxiliary_loss_clip": 0.0110772, "auxiliary_loss_mlp": 0.0104261, "balance_loss_clip": 1.04189885, "balance_loss_mlp": 1.02821589, "epoch": 0.6383886968284984, "flos": 14427651225600.0, "grad_norm": 1.8904429928249138, "language_loss": 0.87499708, "learning_rate": 1.221743529196936e-06, "loss": 0.89650035, "num_input_tokens_seen": 229088275, "step": 10618, "time_per_iteration": 2.6345932483673096 }, { "auxiliary_loss_clip": 0.01068321, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.04150379, "balance_loss_mlp": 1.02012992, "epoch": 0.6384488200811664, "flos": 17929659617280.0, "grad_norm": 1.7304686428232843, "language_loss": 0.73287666, "learning_rate": 1.2213847782429806e-06, "loss": 0.75388002, "num_input_tokens_seen": 229105190, "step": 10619, "time_per_iteration": 2.777869701385498 }, { "auxiliary_loss_clip": 0.0109667, "auxiliary_loss_mlp": 0.01037459, "balance_loss_clip": 1.04080129, "balance_loss_mlp": 1.02271247, "epoch": 0.6385089433338343, "flos": 18515578268160.0, "grad_norm": 1.9267832317981652, "language_loss": 0.76312691, "learning_rate": 1.221026056814193e-06, "loss": 0.78446817, "num_input_tokens_seen": 229122290, "step": 10620, "time_per_iteration": 2.701122760772705 }, { "auxiliary_loss_clip": 0.01093794, "auxiliary_loss_mlp": 0.01029286, "balance_loss_clip": 1.04239035, "balance_loss_mlp": 1.01672101, "epoch": 0.6385690665865024, "flos": 24753620499840.0, "grad_norm": 2.5441546745937114, "language_loss": 0.70669818, "learning_rate": 1.2206673649241752e-06, "loss": 0.727929, "num_input_tokens_seen": 229141620, "step": 10621, "time_per_iteration": 2.7129428386688232 }, { "auxiliary_loss_clip": 0.01085349, "auxiliary_loss_mlp": 0.0102653, "balance_loss_clip": 1.03596258, "balance_loss_mlp": 1.01482916, "epoch": 0.6386291898391703, "flos": 20120569678080.0, "grad_norm": 1.616578696475536, "language_loss": 0.77862823, "learning_rate": 1.220308702586529e-06, "loss": 0.79974699, "num_input_tokens_seen": 229161570, "step": 10622, "time_per_iteration": 2.722543954849243 }, { "auxiliary_loss_clip": 0.01075591, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.03845859, "balance_loss_mlp": 1.01837754, "epoch": 0.6386893130918383, "flos": 16867278034560.0, "grad_norm": 1.771071416148221, "language_loss": 0.74746549, "learning_rate": 1.2199500698148546e-06, "loss": 0.76853049, "num_input_tokens_seen": 229178465, "step": 10623, "time_per_iteration": 2.728158712387085 }, { "auxiliary_loss_clip": 0.0109049, "auxiliary_loss_mlp": 0.01029194, "balance_loss_clip": 1.03953004, "balance_loss_mlp": 1.01796472, "epoch": 0.6387494363445062, "flos": 22966274718720.0, "grad_norm": 1.3721054330124807, "language_loss": 0.76588684, "learning_rate": 1.2195914666227527e-06, "loss": 0.78708369, "num_input_tokens_seen": 229198975, "step": 10624, "time_per_iteration": 2.833406925201416 }, { "auxiliary_loss_clip": 0.0105041, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.03588271, "balance_loss_mlp": 1.02247274, "epoch": 0.6388095595971742, "flos": 22857716839680.0, "grad_norm": 1.873995828783276, "language_loss": 0.80408549, "learning_rate": 1.21923289302382e-06, "loss": 0.82495034, "num_input_tokens_seen": 229218825, "step": 10625, "time_per_iteration": 2.810683488845825 }, { "auxiliary_loss_clip": 0.01094331, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.04317892, "balance_loss_mlp": 1.02039063, "epoch": 0.6388696828498421, "flos": 17311529445120.0, "grad_norm": 1.9242726484746675, "language_loss": 0.72490007, "learning_rate": 1.218874349031654e-06, "loss": 0.74617672, "num_input_tokens_seen": 229236060, "step": 10626, "time_per_iteration": 2.667686939239502 }, { "auxiliary_loss_clip": 0.01093032, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.03836656, "balance_loss_mlp": 1.02036738, "epoch": 0.6389298061025102, "flos": 17128636369920.0, "grad_norm": 1.8547762721762564, "language_loss": 0.72446245, "learning_rate": 1.2185158346598517e-06, "loss": 0.74572611, "num_input_tokens_seen": 229255160, "step": 10627, "time_per_iteration": 2.681147575378418 }, { "auxiliary_loss_clip": 0.01095264, "auxiliary_loss_mlp": 0.01034256, "balance_loss_clip": 1.04398704, "balance_loss_mlp": 1.01995111, "epoch": 0.6389899293551781, "flos": 27710971989120.0, "grad_norm": 1.6812239823438198, "language_loss": 0.67369878, "learning_rate": 1.2181573499220064e-06, "loss": 0.69499397, "num_input_tokens_seen": 229278705, "step": 10628, "time_per_iteration": 2.7938716411590576 }, { "auxiliary_loss_clip": 0.0111173, "auxiliary_loss_mlp": 0.01029902, "balance_loss_clip": 1.04083705, "balance_loss_mlp": 1.01804066, "epoch": 0.6390500526078461, "flos": 21215701486080.0, "grad_norm": 1.7139884939852632, "language_loss": 0.68161869, "learning_rate": 1.2177988948317135e-06, "loss": 0.703035, "num_input_tokens_seen": 229299990, "step": 10629, "time_per_iteration": 2.644061803817749 }, { "auxiliary_loss_clip": 0.01079014, "auxiliary_loss_mlp": 0.01040793, "balance_loss_clip": 1.03948665, "balance_loss_mlp": 1.02554584, "epoch": 0.6391101758605141, "flos": 21581056673280.0, "grad_norm": 1.5487398291576047, "language_loss": 0.75722307, "learning_rate": 1.2174404694025646e-06, "loss": 0.77842116, "num_input_tokens_seen": 229319230, "step": 10630, "time_per_iteration": 2.7381680011749268 }, { "auxiliary_loss_clip": 0.01089485, "auxiliary_loss_mlp": 0.01035881, "balance_loss_clip": 1.03773403, "balance_loss_mlp": 1.02401352, "epoch": 0.639170299113182, "flos": 19900473091200.0, "grad_norm": 1.4699321095065776, "language_loss": 0.7028895, "learning_rate": 1.2170820736481511e-06, "loss": 0.72414321, "num_input_tokens_seen": 229338600, "step": 10631, "time_per_iteration": 2.76301908493042 }, { "auxiliary_loss_clip": 0.01010735, "auxiliary_loss_mlp": 0.01020885, "balance_loss_clip": 1.00987029, "balance_loss_mlp": 1.01946056, "epoch": 0.63923042236585, "flos": 69877604833920.0, "grad_norm": 1.2867788563374962, "language_loss": 0.62960958, "learning_rate": 1.2167237075820646e-06, "loss": 0.64992577, "num_input_tokens_seen": 229402420, "step": 10632, "time_per_iteration": 3.23628306388855 }, { "auxiliary_loss_clip": 0.01092617, "auxiliary_loss_mlp": 0.01034269, "balance_loss_clip": 1.04134142, "balance_loss_mlp": 1.02143598, "epoch": 0.639290545618518, "flos": 22674823764480.0, "grad_norm": 11.316815321652387, "language_loss": 0.66998363, "learning_rate": 1.216365371217893e-06, "loss": 0.69125253, "num_input_tokens_seen": 229419185, "step": 10633, "time_per_iteration": 2.719403028488159 }, { "auxiliary_loss_clip": 0.01051248, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.04067874, "balance_loss_mlp": 1.01645792, "epoch": 0.639350668871186, "flos": 19829190551040.0, "grad_norm": 2.281228369443932, "language_loss": 0.81935, "learning_rate": 1.216007064569225e-06, "loss": 0.84014845, "num_input_tokens_seen": 229436735, "step": 10634, "time_per_iteration": 2.8779945373535156 }, { "auxiliary_loss_clip": 0.01089506, "auxiliary_loss_mlp": 0.01036012, "balance_loss_clip": 1.0404712, "balance_loss_mlp": 1.02211165, "epoch": 0.6394107921238539, "flos": 20553328736640.0, "grad_norm": 1.5224758560315717, "language_loss": 0.74918383, "learning_rate": 1.2156487876496483e-06, "loss": 0.77043903, "num_input_tokens_seen": 229455595, "step": 10635, "time_per_iteration": 2.7275381088256836 }, { "auxiliary_loss_clip": 0.0110297, "auxiliary_loss_mlp": 0.01033837, "balance_loss_clip": 1.04365182, "balance_loss_mlp": 1.02071238, "epoch": 0.6394709153765219, "flos": 25774991729280.0, "grad_norm": 1.6416528841405902, "language_loss": 0.71164483, "learning_rate": 1.2152905404727475e-06, "loss": 0.73301286, "num_input_tokens_seen": 229476230, "step": 10636, "time_per_iteration": 2.6989855766296387 }, { "auxiliary_loss_clip": 0.0109626, "auxiliary_loss_mlp": 0.01037788, "balance_loss_clip": 1.04154992, "balance_loss_mlp": 1.02471662, "epoch": 0.6395310386291898, "flos": 17530153574400.0, "grad_norm": 1.863216274856941, "language_loss": 0.73810291, "learning_rate": 1.2149323230521085e-06, "loss": 0.7594434, "num_input_tokens_seen": 229494300, "step": 10637, "time_per_iteration": 2.7064554691314697 }, { "auxiliary_loss_clip": 0.01102986, "auxiliary_loss_mlp": 0.01035332, "balance_loss_clip": 1.04232454, "balance_loss_mlp": 1.0214324, "epoch": 0.6395911618818578, "flos": 18588225525120.0, "grad_norm": 1.8583759044592125, "language_loss": 0.77674294, "learning_rate": 1.2145741354013143e-06, "loss": 0.7981261, "num_input_tokens_seen": 229512985, "step": 10638, "time_per_iteration": 2.742272138595581 }, { "auxiliary_loss_clip": 0.01092544, "auxiliary_loss_mlp": 0.01035401, "balance_loss_clip": 1.039186, "balance_loss_mlp": 1.02218056, "epoch": 0.6396512851345257, "flos": 28366557068160.0, "grad_norm": 1.7706841809309422, "language_loss": 0.81434906, "learning_rate": 1.2142159775339478e-06, "loss": 0.83562851, "num_input_tokens_seen": 229534270, "step": 10639, "time_per_iteration": 2.7076473236083984 }, { "auxiliary_loss_clip": 0.0101793, "auxiliary_loss_mlp": 0.0099976, "balance_loss_clip": 1.01366258, "balance_loss_mlp": 0.9985556, "epoch": 0.6397114083871938, "flos": 70724307202560.0, "grad_norm": 0.8066832194631076, "language_loss": 0.58980644, "learning_rate": 1.21385784946359e-06, "loss": 0.60998333, "num_input_tokens_seen": 229596455, "step": 10640, "time_per_iteration": 3.175328254699707 }, { "auxiliary_loss_clip": 0.01081778, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.03485847, "balance_loss_mlp": 1.01876175, "epoch": 0.6397715316398617, "flos": 18142537570560.0, "grad_norm": 1.8250663746988522, "language_loss": 0.78291178, "learning_rate": 1.2134997512038215e-06, "loss": 0.80403835, "num_input_tokens_seen": 229612860, "step": 10641, "time_per_iteration": 2.6736910343170166 }, { "auxiliary_loss_clip": 0.01069736, "auxiliary_loss_mlp": 0.01041571, "balance_loss_clip": 1.03781104, "balance_loss_mlp": 1.02828479, "epoch": 0.6398316548925297, "flos": 25739512070400.0, "grad_norm": 1.5814049726496198, "language_loss": 0.63194126, "learning_rate": 1.2131416827682209e-06, "loss": 0.65305436, "num_input_tokens_seen": 229633960, "step": 10642, "time_per_iteration": 2.840916156768799 }, { "auxiliary_loss_clip": 0.01004085, "auxiliary_loss_mlp": 0.01008093, "balance_loss_clip": 1.00885439, "balance_loss_mlp": 1.00666296, "epoch": 0.6398917781451977, "flos": 71214234756480.0, "grad_norm": 0.9138015475084418, "language_loss": 0.55936515, "learning_rate": 1.2127836441703667e-06, "loss": 0.57948697, "num_input_tokens_seen": 229686730, "step": 10643, "time_per_iteration": 3.134157419204712 }, { "auxiliary_loss_clip": 0.01082549, "auxiliary_loss_mlp": 0.01028613, "balance_loss_clip": 1.03844333, "balance_loss_mlp": 1.01577973, "epoch": 0.6399519013978656, "flos": 20521835487360.0, "grad_norm": 2.4755783411685055, "language_loss": 0.76844835, "learning_rate": 1.2124256354238358e-06, "loss": 0.78955996, "num_input_tokens_seen": 229704800, "step": 10644, "time_per_iteration": 2.750016212463379 }, { "auxiliary_loss_clip": 0.01083772, "auxiliary_loss_mlp": 0.0103714, "balance_loss_clip": 1.04259241, "balance_loss_mlp": 1.02343059, "epoch": 0.6400120246505336, "flos": 24460840742400.0, "grad_norm": 1.476966637211995, "language_loss": 0.82139534, "learning_rate": 1.212067656542203e-06, "loss": 0.84260446, "num_input_tokens_seen": 229725265, "step": 10645, "time_per_iteration": 4.434756755828857 }, { "auxiliary_loss_clip": 0.01108206, "auxiliary_loss_mlp": 0.01043381, "balance_loss_clip": 1.0400579, "balance_loss_mlp": 1.02844369, "epoch": 0.6400721479032015, "flos": 28366090191360.0, "grad_norm": 1.9873684481859661, "language_loss": 0.73491621, "learning_rate": 1.2117097075390447e-06, "loss": 0.75643206, "num_input_tokens_seen": 229744840, "step": 10646, "time_per_iteration": 2.790422201156616 }, { "auxiliary_loss_clip": 0.01076409, "auxiliary_loss_mlp": 0.01036032, "balance_loss_clip": 1.037462, "balance_loss_mlp": 1.02220368, "epoch": 0.6401322711558696, "flos": 17816540711040.0, "grad_norm": 2.1141413827607227, "language_loss": 0.79825467, "learning_rate": 1.2113517884279327e-06, "loss": 0.81937909, "num_input_tokens_seen": 229759095, "step": 10647, "time_per_iteration": 6.299994707107544 }, { "auxiliary_loss_clip": 0.0106918, "auxiliary_loss_mlp": 0.01033575, "balance_loss_clip": 1.03744197, "balance_loss_mlp": 1.02105761, "epoch": 0.6401923944085375, "flos": 26030855283840.0, "grad_norm": 1.5992559976065106, "language_loss": 0.75935119, "learning_rate": 1.2109938992224399e-06, "loss": 0.7803787, "num_input_tokens_seen": 229777750, "step": 10648, "time_per_iteration": 2.823535680770874 }, { "auxiliary_loss_clip": 0.01088631, "auxiliary_loss_mlp": 0.01035586, "balance_loss_clip": 1.03901458, "balance_loss_mlp": 1.02278256, "epoch": 0.6402525176612055, "flos": 23586451966080.0, "grad_norm": 3.2506814778416566, "language_loss": 0.78615916, "learning_rate": 1.210636039936138e-06, "loss": 0.80740136, "num_input_tokens_seen": 229796785, "step": 10649, "time_per_iteration": 2.7334954738616943 }, { "auxiliary_loss_clip": 0.01058756, "auxiliary_loss_mlp": 0.01037312, "balance_loss_clip": 1.03965068, "balance_loss_mlp": 1.02403259, "epoch": 0.6403126409138734, "flos": 18041413806720.0, "grad_norm": 4.7583637580681515, "language_loss": 0.75450838, "learning_rate": 1.2102782105825956e-06, "loss": 0.77546906, "num_input_tokens_seen": 229815425, "step": 10650, "time_per_iteration": 2.834925651550293 }, { "auxiliary_loss_clip": 0.01114658, "auxiliary_loss_mlp": 0.01038182, "balance_loss_clip": 1.04058218, "balance_loss_mlp": 1.02501488, "epoch": 0.6403727641665414, "flos": 21979485308160.0, "grad_norm": 1.5877577982319235, "language_loss": 0.7111091, "learning_rate": 1.2099204111753833e-06, "loss": 0.73263752, "num_input_tokens_seen": 229834545, "step": 10651, "time_per_iteration": 2.599517345428467 }, { "auxiliary_loss_clip": 0.01082313, "auxiliary_loss_mlp": 0.01041331, "balance_loss_clip": 1.03811073, "balance_loss_mlp": 1.02803898, "epoch": 0.6404328874192093, "flos": 24895539135360.0, "grad_norm": 2.6398543727492494, "language_loss": 0.63837707, "learning_rate": 1.2095626417280684e-06, "loss": 0.65961355, "num_input_tokens_seen": 229849175, "step": 10652, "time_per_iteration": 4.367003679275513 }, { "auxiliary_loss_clip": 0.0109017, "auxiliary_loss_mlp": 0.01029675, "balance_loss_clip": 1.03734291, "balance_loss_mlp": 1.01728261, "epoch": 0.6404930106718774, "flos": 17597198309760.0, "grad_norm": 2.0413197407443247, "language_loss": 0.79417443, "learning_rate": 1.2092049022542168e-06, "loss": 0.81537288, "num_input_tokens_seen": 229865400, "step": 10653, "time_per_iteration": 2.672642707824707 }, { "auxiliary_loss_clip": 0.01089835, "auxiliary_loss_mlp": 0.01057293, "balance_loss_clip": 1.03523707, "balance_loss_mlp": 1.04088974, "epoch": 0.6405531339245453, "flos": 20157880930560.0, "grad_norm": 2.1735639110567884, "language_loss": 0.70573318, "learning_rate": 1.2088471927673952e-06, "loss": 0.72720444, "num_input_tokens_seen": 229882945, "step": 10654, "time_per_iteration": 2.6905150413513184 }, { "auxiliary_loss_clip": 0.01109265, "auxiliary_loss_mlp": 0.01041023, "balance_loss_clip": 1.04214334, "balance_loss_mlp": 1.02721834, "epoch": 0.6406132571772133, "flos": 21942281796480.0, "grad_norm": 1.704852134606112, "language_loss": 0.73023099, "learning_rate": 1.2084895132811666e-06, "loss": 0.75173384, "num_input_tokens_seen": 229901590, "step": 10655, "time_per_iteration": 2.6235902309417725 }, { "auxiliary_loss_clip": 0.01082305, "auxiliary_loss_mlp": 0.01040345, "balance_loss_clip": 1.04245615, "balance_loss_mlp": 1.0268271, "epoch": 0.6406733804298813, "flos": 28768002445440.0, "grad_norm": 1.5348114269310231, "language_loss": 0.82592511, "learning_rate": 1.2081318638090952e-06, "loss": 0.84715158, "num_input_tokens_seen": 229922535, "step": 10656, "time_per_iteration": 2.786027193069458 }, { "auxiliary_loss_clip": 0.01057312, "auxiliary_loss_mlp": 0.01037289, "balance_loss_clip": 1.034778, "balance_loss_mlp": 1.02465284, "epoch": 0.6407335036825492, "flos": 17457183095040.0, "grad_norm": 2.2686127713919566, "language_loss": 0.72339928, "learning_rate": 1.2077742443647433e-06, "loss": 0.74434525, "num_input_tokens_seen": 229939575, "step": 10657, "time_per_iteration": 2.7300093173980713 }, { "auxiliary_loss_clip": 0.01080913, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.03770339, "balance_loss_mlp": 1.0274924, "epoch": 0.6407936269352172, "flos": 22125282612480.0, "grad_norm": 2.024621973540982, "language_loss": 0.77556098, "learning_rate": 1.2074166549616707e-06, "loss": 0.7967664, "num_input_tokens_seen": 229958840, "step": 10658, "time_per_iteration": 2.7543232440948486 }, { "auxiliary_loss_clip": 0.01119551, "auxiliary_loss_mlp": 0.01041614, "balance_loss_clip": 1.04269636, "balance_loss_mlp": 1.02797651, "epoch": 0.6408537501878852, "flos": 23110635479040.0, "grad_norm": 2.31675003494523, "language_loss": 0.76086068, "learning_rate": 1.2070590956134386e-06, "loss": 0.78247231, "num_input_tokens_seen": 229979680, "step": 10659, "time_per_iteration": 2.64536190032959 }, { "auxiliary_loss_clip": 0.01105159, "auxiliary_loss_mlp": 0.01032937, "balance_loss_clip": 1.04132307, "balance_loss_mlp": 1.01971078, "epoch": 0.6409138734405532, "flos": 16472440759680.0, "grad_norm": 1.82994064834737, "language_loss": 0.78033829, "learning_rate": 1.2067015663336046e-06, "loss": 0.80171925, "num_input_tokens_seen": 229996830, "step": 10660, "time_per_iteration": 2.6234161853790283 }, { "auxiliary_loss_clip": 0.01092799, "auxiliary_loss_mlp": 0.01035048, "balance_loss_clip": 1.03941202, "balance_loss_mlp": 1.02086258, "epoch": 0.6409739966932211, "flos": 22777922776320.0, "grad_norm": 1.735823034314566, "language_loss": 0.68326354, "learning_rate": 1.206344067135727e-06, "loss": 0.70454198, "num_input_tokens_seen": 230015115, "step": 10661, "time_per_iteration": 2.7175955772399902 }, { "auxiliary_loss_clip": 0.01114459, "auxiliary_loss_mlp": 0.01038734, "balance_loss_clip": 1.04276872, "balance_loss_mlp": 1.02682471, "epoch": 0.6410341199458891, "flos": 25152049134720.0, "grad_norm": 1.9252684871674384, "language_loss": 0.75755298, "learning_rate": 1.205986598033362e-06, "loss": 0.77908492, "num_input_tokens_seen": 230035515, "step": 10662, "time_per_iteration": 2.633653402328491 }, { "auxiliary_loss_clip": 0.01098112, "auxiliary_loss_mlp": 0.01035568, "balance_loss_clip": 1.03684235, "balance_loss_mlp": 1.02221704, "epoch": 0.641094243198557, "flos": 27046193028480.0, "grad_norm": 2.784052529669845, "language_loss": 0.70107532, "learning_rate": 1.2056291590400644e-06, "loss": 0.72241217, "num_input_tokens_seen": 230054355, "step": 10663, "time_per_iteration": 2.7310519218444824 }, { "auxiliary_loss_clip": 0.01083056, "auxiliary_loss_mlp": 0.0104552, "balance_loss_clip": 1.04077351, "balance_loss_mlp": 1.03102446, "epoch": 0.641154366451225, "flos": 25374551932800.0, "grad_norm": 1.9822481402863719, "language_loss": 0.67971885, "learning_rate": 1.205271750169389e-06, "loss": 0.70100462, "num_input_tokens_seen": 230074605, "step": 10664, "time_per_iteration": 2.773348093032837 }, { "auxiliary_loss_clip": 0.01087025, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.03581822, "balance_loss_mlp": 1.02188087, "epoch": 0.6412144897038929, "flos": 25153342024320.0, "grad_norm": 1.8870991168532496, "language_loss": 0.66328347, "learning_rate": 1.2049143714348881e-06, "loss": 0.68449211, "num_input_tokens_seen": 230093820, "step": 10665, "time_per_iteration": 2.6490859985351562 }, { "auxiliary_loss_clip": 0.01103479, "auxiliary_loss_mlp": 0.01027966, "balance_loss_clip": 1.04036629, "balance_loss_mlp": 1.01522827, "epoch": 0.641274612956561, "flos": 23440762402560.0, "grad_norm": 1.6713056871656586, "language_loss": 0.6435259, "learning_rate": 1.2045570228501145e-06, "loss": 0.66484034, "num_input_tokens_seen": 230114285, "step": 10666, "time_per_iteration": 2.667050361633301 }, { "auxiliary_loss_clip": 0.01105312, "auxiliary_loss_mlp": 0.01033422, "balance_loss_clip": 1.04096031, "balance_loss_mlp": 1.02103066, "epoch": 0.6413347362092289, "flos": 19427493778560.0, "grad_norm": 1.5002235875983176, "language_loss": 0.70960593, "learning_rate": 1.2041997044286176e-06, "loss": 0.73099327, "num_input_tokens_seen": 230132760, "step": 10667, "time_per_iteration": 2.701289176940918 }, { "auxiliary_loss_clip": 0.01066227, "auxiliary_loss_mlp": 0.00773491, "balance_loss_clip": 1.0367496, "balance_loss_mlp": 1.00030184, "epoch": 0.6413948594618969, "flos": 17196578945280.0, "grad_norm": 2.416405769977824, "language_loss": 0.77665913, "learning_rate": 1.2038424161839484e-06, "loss": 0.79505634, "num_input_tokens_seen": 230149690, "step": 10668, "time_per_iteration": 2.746056079864502 }, { "auxiliary_loss_clip": 0.01108161, "auxiliary_loss_mlp": 0.01036614, "balance_loss_clip": 1.04348612, "balance_loss_mlp": 1.02366185, "epoch": 0.6414549827145648, "flos": 22269787027200.0, "grad_norm": 1.4845911693701175, "language_loss": 0.67707181, "learning_rate": 1.2034851581296544e-06, "loss": 0.69851947, "num_input_tokens_seen": 230166950, "step": 10669, "time_per_iteration": 2.7345635890960693 }, { "auxiliary_loss_clip": 0.0111572, "auxiliary_loss_mlp": 0.01038211, "balance_loss_clip": 1.04545701, "balance_loss_mlp": 1.02449608, "epoch": 0.6415151059672328, "flos": 19640192163840.0, "grad_norm": 2.894165174832574, "language_loss": 0.78665972, "learning_rate": 1.2031279302792825e-06, "loss": 0.80819899, "num_input_tokens_seen": 230184785, "step": 10670, "time_per_iteration": 2.6661479473114014 }, { "auxiliary_loss_clip": 0.01081535, "auxiliary_loss_mlp": 0.01035319, "balance_loss_clip": 1.03874564, "balance_loss_mlp": 1.02164531, "epoch": 0.6415752292199008, "flos": 14865833237760.0, "grad_norm": 2.1933536907134554, "language_loss": 0.88588488, "learning_rate": 1.20277073264638e-06, "loss": 0.90705341, "num_input_tokens_seen": 230201385, "step": 10671, "time_per_iteration": 2.641057252883911 }, { "auxiliary_loss_clip": 0.01104202, "auxiliary_loss_mlp": 0.01028531, "balance_loss_clip": 1.04201674, "balance_loss_mlp": 1.01649058, "epoch": 0.6416353524725688, "flos": 13735580906880.0, "grad_norm": 1.6223655469146963, "language_loss": 0.68986869, "learning_rate": 1.2024135652444907e-06, "loss": 0.71119601, "num_input_tokens_seen": 230220380, "step": 10672, "time_per_iteration": 2.6609199047088623 }, { "auxiliary_loss_clip": 0.01111137, "auxiliary_loss_mlp": 0.01033932, "balance_loss_clip": 1.04236984, "balance_loss_mlp": 1.01922166, "epoch": 0.6416954757252368, "flos": 24534924543360.0, "grad_norm": 2.291371400531435, "language_loss": 0.73951614, "learning_rate": 1.2020564280871593e-06, "loss": 0.76096678, "num_input_tokens_seen": 230239845, "step": 10673, "time_per_iteration": 2.7125818729400635 }, { "auxiliary_loss_clip": 0.01076968, "auxiliary_loss_mlp": 0.01038267, "balance_loss_clip": 1.03657365, "balance_loss_mlp": 1.02410507, "epoch": 0.6417555989779047, "flos": 27710002321920.0, "grad_norm": 25.869198527491033, "language_loss": 0.69720078, "learning_rate": 1.2016993211879283e-06, "loss": 0.71835309, "num_input_tokens_seen": 230262420, "step": 10674, "time_per_iteration": 2.8267860412597656 }, { "auxiliary_loss_clip": 0.01119164, "auxiliary_loss_mlp": 0.01029301, "balance_loss_clip": 1.04007125, "balance_loss_mlp": 1.01571679, "epoch": 0.6418157222305727, "flos": 20556632787840.0, "grad_norm": 1.784339148090001, "language_loss": 0.66459048, "learning_rate": 1.201342244560338e-06, "loss": 0.68607509, "num_input_tokens_seen": 230279950, "step": 10675, "time_per_iteration": 2.6572489738464355 }, { "auxiliary_loss_clip": 0.01117705, "auxiliary_loss_mlp": 0.01037266, "balance_loss_clip": 1.04312348, "balance_loss_mlp": 1.02500582, "epoch": 0.6418758454832406, "flos": 22601530062720.0, "grad_norm": 1.859703676283548, "language_loss": 0.66479051, "learning_rate": 1.2009851982179307e-06, "loss": 0.68634021, "num_input_tokens_seen": 230299705, "step": 10676, "time_per_iteration": 2.6424221992492676 }, { "auxiliary_loss_clip": 0.01119453, "auxiliary_loss_mlp": 0.01034897, "balance_loss_clip": 1.04334652, "balance_loss_mlp": 1.02030003, "epoch": 0.6419359687359086, "flos": 27375098889600.0, "grad_norm": 1.821732847085161, "language_loss": 0.75731808, "learning_rate": 1.2006281821742446e-06, "loss": 0.77886158, "num_input_tokens_seen": 230320030, "step": 10677, "time_per_iteration": 2.651279926300049 }, { "auxiliary_loss_clip": 0.01017238, "auxiliary_loss_mlp": 0.0100428, "balance_loss_clip": 1.01344991, "balance_loss_mlp": 1.00320745, "epoch": 0.6419960919885765, "flos": 67251924552960.0, "grad_norm": 0.7863000332751263, "language_loss": 0.60634637, "learning_rate": 1.200271196442818e-06, "loss": 0.62656152, "num_input_tokens_seen": 230381495, "step": 10678, "time_per_iteration": 3.29689359664917 }, { "auxiliary_loss_clip": 0.01100247, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.03918314, "balance_loss_mlp": 1.02296972, "epoch": 0.6420562152412446, "flos": 19901873721600.0, "grad_norm": 1.6874144871208372, "language_loss": 0.6772809, "learning_rate": 1.1999142410371875e-06, "loss": 0.69863856, "num_input_tokens_seen": 230401385, "step": 10679, "time_per_iteration": 2.656188488006592 }, { "auxiliary_loss_clip": 0.01103127, "auxiliary_loss_mlp": 0.01041549, "balance_loss_clip": 1.04055119, "balance_loss_mlp": 1.02634931, "epoch": 0.6421163384939125, "flos": 24790177566720.0, "grad_norm": 2.4808394593739123, "language_loss": 0.73067611, "learning_rate": 1.1995573159708897e-06, "loss": 0.75212288, "num_input_tokens_seen": 230421340, "step": 10680, "time_per_iteration": 2.6635870933532715 }, { "auxiliary_loss_clip": 0.01079924, "auxiliary_loss_mlp": 0.01028158, "balance_loss_clip": 1.03821039, "balance_loss_mlp": 1.01660097, "epoch": 0.6421764617465805, "flos": 25592816926080.0, "grad_norm": 1.6629690093206273, "language_loss": 0.67730248, "learning_rate": 1.1992004212574582e-06, "loss": 0.69838333, "num_input_tokens_seen": 230441270, "step": 10681, "time_per_iteration": 2.7426977157592773 }, { "auxiliary_loss_clip": 0.0111386, "auxiliary_loss_mlp": 0.01031892, "balance_loss_clip": 1.04021406, "balance_loss_mlp": 1.01944685, "epoch": 0.6422365849992484, "flos": 14134727813760.0, "grad_norm": 1.7354882322045777, "language_loss": 0.74501145, "learning_rate": 1.198843556910427e-06, "loss": 0.76646894, "num_input_tokens_seen": 230457455, "step": 10682, "time_per_iteration": 2.5474164485931396 }, { "auxiliary_loss_clip": 0.01051042, "auxiliary_loss_mlp": 0.01032531, "balance_loss_clip": 1.03735995, "balance_loss_mlp": 1.02086592, "epoch": 0.6422967082519164, "flos": 22383911514240.0, "grad_norm": 1.4579009699070558, "language_loss": 0.79108202, "learning_rate": 1.1984867229433287e-06, "loss": 0.81191772, "num_input_tokens_seen": 230478955, "step": 10683, "time_per_iteration": 2.913137435913086 }, { "auxiliary_loss_clip": 0.01118799, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.04291272, "balance_loss_mlp": 1.0225358, "epoch": 0.6423568315045844, "flos": 14647927380480.0, "grad_norm": 1.7236127231650058, "language_loss": 0.67390025, "learning_rate": 1.1981299193696941e-06, "loss": 0.69544768, "num_input_tokens_seen": 230496425, "step": 10684, "time_per_iteration": 4.21756386756897 }, { "auxiliary_loss_clip": 0.0110472, "auxiliary_loss_mlp": 0.0103372, "balance_loss_clip": 1.04010284, "balance_loss_mlp": 1.02044034, "epoch": 0.6424169547572524, "flos": 26833925606400.0, "grad_norm": 2.002909718847722, "language_loss": 0.7144649, "learning_rate": 1.1977731462030533e-06, "loss": 0.73584938, "num_input_tokens_seen": 230516245, "step": 10685, "time_per_iteration": 2.715785026550293 }, { "auxiliary_loss_clip": 0.0107774, "auxiliary_loss_mlp": 0.01037662, "balance_loss_clip": 1.03614187, "balance_loss_mlp": 1.02484107, "epoch": 0.6424770780099204, "flos": 22707430335360.0, "grad_norm": 1.5191327003401023, "language_loss": 0.75144935, "learning_rate": 1.197416403456935e-06, "loss": 0.77260327, "num_input_tokens_seen": 230534745, "step": 10686, "time_per_iteration": 4.366745948791504 }, { "auxiliary_loss_clip": 0.01082252, "auxiliary_loss_mlp": 0.01034259, "balance_loss_clip": 1.04008722, "balance_loss_mlp": 1.01991844, "epoch": 0.6425372012625883, "flos": 28469512425600.0, "grad_norm": 6.424850822093427, "language_loss": 0.68726957, "learning_rate": 1.197059691144867e-06, "loss": 0.7084347, "num_input_tokens_seen": 230555895, "step": 10687, "time_per_iteration": 4.32355523109436 }, { "auxiliary_loss_clip": 0.01092278, "auxiliary_loss_mlp": 0.0103296, "balance_loss_clip": 1.03951168, "balance_loss_mlp": 1.02028227, "epoch": 0.6425973245152563, "flos": 29351694453120.0, "grad_norm": 1.9785933660475024, "language_loss": 0.66424388, "learning_rate": 1.1967030092803767e-06, "loss": 0.68549621, "num_input_tokens_seen": 230577460, "step": 10688, "time_per_iteration": 2.8096606731414795 }, { "auxiliary_loss_clip": 0.01114997, "auxiliary_loss_mlp": 0.01034078, "balance_loss_clip": 1.04043853, "balance_loss_mlp": 1.02081013, "epoch": 0.6426574477679242, "flos": 16430388912000.0, "grad_norm": 1.653295180436115, "language_loss": 0.73148823, "learning_rate": 1.1963463578769876e-06, "loss": 0.75297892, "num_input_tokens_seen": 230595030, "step": 10689, "time_per_iteration": 2.5335159301757812 }, { "auxiliary_loss_clip": 0.01097981, "auxiliary_loss_mlp": 0.01032097, "balance_loss_clip": 1.04061198, "balance_loss_mlp": 1.0200088, "epoch": 0.6427175710205922, "flos": 21835914647040.0, "grad_norm": 2.974297200312542, "language_loss": 0.72271609, "learning_rate": 1.195989736948226e-06, "loss": 0.74401689, "num_input_tokens_seen": 230615135, "step": 10690, "time_per_iteration": 2.678732395172119 }, { "auxiliary_loss_clip": 0.01087197, "auxiliary_loss_mlp": 0.01034747, "balance_loss_clip": 1.03962326, "balance_loss_mlp": 1.02202129, "epoch": 0.6427776942732601, "flos": 17786627660160.0, "grad_norm": 1.747376446154191, "language_loss": 0.77734852, "learning_rate": 1.1956331465076143e-06, "loss": 0.79856801, "num_input_tokens_seen": 230631965, "step": 10691, "time_per_iteration": 2.659553050994873 }, { "auxiliary_loss_clip": 0.01094577, "auxiliary_loss_mlp": 0.01035823, "balance_loss_clip": 1.03965449, "balance_loss_mlp": 1.02299619, "epoch": 0.6428378175259282, "flos": 15085893911040.0, "grad_norm": 1.8605559166150418, "language_loss": 0.74422169, "learning_rate": 1.1952765865686738e-06, "loss": 0.76552576, "num_input_tokens_seen": 230649565, "step": 10692, "time_per_iteration": 4.251460790634155 }, { "auxiliary_loss_clip": 0.01104664, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.04084218, "balance_loss_mlp": 1.02164721, "epoch": 0.6428979407785961, "flos": 23841776816640.0, "grad_norm": 1.9248860914210837, "language_loss": 0.61550558, "learning_rate": 1.1949200571449263e-06, "loss": 0.63689899, "num_input_tokens_seen": 230669265, "step": 10693, "time_per_iteration": 2.6779651641845703 }, { "auxiliary_loss_clip": 0.01080488, "auxiliary_loss_mlp": 0.0102922, "balance_loss_clip": 1.04029202, "balance_loss_mlp": 1.016065, "epoch": 0.6429580640312641, "flos": 32926852892160.0, "grad_norm": 2.329079095224612, "language_loss": 0.59532356, "learning_rate": 1.1945635582498903e-06, "loss": 0.61642069, "num_input_tokens_seen": 230690575, "step": 10694, "time_per_iteration": 2.8363914489746094 }, { "auxiliary_loss_clip": 0.01089804, "auxiliary_loss_mlp": 0.01035527, "balance_loss_clip": 1.03853726, "balance_loss_mlp": 1.02255106, "epoch": 0.643018187283932, "flos": 21068359896960.0, "grad_norm": 1.4014414192812676, "language_loss": 0.80109406, "learning_rate": 1.1942070898970853e-06, "loss": 0.82234728, "num_input_tokens_seen": 230709420, "step": 10695, "time_per_iteration": 2.6794557571411133 }, { "auxiliary_loss_clip": 0.01116687, "auxiliary_loss_mlp": 0.01040293, "balance_loss_clip": 1.04089379, "balance_loss_mlp": 1.02677488, "epoch": 0.6430783105366, "flos": 26724649455360.0, "grad_norm": 1.7759454400987778, "language_loss": 0.73687971, "learning_rate": 1.1938506521000285e-06, "loss": 0.75844944, "num_input_tokens_seen": 230729350, "step": 10696, "time_per_iteration": 2.7068281173706055 }, { "auxiliary_loss_clip": 0.01078835, "auxiliary_loss_mlp": 0.01029709, "balance_loss_clip": 1.03717327, "balance_loss_mlp": 1.01736438, "epoch": 0.643138433789268, "flos": 23696841438720.0, "grad_norm": 1.6299732646475602, "language_loss": 0.75820529, "learning_rate": 1.1934942448722347e-06, "loss": 0.7792908, "num_input_tokens_seen": 230749220, "step": 10697, "time_per_iteration": 2.8328888416290283 }, { "auxiliary_loss_clip": 0.01091041, "auxiliary_loss_mlp": 0.01032937, "balance_loss_clip": 1.03859711, "balance_loss_mlp": 1.02061689, "epoch": 0.643198557041936, "flos": 34202184255360.0, "grad_norm": 1.3945921589698136, "language_loss": 0.65932959, "learning_rate": 1.1931378682272208e-06, "loss": 0.68056941, "num_input_tokens_seen": 230770245, "step": 10698, "time_per_iteration": 2.784822702407837 }, { "auxiliary_loss_clip": 0.01036478, "auxiliary_loss_mlp": 0.01005901, "balance_loss_clip": 1.01277423, "balance_loss_mlp": 1.00470889, "epoch": 0.643258680294604, "flos": 67626473621760.0, "grad_norm": 0.8642865572859256, "language_loss": 0.63445872, "learning_rate": 1.1927815221784996e-06, "loss": 0.65488249, "num_input_tokens_seen": 230837030, "step": 10699, "time_per_iteration": 3.1397321224212646 }, { "auxiliary_loss_clip": 0.01103425, "auxiliary_loss_mlp": 0.01028666, "balance_loss_clip": 1.04155254, "balance_loss_mlp": 1.01698923, "epoch": 0.6433188035472719, "flos": 25185984508800.0, "grad_norm": 1.8812795881876412, "language_loss": 0.69277722, "learning_rate": 1.1924252067395838e-06, "loss": 0.71409816, "num_input_tokens_seen": 230856845, "step": 10700, "time_per_iteration": 2.6566555500030518 }, { "auxiliary_loss_clip": 0.01115928, "auxiliary_loss_mlp": 0.01028377, "balance_loss_clip": 1.04087234, "balance_loss_mlp": 1.01547289, "epoch": 0.6433789267999399, "flos": 24973573432320.0, "grad_norm": 2.050726314143076, "language_loss": 0.7285673, "learning_rate": 1.1920689219239855e-06, "loss": 0.75001037, "num_input_tokens_seen": 230878785, "step": 10701, "time_per_iteration": 2.7663381099700928 }, { "auxiliary_loss_clip": 0.01106257, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.03919315, "balance_loss_mlp": 1.01695347, "epoch": 0.6434390500526078, "flos": 17566028282880.0, "grad_norm": 1.983939492381853, "language_loss": 0.82094157, "learning_rate": 1.1917126677452144e-06, "loss": 0.84232259, "num_input_tokens_seen": 230895445, "step": 10702, "time_per_iteration": 2.634734630584717 }, { "auxiliary_loss_clip": 0.01084567, "auxiliary_loss_mlp": 0.01040406, "balance_loss_clip": 1.03733373, "balance_loss_mlp": 1.02802002, "epoch": 0.6434991733052758, "flos": 20843594542080.0, "grad_norm": 2.1366744665576536, "language_loss": 0.74528348, "learning_rate": 1.1913564442167798e-06, "loss": 0.76653326, "num_input_tokens_seen": 230911375, "step": 10703, "time_per_iteration": 2.712024688720703 }, { "auxiliary_loss_clip": 0.00980042, "auxiliary_loss_mlp": 0.01002542, "balance_loss_clip": 1.00990796, "balance_loss_mlp": 1.00124288, "epoch": 0.6435592965579437, "flos": 66094596345600.0, "grad_norm": 0.6668164665543085, "language_loss": 0.54507017, "learning_rate": 1.1910002513521898e-06, "loss": 0.56489605, "num_input_tokens_seen": 230975990, "step": 10704, "time_per_iteration": 3.391496419906616 }, { "auxiliary_loss_clip": 0.01074279, "auxiliary_loss_mlp": 0.01024183, "balance_loss_clip": 1.03965342, "balance_loss_mlp": 1.01269126, "epoch": 0.6436194198106118, "flos": 23768842250880.0, "grad_norm": 1.6398007726436414, "language_loss": 0.76942575, "learning_rate": 1.1906440891649519e-06, "loss": 0.79041034, "num_input_tokens_seen": 230997110, "step": 10705, "time_per_iteration": 3.151123523712158 }, { "auxiliary_loss_clip": 0.01080341, "auxiliary_loss_mlp": 0.01040696, "balance_loss_clip": 1.03794503, "balance_loss_mlp": 1.02824438, "epoch": 0.6436795430632797, "flos": 20230312705920.0, "grad_norm": 1.6220851966206657, "language_loss": 0.78966212, "learning_rate": 1.1902879576685708e-06, "loss": 0.81087244, "num_input_tokens_seen": 231015590, "step": 10706, "time_per_iteration": 2.7351467609405518 }, { "auxiliary_loss_clip": 0.01073614, "auxiliary_loss_mlp": 0.01037334, "balance_loss_clip": 1.03537798, "balance_loss_mlp": 1.02350581, "epoch": 0.6437396663159477, "flos": 20301846641280.0, "grad_norm": 1.995060337991945, "language_loss": 0.80729055, "learning_rate": 1.1899318568765518e-06, "loss": 0.82840002, "num_input_tokens_seen": 231033800, "step": 10707, "time_per_iteration": 2.8090367317199707 }, { "auxiliary_loss_clip": 0.01102074, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.03903484, "balance_loss_mlp": 1.02176499, "epoch": 0.6437997895686156, "flos": 23878585278720.0, "grad_norm": 1.8783086721412918, "language_loss": 0.85947567, "learning_rate": 1.1895757868023978e-06, "loss": 0.88083982, "num_input_tokens_seen": 231053160, "step": 10708, "time_per_iteration": 2.7102444171905518 }, { "auxiliary_loss_clip": 0.01070026, "auxiliary_loss_mlp": 0.0104392, "balance_loss_clip": 1.04000461, "balance_loss_mlp": 1.02895367, "epoch": 0.6438599128212836, "flos": 18989275852800.0, "grad_norm": 2.169380763975439, "language_loss": 0.65262228, "learning_rate": 1.1892197474596106e-06, "loss": 0.67376173, "num_input_tokens_seen": 231069470, "step": 10709, "time_per_iteration": 2.6978535652160645 }, { "auxiliary_loss_clip": 0.01115477, "auxiliary_loss_mlp": 0.01032813, "balance_loss_clip": 1.04076731, "balance_loss_mlp": 1.02048671, "epoch": 0.6439200360739517, "flos": 24096347481600.0, "grad_norm": 1.8116959175260157, "language_loss": 0.80929708, "learning_rate": 1.1888637388616929e-06, "loss": 0.83077991, "num_input_tokens_seen": 231088205, "step": 10710, "time_per_iteration": 2.6809825897216797 }, { "auxiliary_loss_clip": 0.0110175, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.03748906, "balance_loss_mlp": 1.0203414, "epoch": 0.6439801593266196, "flos": 31902141697920.0, "grad_norm": 2.6140299044708106, "language_loss": 0.6634506, "learning_rate": 1.1885077610221425e-06, "loss": 0.68479443, "num_input_tokens_seen": 231107850, "step": 10711, "time_per_iteration": 2.71571946144104 }, { "auxiliary_loss_clip": 0.01077359, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.0414753, "balance_loss_mlp": 1.02000391, "epoch": 0.6440402825792876, "flos": 27125879351040.0, "grad_norm": 2.2683722533974437, "language_loss": 0.78656554, "learning_rate": 1.1881518139544597e-06, "loss": 0.80767059, "num_input_tokens_seen": 231127200, "step": 10712, "time_per_iteration": 2.785280466079712 }, { "auxiliary_loss_clip": 0.01103094, "auxiliary_loss_mlp": 0.01037973, "balance_loss_clip": 1.03856206, "balance_loss_mlp": 1.02487159, "epoch": 0.6441004058319555, "flos": 20667704618880.0, "grad_norm": 1.6337129224497011, "language_loss": 0.82845241, "learning_rate": 1.1877958976721417e-06, "loss": 0.84986305, "num_input_tokens_seen": 231146360, "step": 10713, "time_per_iteration": 2.6682519912719727 }, { "auxiliary_loss_clip": 0.01111989, "auxiliary_loss_mlp": 0.0103674, "balance_loss_clip": 1.04118943, "balance_loss_mlp": 1.02455091, "epoch": 0.6441605290846235, "flos": 26026006947840.0, "grad_norm": 1.377683768387238, "language_loss": 0.78550875, "learning_rate": 1.187440012188684e-06, "loss": 0.80699605, "num_input_tokens_seen": 231168350, "step": 10714, "time_per_iteration": 2.6294350624084473 }, { "auxiliary_loss_clip": 0.01081537, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.03937292, "balance_loss_mlp": 1.01982093, "epoch": 0.6442206523372914, "flos": 24899489631360.0, "grad_norm": 1.6804962466974145, "language_loss": 0.8137539, "learning_rate": 1.187084157517583e-06, "loss": 0.83488327, "num_input_tokens_seen": 231188385, "step": 10715, "time_per_iteration": 2.7179040908813477 }, { "auxiliary_loss_clip": 0.01083275, "auxiliary_loss_mlp": 0.01033506, "balance_loss_clip": 1.03462327, "balance_loss_mlp": 1.02041125, "epoch": 0.6442807755899594, "flos": 25156322853120.0, "grad_norm": 2.56330690161098, "language_loss": 0.81656396, "learning_rate": 1.186728333672332e-06, "loss": 0.83773172, "num_input_tokens_seen": 231209880, "step": 10716, "time_per_iteration": 2.71616268157959 }, { "auxiliary_loss_clip": 0.01080679, "auxiliary_loss_mlp": 0.01037142, "balance_loss_clip": 1.03870273, "balance_loss_mlp": 1.02335536, "epoch": 0.6443408988426274, "flos": 27344503480320.0, "grad_norm": 2.019166193158946, "language_loss": 0.78575444, "learning_rate": 1.186372540666424e-06, "loss": 0.80693269, "num_input_tokens_seen": 231230765, "step": 10717, "time_per_iteration": 2.7821998596191406 }, { "auxiliary_loss_clip": 0.01111081, "auxiliary_loss_mlp": 0.01033784, "balance_loss_clip": 1.03954279, "balance_loss_mlp": 1.0215416, "epoch": 0.6444010220952954, "flos": 27928339142400.0, "grad_norm": 1.554880211694131, "language_loss": 0.68287563, "learning_rate": 1.1860167785133513e-06, "loss": 0.70432431, "num_input_tokens_seen": 231252350, "step": 10718, "time_per_iteration": 2.619870662689209 }, { "auxiliary_loss_clip": 0.01025406, "auxiliary_loss_mlp": 0.01008951, "balance_loss_clip": 1.01146674, "balance_loss_mlp": 1.00788391, "epoch": 0.6444611453479633, "flos": 71215024855680.0, "grad_norm": 0.7631804630715925, "language_loss": 0.49633595, "learning_rate": 1.185661047226603e-06, "loss": 0.51667953, "num_input_tokens_seen": 231313865, "step": 10719, "time_per_iteration": 3.3252131938934326 }, { "auxiliary_loss_clip": 0.01118591, "auxiliary_loss_mlp": 0.01039818, "balance_loss_clip": 1.04287648, "balance_loss_mlp": 1.02602601, "epoch": 0.6445212686006313, "flos": 22705131864960.0, "grad_norm": 2.1022111741366603, "language_loss": 0.77604353, "learning_rate": 1.18530534681967e-06, "loss": 0.79762757, "num_input_tokens_seen": 231331710, "step": 10720, "time_per_iteration": 2.6171679496765137 }, { "auxiliary_loss_clip": 0.01094489, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.04128611, "balance_loss_mlp": 1.02126074, "epoch": 0.6445813918532992, "flos": 21178821196800.0, "grad_norm": 1.7066840296237504, "language_loss": 0.76980746, "learning_rate": 1.18494967730604e-06, "loss": 0.79110014, "num_input_tokens_seen": 231350705, "step": 10721, "time_per_iteration": 2.8883464336395264 }, { "auxiliary_loss_clip": 0.01077386, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.03889298, "balance_loss_mlp": 1.02178049, "epoch": 0.6446415151059672, "flos": 25191910252800.0, "grad_norm": 2.156937552750908, "language_loss": 0.73425972, "learning_rate": 1.1845940386991995e-06, "loss": 0.75538391, "num_input_tokens_seen": 231369550, "step": 10722, "time_per_iteration": 3.0992050170898438 }, { "auxiliary_loss_clip": 0.0111233, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.03991735, "balance_loss_mlp": 1.02135682, "epoch": 0.6447016383586353, "flos": 25302227898240.0, "grad_norm": 1.8325068766714112, "language_loss": 0.77818036, "learning_rate": 1.184238431012635e-06, "loss": 0.79963994, "num_input_tokens_seen": 231389285, "step": 10723, "time_per_iteration": 2.6199328899383545 }, { "auxiliary_loss_clip": 0.01104393, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.03816402, "balance_loss_mlp": 1.02488565, "epoch": 0.6447617616113032, "flos": 27703142824320.0, "grad_norm": 2.2443871002503903, "language_loss": 0.58686608, "learning_rate": 1.1838828542598312e-06, "loss": 0.60829639, "num_input_tokens_seen": 231408820, "step": 10724, "time_per_iteration": 4.554950475692749 }, { "auxiliary_loss_clip": 0.01102176, "auxiliary_loss_mlp": 0.01033682, "balance_loss_clip": 1.0418992, "balance_loss_mlp": 1.02188635, "epoch": 0.6448218848639712, "flos": 23039101543680.0, "grad_norm": 1.7131170240074274, "language_loss": 0.83707219, "learning_rate": 1.183527308454271e-06, "loss": 0.8584308, "num_input_tokens_seen": 231428100, "step": 10725, "time_per_iteration": 2.5963871479034424 }, { "auxiliary_loss_clip": 0.01089104, "auxiliary_loss_mlp": 0.01037801, "balance_loss_clip": 1.03586388, "balance_loss_mlp": 1.02444363, "epoch": 0.6448820081166391, "flos": 24496104919680.0, "grad_norm": 1.7945503193220944, "language_loss": 0.82327414, "learning_rate": 1.1831717936094368e-06, "loss": 0.84454322, "num_input_tokens_seen": 231445810, "step": 10726, "time_per_iteration": 6.177702188491821 }, { "auxiliary_loss_clip": 0.0110184, "auxiliary_loss_mlp": 0.01037744, "balance_loss_clip": 1.03911293, "balance_loss_mlp": 1.02391601, "epoch": 0.6449421313693071, "flos": 22419283432320.0, "grad_norm": 5.950779634023435, "language_loss": 0.81306756, "learning_rate": 1.1828163097388108e-06, "loss": 0.83446342, "num_input_tokens_seen": 231463570, "step": 10727, "time_per_iteration": 2.646756172180176 }, { "auxiliary_loss_clip": 0.01114052, "auxiliary_loss_mlp": 0.01035116, "balance_loss_clip": 1.04432821, "balance_loss_mlp": 1.02101326, "epoch": 0.645002254621975, "flos": 20225715765120.0, "grad_norm": 2.0767423550252047, "language_loss": 0.79137063, "learning_rate": 1.1824608568558717e-06, "loss": 0.81286234, "num_input_tokens_seen": 231482155, "step": 10728, "time_per_iteration": 2.6014702320098877 }, { "auxiliary_loss_clip": 0.01018281, "auxiliary_loss_mlp": 0.01043432, "balance_loss_clip": 1.03341746, "balance_loss_mlp": 1.02857876, "epoch": 0.645062377874643, "flos": 27855440490240.0, "grad_norm": 1.6698019924695346, "language_loss": 0.74069214, "learning_rate": 1.1821054349740988e-06, "loss": 0.76130933, "num_input_tokens_seen": 231502465, "step": 10729, "time_per_iteration": 2.9942080974578857 }, { "auxiliary_loss_clip": 0.01072033, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.03895199, "balance_loss_mlp": 1.0206902, "epoch": 0.645122501127311, "flos": 25301509626240.0, "grad_norm": 1.675292027703949, "language_loss": 0.66314375, "learning_rate": 1.1817500441069706e-06, "loss": 0.68421257, "num_input_tokens_seen": 231522740, "step": 10730, "time_per_iteration": 3.029480218887329 }, { "auxiliary_loss_clip": 0.01053326, "auxiliary_loss_mlp": 0.01035886, "balance_loss_clip": 1.03969455, "balance_loss_mlp": 1.02077615, "epoch": 0.645182624379979, "flos": 18807352444800.0, "grad_norm": 1.6301580114634824, "language_loss": 0.63516945, "learning_rate": 1.1813946842679614e-06, "loss": 0.65606159, "num_input_tokens_seen": 231542050, "step": 10731, "time_per_iteration": 4.425801038742065 }, { "auxiliary_loss_clip": 0.01111857, "auxiliary_loss_mlp": 0.01032419, "balance_loss_clip": 1.03885424, "balance_loss_mlp": 1.01941907, "epoch": 0.6452427476326469, "flos": 18332182402560.0, "grad_norm": 1.6688797138193545, "language_loss": 0.68021357, "learning_rate": 1.1810393554705492e-06, "loss": 0.70165634, "num_input_tokens_seen": 231560380, "step": 10732, "time_per_iteration": 2.531669855117798 }, { "auxiliary_loss_clip": 0.01104232, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.04108346, "balance_loss_mlp": 1.0236969, "epoch": 0.6453028708853149, "flos": 22784746360320.0, "grad_norm": 2.2675077381725557, "language_loss": 0.75637865, "learning_rate": 1.1806840577282055e-06, "loss": 0.77778876, "num_input_tokens_seen": 231580810, "step": 10733, "time_per_iteration": 2.6263926029205322 }, { "auxiliary_loss_clip": 0.01104718, "auxiliary_loss_mlp": 0.01039721, "balance_loss_clip": 1.03942811, "balance_loss_mlp": 1.02548099, "epoch": 0.6453629941379828, "flos": 23945989150080.0, "grad_norm": 2.5422080980889903, "language_loss": 0.66799378, "learning_rate": 1.1803287910544048e-06, "loss": 0.6894381, "num_input_tokens_seen": 231600585, "step": 10734, "time_per_iteration": 2.639566421508789 }, { "auxiliary_loss_clip": 0.01113842, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.04339838, "balance_loss_mlp": 1.028898, "epoch": 0.6454231173906508, "flos": 17676381841920.0, "grad_norm": 1.794099580406708, "language_loss": 0.73622543, "learning_rate": 1.1799735554626191e-06, "loss": 0.75777888, "num_input_tokens_seen": 231618765, "step": 10735, "time_per_iteration": 2.5158708095550537 }, { "auxiliary_loss_clip": 0.01052163, "auxiliary_loss_mlp": 0.00771954, "balance_loss_clip": 1.03596699, "balance_loss_mlp": 1.00020361, "epoch": 0.6454832406433189, "flos": 23292774368640.0, "grad_norm": 1.8433870916344732, "language_loss": 0.74927819, "learning_rate": 1.1796183509663176e-06, "loss": 0.76751935, "num_input_tokens_seen": 231638525, "step": 10736, "time_per_iteration": 2.781177282333374 }, { "auxiliary_loss_clip": 0.01109179, "auxiliary_loss_mlp": 0.01033432, "balance_loss_clip": 1.04235053, "balance_loss_mlp": 1.01909697, "epoch": 0.6455433638959868, "flos": 20157198572160.0, "grad_norm": 1.9123509169430688, "language_loss": 0.70616424, "learning_rate": 1.1792631775789708e-06, "loss": 0.72759038, "num_input_tokens_seen": 231656785, "step": 10737, "time_per_iteration": 2.5800046920776367 }, { "auxiliary_loss_clip": 0.0102545, "auxiliary_loss_mlp": 0.01002929, "balance_loss_clip": 1.01085997, "balance_loss_mlp": 1.00164151, "epoch": 0.6456034871486548, "flos": 66532922012160.0, "grad_norm": 0.7817772178911736, "language_loss": 0.58405674, "learning_rate": 1.1789080353140464e-06, "loss": 0.60434055, "num_input_tokens_seen": 231719075, "step": 10738, "time_per_iteration": 3.238203287124634 }, { "auxiliary_loss_clip": 0.01079809, "auxiliary_loss_mlp": 0.01029827, "balance_loss_clip": 1.0387454, "balance_loss_mlp": 1.01666009, "epoch": 0.6456636104013227, "flos": 24206090509440.0, "grad_norm": 1.920167100598176, "language_loss": 0.74507523, "learning_rate": 1.1785529241850118e-06, "loss": 0.76617157, "num_input_tokens_seen": 231737810, "step": 10739, "time_per_iteration": 2.704909324645996 }, { "auxiliary_loss_clip": 0.01096514, "auxiliary_loss_mlp": 0.00771409, "balance_loss_clip": 1.04137897, "balance_loss_mlp": 1.00027609, "epoch": 0.6457237336539907, "flos": 23624086440960.0, "grad_norm": 1.8028230929667255, "language_loss": 0.70776832, "learning_rate": 1.1781978442053324e-06, "loss": 0.72644746, "num_input_tokens_seen": 231756140, "step": 10740, "time_per_iteration": 2.6947245597839355 }, { "auxiliary_loss_clip": 0.01016337, "auxiliary_loss_mlp": 0.01004394, "balance_loss_clip": 1.01068592, "balance_loss_mlp": 1.00314224, "epoch": 0.6457838569066586, "flos": 65846023251840.0, "grad_norm": 0.8728350789543404, "language_loss": 0.55255193, "learning_rate": 1.1778427953884733e-06, "loss": 0.57275927, "num_input_tokens_seen": 231823665, "step": 10741, "time_per_iteration": 3.214613676071167 }, { "auxiliary_loss_clip": 0.01113695, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.04090226, "balance_loss_mlp": 1.02212918, "epoch": 0.6458439801593266, "flos": 22381972179840.0, "grad_norm": 1.5851201591734638, "language_loss": 0.80647045, "learning_rate": 1.1774877777478977e-06, "loss": 0.8279537, "num_input_tokens_seen": 231844500, "step": 10742, "time_per_iteration": 2.6147494316101074 }, { "auxiliary_loss_clip": 0.01089275, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.03800607, "balance_loss_mlp": 1.02160883, "epoch": 0.6459041034119946, "flos": 24789243813120.0, "grad_norm": 1.493920390788815, "language_loss": 0.81934315, "learning_rate": 1.1771327912970678e-06, "loss": 0.84058142, "num_input_tokens_seen": 231864510, "step": 10743, "time_per_iteration": 2.7598674297332764 }, { "auxiliary_loss_clip": 0.01088471, "auxiliary_loss_mlp": 0.01032232, "balance_loss_clip": 1.03786039, "balance_loss_mlp": 1.01933324, "epoch": 0.6459642266646626, "flos": 18325358818560.0, "grad_norm": 5.256757204998113, "language_loss": 0.7177366, "learning_rate": 1.1767778360494453e-06, "loss": 0.73894364, "num_input_tokens_seen": 231881555, "step": 10744, "time_per_iteration": 2.620422840118408 }, { "auxiliary_loss_clip": 0.01114623, "auxiliary_loss_mlp": 0.01029271, "balance_loss_clip": 1.04074514, "balance_loss_mlp": 1.01683736, "epoch": 0.6460243499173305, "flos": 43581368891520.0, "grad_norm": 1.6850885635931934, "language_loss": 0.66688418, "learning_rate": 1.1764229120184896e-06, "loss": 0.68832302, "num_input_tokens_seen": 231905945, "step": 10745, "time_per_iteration": 2.7924861907958984 }, { "auxiliary_loss_clip": 0.01101668, "auxiliary_loss_mlp": 0.01034902, "balance_loss_clip": 1.03878927, "balance_loss_mlp": 1.02122271, "epoch": 0.6460844731699985, "flos": 19244026085760.0, "grad_norm": 2.3841357931880536, "language_loss": 0.73933601, "learning_rate": 1.1760680192176597e-06, "loss": 0.76070166, "num_input_tokens_seen": 231922535, "step": 10746, "time_per_iteration": 2.607113838195801 }, { "auxiliary_loss_clip": 0.01106683, "auxiliary_loss_mlp": 0.01035848, "balance_loss_clip": 1.04162467, "balance_loss_mlp": 1.02289009, "epoch": 0.6461445964226664, "flos": 27453348668160.0, "grad_norm": 1.3562492191561222, "language_loss": 0.66809833, "learning_rate": 1.175713157660413e-06, "loss": 0.6895237, "num_input_tokens_seen": 231944800, "step": 10747, "time_per_iteration": 2.7339725494384766 }, { "auxiliary_loss_clip": 0.01082798, "auxiliary_loss_mlp": 0.0104212, "balance_loss_clip": 1.03962016, "balance_loss_mlp": 1.02953124, "epoch": 0.6462047196753344, "flos": 20295489934080.0, "grad_norm": 1.7696623956762259, "language_loss": 0.67370367, "learning_rate": 1.1753583273602056e-06, "loss": 0.69495285, "num_input_tokens_seen": 231962970, "step": 10748, "time_per_iteration": 2.733555555343628 }, { "auxiliary_loss_clip": 0.01117812, "auxiliary_loss_mlp": 0.01044313, "balance_loss_clip": 1.04119956, "balance_loss_mlp": 1.03015089, "epoch": 0.6462648429280025, "flos": 22018340845440.0, "grad_norm": 1.9035207458082712, "language_loss": 0.75889313, "learning_rate": 1.1750035283304937e-06, "loss": 0.78051442, "num_input_tokens_seen": 231981195, "step": 10749, "time_per_iteration": 2.6402747631073 }, { "auxiliary_loss_clip": 0.01075833, "auxiliary_loss_mlp": 0.01041632, "balance_loss_clip": 1.03445184, "balance_loss_mlp": 1.02752352, "epoch": 0.6463249661806704, "flos": 27781141207680.0, "grad_norm": 1.5147294862876182, "language_loss": 0.77007931, "learning_rate": 1.17464876058473e-06, "loss": 0.79125392, "num_input_tokens_seen": 232001735, "step": 10750, "time_per_iteration": 2.7375411987304688 }, { "auxiliary_loss_clip": 0.01097872, "auxiliary_loss_mlp": 0.01038153, "balance_loss_clip": 1.03953791, "balance_loss_mlp": 1.02282298, "epoch": 0.6463850894333384, "flos": 22050588280320.0, "grad_norm": 2.1693323351013496, "language_loss": 0.68254787, "learning_rate": 1.1742940241363683e-06, "loss": 0.70390815, "num_input_tokens_seen": 232019830, "step": 10751, "time_per_iteration": 2.757457733154297 }, { "auxiliary_loss_clip": 0.01088079, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.03963614, "balance_loss_mlp": 1.0185945, "epoch": 0.6464452126860063, "flos": 21106245767040.0, "grad_norm": 1.9208554879181607, "language_loss": 0.71538639, "learning_rate": 1.1739393189988604e-06, "loss": 0.73659164, "num_input_tokens_seen": 232039625, "step": 10752, "time_per_iteration": 2.702068328857422 }, { "auxiliary_loss_clip": 0.0108316, "auxiliary_loss_mlp": 0.0104047, "balance_loss_clip": 1.03569722, "balance_loss_mlp": 1.02468061, "epoch": 0.6465053359386743, "flos": 16028045694720.0, "grad_norm": 1.6304463713193273, "language_loss": 0.78174138, "learning_rate": 1.1735846451856554e-06, "loss": 0.80297774, "num_input_tokens_seen": 232055855, "step": 10753, "time_per_iteration": 2.679288387298584 }, { "auxiliary_loss_clip": 0.01114663, "auxiliary_loss_mlp": 0.01041928, "balance_loss_clip": 1.04108715, "balance_loss_mlp": 1.02888012, "epoch": 0.6465654591913422, "flos": 23398674641280.0, "grad_norm": 1.8389919923642137, "language_loss": 0.85325253, "learning_rate": 1.1732300027102041e-06, "loss": 0.87481844, "num_input_tokens_seen": 232073475, "step": 10754, "time_per_iteration": 2.7047979831695557 }, { "auxiliary_loss_clip": 0.01089928, "auxiliary_loss_mlp": 0.01033615, "balance_loss_clip": 1.0371294, "balance_loss_mlp": 1.02018571, "epoch": 0.6466255824440102, "flos": 15377273038080.0, "grad_norm": 2.0086067487297203, "language_loss": 0.596542, "learning_rate": 1.1728753915859541e-06, "loss": 0.61777741, "num_input_tokens_seen": 232091090, "step": 10755, "time_per_iteration": 2.660458564758301 }, { "auxiliary_loss_clip": 0.01070404, "auxiliary_loss_mlp": 0.01034574, "balance_loss_clip": 1.03757024, "balance_loss_mlp": 1.02103186, "epoch": 0.6466857056966782, "flos": 16252846963200.0, "grad_norm": 2.348911212047805, "language_loss": 0.68158704, "learning_rate": 1.1725208118263518e-06, "loss": 0.70263684, "num_input_tokens_seen": 232107320, "step": 10756, "time_per_iteration": 2.667661190032959 }, { "auxiliary_loss_clip": 0.0107653, "auxiliary_loss_mlp": 0.01039991, "balance_loss_clip": 1.03933072, "balance_loss_mlp": 1.02511406, "epoch": 0.6467458289493462, "flos": 21178246579200.0, "grad_norm": 2.3037886815422772, "language_loss": 0.74333578, "learning_rate": 1.172166263444844e-06, "loss": 0.76450104, "num_input_tokens_seen": 232123930, "step": 10757, "time_per_iteration": 2.752260446548462 }, { "auxiliary_loss_clip": 0.01064083, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.0400213, "balance_loss_mlp": 1.02434397, "epoch": 0.6468059522020141, "flos": 17968299672960.0, "grad_norm": 1.4896032445983383, "language_loss": 0.74085969, "learning_rate": 1.1718117464548734e-06, "loss": 0.76187646, "num_input_tokens_seen": 232142905, "step": 10758, "time_per_iteration": 2.752277135848999 }, { "auxiliary_loss_clip": 0.01078484, "auxiliary_loss_mlp": 0.0103444, "balance_loss_clip": 1.04134357, "balance_loss_mlp": 1.02081478, "epoch": 0.6468660754546821, "flos": 17890157635200.0, "grad_norm": 1.5569302711566517, "language_loss": 0.67830229, "learning_rate": 1.1714572608698845e-06, "loss": 0.69943154, "num_input_tokens_seen": 232162230, "step": 10759, "time_per_iteration": 2.6961419582366943 }, { "auxiliary_loss_clip": 0.01078582, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.03437579, "balance_loss_mlp": 1.02430177, "epoch": 0.64692619870735, "flos": 22600991358720.0, "grad_norm": 1.7675629477863553, "language_loss": 0.75511646, "learning_rate": 1.1711028067033197e-06, "loss": 0.77628791, "num_input_tokens_seen": 232182700, "step": 10760, "time_per_iteration": 2.7628531455993652 }, { "auxiliary_loss_clip": 0.01088869, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 1.03735101, "balance_loss_mlp": 1.02188993, "epoch": 0.646986321960018, "flos": 49600786993920.0, "grad_norm": 1.635479063212096, "language_loss": 0.65361971, "learning_rate": 1.1707483839686194e-06, "loss": 0.6748656, "num_input_tokens_seen": 232208235, "step": 10761, "time_per_iteration": 2.939115047454834 }, { "auxiliary_loss_clip": 0.01069611, "auxiliary_loss_mlp": 0.01035372, "balance_loss_clip": 1.03998923, "balance_loss_mlp": 1.02115035, "epoch": 0.6470464452126861, "flos": 21908454163200.0, "grad_norm": 2.1978879485100014, "language_loss": 0.6946497, "learning_rate": 1.1703939926792235e-06, "loss": 0.71569955, "num_input_tokens_seen": 232228720, "step": 10762, "time_per_iteration": 4.4654014110565186 }, { "auxiliary_loss_clip": 0.01117949, "auxiliary_loss_mlp": 0.01037436, "balance_loss_clip": 1.04075444, "balance_loss_mlp": 1.02360213, "epoch": 0.647106568465354, "flos": 18106124158080.0, "grad_norm": 1.972655429723057, "language_loss": 0.82998466, "learning_rate": 1.1700396328485705e-06, "loss": 0.85153854, "num_input_tokens_seen": 232244655, "step": 10763, "time_per_iteration": 2.592090129852295 }, { "auxiliary_loss_clip": 0.0103456, "auxiliary_loss_mlp": 0.01005031, "balance_loss_clip": 1.01049972, "balance_loss_mlp": 1.00385058, "epoch": 0.647166691718022, "flos": 69480038125440.0, "grad_norm": 0.712357320853497, "language_loss": 0.57828617, "learning_rate": 1.1696853044900978e-06, "loss": 0.59868205, "num_input_tokens_seen": 232308685, "step": 10764, "time_per_iteration": 3.3077809810638428 }, { "auxiliary_loss_clip": 0.01077866, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.03704214, "balance_loss_mlp": 1.02015924, "epoch": 0.6472268149706899, "flos": 34095170661120.0, "grad_norm": 2.021573071850794, "language_loss": 0.6068002, "learning_rate": 1.1693310076172413e-06, "loss": 0.62791574, "num_input_tokens_seen": 232327520, "step": 10765, "time_per_iteration": 2.940326690673828 }, { "auxiliary_loss_clip": 0.01113775, "auxiliary_loss_mlp": 0.01033181, "balance_loss_clip": 1.04050612, "balance_loss_mlp": 1.02059865, "epoch": 0.6472869382233579, "flos": 28111232217600.0, "grad_norm": 1.7427036976648405, "language_loss": 0.62848121, "learning_rate": 1.168976742243437e-06, "loss": 0.64995074, "num_input_tokens_seen": 232349025, "step": 10766, "time_per_iteration": 5.861475229263306 }, { "auxiliary_loss_clip": 0.01090186, "auxiliary_loss_mlp": 0.01036411, "balance_loss_clip": 1.04002905, "balance_loss_mlp": 1.02172494, "epoch": 0.6473470614760258, "flos": 22492146170880.0, "grad_norm": 2.0617673547917255, "language_loss": 0.75767088, "learning_rate": 1.1686225083821174e-06, "loss": 0.77893686, "num_input_tokens_seen": 232367835, "step": 10767, "time_per_iteration": 2.7045323848724365 }, { "auxiliary_loss_clip": 0.01096864, "auxiliary_loss_mlp": 0.01033099, "balance_loss_clip": 1.03984213, "balance_loss_mlp": 1.02028418, "epoch": 0.6474071847286939, "flos": 14538938538240.0, "grad_norm": 1.9988107632557572, "language_loss": 0.78334147, "learning_rate": 1.1682683060467153e-06, "loss": 0.80464113, "num_input_tokens_seen": 232385840, "step": 10768, "time_per_iteration": 2.603180170059204 }, { "auxiliary_loss_clip": 0.01056997, "auxiliary_loss_mlp": 0.01034297, "balance_loss_clip": 1.03838003, "balance_loss_mlp": 1.02096355, "epoch": 0.6474673079813618, "flos": 24098214988800.0, "grad_norm": 1.607650242718932, "language_loss": 0.71857584, "learning_rate": 1.167914135250663e-06, "loss": 0.73948884, "num_input_tokens_seen": 232406205, "step": 10769, "time_per_iteration": 2.7530863285064697 }, { "auxiliary_loss_clip": 0.01113406, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.04209769, "balance_loss_mlp": 1.02214372, "epoch": 0.6475274312340298, "flos": 14976186796800.0, "grad_norm": 1.9573022312706896, "language_loss": 0.71980953, "learning_rate": 1.1675599960073895e-06, "loss": 0.74129134, "num_input_tokens_seen": 232424995, "step": 10770, "time_per_iteration": 4.22503137588501 }, { "auxiliary_loss_clip": 0.01073177, "auxiliary_loss_mlp": 0.01031965, "balance_loss_clip": 1.03501081, "balance_loss_mlp": 1.01759458, "epoch": 0.6475875544866977, "flos": 25045322849280.0, "grad_norm": 1.5542236081497367, "language_loss": 0.73281699, "learning_rate": 1.167205888330325e-06, "loss": 0.75386834, "num_input_tokens_seen": 232445870, "step": 10771, "time_per_iteration": 2.841069459915161 }, { "auxiliary_loss_clip": 0.01074703, "auxiliary_loss_mlp": 0.0103808, "balance_loss_clip": 1.03516805, "balance_loss_mlp": 1.02413297, "epoch": 0.6476476777393657, "flos": 16472153450880.0, "grad_norm": 1.9087232907246778, "language_loss": 0.74044871, "learning_rate": 1.1668518122328958e-06, "loss": 0.76157653, "num_input_tokens_seen": 232464285, "step": 10772, "time_per_iteration": 2.775754690170288 }, { "auxiliary_loss_clip": 0.01088465, "auxiliary_loss_mlp": 0.01031281, "balance_loss_clip": 1.03951991, "balance_loss_mlp": 1.01950288, "epoch": 0.6477078009920336, "flos": 25812267068160.0, "grad_norm": 1.563820733818388, "language_loss": 0.8277418, "learning_rate": 1.1664977677285305e-06, "loss": 0.84893924, "num_input_tokens_seen": 232485815, "step": 10773, "time_per_iteration": 2.7739098072052 }, { "auxiliary_loss_clip": 0.01100228, "auxiliary_loss_mlp": 0.00769385, "balance_loss_clip": 1.03956735, "balance_loss_mlp": 1.00008345, "epoch": 0.6477679242447016, "flos": 17676130446720.0, "grad_norm": 1.451687382466444, "language_loss": 0.78496003, "learning_rate": 1.1661437548306524e-06, "loss": 0.80365622, "num_input_tokens_seen": 232504875, "step": 10774, "time_per_iteration": 2.7035605907440186 }, { "auxiliary_loss_clip": 0.01104625, "auxiliary_loss_mlp": 0.01040629, "balance_loss_clip": 1.04012299, "balance_loss_mlp": 1.02751637, "epoch": 0.6478280474973696, "flos": 21032305620480.0, "grad_norm": 2.3182968489247986, "language_loss": 0.68886763, "learning_rate": 1.1657897735526867e-06, "loss": 0.71032017, "num_input_tokens_seen": 232521945, "step": 10775, "time_per_iteration": 2.7283878326416016 }, { "auxiliary_loss_clip": 0.01078255, "auxiliary_loss_mlp": 0.0104184, "balance_loss_clip": 1.03620017, "balance_loss_mlp": 1.02827358, "epoch": 0.6478881707500376, "flos": 21616931381760.0, "grad_norm": 1.867125007130101, "language_loss": 0.65918481, "learning_rate": 1.1654358239080574e-06, "loss": 0.68038571, "num_input_tokens_seen": 232541500, "step": 10776, "time_per_iteration": 2.792161226272583 }, { "auxiliary_loss_clip": 0.01086281, "auxiliary_loss_mlp": 0.01040573, "balance_loss_clip": 1.03693199, "balance_loss_mlp": 1.0267868, "epoch": 0.6479482940027056, "flos": 18442571875200.0, "grad_norm": 2.7363901491618297, "language_loss": 0.7900703, "learning_rate": 1.1650819059101839e-06, "loss": 0.81133884, "num_input_tokens_seen": 232559720, "step": 10777, "time_per_iteration": 2.6817147731781006 }, { "auxiliary_loss_clip": 0.01101857, "auxiliary_loss_mlp": 0.01033898, "balance_loss_clip": 1.04061663, "balance_loss_mlp": 1.0203439, "epoch": 0.6480084172553735, "flos": 22164066322560.0, "grad_norm": 2.418675876930909, "language_loss": 0.73090535, "learning_rate": 1.1647280195724896e-06, "loss": 0.75226295, "num_input_tokens_seen": 232579370, "step": 10778, "time_per_iteration": 2.7519023418426514 }, { "auxiliary_loss_clip": 0.01098704, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.03796005, "balance_loss_mlp": 1.01817703, "epoch": 0.6480685405080415, "flos": 24316228586880.0, "grad_norm": 1.4697687567373847, "language_loss": 0.78067875, "learning_rate": 1.1643741649083923e-06, "loss": 0.80197144, "num_input_tokens_seen": 232600495, "step": 10779, "time_per_iteration": 2.667295455932617 }, { "auxiliary_loss_clip": 0.01021608, "auxiliary_loss_mlp": 0.01004834, "balance_loss_clip": 1.00979376, "balance_loss_mlp": 1.00352228, "epoch": 0.6481286637607094, "flos": 59891207760000.0, "grad_norm": 0.722667977363254, "language_loss": 0.59406435, "learning_rate": 1.1640203419313095e-06, "loss": 0.61432874, "num_input_tokens_seen": 232663165, "step": 10780, "time_per_iteration": 3.146688461303711 }, { "auxiliary_loss_clip": 0.01013668, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.03276062, "balance_loss_mlp": 1.02043653, "epoch": 0.6481887870133775, "flos": 25484187219840.0, "grad_norm": 1.9346405521822077, "language_loss": 0.79079604, "learning_rate": 1.1636665506546599e-06, "loss": 0.81125784, "num_input_tokens_seen": 232683385, "step": 10781, "time_per_iteration": 3.1543314456939697 }, { "auxiliary_loss_clip": 0.01117668, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.04143655, "balance_loss_mlp": 1.02158904, "epoch": 0.6482489102660454, "flos": 19930206574080.0, "grad_norm": 2.567868177502946, "language_loss": 0.79041505, "learning_rate": 1.1633127910918578e-06, "loss": 0.81195259, "num_input_tokens_seen": 232699095, "step": 10782, "time_per_iteration": 2.8998003005981445 }, { "auxiliary_loss_clip": 0.01106141, "auxiliary_loss_mlp": 0.007711, "balance_loss_clip": 1.04090714, "balance_loss_mlp": 1.0001415, "epoch": 0.6483090335187134, "flos": 26979471515520.0, "grad_norm": 2.672580630052252, "language_loss": 0.64563107, "learning_rate": 1.1629590632563187e-06, "loss": 0.66440344, "num_input_tokens_seen": 232717920, "step": 10783, "time_per_iteration": 2.807725191116333 }, { "auxiliary_loss_clip": 0.01119847, "auxiliary_loss_mlp": 0.01038004, "balance_loss_clip": 1.04234159, "balance_loss_mlp": 1.02316856, "epoch": 0.6483691567713813, "flos": 25077965333760.0, "grad_norm": 1.6110368507909019, "language_loss": 0.88390124, "learning_rate": 1.1626053671614561e-06, "loss": 0.90547979, "num_input_tokens_seen": 232737605, "step": 10784, "time_per_iteration": 2.640153169631958 }, { "auxiliary_loss_clip": 0.01089797, "auxiliary_loss_mlp": 0.01033124, "balance_loss_clip": 1.03887093, "balance_loss_mlp": 1.02020776, "epoch": 0.6484292800240493, "flos": 16105972250880.0, "grad_norm": 2.090784466794914, "language_loss": 0.72988814, "learning_rate": 1.1622517028206815e-06, "loss": 0.75111735, "num_input_tokens_seen": 232755110, "step": 10785, "time_per_iteration": 2.6515488624572754 }, { "auxiliary_loss_clip": 0.01078138, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 1.03758073, "balance_loss_mlp": 1.01802194, "epoch": 0.6484894032767172, "flos": 28840398307200.0, "grad_norm": 1.5672388778764104, "language_loss": 0.69397259, "learning_rate": 1.1618980702474071e-06, "loss": 0.71506155, "num_input_tokens_seen": 232779040, "step": 10786, "time_per_iteration": 2.831984519958496 }, { "auxiliary_loss_clip": 0.01075224, "auxiliary_loss_mlp": 0.01031746, "balance_loss_clip": 1.03817129, "balance_loss_mlp": 1.01922286, "epoch": 0.6485495265293852, "flos": 30227052896640.0, "grad_norm": 2.0612082804403404, "language_loss": 0.71243078, "learning_rate": 1.161544469455041e-06, "loss": 0.73350048, "num_input_tokens_seen": 232800515, "step": 10787, "time_per_iteration": 2.793691635131836 }, { "auxiliary_loss_clip": 0.0111836, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.0412823, "balance_loss_mlp": 1.0220623, "epoch": 0.6486096497820532, "flos": 20082181017600.0, "grad_norm": 1.9333037316798733, "language_loss": 0.84715712, "learning_rate": 1.1611909004569934e-06, "loss": 0.86869359, "num_input_tokens_seen": 232818450, "step": 10788, "time_per_iteration": 2.606229543685913 }, { "auxiliary_loss_clip": 0.01078244, "auxiliary_loss_mlp": 0.01034763, "balance_loss_clip": 1.04034448, "balance_loss_mlp": 1.02126873, "epoch": 0.6486697730347212, "flos": 17129067333120.0, "grad_norm": 2.006310721450953, "language_loss": 0.7757296, "learning_rate": 1.1608373632666708e-06, "loss": 0.79685968, "num_input_tokens_seen": 232834785, "step": 10789, "time_per_iteration": 2.689147710800171 }, { "auxiliary_loss_clip": 0.01096496, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.03580093, "balance_loss_mlp": 1.01941395, "epoch": 0.6487298962873892, "flos": 38911940570880.0, "grad_norm": 1.6467685685264215, "language_loss": 0.75511503, "learning_rate": 1.160483857897479e-06, "loss": 0.77639687, "num_input_tokens_seen": 232856050, "step": 10790, "time_per_iteration": 2.8264946937561035 }, { "auxiliary_loss_clip": 0.01113527, "auxiliary_loss_mlp": 0.01036831, "balance_loss_clip": 1.04156542, "balance_loss_mlp": 1.02490366, "epoch": 0.6487900195400571, "flos": 11947840076160.0, "grad_norm": 2.307183406251666, "language_loss": 0.60332596, "learning_rate": 1.160130384362823e-06, "loss": 0.62482953, "num_input_tokens_seen": 232873945, "step": 10791, "time_per_iteration": 2.5990047454833984 }, { "auxiliary_loss_clip": 0.01076606, "auxiliary_loss_mlp": 0.01034239, "balance_loss_clip": 1.03773832, "balance_loss_mlp": 1.0215373, "epoch": 0.6488501427927251, "flos": 22344445445760.0, "grad_norm": 1.759760291391278, "language_loss": 0.86496675, "learning_rate": 1.1597769426761082e-06, "loss": 0.88607526, "num_input_tokens_seen": 232892160, "step": 10792, "time_per_iteration": 2.771683692932129 }, { "auxiliary_loss_clip": 0.01093434, "auxiliary_loss_mlp": 0.01039713, "balance_loss_clip": 1.03958428, "balance_loss_mlp": 1.02602792, "epoch": 0.648910266045393, "flos": 22236282616320.0, "grad_norm": 2.0358486422598445, "language_loss": 0.78231007, "learning_rate": 1.159423532850735e-06, "loss": 0.8036415, "num_input_tokens_seen": 232911725, "step": 10793, "time_per_iteration": 2.67922043800354 }, { "auxiliary_loss_clip": 0.0108252, "auxiliary_loss_mlp": 0.01032713, "balance_loss_clip": 1.0395385, "balance_loss_mlp": 1.0193671, "epoch": 0.6489703892980611, "flos": 25301258231040.0, "grad_norm": 2.0060089316964667, "language_loss": 0.75005889, "learning_rate": 1.1590701549001055e-06, "loss": 0.77121115, "num_input_tokens_seen": 232929085, "step": 10794, "time_per_iteration": 2.740185022354126 }, { "auxiliary_loss_clip": 0.01102066, "auxiliary_loss_mlp": 0.00770842, "balance_loss_clip": 1.03801179, "balance_loss_mlp": 1.00016379, "epoch": 0.649030512550729, "flos": 24571912573440.0, "grad_norm": 1.6388436552304226, "language_loss": 0.70095515, "learning_rate": 1.158716808837621e-06, "loss": 0.71968424, "num_input_tokens_seen": 232949455, "step": 10795, "time_per_iteration": 2.7056167125701904 }, { "auxiliary_loss_clip": 0.01092893, "auxiliary_loss_mlp": 0.01034868, "balance_loss_clip": 1.03938341, "balance_loss_mlp": 1.02145672, "epoch": 0.649090635803397, "flos": 26244702904320.0, "grad_norm": 1.931230622678825, "language_loss": 0.54384381, "learning_rate": 1.158363494676679e-06, "loss": 0.56512141, "num_input_tokens_seen": 232969445, "step": 10796, "time_per_iteration": 2.70178484916687 }, { "auxiliary_loss_clip": 0.0110304, "auxiliary_loss_mlp": 0.010382, "balance_loss_clip": 1.04058564, "balance_loss_mlp": 1.02635705, "epoch": 0.6491507590560649, "flos": 24937375501440.0, "grad_norm": 1.521654875765255, "language_loss": 0.77584833, "learning_rate": 1.1580102124306775e-06, "loss": 0.7972607, "num_input_tokens_seen": 232988900, "step": 10797, "time_per_iteration": 2.740236759185791 }, { "auxiliary_loss_clip": 0.010649, "auxiliary_loss_mlp": 0.01033495, "balance_loss_clip": 1.03765631, "balance_loss_mlp": 1.02110291, "epoch": 0.6492108823087329, "flos": 19499781899520.0, "grad_norm": 2.1886950551197835, "language_loss": 0.7017765, "learning_rate": 1.1576569621130134e-06, "loss": 0.72276044, "num_input_tokens_seen": 233005060, "step": 10798, "time_per_iteration": 2.7228379249572754 }, { "auxiliary_loss_clip": 0.01059107, "auxiliary_loss_mlp": 0.01032641, "balance_loss_clip": 1.03400683, "balance_loss_mlp": 1.02048159, "epoch": 0.6492710055614008, "flos": 19719303868800.0, "grad_norm": 1.8018305819700693, "language_loss": 0.76899987, "learning_rate": 1.1573037437370811e-06, "loss": 0.78991735, "num_input_tokens_seen": 233023375, "step": 10799, "time_per_iteration": 2.7452025413513184 }, { "auxiliary_loss_clip": 0.01102121, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.03952456, "balance_loss_mlp": 1.02255809, "epoch": 0.6493311288140688, "flos": 24317018686080.0, "grad_norm": 1.8690878603480447, "language_loss": 0.71881801, "learning_rate": 1.1569505573162755e-06, "loss": 0.74019992, "num_input_tokens_seen": 233043130, "step": 10800, "time_per_iteration": 2.681090831756592 }, { "auxiliary_loss_clip": 0.01025406, "auxiliary_loss_mlp": 0.01015193, "balance_loss_clip": 1.01085913, "balance_loss_mlp": 1.01379859, "epoch": 0.6493912520667368, "flos": 70934635290240.0, "grad_norm": 0.7781340996665279, "language_loss": 0.60211796, "learning_rate": 1.1565974028639897e-06, "loss": 0.62252396, "num_input_tokens_seen": 233110560, "step": 10801, "time_per_iteration": 3.3247601985931396 }, { "auxiliary_loss_clip": 0.01104473, "auxiliary_loss_mlp": 0.01042076, "balance_loss_clip": 1.04024768, "balance_loss_mlp": 1.02764034, "epoch": 0.6494513753194048, "flos": 25337779384320.0, "grad_norm": 2.523744267443401, "language_loss": 0.78645104, "learning_rate": 1.156244280393614e-06, "loss": 0.80791658, "num_input_tokens_seen": 233130080, "step": 10802, "time_per_iteration": 4.631081581115723 }, { "auxiliary_loss_clip": 0.01114091, "auxiliary_loss_mlp": 0.01039322, "balance_loss_clip": 1.03890288, "balance_loss_mlp": 1.02562487, "epoch": 0.6495114985720728, "flos": 24681978823680.0, "grad_norm": 1.6103480042358926, "language_loss": 0.74409741, "learning_rate": 1.155891189918541e-06, "loss": 0.76563156, "num_input_tokens_seen": 233150235, "step": 10803, "time_per_iteration": 2.6966469287872314 }, { "auxiliary_loss_clip": 0.01052817, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.03642201, "balance_loss_mlp": 1.02049232, "epoch": 0.6495716218247407, "flos": 23651162317440.0, "grad_norm": 2.357483246632454, "language_loss": 0.70044661, "learning_rate": 1.1555381314521578e-06, "loss": 0.72131014, "num_input_tokens_seen": 233166710, "step": 10804, "time_per_iteration": 2.8469581604003906 }, { "auxiliary_loss_clip": 0.01100022, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.03885949, "balance_loss_mlp": 1.01822269, "epoch": 0.6496317450774087, "flos": 22346169298560.0, "grad_norm": 1.6424372411167527, "language_loss": 0.72557664, "learning_rate": 1.1551851050078537e-06, "loss": 0.74689615, "num_input_tokens_seen": 233185445, "step": 10805, "time_per_iteration": 4.559306621551514 }, { "auxiliary_loss_clip": 0.01088097, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.03999364, "balance_loss_mlp": 1.01886106, "epoch": 0.6496918683300766, "flos": 30518647505280.0, "grad_norm": 2.1225421947180467, "language_loss": 0.65710378, "learning_rate": 1.1548321105990155e-06, "loss": 0.67829412, "num_input_tokens_seen": 233205805, "step": 10806, "time_per_iteration": 4.271615266799927 }, { "auxiliary_loss_clip": 0.01093074, "auxiliary_loss_mlp": 0.00771144, "balance_loss_clip": 1.03741765, "balance_loss_mlp": 1.00009441, "epoch": 0.6497519915827447, "flos": 12458992567680.0, "grad_norm": 1.9214718172589236, "language_loss": 0.78912604, "learning_rate": 1.1544791482390275e-06, "loss": 0.80776823, "num_input_tokens_seen": 233224215, "step": 10807, "time_per_iteration": 2.7781808376312256 }, { "auxiliary_loss_clip": 0.01014724, "auxiliary_loss_mlp": 0.0100172, "balance_loss_clip": 1.0100404, "balance_loss_mlp": 1.00033116, "epoch": 0.6498121148354126, "flos": 69093748287360.0, "grad_norm": 0.7866075869002591, "language_loss": 0.58888513, "learning_rate": 1.1541262179412745e-06, "loss": 0.60904956, "num_input_tokens_seen": 233294440, "step": 10808, "time_per_iteration": 3.3867762088775635 }, { "auxiliary_loss_clip": 0.01091297, "auxiliary_loss_mlp": 0.01027122, "balance_loss_clip": 1.04009056, "balance_loss_mlp": 1.01453352, "epoch": 0.6498722380880806, "flos": 36897135914880.0, "grad_norm": 1.7443140014145102, "language_loss": 0.63562334, "learning_rate": 1.1537733197191415e-06, "loss": 0.65680754, "num_input_tokens_seen": 233316125, "step": 10809, "time_per_iteration": 4.545352220535278 }, { "auxiliary_loss_clip": 0.01101385, "auxiliary_loss_mlp": 0.00769706, "balance_loss_clip": 1.04086709, "balance_loss_mlp": 1.00011587, "epoch": 0.6499323613407485, "flos": 29017760688000.0, "grad_norm": 1.6271930156290193, "language_loss": 0.81576955, "learning_rate": 1.153420453586008e-06, "loss": 0.8344804, "num_input_tokens_seen": 233336140, "step": 10810, "time_per_iteration": 2.6756200790405273 }, { "auxiliary_loss_clip": 0.01071315, "auxiliary_loss_mlp": 0.01036036, "balance_loss_clip": 1.0380795, "balance_loss_mlp": 1.02466989, "epoch": 0.6499924845934165, "flos": 20119240874880.0, "grad_norm": 1.6231866882582067, "language_loss": 0.72109252, "learning_rate": 1.1530676195552561e-06, "loss": 0.74216604, "num_input_tokens_seen": 233356095, "step": 10811, "time_per_iteration": 2.6948235034942627 }, { "auxiliary_loss_clip": 0.01053868, "auxiliary_loss_mlp": 0.01028108, "balance_loss_clip": 1.04128838, "balance_loss_mlp": 1.01610339, "epoch": 0.6500526078460844, "flos": 24421338760320.0, "grad_norm": 1.6351468205414483, "language_loss": 0.77842551, "learning_rate": 1.1527148176402649e-06, "loss": 0.79924524, "num_input_tokens_seen": 233376830, "step": 10812, "time_per_iteration": 2.8678853511810303 }, { "auxiliary_loss_clip": 0.01098947, "auxiliary_loss_mlp": 0.01036383, "balance_loss_clip": 1.04008079, "balance_loss_mlp": 1.02321005, "epoch": 0.6501127310987524, "flos": 23331019374720.0, "grad_norm": 1.6938636909154852, "language_loss": 0.85069716, "learning_rate": 1.152362047854413e-06, "loss": 0.8720504, "num_input_tokens_seen": 233395275, "step": 10813, "time_per_iteration": 2.618603467941284 }, { "auxiliary_loss_clip": 0.01071283, "auxiliary_loss_mlp": 0.01035396, "balance_loss_clip": 1.03572655, "balance_loss_mlp": 1.02187157, "epoch": 0.6501728543514204, "flos": 18697824898560.0, "grad_norm": 2.609145629781726, "language_loss": 0.79691541, "learning_rate": 1.1520093102110764e-06, "loss": 0.8179822, "num_input_tokens_seen": 233413345, "step": 10814, "time_per_iteration": 2.742004156112671 }, { "auxiliary_loss_clip": 0.01064254, "auxiliary_loss_mlp": 0.00773576, "balance_loss_clip": 1.03794754, "balance_loss_mlp": 1.00018024, "epoch": 0.6502329776040884, "flos": 44199858199680.0, "grad_norm": 1.9285039571390825, "language_loss": 0.65348196, "learning_rate": 1.1516566047236328e-06, "loss": 0.67186022, "num_input_tokens_seen": 233436105, "step": 10815, "time_per_iteration": 2.967710256576538 }, { "auxiliary_loss_clip": 0.01118333, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.04089963, "balance_loss_mlp": 1.01759648, "epoch": 0.6502931008567564, "flos": 14574741419520.0, "grad_norm": 1.878543508830568, "language_loss": 0.75245708, "learning_rate": 1.1513039314054546e-06, "loss": 0.77396703, "num_input_tokens_seen": 233452320, "step": 10816, "time_per_iteration": 2.619370698928833 }, { "auxiliary_loss_clip": 0.01085538, "auxiliary_loss_mlp": 0.01031729, "balance_loss_clip": 1.03892541, "balance_loss_mlp": 1.01897991, "epoch": 0.6503532241094243, "flos": 21395003201280.0, "grad_norm": 1.8185101411846911, "language_loss": 0.73227775, "learning_rate": 1.1509512902699174e-06, "loss": 0.75345039, "num_input_tokens_seen": 233469920, "step": 10817, "time_per_iteration": 2.758009672164917 }, { "auxiliary_loss_clip": 0.01071537, "auxiliary_loss_mlp": 0.01046459, "balance_loss_clip": 1.03518438, "balance_loss_mlp": 1.03168857, "epoch": 0.6504133473620923, "flos": 74740840986240.0, "grad_norm": 1.5063120441652318, "language_loss": 0.72075009, "learning_rate": 1.1505986813303916e-06, "loss": 0.74193007, "num_input_tokens_seen": 233499780, "step": 10818, "time_per_iteration": 3.143178701400757 }, { "auxiliary_loss_clip": 0.01085148, "auxiliary_loss_mlp": 0.01030336, "balance_loss_clip": 1.03872418, "balance_loss_mlp": 1.01738429, "epoch": 0.6504734706147602, "flos": 19713270384000.0, "grad_norm": 2.002053752481776, "language_loss": 0.65038371, "learning_rate": 1.150246104600249e-06, "loss": 0.67153859, "num_input_tokens_seen": 233518235, "step": 10819, "time_per_iteration": 2.704205274581909 }, { "auxiliary_loss_clip": 0.01077923, "auxiliary_loss_mlp": 0.01031636, "balance_loss_clip": 1.03569567, "balance_loss_mlp": 1.01811743, "epoch": 0.6505335938674283, "flos": 25556870390400.0, "grad_norm": 1.8302178372953948, "language_loss": 0.83782417, "learning_rate": 1.14989356009286e-06, "loss": 0.85891974, "num_input_tokens_seen": 233535215, "step": 10820, "time_per_iteration": 2.762343645095825 }, { "auxiliary_loss_clip": 0.01106479, "auxiliary_loss_mlp": 0.01030319, "balance_loss_clip": 1.03934109, "balance_loss_mlp": 1.01703274, "epoch": 0.6505937171200962, "flos": 17821424960640.0, "grad_norm": 2.074138898013104, "language_loss": 0.77881086, "learning_rate": 1.1495410478215914e-06, "loss": 0.80017889, "num_input_tokens_seen": 233552775, "step": 10821, "time_per_iteration": 2.6239891052246094 }, { "auxiliary_loss_clip": 0.01077516, "auxiliary_loss_mlp": 0.01028396, "balance_loss_clip": 1.03843164, "balance_loss_mlp": 1.01721418, "epoch": 0.6506538403727642, "flos": 20668135582080.0, "grad_norm": 1.4292101756111668, "language_loss": 0.80072695, "learning_rate": 1.1491885677998126e-06, "loss": 0.82178605, "num_input_tokens_seen": 233572080, "step": 10822, "time_per_iteration": 2.7913742065429688 }, { "auxiliary_loss_clip": 0.01084959, "auxiliary_loss_mlp": 0.01029649, "balance_loss_clip": 1.04204702, "balance_loss_mlp": 1.01634574, "epoch": 0.6507139636254321, "flos": 11721422695680.0, "grad_norm": 2.216597297898186, "language_loss": 0.8719157, "learning_rate": 1.1488361200408883e-06, "loss": 0.89306176, "num_input_tokens_seen": 233589155, "step": 10823, "time_per_iteration": 2.7045187950134277 }, { "auxiliary_loss_clip": 0.01114569, "auxiliary_loss_mlp": 0.01031767, "balance_loss_clip": 1.0398941, "balance_loss_mlp": 1.01913643, "epoch": 0.6507740868781001, "flos": 26761745226240.0, "grad_norm": 1.6940233286010407, "language_loss": 0.66299087, "learning_rate": 1.148483704558183e-06, "loss": 0.6844542, "num_input_tokens_seen": 233608180, "step": 10824, "time_per_iteration": 2.609870433807373 }, { "auxiliary_loss_clip": 0.01096015, "auxiliary_loss_mlp": 0.01031659, "balance_loss_clip": 1.04038215, "balance_loss_mlp": 1.01846242, "epoch": 0.650834210130768, "flos": 16471722487680.0, "grad_norm": 5.416027486189251, "language_loss": 0.87431592, "learning_rate": 1.1481313213650607e-06, "loss": 0.89559269, "num_input_tokens_seen": 233625750, "step": 10825, "time_per_iteration": 2.649099588394165 }, { "auxiliary_loss_clip": 0.01092468, "auxiliary_loss_mlp": 0.01028826, "balance_loss_clip": 1.03650379, "balance_loss_mlp": 1.01514649, "epoch": 0.650894333383436, "flos": 17128672283520.0, "grad_norm": 2.103621809336841, "language_loss": 0.73180604, "learning_rate": 1.147778970474885e-06, "loss": 0.75301898, "num_input_tokens_seen": 233644235, "step": 10826, "time_per_iteration": 2.6394810676574707 }, { "auxiliary_loss_clip": 0.01104739, "auxiliary_loss_mlp": 0.01027729, "balance_loss_clip": 1.04116881, "balance_loss_mlp": 1.01562333, "epoch": 0.650954456636104, "flos": 18734238311040.0, "grad_norm": 1.7744084173415924, "language_loss": 0.68743241, "learning_rate": 1.1474266519010157e-06, "loss": 0.70875704, "num_input_tokens_seen": 233662845, "step": 10827, "time_per_iteration": 2.5662622451782227 }, { "auxiliary_loss_clip": 0.01089545, "auxiliary_loss_mlp": 0.01031977, "balance_loss_clip": 1.03715336, "balance_loss_mlp": 1.02000248, "epoch": 0.651014579888772, "flos": 24528244613760.0, "grad_norm": 1.7280110593006797, "language_loss": 0.76715839, "learning_rate": 1.1470743656568136e-06, "loss": 0.78837359, "num_input_tokens_seen": 233681990, "step": 10828, "time_per_iteration": 2.6430130004882812 }, { "auxiliary_loss_clip": 0.01101657, "auxiliary_loss_mlp": 0.0102849, "balance_loss_clip": 1.0396359, "balance_loss_mlp": 1.01659322, "epoch": 0.65107470314144, "flos": 24061083304320.0, "grad_norm": 2.028448280689147, "language_loss": 0.89382976, "learning_rate": 1.1467221117556362e-06, "loss": 0.91513121, "num_input_tokens_seen": 233698930, "step": 10829, "time_per_iteration": 2.676887273788452 }, { "auxiliary_loss_clip": 0.01033575, "auxiliary_loss_mlp": 0.01003174, "balance_loss_clip": 1.00994611, "balance_loss_mlp": 1.00192249, "epoch": 0.6511348263941079, "flos": 72480734352000.0, "grad_norm": 0.6385058987930536, "language_loss": 0.55351257, "learning_rate": 1.1463698902108428e-06, "loss": 0.57388008, "num_input_tokens_seen": 233769825, "step": 10830, "time_per_iteration": 3.283604383468628 }, { "auxiliary_loss_clip": 0.01080445, "auxiliary_loss_mlp": 0.01033977, "balance_loss_clip": 1.03753436, "balance_loss_mlp": 1.02031004, "epoch": 0.6511949496467759, "flos": 23367684182400.0, "grad_norm": 2.2496423265989263, "language_loss": 0.74632305, "learning_rate": 1.1460177010357878e-06, "loss": 0.76746726, "num_input_tokens_seen": 233787095, "step": 10831, "time_per_iteration": 2.6958060264587402 }, { "auxiliary_loss_clip": 0.01016148, "auxiliary_loss_mlp": 0.01001305, "balance_loss_clip": 1.01118171, "balance_loss_mlp": 0.99989206, "epoch": 0.6512550728994438, "flos": 67333191073920.0, "grad_norm": 0.6457874133081085, "language_loss": 0.50977135, "learning_rate": 1.145665544243828e-06, "loss": 0.52994585, "num_input_tokens_seen": 233853050, "step": 10832, "time_per_iteration": 3.3019638061523438 }, { "auxiliary_loss_clip": 0.01094456, "auxiliary_loss_mlp": 0.0103476, "balance_loss_clip": 1.03838396, "balance_loss_mlp": 1.02121806, "epoch": 0.6513151961521119, "flos": 21141689512320.0, "grad_norm": 2.261964071454772, "language_loss": 0.83006239, "learning_rate": 1.145313419848316e-06, "loss": 0.85135454, "num_input_tokens_seen": 233871385, "step": 10833, "time_per_iteration": 2.643763542175293 }, { "auxiliary_loss_clip": 0.01096358, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.04303241, "balance_loss_mlp": 1.02144599, "epoch": 0.6513753194047798, "flos": 15158828476800.0, "grad_norm": 2.0262015833742937, "language_loss": 0.83040363, "learning_rate": 1.1449613278626049e-06, "loss": 0.85170895, "num_input_tokens_seen": 233888175, "step": 10834, "time_per_iteration": 2.696136713027954 }, { "auxiliary_loss_clip": 0.01102331, "auxiliary_loss_mlp": 0.01040155, "balance_loss_clip": 1.039487, "balance_loss_mlp": 1.02702951, "epoch": 0.6514354426574478, "flos": 30226621933440.0, "grad_norm": 1.5116925476060534, "language_loss": 0.7712391, "learning_rate": 1.1446092683000455e-06, "loss": 0.79266393, "num_input_tokens_seen": 233911470, "step": 10835, "time_per_iteration": 2.733752965927124 }, { "auxiliary_loss_clip": 0.01087811, "auxiliary_loss_mlp": 0.01038329, "balance_loss_clip": 1.0393815, "balance_loss_mlp": 1.02551985, "epoch": 0.6514955659101157, "flos": 24205587719040.0, "grad_norm": 1.603053369126082, "language_loss": 0.77712744, "learning_rate": 1.1442572411739882e-06, "loss": 0.79838884, "num_input_tokens_seen": 233932135, "step": 10836, "time_per_iteration": 2.7181618213653564 }, { "auxiliary_loss_clip": 0.01076915, "auxiliary_loss_mlp": 0.01034338, "balance_loss_clip": 1.037691, "balance_loss_mlp": 1.02143383, "epoch": 0.6515556891627837, "flos": 12377761960320.0, "grad_norm": 2.035005351868823, "language_loss": 0.82812917, "learning_rate": 1.143905246497783e-06, "loss": 0.84924167, "num_input_tokens_seen": 233947880, "step": 10837, "time_per_iteration": 2.6514079570770264 }, { "auxiliary_loss_clip": 0.01073313, "auxiliary_loss_mlp": 0.01035471, "balance_loss_clip": 1.03897333, "balance_loss_mlp": 1.0211482, "epoch": 0.6516158124154516, "flos": 49601217957120.0, "grad_norm": 1.8965490798746285, "language_loss": 0.5910452, "learning_rate": 1.1435532842847758e-06, "loss": 0.61213303, "num_input_tokens_seen": 233971475, "step": 10838, "time_per_iteration": 2.955751419067383 }, { "auxiliary_loss_clip": 0.01033147, "auxiliary_loss_mlp": 0.01008878, "balance_loss_clip": 1.0095979, "balance_loss_mlp": 1.00770998, "epoch": 0.6516759356681197, "flos": 59702748076800.0, "grad_norm": 0.7683915325325666, "language_loss": 0.60835862, "learning_rate": 1.1432013545483147e-06, "loss": 0.62877893, "num_input_tokens_seen": 234030690, "step": 10839, "time_per_iteration": 3.200835943222046 }, { "auxiliary_loss_clip": 0.0109233, "auxiliary_loss_mlp": 0.0103157, "balance_loss_clip": 1.04093075, "balance_loss_mlp": 1.01998901, "epoch": 0.6517360589207876, "flos": 37450807130880.0, "grad_norm": 1.7743025760939068, "language_loss": 0.67926049, "learning_rate": 1.1428494573017439e-06, "loss": 0.70049942, "num_input_tokens_seen": 234052470, "step": 10840, "time_per_iteration": 2.8348867893218994 }, { "auxiliary_loss_clip": 0.01067745, "auxiliary_loss_mlp": 0.01034413, "balance_loss_clip": 1.03654337, "balance_loss_mlp": 1.02269483, "epoch": 0.6517961821734556, "flos": 25374911068800.0, "grad_norm": 2.0615754511911306, "language_loss": 0.73519421, "learning_rate": 1.1424975925584071e-06, "loss": 0.75621581, "num_input_tokens_seen": 234071495, "step": 10841, "time_per_iteration": 4.435396671295166 }, { "auxiliary_loss_clip": 0.01114891, "auxiliary_loss_mlp": 0.01038031, "balance_loss_clip": 1.03930378, "balance_loss_mlp": 1.02487588, "epoch": 0.6518563054261236, "flos": 28766996864640.0, "grad_norm": 1.4942272074667713, "language_loss": 0.62317944, "learning_rate": 1.142145760331648e-06, "loss": 0.64470863, "num_input_tokens_seen": 234092325, "step": 10842, "time_per_iteration": 2.6767518520355225 }, { "auxiliary_loss_clip": 0.01024949, "auxiliary_loss_mlp": 0.01006106, "balance_loss_clip": 1.01075029, "balance_loss_mlp": 1.00497305, "epoch": 0.6519164286787915, "flos": 68924750797440.0, "grad_norm": 0.8104047899585891, "language_loss": 0.5617612, "learning_rate": 1.141793960634807e-06, "loss": 0.58207178, "num_input_tokens_seen": 234148005, "step": 10843, "time_per_iteration": 3.0310990810394287 }, { "auxiliary_loss_clip": 0.01104455, "auxiliary_loss_mlp": 0.01039452, "balance_loss_clip": 1.03846788, "balance_loss_mlp": 1.02576053, "epoch": 0.6519765519314595, "flos": 20441933683200.0, "grad_norm": 1.5675649945193708, "language_loss": 0.82750475, "learning_rate": 1.1414421934812253e-06, "loss": 0.84894383, "num_input_tokens_seen": 234164280, "step": 10844, "time_per_iteration": 5.7787792682647705 }, { "auxiliary_loss_clip": 0.01104311, "auxiliary_loss_mlp": 0.01034465, "balance_loss_clip": 1.04057419, "balance_loss_mlp": 1.02136445, "epoch": 0.6520366751841274, "flos": 28402970480640.0, "grad_norm": 1.85573565019848, "language_loss": 0.59983897, "learning_rate": 1.1410904588842421e-06, "loss": 0.62122673, "num_input_tokens_seen": 234185090, "step": 10845, "time_per_iteration": 2.7293028831481934 }, { "auxiliary_loss_clip": 0.0110391, "auxiliary_loss_mlp": 0.01032078, "balance_loss_clip": 1.04017997, "balance_loss_mlp": 1.01897073, "epoch": 0.6520967984367955, "flos": 22273414300800.0, "grad_norm": 1.668141485329768, "language_loss": 0.79591072, "learning_rate": 1.140738756857194e-06, "loss": 0.81727064, "num_input_tokens_seen": 234204050, "step": 10846, "time_per_iteration": 2.6495091915130615 }, { "auxiliary_loss_clip": 0.01025275, "auxiliary_loss_mlp": 0.01003438, "balance_loss_clip": 1.01079941, "balance_loss_mlp": 1.00228775, "epoch": 0.6521569216894634, "flos": 68917140092160.0, "grad_norm": 0.709011283257112, "language_loss": 0.60191703, "learning_rate": 1.1403870874134192e-06, "loss": 0.62220418, "num_input_tokens_seen": 234269790, "step": 10847, "time_per_iteration": 3.282104730606079 }, { "auxiliary_loss_clip": 0.0111717, "auxiliary_loss_mlp": 0.01037772, "balance_loss_clip": 1.0412842, "balance_loss_mlp": 1.02495718, "epoch": 0.6522170449421314, "flos": 29130520458240.0, "grad_norm": 1.5919105635369972, "language_loss": 0.81118578, "learning_rate": 1.1400354505662514e-06, "loss": 0.8327353, "num_input_tokens_seen": 234290135, "step": 10848, "time_per_iteration": 2.6569244861602783 }, { "auxiliary_loss_clip": 0.01084019, "auxiliary_loss_mlp": 0.01035374, "balance_loss_clip": 1.03738701, "balance_loss_mlp": 1.02265429, "epoch": 0.6522771681947993, "flos": 26651930371200.0, "grad_norm": 2.586521111897064, "language_loss": 0.74449492, "learning_rate": 1.1396838463290263e-06, "loss": 0.7656889, "num_input_tokens_seen": 234309535, "step": 10849, "time_per_iteration": 4.26736044883728 }, { "auxiliary_loss_clip": 0.0106317, "auxiliary_loss_mlp": 0.01031804, "balance_loss_clip": 1.03691697, "balance_loss_mlp": 1.0188818, "epoch": 0.6523372914474673, "flos": 25739763465600.0, "grad_norm": 1.4022053902069738, "language_loss": 0.67808872, "learning_rate": 1.1393322747150752e-06, "loss": 0.69903851, "num_input_tokens_seen": 234328755, "step": 10850, "time_per_iteration": 2.8357365131378174 }, { "auxiliary_loss_clip": 0.01089828, "auxiliary_loss_mlp": 0.00769863, "balance_loss_clip": 1.03987718, "balance_loss_mlp": 1.00014496, "epoch": 0.6523974147001352, "flos": 24827345164800.0, "grad_norm": 1.627745472842777, "language_loss": 0.66696799, "learning_rate": 1.1389807357377313e-06, "loss": 0.68556488, "num_input_tokens_seen": 234348655, "step": 10851, "time_per_iteration": 2.702782154083252 }, { "auxiliary_loss_clip": 0.01092324, "auxiliary_loss_mlp": 0.0103014, "balance_loss_clip": 1.04054999, "balance_loss_mlp": 1.01776636, "epoch": 0.6524575379528033, "flos": 26317637470080.0, "grad_norm": 2.9837115627224238, "language_loss": 0.73833734, "learning_rate": 1.1386292294103235e-06, "loss": 0.75956196, "num_input_tokens_seen": 234367445, "step": 10852, "time_per_iteration": 2.7116212844848633 }, { "auxiliary_loss_clip": 0.0109357, "auxiliary_loss_mlp": 0.01030172, "balance_loss_clip": 1.04287267, "balance_loss_mlp": 1.01617694, "epoch": 0.6525176612054712, "flos": 19494143464320.0, "grad_norm": 1.9044884730623952, "language_loss": 0.66662163, "learning_rate": 1.1382777557461812e-06, "loss": 0.68785906, "num_input_tokens_seen": 234384825, "step": 10853, "time_per_iteration": 2.7027504444122314 }, { "auxiliary_loss_clip": 0.01002155, "auxiliary_loss_mlp": 0.01000193, "balance_loss_clip": 1.01079071, "balance_loss_mlp": 0.99902517, "epoch": 0.6525777844581392, "flos": 71706894721920.0, "grad_norm": 0.7271722971933409, "language_loss": 0.62995195, "learning_rate": 1.137926314758634e-06, "loss": 0.64997554, "num_input_tokens_seen": 234450630, "step": 10854, "time_per_iteration": 3.330467462539673 }, { "auxiliary_loss_clip": 0.01098588, "auxiliary_loss_mlp": 0.01040453, "balance_loss_clip": 1.03749895, "balance_loss_mlp": 1.02501512, "epoch": 0.6526379077108072, "flos": 26653115520000.0, "grad_norm": 1.9818066069545293, "language_loss": 0.77810514, "learning_rate": 1.1375749064610072e-06, "loss": 0.79949546, "num_input_tokens_seen": 234473505, "step": 10855, "time_per_iteration": 2.856804132461548 }, { "auxiliary_loss_clip": 0.01073699, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.03438473, "balance_loss_mlp": 1.01601565, "epoch": 0.6526980309634751, "flos": 22820369673600.0, "grad_norm": 1.8477737717286657, "language_loss": 0.78975284, "learning_rate": 1.1372235308666256e-06, "loss": 0.81077588, "num_input_tokens_seen": 234492485, "step": 10856, "time_per_iteration": 2.7385408878326416 }, { "auxiliary_loss_clip": 0.01114282, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.04025459, "balance_loss_mlp": 1.0199244, "epoch": 0.6527581542161431, "flos": 28365048696960.0, "grad_norm": 3.158979628826276, "language_loss": 0.73701787, "learning_rate": 1.136872187988815e-06, "loss": 0.75850254, "num_input_tokens_seen": 234512645, "step": 10857, "time_per_iteration": 2.6843883991241455 }, { "auxiliary_loss_clip": 0.01090082, "auxiliary_loss_mlp": 0.01035453, "balance_loss_clip": 1.03591764, "balance_loss_mlp": 1.02337718, "epoch": 0.652818277468811, "flos": 18369206346240.0, "grad_norm": 3.7655949608052257, "language_loss": 0.6289376, "learning_rate": 1.1365208778408965e-06, "loss": 0.65019298, "num_input_tokens_seen": 234529310, "step": 10858, "time_per_iteration": 2.72822904586792 }, { "auxiliary_loss_clip": 0.01110966, "auxiliary_loss_mlp": 0.01034439, "balance_loss_clip": 1.03902686, "balance_loss_mlp": 1.02228558, "epoch": 0.6528784007214791, "flos": 18036170421120.0, "grad_norm": 1.6211282430818235, "language_loss": 0.78672451, "learning_rate": 1.1361696004361939e-06, "loss": 0.80817854, "num_input_tokens_seen": 234546685, "step": 10859, "time_per_iteration": 2.5962581634521484 }, { "auxiliary_loss_clip": 0.01104671, "auxiliary_loss_mlp": 0.01033239, "balance_loss_clip": 1.03923452, "balance_loss_mlp": 1.02013731, "epoch": 0.652938523974147, "flos": 22382008093440.0, "grad_norm": 1.697122178276391, "language_loss": 0.67908686, "learning_rate": 1.1358183557880256e-06, "loss": 0.70046592, "num_input_tokens_seen": 234566255, "step": 10860, "time_per_iteration": 2.7275006771087646 }, { "auxiliary_loss_clip": 0.01105971, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 1.04165852, "balance_loss_mlp": 1.01677179, "epoch": 0.652998647226815, "flos": 16764035368320.0, "grad_norm": 2.149849017639803, "language_loss": 0.67175591, "learning_rate": 1.135467143909712e-06, "loss": 0.69311142, "num_input_tokens_seen": 234585405, "step": 10861, "time_per_iteration": 2.700737237930298 }, { "auxiliary_loss_clip": 0.01093061, "auxiliary_loss_mlp": 0.01034965, "balance_loss_clip": 1.03918886, "balance_loss_mlp": 1.02101707, "epoch": 0.6530587704794829, "flos": 35772522019200.0, "grad_norm": 1.8900448169789823, "language_loss": 0.64973295, "learning_rate": 1.135115964814572e-06, "loss": 0.67101324, "num_input_tokens_seen": 234608095, "step": 10862, "time_per_iteration": 2.8191120624542236 }, { "auxiliary_loss_clip": 0.01090214, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.03788185, "balance_loss_mlp": 1.02351046, "epoch": 0.6531188937321509, "flos": 19316134638720.0, "grad_norm": 1.7201949909347662, "language_loss": 0.77214205, "learning_rate": 1.13476481851592e-06, "loss": 0.79340369, "num_input_tokens_seen": 234627335, "step": 10863, "time_per_iteration": 2.7301394939422607 }, { "auxiliary_loss_clip": 0.01086865, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.03934371, "balance_loss_mlp": 1.0234524, "epoch": 0.6531790169848188, "flos": 22893771116160.0, "grad_norm": 5.89922160085871, "language_loss": 0.74717021, "learning_rate": 1.1344137050270739e-06, "loss": 0.76839387, "num_input_tokens_seen": 234646540, "step": 10864, "time_per_iteration": 2.694638729095459 }, { "auxiliary_loss_clip": 0.01101868, "auxiliary_loss_mlp": 0.01037064, "balance_loss_clip": 1.03954864, "balance_loss_mlp": 1.02464223, "epoch": 0.6532391402374869, "flos": 29563530912000.0, "grad_norm": 1.7565530493907513, "language_loss": 0.86014044, "learning_rate": 1.1340626243613458e-06, "loss": 0.88152981, "num_input_tokens_seen": 234665470, "step": 10865, "time_per_iteration": 2.6702401638031006 }, { "auxiliary_loss_clip": 0.01084878, "auxiliary_loss_mlp": 0.00771127, "balance_loss_clip": 1.0366689, "balance_loss_mlp": 1.00016713, "epoch": 0.6532992634901548, "flos": 23105463920640.0, "grad_norm": 1.5997360666048854, "language_loss": 0.81537604, "learning_rate": 1.133711576532051e-06, "loss": 0.8339361, "num_input_tokens_seen": 234683955, "step": 10866, "time_per_iteration": 2.7677865028381348 }, { "auxiliary_loss_clip": 0.01092326, "auxiliary_loss_mlp": 0.01027552, "balance_loss_clip": 1.04049444, "balance_loss_mlp": 1.0153923, "epoch": 0.6533593867428228, "flos": 26067340523520.0, "grad_norm": 1.499689557141503, "language_loss": 0.82382023, "learning_rate": 1.1333605615524995e-06, "loss": 0.84501904, "num_input_tokens_seen": 234704595, "step": 10867, "time_per_iteration": 2.67887020111084 }, { "auxiliary_loss_clip": 0.01086387, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.03923059, "balance_loss_mlp": 1.01656437, "epoch": 0.6534195099954908, "flos": 21212469262080.0, "grad_norm": 1.9931778054716736, "language_loss": 0.81410849, "learning_rate": 1.1330095794360016e-06, "loss": 0.83526063, "num_input_tokens_seen": 234724090, "step": 10868, "time_per_iteration": 2.692563533782959 }, { "auxiliary_loss_clip": 0.01085283, "auxiliary_loss_mlp": 0.0103014, "balance_loss_clip": 1.04046869, "balance_loss_mlp": 1.01654446, "epoch": 0.6534796332481587, "flos": 19646584784640.0, "grad_norm": 1.7926198693955093, "language_loss": 0.79652596, "learning_rate": 1.1326586301958675e-06, "loss": 0.81768018, "num_input_tokens_seen": 234742560, "step": 10869, "time_per_iteration": 2.6747188568115234 }, { "auxiliary_loss_clip": 0.01107733, "auxiliary_loss_mlp": 0.01034253, "balance_loss_clip": 1.04306769, "balance_loss_mlp": 1.02144957, "epoch": 0.6535397565008267, "flos": 24022479162240.0, "grad_norm": 1.9247655195442634, "language_loss": 0.72409803, "learning_rate": 1.1323077138454063e-06, "loss": 0.74551791, "num_input_tokens_seen": 234762315, "step": 10870, "time_per_iteration": 2.6496713161468506 }, { "auxiliary_loss_clip": 0.01073837, "auxiliary_loss_mlp": 0.01040127, "balance_loss_clip": 1.0374316, "balance_loss_mlp": 1.02689457, "epoch": 0.6535998797534947, "flos": 24602759377920.0, "grad_norm": 2.0567680865886797, "language_loss": 0.7481339, "learning_rate": 1.1319568303979221e-06, "loss": 0.76927352, "num_input_tokens_seen": 234781300, "step": 10871, "time_per_iteration": 2.738467216491699 }, { "auxiliary_loss_clip": 0.01094755, "auxiliary_loss_mlp": 0.00768767, "balance_loss_clip": 1.04057598, "balance_loss_mlp": 1.00008535, "epoch": 0.6536600030061627, "flos": 23364164649600.0, "grad_norm": 1.631721616705098, "language_loss": 0.55669373, "learning_rate": 1.1316059798667227e-06, "loss": 0.57532895, "num_input_tokens_seen": 234801040, "step": 10872, "time_per_iteration": 2.7837493419647217 }, { "auxiliary_loss_clip": 0.01089558, "auxiliary_loss_mlp": 0.01033451, "balance_loss_clip": 1.03836048, "balance_loss_mlp": 1.02150071, "epoch": 0.6537201262588306, "flos": 23878477537920.0, "grad_norm": 1.5206380793292014, "language_loss": 0.74701464, "learning_rate": 1.1312551622651112e-06, "loss": 0.76824474, "num_input_tokens_seen": 234821415, "step": 10873, "time_per_iteration": 2.6991825103759766 }, { "auxiliary_loss_clip": 0.01103837, "auxiliary_loss_mlp": 0.01031753, "balance_loss_clip": 1.04124331, "balance_loss_mlp": 1.01923621, "epoch": 0.6537802495114986, "flos": 24354760901760.0, "grad_norm": 1.5572607769752447, "language_loss": 0.75670367, "learning_rate": 1.1309043776063917e-06, "loss": 0.7780596, "num_input_tokens_seen": 234843795, "step": 10874, "time_per_iteration": 2.78080153465271 }, { "auxiliary_loss_clip": 0.01071596, "auxiliary_loss_mlp": 0.01032474, "balance_loss_clip": 1.03871393, "balance_loss_mlp": 1.01939058, "epoch": 0.6538403727641665, "flos": 27996892248960.0, "grad_norm": 1.5478335962993721, "language_loss": 0.81636667, "learning_rate": 1.1305536259038642e-06, "loss": 0.83740735, "num_input_tokens_seen": 234862350, "step": 10875, "time_per_iteration": 2.8029510974884033 }, { "auxiliary_loss_clip": 0.01113458, "auxiliary_loss_mlp": 0.01038052, "balance_loss_clip": 1.03928709, "balance_loss_mlp": 1.0257194, "epoch": 0.6539004960168345, "flos": 27563594486400.0, "grad_norm": 1.7147154744114859, "language_loss": 0.70016718, "learning_rate": 1.1302029071708314e-06, "loss": 0.72168231, "num_input_tokens_seen": 234881790, "step": 10876, "time_per_iteration": 2.7378597259521484 }, { "auxiliary_loss_clip": 0.01019889, "auxiliary_loss_mlp": 0.01040083, "balance_loss_clip": 1.03454161, "balance_loss_mlp": 1.02664804, "epoch": 0.6539606192695024, "flos": 14530067879040.0, "grad_norm": 5.2813318768904, "language_loss": 0.79471064, "learning_rate": 1.1298522214205908e-06, "loss": 0.81531036, "num_input_tokens_seen": 234897775, "step": 10877, "time_per_iteration": 2.9654347896575928 }, { "auxiliary_loss_clip": 0.0109536, "auxiliary_loss_mlp": 0.00770832, "balance_loss_clip": 1.04007304, "balance_loss_mlp": 1.00019956, "epoch": 0.6540207425221705, "flos": 21616356764160.0, "grad_norm": 10.000647074298708, "language_loss": 0.79720318, "learning_rate": 1.1295015686664408e-06, "loss": 0.81586516, "num_input_tokens_seen": 234918395, "step": 10878, "time_per_iteration": 3.0778963565826416 }, { "auxiliary_loss_clip": 0.01091014, "auxiliary_loss_mlp": 0.01032504, "balance_loss_clip": 1.03766847, "balance_loss_mlp": 1.01898539, "epoch": 0.6540808657748384, "flos": 17668983640320.0, "grad_norm": 1.8035849841716871, "language_loss": 0.84622979, "learning_rate": 1.1291509489216797e-06, "loss": 0.8674649, "num_input_tokens_seen": 234936260, "step": 10879, "time_per_iteration": 2.668922185897827 }, { "auxiliary_loss_clip": 0.01093903, "auxiliary_loss_mlp": 0.01030306, "balance_loss_clip": 1.03903461, "balance_loss_mlp": 1.01730609, "epoch": 0.6541409890275064, "flos": 14538292093440.0, "grad_norm": 2.263757202052665, "language_loss": 0.71778309, "learning_rate": 1.128800362199601e-06, "loss": 0.73902524, "num_input_tokens_seen": 234952110, "step": 10880, "time_per_iteration": 2.662271499633789 }, { "auxiliary_loss_clip": 0.0107269, "auxiliary_loss_mlp": 0.01037728, "balance_loss_clip": 1.03594911, "balance_loss_mlp": 1.02518129, "epoch": 0.6542011122801744, "flos": 17165301177600.0, "grad_norm": 2.432806470924959, "language_loss": 0.8439703, "learning_rate": 1.1284498085135005e-06, "loss": 0.86507452, "num_input_tokens_seen": 234970810, "step": 10881, "time_per_iteration": 4.583907127380371 }, { "auxiliary_loss_clip": 0.01081012, "auxiliary_loss_mlp": 0.01035122, "balance_loss_clip": 1.03797197, "balance_loss_mlp": 1.02026868, "epoch": 0.6542612355328423, "flos": 18186600579840.0, "grad_norm": 1.797675187581498, "language_loss": 0.78180546, "learning_rate": 1.1280992878766699e-06, "loss": 0.80296683, "num_input_tokens_seen": 234989565, "step": 10882, "time_per_iteration": 2.7273218631744385 }, { "auxiliary_loss_clip": 0.01117869, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.04191113, "balance_loss_mlp": 1.01693869, "epoch": 0.6543213587855103, "flos": 19792453916160.0, "grad_norm": 2.0896373641472716, "language_loss": 0.82002509, "learning_rate": 1.1277488003024024e-06, "loss": 0.84151345, "num_input_tokens_seen": 235007955, "step": 10883, "time_per_iteration": 2.6430859565734863 }, { "auxiliary_loss_clip": 0.01063765, "auxiliary_loss_mlp": 0.01039023, "balance_loss_clip": 1.03828621, "balance_loss_mlp": 1.02518272, "epoch": 0.6543814820381783, "flos": 21105096531840.0, "grad_norm": 2.092498099252334, "language_loss": 0.85347474, "learning_rate": 1.127398345803988e-06, "loss": 0.8745026, "num_input_tokens_seen": 235024860, "step": 10884, "time_per_iteration": 6.071943998336792 }, { "auxiliary_loss_clip": 0.01092231, "auxiliary_loss_mlp": 0.01036915, "balance_loss_clip": 1.03901005, "balance_loss_mlp": 1.02371883, "epoch": 0.6544416052908463, "flos": 20194042947840.0, "grad_norm": 2.4941716916648367, "language_loss": 0.79124463, "learning_rate": 1.127047924394715e-06, "loss": 0.81253612, "num_input_tokens_seen": 235043815, "step": 10885, "time_per_iteration": 2.675748586654663 }, { "auxiliary_loss_clip": 0.01074538, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.03618622, "balance_loss_mlp": 1.01794887, "epoch": 0.6545017285435142, "flos": 23368258800000.0, "grad_norm": 1.8639137549782854, "language_loss": 0.72277772, "learning_rate": 1.1266975360878722e-06, "loss": 0.7438345, "num_input_tokens_seen": 235062985, "step": 10886, "time_per_iteration": 2.750396490097046 }, { "auxiliary_loss_clip": 0.0109826, "auxiliary_loss_mlp": 0.01029695, "balance_loss_clip": 1.04163647, "balance_loss_mlp": 1.01777434, "epoch": 0.6545618517961822, "flos": 19134714021120.0, "grad_norm": 1.7570103692481698, "language_loss": 0.77918178, "learning_rate": 1.1263471808967468e-06, "loss": 0.80046129, "num_input_tokens_seen": 235081670, "step": 10887, "time_per_iteration": 2.6504671573638916 }, { "auxiliary_loss_clip": 0.01087762, "auxiliary_loss_mlp": 0.01034009, "balance_loss_clip": 1.03893995, "balance_loss_mlp": 1.02152205, "epoch": 0.6546219750488501, "flos": 14938624149120.0, "grad_norm": 3.1473995780079567, "language_loss": 0.78907198, "learning_rate": 1.1259968588346234e-06, "loss": 0.81028962, "num_input_tokens_seen": 235098510, "step": 10888, "time_per_iteration": 4.194061040878296 }, { "auxiliary_loss_clip": 0.01101212, "auxiliary_loss_mlp": 0.01030296, "balance_loss_clip": 1.03934383, "balance_loss_mlp": 1.0185833, "epoch": 0.6546820983015181, "flos": 36320518886400.0, "grad_norm": 1.6496831253156983, "language_loss": 0.66765958, "learning_rate": 1.1256465699147874e-06, "loss": 0.68897462, "num_input_tokens_seen": 235119990, "step": 10889, "time_per_iteration": 2.784081220626831 }, { "auxiliary_loss_clip": 0.01087306, "auxiliary_loss_mlp": 0.01041216, "balance_loss_clip": 1.03762484, "balance_loss_mlp": 1.02561128, "epoch": 0.654742221554186, "flos": 20411446014720.0, "grad_norm": 1.423388332820949, "language_loss": 0.7975992, "learning_rate": 1.1252963141505203e-06, "loss": 0.81888443, "num_input_tokens_seen": 235139255, "step": 10890, "time_per_iteration": 2.630934000015259 }, { "auxiliary_loss_clip": 0.01103288, "auxiliary_loss_mlp": 0.00771276, "balance_loss_clip": 1.0388689, "balance_loss_mlp": 1.00018215, "epoch": 0.6548023448068541, "flos": 24863650836480.0, "grad_norm": 4.747441832744551, "language_loss": 0.66281724, "learning_rate": 1.1249460915551052e-06, "loss": 0.6815629, "num_input_tokens_seen": 235158455, "step": 10891, "time_per_iteration": 2.7071638107299805 }, { "auxiliary_loss_clip": 0.01100507, "auxiliary_loss_mlp": 0.01034408, "balance_loss_clip": 1.03802693, "balance_loss_mlp": 1.02253485, "epoch": 0.654862468059522, "flos": 21427573858560.0, "grad_norm": 1.8230572175778426, "language_loss": 0.79398739, "learning_rate": 1.1245959021418214e-06, "loss": 0.81533659, "num_input_tokens_seen": 235177350, "step": 10892, "time_per_iteration": 2.7039225101470947 }, { "auxiliary_loss_clip": 0.01109845, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.04345989, "balance_loss_mlp": 1.01996517, "epoch": 0.65492259131219, "flos": 26577846570240.0, "grad_norm": 1.9941624602256833, "language_loss": 0.7830174, "learning_rate": 1.1242457459239497e-06, "loss": 0.80444312, "num_input_tokens_seen": 235196435, "step": 10893, "time_per_iteration": 2.6736834049224854 }, { "auxiliary_loss_clip": 0.01119127, "auxiliary_loss_mlp": 0.01033009, "balance_loss_clip": 1.04234505, "balance_loss_mlp": 1.01919901, "epoch": 0.6549827145648579, "flos": 21501334437120.0, "grad_norm": 1.6280761795880925, "language_loss": 0.70089674, "learning_rate": 1.123895622914766e-06, "loss": 0.72241807, "num_input_tokens_seen": 235215430, "step": 10894, "time_per_iteration": 2.5782406330108643 }, { "auxiliary_loss_clip": 0.01108084, "auxiliary_loss_mlp": 0.01033898, "balance_loss_clip": 1.03990614, "balance_loss_mlp": 1.02057683, "epoch": 0.6550428378175259, "flos": 22594275515520.0, "grad_norm": 3.549181275643373, "language_loss": 0.63655615, "learning_rate": 1.123545533127549e-06, "loss": 0.65797597, "num_input_tokens_seen": 235232015, "step": 10895, "time_per_iteration": 2.629176139831543 }, { "auxiliary_loss_clip": 0.0109961, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.03651488, "balance_loss_mlp": 1.02231681, "epoch": 0.655102961070194, "flos": 12823809050880.0, "grad_norm": 1.94601425933446, "language_loss": 0.78524303, "learning_rate": 1.1231954765755722e-06, "loss": 0.80658519, "num_input_tokens_seen": 235248115, "step": 10896, "time_per_iteration": 2.5840821266174316 }, { "auxiliary_loss_clip": 0.01092224, "auxiliary_loss_mlp": 0.01033088, "balance_loss_clip": 1.04114115, "balance_loss_mlp": 1.02101183, "epoch": 0.6551630843228619, "flos": 24791075406720.0, "grad_norm": 1.3806195961019156, "language_loss": 0.70286167, "learning_rate": 1.1228454532721111e-06, "loss": 0.72411478, "num_input_tokens_seen": 235270785, "step": 10897, "time_per_iteration": 2.7511391639709473 }, { "auxiliary_loss_clip": 0.01117369, "auxiliary_loss_mlp": 0.01034749, "balance_loss_clip": 1.0412488, "balance_loss_mlp": 1.02182722, "epoch": 0.6552232075755299, "flos": 16724461559040.0, "grad_norm": 1.8561946448451885, "language_loss": 0.75477493, "learning_rate": 1.1224954632304391e-06, "loss": 0.77629614, "num_input_tokens_seen": 235287905, "step": 10898, "time_per_iteration": 2.5865721702575684 }, { "auxiliary_loss_clip": 0.0109408, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.03979027, "balance_loss_mlp": 1.0210743, "epoch": 0.6552833308281978, "flos": 22016473338240.0, "grad_norm": 3.2174058784853634, "language_loss": 0.73745394, "learning_rate": 1.122145506463827e-06, "loss": 0.75872725, "num_input_tokens_seen": 235305525, "step": 10899, "time_per_iteration": 2.6415457725524902 }, { "auxiliary_loss_clip": 0.01092854, "auxiliary_loss_mlp": 0.0103035, "balance_loss_clip": 1.0398674, "balance_loss_mlp": 1.0178864, "epoch": 0.6553434540808658, "flos": 24863399441280.0, "grad_norm": 1.7775828030787661, "language_loss": 0.5608502, "learning_rate": 1.1217955829855443e-06, "loss": 0.58208227, "num_input_tokens_seen": 235324415, "step": 10900, "time_per_iteration": 2.6782078742980957 }, { "auxiliary_loss_clip": 0.0110767, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.04541218, "balance_loss_mlp": 1.02259791, "epoch": 0.6554035773335337, "flos": 23221060865280.0, "grad_norm": 1.7239848151303507, "language_loss": 0.76706004, "learning_rate": 1.1214456928088622e-06, "loss": 0.78849864, "num_input_tokens_seen": 235341595, "step": 10901, "time_per_iteration": 2.6912708282470703 }, { "auxiliary_loss_clip": 0.01116025, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.04228628, "balance_loss_mlp": 1.01741457, "epoch": 0.6554637005862017, "flos": 22783597125120.0, "grad_norm": 1.8287933063935295, "language_loss": 0.73178118, "learning_rate": 1.1210958359470463e-06, "loss": 0.7532481, "num_input_tokens_seen": 235361700, "step": 10902, "time_per_iteration": 2.602215528488159 }, { "auxiliary_loss_clip": 0.01116289, "auxiliary_loss_mlp": 0.0103284, "balance_loss_clip": 1.04363585, "balance_loss_mlp": 1.020293, "epoch": 0.6555238238388696, "flos": 21507224267520.0, "grad_norm": 2.8262041202402806, "language_loss": 0.68081355, "learning_rate": 1.1207460124133645e-06, "loss": 0.7023049, "num_input_tokens_seen": 235382065, "step": 10903, "time_per_iteration": 2.6410489082336426 }, { "auxiliary_loss_clip": 0.01095479, "auxiliary_loss_mlp": 0.00772021, "balance_loss_clip": 1.0381676, "balance_loss_mlp": 1.00024486, "epoch": 0.6555839470915377, "flos": 30519473518080.0, "grad_norm": 1.6908937242491595, "language_loss": 0.66551757, "learning_rate": 1.1203962222210832e-06, "loss": 0.6841926, "num_input_tokens_seen": 235402130, "step": 10904, "time_per_iteration": 2.790280342102051 }, { "auxiliary_loss_clip": 0.01106834, "auxiliary_loss_mlp": 0.01041591, "balance_loss_clip": 1.0399909, "balance_loss_mlp": 1.02686858, "epoch": 0.6556440703442056, "flos": 24642943718400.0, "grad_norm": 1.7449585350931947, "language_loss": 0.90588987, "learning_rate": 1.120046465383464e-06, "loss": 0.92737412, "num_input_tokens_seen": 235420435, "step": 10905, "time_per_iteration": 2.6630730628967285 }, { "auxiliary_loss_clip": 0.01101239, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.0387404, "balance_loss_mlp": 1.02384353, "epoch": 0.6557041935968736, "flos": 23732464752000.0, "grad_norm": 1.68326433592196, "language_loss": 0.75189042, "learning_rate": 1.1196967419137721e-06, "loss": 0.77326465, "num_input_tokens_seen": 235439960, "step": 10906, "time_per_iteration": 2.808749198913574 }, { "auxiliary_loss_clip": 0.01120903, "auxiliary_loss_mlp": 0.01039658, "balance_loss_clip": 1.04417121, "balance_loss_mlp": 1.02620482, "epoch": 0.6557643168495415, "flos": 11102753819520.0, "grad_norm": 2.6025393297474753, "language_loss": 0.74533153, "learning_rate": 1.119347051825267e-06, "loss": 0.76693714, "num_input_tokens_seen": 235457495, "step": 10907, "time_per_iteration": 2.593248128890991 }, { "auxiliary_loss_clip": 0.01074084, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.03740346, "balance_loss_mlp": 1.01887107, "epoch": 0.6558244401022095, "flos": 30191034533760.0, "grad_norm": 1.4237999067012654, "language_loss": 0.72347319, "learning_rate": 1.118997395131211e-06, "loss": 0.74454939, "num_input_tokens_seen": 235479525, "step": 10908, "time_per_iteration": 2.82675838470459 }, { "auxiliary_loss_clip": 0.01119224, "auxiliary_loss_mlp": 0.01039345, "balance_loss_clip": 1.04407787, "balance_loss_mlp": 1.02501035, "epoch": 0.6558845633548775, "flos": 17931060247680.0, "grad_norm": 2.1324653040060206, "language_loss": 0.81237155, "learning_rate": 1.118647771844861e-06, "loss": 0.83395725, "num_input_tokens_seen": 235496305, "step": 10909, "time_per_iteration": 2.5471675395965576 }, { "auxiliary_loss_clip": 0.01118639, "auxiliary_loss_mlp": 0.01037445, "balance_loss_clip": 1.04318082, "balance_loss_mlp": 1.02355766, "epoch": 0.6559446866075455, "flos": 21904144531200.0, "grad_norm": 2.016309466872126, "language_loss": 0.6391021, "learning_rate": 1.1182981819794767e-06, "loss": 0.66066295, "num_input_tokens_seen": 235512545, "step": 10910, "time_per_iteration": 2.5981180667877197 }, { "auxiliary_loss_clip": 0.01094899, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.03948653, "balance_loss_mlp": 1.02022815, "epoch": 0.6560048098602135, "flos": 14127976056960.0, "grad_norm": 3.167812850459713, "language_loss": 0.75653553, "learning_rate": 1.117948625548313e-06, "loss": 0.7778424, "num_input_tokens_seen": 235526045, "step": 10911, "time_per_iteration": 2.6054794788360596 }, { "auxiliary_loss_clip": 0.01110901, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.03947508, "balance_loss_mlp": 1.02068496, "epoch": 0.6560649331128814, "flos": 18807567926400.0, "grad_norm": 1.6537881729795834, "language_loss": 0.75314403, "learning_rate": 1.1175991025646265e-06, "loss": 0.77458137, "num_input_tokens_seen": 235545285, "step": 10912, "time_per_iteration": 2.5621368885040283 }, { "auxiliary_loss_clip": 0.01080239, "auxiliary_loss_mlp": 0.00773337, "balance_loss_clip": 1.04076517, "balance_loss_mlp": 1.00024402, "epoch": 0.6561250563655494, "flos": 17053618815360.0, "grad_norm": 1.7152126223100395, "language_loss": 0.77399373, "learning_rate": 1.1172496130416697e-06, "loss": 0.79252946, "num_input_tokens_seen": 235563150, "step": 10913, "time_per_iteration": 2.6770215034484863 }, { "auxiliary_loss_clip": 0.01082486, "auxiliary_loss_mlp": 0.01031683, "balance_loss_clip": 1.03641891, "balance_loss_mlp": 1.0197978, "epoch": 0.6561851796182173, "flos": 22637656166400.0, "grad_norm": 1.7806335935721003, "language_loss": 0.71243644, "learning_rate": 1.1169001569926961e-06, "loss": 0.73357815, "num_input_tokens_seen": 235582535, "step": 10914, "time_per_iteration": 2.667307138442993 }, { "auxiliary_loss_clip": 0.01083296, "auxiliary_loss_mlp": 0.01037173, "balance_loss_clip": 1.03966224, "balance_loss_mlp": 1.02398872, "epoch": 0.6562453028708853, "flos": 19239213663360.0, "grad_norm": 1.6513290970886485, "language_loss": 0.73859835, "learning_rate": 1.116550734430958e-06, "loss": 0.75980306, "num_input_tokens_seen": 235601490, "step": 10915, "time_per_iteration": 2.6983346939086914 }, { "auxiliary_loss_clip": 0.01073456, "auxiliary_loss_mlp": 0.01034672, "balance_loss_clip": 1.03744984, "balance_loss_mlp": 1.02053952, "epoch": 0.6563054261235532, "flos": 23801305167360.0, "grad_norm": 1.7082646806866446, "language_loss": 0.79868412, "learning_rate": 1.1162013453697042e-06, "loss": 0.81976539, "num_input_tokens_seen": 235619165, "step": 10916, "time_per_iteration": 2.7007508277893066 }, { "auxiliary_loss_clip": 0.01085821, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.03703237, "balance_loss_mlp": 1.02174914, "epoch": 0.6563655493762213, "flos": 19240039676160.0, "grad_norm": 7.0038314681057265, "language_loss": 0.76291168, "learning_rate": 1.1158519898221831e-06, "loss": 0.78411144, "num_input_tokens_seen": 235637115, "step": 10917, "time_per_iteration": 2.6554038524627686 }, { "auxiliary_loss_clip": 0.01114484, "auxiliary_loss_mlp": 0.00770758, "balance_loss_clip": 1.04096055, "balance_loss_mlp": 1.00018668, "epoch": 0.6564256726288892, "flos": 25556439427200.0, "grad_norm": 1.7912511669436304, "language_loss": 0.69599342, "learning_rate": 1.1155026678016445e-06, "loss": 0.7148459, "num_input_tokens_seen": 235656330, "step": 10918, "time_per_iteration": 2.658940315246582 }, { "auxiliary_loss_clip": 0.0108095, "auxiliary_loss_mlp": 0.01038011, "balance_loss_clip": 1.04091477, "balance_loss_mlp": 1.02542877, "epoch": 0.6564857958815572, "flos": 22200623389440.0, "grad_norm": 1.5721628638219425, "language_loss": 0.76389003, "learning_rate": 1.115153379321332e-06, "loss": 0.78507966, "num_input_tokens_seen": 235674510, "step": 10919, "time_per_iteration": 2.8179666996002197 }, { "auxiliary_loss_clip": 0.01024309, "auxiliary_loss_mlp": 0.00751654, "balance_loss_clip": 1.01056981, "balance_loss_mlp": 0.99972719, "epoch": 0.6565459191342251, "flos": 58123144604160.0, "grad_norm": 0.7147618349733724, "language_loss": 0.52982259, "learning_rate": 1.1148041243944931e-06, "loss": 0.54758221, "num_input_tokens_seen": 235735050, "step": 10920, "time_per_iteration": 4.864136457443237 }, { "auxiliary_loss_clip": 0.01102705, "auxiliary_loss_mlp": 0.01032103, "balance_loss_clip": 1.03955173, "balance_loss_mlp": 1.01899636, "epoch": 0.6566060423868931, "flos": 30809631582720.0, "grad_norm": 1.4970588684029809, "language_loss": 0.65309536, "learning_rate": 1.1144549030343697e-06, "loss": 0.67444336, "num_input_tokens_seen": 235757545, "step": 10921, "time_per_iteration": 2.6399025917053223 }, { "auxiliary_loss_clip": 0.01088773, "auxiliary_loss_mlp": 0.01042354, "balance_loss_clip": 1.03777099, "balance_loss_mlp": 1.02691627, "epoch": 0.6566661656395612, "flos": 23367432787200.0, "grad_norm": 1.7149236463781705, "language_loss": 0.81306088, "learning_rate": 1.114105715254205e-06, "loss": 0.83437216, "num_input_tokens_seen": 235777265, "step": 10922, "time_per_iteration": 2.6043496131896973 }, { "auxiliary_loss_clip": 0.0105706, "auxiliary_loss_mlp": 0.00773782, "balance_loss_clip": 1.03730524, "balance_loss_mlp": 1.00019729, "epoch": 0.6567262888922291, "flos": 25735597488000.0, "grad_norm": 1.8848622596547697, "language_loss": 0.71114737, "learning_rate": 1.1137565610672414e-06, "loss": 0.72945583, "num_input_tokens_seen": 235796565, "step": 10923, "time_per_iteration": 4.080937385559082 }, { "auxiliary_loss_clip": 0.01080403, "auxiliary_loss_mlp": 0.01035157, "balance_loss_clip": 1.04141772, "balance_loss_mlp": 1.02234805, "epoch": 0.6567864121448971, "flos": 17123716206720.0, "grad_norm": 1.9659077727339813, "language_loss": 0.80819428, "learning_rate": 1.1134074404867169e-06, "loss": 0.82934988, "num_input_tokens_seen": 235814805, "step": 10924, "time_per_iteration": 4.207550287246704 }, { "auxiliary_loss_clip": 0.01098058, "auxiliary_loss_mlp": 0.01028766, "balance_loss_clip": 1.03715539, "balance_loss_mlp": 1.01637435, "epoch": 0.656846535397565, "flos": 22419319345920.0, "grad_norm": 4.832574898603098, "language_loss": 0.7250914, "learning_rate": 1.1130583535258717e-06, "loss": 0.74635959, "num_input_tokens_seen": 235833405, "step": 10925, "time_per_iteration": 2.637345790863037 }, { "auxiliary_loss_clip": 0.01101634, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.03830063, "balance_loss_mlp": 1.01710916, "epoch": 0.656906658650233, "flos": 17704535126400.0, "grad_norm": 2.262744420383479, "language_loss": 0.72445238, "learning_rate": 1.112709300197942e-06, "loss": 0.74576986, "num_input_tokens_seen": 235848530, "step": 10926, "time_per_iteration": 2.6307756900787354 }, { "auxiliary_loss_clip": 0.0106886, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.03765988, "balance_loss_mlp": 1.02080905, "epoch": 0.6569667819029009, "flos": 21175158009600.0, "grad_norm": 1.7200943700135627, "language_loss": 0.72494638, "learning_rate": 1.1123602805161656e-06, "loss": 0.74597794, "num_input_tokens_seen": 235867225, "step": 10927, "time_per_iteration": 4.311558246612549 }, { "auxiliary_loss_clip": 0.01005194, "auxiliary_loss_mlp": 0.01007222, "balance_loss_clip": 1.01187444, "balance_loss_mlp": 1.00603569, "epoch": 0.6570269051555689, "flos": 68761897511040.0, "grad_norm": 0.7266677598408974, "language_loss": 0.64416504, "learning_rate": 1.112011294493775e-06, "loss": 0.66428924, "num_input_tokens_seen": 235932925, "step": 10928, "time_per_iteration": 3.2423789501190186 }, { "auxiliary_loss_clip": 0.01100905, "auxiliary_loss_mlp": 0.01034907, "balance_loss_clip": 1.03707099, "balance_loss_mlp": 1.02176392, "epoch": 0.6570870284082369, "flos": 26319289495680.0, "grad_norm": 1.7795232563837846, "language_loss": 0.77698803, "learning_rate": 1.1116623421440063e-06, "loss": 0.79834616, "num_input_tokens_seen": 235952680, "step": 10929, "time_per_iteration": 2.6381664276123047 }, { "auxiliary_loss_clip": 0.01078467, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.03687572, "balance_loss_mlp": 1.01705337, "epoch": 0.6571471516609049, "flos": 26174749167360.0, "grad_norm": 2.3625903698766826, "language_loss": 0.65178704, "learning_rate": 1.1113134234800895e-06, "loss": 0.67287529, "num_input_tokens_seen": 235972075, "step": 10930, "time_per_iteration": 2.7424116134643555 }, { "auxiliary_loss_clip": 0.01063728, "auxiliary_loss_mlp": 0.01033942, "balance_loss_clip": 1.03379416, "balance_loss_mlp": 1.02037621, "epoch": 0.6572072749135728, "flos": 20376253664640.0, "grad_norm": 1.690752691180261, "language_loss": 0.70888293, "learning_rate": 1.110964538515258e-06, "loss": 0.72985959, "num_input_tokens_seen": 235990340, "step": 10931, "time_per_iteration": 2.7526936531066895 }, { "auxiliary_loss_clip": 0.01070712, "auxiliary_loss_mlp": 0.01038764, "balance_loss_clip": 1.03789568, "balance_loss_mlp": 1.02569246, "epoch": 0.6572673981662408, "flos": 17128744110720.0, "grad_norm": 2.7494594651926763, "language_loss": 0.68903434, "learning_rate": 1.1106156872627393e-06, "loss": 0.71012914, "num_input_tokens_seen": 236007470, "step": 10932, "time_per_iteration": 2.699676036834717 }, { "auxiliary_loss_clip": 0.01088862, "auxiliary_loss_mlp": 0.0077114, "balance_loss_clip": 1.03621304, "balance_loss_mlp": 1.00018311, "epoch": 0.6573275214189087, "flos": 41275113281280.0, "grad_norm": 1.7103641293724658, "language_loss": 0.80041671, "learning_rate": 1.1102668697357626e-06, "loss": 0.8190167, "num_input_tokens_seen": 236029030, "step": 10933, "time_per_iteration": 2.884944200515747 }, { "auxiliary_loss_clip": 0.01066755, "auxiliary_loss_mlp": 0.01038188, "balance_loss_clip": 1.03784192, "balance_loss_mlp": 1.02397847, "epoch": 0.6573876446715767, "flos": 22890143842560.0, "grad_norm": 1.944468432565168, "language_loss": 0.73796332, "learning_rate": 1.1099180859475571e-06, "loss": 0.75901282, "num_input_tokens_seen": 236047160, "step": 10934, "time_per_iteration": 2.689169406890869 }, { "auxiliary_loss_clip": 0.01097012, "auxiliary_loss_mlp": 0.01038397, "balance_loss_clip": 1.0375042, "balance_loss_mlp": 1.02410352, "epoch": 0.6574477679242448, "flos": 44018150273280.0, "grad_norm": 1.510657094056813, "language_loss": 0.76061821, "learning_rate": 1.1095693359113454e-06, "loss": 0.78197235, "num_input_tokens_seen": 236069215, "step": 10935, "time_per_iteration": 2.798928737640381 }, { "auxiliary_loss_clip": 0.01075783, "auxiliary_loss_mlp": 0.01039916, "balance_loss_clip": 1.03844082, "balance_loss_mlp": 1.02543783, "epoch": 0.6575078911769127, "flos": 24571517523840.0, "grad_norm": 1.6442083694725653, "language_loss": 0.78311378, "learning_rate": 1.1092206196403538e-06, "loss": 0.80427074, "num_input_tokens_seen": 236088335, "step": 10936, "time_per_iteration": 2.718698263168335 }, { "auxiliary_loss_clip": 0.01065449, "auxiliary_loss_mlp": 0.01032937, "balance_loss_clip": 1.0363667, "balance_loss_mlp": 1.02052104, "epoch": 0.6575680144295807, "flos": 20924035050240.0, "grad_norm": 1.7517271883506782, "language_loss": 0.68920904, "learning_rate": 1.1088719371478056e-06, "loss": 0.71019292, "num_input_tokens_seen": 236108540, "step": 10937, "time_per_iteration": 2.7036542892456055 }, { "auxiliary_loss_clip": 0.01087739, "auxiliary_loss_mlp": 0.0103126, "balance_loss_clip": 1.03832746, "balance_loss_mlp": 1.01813471, "epoch": 0.6576281376822486, "flos": 10925642833920.0, "grad_norm": 2.652931448732022, "language_loss": 0.6823296, "learning_rate": 1.1085232884469236e-06, "loss": 0.70351958, "num_input_tokens_seen": 236124495, "step": 10938, "time_per_iteration": 2.6599676609039307 }, { "auxiliary_loss_clip": 0.01085941, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.03766704, "balance_loss_mlp": 1.02009773, "epoch": 0.6576882609349166, "flos": 19281552819840.0, "grad_norm": 3.453384337403157, "language_loss": 0.71610057, "learning_rate": 1.108174673550927e-06, "loss": 0.73729843, "num_input_tokens_seen": 236142550, "step": 10939, "time_per_iteration": 2.650425672531128 }, { "auxiliary_loss_clip": 0.01092138, "auxiliary_loss_mlp": 0.00771382, "balance_loss_clip": 1.03735209, "balance_loss_mlp": 1.00023603, "epoch": 0.6577483841875845, "flos": 20220544206720.0, "grad_norm": 2.2437103704575345, "language_loss": 0.77729875, "learning_rate": 1.107826092473037e-06, "loss": 0.79593396, "num_input_tokens_seen": 236156620, "step": 10940, "time_per_iteration": 2.669313669204712 }, { "auxiliary_loss_clip": 0.01071259, "auxiliary_loss_mlp": 0.01031827, "balance_loss_clip": 1.03549123, "balance_loss_mlp": 1.01780236, "epoch": 0.6578085074402525, "flos": 34751078962560.0, "grad_norm": 2.3851655144351818, "language_loss": 0.68552613, "learning_rate": 1.107477545226471e-06, "loss": 0.70655704, "num_input_tokens_seen": 236177095, "step": 10941, "time_per_iteration": 2.8323819637298584 }, { "auxiliary_loss_clip": 0.01098124, "auxiliary_loss_mlp": 0.00771304, "balance_loss_clip": 1.03532124, "balance_loss_mlp": 1.00012338, "epoch": 0.6578686306929205, "flos": 23470998675840.0, "grad_norm": 2.4287401057679436, "language_loss": 0.68286288, "learning_rate": 1.1071290318244448e-06, "loss": 0.70155716, "num_input_tokens_seen": 236194695, "step": 10942, "time_per_iteration": 2.662338972091675 }, { "auxiliary_loss_clip": 0.01082673, "auxiliary_loss_mlp": 0.01036222, "balance_loss_clip": 1.03803504, "balance_loss_mlp": 1.02132106, "epoch": 0.6579287539455885, "flos": 18077073033600.0, "grad_norm": 1.9182303150374724, "language_loss": 0.71618617, "learning_rate": 1.1067805522801753e-06, "loss": 0.73737514, "num_input_tokens_seen": 236213885, "step": 10943, "time_per_iteration": 2.6217944622039795 }, { "auxiliary_loss_clip": 0.01070671, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.03640389, "balance_loss_mlp": 1.01936865, "epoch": 0.6579888771982564, "flos": 28661383900800.0, "grad_norm": 1.8289069022809952, "language_loss": 0.59149086, "learning_rate": 1.1064321066068778e-06, "loss": 0.61252689, "num_input_tokens_seen": 236237315, "step": 10944, "time_per_iteration": 2.8202292919158936 }, { "auxiliary_loss_clip": 0.01109311, "auxiliary_loss_mlp": 0.01034463, "balance_loss_clip": 1.04082966, "balance_loss_mlp": 1.02081347, "epoch": 0.6580490004509244, "flos": 25046543911680.0, "grad_norm": 1.5174772565974388, "language_loss": 0.7224496, "learning_rate": 1.1060836948177646e-06, "loss": 0.74388736, "num_input_tokens_seen": 236256345, "step": 10945, "time_per_iteration": 2.658428430557251 }, { "auxiliary_loss_clip": 0.0109325, "auxiliary_loss_mlp": 0.0102876, "balance_loss_clip": 1.04045701, "balance_loss_mlp": 1.0164274, "epoch": 0.6581091237035923, "flos": 43508793461760.0, "grad_norm": 1.5303954795060517, "language_loss": 0.70540607, "learning_rate": 1.105735316926046e-06, "loss": 0.72662616, "num_input_tokens_seen": 236281890, "step": 10946, "time_per_iteration": 2.859764814376831 }, { "auxiliary_loss_clip": 0.01103097, "auxiliary_loss_mlp": 0.01034987, "balance_loss_clip": 1.04042983, "balance_loss_mlp": 1.02167702, "epoch": 0.6581692469562603, "flos": 22415404763520.0, "grad_norm": 2.072130981046482, "language_loss": 0.82211119, "learning_rate": 1.105386972944934e-06, "loss": 0.84349203, "num_input_tokens_seen": 236298370, "step": 10947, "time_per_iteration": 2.630653142929077 }, { "auxiliary_loss_clip": 0.01056612, "auxiliary_loss_mlp": 0.00771489, "balance_loss_clip": 1.0330416, "balance_loss_mlp": 1.0001905, "epoch": 0.6582293702089284, "flos": 24859772167680.0, "grad_norm": 1.881732401940151, "language_loss": 0.77187896, "learning_rate": 1.1050386628876385e-06, "loss": 0.79015994, "num_input_tokens_seen": 236317380, "step": 10948, "time_per_iteration": 2.7764172554016113 }, { "auxiliary_loss_clip": 0.01105319, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.04180968, "balance_loss_mlp": 1.01791072, "epoch": 0.6582894934615963, "flos": 23039676161280.0, "grad_norm": 2.2574860884284793, "language_loss": 0.79085296, "learning_rate": 1.1046903867673655e-06, "loss": 0.81221217, "num_input_tokens_seen": 236336210, "step": 10949, "time_per_iteration": 2.7244157791137695 }, { "auxiliary_loss_clip": 0.0102471, "auxiliary_loss_mlp": 0.01003119, "balance_loss_clip": 1.01120281, "balance_loss_mlp": 1.00195682, "epoch": 0.6583496167142643, "flos": 72551980978560.0, "grad_norm": 0.7330189150463328, "language_loss": 0.6181432, "learning_rate": 1.104342144597323e-06, "loss": 0.63842142, "num_input_tokens_seen": 236403090, "step": 10950, "time_per_iteration": 3.2641515731811523 }, { "auxiliary_loss_clip": 0.01100983, "auxiliary_loss_mlp": 0.01032251, "balance_loss_clip": 1.0385226, "balance_loss_mlp": 1.02026415, "epoch": 0.6584097399669322, "flos": 13078846592640.0, "grad_norm": 2.3980091828088144, "language_loss": 0.67179585, "learning_rate": 1.1039939363907178e-06, "loss": 0.69312811, "num_input_tokens_seen": 236420475, "step": 10951, "time_per_iteration": 2.619748115539551 }, { "auxiliary_loss_clip": 0.01100086, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.03776073, "balance_loss_mlp": 1.02158761, "epoch": 0.6584698632196002, "flos": 28693164458880.0, "grad_norm": 1.4089441578543043, "language_loss": 0.76300871, "learning_rate": 1.1036457621607504e-06, "loss": 0.78435409, "num_input_tokens_seen": 236441915, "step": 10952, "time_per_iteration": 2.7250633239746094 }, { "auxiliary_loss_clip": 0.0111349, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.04090011, "balance_loss_mlp": 1.018188, "epoch": 0.6585299864722681, "flos": 14319272914560.0, "grad_norm": 1.8443663213164305, "language_loss": 0.73402822, "learning_rate": 1.1032976219206257e-06, "loss": 0.75547707, "num_input_tokens_seen": 236460340, "step": 10953, "time_per_iteration": 2.566080331802368 }, { "auxiliary_loss_clip": 0.01082894, "auxiliary_loss_mlp": 0.0104307, "balance_loss_clip": 1.03907454, "balance_loss_mlp": 1.02891934, "epoch": 0.6585901097249361, "flos": 26797907243520.0, "grad_norm": 2.1744380051357, "language_loss": 0.78487962, "learning_rate": 1.102949515683546e-06, "loss": 0.80613929, "num_input_tokens_seen": 236478280, "step": 10954, "time_per_iteration": 2.724165678024292 }, { "auxiliary_loss_clip": 0.01088368, "auxiliary_loss_mlp": 0.01037943, "balance_loss_clip": 1.03427434, "balance_loss_mlp": 1.0242219, "epoch": 0.658650232977604, "flos": 18733124989440.0, "grad_norm": 2.555140209313338, "language_loss": 0.69544291, "learning_rate": 1.1026014434627096e-06, "loss": 0.71670604, "num_input_tokens_seen": 236493225, "step": 10955, "time_per_iteration": 2.6414260864257812 }, { "auxiliary_loss_clip": 0.01082497, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.03517938, "balance_loss_mlp": 1.02191079, "epoch": 0.6587103562302721, "flos": 24753440931840.0, "grad_norm": 2.1706102915019434, "language_loss": 0.80620337, "learning_rate": 1.1022534052713172e-06, "loss": 0.82736766, "num_input_tokens_seen": 236514420, "step": 10956, "time_per_iteration": 2.679706335067749 }, { "auxiliary_loss_clip": 0.01104337, "auxiliary_loss_mlp": 0.01038231, "balance_loss_clip": 1.04236186, "balance_loss_mlp": 1.02459347, "epoch": 0.65877047948294, "flos": 22346133384960.0, "grad_norm": 2.024941431440732, "language_loss": 0.81428325, "learning_rate": 1.1019054011225648e-06, "loss": 0.83570898, "num_input_tokens_seen": 236532785, "step": 10957, "time_per_iteration": 2.7104432582855225 }, { "auxiliary_loss_clip": 0.01091788, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.04065537, "balance_loss_mlp": 1.01872087, "epoch": 0.658830602735608, "flos": 45180542298240.0, "grad_norm": 1.6614910080791612, "language_loss": 0.75887316, "learning_rate": 1.1015574310296506e-06, "loss": 0.78009385, "num_input_tokens_seen": 236553330, "step": 10958, "time_per_iteration": 2.829876661300659 }, { "auxiliary_loss_clip": 0.01070256, "auxiliary_loss_mlp": 0.01040301, "balance_loss_clip": 1.0364852, "balance_loss_mlp": 1.02578747, "epoch": 0.6588907259882759, "flos": 19901622326400.0, "grad_norm": 1.76623385890274, "language_loss": 0.74976909, "learning_rate": 1.1012094950057678e-06, "loss": 0.77087468, "num_input_tokens_seen": 236572960, "step": 10959, "time_per_iteration": 4.3221375942230225 }, { "auxiliary_loss_clip": 0.01103616, "auxiliary_loss_mlp": 0.01030743, "balance_loss_clip": 1.03967154, "balance_loss_mlp": 1.01826799, "epoch": 0.6589508492409439, "flos": 24133766474880.0, "grad_norm": 1.6028003647190308, "language_loss": 0.6497494, "learning_rate": 1.1008615930641107e-06, "loss": 0.67109299, "num_input_tokens_seen": 236594090, "step": 10960, "time_per_iteration": 2.685056209564209 }, { "auxiliary_loss_clip": 0.01119947, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.04166222, "balance_loss_mlp": 1.0203135, "epoch": 0.659010972493612, "flos": 18222906251520.0, "grad_norm": 3.156226944144234, "language_loss": 0.81759185, "learning_rate": 1.1005137252178734e-06, "loss": 0.83913553, "num_input_tokens_seen": 236610190, "step": 10961, "time_per_iteration": 2.6374056339263916 }, { "auxiliary_loss_clip": 0.01076452, "auxiliary_loss_mlp": 0.01033701, "balance_loss_clip": 1.03810012, "balance_loss_mlp": 1.01989698, "epoch": 0.6590710957462799, "flos": 27600007898880.0, "grad_norm": 1.7436713822775258, "language_loss": 0.73479664, "learning_rate": 1.1001658914802453e-06, "loss": 0.75589824, "num_input_tokens_seen": 236631575, "step": 10962, "time_per_iteration": 4.275976181030273 }, { "auxiliary_loss_clip": 0.0109814, "auxiliary_loss_mlp": 0.01033187, "balance_loss_clip": 1.03807235, "balance_loss_mlp": 1.01996064, "epoch": 0.6591312189989479, "flos": 20302959962880.0, "grad_norm": 1.9404531224692678, "language_loss": 0.80004346, "learning_rate": 1.0998180918644165e-06, "loss": 0.82135677, "num_input_tokens_seen": 236649815, "step": 10963, "time_per_iteration": 4.260782480239868 }, { "auxiliary_loss_clip": 0.01062785, "auxiliary_loss_mlp": 0.00769293, "balance_loss_clip": 1.0372498, "balance_loss_mlp": 1.00011432, "epoch": 0.6591913422516158, "flos": 12312943868160.0, "grad_norm": 1.8441045997478804, "language_loss": 0.78224564, "learning_rate": 1.0994703263835754e-06, "loss": 0.80056643, "num_input_tokens_seen": 236668335, "step": 10964, "time_per_iteration": 2.6830945014953613 }, { "auxiliary_loss_clip": 0.01075287, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.03438485, "balance_loss_mlp": 1.02721417, "epoch": 0.6592514655042838, "flos": 25884591102720.0, "grad_norm": 1.683709186180651, "language_loss": 0.73955643, "learning_rate": 1.0991225950509106e-06, "loss": 0.76071191, "num_input_tokens_seen": 236688945, "step": 10965, "time_per_iteration": 2.687619924545288 }, { "auxiliary_loss_clip": 0.01081038, "auxiliary_loss_mlp": 0.01038566, "balance_loss_clip": 1.03631306, "balance_loss_mlp": 1.02412999, "epoch": 0.6593115887569517, "flos": 14063624841600.0, "grad_norm": 2.0913085470177943, "language_loss": 0.73648584, "learning_rate": 1.0987748978796067e-06, "loss": 0.75768185, "num_input_tokens_seen": 236707055, "step": 10966, "time_per_iteration": 2.6525564193725586 }, { "auxiliary_loss_clip": 0.01102724, "auxiliary_loss_mlp": 0.01032944, "balance_loss_clip": 1.03741455, "balance_loss_mlp": 1.01951456, "epoch": 0.6593717120096197, "flos": 24717925359360.0, "grad_norm": 1.533295813226106, "language_loss": 0.76610076, "learning_rate": 1.0984272348828487e-06, "loss": 0.78745747, "num_input_tokens_seen": 236725900, "step": 10967, "time_per_iteration": 4.112145900726318 }, { "auxiliary_loss_clip": 0.01023116, "auxiliary_loss_mlp": 0.0100237, "balance_loss_clip": 1.00873816, "balance_loss_mlp": 1.00111854, "epoch": 0.6594318352622877, "flos": 55558083502080.0, "grad_norm": 0.6961608375444348, "language_loss": 0.48445863, "learning_rate": 1.0980796060738221e-06, "loss": 0.50471348, "num_input_tokens_seen": 236788415, "step": 10968, "time_per_iteration": 3.0989933013916016 }, { "auxiliary_loss_clip": 0.01066259, "auxiliary_loss_mlp": 0.01036629, "balance_loss_clip": 1.03324318, "balance_loss_mlp": 1.02168036, "epoch": 0.6594919585149557, "flos": 17456931699840.0, "grad_norm": 1.7813410381881563, "language_loss": 0.79142725, "learning_rate": 1.0977320114657058e-06, "loss": 0.81245613, "num_input_tokens_seen": 236805155, "step": 10969, "time_per_iteration": 2.6929845809936523 }, { "auxiliary_loss_clip": 0.01103958, "auxiliary_loss_mlp": 0.01031334, "balance_loss_clip": 1.0396595, "balance_loss_mlp": 1.01903188, "epoch": 0.6595520817676236, "flos": 18223229473920.0, "grad_norm": 2.1653605986578137, "language_loss": 0.65524602, "learning_rate": 1.0973844510716817e-06, "loss": 0.67659903, "num_input_tokens_seen": 236824360, "step": 10970, "time_per_iteration": 2.5729503631591797 }, { "auxiliary_loss_clip": 0.01098998, "auxiliary_loss_mlp": 0.01031128, "balance_loss_clip": 1.03612995, "balance_loss_mlp": 1.01827741, "epoch": 0.6596122050202916, "flos": 22199761463040.0, "grad_norm": 1.6607954000770715, "language_loss": 0.7680558, "learning_rate": 1.0970369249049308e-06, "loss": 0.78935707, "num_input_tokens_seen": 236844640, "step": 10971, "time_per_iteration": 2.699892997741699 }, { "auxiliary_loss_clip": 0.01045077, "auxiliary_loss_mlp": 0.01047077, "balance_loss_clip": 1.03190637, "balance_loss_mlp": 1.03174686, "epoch": 0.6596723282729595, "flos": 14173834746240.0, "grad_norm": 2.880961149913922, "language_loss": 0.70055163, "learning_rate": 1.096689432978629e-06, "loss": 0.72147322, "num_input_tokens_seen": 236861160, "step": 10972, "time_per_iteration": 2.7359213829040527 }, { "auxiliary_loss_clip": 0.01101135, "auxiliary_loss_mlp": 0.01025815, "balance_loss_clip": 1.03941655, "balance_loss_mlp": 1.01266074, "epoch": 0.6597324515256275, "flos": 30553193410560.0, "grad_norm": 9.926316428888306, "language_loss": 0.55695325, "learning_rate": 1.0963419753059556e-06, "loss": 0.57822275, "num_input_tokens_seen": 236880465, "step": 10973, "time_per_iteration": 2.69612455368042 }, { "auxiliary_loss_clip": 0.01099195, "auxiliary_loss_mlp": 0.01040169, "balance_loss_clip": 1.0419203, "balance_loss_mlp": 1.02660263, "epoch": 0.6597925747782956, "flos": 17639860688640.0, "grad_norm": 2.5012890193080026, "language_loss": 0.78572869, "learning_rate": 1.0959945519000839e-06, "loss": 0.80712223, "num_input_tokens_seen": 236897730, "step": 10974, "time_per_iteration": 2.6455633640289307 }, { "auxiliary_loss_clip": 0.01100482, "auxiliary_loss_mlp": 0.01037502, "balance_loss_clip": 1.04022431, "balance_loss_mlp": 1.02422214, "epoch": 0.6598526980309635, "flos": 22819112697600.0, "grad_norm": 2.251661999993696, "language_loss": 0.68701649, "learning_rate": 1.0956471627741906e-06, "loss": 0.70839626, "num_input_tokens_seen": 236917300, "step": 10975, "time_per_iteration": 2.6761295795440674 }, { "auxiliary_loss_clip": 0.01097399, "auxiliary_loss_mlp": 0.01032564, "balance_loss_clip": 1.03912926, "balance_loss_mlp": 1.02060747, "epoch": 0.6599128212836315, "flos": 21068036674560.0, "grad_norm": 1.6540937029958567, "language_loss": 0.70881736, "learning_rate": 1.0952998079414464e-06, "loss": 0.73011696, "num_input_tokens_seen": 236935590, "step": 10976, "time_per_iteration": 2.5975265502929688 }, { "auxiliary_loss_clip": 0.01083365, "auxiliary_loss_mlp": 0.01033377, "balance_loss_clip": 1.03734148, "balance_loss_mlp": 1.02016902, "epoch": 0.6599729445362994, "flos": 22163527618560.0, "grad_norm": 1.6096140121507374, "language_loss": 0.67765009, "learning_rate": 1.0949524874150243e-06, "loss": 0.69881749, "num_input_tokens_seen": 236952830, "step": 10977, "time_per_iteration": 2.676992177963257 }, { "auxiliary_loss_clip": 0.01079353, "auxiliary_loss_mlp": 0.01037069, "balance_loss_clip": 1.03872538, "balance_loss_mlp": 1.02254331, "epoch": 0.6600330677889674, "flos": 18150079426560.0, "grad_norm": 2.028840451789988, "language_loss": 0.80975902, "learning_rate": 1.0946052012080952e-06, "loss": 0.8309232, "num_input_tokens_seen": 236971930, "step": 10978, "time_per_iteration": 2.670058488845825 }, { "auxiliary_loss_clip": 0.01084138, "auxiliary_loss_mlp": 0.01037844, "balance_loss_clip": 1.03935933, "balance_loss_mlp": 1.02446318, "epoch": 0.6600931910416353, "flos": 18150115340160.0, "grad_norm": 3.3630669376979534, "language_loss": 0.67552471, "learning_rate": 1.0942579493338278e-06, "loss": 0.69674456, "num_input_tokens_seen": 236989920, "step": 10979, "time_per_iteration": 2.6543848514556885 }, { "auxiliary_loss_clip": 0.01082232, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.03750384, "balance_loss_mlp": 1.02135265, "epoch": 0.6601533142943034, "flos": 17420733768960.0, "grad_norm": 2.7062652296553793, "language_loss": 0.7310946, "learning_rate": 1.0939107318053889e-06, "loss": 0.75227201, "num_input_tokens_seen": 237006570, "step": 10980, "time_per_iteration": 2.614719867706299 }, { "auxiliary_loss_clip": 0.01075162, "auxiliary_loss_mlp": 0.01033537, "balance_loss_clip": 1.0369494, "balance_loss_mlp": 1.02132368, "epoch": 0.6602134375469713, "flos": 28219574615040.0, "grad_norm": 1.6769637422208983, "language_loss": 0.72674447, "learning_rate": 1.0935635486359459e-06, "loss": 0.74783146, "num_input_tokens_seen": 237028415, "step": 10981, "time_per_iteration": 2.7521674633026123 }, { "auxiliary_loss_clip": 0.01059889, "auxiliary_loss_mlp": 0.0103708, "balance_loss_clip": 1.03629518, "balance_loss_mlp": 1.02407432, "epoch": 0.6602735607996393, "flos": 29418056830080.0, "grad_norm": 2.169047564074697, "language_loss": 0.68625891, "learning_rate": 1.0932163998386647e-06, "loss": 0.70722854, "num_input_tokens_seen": 237046595, "step": 10982, "time_per_iteration": 2.791590690612793 }, { "auxiliary_loss_clip": 0.01102094, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.03903246, "balance_loss_mlp": 1.01600397, "epoch": 0.6603336840523072, "flos": 18588045957120.0, "grad_norm": 1.9479050528854345, "language_loss": 0.69151658, "learning_rate": 1.0928692854267075e-06, "loss": 0.71282685, "num_input_tokens_seen": 237066150, "step": 10983, "time_per_iteration": 2.662109851837158 }, { "auxiliary_loss_clip": 0.01102705, "auxiliary_loss_mlp": 0.01032804, "balance_loss_clip": 1.03690076, "balance_loss_mlp": 1.0190587, "epoch": 0.6603938073049752, "flos": 33254860913280.0, "grad_norm": 1.7348773084229319, "language_loss": 0.70333445, "learning_rate": 1.092522205413239e-06, "loss": 0.72468954, "num_input_tokens_seen": 237087060, "step": 10984, "time_per_iteration": 2.732595443725586 }, { "auxiliary_loss_clip": 0.01077924, "auxiliary_loss_mlp": 0.01038628, "balance_loss_clip": 1.03689432, "balance_loss_mlp": 1.02587259, "epoch": 0.6604539305576431, "flos": 17384284442880.0, "grad_norm": 1.6767760179184985, "language_loss": 0.83797729, "learning_rate": 1.0921751598114193e-06, "loss": 0.85914278, "num_input_tokens_seen": 237103825, "step": 10985, "time_per_iteration": 2.654433250427246 }, { "auxiliary_loss_clip": 0.01105556, "auxiliary_loss_mlp": 0.01034686, "balance_loss_clip": 1.0407331, "balance_loss_mlp": 1.02094078, "epoch": 0.6605140538103111, "flos": 21251145231360.0, "grad_norm": 2.384704611416695, "language_loss": 0.74183935, "learning_rate": 1.0918281486344077e-06, "loss": 0.76324177, "num_input_tokens_seen": 237121740, "step": 10986, "time_per_iteration": 2.6019506454467773 }, { "auxiliary_loss_clip": 0.01100549, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.03883743, "balance_loss_mlp": 1.01647878, "epoch": 0.6605741770629792, "flos": 13881701433600.0, "grad_norm": 1.9122697335713108, "language_loss": 0.78908652, "learning_rate": 1.0914811718953636e-06, "loss": 0.81038857, "num_input_tokens_seen": 237139565, "step": 10987, "time_per_iteration": 2.5722427368164062 }, { "auxiliary_loss_clip": 0.01008768, "auxiliary_loss_mlp": 0.01002668, "balance_loss_clip": 1.00836062, "balance_loss_mlp": 1.0013566, "epoch": 0.6606343003156471, "flos": 69316215171840.0, "grad_norm": 0.8094121865469099, "language_loss": 0.541363, "learning_rate": 1.0911342296074454e-06, "loss": 0.5614773, "num_input_tokens_seen": 237201055, "step": 10988, "time_per_iteration": 3.272397994995117 }, { "auxiliary_loss_clip": 0.01053267, "auxiliary_loss_mlp": 0.01036624, "balance_loss_clip": 1.03639925, "balance_loss_mlp": 1.02483392, "epoch": 0.6606944235683151, "flos": 27272394927360.0, "grad_norm": 1.725996965304981, "language_loss": 0.77469909, "learning_rate": 1.0907873217838077e-06, "loss": 0.79559803, "num_input_tokens_seen": 237221805, "step": 10989, "time_per_iteration": 2.911433458328247 }, { "auxiliary_loss_clip": 0.01092952, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.04096937, "balance_loss_mlp": 1.02172589, "epoch": 0.660754546820983, "flos": 13772820332160.0, "grad_norm": 2.2526328276614542, "language_loss": 0.77053428, "learning_rate": 1.0904404484376064e-06, "loss": 0.7918067, "num_input_tokens_seen": 237238270, "step": 10990, "time_per_iteration": 2.6875393390655518 }, { "auxiliary_loss_clip": 0.01116631, "auxiliary_loss_mlp": 0.01032525, "balance_loss_clip": 1.04041815, "balance_loss_mlp": 1.01960862, "epoch": 0.660814670073651, "flos": 15705209232000.0, "grad_norm": 4.452653785760573, "language_loss": 0.60725391, "learning_rate": 1.0900936095819937e-06, "loss": 0.62874544, "num_input_tokens_seen": 237255400, "step": 10991, "time_per_iteration": 2.581926107406616 }, { "auxiliary_loss_clip": 0.01088945, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.03823137, "balance_loss_mlp": 1.02305102, "epoch": 0.6608747933263189, "flos": 20850023076480.0, "grad_norm": 2.2752499400269057, "language_loss": 0.68441308, "learning_rate": 1.0897468052301234e-06, "loss": 0.70567274, "num_input_tokens_seen": 237273105, "step": 10992, "time_per_iteration": 2.6633994579315186 }, { "auxiliary_loss_clip": 0.01102357, "auxiliary_loss_mlp": 0.01033607, "balance_loss_clip": 1.03874791, "balance_loss_mlp": 1.02007651, "epoch": 0.660934916578987, "flos": 20632117219200.0, "grad_norm": 1.7286431682231886, "language_loss": 0.87802613, "learning_rate": 1.0894000353951444e-06, "loss": 0.89938569, "num_input_tokens_seen": 237292650, "step": 10993, "time_per_iteration": 2.618743419647217 }, { "auxiliary_loss_clip": 0.01111168, "auxiliary_loss_mlp": 0.01033402, "balance_loss_clip": 1.04143643, "balance_loss_mlp": 1.01837611, "epoch": 0.6609950398316549, "flos": 25113588647040.0, "grad_norm": 1.7020728160261662, "language_loss": 0.66939056, "learning_rate": 1.0890533000902078e-06, "loss": 0.69083625, "num_input_tokens_seen": 237312865, "step": 10994, "time_per_iteration": 2.694892406463623 }, { "auxiliary_loss_clip": 0.01078298, "auxiliary_loss_mlp": 0.01039322, "balance_loss_clip": 1.03795636, "balance_loss_mlp": 1.02551126, "epoch": 0.6610551630843229, "flos": 18661196004480.0, "grad_norm": 2.5249476876910277, "language_loss": 0.77071732, "learning_rate": 1.0887065993284626e-06, "loss": 0.79189348, "num_input_tokens_seen": 237331210, "step": 10995, "time_per_iteration": 2.6232664585113525 }, { "auxiliary_loss_clip": 0.01093968, "auxiliary_loss_mlp": 0.01029631, "balance_loss_clip": 1.03934228, "balance_loss_mlp": 1.01722097, "epoch": 0.6611152863369908, "flos": 23258192549760.0, "grad_norm": 1.8438791376239891, "language_loss": 0.74463415, "learning_rate": 1.088359933123053e-06, "loss": 0.76587015, "num_input_tokens_seen": 237349455, "step": 10996, "time_per_iteration": 2.628135919570923 }, { "auxiliary_loss_clip": 0.01115792, "auxiliary_loss_mlp": 0.01034651, "balance_loss_clip": 1.04123545, "balance_loss_mlp": 1.02159739, "epoch": 0.6611754095896588, "flos": 22159720776960.0, "grad_norm": 1.8400435689118084, "language_loss": 0.69207805, "learning_rate": 1.088013301487126e-06, "loss": 0.71358246, "num_input_tokens_seen": 237367100, "step": 10997, "time_per_iteration": 2.5729880332946777 }, { "auxiliary_loss_clip": 0.01095929, "auxiliary_loss_mlp": 0.01033874, "balance_loss_clip": 1.0389818, "balance_loss_mlp": 1.02096367, "epoch": 0.6612355328423267, "flos": 13991228979840.0, "grad_norm": 2.212339587573469, "language_loss": 0.68443197, "learning_rate": 1.0876667044338269e-06, "loss": 0.70572996, "num_input_tokens_seen": 237384840, "step": 10998, "time_per_iteration": 4.240036249160767 }, { "auxiliary_loss_clip": 0.01026396, "auxiliary_loss_mlp": 0.01003226, "balance_loss_clip": 1.01201963, "balance_loss_mlp": 1.00200462, "epoch": 0.6612956560949947, "flos": 61453716359040.0, "grad_norm": 0.6556172869742106, "language_loss": 0.51124817, "learning_rate": 1.087320141976297e-06, "loss": 0.53154439, "num_input_tokens_seen": 237443355, "step": 10999, "time_per_iteration": 3.0903005599975586 }, { "auxiliary_loss_clip": 0.01117437, "auxiliary_loss_mlp": 0.00771071, "balance_loss_clip": 1.04025114, "balance_loss_mlp": 1.00016904, "epoch": 0.6613557793476627, "flos": 21616644072960.0, "grad_norm": 2.396543073072743, "language_loss": 0.70902514, "learning_rate": 1.086973614127679e-06, "loss": 0.72791028, "num_input_tokens_seen": 237459205, "step": 11000, "time_per_iteration": 2.5685982704162598 }, { "auxiliary_loss_clip": 0.01082819, "auxiliary_loss_mlp": 0.01036908, "balance_loss_clip": 1.03847837, "balance_loss_mlp": 1.024737, "epoch": 0.6614159026003307, "flos": 34020117192960.0, "grad_norm": 1.430398099595452, "language_loss": 0.65089309, "learning_rate": 1.0866271209011133e-06, "loss": 0.67209029, "num_input_tokens_seen": 237483580, "step": 11001, "time_per_iteration": 4.2755303382873535 }, { "auxiliary_loss_clip": 0.01112876, "auxiliary_loss_mlp": 0.0103109, "balance_loss_clip": 1.03954029, "balance_loss_mlp": 1.01845384, "epoch": 0.6614760258529987, "flos": 24097281235200.0, "grad_norm": 1.7701672836009255, "language_loss": 0.7300179, "learning_rate": 1.086280662309739e-06, "loss": 0.75145757, "num_input_tokens_seen": 237502860, "step": 11002, "time_per_iteration": 2.6314847469329834 }, { "auxiliary_loss_clip": 0.01097492, "auxiliary_loss_mlp": 0.01037063, "balance_loss_clip": 1.03688526, "balance_loss_mlp": 1.02355647, "epoch": 0.6615361491056666, "flos": 14903790935040.0, "grad_norm": 1.9389438141435231, "language_loss": 0.79010653, "learning_rate": 1.0859342383666928e-06, "loss": 0.81145215, "num_input_tokens_seen": 237521030, "step": 11003, "time_per_iteration": 4.314274072647095 }, { "auxiliary_loss_clip": 0.01104366, "auxiliary_loss_mlp": 0.01038348, "balance_loss_clip": 1.03993845, "balance_loss_mlp": 1.02454972, "epoch": 0.6615962723583346, "flos": 15304877176320.0, "grad_norm": 1.933163608906101, "language_loss": 0.69039351, "learning_rate": 1.0855878490851119e-06, "loss": 0.7118206, "num_input_tokens_seen": 237539585, "step": 11004, "time_per_iteration": 2.6783957481384277 }, { "auxiliary_loss_clip": 0.01104574, "auxiliary_loss_mlp": 0.0103687, "balance_loss_clip": 1.03920364, "balance_loss_mlp": 1.02226102, "epoch": 0.6616563956110025, "flos": 18732586285440.0, "grad_norm": 2.0685835155239487, "language_loss": 0.69767517, "learning_rate": 1.085241494478132e-06, "loss": 0.71908963, "num_input_tokens_seen": 237557655, "step": 11005, "time_per_iteration": 2.5958964824676514 }, { "auxiliary_loss_clip": 0.01094809, "auxiliary_loss_mlp": 0.01030023, "balance_loss_clip": 1.04032111, "balance_loss_mlp": 1.01691008, "epoch": 0.6617165188636706, "flos": 24495063425280.0, "grad_norm": 4.5323320504778035, "language_loss": 0.78211862, "learning_rate": 1.0848951745588855e-06, "loss": 0.80336696, "num_input_tokens_seen": 237577000, "step": 11006, "time_per_iteration": 4.20892596244812 }, { "auxiliary_loss_clip": 0.01102255, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.03898382, "balance_loss_mlp": 1.02004886, "epoch": 0.6617766421163385, "flos": 22379673709440.0, "grad_norm": 1.4341781143462713, "language_loss": 0.76336843, "learning_rate": 1.0845488893405068e-06, "loss": 0.78472567, "num_input_tokens_seen": 237597960, "step": 11007, "time_per_iteration": 2.6313998699188232 }, { "auxiliary_loss_clip": 0.0110241, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.0410744, "balance_loss_mlp": 1.02089977, "epoch": 0.6618367653690065, "flos": 20850418126080.0, "grad_norm": 1.678556210667641, "language_loss": 0.78647077, "learning_rate": 1.0842026388361248e-06, "loss": 0.80783153, "num_input_tokens_seen": 237616385, "step": 11008, "time_per_iteration": 2.6336562633514404 }, { "auxiliary_loss_clip": 0.01117118, "auxiliary_loss_mlp": 0.01030319, "balance_loss_clip": 1.0386076, "balance_loss_mlp": 1.01620448, "epoch": 0.6618968886216744, "flos": 17712328377600.0, "grad_norm": 1.8062458144067923, "language_loss": 0.81780714, "learning_rate": 1.0838564230588715e-06, "loss": 0.83928156, "num_input_tokens_seen": 237634930, "step": 11009, "time_per_iteration": 2.559891939163208 }, { "auxiliary_loss_clip": 0.01003698, "auxiliary_loss_mlp": 0.01000096, "balance_loss_clip": 1.01631284, "balance_loss_mlp": 0.99864715, "epoch": 0.6619570118743424, "flos": 67035347498880.0, "grad_norm": 1.1306824373429385, "language_loss": 0.67373979, "learning_rate": 1.0835102420218735e-06, "loss": 0.69377768, "num_input_tokens_seen": 237693175, "step": 11010, "time_per_iteration": 3.1341817378997803 }, { "auxiliary_loss_clip": 0.01103659, "auxiliary_loss_mlp": 0.01034485, "balance_loss_clip": 1.03835106, "balance_loss_mlp": 1.02063894, "epoch": 0.6620171351270103, "flos": 18660908695680.0, "grad_norm": 1.5388019077167303, "language_loss": 0.71031803, "learning_rate": 1.0831640957382593e-06, "loss": 0.73169947, "num_input_tokens_seen": 237713160, "step": 11011, "time_per_iteration": 2.6373953819274902 }, { "auxiliary_loss_clip": 0.01106184, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.04299128, "balance_loss_mlp": 1.01964521, "epoch": 0.6620772583796783, "flos": 24170503109760.0, "grad_norm": 1.4417744086263622, "language_loss": 0.7236765, "learning_rate": 1.0828179842211557e-06, "loss": 0.74506283, "num_input_tokens_seen": 237733600, "step": 11012, "time_per_iteration": 2.6834990978240967 }, { "auxiliary_loss_clip": 0.01098433, "auxiliary_loss_mlp": 0.01034539, "balance_loss_clip": 1.03888941, "balance_loss_mlp": 1.02273691, "epoch": 0.6621373816323463, "flos": 23623547736960.0, "grad_norm": 1.657176750213381, "language_loss": 0.79366904, "learning_rate": 1.0824719074836845e-06, "loss": 0.81499881, "num_input_tokens_seen": 237752135, "step": 11013, "time_per_iteration": 2.6497538089752197 }, { "auxiliary_loss_clip": 0.01092428, "auxiliary_loss_mlp": 0.01032971, "balance_loss_clip": 1.03971934, "balance_loss_mlp": 1.01944637, "epoch": 0.6621975048850143, "flos": 18442212739200.0, "grad_norm": 2.6791842321865698, "language_loss": 0.70635635, "learning_rate": 1.082125865538971e-06, "loss": 0.72761035, "num_input_tokens_seen": 237770735, "step": 11014, "time_per_iteration": 2.6886751651763916 }, { "auxiliary_loss_clip": 0.01083433, "auxiliary_loss_mlp": 0.00768947, "balance_loss_clip": 1.03894365, "balance_loss_mlp": 1.00011313, "epoch": 0.6622576281376823, "flos": 14063876236800.0, "grad_norm": 1.8642341672837748, "language_loss": 0.77003562, "learning_rate": 1.081779858400137e-06, "loss": 0.78855944, "num_input_tokens_seen": 237789005, "step": 11015, "time_per_iteration": 2.7417409420013428 }, { "auxiliary_loss_clip": 0.01104344, "auxiliary_loss_mlp": 0.0077007, "balance_loss_clip": 1.04066467, "balance_loss_mlp": 1.00019598, "epoch": 0.6623177513903502, "flos": 17018965169280.0, "grad_norm": 1.678948777257364, "language_loss": 0.82612354, "learning_rate": 1.0814338860803021e-06, "loss": 0.84486771, "num_input_tokens_seen": 237807740, "step": 11016, "time_per_iteration": 2.6134469509124756 }, { "auxiliary_loss_clip": 0.01098949, "auxiliary_loss_mlp": 0.01033634, "balance_loss_clip": 1.03807402, "balance_loss_mlp": 1.02006221, "epoch": 0.6623778746430182, "flos": 17271021882240.0, "grad_norm": 1.953011458286016, "language_loss": 0.69714379, "learning_rate": 1.0810879485925864e-06, "loss": 0.71846962, "num_input_tokens_seen": 237826340, "step": 11017, "time_per_iteration": 2.58854079246521 }, { "auxiliary_loss_clip": 0.01083899, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.0361867, "balance_loss_mlp": 1.02632689, "epoch": 0.6624379978956861, "flos": 48792688767360.0, "grad_norm": 1.7400500770773162, "language_loss": 0.774885, "learning_rate": 1.0807420459501084e-06, "loss": 0.79612345, "num_input_tokens_seen": 237848305, "step": 11018, "time_per_iteration": 2.9582974910736084 }, { "auxiliary_loss_clip": 0.01091037, "auxiliary_loss_mlp": 0.01042104, "balance_loss_clip": 1.03768778, "balance_loss_mlp": 1.02916956, "epoch": 0.6624981211483542, "flos": 18952431477120.0, "grad_norm": 2.014925244928839, "language_loss": 0.83705002, "learning_rate": 1.0803961781659841e-06, "loss": 0.85838139, "num_input_tokens_seen": 237867020, "step": 11019, "time_per_iteration": 2.684549331665039 }, { "auxiliary_loss_clip": 0.01097432, "auxiliary_loss_mlp": 0.00772198, "balance_loss_clip": 1.03844643, "balance_loss_mlp": 1.00007081, "epoch": 0.6625582444010221, "flos": 23256576437760.0, "grad_norm": 1.6087102704367435, "language_loss": 0.71948653, "learning_rate": 1.080050345253328e-06, "loss": 0.73818284, "num_input_tokens_seen": 237886710, "step": 11020, "time_per_iteration": 2.6002566814422607 }, { "auxiliary_loss_clip": 0.01092653, "auxiliary_loss_mlp": 0.01030916, "balance_loss_clip": 1.03763211, "balance_loss_mlp": 1.01636672, "epoch": 0.6626183676536901, "flos": 21394823633280.0, "grad_norm": 1.6700673315170445, "language_loss": 0.72552252, "learning_rate": 1.0797045472252554e-06, "loss": 0.74675822, "num_input_tokens_seen": 237904795, "step": 11021, "time_per_iteration": 2.677899122238159 }, { "auxiliary_loss_clip": 0.01087084, "auxiliary_loss_mlp": 0.0104125, "balance_loss_clip": 1.03822863, "balance_loss_mlp": 1.02790403, "epoch": 0.662678490906358, "flos": 14571293713920.0, "grad_norm": 2.016335698142833, "language_loss": 0.83232486, "learning_rate": 1.0793587840948793e-06, "loss": 0.85360825, "num_input_tokens_seen": 237921320, "step": 11022, "time_per_iteration": 2.62428879737854 }, { "auxiliary_loss_clip": 0.01099654, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.04019356, "balance_loss_mlp": 1.01928318, "epoch": 0.662738614159026, "flos": 15992350554240.0, "grad_norm": 2.476624679148487, "language_loss": 0.72735739, "learning_rate": 1.0790130558753099e-06, "loss": 0.74869806, "num_input_tokens_seen": 237933525, "step": 11023, "time_per_iteration": 2.632291316986084 }, { "auxiliary_loss_clip": 0.01079183, "auxiliary_loss_mlp": 0.01035009, "balance_loss_clip": 1.03499722, "balance_loss_mlp": 1.02165151, "epoch": 0.6627987374116939, "flos": 19536338966400.0, "grad_norm": 1.8699789342451163, "language_loss": 0.75085115, "learning_rate": 1.0786673625796574e-06, "loss": 0.7719931, "num_input_tokens_seen": 237953395, "step": 11024, "time_per_iteration": 2.7034032344818115 }, { "auxiliary_loss_clip": 0.01083517, "auxiliary_loss_mlp": 0.01031822, "balance_loss_clip": 1.0384872, "balance_loss_mlp": 1.01755285, "epoch": 0.662858860664362, "flos": 15702838934400.0, "grad_norm": 2.491473090515614, "language_loss": 0.69829249, "learning_rate": 1.0783217042210306e-06, "loss": 0.71944588, "num_input_tokens_seen": 237971445, "step": 11025, "time_per_iteration": 2.7056894302368164 }, { "auxiliary_loss_clip": 0.01118609, "auxiliary_loss_mlp": 0.01038933, "balance_loss_clip": 1.04383016, "balance_loss_mlp": 1.02548599, "epoch": 0.6629189839170299, "flos": 20154289570560.0, "grad_norm": 1.5605432120454088, "language_loss": 0.79108787, "learning_rate": 1.0779760808125379e-06, "loss": 0.81266326, "num_input_tokens_seen": 237989965, "step": 11026, "time_per_iteration": 2.6094040870666504 }, { "auxiliary_loss_clip": 0.01104761, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.04092979, "balance_loss_mlp": 1.01790905, "epoch": 0.6629791071696979, "flos": 20915415786240.0, "grad_norm": 1.5667845463950276, "language_loss": 0.75913531, "learning_rate": 1.0776304923672842e-06, "loss": 0.7804848, "num_input_tokens_seen": 238006820, "step": 11027, "time_per_iteration": 2.6272130012512207 }, { "auxiliary_loss_clip": 0.01088271, "auxiliary_loss_mlp": 0.01038744, "balance_loss_clip": 1.03918552, "balance_loss_mlp": 1.02465403, "epoch": 0.6630392304223659, "flos": 20846898593280.0, "grad_norm": 2.126605601662929, "language_loss": 0.703035, "learning_rate": 1.0772849388983742e-06, "loss": 0.72430521, "num_input_tokens_seen": 238022560, "step": 11028, "time_per_iteration": 2.7173945903778076 }, { "auxiliary_loss_clip": 0.01103236, "auxiliary_loss_mlp": 0.01033808, "balance_loss_clip": 1.03955865, "balance_loss_mlp": 1.0220722, "epoch": 0.6630993536750338, "flos": 20995820380800.0, "grad_norm": 1.8721020554211893, "language_loss": 0.79606169, "learning_rate": 1.0769394204189138e-06, "loss": 0.81743217, "num_input_tokens_seen": 238041895, "step": 11029, "time_per_iteration": 2.5954697132110596 }, { "auxiliary_loss_clip": 0.01116256, "auxiliary_loss_mlp": 0.01035331, "balance_loss_clip": 1.03937316, "balance_loss_mlp": 1.02168214, "epoch": 0.6631594769277018, "flos": 18259032355200.0, "grad_norm": 2.1557545389807617, "language_loss": 0.76608872, "learning_rate": 1.0765939369420012e-06, "loss": 0.78760457, "num_input_tokens_seen": 238060445, "step": 11030, "time_per_iteration": 2.5441596508026123 }, { "auxiliary_loss_clip": 0.01113502, "auxiliary_loss_mlp": 0.01035912, "balance_loss_clip": 1.04352438, "balance_loss_mlp": 1.02144003, "epoch": 0.6632196001803697, "flos": 17820491207040.0, "grad_norm": 2.2370976778546803, "language_loss": 0.75485003, "learning_rate": 1.0762484884807391e-06, "loss": 0.77634418, "num_input_tokens_seen": 238077080, "step": 11031, "time_per_iteration": 2.607260227203369 }, { "auxiliary_loss_clip": 0.01106421, "auxiliary_loss_mlp": 0.01038808, "balance_loss_clip": 1.04007494, "balance_loss_mlp": 1.02508116, "epoch": 0.6632797234330378, "flos": 12670182581760.0, "grad_norm": 4.999518522839319, "language_loss": 0.74670291, "learning_rate": 1.075903075048228e-06, "loss": 0.76815522, "num_input_tokens_seen": 238091045, "step": 11032, "time_per_iteration": 2.594426393508911 }, { "auxiliary_loss_clip": 0.01072119, "auxiliary_loss_mlp": 0.01033962, "balance_loss_clip": 1.0367676, "balance_loss_mlp": 1.02086639, "epoch": 0.6633398466857057, "flos": 23584728113280.0, "grad_norm": 1.76988392785946, "language_loss": 0.80491328, "learning_rate": 1.0755576966575635e-06, "loss": 0.82597411, "num_input_tokens_seen": 238110220, "step": 11033, "time_per_iteration": 2.7742807865142822 }, { "auxiliary_loss_clip": 0.01098023, "auxiliary_loss_mlp": 0.01031641, "balance_loss_clip": 1.04221106, "balance_loss_mlp": 1.01806927, "epoch": 0.6633999699383737, "flos": 20631686256000.0, "grad_norm": 1.7697764735120445, "language_loss": 0.80480468, "learning_rate": 1.0752123533218451e-06, "loss": 0.82610136, "num_input_tokens_seen": 238130400, "step": 11034, "time_per_iteration": 2.72609543800354 }, { "auxiliary_loss_clip": 0.01098853, "auxiliary_loss_mlp": 0.01029417, "balance_loss_clip": 1.03850234, "balance_loss_mlp": 1.01725149, "epoch": 0.6634600931910416, "flos": 21797095023360.0, "grad_norm": 1.6912859958234545, "language_loss": 0.7568692, "learning_rate": 1.074867045054166e-06, "loss": 0.77815193, "num_input_tokens_seen": 238148165, "step": 11035, "time_per_iteration": 2.6851565837860107 }, { "auxiliary_loss_clip": 0.01080784, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.03555465, "balance_loss_mlp": 1.01562476, "epoch": 0.6635202164437096, "flos": 18732873594240.0, "grad_norm": 1.9570572428830155, "language_loss": 0.8271299, "learning_rate": 1.074521771867622e-06, "loss": 0.84822816, "num_input_tokens_seen": 238166360, "step": 11036, "time_per_iteration": 2.6795291900634766 }, { "auxiliary_loss_clip": 0.01034271, "auxiliary_loss_mlp": 0.01004373, "balance_loss_clip": 1.01085413, "balance_loss_mlp": 1.00327635, "epoch": 0.6635803396963775, "flos": 60222771227520.0, "grad_norm": 0.7751211897866269, "language_loss": 0.52259576, "learning_rate": 1.0741765337753044e-06, "loss": 0.54298222, "num_input_tokens_seen": 238227630, "step": 11037, "time_per_iteration": 4.7726218700408936 }, { "auxiliary_loss_clip": 0.01060431, "auxiliary_loss_mlp": 0.01041224, "balance_loss_clip": 1.03799784, "balance_loss_mlp": 1.0276525, "epoch": 0.6636404629490456, "flos": 29167041611520.0, "grad_norm": 1.5502874196412986, "language_loss": 0.79120708, "learning_rate": 1.0738313307903052e-06, "loss": 0.81222361, "num_input_tokens_seen": 238248435, "step": 11038, "time_per_iteration": 2.8115994930267334 }, { "auxiliary_loss_clip": 0.01082049, "auxiliary_loss_mlp": 0.01043022, "balance_loss_clip": 1.03767705, "balance_loss_mlp": 1.02863979, "epoch": 0.6637005862017135, "flos": 38907702766080.0, "grad_norm": 1.791707577314863, "language_loss": 0.63976014, "learning_rate": 1.073486162925716e-06, "loss": 0.66101086, "num_input_tokens_seen": 238268755, "step": 11039, "time_per_iteration": 2.8266031742095947 }, { "auxiliary_loss_clip": 0.0107412, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.03814256, "balance_loss_mlp": 1.01877952, "epoch": 0.6637607094543815, "flos": 22783345729920.0, "grad_norm": 1.6823578045159262, "language_loss": 0.63401222, "learning_rate": 1.0731410301946237e-06, "loss": 0.65507191, "num_input_tokens_seen": 238290120, "step": 11040, "time_per_iteration": 2.705897569656372 }, { "auxiliary_loss_clip": 0.01074324, "auxiliary_loss_mlp": 0.01044015, "balance_loss_clip": 1.03504574, "balance_loss_mlp": 1.02977514, "epoch": 0.6638208327070495, "flos": 18114096977280.0, "grad_norm": 1.8896789484535716, "language_loss": 0.71718216, "learning_rate": 1.0727959326101161e-06, "loss": 0.73836553, "num_input_tokens_seen": 238309290, "step": 11041, "time_per_iteration": 4.213087320327759 }, { "auxiliary_loss_clip": 0.01097087, "auxiliary_loss_mlp": 0.01048475, "balance_loss_clip": 1.03642857, "balance_loss_mlp": 1.03349042, "epoch": 0.6638809559597174, "flos": 29424880414080.0, "grad_norm": 2.2565600398451795, "language_loss": 0.61915213, "learning_rate": 1.0724508701852806e-06, "loss": 0.64060771, "num_input_tokens_seen": 238327280, "step": 11042, "time_per_iteration": 4.279943943023682 }, { "auxiliary_loss_clip": 0.011055, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.03810656, "balance_loss_mlp": 1.01686156, "epoch": 0.6639410792123854, "flos": 28072699902720.0, "grad_norm": 2.105682360594448, "language_loss": 0.68285942, "learning_rate": 1.0721058429331998e-06, "loss": 0.7042259, "num_input_tokens_seen": 238346330, "step": 11043, "time_per_iteration": 2.6422598361968994 }, { "auxiliary_loss_clip": 0.01101764, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.04116786, "balance_loss_mlp": 1.018767, "epoch": 0.6640012024650533, "flos": 25556367600000.0, "grad_norm": 1.5365440611155503, "language_loss": 0.83934712, "learning_rate": 1.0717608508669587e-06, "loss": 0.8606683, "num_input_tokens_seen": 238364650, "step": 11044, "time_per_iteration": 2.732520341873169 }, { "auxiliary_loss_clip": 0.01073049, "auxiliary_loss_mlp": 0.01031878, "balance_loss_clip": 1.03586829, "balance_loss_mlp": 1.0185442, "epoch": 0.6640613257177214, "flos": 14866946559360.0, "grad_norm": 2.1294485287076315, "language_loss": 0.6951791, "learning_rate": 1.0714158939996392e-06, "loss": 0.71622837, "num_input_tokens_seen": 238381630, "step": 11045, "time_per_iteration": 2.6816322803497314 }, { "auxiliary_loss_clip": 0.01104183, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.04048705, "balance_loss_mlp": 1.0148927, "epoch": 0.6641214489703893, "flos": 23221096778880.0, "grad_norm": 2.227953338696249, "language_loss": 0.64640826, "learning_rate": 1.0710709723443235e-06, "loss": 0.66772592, "num_input_tokens_seen": 238402595, "step": 11046, "time_per_iteration": 4.160333156585693 }, { "auxiliary_loss_clip": 0.01085109, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.03931284, "balance_loss_mlp": 1.01488853, "epoch": 0.6641815722230573, "flos": 37742617221120.0, "grad_norm": 1.6669339663762488, "language_loss": 0.71004307, "learning_rate": 1.070726085914088e-06, "loss": 0.73117387, "num_input_tokens_seen": 238426860, "step": 11047, "time_per_iteration": 2.8554368019104004 }, { "auxiliary_loss_clip": 0.01049735, "auxiliary_loss_mlp": 0.01036523, "balance_loss_clip": 1.04015899, "balance_loss_mlp": 1.02316511, "epoch": 0.6642416954757252, "flos": 17931132074880.0, "grad_norm": 1.8883257209384914, "language_loss": 0.77274108, "learning_rate": 1.0703812347220126e-06, "loss": 0.79360354, "num_input_tokens_seen": 238443990, "step": 11048, "time_per_iteration": 2.755452871322632 }, { "auxiliary_loss_clip": 0.01010482, "auxiliary_loss_mlp": 0.01002664, "balance_loss_clip": 1.01594365, "balance_loss_mlp": 1.00137699, "epoch": 0.6643018187283932, "flos": 51995384104320.0, "grad_norm": 0.747851272534148, "language_loss": 0.55009979, "learning_rate": 1.0700364187811745e-06, "loss": 0.57023126, "num_input_tokens_seen": 238503045, "step": 11049, "time_per_iteration": 3.232647180557251 }, { "auxiliary_loss_clip": 0.01103139, "auxiliary_loss_mlp": 0.01032284, "balance_loss_clip": 1.04035759, "balance_loss_mlp": 1.02035105, "epoch": 0.6643619419810611, "flos": 30226657847040.0, "grad_norm": 1.8987287972691187, "language_loss": 0.63542056, "learning_rate": 1.069691638104648e-06, "loss": 0.65677476, "num_input_tokens_seen": 238527320, "step": 11050, "time_per_iteration": 2.712871551513672 }, { "auxiliary_loss_clip": 0.01110292, "auxiliary_loss_mlp": 0.01033953, "balance_loss_clip": 1.03804648, "balance_loss_mlp": 1.02145386, "epoch": 0.6644220652337292, "flos": 22966131064320.0, "grad_norm": 2.56878578960884, "language_loss": 0.78747934, "learning_rate": 1.0693468927055085e-06, "loss": 0.80892181, "num_input_tokens_seen": 238546030, "step": 11051, "time_per_iteration": 2.5602593421936035 }, { "auxiliary_loss_clip": 0.01090775, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.04075074, "balance_loss_mlp": 1.02409577, "epoch": 0.6644821884863971, "flos": 21142228216320.0, "grad_norm": 1.6830071971795009, "language_loss": 0.85365808, "learning_rate": 1.0690021825968276e-06, "loss": 0.87493503, "num_input_tokens_seen": 238564175, "step": 11052, "time_per_iteration": 2.6400978565216064 }, { "auxiliary_loss_clip": 0.0106864, "auxiliary_loss_mlp": 0.01035785, "balance_loss_clip": 1.03640008, "balance_loss_mlp": 1.02115774, "epoch": 0.6645423117390651, "flos": 20192821885440.0, "grad_norm": 2.468702862512036, "language_loss": 0.7442345, "learning_rate": 1.0686575077916776e-06, "loss": 0.7652787, "num_input_tokens_seen": 238581010, "step": 11053, "time_per_iteration": 2.7525177001953125 }, { "auxiliary_loss_clip": 0.01081443, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.03704178, "balance_loss_mlp": 1.01803088, "epoch": 0.6646024349917331, "flos": 24351959640960.0, "grad_norm": 1.6350550334521685, "language_loss": 0.7937814, "learning_rate": 1.0683128683031278e-06, "loss": 0.81490058, "num_input_tokens_seen": 238601365, "step": 11054, "time_per_iteration": 2.6874406337738037 }, { "auxiliary_loss_clip": 0.01067976, "auxiliary_loss_mlp": 0.01035163, "balance_loss_clip": 1.03798532, "balance_loss_mlp": 1.02267623, "epoch": 0.664662558244401, "flos": 18806706000000.0, "grad_norm": 1.6423825875919162, "language_loss": 0.73928297, "learning_rate": 1.0679682641442472e-06, "loss": 0.76031435, "num_input_tokens_seen": 238619850, "step": 11055, "time_per_iteration": 2.733832597732544 }, { "auxiliary_loss_clip": 0.01082031, "auxiliary_loss_mlp": 0.01043702, "balance_loss_clip": 1.03823996, "balance_loss_mlp": 1.02983165, "epoch": 0.664722681497069, "flos": 18952790613120.0, "grad_norm": 1.8844406603153, "language_loss": 0.7300725, "learning_rate": 1.0676236953281042e-06, "loss": 0.75132978, "num_input_tokens_seen": 238637635, "step": 11056, "time_per_iteration": 2.6787209510803223 }, { "auxiliary_loss_clip": 0.01069462, "auxiliary_loss_mlp": 0.01036287, "balance_loss_clip": 1.0367837, "balance_loss_mlp": 1.02314389, "epoch": 0.6647828047497369, "flos": 19571279921280.0, "grad_norm": 3.230794817750296, "language_loss": 0.69325733, "learning_rate": 1.0672791618677641e-06, "loss": 0.71431488, "num_input_tokens_seen": 238656200, "step": 11057, "time_per_iteration": 2.749843120574951 }, { "auxiliary_loss_clip": 0.01103707, "auxiliary_loss_mlp": 0.01033971, "balance_loss_clip": 1.03987014, "balance_loss_mlp": 1.0206548, "epoch": 0.664842928002405, "flos": 23149455102720.0, "grad_norm": 1.6131185292636203, "language_loss": 0.80123711, "learning_rate": 1.066934663776291e-06, "loss": 0.82261384, "num_input_tokens_seen": 238675005, "step": 11058, "time_per_iteration": 2.6598408222198486 }, { "auxiliary_loss_clip": 0.01008973, "auxiliary_loss_mlp": 0.01008338, "balance_loss_clip": 1.01433647, "balance_loss_mlp": 1.00715828, "epoch": 0.6649030512550729, "flos": 65244913148160.0, "grad_norm": 0.802003162122869, "language_loss": 0.62611187, "learning_rate": 1.0665902010667496e-06, "loss": 0.64628494, "num_input_tokens_seen": 238731425, "step": 11059, "time_per_iteration": 3.12062668800354 }, { "auxiliary_loss_clip": 0.01102046, "auxiliary_loss_mlp": 0.01038723, "balance_loss_clip": 1.03967965, "balance_loss_mlp": 1.026546, "epoch": 0.6649631745077409, "flos": 20194797133440.0, "grad_norm": 1.442710173280966, "language_loss": 0.7869736, "learning_rate": 1.0662457737522008e-06, "loss": 0.80838132, "num_input_tokens_seen": 238752020, "step": 11060, "time_per_iteration": 2.776430606842041 }, { "auxiliary_loss_clip": 0.01082742, "auxiliary_loss_mlp": 0.01038039, "balance_loss_clip": 1.03887463, "balance_loss_mlp": 1.02412772, "epoch": 0.6650232977604088, "flos": 17238558965760.0, "grad_norm": 1.6774634080954063, "language_loss": 0.78738892, "learning_rate": 1.0659013818457055e-06, "loss": 0.80859673, "num_input_tokens_seen": 238769665, "step": 11061, "time_per_iteration": 2.6786346435546875 }, { "auxiliary_loss_clip": 0.01092682, "auxiliary_loss_mlp": 0.01030082, "balance_loss_clip": 1.0417732, "balance_loss_mlp": 1.01765454, "epoch": 0.6650834210130768, "flos": 10006867825920.0, "grad_norm": 2.291207929066884, "language_loss": 0.56939697, "learning_rate": 1.0655570253603243e-06, "loss": 0.59062469, "num_input_tokens_seen": 238782180, "step": 11062, "time_per_iteration": 2.6440412998199463 }, { "auxiliary_loss_clip": 0.01100317, "auxiliary_loss_mlp": 0.01037299, "balance_loss_clip": 1.03600216, "balance_loss_mlp": 1.02142608, "epoch": 0.6651435442657447, "flos": 10452088903680.0, "grad_norm": 1.8230256032266374, "language_loss": 0.75959098, "learning_rate": 1.0652127043091144e-06, "loss": 0.78096718, "num_input_tokens_seen": 238800315, "step": 11063, "time_per_iteration": 2.592930555343628 }, { "auxiliary_loss_clip": 0.01056354, "auxiliary_loss_mlp": 0.01044348, "balance_loss_clip": 1.03860426, "balance_loss_mlp": 1.03033507, "epoch": 0.6652036675184128, "flos": 22344229964160.0, "grad_norm": 1.2698232462033214, "language_loss": 0.70678842, "learning_rate": 1.0648684187051316e-06, "loss": 0.72779548, "num_input_tokens_seen": 238822250, "step": 11064, "time_per_iteration": 2.800218105316162 }, { "auxiliary_loss_clip": 0.01032183, "auxiliary_loss_mlp": 0.01006383, "balance_loss_clip": 1.00864732, "balance_loss_mlp": 1.00513113, "epoch": 0.6652637907710807, "flos": 52909633998720.0, "grad_norm": 0.8463523903026119, "language_loss": 0.629758, "learning_rate": 1.0645241685614322e-06, "loss": 0.65014362, "num_input_tokens_seen": 238877190, "step": 11065, "time_per_iteration": 3.1035780906677246 }, { "auxiliary_loss_clip": 0.01099093, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.03762209, "balance_loss_mlp": 1.02464366, "epoch": 0.6653239140237487, "flos": 23104637907840.0, "grad_norm": 1.610155502063491, "language_loss": 0.62464315, "learning_rate": 1.0641799538910708e-06, "loss": 0.64603138, "num_input_tokens_seen": 238896010, "step": 11066, "time_per_iteration": 2.6371681690216064 }, { "auxiliary_loss_clip": 0.01074468, "auxiliary_loss_mlp": 0.01041028, "balance_loss_clip": 1.03320074, "balance_loss_mlp": 1.02528, "epoch": 0.6653840372764167, "flos": 25959393175680.0, "grad_norm": 1.5735109104273866, "language_loss": 0.70316392, "learning_rate": 1.0638357747070985e-06, "loss": 0.72431886, "num_input_tokens_seen": 238918990, "step": 11067, "time_per_iteration": 2.712170362472534 }, { "auxiliary_loss_clip": 0.01015121, "auxiliary_loss_mlp": 0.0100891, "balance_loss_clip": 1.01019919, "balance_loss_mlp": 1.00739563, "epoch": 0.6654441605290846, "flos": 66041985899520.0, "grad_norm": 0.9248325292472583, "language_loss": 0.72063255, "learning_rate": 1.0634916310225684e-06, "loss": 0.74087286, "num_input_tokens_seen": 238975735, "step": 11068, "time_per_iteration": 3.188148021697998 }, { "auxiliary_loss_clip": 0.01006694, "auxiliary_loss_mlp": 0.01006942, "balance_loss_clip": 1.01129699, "balance_loss_mlp": 1.00560117, "epoch": 0.6655042837817526, "flos": 65196112521600.0, "grad_norm": 0.8265951746379137, "language_loss": 0.57727754, "learning_rate": 1.0631475228505285e-06, "loss": 0.5974139, "num_input_tokens_seen": 239042360, "step": 11069, "time_per_iteration": 3.3526012897491455 }, { "auxiliary_loss_clip": 0.01011659, "auxiliary_loss_mlp": 0.0100159, "balance_loss_clip": 1.00811982, "balance_loss_mlp": 1.00046349, "epoch": 0.6655644070344205, "flos": 69008746752000.0, "grad_norm": 0.7554433068818003, "language_loss": 0.63502038, "learning_rate": 1.062803450204029e-06, "loss": 0.65515292, "num_input_tokens_seen": 239109410, "step": 11070, "time_per_iteration": 3.189624071121216 }, { "auxiliary_loss_clip": 0.0111185, "auxiliary_loss_mlp": 0.01029573, "balance_loss_clip": 1.03767705, "balance_loss_mlp": 1.01622725, "epoch": 0.6656245302870886, "flos": 36315562809600.0, "grad_norm": 1.5957526683817405, "language_loss": 0.58635205, "learning_rate": 1.062459413096116e-06, "loss": 0.60776627, "num_input_tokens_seen": 239135345, "step": 11071, "time_per_iteration": 2.7373464107513428 }, { "auxiliary_loss_clip": 0.01107113, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.04254675, "balance_loss_mlp": 1.01822627, "epoch": 0.6656846535397565, "flos": 21794832466560.0, "grad_norm": 1.7851142792546852, "language_loss": 0.72693968, "learning_rate": 1.0621154115398364e-06, "loss": 0.74831653, "num_input_tokens_seen": 239154340, "step": 11072, "time_per_iteration": 2.6327590942382812 }, { "auxiliary_loss_clip": 0.01103867, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.04155874, "balance_loss_mlp": 1.01859617, "epoch": 0.6657447767924245, "flos": 37487615592960.0, "grad_norm": 1.879864387077726, "language_loss": 0.70789611, "learning_rate": 1.0617714455482353e-06, "loss": 0.72926104, "num_input_tokens_seen": 239177815, "step": 11073, "time_per_iteration": 2.704252243041992 }, { "auxiliary_loss_clip": 0.01084232, "auxiliary_loss_mlp": 0.0103114, "balance_loss_clip": 1.03998876, "balance_loss_mlp": 1.01784229, "epoch": 0.6658049000450924, "flos": 16837688206080.0, "grad_norm": 2.6568090318066475, "language_loss": 0.56073666, "learning_rate": 1.061427515134354e-06, "loss": 0.5818904, "num_input_tokens_seen": 239195735, "step": 11074, "time_per_iteration": 2.6551811695098877 }, { "auxiliary_loss_clip": 0.01116885, "auxiliary_loss_mlp": 0.00770661, "balance_loss_clip": 1.04282713, "balance_loss_mlp": 1.00006819, "epoch": 0.6658650232977604, "flos": 33510975863040.0, "grad_norm": 1.424580138870233, "language_loss": 0.7252624, "learning_rate": 1.061083620311235e-06, "loss": 0.74413788, "num_input_tokens_seen": 239217535, "step": 11075, "time_per_iteration": 2.7062625885009766 }, { "auxiliary_loss_clip": 0.01100028, "auxiliary_loss_mlp": 0.01032017, "balance_loss_clip": 1.03886509, "balance_loss_mlp": 1.01981592, "epoch": 0.6659251465504283, "flos": 37706311549440.0, "grad_norm": 1.4599897176092982, "language_loss": 0.66246772, "learning_rate": 1.0607397610919202e-06, "loss": 0.68378824, "num_input_tokens_seen": 239241975, "step": 11076, "time_per_iteration": 2.804659605026245 }, { "auxiliary_loss_clip": 0.01087468, "auxiliary_loss_mlp": 0.01032867, "balance_loss_clip": 1.03394532, "balance_loss_mlp": 1.01870489, "epoch": 0.6659852698030964, "flos": 24893420232960.0, "grad_norm": 1.6180459271945493, "language_loss": 0.75299704, "learning_rate": 1.0603959374894468e-06, "loss": 0.77420044, "num_input_tokens_seen": 239262025, "step": 11077, "time_per_iteration": 4.274590253829956 }, { "auxiliary_loss_clip": 0.0108965, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.0374043, "balance_loss_mlp": 1.01802957, "epoch": 0.6660453930557643, "flos": 24352821567360.0, "grad_norm": 1.5713803954899295, "language_loss": 0.66825247, "learning_rate": 1.0600521495168538e-06, "loss": 0.68946046, "num_input_tokens_seen": 239282775, "step": 11078, "time_per_iteration": 2.7334680557250977 }, { "auxiliary_loss_clip": 0.01115428, "auxiliary_loss_mlp": 0.01033982, "balance_loss_clip": 1.03945637, "balance_loss_mlp": 1.01990974, "epoch": 0.6661055163084323, "flos": 10597814380800.0, "grad_norm": 2.400926553792791, "language_loss": 0.69900686, "learning_rate": 1.0597083971871783e-06, "loss": 0.72050095, "num_input_tokens_seen": 239299775, "step": 11079, "time_per_iteration": 2.6223835945129395 }, { "auxiliary_loss_clip": 0.01089448, "auxiliary_loss_mlp": 0.01030137, "balance_loss_clip": 1.03717136, "balance_loss_mlp": 1.01738119, "epoch": 0.6661656395611003, "flos": 24057491944320.0, "grad_norm": 1.61546827465866, "language_loss": 0.80478466, "learning_rate": 1.0593646805134544e-06, "loss": 0.82598048, "num_input_tokens_seen": 239319660, "step": 11080, "time_per_iteration": 4.228775978088379 }, { "auxiliary_loss_clip": 0.01075927, "auxiliary_loss_mlp": 0.01033579, "balance_loss_clip": 1.03583407, "balance_loss_mlp": 1.02147329, "epoch": 0.6662257628137682, "flos": 23036192542080.0, "grad_norm": 1.8384302926010723, "language_loss": 0.78062707, "learning_rate": 1.0590209995087157e-06, "loss": 0.80172205, "num_input_tokens_seen": 239339215, "step": 11081, "time_per_iteration": 4.32209324836731 }, { "auxiliary_loss_clip": 0.01076143, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.03748226, "balance_loss_mlp": 1.02387714, "epoch": 0.6662858860664362, "flos": 24754446512640.0, "grad_norm": 1.6809862267344533, "language_loss": 0.80329323, "learning_rate": 1.0586773541859946e-06, "loss": 0.82445014, "num_input_tokens_seen": 239358545, "step": 11082, "time_per_iteration": 2.7251505851745605 }, { "auxiliary_loss_clip": 0.01076739, "auxiliary_loss_mlp": 0.01033286, "balance_loss_clip": 1.04017997, "balance_loss_mlp": 1.02098405, "epoch": 0.6663460093191041, "flos": 20009066883840.0, "grad_norm": 1.4477945081554633, "language_loss": 0.83849418, "learning_rate": 1.0583337445583234e-06, "loss": 0.85959446, "num_input_tokens_seen": 239376665, "step": 11083, "time_per_iteration": 2.669404983520508 }, { "auxiliary_loss_clip": 0.01079397, "auxiliary_loss_mlp": 0.01036057, "balance_loss_clip": 1.04023921, "balance_loss_mlp": 1.02203834, "epoch": 0.6664061325717722, "flos": 17821389047040.0, "grad_norm": 2.7255574695502216, "language_loss": 0.85510308, "learning_rate": 1.057990170638731e-06, "loss": 0.87625766, "num_input_tokens_seen": 239394345, "step": 11084, "time_per_iteration": 2.663749933242798 }, { "auxiliary_loss_clip": 0.01094685, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.03857958, "balance_loss_mlp": 1.01727629, "epoch": 0.6664662558244401, "flos": 18076893465600.0, "grad_norm": 2.199200259512602, "language_loss": 0.73457599, "learning_rate": 1.0576466324402452e-06, "loss": 0.75583529, "num_input_tokens_seen": 239410605, "step": 11085, "time_per_iteration": 4.193335771560669 }, { "auxiliary_loss_clip": 0.01087888, "auxiliary_loss_mlp": 0.01031014, "balance_loss_clip": 1.03528535, "balance_loss_mlp": 1.01760268, "epoch": 0.6665263790771081, "flos": 21574197175680.0, "grad_norm": 1.9746802098097909, "language_loss": 0.80359179, "learning_rate": 1.057303129975894e-06, "loss": 0.82478082, "num_input_tokens_seen": 239427155, "step": 11086, "time_per_iteration": 2.6708765029907227 }, { "auxiliary_loss_clip": 0.01090857, "auxiliary_loss_mlp": 0.01032379, "balance_loss_clip": 1.03936315, "balance_loss_mlp": 1.018646, "epoch": 0.666586502329776, "flos": 24206629213440.0, "grad_norm": 1.7936971088038383, "language_loss": 0.74496621, "learning_rate": 1.056959663258702e-06, "loss": 0.76619852, "num_input_tokens_seen": 239445510, "step": 11087, "time_per_iteration": 2.7366881370544434 }, { "auxiliary_loss_clip": 0.01101311, "auxiliary_loss_mlp": 0.01035633, "balance_loss_clip": 1.03835797, "balance_loss_mlp": 1.02250183, "epoch": 0.666646625582444, "flos": 22200515648640.0, "grad_norm": 1.692056233669114, "language_loss": 0.64937711, "learning_rate": 1.0566162323016939e-06, "loss": 0.67074656, "num_input_tokens_seen": 239464805, "step": 11088, "time_per_iteration": 2.652937412261963 }, { "auxiliary_loss_clip": 0.01099844, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.03648591, "balance_loss_mlp": 1.01637387, "epoch": 0.6667067488351119, "flos": 18259930195200.0, "grad_norm": 2.239140495962673, "language_loss": 0.64203691, "learning_rate": 1.0562728371178928e-06, "loss": 0.66333294, "num_input_tokens_seen": 239483890, "step": 11089, "time_per_iteration": 2.6637988090515137 }, { "auxiliary_loss_clip": 0.01113447, "auxiliary_loss_mlp": 0.01031785, "balance_loss_clip": 1.03998184, "balance_loss_mlp": 1.01876771, "epoch": 0.66676687208778, "flos": 17236547804160.0, "grad_norm": 2.535090345981802, "language_loss": 0.80804038, "learning_rate": 1.0559294777203221e-06, "loss": 0.82949275, "num_input_tokens_seen": 239500080, "step": 11090, "time_per_iteration": 2.581758737564087 }, { "auxiliary_loss_clip": 0.01092289, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.03686905, "balance_loss_mlp": 1.02217007, "epoch": 0.6668269953404479, "flos": 19752197748480.0, "grad_norm": 1.9927096976475185, "language_loss": 0.77528715, "learning_rate": 1.0555861541219984e-06, "loss": 0.79656601, "num_input_tokens_seen": 239517335, "step": 11091, "time_per_iteration": 2.673798084259033 }, { "auxiliary_loss_clip": 0.01114388, "auxiliary_loss_mlp": 0.01033443, "balance_loss_clip": 1.04016709, "balance_loss_mlp": 1.02024066, "epoch": 0.6668871185931159, "flos": 20558428467840.0, "grad_norm": 1.9227343143607547, "language_loss": 0.79361308, "learning_rate": 1.0552428663359425e-06, "loss": 0.81509137, "num_input_tokens_seen": 239536240, "step": 11092, "time_per_iteration": 2.6783652305603027 }, { "auxiliary_loss_clip": 0.01010839, "auxiliary_loss_mlp": 0.01001852, "balance_loss_clip": 1.01392734, "balance_loss_mlp": 1.00064206, "epoch": 0.6669472418457839, "flos": 58088167735680.0, "grad_norm": 1.5742465893545905, "language_loss": 0.57764924, "learning_rate": 1.0548996143751724e-06, "loss": 0.59777617, "num_input_tokens_seen": 239598000, "step": 11093, "time_per_iteration": 3.25225567817688 }, { "auxiliary_loss_clip": 0.011126, "auxiliary_loss_mlp": 0.01032323, "balance_loss_clip": 1.03999138, "balance_loss_mlp": 1.01957977, "epoch": 0.6670073650984518, "flos": 26065113880320.0, "grad_norm": 1.5604249547045095, "language_loss": 0.76737595, "learning_rate": 1.054556398252703e-06, "loss": 0.78882521, "num_input_tokens_seen": 239617650, "step": 11094, "time_per_iteration": 2.6441400051116943 }, { "auxiliary_loss_clip": 0.01114242, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.03926766, "balance_loss_mlp": 1.02063107, "epoch": 0.6670674883511198, "flos": 32416849635840.0, "grad_norm": 1.725805849880736, "language_loss": 0.73280704, "learning_rate": 1.05421321798155e-06, "loss": 0.75429583, "num_input_tokens_seen": 239639825, "step": 11095, "time_per_iteration": 2.6807525157928467 }, { "auxiliary_loss_clip": 0.01100599, "auxiliary_loss_mlp": 0.01038236, "balance_loss_clip": 1.03832078, "balance_loss_mlp": 1.02496827, "epoch": 0.6671276116037878, "flos": 18037786533120.0, "grad_norm": 1.9301541125816652, "language_loss": 0.73262459, "learning_rate": 1.053870073574727e-06, "loss": 0.75401294, "num_input_tokens_seen": 239656300, "step": 11096, "time_per_iteration": 2.568824052810669 }, { "auxiliary_loss_clip": 0.01069521, "auxiliary_loss_mlp": 0.01032338, "balance_loss_clip": 1.03659153, "balance_loss_mlp": 1.01915956, "epoch": 0.6671877348564558, "flos": 23767046570880.0, "grad_norm": 2.803880463620154, "language_loss": 0.64528841, "learning_rate": 1.0535269650452456e-06, "loss": 0.66630697, "num_input_tokens_seen": 239676655, "step": 11097, "time_per_iteration": 2.7534751892089844 }, { "auxiliary_loss_clip": 0.01101343, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.03823709, "balance_loss_mlp": 1.0242486, "epoch": 0.6672478581091237, "flos": 20918360701440.0, "grad_norm": 1.9121192931903639, "language_loss": 0.75842595, "learning_rate": 1.0531838924061158e-06, "loss": 0.77980542, "num_input_tokens_seen": 239695430, "step": 11098, "time_per_iteration": 2.6095056533813477 }, { "auxiliary_loss_clip": 0.01115287, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.04045045, "balance_loss_mlp": 1.02328897, "epoch": 0.6673079813617917, "flos": 27855799626240.0, "grad_norm": 1.5630193182693057, "language_loss": 0.74190086, "learning_rate": 1.0528408556703476e-06, "loss": 0.76340902, "num_input_tokens_seen": 239717070, "step": 11099, "time_per_iteration": 2.673234224319458 }, { "auxiliary_loss_clip": 0.01098732, "auxiliary_loss_mlp": 0.01036855, "balance_loss_clip": 1.03607726, "balance_loss_mlp": 1.02412391, "epoch": 0.6673681046144596, "flos": 21616859554560.0, "grad_norm": 1.7967972361910232, "language_loss": 0.78233874, "learning_rate": 1.0524978548509502e-06, "loss": 0.80369455, "num_input_tokens_seen": 239737105, "step": 11100, "time_per_iteration": 2.637829303741455 }, { "auxiliary_loss_clip": 0.01112293, "auxiliary_loss_mlp": 0.01037899, "balance_loss_clip": 1.03913033, "balance_loss_mlp": 1.02564454, "epoch": 0.6674282278671276, "flos": 20889884194560.0, "grad_norm": 3.1933899226541804, "language_loss": 0.60124767, "learning_rate": 1.0521548899609288e-06, "loss": 0.62274957, "num_input_tokens_seen": 239757835, "step": 11101, "time_per_iteration": 2.649627685546875 }, { "auxiliary_loss_clip": 0.01098761, "auxiliary_loss_mlp": 0.01034633, "balance_loss_clip": 1.03970337, "balance_loss_mlp": 1.02054238, "epoch": 0.6674883511197955, "flos": 23624194181760.0, "grad_norm": 2.1447614079629362, "language_loss": 0.71100485, "learning_rate": 1.0518119610132884e-06, "loss": 0.73233879, "num_input_tokens_seen": 239775425, "step": 11102, "time_per_iteration": 2.7131104469299316 }, { "auxiliary_loss_clip": 0.01103363, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.03698874, "balance_loss_mlp": 1.01878357, "epoch": 0.6675484743724636, "flos": 19609668581760.0, "grad_norm": 1.3386493038394256, "language_loss": 0.84490895, "learning_rate": 1.051469068021034e-06, "loss": 0.8662588, "num_input_tokens_seen": 239794605, "step": 11103, "time_per_iteration": 2.630141496658325 }, { "auxiliary_loss_clip": 0.01091051, "auxiliary_loss_mlp": 0.01027938, "balance_loss_clip": 1.03639507, "balance_loss_mlp": 1.01571894, "epoch": 0.6676085976251315, "flos": 14319452482560.0, "grad_norm": 1.8538250094473767, "language_loss": 0.77889514, "learning_rate": 1.0511262109971668e-06, "loss": 0.80008507, "num_input_tokens_seen": 239812135, "step": 11104, "time_per_iteration": 2.7340710163116455 }, { "auxiliary_loss_clip": 0.01067144, "auxiliary_loss_mlp": 0.01029925, "balance_loss_clip": 1.03659081, "balance_loss_mlp": 1.01740217, "epoch": 0.6676687208777995, "flos": 38104596529920.0, "grad_norm": 5.138036678415969, "language_loss": 0.58146316, "learning_rate": 1.0507833899546889e-06, "loss": 0.60243386, "num_input_tokens_seen": 239835845, "step": 11105, "time_per_iteration": 2.882567882537842 }, { "auxiliary_loss_clip": 0.01107097, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.03966367, "balance_loss_mlp": 1.02172112, "epoch": 0.6677288441304675, "flos": 23981576549760.0, "grad_norm": 6.297152012729004, "language_loss": 0.73476273, "learning_rate": 1.0504406049066e-06, "loss": 0.75619453, "num_input_tokens_seen": 239853820, "step": 11106, "time_per_iteration": 2.6627464294433594 }, { "auxiliary_loss_clip": 0.01113601, "auxiliary_loss_mlp": 0.0103128, "balance_loss_clip": 1.0392319, "balance_loss_mlp": 1.01777363, "epoch": 0.6677889673831354, "flos": 24170682677760.0, "grad_norm": 1.6820711130448331, "language_loss": 0.76552516, "learning_rate": 1.0500978558659e-06, "loss": 0.78697395, "num_input_tokens_seen": 239873365, "step": 11107, "time_per_iteration": 2.655085325241089 }, { "auxiliary_loss_clip": 0.01089336, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.03778529, "balance_loss_mlp": 1.01969552, "epoch": 0.6678490906358034, "flos": 22309648145280.0, "grad_norm": 2.264065486271505, "language_loss": 0.90136391, "learning_rate": 1.049755142845583e-06, "loss": 0.92258334, "num_input_tokens_seen": 239891215, "step": 11108, "time_per_iteration": 2.7129766941070557 }, { "auxiliary_loss_clip": 0.01083707, "auxiliary_loss_mlp": 0.01029485, "balance_loss_clip": 1.04215026, "balance_loss_mlp": 1.01795101, "epoch": 0.6679092138884714, "flos": 36898752026880.0, "grad_norm": 1.413392892629677, "language_loss": 0.82960904, "learning_rate": 1.049412465858646e-06, "loss": 0.85074097, "num_input_tokens_seen": 239913490, "step": 11109, "time_per_iteration": 2.867154121398926 }, { "auxiliary_loss_clip": 0.01087234, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.03826952, "balance_loss_mlp": 1.02132595, "epoch": 0.6679693371411394, "flos": 18150294908160.0, "grad_norm": 2.421344403388597, "language_loss": 0.70021516, "learning_rate": 1.0490698249180847e-06, "loss": 0.72143853, "num_input_tokens_seen": 239931565, "step": 11110, "time_per_iteration": 2.6291885375976562 }, { "auxiliary_loss_clip": 0.01087492, "auxiliary_loss_mlp": 0.01037588, "balance_loss_clip": 1.03955197, "balance_loss_mlp": 1.02289498, "epoch": 0.6680294603938073, "flos": 27198167472000.0, "grad_norm": 1.5840834743354089, "language_loss": 0.73441553, "learning_rate": 1.04872722003689e-06, "loss": 0.75566632, "num_input_tokens_seen": 239952395, "step": 11111, "time_per_iteration": 2.677231788635254 }, { "auxiliary_loss_clip": 0.01110772, "auxiliary_loss_mlp": 0.01031184, "balance_loss_clip": 1.0385406, "balance_loss_mlp": 1.01907229, "epoch": 0.6680895836464753, "flos": 21725309692800.0, "grad_norm": 6.009810258732459, "language_loss": 0.65599185, "learning_rate": 1.0483846512280553e-06, "loss": 0.67741144, "num_input_tokens_seen": 239968910, "step": 11112, "time_per_iteration": 2.5904297828674316 }, { "auxiliary_loss_clip": 0.01086609, "auxiliary_loss_mlp": 0.01030827, "balance_loss_clip": 1.03706861, "balance_loss_mlp": 1.01755929, "epoch": 0.6681497068991432, "flos": 19646477043840.0, "grad_norm": 1.8628832000622026, "language_loss": 0.6369822, "learning_rate": 1.048042118504569e-06, "loss": 0.65815663, "num_input_tokens_seen": 239987680, "step": 11113, "time_per_iteration": 2.623263359069824 }, { "auxiliary_loss_clip": 0.01072141, "auxiliary_loss_mlp": 0.01037464, "balance_loss_clip": 1.04164052, "balance_loss_mlp": 1.02563202, "epoch": 0.6682098301518112, "flos": 17419153570560.0, "grad_norm": 1.9634476729216852, "language_loss": 0.6540277, "learning_rate": 1.047699621879422e-06, "loss": 0.67512381, "num_input_tokens_seen": 240005790, "step": 11114, "time_per_iteration": 2.865252733230591 }, { "auxiliary_loss_clip": 0.0110424, "auxiliary_loss_mlp": 0.0103987, "balance_loss_clip": 1.03883052, "balance_loss_mlp": 1.0267992, "epoch": 0.6682699534044791, "flos": 22599016110720.0, "grad_norm": 1.6562280172476918, "language_loss": 0.78432989, "learning_rate": 1.0473571613655998e-06, "loss": 0.80577099, "num_input_tokens_seen": 240025895, "step": 11115, "time_per_iteration": 2.7281594276428223 }, { "auxiliary_loss_clip": 0.0105862, "auxiliary_loss_mlp": 0.00771764, "balance_loss_clip": 1.0309999, "balance_loss_mlp": 1.00021195, "epoch": 0.6683300766571472, "flos": 24863686750080.0, "grad_norm": 1.6526033494173815, "language_loss": 0.79655063, "learning_rate": 1.0470147369760896e-06, "loss": 0.81485444, "num_input_tokens_seen": 240044880, "step": 11116, "time_per_iteration": 4.51043963432312 }, { "auxiliary_loss_clip": 0.01084566, "auxiliary_loss_mlp": 0.01036408, "balance_loss_clip": 1.03999686, "balance_loss_mlp": 1.02240658, "epoch": 0.6683901999098151, "flos": 27126633536640.0, "grad_norm": 2.4411111020753347, "language_loss": 0.7904433, "learning_rate": 1.0466723487238768e-06, "loss": 0.81165314, "num_input_tokens_seen": 240065785, "step": 11117, "time_per_iteration": 2.748905897140503 }, { "auxiliary_loss_clip": 0.01069081, "auxiliary_loss_mlp": 0.01033165, "balance_loss_clip": 1.03828013, "balance_loss_mlp": 1.01844835, "epoch": 0.6684503231624831, "flos": 20739023072640.0, "grad_norm": 3.4807828142340815, "language_loss": 0.65610313, "learning_rate": 1.0463299966219441e-06, "loss": 0.67712557, "num_input_tokens_seen": 240085130, "step": 11118, "time_per_iteration": 2.707383871078491 }, { "auxiliary_loss_clip": 0.01091583, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.03924751, "balance_loss_mlp": 1.02176738, "epoch": 0.668510446415151, "flos": 21762189982080.0, "grad_norm": 1.4374358637877027, "language_loss": 0.68942273, "learning_rate": 1.0459876806832727e-06, "loss": 0.71067697, "num_input_tokens_seen": 240105495, "step": 11119, "time_per_iteration": 2.6769771575927734 }, { "auxiliary_loss_clip": 0.01086506, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.03629112, "balance_loss_mlp": 1.02011752, "epoch": 0.668570569667819, "flos": 30191250015360.0, "grad_norm": 1.6841707968514, "language_loss": 0.67587042, "learning_rate": 1.0456454009208448e-06, "loss": 0.69707495, "num_input_tokens_seen": 240125455, "step": 11120, "time_per_iteration": 5.847496509552002 }, { "auxiliary_loss_clip": 0.01082761, "auxiliary_loss_mlp": 0.01034422, "balance_loss_clip": 1.03859222, "balance_loss_mlp": 1.02105224, "epoch": 0.668630692920487, "flos": 24170646764160.0, "grad_norm": 1.5497664343001825, "language_loss": 0.72015131, "learning_rate": 1.045303157347638e-06, "loss": 0.74132311, "num_input_tokens_seen": 240143870, "step": 11121, "time_per_iteration": 2.763155698776245 }, { "auxiliary_loss_clip": 0.01090844, "auxiliary_loss_mlp": 0.01037875, "balance_loss_clip": 1.03589582, "balance_loss_mlp": 1.02405834, "epoch": 0.668690816173155, "flos": 17457147181440.0, "grad_norm": 2.929304268957898, "language_loss": 0.70167738, "learning_rate": 1.0449609499766316e-06, "loss": 0.72296458, "num_input_tokens_seen": 240161020, "step": 11122, "time_per_iteration": 2.657095432281494 }, { "auxiliary_loss_clip": 0.0105491, "auxiliary_loss_mlp": 0.00772515, "balance_loss_clip": 1.03529811, "balance_loss_mlp": 1.00017619, "epoch": 0.668750939425823, "flos": 25005102595200.0, "grad_norm": 1.8472771024518286, "language_loss": 0.71752214, "learning_rate": 1.0446187788208015e-06, "loss": 0.73579645, "num_input_tokens_seen": 240179820, "step": 11123, "time_per_iteration": 2.811048984527588 }, { "auxiliary_loss_clip": 0.01096616, "auxiliary_loss_mlp": 0.01042035, "balance_loss_clip": 1.04108119, "balance_loss_mlp": 1.02759266, "epoch": 0.6688110626784909, "flos": 24096778444800.0, "grad_norm": 1.6363097123873878, "language_loss": 0.79147661, "learning_rate": 1.0442766438931244e-06, "loss": 0.81286311, "num_input_tokens_seen": 240200130, "step": 11124, "time_per_iteration": 4.317869663238525 }, { "auxiliary_loss_clip": 0.01089397, "auxiliary_loss_mlp": 0.01041307, "balance_loss_clip": 1.04114437, "balance_loss_mlp": 1.02808654, "epoch": 0.6688711859311589, "flos": 21759532375680.0, "grad_norm": 1.733456144830199, "language_loss": 0.74266189, "learning_rate": 1.0439345452065716e-06, "loss": 0.76396894, "num_input_tokens_seen": 240217945, "step": 11125, "time_per_iteration": 2.67317795753479 }, { "auxiliary_loss_clip": 0.01076985, "auxiliary_loss_mlp": 0.0104133, "balance_loss_clip": 1.0369091, "balance_loss_mlp": 1.02759719, "epoch": 0.6689313091838268, "flos": 22929645824640.0, "grad_norm": 2.098915501123677, "language_loss": 0.67166436, "learning_rate": 1.043592482774116e-06, "loss": 0.69284761, "num_input_tokens_seen": 240237220, "step": 11126, "time_per_iteration": 2.739659547805786 }, { "auxiliary_loss_clip": 0.01096554, "auxiliary_loss_mlp": 0.01031996, "balance_loss_clip": 1.03653789, "balance_loss_mlp": 1.01875162, "epoch": 0.6689914324364948, "flos": 20886149180160.0, "grad_norm": 1.7642293623874703, "language_loss": 0.71071386, "learning_rate": 1.0432504566087305e-06, "loss": 0.73199928, "num_input_tokens_seen": 240256000, "step": 11127, "time_per_iteration": 2.729490041732788 }, { "auxiliary_loss_clip": 0.01093813, "auxiliary_loss_mlp": 0.01034088, "balance_loss_clip": 1.03839648, "balance_loss_mlp": 1.01959229, "epoch": 0.6690515556891627, "flos": 22748225207040.0, "grad_norm": 1.9937177709857246, "language_loss": 0.80368018, "learning_rate": 1.0429084667233827e-06, "loss": 0.82495916, "num_input_tokens_seen": 240275845, "step": 11128, "time_per_iteration": 2.6976559162139893 }, { "auxiliary_loss_clip": 0.01114736, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.03945661, "balance_loss_mlp": 1.01769066, "epoch": 0.6691116789418308, "flos": 23331450337920.0, "grad_norm": 1.7224977753706385, "language_loss": 0.80861622, "learning_rate": 1.0425665131310427e-06, "loss": 0.83007908, "num_input_tokens_seen": 240294095, "step": 11129, "time_per_iteration": 2.6617815494537354 }, { "auxiliary_loss_clip": 0.01091652, "auxiliary_loss_mlp": 0.010401, "balance_loss_clip": 1.03546023, "balance_loss_mlp": 1.02758944, "epoch": 0.6691718021944987, "flos": 32447014081920.0, "grad_norm": 1.6214077068942991, "language_loss": 0.70471781, "learning_rate": 1.0422245958446762e-06, "loss": 0.72603536, "num_input_tokens_seen": 240313460, "step": 11130, "time_per_iteration": 2.715178966522217 }, { "auxiliary_loss_clip": 0.01088381, "auxiliary_loss_mlp": 0.0103708, "balance_loss_clip": 1.03720927, "balance_loss_mlp": 1.02462888, "epoch": 0.6692319254471667, "flos": 23731602825600.0, "grad_norm": 2.6655548100703643, "language_loss": 0.70267725, "learning_rate": 1.0418827148772486e-06, "loss": 0.72393191, "num_input_tokens_seen": 240333540, "step": 11131, "time_per_iteration": 2.747252941131592 }, { "auxiliary_loss_clip": 0.01104604, "auxiliary_loss_mlp": 0.01034247, "balance_loss_clip": 1.03865063, "balance_loss_mlp": 1.01924431, "epoch": 0.6692920486998346, "flos": 14427902620800.0, "grad_norm": 2.56171206247206, "language_loss": 0.65588742, "learning_rate": 1.0415408702417243e-06, "loss": 0.6772759, "num_input_tokens_seen": 240350085, "step": 11132, "time_per_iteration": 2.697385311126709 }, { "auxiliary_loss_clip": 0.01102641, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.03688669, "balance_loss_mlp": 1.01992595, "epoch": 0.6693521719525026, "flos": 21507475662720.0, "grad_norm": 1.866615287346346, "language_loss": 0.74370456, "learning_rate": 1.0411990619510661e-06, "loss": 0.7650764, "num_input_tokens_seen": 240370015, "step": 11133, "time_per_iteration": 2.7032175064086914 }, { "auxiliary_loss_clip": 0.01110623, "auxiliary_loss_mlp": 0.01036691, "balance_loss_clip": 1.04268622, "balance_loss_mlp": 1.0218854, "epoch": 0.6694122952051706, "flos": 25406943022080.0, "grad_norm": 1.7566380678066518, "language_loss": 0.66696709, "learning_rate": 1.0408572900182363e-06, "loss": 0.6884402, "num_input_tokens_seen": 240390770, "step": 11134, "time_per_iteration": 2.7601702213287354 }, { "auxiliary_loss_clip": 0.01106772, "auxiliary_loss_mlp": 0.01043022, "balance_loss_clip": 1.04027784, "balance_loss_mlp": 1.0275842, "epoch": 0.6694724184578386, "flos": 25661729168640.0, "grad_norm": 1.8684519143911829, "language_loss": 0.77561742, "learning_rate": 1.0405155544561943e-06, "loss": 0.79711533, "num_input_tokens_seen": 240409590, "step": 11135, "time_per_iteration": 2.6581594944000244 }, { "auxiliary_loss_clip": 0.01104169, "auxiliary_loss_mlp": 0.01034806, "balance_loss_clip": 1.04034281, "balance_loss_mlp": 1.02143669, "epoch": 0.6695325417105066, "flos": 17709311635200.0, "grad_norm": 1.6117039898518706, "language_loss": 0.74245167, "learning_rate": 1.040173855277898e-06, "loss": 0.76384139, "num_input_tokens_seen": 240428180, "step": 11136, "time_per_iteration": 2.7073006629943848 }, { "auxiliary_loss_clip": 0.01109339, "auxiliary_loss_mlp": 0.01037889, "balance_loss_clip": 1.04210007, "balance_loss_mlp": 1.0232265, "epoch": 0.6695926649631745, "flos": 24460050643200.0, "grad_norm": 1.7129981010638282, "language_loss": 0.62248957, "learning_rate": 1.0398321924963061e-06, "loss": 0.64396185, "num_input_tokens_seen": 240447815, "step": 11137, "time_per_iteration": 2.6636767387390137 }, { "auxiliary_loss_clip": 0.01114546, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.04025912, "balance_loss_mlp": 1.01526093, "epoch": 0.6696527882158425, "flos": 24280138396800.0, "grad_norm": 1.688540284250028, "language_loss": 0.66006732, "learning_rate": 1.0394905661243724e-06, "loss": 0.68149894, "num_input_tokens_seen": 240468635, "step": 11138, "time_per_iteration": 2.608583688735962 }, { "auxiliary_loss_clip": 0.01077908, "auxiliary_loss_mlp": 0.01040221, "balance_loss_clip": 1.0351193, "balance_loss_mlp": 1.02686357, "epoch": 0.6697129114685104, "flos": 23002759958400.0, "grad_norm": 1.6525815819397558, "language_loss": 0.73112983, "learning_rate": 1.039148976175053e-06, "loss": 0.75231111, "num_input_tokens_seen": 240488550, "step": 11139, "time_per_iteration": 2.6988184452056885 }, { "auxiliary_loss_clip": 0.01073576, "auxiliary_loss_mlp": 0.01036448, "balance_loss_clip": 1.0351299, "balance_loss_mlp": 1.02378786, "epoch": 0.6697730347211784, "flos": 22638123043200.0, "grad_norm": 1.9643022468042264, "language_loss": 0.70518827, "learning_rate": 1.0388074226613016e-06, "loss": 0.72628856, "num_input_tokens_seen": 240508330, "step": 11140, "time_per_iteration": 2.782379150390625 }, { "auxiliary_loss_clip": 0.01103316, "auxiliary_loss_mlp": 0.01029096, "balance_loss_clip": 1.0356679, "balance_loss_mlp": 1.01478446, "epoch": 0.6698331579738463, "flos": 28877242682880.0, "grad_norm": 1.8179612458816414, "language_loss": 0.75826752, "learning_rate": 1.0384659055960691e-06, "loss": 0.77959162, "num_input_tokens_seen": 240528470, "step": 11141, "time_per_iteration": 2.662597417831421 }, { "auxiliary_loss_clip": 0.01103859, "auxiliary_loss_mlp": 0.01038503, "balance_loss_clip": 1.03954339, "balance_loss_mlp": 1.02456141, "epoch": 0.6698932812265144, "flos": 24207096090240.0, "grad_norm": 1.817558320872016, "language_loss": 0.81910652, "learning_rate": 1.0381244249923052e-06, "loss": 0.84053016, "num_input_tokens_seen": 240547815, "step": 11142, "time_per_iteration": 2.6364564895629883 }, { "auxiliary_loss_clip": 0.01063471, "auxiliary_loss_mlp": 0.01030688, "balance_loss_clip": 1.03567362, "balance_loss_mlp": 1.01705074, "epoch": 0.6699534044791823, "flos": 22090269830400.0, "grad_norm": 1.605847382893669, "language_loss": 0.70027417, "learning_rate": 1.037782980862959e-06, "loss": 0.72121578, "num_input_tokens_seen": 240567765, "step": 11143, "time_per_iteration": 2.738811492919922 }, { "auxiliary_loss_clip": 0.01071446, "auxiliary_loss_mlp": 0.00771315, "balance_loss_clip": 1.03594804, "balance_loss_mlp": 1.00014567, "epoch": 0.6700135277318503, "flos": 25192377129600.0, "grad_norm": 1.4724413771665843, "language_loss": 0.70065033, "learning_rate": 1.0374415732209796e-06, "loss": 0.71907794, "num_input_tokens_seen": 240590750, "step": 11144, "time_per_iteration": 2.85090708732605 }, { "auxiliary_loss_clip": 0.01087354, "auxiliary_loss_mlp": 0.01033347, "balance_loss_clip": 1.0364095, "balance_loss_mlp": 1.02025223, "epoch": 0.6700736509845182, "flos": 23440187784960.0, "grad_norm": 1.6283494272446573, "language_loss": 0.74419498, "learning_rate": 1.0371002020793114e-06, "loss": 0.76540208, "num_input_tokens_seen": 240608875, "step": 11145, "time_per_iteration": 2.9192864894866943 }, { "auxiliary_loss_clip": 0.0109431, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.03830147, "balance_loss_mlp": 1.01683688, "epoch": 0.6701337742371862, "flos": 24389953251840.0, "grad_norm": 5.654995580149679, "language_loss": 0.7114135, "learning_rate": 1.0367588674509008e-06, "loss": 0.73266053, "num_input_tokens_seen": 240628565, "step": 11146, "time_per_iteration": 2.7690348625183105 }, { "auxiliary_loss_clip": 0.01109374, "auxiliary_loss_mlp": 0.00770286, "balance_loss_clip": 1.03855777, "balance_loss_mlp": 1.00021374, "epoch": 0.6701938974898543, "flos": 14793652857600.0, "grad_norm": 1.9526898160613644, "language_loss": 0.78687358, "learning_rate": 1.0364175693486905e-06, "loss": 0.80567014, "num_input_tokens_seen": 240646325, "step": 11147, "time_per_iteration": 2.6259043216705322 }, { "auxiliary_loss_clip": 0.01104856, "auxiliary_loss_mlp": 0.0077075, "balance_loss_clip": 1.04050827, "balance_loss_mlp": 1.00021648, "epoch": 0.6702540207425222, "flos": 20154002261760.0, "grad_norm": 2.133465120376381, "language_loss": 0.70325512, "learning_rate": 1.0360763077856218e-06, "loss": 0.72201115, "num_input_tokens_seen": 240666145, "step": 11148, "time_per_iteration": 2.6906309127807617 }, { "auxiliary_loss_clip": 0.01094652, "auxiliary_loss_mlp": 0.01033466, "balance_loss_clip": 1.03719747, "balance_loss_mlp": 1.02005529, "epoch": 0.6703141439951902, "flos": 21214157201280.0, "grad_norm": 2.1690530349128148, "language_loss": 0.7037127, "learning_rate": 1.035735082774636e-06, "loss": 0.72499388, "num_input_tokens_seen": 240685570, "step": 11149, "time_per_iteration": 2.6307806968688965 }, { "auxiliary_loss_clip": 0.01092611, "auxiliary_loss_mlp": 0.010296, "balance_loss_clip": 1.03670847, "balance_loss_mlp": 1.01705897, "epoch": 0.6703742672478581, "flos": 23112538899840.0, "grad_norm": 1.6662323590997945, "language_loss": 0.73725748, "learning_rate": 1.0353938943286727e-06, "loss": 0.75847954, "num_input_tokens_seen": 240706945, "step": 11150, "time_per_iteration": 2.6917827129364014 }, { "auxiliary_loss_clip": 0.01103639, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.0409379, "balance_loss_mlp": 1.02276325, "epoch": 0.6704343905005261, "flos": 22528918719360.0, "grad_norm": 1.705366168717962, "language_loss": 0.78539407, "learning_rate": 1.035052742460671e-06, "loss": 0.80679119, "num_input_tokens_seen": 240727990, "step": 11151, "time_per_iteration": 2.6567208766937256 }, { "auxiliary_loss_clip": 0.00987579, "auxiliary_loss_mlp": 0.01000572, "balance_loss_clip": 1.01053739, "balance_loss_mlp": 0.99935037, "epoch": 0.670494513753194, "flos": 64793158773120.0, "grad_norm": 0.7884890805336543, "language_loss": 0.55364567, "learning_rate": 1.0347116271835643e-06, "loss": 0.57352722, "num_input_tokens_seen": 240790380, "step": 11152, "time_per_iteration": 3.3006503582000732 }, { "auxiliary_loss_clip": 0.0109132, "auxiliary_loss_mlp": 0.01038631, "balance_loss_clip": 1.03844714, "balance_loss_mlp": 1.025244, "epoch": 0.670554637005862, "flos": 23511506238720.0, "grad_norm": 1.9985135771335918, "language_loss": 0.80859494, "learning_rate": 1.0343705485102896e-06, "loss": 0.82989448, "num_input_tokens_seen": 240811545, "step": 11153, "time_per_iteration": 2.7756435871124268 }, { "auxiliary_loss_clip": 0.01076408, "auxiliary_loss_mlp": 0.00771693, "balance_loss_clip": 1.03820157, "balance_loss_mlp": 1.00020981, "epoch": 0.67061476025853, "flos": 19463404400640.0, "grad_norm": 1.6080859471988709, "language_loss": 0.76408523, "learning_rate": 1.0340295064537814e-06, "loss": 0.78256631, "num_input_tokens_seen": 240831380, "step": 11154, "time_per_iteration": 2.8628106117248535 }, { "auxiliary_loss_clip": 0.01094529, "auxiliary_loss_mlp": 0.01041911, "balance_loss_clip": 1.03737462, "balance_loss_mlp": 1.02754045, "epoch": 0.670674883511198, "flos": 20519967980160.0, "grad_norm": 1.6589905225029438, "language_loss": 0.76200944, "learning_rate": 1.0336885010269702e-06, "loss": 0.78337383, "num_input_tokens_seen": 240851855, "step": 11155, "time_per_iteration": 4.394611120223999 }, { "auxiliary_loss_clip": 0.01115828, "auxiliary_loss_mlp": 0.01036265, "balance_loss_clip": 1.04136384, "balance_loss_mlp": 1.02283049, "epoch": 0.6707350067638659, "flos": 25483971738240.0, "grad_norm": 2.252293716833977, "language_loss": 0.82174289, "learning_rate": 1.0333475322427878e-06, "loss": 0.8432638, "num_input_tokens_seen": 240869980, "step": 11156, "time_per_iteration": 2.672253370285034 }, { "auxiliary_loss_clip": 0.01114074, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.04081774, "balance_loss_mlp": 1.02488017, "epoch": 0.6707951300165339, "flos": 22273450214400.0, "grad_norm": 1.89603770151681, "language_loss": 0.7505753, "learning_rate": 1.033006600114165e-06, "loss": 0.77209401, "num_input_tokens_seen": 240888680, "step": 11157, "time_per_iteration": 2.6131577491760254 }, { "auxiliary_loss_clip": 0.01109055, "auxiliary_loss_mlp": 0.01042973, "balance_loss_clip": 1.04226005, "balance_loss_mlp": 1.02867961, "epoch": 0.6708552532692018, "flos": 23984593292160.0, "grad_norm": 1.7747922187460388, "language_loss": 0.74478519, "learning_rate": 1.0326657046540282e-06, "loss": 0.76630545, "num_input_tokens_seen": 240909050, "step": 11158, "time_per_iteration": 2.7293169498443604 }, { "auxiliary_loss_clip": 0.01118082, "auxiliary_loss_mlp": 0.01037488, "balance_loss_clip": 1.04157019, "balance_loss_mlp": 1.02339745, "epoch": 0.6709153765218698, "flos": 24937519155840.0, "grad_norm": 1.5675836402135142, "language_loss": 0.81520784, "learning_rate": 1.0323248458753044e-06, "loss": 0.8367635, "num_input_tokens_seen": 240930035, "step": 11159, "time_per_iteration": 5.697297811508179 }, { "auxiliary_loss_clip": 0.01093112, "auxiliary_loss_mlp": 0.01031466, "balance_loss_clip": 1.037853, "balance_loss_mlp": 1.01822233, "epoch": 0.6709754997745379, "flos": 17530225401600.0, "grad_norm": 1.775658111941971, "language_loss": 0.76943409, "learning_rate": 1.0319840237909193e-06, "loss": 0.79067993, "num_input_tokens_seen": 240948895, "step": 11160, "time_per_iteration": 2.649531602859497 }, { "auxiliary_loss_clip": 0.01088534, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.03823304, "balance_loss_mlp": 1.01970327, "epoch": 0.6710356230272058, "flos": 22090880361600.0, "grad_norm": 1.7750116462358165, "language_loss": 0.73715007, "learning_rate": 1.0316432384137978e-06, "loss": 0.75836837, "num_input_tokens_seen": 240967770, "step": 11161, "time_per_iteration": 2.677884817123413 }, { "auxiliary_loss_clip": 0.01093874, "auxiliary_loss_mlp": 0.01041282, "balance_loss_clip": 1.03686976, "balance_loss_mlp": 1.0268575, "epoch": 0.6710957462798738, "flos": 24206449645440.0, "grad_norm": 1.942474500054277, "language_loss": 0.68453658, "learning_rate": 1.0313024897568618e-06, "loss": 0.70588821, "num_input_tokens_seen": 240988985, "step": 11162, "time_per_iteration": 2.7426352500915527 }, { "auxiliary_loss_clip": 0.01089967, "auxiliary_loss_mlp": 0.01042721, "balance_loss_clip": 1.03566909, "balance_loss_mlp": 1.02965569, "epoch": 0.6711558695325417, "flos": 19093955063040.0, "grad_norm": 2.157920195613674, "language_loss": 0.70179218, "learning_rate": 1.030961777833032e-06, "loss": 0.72311902, "num_input_tokens_seen": 241005455, "step": 11163, "time_per_iteration": 2.6737561225891113 }, { "auxiliary_loss_clip": 0.01113094, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.0411427, "balance_loss_mlp": 1.02216315, "epoch": 0.6712159927852097, "flos": 25557875971200.0, "grad_norm": 1.7583635951984506, "language_loss": 0.75421375, "learning_rate": 1.0306211026552291e-06, "loss": 0.7756952, "num_input_tokens_seen": 241026175, "step": 11164, "time_per_iteration": 4.2939674854278564 }, { "auxiliary_loss_clip": 0.01115198, "auxiliary_loss_mlp": 0.01033198, "balance_loss_clip": 1.0404532, "balance_loss_mlp": 1.01967335, "epoch": 0.6712761160378776, "flos": 22228812587520.0, "grad_norm": 1.9153842218638528, "language_loss": 0.65245664, "learning_rate": 1.0302804642363704e-06, "loss": 0.67394054, "num_input_tokens_seen": 241044040, "step": 11165, "time_per_iteration": 2.6558966636657715 }, { "auxiliary_loss_clip": 0.01112642, "auxiliary_loss_mlp": 0.01036219, "balance_loss_clip": 1.03975642, "balance_loss_mlp": 1.02284431, "epoch": 0.6713362392905456, "flos": 22455517276800.0, "grad_norm": 2.4551304238389218, "language_loss": 0.71630502, "learning_rate": 1.0299398625893738e-06, "loss": 0.73779362, "num_input_tokens_seen": 241063615, "step": 11166, "time_per_iteration": 2.594005823135376 }, { "auxiliary_loss_clip": 0.01113176, "auxiliary_loss_mlp": 0.01030674, "balance_loss_clip": 1.04087472, "balance_loss_mlp": 1.01834142, "epoch": 0.6713963625432136, "flos": 25630200005760.0, "grad_norm": 2.7039163117890728, "language_loss": 0.77024722, "learning_rate": 1.0295992977271546e-06, "loss": 0.79168576, "num_input_tokens_seen": 241082520, "step": 11167, "time_per_iteration": 2.630964517593384 }, { "auxiliary_loss_clip": 0.01101695, "auxiliary_loss_mlp": 0.01040458, "balance_loss_clip": 1.03634501, "balance_loss_mlp": 1.02711856, "epoch": 0.6714564857958816, "flos": 35006475640320.0, "grad_norm": 1.6082371290328819, "language_loss": 0.68865132, "learning_rate": 1.029258769662629e-06, "loss": 0.71007288, "num_input_tokens_seen": 241103505, "step": 11168, "time_per_iteration": 2.845033884048462 }, { "auxiliary_loss_clip": 0.01078889, "auxiliary_loss_mlp": 0.01043458, "balance_loss_clip": 1.03778422, "balance_loss_mlp": 1.02867651, "epoch": 0.6715166090485495, "flos": 26279931168000.0, "grad_norm": 1.9394421042077383, "language_loss": 0.73349601, "learning_rate": 1.0289182784087068e-06, "loss": 0.75471944, "num_input_tokens_seen": 241122885, "step": 11169, "time_per_iteration": 2.9264886379241943 }, { "auxiliary_loss_clip": 0.0110554, "auxiliary_loss_mlp": 0.01039147, "balance_loss_clip": 1.0378871, "balance_loss_mlp": 1.02427554, "epoch": 0.6715767323012175, "flos": 15924156583680.0, "grad_norm": 1.9283403176707277, "language_loss": 0.76306462, "learning_rate": 1.0285778239783005e-06, "loss": 0.78451145, "num_input_tokens_seen": 241140865, "step": 11170, "time_per_iteration": 2.649400472640991 }, { "auxiliary_loss_clip": 0.01095301, "auxiliary_loss_mlp": 0.0103096, "balance_loss_clip": 1.03898799, "balance_loss_mlp": 1.01709008, "epoch": 0.6716368555538854, "flos": 17491441691520.0, "grad_norm": 1.924665288480993, "language_loss": 0.74140078, "learning_rate": 1.0282374063843212e-06, "loss": 0.76266336, "num_input_tokens_seen": 241158225, "step": 11171, "time_per_iteration": 2.672985076904297 }, { "auxiliary_loss_clip": 0.0107518, "auxiliary_loss_mlp": 0.01054034, "balance_loss_clip": 1.03710866, "balance_loss_mlp": 1.03831053, "epoch": 0.6716969788065534, "flos": 16761521416320.0, "grad_norm": 1.4921239292463526, "language_loss": 0.86225343, "learning_rate": 1.0278970256396762e-06, "loss": 0.88354552, "num_input_tokens_seen": 241175215, "step": 11172, "time_per_iteration": 2.720012664794922 }, { "auxiliary_loss_clip": 0.01098137, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.0346545, "balance_loss_mlp": 1.02693462, "epoch": 0.6717571020592215, "flos": 22709800632960.0, "grad_norm": 1.8099463790548698, "language_loss": 0.63222194, "learning_rate": 1.0275566817572733e-06, "loss": 0.65361369, "num_input_tokens_seen": 241195250, "step": 11173, "time_per_iteration": 2.6705803871154785 }, { "auxiliary_loss_clip": 0.0111084, "auxiliary_loss_mlp": 0.01040058, "balance_loss_clip": 1.03873289, "balance_loss_mlp": 1.02487719, "epoch": 0.6718172253118894, "flos": 18734094656640.0, "grad_norm": 2.1708594678401707, "language_loss": 0.71347594, "learning_rate": 1.02721637475002e-06, "loss": 0.73498487, "num_input_tokens_seen": 241210720, "step": 11174, "time_per_iteration": 2.602283477783203 }, { "auxiliary_loss_clip": 0.01075457, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.03783953, "balance_loss_mlp": 1.01738167, "epoch": 0.6718773485645574, "flos": 15632526061440.0, "grad_norm": 2.052442882823656, "language_loss": 0.67971045, "learning_rate": 1.0268761046308178e-06, "loss": 0.7007705, "num_input_tokens_seen": 241227395, "step": 11175, "time_per_iteration": 2.669154644012451 }, { "auxiliary_loss_clip": 0.01085, "auxiliary_loss_mlp": 0.01037471, "balance_loss_clip": 1.0389663, "balance_loss_mlp": 1.02479339, "epoch": 0.6719374718172253, "flos": 19354774694400.0, "grad_norm": 2.182967446535966, "language_loss": 0.7362026, "learning_rate": 1.0265358714125714e-06, "loss": 0.75742733, "num_input_tokens_seen": 241246355, "step": 11176, "time_per_iteration": 2.644695997238159 }, { "auxiliary_loss_clip": 0.01093824, "auxiliary_loss_mlp": 0.01037825, "balance_loss_clip": 1.03961146, "balance_loss_mlp": 1.02334082, "epoch": 0.6719975950698933, "flos": 21981316901760.0, "grad_norm": 1.8406147660483967, "language_loss": 0.72720611, "learning_rate": 1.026195675108182e-06, "loss": 0.74852264, "num_input_tokens_seen": 241264180, "step": 11177, "time_per_iteration": 2.6863327026367188 }, { "auxiliary_loss_clip": 0.01115157, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.03991175, "balance_loss_mlp": 1.0244683, "epoch": 0.6720577183225612, "flos": 25228072270080.0, "grad_norm": 2.150822621130827, "language_loss": 0.76274478, "learning_rate": 1.025855515730551e-06, "loss": 0.78428215, "num_input_tokens_seen": 241282245, "step": 11178, "time_per_iteration": 2.580979108810425 }, { "auxiliary_loss_clip": 0.01106474, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.04109895, "balance_loss_mlp": 1.02494228, "epoch": 0.6721178415752292, "flos": 16945886949120.0, "grad_norm": 1.6631958135032512, "language_loss": 0.69917423, "learning_rate": 1.0255153932925766e-06, "loss": 0.72062165, "num_input_tokens_seen": 241300745, "step": 11179, "time_per_iteration": 2.765749454498291 }, { "auxiliary_loss_clip": 0.01067075, "auxiliary_loss_mlp": 0.01035482, "balance_loss_clip": 1.03598237, "balance_loss_mlp": 1.02269685, "epoch": 0.6721779648278972, "flos": 21541375123200.0, "grad_norm": 1.5427953976374715, "language_loss": 0.74147439, "learning_rate": 1.0251753078071557e-06, "loss": 0.76249993, "num_input_tokens_seen": 241319320, "step": 11180, "time_per_iteration": 2.7570419311523438 }, { "auxiliary_loss_clip": 0.01094967, "auxiliary_loss_mlp": 0.01032611, "balance_loss_clip": 1.03934419, "balance_loss_mlp": 1.01931906, "epoch": 0.6722380880805652, "flos": 22605444645120.0, "grad_norm": 1.3453936001041888, "language_loss": 0.75262862, "learning_rate": 1.0248352592871848e-06, "loss": 0.77390438, "num_input_tokens_seen": 241342225, "step": 11181, "time_per_iteration": 2.805821418762207 }, { "auxiliary_loss_clip": 0.0109711, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.03977168, "balance_loss_mlp": 1.0209651, "epoch": 0.6722982113332331, "flos": 15925269905280.0, "grad_norm": 4.685407340613367, "language_loss": 0.74491268, "learning_rate": 1.0244952477455585e-06, "loss": 0.76621902, "num_input_tokens_seen": 241358240, "step": 11182, "time_per_iteration": 2.7147958278656006 }, { "auxiliary_loss_clip": 0.01098785, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.03787458, "balance_loss_mlp": 1.02139592, "epoch": 0.6723583345859011, "flos": 20596170683520.0, "grad_norm": 2.0288719371623323, "language_loss": 0.69882548, "learning_rate": 1.0241552731951699e-06, "loss": 0.72015059, "num_input_tokens_seen": 241378420, "step": 11183, "time_per_iteration": 2.6687538623809814 }, { "auxiliary_loss_clip": 0.01064932, "auxiliary_loss_mlp": 0.01033349, "balance_loss_clip": 1.0361743, "balance_loss_mlp": 1.01995015, "epoch": 0.672418457838569, "flos": 21725848396800.0, "grad_norm": 2.97348360718205, "language_loss": 0.77805459, "learning_rate": 1.0238153356489112e-06, "loss": 0.7990374, "num_input_tokens_seen": 241397185, "step": 11184, "time_per_iteration": 2.777731418609619 }, { "auxiliary_loss_clip": 0.0109739, "auxiliary_loss_mlp": 0.00775757, "balance_loss_clip": 1.04143977, "balance_loss_mlp": 1.00022709, "epoch": 0.672478581091237, "flos": 21470379891840.0, "grad_norm": 3.9325636134414426, "language_loss": 0.66277105, "learning_rate": 1.0234754351196743e-06, "loss": 0.68150252, "num_input_tokens_seen": 241415785, "step": 11185, "time_per_iteration": 2.737527370452881 }, { "auxiliary_loss_clip": 0.01076626, "auxiliary_loss_mlp": 0.01036011, "balance_loss_clip": 1.03503013, "balance_loss_mlp": 1.02205157, "epoch": 0.6725387043439051, "flos": 30846763267200.0, "grad_norm": 1.5938972508624505, "language_loss": 0.80483949, "learning_rate": 1.023135571620345e-06, "loss": 0.82596588, "num_input_tokens_seen": 241437390, "step": 11186, "time_per_iteration": 2.8201353549957275 }, { "auxiliary_loss_clip": 0.01101545, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.04061747, "balance_loss_mlp": 1.02350974, "epoch": 0.672598827596573, "flos": 24055947659520.0, "grad_norm": 2.88496393330639, "language_loss": 0.80385649, "learning_rate": 1.022795745163813e-06, "loss": 0.82523, "num_input_tokens_seen": 241458085, "step": 11187, "time_per_iteration": 2.7198538780212402 }, { "auxiliary_loss_clip": 0.0107469, "auxiliary_loss_mlp": 0.01033866, "balance_loss_clip": 1.04410124, "balance_loss_mlp": 1.01917362, "epoch": 0.672658950849241, "flos": 21871861182720.0, "grad_norm": 1.9454261923533847, "language_loss": 0.7059114, "learning_rate": 1.022455955762965e-06, "loss": 0.7269969, "num_input_tokens_seen": 241476880, "step": 11188, "time_per_iteration": 2.7985453605651855 }, { "auxiliary_loss_clip": 0.01054991, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.04298103, "balance_loss_mlp": 1.02394819, "epoch": 0.6727190741019089, "flos": 23222102359680.0, "grad_norm": 1.8365043403177213, "language_loss": 0.7589345, "learning_rate": 1.0221162034306842e-06, "loss": 0.77985215, "num_input_tokens_seen": 241496535, "step": 11189, "time_per_iteration": 2.905705213546753 }, { "auxiliary_loss_clip": 0.01116413, "auxiliary_loss_mlp": 0.01032806, "balance_loss_clip": 1.0382818, "balance_loss_mlp": 1.01785755, "epoch": 0.6727791973545769, "flos": 15778610674560.0, "grad_norm": 2.0168522444965986, "language_loss": 0.75364029, "learning_rate": 1.0217764881798562e-06, "loss": 0.77513248, "num_input_tokens_seen": 241513465, "step": 11190, "time_per_iteration": 2.833767890930176 }, { "auxiliary_loss_clip": 0.01048034, "auxiliary_loss_mlp": 0.0103557, "balance_loss_clip": 1.03332615, "balance_loss_mlp": 1.02153933, "epoch": 0.6728393206072448, "flos": 21249852341760.0, "grad_norm": 2.7169326773783236, "language_loss": 0.77364898, "learning_rate": 1.0214368100233612e-06, "loss": 0.79448497, "num_input_tokens_seen": 241534125, "step": 11191, "time_per_iteration": 2.782000780105591 }, { "auxiliary_loss_clip": 0.01111788, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.03986657, "balance_loss_mlp": 1.02100623, "epoch": 0.6728994438599128, "flos": 32123279779200.0, "grad_norm": 2.4096830802466416, "language_loss": 0.8635608, "learning_rate": 1.0210971689740802e-06, "loss": 0.88501906, "num_input_tokens_seen": 241556340, "step": 11192, "time_per_iteration": 2.7193620204925537 }, { "auxiliary_loss_clip": 0.01104606, "auxiliary_loss_mlp": 0.0103762, "balance_loss_clip": 1.03892374, "balance_loss_mlp": 1.023458, "epoch": 0.6729595671125808, "flos": 23112359331840.0, "grad_norm": 2.0040177590782906, "language_loss": 0.75960791, "learning_rate": 1.0207575650448923e-06, "loss": 0.78103018, "num_input_tokens_seen": 241575185, "step": 11193, "time_per_iteration": 2.713738441467285 }, { "auxiliary_loss_clip": 0.01081133, "auxiliary_loss_mlp": 0.01033198, "balance_loss_clip": 1.04058063, "balance_loss_mlp": 1.02000737, "epoch": 0.6730196903652488, "flos": 14611406227200.0, "grad_norm": 1.775580074575331, "language_loss": 0.78365123, "learning_rate": 1.0204179982486758e-06, "loss": 0.80479455, "num_input_tokens_seen": 241592970, "step": 11194, "time_per_iteration": 4.453005075454712 }, { "auxiliary_loss_clip": 0.0110231, "auxiliary_loss_mlp": 0.01028753, "balance_loss_clip": 1.03805304, "balance_loss_mlp": 1.01621783, "epoch": 0.6730798136179167, "flos": 21105922544640.0, "grad_norm": 1.9211443871049516, "language_loss": 0.89955217, "learning_rate": 1.0200784685983075e-06, "loss": 0.92086279, "num_input_tokens_seen": 241610245, "step": 11195, "time_per_iteration": 2.6450841426849365 }, { "auxiliary_loss_clip": 0.01101967, "auxiliary_loss_mlp": 0.01032017, "balance_loss_clip": 1.03769374, "balance_loss_mlp": 1.01929736, "epoch": 0.6731399368705847, "flos": 28986267438720.0, "grad_norm": 1.64687980974086, "language_loss": 0.72439396, "learning_rate": 1.019738976106662e-06, "loss": 0.74573386, "num_input_tokens_seen": 241630350, "step": 11196, "time_per_iteration": 2.685826063156128 }, { "auxiliary_loss_clip": 0.00973165, "auxiliary_loss_mlp": 0.01004254, "balance_loss_clip": 1.01249313, "balance_loss_mlp": 1.00303793, "epoch": 0.6732000601232526, "flos": 64743708723840.0, "grad_norm": 0.7752886509100162, "language_loss": 0.5652535, "learning_rate": 1.0193995207866123e-06, "loss": 0.58502769, "num_input_tokens_seen": 241692380, "step": 11197, "time_per_iteration": 3.259193181991577 }, { "auxiliary_loss_clip": 0.01093274, "auxiliary_loss_mlp": 0.01029344, "balance_loss_clip": 1.04169464, "balance_loss_mlp": 1.01701725, "epoch": 0.6732601833759206, "flos": 17201642762880.0, "grad_norm": 2.055708631074821, "language_loss": 0.7532202, "learning_rate": 1.0190601026510312e-06, "loss": 0.77444637, "num_input_tokens_seen": 241710430, "step": 11198, "time_per_iteration": 5.827820777893066 }, { "auxiliary_loss_clip": 0.01103142, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.03708792, "balance_loss_mlp": 1.01949286, "epoch": 0.6733203066285887, "flos": 18658861620480.0, "grad_norm": 2.036459352353542, "language_loss": 0.81907552, "learning_rate": 1.0187207217127892e-06, "loss": 0.84044337, "num_input_tokens_seen": 241724775, "step": 11199, "time_per_iteration": 2.5949244499206543 }, { "auxiliary_loss_clip": 0.01059201, "auxiliary_loss_mlp": 0.01036318, "balance_loss_clip": 1.03536808, "balance_loss_mlp": 1.0218575, "epoch": 0.6733804298812566, "flos": 35809330481280.0, "grad_norm": 1.7500176126376645, "language_loss": 0.7166037, "learning_rate": 1.0183813779847552e-06, "loss": 0.73755884, "num_input_tokens_seen": 241744440, "step": 11200, "time_per_iteration": 2.9160830974578857 }, { "auxiliary_loss_clip": 0.01115381, "auxiliary_loss_mlp": 0.01035903, "balance_loss_clip": 1.04130912, "balance_loss_mlp": 1.02295125, "epoch": 0.6734405531339246, "flos": 61638833099520.0, "grad_norm": 2.371327555495297, "language_loss": 0.64769435, "learning_rate": 1.0180420714797987e-06, "loss": 0.66920727, "num_input_tokens_seen": 241771705, "step": 11201, "time_per_iteration": 2.9968230724334717 }, { "auxiliary_loss_clip": 0.01096465, "auxiliary_loss_mlp": 0.01040904, "balance_loss_clip": 1.04056644, "balance_loss_mlp": 1.02676558, "epoch": 0.6735006763865925, "flos": 20522338277760.0, "grad_norm": 2.019287776053706, "language_loss": 0.63276017, "learning_rate": 1.0177028022107856e-06, "loss": 0.65413386, "num_input_tokens_seen": 241790830, "step": 11202, "time_per_iteration": 2.7302961349487305 }, { "auxiliary_loss_clip": 0.01112496, "auxiliary_loss_mlp": 0.01028107, "balance_loss_clip": 1.03865552, "balance_loss_mlp": 1.01558411, "epoch": 0.6735607996392605, "flos": 13918869031680.0, "grad_norm": 1.915253440556218, "language_loss": 0.74535716, "learning_rate": 1.0173635701905796e-06, "loss": 0.76676321, "num_input_tokens_seen": 241808165, "step": 11203, "time_per_iteration": 4.089365243911743 }, { "auxiliary_loss_clip": 0.01098401, "auxiliary_loss_mlp": 0.01034017, "balance_loss_clip": 1.04094148, "balance_loss_mlp": 1.01900887, "epoch": 0.6736209228919284, "flos": 18807244704000.0, "grad_norm": 1.6291352462615132, "language_loss": 0.67681134, "learning_rate": 1.0170243754320456e-06, "loss": 0.6981355, "num_input_tokens_seen": 241826925, "step": 11204, "time_per_iteration": 2.6192142963409424 }, { "auxiliary_loss_clip": 0.01110427, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.04293954, "balance_loss_mlp": 1.02012277, "epoch": 0.6736810461445965, "flos": 20373129181440.0, "grad_norm": 1.6630781718701608, "language_loss": 0.74060369, "learning_rate": 1.0166852179480465e-06, "loss": 0.76205349, "num_input_tokens_seen": 241845525, "step": 11205, "time_per_iteration": 2.6068971157073975 }, { "auxiliary_loss_clip": 0.01109012, "auxiliary_loss_mlp": 0.01037861, "balance_loss_clip": 1.03733087, "balance_loss_mlp": 1.02507019, "epoch": 0.6737411693972644, "flos": 30007530927360.0, "grad_norm": 1.5764181927902094, "language_loss": 0.71426833, "learning_rate": 1.0163460977514416e-06, "loss": 0.73573703, "num_input_tokens_seen": 241866815, "step": 11206, "time_per_iteration": 2.6492159366607666 }, { "auxiliary_loss_clip": 0.0107907, "auxiliary_loss_mlp": 0.0077308, "balance_loss_clip": 1.03777742, "balance_loss_mlp": 1.00019574, "epoch": 0.6738012926499324, "flos": 25447342844160.0, "grad_norm": 3.2743303537758712, "language_loss": 0.67471528, "learning_rate": 1.016007014855092e-06, "loss": 0.69323683, "num_input_tokens_seen": 241887050, "step": 11207, "time_per_iteration": 2.7261955738067627 }, { "auxiliary_loss_clip": 0.01062123, "auxiliary_loss_mlp": 0.01037554, "balance_loss_clip": 1.035918, "balance_loss_mlp": 1.02464974, "epoch": 0.6738614159026003, "flos": 20776873029120.0, "grad_norm": 1.9421642242153492, "language_loss": 0.73736989, "learning_rate": 1.0156679692718553e-06, "loss": 0.7583667, "num_input_tokens_seen": 241904280, "step": 11208, "time_per_iteration": 2.7930853366851807 }, { "auxiliary_loss_clip": 0.01097466, "auxiliary_loss_mlp": 0.01047913, "balance_loss_clip": 1.03587651, "balance_loss_mlp": 1.03142679, "epoch": 0.6739215391552683, "flos": 19566898462080.0, "grad_norm": 2.600803881105225, "language_loss": 0.75433391, "learning_rate": 1.0153289610145867e-06, "loss": 0.77578771, "num_input_tokens_seen": 241919190, "step": 11209, "time_per_iteration": 2.626483678817749 }, { "auxiliary_loss_clip": 0.01073019, "auxiliary_loss_mlp": 0.01034768, "balance_loss_clip": 1.03657913, "balance_loss_mlp": 1.02250111, "epoch": 0.6739816624079362, "flos": 24388193485440.0, "grad_norm": 1.7906778547342261, "language_loss": 0.66272515, "learning_rate": 1.0149899900961428e-06, "loss": 0.68380302, "num_input_tokens_seen": 241940525, "step": 11210, "time_per_iteration": 2.711866617202759 }, { "auxiliary_loss_clip": 0.01108754, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.03769946, "balance_loss_mlp": 1.02072227, "epoch": 0.6740417856606042, "flos": 22528164533760.0, "grad_norm": 2.164396283420133, "language_loss": 0.80170596, "learning_rate": 1.014651056529377e-06, "loss": 0.82311797, "num_input_tokens_seen": 241959290, "step": 11211, "time_per_iteration": 2.650737762451172 }, { "auxiliary_loss_clip": 0.01065338, "auxiliary_loss_mlp": 0.01034108, "balance_loss_clip": 1.03782678, "balance_loss_mlp": 1.02107227, "epoch": 0.6741019089132723, "flos": 25775458606080.0, "grad_norm": 1.391978533622519, "language_loss": 0.76499903, "learning_rate": 1.014312160327143e-06, "loss": 0.78599358, "num_input_tokens_seen": 241980715, "step": 11212, "time_per_iteration": 2.778548240661621 }, { "auxiliary_loss_clip": 0.0107247, "auxiliary_loss_mlp": 0.00773763, "balance_loss_clip": 1.03496337, "balance_loss_mlp": 1.00017405, "epoch": 0.6741620321659402, "flos": 21105671149440.0, "grad_norm": 1.7101000867736138, "language_loss": 0.7758128, "learning_rate": 1.0139733015022905e-06, "loss": 0.79427516, "num_input_tokens_seen": 241999985, "step": 11213, "time_per_iteration": 2.7835280895233154 }, { "auxiliary_loss_clip": 0.01061037, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.03824186, "balance_loss_mlp": 1.0204041, "epoch": 0.6742221554186082, "flos": 20740423703040.0, "grad_norm": 1.981711873743371, "language_loss": 0.67612016, "learning_rate": 1.0136344800676685e-06, "loss": 0.69706964, "num_input_tokens_seen": 242018990, "step": 11214, "time_per_iteration": 2.9053549766540527 }, { "auxiliary_loss_clip": 0.01113738, "auxiliary_loss_mlp": 0.00770067, "balance_loss_clip": 1.03925085, "balance_loss_mlp": 1.00014567, "epoch": 0.6742822786712761, "flos": 37774146384000.0, "grad_norm": 1.8488975905792826, "language_loss": 0.72834229, "learning_rate": 1.0132956960361263e-06, "loss": 0.74718034, "num_input_tokens_seen": 242039340, "step": 11215, "time_per_iteration": 2.7654783725738525 }, { "auxiliary_loss_clip": 0.0110075, "auxiliary_loss_mlp": 0.00770504, "balance_loss_clip": 1.03589749, "balance_loss_mlp": 1.00019991, "epoch": 0.6743424019239441, "flos": 37263891732480.0, "grad_norm": 2.067200737701415, "language_loss": 0.67394143, "learning_rate": 1.0129569494205096e-06, "loss": 0.69265401, "num_input_tokens_seen": 242062215, "step": 11216, "time_per_iteration": 2.7729885578155518 }, { "auxiliary_loss_clip": 0.01032166, "auxiliary_loss_mlp": 0.01006139, "balance_loss_clip": 1.00926828, "balance_loss_mlp": 1.0051198, "epoch": 0.674402525176612, "flos": 65997746300160.0, "grad_norm": 0.6926580332237084, "language_loss": 0.56280029, "learning_rate": 1.0126182402336646e-06, "loss": 0.58318341, "num_input_tokens_seen": 242131130, "step": 11217, "time_per_iteration": 3.255324125289917 }, { "auxiliary_loss_clip": 0.01099919, "auxiliary_loss_mlp": 0.0103498, "balance_loss_clip": 1.0376718, "balance_loss_mlp": 1.02188516, "epoch": 0.67446264842928, "flos": 26461208131200.0, "grad_norm": 1.7874095302934647, "language_loss": 0.74496436, "learning_rate": 1.0122795684884363e-06, "loss": 0.76631337, "num_input_tokens_seen": 242149720, "step": 11218, "time_per_iteration": 2.672130823135376 }, { "auxiliary_loss_clip": 0.01080832, "auxiliary_loss_mlp": 0.01049632, "balance_loss_clip": 1.03884029, "balance_loss_mlp": 1.03438509, "epoch": 0.674522771681948, "flos": 23732392924800.0, "grad_norm": 1.6252161995703833, "language_loss": 0.65911674, "learning_rate": 1.0119409341976639e-06, "loss": 0.68042141, "num_input_tokens_seen": 242168875, "step": 11219, "time_per_iteration": 2.734159469604492 }, { "auxiliary_loss_clip": 0.01070647, "auxiliary_loss_mlp": 0.01046329, "balance_loss_clip": 1.0323844, "balance_loss_mlp": 1.03093362, "epoch": 0.674582894934616, "flos": 24754338771840.0, "grad_norm": 1.8389940842715735, "language_loss": 0.75087273, "learning_rate": 1.0116023373741904e-06, "loss": 0.77204245, "num_input_tokens_seen": 242188465, "step": 11220, "time_per_iteration": 2.6810474395751953 }, { "auxiliary_loss_clip": 0.01097202, "auxiliary_loss_mlp": 0.01035624, "balance_loss_clip": 1.03908563, "balance_loss_mlp": 1.02207017, "epoch": 0.6746430181872839, "flos": 24826626892800.0, "grad_norm": 1.6228841440103556, "language_loss": 0.70216316, "learning_rate": 1.0112637780308554e-06, "loss": 0.72349143, "num_input_tokens_seen": 242208675, "step": 11221, "time_per_iteration": 2.655421733856201 }, { "auxiliary_loss_clip": 0.01076344, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.0356853, "balance_loss_mlp": 1.02112806, "epoch": 0.6747031414399519, "flos": 16873491087360.0, "grad_norm": 2.1723447923231554, "language_loss": 0.58043802, "learning_rate": 1.010925256180498e-06, "loss": 0.60153466, "num_input_tokens_seen": 242227440, "step": 11222, "time_per_iteration": 2.698503255844116 }, { "auxiliary_loss_clip": 0.01100055, "auxiliary_loss_mlp": 0.01035283, "balance_loss_clip": 1.03881896, "balance_loss_mlp": 1.02253962, "epoch": 0.6747632646926198, "flos": 22784925928320.0, "grad_norm": 2.113432638374052, "language_loss": 0.76298106, "learning_rate": 1.0105867718359528e-06, "loss": 0.78433442, "num_input_tokens_seen": 242245240, "step": 11223, "time_per_iteration": 2.6607108116149902 }, { "auxiliary_loss_clip": 0.01108219, "auxiliary_loss_mlp": 0.01036403, "balance_loss_clip": 1.04148507, "balance_loss_mlp": 1.02318275, "epoch": 0.6748233879452878, "flos": 20046090827520.0, "grad_norm": 1.767291040158093, "language_loss": 0.75444579, "learning_rate": 1.0102483250100574e-06, "loss": 0.77589202, "num_input_tokens_seen": 242263435, "step": 11224, "time_per_iteration": 2.7242133617401123 }, { "auxiliary_loss_clip": 0.01060708, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.03886676, "balance_loss_mlp": 1.02131319, "epoch": 0.6748835111979558, "flos": 23002831785600.0, "grad_norm": 1.693566744371799, "language_loss": 0.6366834, "learning_rate": 1.0099099157156445e-06, "loss": 0.65761381, "num_input_tokens_seen": 242282765, "step": 11225, "time_per_iteration": 2.8525750637054443 }, { "auxiliary_loss_clip": 0.01108343, "auxiliary_loss_mlp": 0.00768466, "balance_loss_clip": 1.03901696, "balance_loss_mlp": 1.00013793, "epoch": 0.6749436344506238, "flos": 12197311009920.0, "grad_norm": 2.4278333466029163, "language_loss": 0.63865972, "learning_rate": 1.0095715439655462e-06, "loss": 0.65742779, "num_input_tokens_seen": 242298980, "step": 11226, "time_per_iteration": 2.5835680961608887 }, { "auxiliary_loss_clip": 0.01105473, "auxiliary_loss_mlp": 0.01037357, "balance_loss_clip": 1.04047155, "balance_loss_mlp": 1.02423763, "epoch": 0.6750037577032918, "flos": 11873720361600.0, "grad_norm": 2.1970918660293717, "language_loss": 0.71417314, "learning_rate": 1.0092332097725945e-06, "loss": 0.73560148, "num_input_tokens_seen": 242315420, "step": 11227, "time_per_iteration": 2.5965003967285156 }, { "auxiliary_loss_clip": 0.01082342, "auxiliary_loss_mlp": 0.01039904, "balance_loss_clip": 1.0346818, "balance_loss_mlp": 1.02601051, "epoch": 0.6750638809559597, "flos": 17019611614080.0, "grad_norm": 1.9754224773045619, "language_loss": 0.71259153, "learning_rate": 1.0088949131496183e-06, "loss": 0.733814, "num_input_tokens_seen": 242332805, "step": 11228, "time_per_iteration": 2.6131396293640137 }, { "auxiliary_loss_clip": 0.01010708, "auxiliary_loss_mlp": 0.01005158, "balance_loss_clip": 1.01072896, "balance_loss_mlp": 1.00386512, "epoch": 0.6751240042086277, "flos": 70951011891840.0, "grad_norm": 0.7503769026779433, "language_loss": 0.5320974, "learning_rate": 1.0085566541094482e-06, "loss": 0.55225611, "num_input_tokens_seen": 242396160, "step": 11229, "time_per_iteration": 3.22717022895813 }, { "auxiliary_loss_clip": 0.01101526, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.03896284, "balance_loss_mlp": 1.02234495, "epoch": 0.6751841274612956, "flos": 22675146986880.0, "grad_norm": 1.7298636476457805, "language_loss": 0.8039158, "learning_rate": 1.0082184326649072e-06, "loss": 0.82527316, "num_input_tokens_seen": 242414660, "step": 11230, "time_per_iteration": 2.6328141689300537 }, { "auxiliary_loss_clip": 0.01082067, "auxiliary_loss_mlp": 0.01035691, "balance_loss_clip": 1.03726006, "balance_loss_mlp": 1.02402639, "epoch": 0.6752442507139637, "flos": 21288636051840.0, "grad_norm": 1.6008618014341174, "language_loss": 0.65935898, "learning_rate": 1.0078802488288228e-06, "loss": 0.68053663, "num_input_tokens_seen": 242434225, "step": 11231, "time_per_iteration": 2.626856803894043 }, { "auxiliary_loss_clip": 0.01078317, "auxiliary_loss_mlp": 0.01042803, "balance_loss_clip": 1.04247785, "balance_loss_mlp": 1.02774644, "epoch": 0.6753043739666316, "flos": 28256921781120.0, "grad_norm": 2.0251672391245936, "language_loss": 0.66539383, "learning_rate": 1.0075421026140198e-06, "loss": 0.68660504, "num_input_tokens_seen": 242454355, "step": 11232, "time_per_iteration": 2.743908166885376 }, { "auxiliary_loss_clip": 0.01066681, "auxiliary_loss_mlp": 0.0103246, "balance_loss_clip": 1.03211284, "balance_loss_mlp": 1.01948404, "epoch": 0.6753644972192996, "flos": 21360349555200.0, "grad_norm": 1.6294007960486003, "language_loss": 0.72326458, "learning_rate": 1.0072039940333188e-06, "loss": 0.74425602, "num_input_tokens_seen": 242474935, "step": 11233, "time_per_iteration": 4.338082790374756 }, { "auxiliary_loss_clip": 0.01103097, "auxiliary_loss_mlp": 0.01037684, "balance_loss_clip": 1.03895485, "balance_loss_mlp": 1.02474928, "epoch": 0.6754246204719675, "flos": 26541971861760.0, "grad_norm": 1.5686096839287218, "language_loss": 0.76833057, "learning_rate": 1.0068659230995418e-06, "loss": 0.7897383, "num_input_tokens_seen": 242495530, "step": 11234, "time_per_iteration": 2.6492395401000977 }, { "auxiliary_loss_clip": 0.01111909, "auxiliary_loss_mlp": 0.01036672, "balance_loss_clip": 1.03924251, "balance_loss_mlp": 1.02342129, "epoch": 0.6754847437246355, "flos": 25556690822400.0, "grad_norm": 1.5014850027131166, "language_loss": 0.75410771, "learning_rate": 1.0065278898255101e-06, "loss": 0.77559352, "num_input_tokens_seen": 242514550, "step": 11235, "time_per_iteration": 2.5974621772766113 }, { "auxiliary_loss_clip": 0.01025646, "auxiliary_loss_mlp": 0.0100208, "balance_loss_clip": 1.01184058, "balance_loss_mlp": 1.00095963, "epoch": 0.6755448669773034, "flos": 59513318726400.0, "grad_norm": 0.7779431781811396, "language_loss": 0.51255912, "learning_rate": 1.0061898942240387e-06, "loss": 0.53283638, "num_input_tokens_seen": 242569200, "step": 11236, "time_per_iteration": 3.1667306423187256 }, { "auxiliary_loss_clip": 0.0107986, "auxiliary_loss_mlp": 0.01032431, "balance_loss_clip": 1.03790748, "balance_loss_mlp": 1.01711285, "epoch": 0.6756049902299714, "flos": 23294534135040.0, "grad_norm": 2.192780802483493, "language_loss": 0.75628972, "learning_rate": 1.0058519363079464e-06, "loss": 0.77741265, "num_input_tokens_seen": 242586950, "step": 11237, "time_per_iteration": 5.957702159881592 }, { "auxiliary_loss_clip": 0.01086462, "auxiliary_loss_mlp": 0.01041243, "balance_loss_clip": 1.03836346, "balance_loss_mlp": 1.0282433, "epoch": 0.6756651134826394, "flos": 31575426566400.0, "grad_norm": 2.7350155461999184, "language_loss": 0.77482605, "learning_rate": 1.0055140160900482e-06, "loss": 0.79610306, "num_input_tokens_seen": 242607380, "step": 11238, "time_per_iteration": 2.7448818683624268 }, { "auxiliary_loss_clip": 0.01099837, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.03648901, "balance_loss_mlp": 1.0227412, "epoch": 0.6757252367353074, "flos": 27272287186560.0, "grad_norm": 1.6539290066506784, "language_loss": 0.66314852, "learning_rate": 1.0051761335831587e-06, "loss": 0.6845113, "num_input_tokens_seen": 242628025, "step": 11239, "time_per_iteration": 2.740363597869873 }, { "auxiliary_loss_clip": 0.01089775, "auxiliary_loss_mlp": 0.01030806, "balance_loss_clip": 1.04055905, "balance_loss_mlp": 1.0182538, "epoch": 0.6757853599879754, "flos": 16830900535680.0, "grad_norm": 1.7720867918116858, "language_loss": 0.82882285, "learning_rate": 1.0048382888000898e-06, "loss": 0.85002863, "num_input_tokens_seen": 242643825, "step": 11240, "time_per_iteration": 2.7659623622894287 }, { "auxiliary_loss_clip": 0.01090669, "auxiliary_loss_mlp": 0.01035174, "balance_loss_clip": 1.04133797, "balance_loss_mlp": 1.01949787, "epoch": 0.6758454832406433, "flos": 23220055284480.0, "grad_norm": 2.676956533168836, "language_loss": 0.74727547, "learning_rate": 1.0045004817536525e-06, "loss": 0.76853395, "num_input_tokens_seen": 242661820, "step": 11241, "time_per_iteration": 2.7259037494659424 }, { "auxiliary_loss_clip": 0.01064722, "auxiliary_loss_mlp": 0.0103699, "balance_loss_clip": 1.03947997, "balance_loss_mlp": 1.02388871, "epoch": 0.6759056064933113, "flos": 16289547684480.0, "grad_norm": 2.2859314322063415, "language_loss": 0.80506319, "learning_rate": 1.0041627124566572e-06, "loss": 0.82608032, "num_input_tokens_seen": 242679890, "step": 11242, "time_per_iteration": 2.7591724395751953 }, { "auxiliary_loss_clip": 0.01095714, "auxiliary_loss_mlp": 0.01047852, "balance_loss_clip": 1.03617179, "balance_loss_mlp": 1.03376102, "epoch": 0.6759657297459792, "flos": 25922297404800.0, "grad_norm": 1.8958528418461225, "language_loss": 0.72530574, "learning_rate": 1.0038249809219109e-06, "loss": 0.74674141, "num_input_tokens_seen": 242699495, "step": 11243, "time_per_iteration": 4.2785255908966064 }, { "auxiliary_loss_clip": 0.01102771, "auxiliary_loss_mlp": 0.01038727, "balance_loss_clip": 1.03992796, "balance_loss_mlp": 1.02621591, "epoch": 0.6760258529986473, "flos": 23000820624000.0, "grad_norm": 3.620795649046328, "language_loss": 0.72916102, "learning_rate": 1.003487287162221e-06, "loss": 0.75057596, "num_input_tokens_seen": 242719500, "step": 11244, "time_per_iteration": 2.656297445297241 }, { "auxiliary_loss_clip": 0.01115915, "auxiliary_loss_mlp": 0.01045105, "balance_loss_clip": 1.04072213, "balance_loss_mlp": 1.03150368, "epoch": 0.6760859762513152, "flos": 20959335141120.0, "grad_norm": 2.083059893523475, "language_loss": 0.86242104, "learning_rate": 1.003149631190393e-06, "loss": 0.8840313, "num_input_tokens_seen": 242738325, "step": 11245, "time_per_iteration": 2.6280319690704346 }, { "auxiliary_loss_clip": 0.01117876, "auxiliary_loss_mlp": 0.0077189, "balance_loss_clip": 1.04022503, "balance_loss_mlp": 1.00016975, "epoch": 0.6761460995039832, "flos": 23622937205760.0, "grad_norm": 2.1743867677621918, "language_loss": 0.73484135, "learning_rate": 1.0028120130192327e-06, "loss": 0.753739, "num_input_tokens_seen": 242756620, "step": 11246, "time_per_iteration": 2.696730375289917 }, { "auxiliary_loss_clip": 0.0109861, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.03731704, "balance_loss_mlp": 1.01679528, "epoch": 0.6762062227566511, "flos": 20770875457920.0, "grad_norm": 1.7495113919795662, "language_loss": 0.87749994, "learning_rate": 1.002474432661539e-06, "loss": 0.89878494, "num_input_tokens_seen": 242774505, "step": 11247, "time_per_iteration": 2.6828203201293945 }, { "auxiliary_loss_clip": 0.01009927, "auxiliary_loss_mlp": 0.01001384, "balance_loss_clip": 1.00954247, "balance_loss_mlp": 1.00016785, "epoch": 0.6762663460093191, "flos": 52818099166080.0, "grad_norm": 0.8307013339921004, "language_loss": 0.53909206, "learning_rate": 1.002136890130115e-06, "loss": 0.55920517, "num_input_tokens_seen": 242828645, "step": 11248, "time_per_iteration": 3.2222228050231934 }, { "auxiliary_loss_clip": 0.01057434, "auxiliary_loss_mlp": 0.01030146, "balance_loss_clip": 1.0432303, "balance_loss_mlp": 1.01780176, "epoch": 0.676326469261987, "flos": 23696302734720.0, "grad_norm": 1.557725146566793, "language_loss": 0.73398393, "learning_rate": 1.001799385437761e-06, "loss": 0.75485975, "num_input_tokens_seen": 242850100, "step": 11249, "time_per_iteration": 2.8122363090515137 }, { "auxiliary_loss_clip": 0.01102856, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.03738058, "balance_loss_mlp": 1.02277553, "epoch": 0.676386592514655, "flos": 14063732582400.0, "grad_norm": 2.1313223732491506, "language_loss": 0.73983771, "learning_rate": 1.0014619185972732e-06, "loss": 0.76123083, "num_input_tokens_seen": 242867775, "step": 11250, "time_per_iteration": 2.697199583053589 }, { "auxiliary_loss_clip": 0.01113481, "auxiliary_loss_mlp": 0.01031768, "balance_loss_clip": 1.03948021, "balance_loss_mlp": 1.01904869, "epoch": 0.676446715767323, "flos": 20412236113920.0, "grad_norm": 1.816015271011089, "language_loss": 0.75130785, "learning_rate": 1.0011244896214497e-06, "loss": 0.77276027, "num_input_tokens_seen": 242886865, "step": 11251, "time_per_iteration": 2.6333305835723877 }, { "auxiliary_loss_clip": 0.01078452, "auxiliary_loss_mlp": 0.01031314, "balance_loss_clip": 1.04010725, "balance_loss_mlp": 1.0182966, "epoch": 0.676506839019991, "flos": 21288241002240.0, "grad_norm": 1.551518166422534, "language_loss": 0.69901943, "learning_rate": 1.0007870985230873e-06, "loss": 0.72011709, "num_input_tokens_seen": 242906705, "step": 11252, "time_per_iteration": 2.9181244373321533 }, { "auxiliary_loss_clip": 0.01064839, "auxiliary_loss_mlp": 0.01033228, "balance_loss_clip": 1.03892565, "balance_loss_mlp": 1.02052665, "epoch": 0.676566962272659, "flos": 29932477459200.0, "grad_norm": 1.6718962617994413, "language_loss": 0.66779602, "learning_rate": 1.0004497453149765e-06, "loss": 0.68877667, "num_input_tokens_seen": 242925215, "step": 11253, "time_per_iteration": 2.8428003787994385 }, { "auxiliary_loss_clip": 0.01070699, "auxiliary_loss_mlp": 0.00775318, "balance_loss_clip": 1.03454018, "balance_loss_mlp": 1.00019038, "epoch": 0.6766270855253269, "flos": 17931203902080.0, "grad_norm": 1.5527696111799332, "language_loss": 0.7722379, "learning_rate": 1.0001124300099115e-06, "loss": 0.79069805, "num_input_tokens_seen": 242944750, "step": 11254, "time_per_iteration": 2.712817668914795 }, { "auxiliary_loss_clip": 0.0110248, "auxiliary_loss_mlp": 0.01035428, "balance_loss_clip": 1.03869474, "balance_loss_mlp": 1.02183247, "epoch": 0.6766872087779949, "flos": 23104853389440.0, "grad_norm": 3.950409887802226, "language_loss": 0.72361761, "learning_rate": 9.997751526206835e-07, "loss": 0.74499667, "num_input_tokens_seen": 242963860, "step": 11255, "time_per_iteration": 2.6217257976531982 }, { "auxiliary_loss_clip": 0.01061354, "auxiliary_loss_mlp": 0.00771432, "balance_loss_clip": 1.03389072, "balance_loss_mlp": 1.00019884, "epoch": 0.6767473320306628, "flos": 26213137827840.0, "grad_norm": 2.6592328120058584, "language_loss": 0.75315595, "learning_rate": 9.994379131600828e-07, "loss": 0.7714839, "num_input_tokens_seen": 242983050, "step": 11256, "time_per_iteration": 2.834801435470581 }, { "auxiliary_loss_clip": 0.01105322, "auxiliary_loss_mlp": 0.01036203, "balance_loss_clip": 1.04157603, "balance_loss_mlp": 1.0230726, "epoch": 0.6768074552833309, "flos": 18368739469440.0, "grad_norm": 2.0954099595982836, "language_loss": 0.6498003, "learning_rate": 9.991007116408965e-07, "loss": 0.67121565, "num_input_tokens_seen": 243001125, "step": 11257, "time_per_iteration": 2.6306867599487305 }, { "auxiliary_loss_clip": 0.01067487, "auxiliary_loss_mlp": 0.01033246, "balance_loss_clip": 1.04190707, "balance_loss_mlp": 1.02090788, "epoch": 0.6768675785359988, "flos": 23039927556480.0, "grad_norm": 1.6004297573343516, "language_loss": 0.75491571, "learning_rate": 9.987635480759109e-07, "loss": 0.77592301, "num_input_tokens_seen": 243021865, "step": 11258, "time_per_iteration": 2.9189696311950684 }, { "auxiliary_loss_clip": 0.01089351, "auxiliary_loss_mlp": 0.01036704, "balance_loss_clip": 1.03900814, "balance_loss_mlp": 1.02450931, "epoch": 0.6769277017886668, "flos": 33036524092800.0, "grad_norm": 1.5823758287987741, "language_loss": 0.66676503, "learning_rate": 9.984264224779127e-07, "loss": 0.68802559, "num_input_tokens_seen": 243042970, "step": 11259, "time_per_iteration": 2.7564451694488525 }, { "auxiliary_loss_clip": 0.01090564, "auxiliary_loss_mlp": 0.01035059, "balance_loss_clip": 1.03726125, "balance_loss_mlp": 1.02206516, "epoch": 0.6769878250413347, "flos": 20848406964480.0, "grad_norm": 2.48292750681471, "language_loss": 0.85291499, "learning_rate": 9.980893348596839e-07, "loss": 0.8741712, "num_input_tokens_seen": 243058470, "step": 11260, "time_per_iteration": 2.660332441329956 }, { "auxiliary_loss_clip": 0.01085932, "auxiliary_loss_mlp": 0.01039752, "balance_loss_clip": 1.03486264, "balance_loss_mlp": 1.02588189, "epoch": 0.6770479482940027, "flos": 15595968994560.0, "grad_norm": 2.613252024528438, "language_loss": 0.77209002, "learning_rate": 9.977522852340081e-07, "loss": 0.79334688, "num_input_tokens_seen": 243076630, "step": 11261, "time_per_iteration": 2.6410372257232666 }, { "auxiliary_loss_clip": 0.0109228, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.03776324, "balance_loss_mlp": 1.02687383, "epoch": 0.6771080715466706, "flos": 18621011664000.0, "grad_norm": 2.010792691714421, "language_loss": 0.87528884, "learning_rate": 9.97415273613666e-07, "loss": 0.89661312, "num_input_tokens_seen": 243092260, "step": 11262, "time_per_iteration": 2.6288645267486572 }, { "auxiliary_loss_clip": 0.01089821, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.03876138, "balance_loss_mlp": 1.02234757, "epoch": 0.6771681947993387, "flos": 12495441893760.0, "grad_norm": 1.942743373156589, "language_loss": 0.74668461, "learning_rate": 9.97078300011439e-07, "loss": 0.76793909, "num_input_tokens_seen": 243109405, "step": 11263, "time_per_iteration": 2.666969060897827 }, { "auxiliary_loss_clip": 0.01107967, "auxiliary_loss_mlp": 0.01034392, "balance_loss_clip": 1.04032826, "balance_loss_mlp": 1.02013433, "epoch": 0.6772283180520066, "flos": 22236964974720.0, "grad_norm": 3.4280923778329435, "language_loss": 0.67490625, "learning_rate": 9.967413644401016e-07, "loss": 0.69632983, "num_input_tokens_seen": 243128135, "step": 11264, "time_per_iteration": 2.620027780532837 }, { "auxiliary_loss_clip": 0.01092011, "auxiliary_loss_mlp": 0.01036919, "balance_loss_clip": 1.04065371, "balance_loss_mlp": 1.02333474, "epoch": 0.6772884413046746, "flos": 16143139848960.0, "grad_norm": 1.9352746586576008, "language_loss": 0.7301234, "learning_rate": 9.964044669124324e-07, "loss": 0.75141263, "num_input_tokens_seen": 243146785, "step": 11265, "time_per_iteration": 2.638399600982666 }, { "auxiliary_loss_clip": 0.0106857, "auxiliary_loss_mlp": 0.01046347, "balance_loss_clip": 1.03400207, "balance_loss_mlp": 1.03247142, "epoch": 0.6773485645573426, "flos": 19135755515520.0, "grad_norm": 2.1206255290710594, "language_loss": 0.61617583, "learning_rate": 9.96067607441207e-07, "loss": 0.63732499, "num_input_tokens_seen": 243165275, "step": 11266, "time_per_iteration": 2.6741204261779785 }, { "auxiliary_loss_clip": 0.01086026, "auxiliary_loss_mlp": 0.01036436, "balance_loss_clip": 1.04107964, "balance_loss_mlp": 1.02305472, "epoch": 0.6774086878100105, "flos": 14136918543360.0, "grad_norm": 1.7639590037989088, "language_loss": 0.7056399, "learning_rate": 9.957307860391976e-07, "loss": 0.72686452, "num_input_tokens_seen": 243182845, "step": 11267, "time_per_iteration": 2.701676607131958 }, { "auxiliary_loss_clip": 0.01112717, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.03907073, "balance_loss_mlp": 1.02175152, "epoch": 0.6774688110626785, "flos": 22197067943040.0, "grad_norm": 3.4663937575357093, "language_loss": 0.71232986, "learning_rate": 9.953940027191785e-07, "loss": 0.73380297, "num_input_tokens_seen": 243201475, "step": 11268, "time_per_iteration": 2.582158327102661 }, { "auxiliary_loss_clip": 0.01089704, "auxiliary_loss_mlp": 0.01038381, "balance_loss_clip": 1.03727484, "balance_loss_mlp": 1.02395701, "epoch": 0.6775289343153464, "flos": 23039963470080.0, "grad_norm": 1.54975098078917, "language_loss": 0.76582503, "learning_rate": 9.950572574939194e-07, "loss": 0.78710592, "num_input_tokens_seen": 243221850, "step": 11269, "time_per_iteration": 2.6784608364105225 }, { "auxiliary_loss_clip": 0.01079985, "auxiliary_loss_mlp": 0.01039406, "balance_loss_clip": 1.03688645, "balance_loss_mlp": 1.02560711, "epoch": 0.6775890575680145, "flos": 18293506433280.0, "grad_norm": 3.9513063189541895, "language_loss": 0.74380577, "learning_rate": 9.94720550376189e-07, "loss": 0.76499963, "num_input_tokens_seen": 243239855, "step": 11270, "time_per_iteration": 2.82761812210083 }, { "auxiliary_loss_clip": 0.01059034, "auxiliary_loss_mlp": 0.0104238, "balance_loss_clip": 1.03957486, "balance_loss_mlp": 1.02821255, "epoch": 0.6776491808206824, "flos": 25336450581120.0, "grad_norm": 1.7088059356464216, "language_loss": 0.73103487, "learning_rate": 9.94383881378756e-07, "loss": 0.75204897, "num_input_tokens_seen": 243260085, "step": 11271, "time_per_iteration": 2.7955849170684814 }, { "auxiliary_loss_clip": 0.01113021, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.03949916, "balance_loss_mlp": 1.02401781, "epoch": 0.6777093040733504, "flos": 26028233591040.0, "grad_norm": 3.147727016102492, "language_loss": 0.68212342, "learning_rate": 9.94047250514387e-07, "loss": 0.70361924, "num_input_tokens_seen": 243280065, "step": 11272, "time_per_iteration": 2.637103796005249 }, { "auxiliary_loss_clip": 0.01103771, "auxiliary_loss_mlp": 0.01035716, "balance_loss_clip": 1.0390712, "balance_loss_mlp": 1.02126229, "epoch": 0.6777694273260183, "flos": 18003599763840.0, "grad_norm": 1.7828232071604915, "language_loss": 0.73829705, "learning_rate": 9.937106577958481e-07, "loss": 0.75969195, "num_input_tokens_seen": 243297775, "step": 11273, "time_per_iteration": 4.394399642944336 }, { "auxiliary_loss_clip": 0.01094453, "auxiliary_loss_mlp": 0.01041712, "balance_loss_clip": 1.03916848, "balance_loss_mlp": 1.02846813, "epoch": 0.6778295505786863, "flos": 23441085624960.0, "grad_norm": 1.8919224773021028, "language_loss": 0.70701563, "learning_rate": 9.933741032359015e-07, "loss": 0.72837734, "num_input_tokens_seen": 243315760, "step": 11274, "time_per_iteration": 2.5985240936279297 }, { "auxiliary_loss_clip": 0.01114225, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.0387696, "balance_loss_mlp": 1.02027476, "epoch": 0.6778896738313542, "flos": 19098408349440.0, "grad_norm": 4.65357993377392, "language_loss": 0.6543079, "learning_rate": 9.930375868473093e-07, "loss": 0.67578733, "num_input_tokens_seen": 243335715, "step": 11275, "time_per_iteration": 2.6151697635650635 }, { "auxiliary_loss_clip": 0.01106727, "auxiliary_loss_mlp": 0.01033632, "balance_loss_clip": 1.04250121, "balance_loss_mlp": 1.02126956, "epoch": 0.6779497970840223, "flos": 26103933504000.0, "grad_norm": 1.5789952929470612, "language_loss": 0.72767758, "learning_rate": 9.927011086428335e-07, "loss": 0.74908113, "num_input_tokens_seen": 243356935, "step": 11276, "time_per_iteration": 5.899662017822266 }, { "auxiliary_loss_clip": 0.01087765, "auxiliary_loss_mlp": 0.00771415, "balance_loss_clip": 1.03646386, "balance_loss_mlp": 1.00016904, "epoch": 0.6780099203366902, "flos": 19719232041600.0, "grad_norm": 1.681215818326951, "language_loss": 0.76681376, "learning_rate": 9.923646686352317e-07, "loss": 0.78540558, "num_input_tokens_seen": 243375625, "step": 11277, "time_per_iteration": 2.6914784908294678 }, { "auxiliary_loss_clip": 0.01092848, "auxiliary_loss_mlp": 0.01033178, "balance_loss_clip": 1.03808713, "balance_loss_mlp": 1.01976132, "epoch": 0.6780700435893582, "flos": 18214538382720.0, "grad_norm": 2.7540725669591724, "language_loss": 0.83632004, "learning_rate": 9.920282668372627e-07, "loss": 0.8575803, "num_input_tokens_seen": 243390195, "step": 11278, "time_per_iteration": 2.637618064880371 }, { "auxiliary_loss_clip": 0.01085002, "auxiliary_loss_mlp": 0.00769413, "balance_loss_clip": 1.04043651, "balance_loss_mlp": 1.00012565, "epoch": 0.6781301668420262, "flos": 25376239872000.0, "grad_norm": 1.5336771376537068, "language_loss": 0.70519423, "learning_rate": 9.916919032616844e-07, "loss": 0.72373849, "num_input_tokens_seen": 243411690, "step": 11279, "time_per_iteration": 2.7715609073638916 }, { "auxiliary_loss_clip": 0.01105152, "auxiliary_loss_mlp": 0.01035574, "balance_loss_clip": 1.03994751, "balance_loss_mlp": 1.0217998, "epoch": 0.6781902900946941, "flos": 24020432087040.0, "grad_norm": 1.8650529100630782, "language_loss": 0.73610586, "learning_rate": 9.913555779212485e-07, "loss": 0.75751317, "num_input_tokens_seen": 243430280, "step": 11280, "time_per_iteration": 2.734544277191162 }, { "auxiliary_loss_clip": 0.01103265, "auxiliary_loss_mlp": 0.01036072, "balance_loss_clip": 1.03754926, "balance_loss_mlp": 1.02211285, "epoch": 0.6782504133473621, "flos": 19646764352640.0, "grad_norm": 2.0625858122456178, "language_loss": 0.70312506, "learning_rate": 9.910192908287104e-07, "loss": 0.72451842, "num_input_tokens_seen": 243448690, "step": 11281, "time_per_iteration": 2.622098684310913 }, { "auxiliary_loss_clip": 0.01111077, "auxiliary_loss_mlp": 0.01028525, "balance_loss_clip": 1.04020238, "balance_loss_mlp": 1.01619864, "epoch": 0.67831053660003, "flos": 24932742647040.0, "grad_norm": 1.4839814095064274, "language_loss": 0.63879716, "learning_rate": 9.906830419968217e-07, "loss": 0.66019315, "num_input_tokens_seen": 243470695, "step": 11282, "time_per_iteration": 4.292442798614502 }, { "auxiliary_loss_clip": 0.01075036, "auxiliary_loss_mlp": 0.01049322, "balance_loss_clip": 1.03349972, "balance_loss_mlp": 1.03204811, "epoch": 0.6783706598526981, "flos": 31208383440000.0, "grad_norm": 1.7556170346158129, "language_loss": 0.74497384, "learning_rate": 9.90346831438334e-07, "loss": 0.76621741, "num_input_tokens_seen": 243493345, "step": 11283, "time_per_iteration": 2.9562923908233643 }, { "auxiliary_loss_clip": 0.01103456, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.04012847, "balance_loss_mlp": 1.01659822, "epoch": 0.678430783105366, "flos": 35441317687680.0, "grad_norm": 1.5708851854862296, "language_loss": 0.56767416, "learning_rate": 9.900106591659948e-07, "loss": 0.58900023, "num_input_tokens_seen": 243515670, "step": 11284, "time_per_iteration": 2.8391168117523193 }, { "auxiliary_loss_clip": 0.01090169, "auxiliary_loss_mlp": 0.0103128, "balance_loss_clip": 1.03865993, "balance_loss_mlp": 1.01850688, "epoch": 0.678490906358034, "flos": 14428800460800.0, "grad_norm": 1.928659697893085, "language_loss": 0.75430858, "learning_rate": 9.896745251925535e-07, "loss": 0.77552313, "num_input_tokens_seen": 243533625, "step": 11285, "time_per_iteration": 2.8025879859924316 }, { "auxiliary_loss_clip": 0.01113003, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.04134154, "balance_loss_mlp": 1.01964653, "epoch": 0.6785510296107019, "flos": 24311236596480.0, "grad_norm": 1.7863771120930665, "language_loss": 0.66262901, "learning_rate": 9.893384295307557e-07, "loss": 0.68408483, "num_input_tokens_seen": 243553040, "step": 11286, "time_per_iteration": 2.6879425048828125 }, { "auxiliary_loss_clip": 0.0109176, "auxiliary_loss_mlp": 0.01029833, "balance_loss_clip": 1.03810883, "balance_loss_mlp": 1.01649332, "epoch": 0.6786111528633699, "flos": 26977244872320.0, "grad_norm": 2.710702139669138, "language_loss": 0.5293864, "learning_rate": 9.890023721933447e-07, "loss": 0.55060238, "num_input_tokens_seen": 243572590, "step": 11287, "time_per_iteration": 2.6729018688201904 }, { "auxiliary_loss_clip": 0.01070232, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.03733265, "balance_loss_mlp": 1.0263176, "epoch": 0.6786712761160378, "flos": 24317557390080.0, "grad_norm": 1.5085934530827387, "language_loss": 0.77353847, "learning_rate": 9.886663531930655e-07, "loss": 0.79463893, "num_input_tokens_seen": 243594140, "step": 11288, "time_per_iteration": 2.76521897315979 }, { "auxiliary_loss_clip": 0.01106153, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.04171705, "balance_loss_mlp": 1.0247159, "epoch": 0.6787313993687059, "flos": 22930435923840.0, "grad_norm": 1.9499442288864346, "language_loss": 0.73456311, "learning_rate": 9.883303725426593e-07, "loss": 0.75599885, "num_input_tokens_seen": 243615170, "step": 11289, "time_per_iteration": 2.6673424243927 }, { "auxiliary_loss_clip": 0.01114362, "auxiliary_loss_mlp": 0.010388, "balance_loss_clip": 1.04031169, "balance_loss_mlp": 1.02534115, "epoch": 0.6787915226213738, "flos": 26868435598080.0, "grad_norm": 1.6821989273437945, "language_loss": 0.80101818, "learning_rate": 9.879944302548682e-07, "loss": 0.82254982, "num_input_tokens_seen": 243635675, "step": 11290, "time_per_iteration": 2.632082223892212 }, { "auxiliary_loss_clip": 0.01101296, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.04066992, "balance_loss_mlp": 1.02134442, "epoch": 0.6788516458740418, "flos": 20008851402240.0, "grad_norm": 1.599385548142358, "language_loss": 0.75065523, "learning_rate": 9.87658526342428e-07, "loss": 0.77200925, "num_input_tokens_seen": 243654950, "step": 11291, "time_per_iteration": 2.6852645874023438 }, { "auxiliary_loss_clip": 0.01096412, "auxiliary_loss_mlp": 0.00771696, "balance_loss_clip": 1.04071581, "balance_loss_mlp": 1.0002079, "epoch": 0.6789117691267098, "flos": 28727099832960.0, "grad_norm": 1.9085592005378407, "language_loss": 0.75479198, "learning_rate": 9.873226608180785e-07, "loss": 0.77347308, "num_input_tokens_seen": 243674970, "step": 11292, "time_per_iteration": 2.699632167816162 }, { "auxiliary_loss_clip": 0.01073788, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.03495204, "balance_loss_mlp": 1.02013278, "epoch": 0.6789718923793777, "flos": 23403451150080.0, "grad_norm": 2.0284676657461858, "language_loss": 0.84163547, "learning_rate": 9.869868336945556e-07, "loss": 0.86271405, "num_input_tokens_seen": 243693440, "step": 11293, "time_per_iteration": 2.719419240951538 }, { "auxiliary_loss_clip": 0.01119964, "auxiliary_loss_mlp": 0.01040774, "balance_loss_clip": 1.04201078, "balance_loss_mlp": 1.02618265, "epoch": 0.6790320156320457, "flos": 20448865008000.0, "grad_norm": 2.5902571187100722, "language_loss": 0.79863316, "learning_rate": 9.866510449845929e-07, "loss": 0.8202405, "num_input_tokens_seen": 243710055, "step": 11294, "time_per_iteration": 2.5868771076202393 }, { "auxiliary_loss_clip": 0.0109056, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.03927612, "balance_loss_mlp": 1.01579142, "epoch": 0.6790921388847136, "flos": 24167199058560.0, "grad_norm": 1.662741005119297, "language_loss": 0.79054183, "learning_rate": 9.86315294700924e-07, "loss": 0.81172609, "num_input_tokens_seen": 243728635, "step": 11295, "time_per_iteration": 2.677510976791382 }, { "auxiliary_loss_clip": 0.0108512, "auxiliary_loss_mlp": 0.01031535, "balance_loss_clip": 1.03939927, "balance_loss_mlp": 1.02034116, "epoch": 0.6791522621373817, "flos": 21908095027200.0, "grad_norm": 1.7946887652261734, "language_loss": 0.71118504, "learning_rate": 9.859795828562823e-07, "loss": 0.7323516, "num_input_tokens_seen": 243748330, "step": 11296, "time_per_iteration": 2.7060418128967285 }, { "auxiliary_loss_clip": 0.01100933, "auxiliary_loss_mlp": 0.01032318, "balance_loss_clip": 1.03921032, "balance_loss_mlp": 1.01968789, "epoch": 0.6792123853900496, "flos": 24826519152000.0, "grad_norm": 1.4998043731898119, "language_loss": 0.70986772, "learning_rate": 9.856439094633949e-07, "loss": 0.73120022, "num_input_tokens_seen": 243769380, "step": 11297, "time_per_iteration": 2.6602540016174316 }, { "auxiliary_loss_clip": 0.01086842, "auxiliary_loss_mlp": 0.01036373, "balance_loss_clip": 1.03981018, "balance_loss_mlp": 1.02242589, "epoch": 0.6792725086427176, "flos": 17566279678080.0, "grad_norm": 2.428106974293634, "language_loss": 0.66109335, "learning_rate": 9.853082745349918e-07, "loss": 0.68232548, "num_input_tokens_seen": 243785510, "step": 11298, "time_per_iteration": 2.694490671157837 }, { "auxiliary_loss_clip": 0.01105001, "auxiliary_loss_mlp": 0.01027328, "balance_loss_clip": 1.03936362, "balance_loss_mlp": 1.0155797, "epoch": 0.6793326318953855, "flos": 26941837040640.0, "grad_norm": 1.6664116325381613, "language_loss": 0.71988988, "learning_rate": 9.84972678083801e-07, "loss": 0.7412132, "num_input_tokens_seen": 243805545, "step": 11299, "time_per_iteration": 2.713809013366699 }, { "auxiliary_loss_clip": 0.0111669, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.04250383, "balance_loss_mlp": 1.02194023, "epoch": 0.6793927551480535, "flos": 24318275662080.0, "grad_norm": 1.2656496116170863, "language_loss": 0.77410668, "learning_rate": 9.846371201225488e-07, "loss": 0.79563105, "num_input_tokens_seen": 243825185, "step": 11300, "time_per_iteration": 2.6434032917022705 }, { "auxiliary_loss_clip": 0.01101279, "auxiliary_loss_mlp": 0.01035248, "balance_loss_clip": 1.0383904, "balance_loss_mlp": 1.02223039, "epoch": 0.6794528784007214, "flos": 11436615757440.0, "grad_norm": 1.7784061043917232, "language_loss": 0.63197196, "learning_rate": 9.843016006639577e-07, "loss": 0.65333718, "num_input_tokens_seen": 243841600, "step": 11301, "time_per_iteration": 2.5723037719726562 }, { "auxiliary_loss_clip": 0.0110239, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.03951216, "balance_loss_mlp": 1.01922345, "epoch": 0.6795130016533895, "flos": 25229688382080.0, "grad_norm": 1.6293976559584973, "language_loss": 0.82879919, "learning_rate": 9.839661197207525e-07, "loss": 0.85014397, "num_input_tokens_seen": 243862250, "step": 11302, "time_per_iteration": 2.8143625259399414 }, { "auxiliary_loss_clip": 0.01105417, "auxiliary_loss_mlp": 0.01036251, "balance_loss_clip": 1.039042, "balance_loss_mlp": 1.02304244, "epoch": 0.6795731249060574, "flos": 18296415434880.0, "grad_norm": 2.345439766685576, "language_loss": 0.69651306, "learning_rate": 9.83630677305654e-07, "loss": 0.71792972, "num_input_tokens_seen": 243880560, "step": 11303, "time_per_iteration": 2.685213565826416 }, { "auxiliary_loss_clip": 0.01084889, "auxiliary_loss_mlp": 0.01035911, "balance_loss_clip": 1.03954554, "balance_loss_mlp": 1.02285755, "epoch": 0.6796332481587254, "flos": 20300374183680.0, "grad_norm": 2.3397839406401864, "language_loss": 0.70244884, "learning_rate": 9.832952734313813e-07, "loss": 0.72365683, "num_input_tokens_seen": 243900635, "step": 11304, "time_per_iteration": 2.7310903072357178 }, { "auxiliary_loss_clip": 0.01105112, "auxiliary_loss_mlp": 0.01033253, "balance_loss_clip": 1.04138947, "balance_loss_mlp": 1.0197227, "epoch": 0.6796933714113934, "flos": 23586847015680.0, "grad_norm": 2.2457086045709107, "language_loss": 0.72435552, "learning_rate": 9.829599081106536e-07, "loss": 0.74573922, "num_input_tokens_seen": 243920160, "step": 11305, "time_per_iteration": 2.651684522628784 }, { "auxiliary_loss_clip": 0.01091817, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.03978157, "balance_loss_mlp": 1.01869428, "epoch": 0.6797534946640613, "flos": 27119917693440.0, "grad_norm": 2.0035628154788268, "language_loss": 0.66448355, "learning_rate": 9.826245813561882e-07, "loss": 0.68571484, "num_input_tokens_seen": 243939015, "step": 11306, "time_per_iteration": 2.655308723449707 }, { "auxiliary_loss_clip": 0.01089759, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 1.03967357, "balance_loss_mlp": 1.0164274, "epoch": 0.6798136179167293, "flos": 22127437428480.0, "grad_norm": 1.6430540606311845, "language_loss": 0.80062962, "learning_rate": 9.822892931807021e-07, "loss": 0.82182848, "num_input_tokens_seen": 243958470, "step": 11307, "time_per_iteration": 2.661414861679077 }, { "auxiliary_loss_clip": 0.01087499, "auxiliary_loss_mlp": 0.01040608, "balance_loss_clip": 1.03799939, "balance_loss_mlp": 1.0259217, "epoch": 0.6798737411693972, "flos": 17488640430720.0, "grad_norm": 1.645939087248785, "language_loss": 0.89180249, "learning_rate": 9.819540435969066e-07, "loss": 0.91308355, "num_input_tokens_seen": 243975450, "step": 11308, "time_per_iteration": 2.677755117416382 }, { "auxiliary_loss_clip": 0.01075745, "auxiliary_loss_mlp": 0.01043456, "balance_loss_clip": 1.03412437, "balance_loss_mlp": 1.02860808, "epoch": 0.6799338644220653, "flos": 22892262744960.0, "grad_norm": 2.440998746341053, "language_loss": 0.70999914, "learning_rate": 9.816188326175154e-07, "loss": 0.73119116, "num_input_tokens_seen": 243994355, "step": 11309, "time_per_iteration": 2.716607093811035 }, { "auxiliary_loss_clip": 0.01084669, "auxiliary_loss_mlp": 0.01038527, "balance_loss_clip": 1.0404917, "balance_loss_mlp": 1.02482367, "epoch": 0.6799939876747332, "flos": 23180409648000.0, "grad_norm": 1.9482500712222228, "language_loss": 0.84240967, "learning_rate": 9.812836602552411e-07, "loss": 0.86364162, "num_input_tokens_seen": 244011620, "step": 11310, "time_per_iteration": 2.722900152206421 }, { "auxiliary_loss_clip": 0.01085075, "auxiliary_loss_mlp": 0.01035424, "balance_loss_clip": 1.03975177, "balance_loss_mlp": 1.0229609, "epoch": 0.6800541109274012, "flos": 19499925553920.0, "grad_norm": 2.93005287761764, "language_loss": 0.83355272, "learning_rate": 9.80948526522792e-07, "loss": 0.85475767, "num_input_tokens_seen": 244029925, "step": 11311, "time_per_iteration": 2.6596853733062744 }, { "auxiliary_loss_clip": 0.01066687, "auxiliary_loss_mlp": 0.01032083, "balance_loss_clip": 1.03414321, "balance_loss_mlp": 1.01699138, "epoch": 0.6801142341800691, "flos": 22277652105600.0, "grad_norm": 2.3870630839480045, "language_loss": 0.76332116, "learning_rate": 9.806134314328767e-07, "loss": 0.78430879, "num_input_tokens_seen": 244051225, "step": 11312, "time_per_iteration": 4.449703693389893 }, { "auxiliary_loss_clip": 0.01032073, "auxiliary_loss_mlp": 0.01012541, "balance_loss_clip": 1.00875306, "balance_loss_mlp": 1.0114975, "epoch": 0.6801743574327371, "flos": 68714817759360.0, "grad_norm": 0.6670891966515724, "language_loss": 0.57208383, "learning_rate": 9.802783749982038e-07, "loss": 0.59253001, "num_input_tokens_seen": 244115930, "step": 11313, "time_per_iteration": 3.371553897857666 }, { "auxiliary_loss_clip": 0.01103732, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 1.03782439, "balance_loss_mlp": 1.01666939, "epoch": 0.680234480685405, "flos": 29460467813760.0, "grad_norm": 2.3056583415168075, "language_loss": 0.69011742, "learning_rate": 9.799433572314754e-07, "loss": 0.71145844, "num_input_tokens_seen": 244137320, "step": 11314, "time_per_iteration": 2.697596549987793 }, { "auxiliary_loss_clip": 0.01097203, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.0348376, "balance_loss_mlp": 1.01988137, "epoch": 0.6802946039380731, "flos": 15916866122880.0, "grad_norm": 1.7422328614888773, "language_loss": 0.81493306, "learning_rate": 9.796083781453972e-07, "loss": 0.83622658, "num_input_tokens_seen": 244152755, "step": 11315, "time_per_iteration": 5.966327905654907 }, { "auxiliary_loss_clip": 0.01074474, "auxiliary_loss_mlp": 0.01028329, "balance_loss_clip": 1.04119301, "balance_loss_mlp": 1.01551998, "epoch": 0.680354727190741, "flos": 22018664067840.0, "grad_norm": 1.6310079102471389, "language_loss": 0.6954093, "learning_rate": 9.792734377526718e-07, "loss": 0.71643734, "num_input_tokens_seen": 244171480, "step": 11316, "time_per_iteration": 2.767069101333618 }, { "auxiliary_loss_clip": 0.01101612, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.04027951, "balance_loss_mlp": 1.01897597, "epoch": 0.680414850443409, "flos": 18441494467200.0, "grad_norm": 2.220463387251609, "language_loss": 0.66746044, "learning_rate": 9.789385360660003e-07, "loss": 0.6887908, "num_input_tokens_seen": 244187920, "step": 11317, "time_per_iteration": 2.6441752910614014 }, { "auxiliary_loss_clip": 0.01104685, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.04243541, "balance_loss_mlp": 1.02681887, "epoch": 0.680474973696077, "flos": 26358611909760.0, "grad_norm": 1.689359585188632, "language_loss": 0.74998158, "learning_rate": 9.78603673098082e-07, "loss": 0.77142078, "num_input_tokens_seen": 244209565, "step": 11318, "time_per_iteration": 2.6722664833068848 }, { "auxiliary_loss_clip": 0.01082639, "auxiliary_loss_mlp": 0.01031826, "balance_loss_clip": 1.03599584, "balance_loss_mlp": 1.01942801, "epoch": 0.6805350969487449, "flos": 18333116156160.0, "grad_norm": 1.901183060662594, "language_loss": 0.67961919, "learning_rate": 9.782688488616143e-07, "loss": 0.70076376, "num_input_tokens_seen": 244228015, "step": 11319, "time_per_iteration": 2.6768836975097656 }, { "auxiliary_loss_clip": 0.01075168, "auxiliary_loss_mlp": 0.00771315, "balance_loss_clip": 1.04308462, "balance_loss_mlp": 1.00015819, "epoch": 0.6805952202014129, "flos": 19937497034880.0, "grad_norm": 2.0018434908969054, "language_loss": 0.7674346, "learning_rate": 9.779340633692945e-07, "loss": 0.7858994, "num_input_tokens_seen": 244245615, "step": 11320, "time_per_iteration": 2.7204952239990234 }, { "auxiliary_loss_clip": 0.01085122, "auxiliary_loss_mlp": 0.01032538, "balance_loss_clip": 1.03866565, "balance_loss_mlp": 1.01947236, "epoch": 0.6806553434540809, "flos": 25224301342080.0, "grad_norm": 1.7764880825865026, "language_loss": 0.74452037, "learning_rate": 9.77599316633817e-07, "loss": 0.76569694, "num_input_tokens_seen": 244263625, "step": 11321, "time_per_iteration": 4.261602878570557 }, { "auxiliary_loss_clip": 0.01093792, "auxiliary_loss_mlp": 0.0103627, "balance_loss_clip": 1.04168797, "balance_loss_mlp": 1.02327621, "epoch": 0.6807154667067489, "flos": 17785586165760.0, "grad_norm": 1.8638596765379807, "language_loss": 0.72978008, "learning_rate": 9.772646086678758e-07, "loss": 0.75108075, "num_input_tokens_seen": 244282745, "step": 11322, "time_per_iteration": 2.672649383544922 }, { "auxiliary_loss_clip": 0.0106289, "auxiliary_loss_mlp": 0.00772149, "balance_loss_clip": 1.03630495, "balance_loss_mlp": 1.00025296, "epoch": 0.6807755899594168, "flos": 22199905117440.0, "grad_norm": 1.6392607041064693, "language_loss": 0.78432202, "learning_rate": 9.769299394841638e-07, "loss": 0.80267245, "num_input_tokens_seen": 244303770, "step": 11323, "time_per_iteration": 2.9052011966705322 }, { "auxiliary_loss_clip": 0.01000379, "auxiliary_loss_mlp": 0.01000283, "balance_loss_clip": 1.00907302, "balance_loss_mlp": 0.99898928, "epoch": 0.6808357132120848, "flos": 68631073200000.0, "grad_norm": 0.7447348872303097, "language_loss": 0.57086504, "learning_rate": 9.765953090953714e-07, "loss": 0.59087169, "num_input_tokens_seen": 244355910, "step": 11324, "time_per_iteration": 3.0236268043518066 }, { "auxiliary_loss_clip": 0.01094828, "auxiliary_loss_mlp": 0.01037923, "balance_loss_clip": 1.04058325, "balance_loss_mlp": 1.02427304, "epoch": 0.6808958364647527, "flos": 23843357015040.0, "grad_norm": 2.178701691002947, "language_loss": 0.68127519, "learning_rate": 9.76260717514186e-07, "loss": 0.70260274, "num_input_tokens_seen": 244376610, "step": 11325, "time_per_iteration": 2.6579439640045166 }, { "auxiliary_loss_clip": 0.01104202, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.03789818, "balance_loss_mlp": 1.01858699, "epoch": 0.6809559597174207, "flos": 17711717846400.0, "grad_norm": 2.3301645499310655, "language_loss": 0.70840496, "learning_rate": 9.759261647532974e-07, "loss": 0.72977114, "num_input_tokens_seen": 244393000, "step": 11326, "time_per_iteration": 2.599581718444824 }, { "auxiliary_loss_clip": 0.0111401, "auxiliary_loss_mlp": 0.01033979, "balance_loss_clip": 1.03960943, "balance_loss_mlp": 1.02112806, "epoch": 0.6810160829700886, "flos": 22491894775680.0, "grad_norm": 1.9162803943824422, "language_loss": 0.73098135, "learning_rate": 9.75591650825392e-07, "loss": 0.75246119, "num_input_tokens_seen": 244409515, "step": 11327, "time_per_iteration": 2.6066248416900635 }, { "auxiliary_loss_clip": 0.01099529, "auxiliary_loss_mlp": 0.01030672, "balance_loss_clip": 1.03761911, "balance_loss_mlp": 1.01766074, "epoch": 0.6810762062227567, "flos": 16832875783680.0, "grad_norm": 1.8101774590253075, "language_loss": 0.774257, "learning_rate": 9.752571757431526e-07, "loss": 0.79555899, "num_input_tokens_seen": 244427165, "step": 11328, "time_per_iteration": 2.6368680000305176 }, { "auxiliary_loss_clip": 0.01114029, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.03958964, "balance_loss_mlp": 1.01668477, "epoch": 0.6811363294754246, "flos": 12714676554240.0, "grad_norm": 2.0540376759628183, "language_loss": 0.64911687, "learning_rate": 9.74922739519265e-07, "loss": 0.67055273, "num_input_tokens_seen": 244445705, "step": 11329, "time_per_iteration": 2.573288679122925 }, { "auxiliary_loss_clip": 0.0105984, "auxiliary_loss_mlp": 0.00771154, "balance_loss_clip": 1.03904939, "balance_loss_mlp": 1.00018847, "epoch": 0.6811964527280926, "flos": 17711969241600.0, "grad_norm": 1.9764612571544942, "language_loss": 0.79003155, "learning_rate": 9.745883421664096e-07, "loss": 0.80834144, "num_input_tokens_seen": 244460415, "step": 11330, "time_per_iteration": 2.773776054382324 }, { "auxiliary_loss_clip": 0.01103225, "auxiliary_loss_mlp": 0.01032506, "balance_loss_clip": 1.03934312, "balance_loss_mlp": 1.01867759, "epoch": 0.6812565759807605, "flos": 24863471268480.0, "grad_norm": 2.2394798931993467, "language_loss": 0.6390332, "learning_rate": 9.742539836972665e-07, "loss": 0.66039056, "num_input_tokens_seen": 244480555, "step": 11331, "time_per_iteration": 2.648928165435791 }, { "auxiliary_loss_clip": 0.01066258, "auxiliary_loss_mlp": 0.01041405, "balance_loss_clip": 1.0375067, "balance_loss_mlp": 1.02576447, "epoch": 0.6813166992334285, "flos": 17166019449600.0, "grad_norm": 1.609984813626945, "language_loss": 0.72195572, "learning_rate": 9.739196641245148e-07, "loss": 0.74303234, "num_input_tokens_seen": 244498540, "step": 11332, "time_per_iteration": 2.713545322418213 }, { "auxiliary_loss_clip": 0.01103166, "auxiliary_loss_mlp": 0.01035791, "balance_loss_clip": 1.03957558, "balance_loss_mlp": 1.02199841, "epoch": 0.6813768224860965, "flos": 18843550375680.0, "grad_norm": 2.168847998609439, "language_loss": 0.74432015, "learning_rate": 9.735853834608326e-07, "loss": 0.76570976, "num_input_tokens_seen": 244517015, "step": 11333, "time_per_iteration": 2.6175012588500977 }, { "auxiliary_loss_clip": 0.01105297, "auxiliary_loss_mlp": 0.01033437, "balance_loss_clip": 1.04025042, "balance_loss_mlp": 1.01950097, "epoch": 0.6814369457387645, "flos": 24532733813760.0, "grad_norm": 1.5744091613936917, "language_loss": 0.71911031, "learning_rate": 9.732511417188963e-07, "loss": 0.74049771, "num_input_tokens_seen": 244537450, "step": 11334, "time_per_iteration": 2.6650426387786865 }, { "auxiliary_loss_clip": 0.0109758, "auxiliary_loss_mlp": 0.01035782, "balance_loss_clip": 1.0405134, "balance_loss_mlp": 1.02300835, "epoch": 0.6814970689914325, "flos": 18222978078720.0, "grad_norm": 1.6636732632213396, "language_loss": 0.85627699, "learning_rate": 9.729169389113791e-07, "loss": 0.87761062, "num_input_tokens_seen": 244555640, "step": 11335, "time_per_iteration": 2.6842143535614014 }, { "auxiliary_loss_clip": 0.01094419, "auxiliary_loss_mlp": 0.01029844, "balance_loss_clip": 1.03577423, "balance_loss_mlp": 1.01767254, "epoch": 0.6815571922441004, "flos": 25228790542080.0, "grad_norm": 1.7085573075393508, "language_loss": 0.82023531, "learning_rate": 9.725827750509542e-07, "loss": 0.84147793, "num_input_tokens_seen": 244574005, "step": 11336, "time_per_iteration": 2.6614902019500732 }, { "auxiliary_loss_clip": 0.01068778, "auxiliary_loss_mlp": 0.01036536, "balance_loss_clip": 1.03518701, "balance_loss_mlp": 1.02383399, "epoch": 0.6816173154967684, "flos": 19456078026240.0, "grad_norm": 2.367816368546713, "language_loss": 0.81341016, "learning_rate": 9.72248650150294e-07, "loss": 0.83446324, "num_input_tokens_seen": 244591395, "step": 11337, "time_per_iteration": 2.657960891723633 }, { "auxiliary_loss_clip": 0.01066549, "auxiliary_loss_mlp": 0.01031579, "balance_loss_clip": 1.03811026, "balance_loss_mlp": 1.01948464, "epoch": 0.6816774387494363, "flos": 17931455297280.0, "grad_norm": 1.783864414164821, "language_loss": 0.72693783, "learning_rate": 9.719145642220673e-07, "loss": 0.74791908, "num_input_tokens_seen": 244610400, "step": 11338, "time_per_iteration": 2.7979798316955566 }, { "auxiliary_loss_clip": 0.0107103, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.03607392, "balance_loss_mlp": 1.02586377, "epoch": 0.6817375620021043, "flos": 22233014478720.0, "grad_norm": 1.4931481110033054, "language_loss": 0.775159, "learning_rate": 9.715805172789435e-07, "loss": 0.79626048, "num_input_tokens_seen": 244630400, "step": 11339, "time_per_iteration": 2.795623540878296 }, { "auxiliary_loss_clip": 0.01078643, "auxiliary_loss_mlp": 0.01038418, "balance_loss_clip": 1.03534794, "balance_loss_mlp": 1.02518606, "epoch": 0.6817976852547722, "flos": 25374408278400.0, "grad_norm": 2.310939550899193, "language_loss": 0.70462239, "learning_rate": 9.712465093335901e-07, "loss": 0.72579294, "num_input_tokens_seen": 244649155, "step": 11340, "time_per_iteration": 2.7247865200042725 }, { "auxiliary_loss_clip": 0.01095095, "auxiliary_loss_mlp": 0.01039919, "balance_loss_clip": 1.04094124, "balance_loss_mlp": 1.02693105, "epoch": 0.6818578085074403, "flos": 22265764704000.0, "grad_norm": 2.4098119524731483, "language_loss": 0.83344734, "learning_rate": 9.709125403986722e-07, "loss": 0.85479748, "num_input_tokens_seen": 244665470, "step": 11341, "time_per_iteration": 2.693506956100464 }, { "auxiliary_loss_clip": 0.01081074, "auxiliary_loss_mlp": 0.01039727, "balance_loss_clip": 1.03870487, "balance_loss_mlp": 1.02477837, "epoch": 0.6819179317601082, "flos": 19318145800320.0, "grad_norm": 2.2255629522007907, "language_loss": 0.68262535, "learning_rate": 9.705786104868531e-07, "loss": 0.70383334, "num_input_tokens_seen": 244684390, "step": 11342, "time_per_iteration": 2.73895263671875 }, { "auxiliary_loss_clip": 0.01057789, "auxiliary_loss_mlp": 0.01030928, "balance_loss_clip": 1.0362978, "balance_loss_mlp": 1.01732635, "epoch": 0.6819780550127762, "flos": 21104126864640.0, "grad_norm": 1.497693268832665, "language_loss": 0.74835187, "learning_rate": 9.702447196107963e-07, "loss": 0.76923907, "num_input_tokens_seen": 244703370, "step": 11343, "time_per_iteration": 2.713353157043457 }, { "auxiliary_loss_clip": 0.0107318, "auxiliary_loss_mlp": 0.01047273, "balance_loss_clip": 1.03880191, "balance_loss_mlp": 1.03244925, "epoch": 0.6820381782654441, "flos": 29716403195520.0, "grad_norm": 2.1197347783426714, "language_loss": 0.79880822, "learning_rate": 9.699108677831639e-07, "loss": 0.82001281, "num_input_tokens_seen": 244723325, "step": 11344, "time_per_iteration": 2.794928550720215 }, { "auxiliary_loss_clip": 0.01076417, "auxiliary_loss_mlp": 0.01035037, "balance_loss_clip": 1.03809428, "balance_loss_mlp": 1.02200782, "epoch": 0.6820983015181121, "flos": 29242130993280.0, "grad_norm": 2.387575724266914, "language_loss": 0.66499114, "learning_rate": 9.695770550166136e-07, "loss": 0.68610573, "num_input_tokens_seen": 244745650, "step": 11345, "time_per_iteration": 2.7620160579681396 }, { "auxiliary_loss_clip": 0.01095586, "auxiliary_loss_mlp": 0.01037386, "balance_loss_clip": 1.04086328, "balance_loss_mlp": 1.02385557, "epoch": 0.6821584247707801, "flos": 18871775487360.0, "grad_norm": 2.5184537784822347, "language_loss": 0.64879942, "learning_rate": 9.692432813238054e-07, "loss": 0.67012918, "num_input_tokens_seen": 244760270, "step": 11346, "time_per_iteration": 2.6018569469451904 }, { "auxiliary_loss_clip": 0.01047778, "auxiliary_loss_mlp": 0.00774999, "balance_loss_clip": 1.02989614, "balance_loss_mlp": 1.00022125, "epoch": 0.6822185480234481, "flos": 21324582587520.0, "grad_norm": 1.6732433495926922, "language_loss": 0.78631318, "learning_rate": 9.689095467173952e-07, "loss": 0.80454087, "num_input_tokens_seen": 244779565, "step": 11347, "time_per_iteration": 2.7881743907928467 }, { "auxiliary_loss_clip": 0.01023846, "auxiliary_loss_mlp": 0.01003144, "balance_loss_clip": 1.01006222, "balance_loss_mlp": 1.00196934, "epoch": 0.6822786712761161, "flos": 63488306430720.0, "grad_norm": 0.7197437780259376, "language_loss": 0.5252403, "learning_rate": 9.685758512100378e-07, "loss": 0.54551017, "num_input_tokens_seen": 244838480, "step": 11348, "time_per_iteration": 3.159472942352295 }, { "auxiliary_loss_clip": 0.01111335, "auxiliary_loss_mlp": 0.0103669, "balance_loss_clip": 1.03910565, "balance_loss_mlp": 1.02423859, "epoch": 0.682338794528784, "flos": 21068934514560.0, "grad_norm": 1.699268260200451, "language_loss": 0.79743314, "learning_rate": 9.682421948143873e-07, "loss": 0.8189134, "num_input_tokens_seen": 244855265, "step": 11349, "time_per_iteration": 2.6090118885040283 }, { "auxiliary_loss_clip": 0.01107133, "auxiliary_loss_mlp": 0.01033347, "balance_loss_clip": 1.03977346, "balance_loss_mlp": 1.01808834, "epoch": 0.682398917781452, "flos": 36283243547520.0, "grad_norm": 1.8874948598089236, "language_loss": 0.73788822, "learning_rate": 9.67908577543096e-07, "loss": 0.75929302, "num_input_tokens_seen": 244875555, "step": 11350, "time_per_iteration": 2.819202184677124 }, { "auxiliary_loss_clip": 0.01113228, "auxiliary_loss_mlp": 0.01032526, "balance_loss_clip": 1.04043305, "balance_loss_mlp": 1.01912093, "epoch": 0.6824590410341199, "flos": 24859197550080.0, "grad_norm": 1.5956944903953967, "language_loss": 0.79352248, "learning_rate": 9.675749994088161e-07, "loss": 0.81498003, "num_input_tokens_seen": 244895270, "step": 11351, "time_per_iteration": 4.24803614616394 }, { "auxiliary_loss_clip": 0.01100964, "auxiliary_loss_mlp": 0.01038048, "balance_loss_clip": 1.03889775, "balance_loss_mlp": 1.02563834, "epoch": 0.6825191642867879, "flos": 22452392793600.0, "grad_norm": 1.5926332734392936, "language_loss": 0.73048198, "learning_rate": 9.672414604241954e-07, "loss": 0.75187206, "num_input_tokens_seen": 244914535, "step": 11352, "time_per_iteration": 2.608426094055176 }, { "auxiliary_loss_clip": 0.01066712, "auxiliary_loss_mlp": 0.01039687, "balance_loss_clip": 1.03466344, "balance_loss_mlp": 1.0250721, "epoch": 0.6825792875394558, "flos": 29424377623680.0, "grad_norm": 1.4647464086643525, "language_loss": 0.79812884, "learning_rate": 9.669079606018814e-07, "loss": 0.81919283, "num_input_tokens_seen": 244936095, "step": 11353, "time_per_iteration": 2.789823532104492 }, { "auxiliary_loss_clip": 0.0110206, "auxiliary_loss_mlp": 0.01030724, "balance_loss_clip": 1.03809357, "balance_loss_mlp": 1.01783192, "epoch": 0.6826394107921239, "flos": 18770974945920.0, "grad_norm": 1.6494881368897465, "language_loss": 0.78637832, "learning_rate": 9.665744999545218e-07, "loss": 0.80770618, "num_input_tokens_seen": 244955290, "step": 11354, "time_per_iteration": 2.621384382247925 }, { "auxiliary_loss_clip": 0.0105434, "auxiliary_loss_mlp": 0.01030514, "balance_loss_clip": 1.03731966, "balance_loss_mlp": 1.01798463, "epoch": 0.6826995340447918, "flos": 16617591619200.0, "grad_norm": 2.3579555752139107, "language_loss": 0.61690813, "learning_rate": 9.662410784947599e-07, "loss": 0.63775671, "num_input_tokens_seen": 244972935, "step": 11355, "time_per_iteration": 4.416518449783325 }, { "auxiliary_loss_clip": 0.01059431, "auxiliary_loss_mlp": 0.01031412, "balance_loss_clip": 1.03248143, "balance_loss_mlp": 1.01780415, "epoch": 0.6827596572974598, "flos": 20848299223680.0, "grad_norm": 2.0827591580165525, "language_loss": 0.81958997, "learning_rate": 9.659076962352398e-07, "loss": 0.84049839, "num_input_tokens_seen": 244989440, "step": 11356, "time_per_iteration": 2.772223949432373 }, { "auxiliary_loss_clip": 0.0109731, "auxiliary_loss_mlp": 0.01033608, "balance_loss_clip": 1.04186547, "balance_loss_mlp": 1.01991129, "epoch": 0.6828197805501277, "flos": 22748081552640.0, "grad_norm": 1.8123732324115438, "language_loss": 0.78407294, "learning_rate": 9.655743531886052e-07, "loss": 0.80538213, "num_input_tokens_seen": 245007830, "step": 11357, "time_per_iteration": 2.7943849563598633 }, { "auxiliary_loss_clip": 0.01014132, "auxiliary_loss_mlp": 0.01026339, "balance_loss_clip": 1.008708, "balance_loss_mlp": 1.02481925, "epoch": 0.6828799038027957, "flos": 71646565829760.0, "grad_norm": 0.8539086135635664, "language_loss": 0.59534943, "learning_rate": 9.65241049367493e-07, "loss": 0.61575413, "num_input_tokens_seen": 245070720, "step": 11358, "time_per_iteration": 3.2622344493865967 }, { "auxiliary_loss_clip": 0.0107463, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 1.03272629, "balance_loss_mlp": 1.03269577, "epoch": 0.6829400270554637, "flos": 19829154637440.0, "grad_norm": 1.8877966802390573, "language_loss": 0.78321809, "learning_rate": 9.64907784784544e-07, "loss": 0.80445516, "num_input_tokens_seen": 245089070, "step": 11359, "time_per_iteration": 2.7041001319885254 }, { "auxiliary_loss_clip": 0.01102262, "auxiliary_loss_mlp": 0.01035361, "balance_loss_clip": 1.03907776, "balance_loss_mlp": 1.02245069, "epoch": 0.6830001503081317, "flos": 21980634543360.0, "grad_norm": 1.8916230773559268, "language_loss": 0.81728172, "learning_rate": 9.645745594523958e-07, "loss": 0.83865792, "num_input_tokens_seen": 245106500, "step": 11360, "time_per_iteration": 2.7257988452911377 }, { "auxiliary_loss_clip": 0.01103488, "auxiliary_loss_mlp": 0.01039736, "balance_loss_clip": 1.04132986, "balance_loss_mlp": 1.02476335, "epoch": 0.6830602735607997, "flos": 24316767290880.0, "grad_norm": 1.928922588936481, "language_loss": 0.75214481, "learning_rate": 9.642413733836844e-07, "loss": 0.77357709, "num_input_tokens_seen": 245125260, "step": 11361, "time_per_iteration": 4.145911931991577 }, { "auxiliary_loss_clip": 0.01014146, "auxiliary_loss_mlp": 0.01006013, "balance_loss_clip": 1.01607728, "balance_loss_mlp": 1.00464237, "epoch": 0.6831203968134676, "flos": 57690062323200.0, "grad_norm": 0.8723971229353714, "language_loss": 0.59647572, "learning_rate": 9.639082265910437e-07, "loss": 0.6166774, "num_input_tokens_seen": 245188730, "step": 11362, "time_per_iteration": 3.303649425506592 }, { "auxiliary_loss_clip": 0.01085969, "auxiliary_loss_mlp": 0.01032083, "balance_loss_clip": 1.03544044, "balance_loss_mlp": 1.01791525, "epoch": 0.6831805200661356, "flos": 14388436552320.0, "grad_norm": 2.2849011537380384, "language_loss": 0.75293076, "learning_rate": 9.635751190871074e-07, "loss": 0.77411127, "num_input_tokens_seen": 245205065, "step": 11363, "time_per_iteration": 2.646646499633789 }, { "auxiliary_loss_clip": 0.0109026, "auxiliary_loss_mlp": 0.01039827, "balance_loss_clip": 1.03785634, "balance_loss_mlp": 1.02562308, "epoch": 0.6832406433188035, "flos": 22820297846400.0, "grad_norm": 2.373792593478636, "language_loss": 0.89238822, "learning_rate": 9.632420508845063e-07, "loss": 0.91368914, "num_input_tokens_seen": 245224265, "step": 11364, "time_per_iteration": 2.7119343280792236 }, { "auxiliary_loss_clip": 0.0108884, "auxiliary_loss_mlp": 0.01037187, "balance_loss_clip": 1.03655684, "balance_loss_mlp": 1.02403259, "epoch": 0.6833007665714715, "flos": 17561718650880.0, "grad_norm": 2.1030126962068634, "language_loss": 0.88149464, "learning_rate": 9.629090219958697e-07, "loss": 0.9027549, "num_input_tokens_seen": 245243360, "step": 11365, "time_per_iteration": 2.702363967895508 }, { "auxiliary_loss_clip": 0.01078844, "auxiliary_loss_mlp": 0.01042641, "balance_loss_clip": 1.03964448, "balance_loss_mlp": 1.02709591, "epoch": 0.6833608898241395, "flos": 22445928345600.0, "grad_norm": 2.424437854190435, "language_loss": 0.81156111, "learning_rate": 9.625760324338272e-07, "loss": 0.83277589, "num_input_tokens_seen": 245256350, "step": 11366, "time_per_iteration": 2.674567937850952 }, { "auxiliary_loss_clip": 0.01093776, "auxiliary_loss_mlp": 0.01032035, "balance_loss_clip": 1.03835893, "balance_loss_mlp": 1.0188446, "epoch": 0.6834210130768075, "flos": 24534637234560.0, "grad_norm": 6.88774223787515, "language_loss": 0.76857549, "learning_rate": 9.622430822110062e-07, "loss": 0.78983361, "num_input_tokens_seen": 245277575, "step": 11367, "time_per_iteration": 2.759528160095215 }, { "auxiliary_loss_clip": 0.01087848, "auxiliary_loss_mlp": 0.01037587, "balance_loss_clip": 1.03855908, "balance_loss_mlp": 1.0238061, "epoch": 0.6834811363294754, "flos": 20047132321920.0, "grad_norm": 1.645021355885407, "language_loss": 0.69147146, "learning_rate": 9.619101713400312e-07, "loss": 0.71272576, "num_input_tokens_seen": 245296615, "step": 11368, "time_per_iteration": 2.730281352996826 }, { "auxiliary_loss_clip": 0.01073853, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.03326845, "balance_loss_mlp": 1.02335334, "epoch": 0.6835412595821434, "flos": 24790752184320.0, "grad_norm": 2.698591110002851, "language_loss": 0.73457599, "learning_rate": 9.615772998335261e-07, "loss": 0.75568151, "num_input_tokens_seen": 245316275, "step": 11369, "time_per_iteration": 2.7577805519104004 }, { "auxiliary_loss_clip": 0.01098451, "auxiliary_loss_mlp": 0.01031594, "balance_loss_clip": 1.03844953, "balance_loss_mlp": 1.01816475, "epoch": 0.6836013828348113, "flos": 19500356517120.0, "grad_norm": 1.7788685178761994, "language_loss": 0.79114872, "learning_rate": 9.612444677041138e-07, "loss": 0.81244916, "num_input_tokens_seen": 245334595, "step": 11370, "time_per_iteration": 2.6396684646606445 }, { "auxiliary_loss_clip": 0.01022242, "auxiliary_loss_mlp": 0.01001045, "balance_loss_clip": 1.00799215, "balance_loss_mlp": 0.99983543, "epoch": 0.6836615060874793, "flos": 58363999251840.0, "grad_norm": 0.7422193722806905, "language_loss": 0.59737813, "learning_rate": 9.609116749644162e-07, "loss": 0.61761105, "num_input_tokens_seen": 245389750, "step": 11371, "time_per_iteration": 3.0740749835968018 }, { "auxiliary_loss_clip": 0.01085535, "auxiliary_loss_mlp": 0.01029543, "balance_loss_clip": 1.03905046, "balance_loss_mlp": 1.01730061, "epoch": 0.6837216293401474, "flos": 12166895168640.0, "grad_norm": 1.4865479653921647, "language_loss": 0.63814664, "learning_rate": 9.605789216270511e-07, "loss": 0.65929747, "num_input_tokens_seen": 245407530, "step": 11372, "time_per_iteration": 2.7413337230682373 }, { "auxiliary_loss_clip": 0.0110109, "auxiliary_loss_mlp": 0.01031341, "balance_loss_clip": 1.03960001, "balance_loss_mlp": 1.01804972, "epoch": 0.6837817525928153, "flos": 22127581082880.0, "grad_norm": 1.4899002874098461, "language_loss": 0.71882284, "learning_rate": 9.602462077046375e-07, "loss": 0.74014717, "num_input_tokens_seen": 245427000, "step": 11373, "time_per_iteration": 2.6774277687072754 }, { "auxiliary_loss_clip": 0.01004865, "auxiliary_loss_mlp": 0.01001461, "balance_loss_clip": 1.00957799, "balance_loss_mlp": 1.00026858, "epoch": 0.6838418758454833, "flos": 65005928985600.0, "grad_norm": 1.2536847263503932, "language_loss": 0.56630689, "learning_rate": 9.599135332097935e-07, "loss": 0.58637011, "num_input_tokens_seen": 245491620, "step": 11374, "time_per_iteration": 3.388324499130249 }, { "auxiliary_loss_clip": 0.01107854, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.04247069, "balance_loss_mlp": 1.01605177, "epoch": 0.6839019990981512, "flos": 21030833162880.0, "grad_norm": 1.4678437510466378, "language_loss": 0.74034035, "learning_rate": 9.595808981551312e-07, "loss": 0.76171762, "num_input_tokens_seen": 245511285, "step": 11375, "time_per_iteration": 2.6397507190704346 }, { "auxiliary_loss_clip": 0.01095867, "auxiliary_loss_mlp": 0.01034414, "balance_loss_clip": 1.04174185, "balance_loss_mlp": 1.02130103, "epoch": 0.6839621223508192, "flos": 24935543907840.0, "grad_norm": 1.7532880441573435, "language_loss": 0.70852029, "learning_rate": 9.592483025532651e-07, "loss": 0.72982311, "num_input_tokens_seen": 245532910, "step": 11376, "time_per_iteration": 2.693699598312378 }, { "auxiliary_loss_clip": 0.01115191, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.03887844, "balance_loss_mlp": 1.02039814, "epoch": 0.6840222456034871, "flos": 26358827391360.0, "grad_norm": 2.037488504873538, "language_loss": 0.74301463, "learning_rate": 9.58915746416808e-07, "loss": 0.76450574, "num_input_tokens_seen": 245550540, "step": 11377, "time_per_iteration": 2.5986266136169434 }, { "auxiliary_loss_clip": 0.01014709, "auxiliary_loss_mlp": 0.010028, "balance_loss_clip": 1.00959396, "balance_loss_mlp": 1.00172734, "epoch": 0.6840823688561551, "flos": 65988336936960.0, "grad_norm": 0.7236208934827679, "language_loss": 0.56872022, "learning_rate": 9.585832297583707e-07, "loss": 0.58889532, "num_input_tokens_seen": 245619570, "step": 11378, "time_per_iteration": 3.304108142852783 }, { "auxiliary_loss_clip": 0.01114944, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.04010487, "balance_loss_mlp": 1.02452612, "epoch": 0.684142492108823, "flos": 21397588980480.0, "grad_norm": 1.9771846674075895, "language_loss": 0.78299057, "learning_rate": 9.58250752590561e-07, "loss": 0.80452579, "num_input_tokens_seen": 245637980, "step": 11379, "time_per_iteration": 2.5876471996307373 }, { "auxiliary_loss_clip": 0.01110374, "auxiliary_loss_mlp": 0.0102753, "balance_loss_clip": 1.04125404, "balance_loss_mlp": 1.01560271, "epoch": 0.6842026153614911, "flos": 18801426700800.0, "grad_norm": 1.9586312359498843, "language_loss": 0.69083488, "learning_rate": 9.57918314925988e-07, "loss": 0.71221387, "num_input_tokens_seen": 245655690, "step": 11380, "time_per_iteration": 2.565652847290039 }, { "auxiliary_loss_clip": 0.0109036, "auxiliary_loss_mlp": 0.01036194, "balance_loss_clip": 1.03853393, "balance_loss_mlp": 1.02266991, "epoch": 0.684262738614159, "flos": 19646405216640.0, "grad_norm": 1.9286622353610317, "language_loss": 0.78519118, "learning_rate": 9.575859167772568e-07, "loss": 0.80645669, "num_input_tokens_seen": 245671525, "step": 11381, "time_per_iteration": 2.6301379203796387 }, { "auxiliary_loss_clip": 0.010226, "auxiliary_loss_mlp": 0.01003847, "balance_loss_clip": 1.00947046, "balance_loss_mlp": 1.00290525, "epoch": 0.684322861866827, "flos": 62354462739840.0, "grad_norm": 0.864991455722599, "language_loss": 0.67092407, "learning_rate": 9.572535581569713e-07, "loss": 0.69118857, "num_input_tokens_seen": 245724115, "step": 11382, "time_per_iteration": 3.0039761066436768 }, { "auxiliary_loss_clip": 0.01021817, "auxiliary_loss_mlp": 0.01001983, "balance_loss_clip": 1.0083313, "balance_loss_mlp": 1.000862, "epoch": 0.6843829851194949, "flos": 65805048812160.0, "grad_norm": 0.8192420417585807, "language_loss": 0.58103538, "learning_rate": 9.569212390777356e-07, "loss": 0.60127336, "num_input_tokens_seen": 245789245, "step": 11383, "time_per_iteration": 3.165360450744629 }, { "auxiliary_loss_clip": 0.01062418, "auxiliary_loss_mlp": 0.01038341, "balance_loss_clip": 1.03478217, "balance_loss_mlp": 1.02372622, "epoch": 0.6844431083721629, "flos": 27855153181440.0, "grad_norm": 2.857238801522205, "language_loss": 0.80316836, "learning_rate": 9.565889595521517e-07, "loss": 0.82417595, "num_input_tokens_seen": 245812420, "step": 11384, "time_per_iteration": 2.770827054977417 }, { "auxiliary_loss_clip": 0.01103805, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.03886342, "balance_loss_mlp": 1.01740408, "epoch": 0.684503231624831, "flos": 18255010032000.0, "grad_norm": 2.2679652674144157, "language_loss": 0.77255201, "learning_rate": 9.562567195928187e-07, "loss": 0.79389346, "num_input_tokens_seen": 245829135, "step": 11385, "time_per_iteration": 2.591132164001465 }, { "auxiliary_loss_clip": 0.0108167, "auxiliary_loss_mlp": 0.01042801, "balance_loss_clip": 1.0381335, "balance_loss_mlp": 1.02736902, "epoch": 0.6845633548774989, "flos": 17639681120640.0, "grad_norm": 2.065101540426227, "language_loss": 0.84796238, "learning_rate": 9.55924519212335e-07, "loss": 0.86920702, "num_input_tokens_seen": 245847140, "step": 11386, "time_per_iteration": 2.6891727447509766 }, { "auxiliary_loss_clip": 0.01103811, "auxiliary_loss_mlp": 0.01041499, "balance_loss_clip": 1.04075646, "balance_loss_mlp": 1.02887416, "epoch": 0.6846234781301669, "flos": 20807576179200.0, "grad_norm": 2.1990004125382634, "language_loss": 0.83455414, "learning_rate": 9.555923584232984e-07, "loss": 0.85600722, "num_input_tokens_seen": 245862855, "step": 11387, "time_per_iteration": 2.61997127532959 }, { "auxiliary_loss_clip": 0.01092977, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.03438902, "balance_loss_mlp": 1.01760602, "epoch": 0.6846836013828348, "flos": 36101176485120.0, "grad_norm": 1.848194295156486, "language_loss": 0.72119319, "learning_rate": 9.552602372383047e-07, "loss": 0.74242949, "num_input_tokens_seen": 245885415, "step": 11388, "time_per_iteration": 2.7075023651123047 }, { "auxiliary_loss_clip": 0.01098153, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.04095197, "balance_loss_mlp": 1.01512408, "epoch": 0.6847437246355028, "flos": 43142468607360.0, "grad_norm": 2.050560945832389, "language_loss": 0.6225087, "learning_rate": 9.549281556699469e-07, "loss": 0.64376616, "num_input_tokens_seen": 245906285, "step": 11389, "time_per_iteration": 2.8079371452331543 }, { "auxiliary_loss_clip": 0.01011672, "auxiliary_loss_mlp": 0.01004667, "balance_loss_clip": 1.00851202, "balance_loss_mlp": 1.00345695, "epoch": 0.6848038478881707, "flos": 71663729552640.0, "grad_norm": 0.7413355837031695, "language_loss": 0.5598197, "learning_rate": 9.54596113730818e-07, "loss": 0.57998312, "num_input_tokens_seen": 245967620, "step": 11390, "time_per_iteration": 5.026982307434082 }, { "auxiliary_loss_clip": 0.01076744, "auxiliary_loss_mlp": 0.00771583, "balance_loss_clip": 1.03878915, "balance_loss_mlp": 1.00011551, "epoch": 0.6848639711408387, "flos": 19937820257280.0, "grad_norm": 1.8276249174915487, "language_loss": 0.87787604, "learning_rate": 9.542641114335109e-07, "loss": 0.89635926, "num_input_tokens_seen": 245985075, "step": 11391, "time_per_iteration": 2.7455759048461914 }, { "auxiliary_loss_clip": 0.01073324, "auxiliary_loss_mlp": 0.01040704, "balance_loss_clip": 1.03858793, "balance_loss_mlp": 1.0274303, "epoch": 0.6849240943935067, "flos": 26867501844480.0, "grad_norm": 1.7326264104251545, "language_loss": 0.79257655, "learning_rate": 9.539321487906117e-07, "loss": 0.81371683, "num_input_tokens_seen": 246003560, "step": 11392, "time_per_iteration": 2.8595845699310303 }, { "auxiliary_loss_clip": 0.0108908, "auxiliary_loss_mlp": 0.01032172, "balance_loss_clip": 1.03764129, "balance_loss_mlp": 1.01933873, "epoch": 0.6849842176461747, "flos": 13735365425280.0, "grad_norm": 2.218400680256619, "language_loss": 0.71076894, "learning_rate": 9.536002258147104e-07, "loss": 0.7319814, "num_input_tokens_seen": 246019600, "step": 11393, "time_per_iteration": 2.680263042449951 }, { "auxiliary_loss_clip": 0.01075845, "auxiliary_loss_mlp": 0.01032815, "balance_loss_clip": 1.03768921, "balance_loss_mlp": 1.01831901, "epoch": 0.6850443408988426, "flos": 24973070641920.0, "grad_norm": 1.8123459031815148, "language_loss": 0.64661837, "learning_rate": 9.532683425183936e-07, "loss": 0.66770494, "num_input_tokens_seen": 246038920, "step": 11394, "time_per_iteration": 4.561980724334717 }, { "auxiliary_loss_clip": 0.01087026, "auxiliary_loss_mlp": 0.00773484, "balance_loss_clip": 1.03753853, "balance_loss_mlp": 1.00009871, "epoch": 0.6851044641515106, "flos": 27744225004800.0, "grad_norm": 2.9988719811827633, "language_loss": 0.80739737, "learning_rate": 9.529364989142468e-07, "loss": 0.82600248, "num_input_tokens_seen": 246060490, "step": 11395, "time_per_iteration": 2.758030891418457 }, { "auxiliary_loss_clip": 0.01077162, "auxiliary_loss_mlp": 0.01035315, "balance_loss_clip": 1.03991926, "balance_loss_mlp": 1.02056861, "epoch": 0.6851645874041785, "flos": 24351061800960.0, "grad_norm": 1.836466665804894, "language_loss": 0.73088896, "learning_rate": 9.526046950148527e-07, "loss": 0.75201374, "num_input_tokens_seen": 246081465, "step": 11396, "time_per_iteration": 2.781780481338501 }, { "auxiliary_loss_clip": 0.01084632, "auxiliary_loss_mlp": 0.01031663, "balance_loss_clip": 1.03876483, "balance_loss_mlp": 1.01705348, "epoch": 0.6852247106568465, "flos": 15077849264640.0, "grad_norm": 3.468595562954195, "language_loss": 0.79397655, "learning_rate": 9.522729308327931e-07, "loss": 0.81513953, "num_input_tokens_seen": 246096110, "step": 11397, "time_per_iteration": 2.759290933609009 }, { "auxiliary_loss_clip": 0.01035311, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.03174019, "balance_loss_mlp": 1.02346683, "epoch": 0.6852848339095146, "flos": 18770005278720.0, "grad_norm": 1.7538021552670298, "language_loss": 0.71620733, "learning_rate": 9.519412063806493e-07, "loss": 0.73694277, "num_input_tokens_seen": 246114785, "step": 11398, "time_per_iteration": 2.8469512462615967 }, { "auxiliary_loss_clip": 0.01063012, "auxiliary_loss_mlp": 0.01031401, "balance_loss_clip": 1.03693652, "balance_loss_mlp": 1.01927114, "epoch": 0.6853449571621825, "flos": 27854363082240.0, "grad_norm": 1.5995227781124475, "language_loss": 0.70539916, "learning_rate": 9.516095216709996e-07, "loss": 0.72634327, "num_input_tokens_seen": 246136375, "step": 11399, "time_per_iteration": 2.8067455291748047 }, { "auxiliary_loss_clip": 0.01099638, "auxiliary_loss_mlp": 0.01034955, "balance_loss_clip": 1.03867149, "balance_loss_mlp": 1.02175879, "epoch": 0.6854050804148505, "flos": 18150510389760.0, "grad_norm": 1.5522963452984355, "language_loss": 0.7023446, "learning_rate": 9.512778767164217e-07, "loss": 0.72369051, "num_input_tokens_seen": 246155090, "step": 11400, "time_per_iteration": 4.245120286941528 }, { "auxiliary_loss_clip": 0.01077599, "auxiliary_loss_mlp": 0.01038032, "balance_loss_clip": 1.04081964, "balance_loss_mlp": 1.02140248, "epoch": 0.6854652036675184, "flos": 16326212492160.0, "grad_norm": 2.0326109813245217, "language_loss": 0.77846044, "learning_rate": 9.509462715294927e-07, "loss": 0.79961675, "num_input_tokens_seen": 246172645, "step": 11401, "time_per_iteration": 2.758004665374756 }, { "auxiliary_loss_clip": 0.01113766, "auxiliary_loss_mlp": 0.01037682, "balance_loss_clip": 1.04050303, "balance_loss_mlp": 1.02477169, "epoch": 0.6855253269201864, "flos": 14940814878720.0, "grad_norm": 1.868317908345602, "language_loss": 0.75315881, "learning_rate": 9.50614706122786e-07, "loss": 0.77467334, "num_input_tokens_seen": 246189055, "step": 11402, "time_per_iteration": 2.562199115753174 }, { "auxiliary_loss_clip": 0.0109933, "auxiliary_loss_mlp": 0.01041955, "balance_loss_clip": 1.03740358, "balance_loss_mlp": 1.02720892, "epoch": 0.6855854501728543, "flos": 23037736826880.0, "grad_norm": 1.5371325501517963, "language_loss": 0.72588831, "learning_rate": 9.502831805088742e-07, "loss": 0.74730122, "num_input_tokens_seen": 246207990, "step": 11403, "time_per_iteration": 2.677266836166382 }, { "auxiliary_loss_clip": 0.01114001, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.04157901, "balance_loss_mlp": 1.02145004, "epoch": 0.6856455734255223, "flos": 13253623194240.0, "grad_norm": 2.0747553095420175, "language_loss": 0.81451255, "learning_rate": 9.499516947003294e-07, "loss": 0.83600086, "num_input_tokens_seen": 246221595, "step": 11404, "time_per_iteration": 2.5857958793640137 }, { "auxiliary_loss_clip": 0.01086293, "auxiliary_loss_mlp": 0.01040032, "balance_loss_clip": 1.0375222, "balance_loss_mlp": 1.0263046, "epoch": 0.6857056966781903, "flos": 23333461499520.0, "grad_norm": 1.3945702093947172, "language_loss": 0.77848321, "learning_rate": 9.496202487097222e-07, "loss": 0.79974639, "num_input_tokens_seen": 246242970, "step": 11405, "time_per_iteration": 2.743281364440918 }, { "auxiliary_loss_clip": 0.01023454, "auxiliary_loss_mlp": 0.00999881, "balance_loss_clip": 1.00911474, "balance_loss_mlp": 0.99873084, "epoch": 0.6857658199308583, "flos": 61852647784320.0, "grad_norm": 0.7882286280493239, "language_loss": 0.60976082, "learning_rate": 9.492888425496199e-07, "loss": 0.62999415, "num_input_tokens_seen": 246300405, "step": 11406, "time_per_iteration": 3.236720085144043 }, { "auxiliary_loss_clip": 0.01080565, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.03731775, "balance_loss_mlp": 1.02062762, "epoch": 0.6858259431835262, "flos": 16654543735680.0, "grad_norm": 1.6671355551751728, "language_loss": 0.76914632, "learning_rate": 9.489574762325907e-07, "loss": 0.79030716, "num_input_tokens_seen": 246318780, "step": 11407, "time_per_iteration": 2.7857916355133057 }, { "auxiliary_loss_clip": 0.01092831, "auxiliary_loss_mlp": 0.01039174, "balance_loss_clip": 1.0389874, "balance_loss_mlp": 1.02427292, "epoch": 0.6858860664361942, "flos": 21872974504320.0, "grad_norm": 2.9798515710303572, "language_loss": 0.71276259, "learning_rate": 9.486261497711991e-07, "loss": 0.7340827, "num_input_tokens_seen": 246339405, "step": 11408, "time_per_iteration": 2.8327853679656982 }, { "auxiliary_loss_clip": 0.01104322, "auxiliary_loss_mlp": 0.01031824, "balance_loss_clip": 1.03901792, "balance_loss_mlp": 1.01819825, "epoch": 0.6859461896888621, "flos": 15267637751040.0, "grad_norm": 1.7652749442295346, "language_loss": 0.70438635, "learning_rate": 9.482948631780087e-07, "loss": 0.72574776, "num_input_tokens_seen": 246357055, "step": 11409, "time_per_iteration": 2.6262388229370117 }, { "auxiliary_loss_clip": 0.01069373, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.03979826, "balance_loss_mlp": 1.01718974, "epoch": 0.6860063129415301, "flos": 18620293392000.0, "grad_norm": 1.5800008029842278, "language_loss": 0.78244615, "learning_rate": 9.479636164655825e-07, "loss": 0.80343449, "num_input_tokens_seen": 246374050, "step": 11410, "time_per_iteration": 2.742436408996582 }, { "auxiliary_loss_clip": 0.01104718, "auxiliary_loss_mlp": 0.0103935, "balance_loss_clip": 1.03746653, "balance_loss_mlp": 1.02479458, "epoch": 0.6860664361941982, "flos": 23951376190080.0, "grad_norm": 1.9022000774970669, "language_loss": 0.71458399, "learning_rate": 9.476324096464821e-07, "loss": 0.73602462, "num_input_tokens_seen": 246392910, "step": 11411, "time_per_iteration": 2.7334024906158447 }, { "auxiliary_loss_clip": 0.01062107, "auxiliary_loss_mlp": 0.01047011, "balance_loss_clip": 1.03538156, "balance_loss_mlp": 1.03152537, "epoch": 0.6861265594468661, "flos": 20407782827520.0, "grad_norm": 2.454164229167477, "language_loss": 0.70101523, "learning_rate": 9.473012427332654e-07, "loss": 0.7221064, "num_input_tokens_seen": 246411540, "step": 11412, "time_per_iteration": 2.830611228942871 }, { "auxiliary_loss_clip": 0.01114643, "auxiliary_loss_mlp": 0.01034015, "balance_loss_clip": 1.03966832, "balance_loss_mlp": 1.02018094, "epoch": 0.6861866826995341, "flos": 11428571111040.0, "grad_norm": 3.537487518671294, "language_loss": 0.71493286, "learning_rate": 9.469701157384919e-07, "loss": 0.73641944, "num_input_tokens_seen": 246423295, "step": 11413, "time_per_iteration": 2.5294950008392334 }, { "auxiliary_loss_clip": 0.01104826, "auxiliary_loss_mlp": 0.01034314, "balance_loss_clip": 1.03952384, "balance_loss_mlp": 1.02099848, "epoch": 0.686246805952202, "flos": 15997593939840.0, "grad_norm": 1.8251318339835605, "language_loss": 0.73947906, "learning_rate": 9.466390286747164e-07, "loss": 0.7608704, "num_input_tokens_seen": 246441045, "step": 11414, "time_per_iteration": 2.5965075492858887 }, { "auxiliary_loss_clip": 0.01090896, "auxiliary_loss_mlp": 0.01033258, "balance_loss_clip": 1.03907096, "balance_loss_mlp": 1.01883936, "epoch": 0.68630692920487, "flos": 19826712512640.0, "grad_norm": 2.434529931317787, "language_loss": 0.8682794, "learning_rate": 9.46307981554495e-07, "loss": 0.88952088, "num_input_tokens_seen": 246456905, "step": 11415, "time_per_iteration": 2.6476597785949707 }, { "auxiliary_loss_clip": 0.01106277, "auxiliary_loss_mlp": 0.01036888, "balance_loss_clip": 1.04034388, "balance_loss_mlp": 1.02316129, "epoch": 0.6863670524575379, "flos": 26286216048000.0, "grad_norm": 1.8704963128355632, "language_loss": 0.67290139, "learning_rate": 9.459769743903801e-07, "loss": 0.69433296, "num_input_tokens_seen": 246477545, "step": 11416, "time_per_iteration": 2.658177137374878 }, { "auxiliary_loss_clip": 0.01090013, "auxiliary_loss_mlp": 0.01041407, "balance_loss_clip": 1.03826094, "balance_loss_mlp": 1.02668476, "epoch": 0.686427175710206, "flos": 19173138595200.0, "grad_norm": 1.424267552511055, "language_loss": 0.76128805, "learning_rate": 9.456460071949237e-07, "loss": 0.78260225, "num_input_tokens_seen": 246496705, "step": 11417, "time_per_iteration": 2.6901679039001465 }, { "auxiliary_loss_clip": 0.01087664, "auxiliary_loss_mlp": 0.01036409, "balance_loss_clip": 1.03694177, "balance_loss_mlp": 1.02199018, "epoch": 0.6864872989628739, "flos": 18916628595840.0, "grad_norm": 2.686574302160358, "language_loss": 0.7732662, "learning_rate": 9.45315079980678e-07, "loss": 0.79450691, "num_input_tokens_seen": 246514860, "step": 11418, "time_per_iteration": 2.755699872970581 }, { "auxiliary_loss_clip": 0.01066399, "auxiliary_loss_mlp": 0.01031853, "balance_loss_clip": 1.03764701, "balance_loss_mlp": 1.01901984, "epoch": 0.6865474222155419, "flos": 25956196865280.0, "grad_norm": 1.6068340325317958, "language_loss": 0.76434135, "learning_rate": 9.449841927601887e-07, "loss": 0.78532386, "num_input_tokens_seen": 246536145, "step": 11419, "time_per_iteration": 2.865663766860962 }, { "auxiliary_loss_clip": 0.01111545, "auxiliary_loss_mlp": 0.0103699, "balance_loss_clip": 1.03836358, "balance_loss_mlp": 1.02422845, "epoch": 0.6866075454682098, "flos": 18478087447680.0, "grad_norm": 2.1745569847310624, "language_loss": 0.71438152, "learning_rate": 9.446533455460044e-07, "loss": 0.7358669, "num_input_tokens_seen": 246553265, "step": 11420, "time_per_iteration": 2.6367876529693604 }, { "auxiliary_loss_clip": 0.01071734, "auxiliary_loss_mlp": 0.01035393, "balance_loss_clip": 1.03420091, "balance_loss_mlp": 1.02145147, "epoch": 0.6866676687208778, "flos": 34239998298240.0, "grad_norm": 1.4612378577280256, "language_loss": 0.74987674, "learning_rate": 9.443225383506712e-07, "loss": 0.77094799, "num_input_tokens_seen": 246575130, "step": 11421, "time_per_iteration": 2.905451774597168 }, { "auxiliary_loss_clip": 0.01099049, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.03840101, "balance_loss_mlp": 1.01820481, "epoch": 0.6867277919735457, "flos": 21721754246400.0, "grad_norm": 1.8216780462844224, "language_loss": 0.76901162, "learning_rate": 9.439917711867338e-07, "loss": 0.79031521, "num_input_tokens_seen": 246593095, "step": 11422, "time_per_iteration": 2.7650146484375 }, { "auxiliary_loss_clip": 0.01107124, "auxiliary_loss_mlp": 0.01039503, "balance_loss_clip": 1.04101586, "balance_loss_mlp": 1.02516782, "epoch": 0.6867879152262137, "flos": 24097999507200.0, "grad_norm": 1.6784649507913934, "language_loss": 0.77082187, "learning_rate": 9.436610440667334e-07, "loss": 0.79228812, "num_input_tokens_seen": 246612165, "step": 11423, "time_per_iteration": 2.8607351779937744 }, { "auxiliary_loss_clip": 0.01082395, "auxiliary_loss_mlp": 0.01033174, "balance_loss_clip": 1.03814936, "balance_loss_mlp": 1.01943564, "epoch": 0.6868480384788818, "flos": 21615818060160.0, "grad_norm": 1.4803784362494392, "language_loss": 0.72793746, "learning_rate": 9.433303570032129e-07, "loss": 0.74909317, "num_input_tokens_seen": 246632065, "step": 11424, "time_per_iteration": 2.8126673698425293 }, { "auxiliary_loss_clip": 0.01092944, "auxiliary_loss_mlp": 0.0103129, "balance_loss_clip": 1.03935122, "balance_loss_mlp": 1.01783705, "epoch": 0.6869081617315497, "flos": 26286144220800.0, "grad_norm": 1.8921444035478678, "language_loss": 0.65257877, "learning_rate": 9.429997100087112e-07, "loss": 0.67382109, "num_input_tokens_seen": 246651245, "step": 11425, "time_per_iteration": 2.7407920360565186 }, { "auxiliary_loss_clip": 0.01073701, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.03880644, "balance_loss_mlp": 1.01543677, "epoch": 0.6869682849842177, "flos": 21105096531840.0, "grad_norm": 1.3754232219458198, "language_loss": 0.71719813, "learning_rate": 9.426691030957657e-07, "loss": 0.73821747, "num_input_tokens_seen": 246672225, "step": 11426, "time_per_iteration": 2.821906089782715 }, { "auxiliary_loss_clip": 0.01060498, "auxiliary_loss_mlp": 0.01034142, "balance_loss_clip": 1.03450513, "balance_loss_mlp": 1.02006936, "epoch": 0.6870284082368856, "flos": 17092653920640.0, "grad_norm": 2.015308300418605, "language_loss": 0.84978002, "learning_rate": 9.423385362769136e-07, "loss": 0.87072647, "num_input_tokens_seen": 246688385, "step": 11427, "time_per_iteration": 2.769426107406616 }, { "auxiliary_loss_clip": 0.01100329, "auxiliary_loss_mlp": 0.01033991, "balance_loss_clip": 1.03816628, "balance_loss_mlp": 1.02096152, "epoch": 0.6870885314895536, "flos": 27308090067840.0, "grad_norm": 1.7434629559161423, "language_loss": 0.76254469, "learning_rate": 9.420080095646909e-07, "loss": 0.78388786, "num_input_tokens_seen": 246710730, "step": 11428, "time_per_iteration": 2.708268165588379 }, { "auxiliary_loss_clip": 0.01079241, "auxiliary_loss_mlp": 0.01041381, "balance_loss_clip": 1.03692877, "balance_loss_mlp": 1.02690864, "epoch": 0.6871486547422215, "flos": 20814543417600.0, "grad_norm": 2.419770929596293, "language_loss": 0.73118293, "learning_rate": 9.4167752297163e-07, "loss": 0.75238913, "num_input_tokens_seen": 246730350, "step": 11429, "time_per_iteration": 2.7956650257110596 }, { "auxiliary_loss_clip": 0.01089951, "auxiliary_loss_mlp": 0.01029631, "balance_loss_clip": 1.03737235, "balance_loss_mlp": 1.01640451, "epoch": 0.6872087779948896, "flos": 30154118330880.0, "grad_norm": 1.9861165427275798, "language_loss": 0.83426887, "learning_rate": 9.413470765102643e-07, "loss": 0.8554647, "num_input_tokens_seen": 246751700, "step": 11430, "time_per_iteration": 4.525273084640503 }, { "auxiliary_loss_clip": 0.01105193, "auxiliary_loss_mlp": 0.01038794, "balance_loss_clip": 1.03941309, "balance_loss_mlp": 1.02549052, "epoch": 0.6872689012475575, "flos": 20704584908160.0, "grad_norm": 2.206529961181577, "language_loss": 0.7042433, "learning_rate": 9.410166701931225e-07, "loss": 0.72568321, "num_input_tokens_seen": 246769860, "step": 11431, "time_per_iteration": 2.6291654109954834 }, { "auxiliary_loss_clip": 0.01093068, "auxiliary_loss_mlp": 0.00771593, "balance_loss_clip": 1.03726888, "balance_loss_mlp": 1.0001148, "epoch": 0.6873290245002255, "flos": 25520852027520.0, "grad_norm": 1.7240375281978666, "language_loss": 0.80058414, "learning_rate": 9.406863040327355e-07, "loss": 0.81923079, "num_input_tokens_seen": 246789905, "step": 11432, "time_per_iteration": 2.7238457202911377 }, { "auxiliary_loss_clip": 0.01089362, "auxiliary_loss_mlp": 0.01029871, "balance_loss_clip": 1.03868675, "balance_loss_mlp": 1.01700783, "epoch": 0.6873891477528934, "flos": 25191479289600.0, "grad_norm": 1.5401718085798923, "language_loss": 0.67718959, "learning_rate": 9.403559780416295e-07, "loss": 0.6983819, "num_input_tokens_seen": 246808815, "step": 11433, "time_per_iteration": 4.300631999969482 }, { "auxiliary_loss_clip": 0.01108222, "auxiliary_loss_mlp": 0.01044912, "balance_loss_clip": 1.04331732, "balance_loss_mlp": 1.03123283, "epoch": 0.6874492710055614, "flos": 35152380685440.0, "grad_norm": 1.9633714481574007, "language_loss": 0.73058158, "learning_rate": 9.400256922323309e-07, "loss": 0.75211298, "num_input_tokens_seen": 246829775, "step": 11434, "time_per_iteration": 4.712211608886719 }, { "auxiliary_loss_clip": 0.0107867, "auxiliary_loss_mlp": 0.01034388, "balance_loss_clip": 1.04082966, "balance_loss_mlp": 1.02101231, "epoch": 0.6875093942582293, "flos": 17822215059840.0, "grad_norm": 1.6101742183302694, "language_loss": 0.80406773, "learning_rate": 9.396954466173657e-07, "loss": 0.82519835, "num_input_tokens_seen": 246848045, "step": 11435, "time_per_iteration": 2.644397735595703 }, { "auxiliary_loss_clip": 0.01116024, "auxiliary_loss_mlp": 0.01035166, "balance_loss_clip": 1.04015982, "balance_loss_mlp": 1.02111077, "epoch": 0.6875695175108973, "flos": 20704548994560.0, "grad_norm": 3.274458448067563, "language_loss": 0.81117046, "learning_rate": 9.393652412092538e-07, "loss": 0.83268237, "num_input_tokens_seen": 246866095, "step": 11436, "time_per_iteration": 2.600048303604126 }, { "auxiliary_loss_clip": 0.0106725, "auxiliary_loss_mlp": 0.0104019, "balance_loss_clip": 1.03428948, "balance_loss_mlp": 1.02743411, "epoch": 0.6876296407635654, "flos": 25374013228800.0, "grad_norm": 1.9842620224172498, "language_loss": 0.82207173, "learning_rate": 9.390350760205183e-07, "loss": 0.84314615, "num_input_tokens_seen": 246883975, "step": 11437, "time_per_iteration": 2.7188313007354736 }, { "auxiliary_loss_clip": 0.01097489, "auxiliary_loss_mlp": 0.01042761, "balance_loss_clip": 1.03876507, "balance_loss_mlp": 1.02794886, "epoch": 0.6876897640162333, "flos": 23222317841280.0, "grad_norm": 4.685984752688369, "language_loss": 0.78381348, "learning_rate": 9.387049510636793e-07, "loss": 0.80521595, "num_input_tokens_seen": 246901560, "step": 11438, "time_per_iteration": 2.6525228023529053 }, { "auxiliary_loss_clip": 0.01108734, "auxiliary_loss_mlp": 0.0103476, "balance_loss_clip": 1.03871489, "balance_loss_mlp": 1.02167058, "epoch": 0.6877498872689013, "flos": 27124335066240.0, "grad_norm": 1.647979155501369, "language_loss": 0.72087812, "learning_rate": 9.383748663512554e-07, "loss": 0.74231309, "num_input_tokens_seen": 246922655, "step": 11439, "time_per_iteration": 4.218140363693237 }, { "auxiliary_loss_clip": 0.01101936, "auxiliary_loss_mlp": 0.01030006, "balance_loss_clip": 1.03944337, "balance_loss_mlp": 1.01658285, "epoch": 0.6878100105215692, "flos": 11581658876160.0, "grad_norm": 1.9534001671179906, "language_loss": 0.75862855, "learning_rate": 9.380448218957623e-07, "loss": 0.779948, "num_input_tokens_seen": 246940100, "step": 11440, "time_per_iteration": 2.580472946166992 }, { "auxiliary_loss_clip": 0.01066967, "auxiliary_loss_mlp": 0.01040415, "balance_loss_clip": 1.03528094, "balance_loss_mlp": 1.02684307, "epoch": 0.6878701337742372, "flos": 20303175444480.0, "grad_norm": 7.861818924260737, "language_loss": 0.71750253, "learning_rate": 9.377148177097167e-07, "loss": 0.73857641, "num_input_tokens_seen": 246958545, "step": 11441, "time_per_iteration": 2.706754207611084 }, { "auxiliary_loss_clip": 0.01074524, "auxiliary_loss_mlp": 0.01043281, "balance_loss_clip": 1.03488159, "balance_loss_mlp": 1.02677059, "epoch": 0.6879302570269051, "flos": 13840080549120.0, "grad_norm": 1.6357806540454092, "language_loss": 0.66401327, "learning_rate": 9.373848538056317e-07, "loss": 0.68519139, "num_input_tokens_seen": 246974805, "step": 11442, "time_per_iteration": 2.7559654712677 }, { "auxiliary_loss_clip": 0.0109822, "auxiliary_loss_mlp": 0.01033105, "balance_loss_clip": 1.03951812, "balance_loss_mlp": 1.02001595, "epoch": 0.6879903802795732, "flos": 21324654414720.0, "grad_norm": 4.42004898936703, "language_loss": 0.69321597, "learning_rate": 9.370549301960189e-07, "loss": 0.71452922, "num_input_tokens_seen": 246992505, "step": 11443, "time_per_iteration": 2.6616227626800537 }, { "auxiliary_loss_clip": 0.0109609, "auxiliary_loss_mlp": 0.01035788, "balance_loss_clip": 1.03986192, "balance_loss_mlp": 1.02196562, "epoch": 0.6880505035322411, "flos": 25152049134720.0, "grad_norm": 2.6937329387099784, "language_loss": 0.76372284, "learning_rate": 9.367250468933893e-07, "loss": 0.78504163, "num_input_tokens_seen": 247013370, "step": 11444, "time_per_iteration": 2.8355183601379395 }, { "auxiliary_loss_clip": 0.01110169, "auxiliary_loss_mlp": 0.01032915, "balance_loss_clip": 1.03819597, "balance_loss_mlp": 1.02007592, "epoch": 0.6881106267849091, "flos": 23215530170880.0, "grad_norm": 2.350463307106156, "language_loss": 0.76555073, "learning_rate": 9.363952039102536e-07, "loss": 0.78698158, "num_input_tokens_seen": 247029855, "step": 11445, "time_per_iteration": 2.567321300506592 }, { "auxiliary_loss_clip": 0.01022025, "auxiliary_loss_mlp": 0.01003467, "balance_loss_clip": 1.00763083, "balance_loss_mlp": 1.00232887, "epoch": 0.688170750037577, "flos": 48484397312640.0, "grad_norm": 0.815591807379434, "language_loss": 0.58349764, "learning_rate": 9.360654012591183e-07, "loss": 0.60375261, "num_input_tokens_seen": 247085030, "step": 11446, "time_per_iteration": 3.1823232173919678 }, { "auxiliary_loss_clip": 0.01102524, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.03622508, "balance_loss_mlp": 1.01726246, "epoch": 0.688230873290245, "flos": 22783633038720.0, "grad_norm": 1.4577025181029204, "language_loss": 0.75851154, "learning_rate": 9.357356389524886e-07, "loss": 0.77984923, "num_input_tokens_seen": 247104840, "step": 11447, "time_per_iteration": 2.6292076110839844 }, { "auxiliary_loss_clip": 0.01092756, "auxiliary_loss_mlp": 0.01038788, "balance_loss_clip": 1.0371995, "balance_loss_mlp": 1.02566266, "epoch": 0.6882909965429129, "flos": 22455660931200.0, "grad_norm": 1.9523079882919305, "language_loss": 0.73051161, "learning_rate": 9.354059170028705e-07, "loss": 0.75182706, "num_input_tokens_seen": 247121905, "step": 11448, "time_per_iteration": 2.6177000999450684 }, { "auxiliary_loss_clip": 0.01100637, "auxiliary_loss_mlp": 0.01044689, "balance_loss_clip": 1.0369277, "balance_loss_mlp": 1.02910876, "epoch": 0.688351119795581, "flos": 26214143408640.0, "grad_norm": 1.5228707353550825, "language_loss": 0.74738759, "learning_rate": 9.350762354227673e-07, "loss": 0.76884079, "num_input_tokens_seen": 247142375, "step": 11449, "time_per_iteration": 2.601680040359497 }, { "auxiliary_loss_clip": 0.01111281, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.03867829, "balance_loss_mlp": 1.02147889, "epoch": 0.6884112430482489, "flos": 22565260304640.0, "grad_norm": 3.717332242852324, "language_loss": 0.69703102, "learning_rate": 9.34746594224679e-07, "loss": 0.71848536, "num_input_tokens_seen": 247161095, "step": 11450, "time_per_iteration": 2.664257764816284 }, { "auxiliary_loss_clip": 0.0107707, "auxiliary_loss_mlp": 0.01038466, "balance_loss_clip": 1.03789186, "balance_loss_mlp": 1.02427959, "epoch": 0.6884713663009169, "flos": 17341047446400.0, "grad_norm": 1.8597549906829547, "language_loss": 0.75942892, "learning_rate": 9.344169934211068e-07, "loss": 0.78058428, "num_input_tokens_seen": 247178565, "step": 11451, "time_per_iteration": 2.6398167610168457 }, { "auxiliary_loss_clip": 0.01101483, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.03904259, "balance_loss_mlp": 1.01854348, "epoch": 0.6885314895535849, "flos": 26470832976000.0, "grad_norm": 1.4408172988825247, "language_loss": 0.69557142, "learning_rate": 9.340874330245505e-07, "loss": 0.71690023, "num_input_tokens_seen": 247202345, "step": 11452, "time_per_iteration": 2.6441712379455566 }, { "auxiliary_loss_clip": 0.01112297, "auxiliary_loss_mlp": 0.01036775, "balance_loss_clip": 1.03905725, "balance_loss_mlp": 1.02143824, "epoch": 0.6885916128062528, "flos": 20521548178560.0, "grad_norm": 1.603751678545201, "language_loss": 0.71996975, "learning_rate": 9.337579130475042e-07, "loss": 0.74146044, "num_input_tokens_seen": 247219240, "step": 11453, "time_per_iteration": 2.564039707183838 }, { "auxiliary_loss_clip": 0.010232, "auxiliary_loss_mlp": 0.0075158, "balance_loss_clip": 1.00928593, "balance_loss_mlp": 0.99959499, "epoch": 0.6886517360589208, "flos": 70715795679360.0, "grad_norm": 0.7798992537715281, "language_loss": 0.50685745, "learning_rate": 9.334284335024644e-07, "loss": 0.52460527, "num_input_tokens_seen": 247272010, "step": 11454, "time_per_iteration": 3.016122341156006 }, { "auxiliary_loss_clip": 0.01097098, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.03854132, "balance_loss_mlp": 1.02329731, "epoch": 0.6887118593115887, "flos": 17893533513600.0, "grad_norm": 2.526020135416449, "language_loss": 0.75680363, "learning_rate": 9.330989944019263e-07, "loss": 0.77813178, "num_input_tokens_seen": 247290630, "step": 11455, "time_per_iteration": 2.7730109691619873 }, { "auxiliary_loss_clip": 0.01092116, "auxiliary_loss_mlp": 0.0103676, "balance_loss_clip": 1.03623128, "balance_loss_mlp": 1.02249074, "epoch": 0.6887719825642568, "flos": 17453017117440.0, "grad_norm": 2.7328430061690154, "language_loss": 0.7254653, "learning_rate": 9.327695957583803e-07, "loss": 0.74675405, "num_input_tokens_seen": 247304800, "step": 11456, "time_per_iteration": 2.7660651206970215 }, { "auxiliary_loss_clip": 0.0108935, "auxiliary_loss_mlp": 0.01035233, "balance_loss_clip": 1.03873277, "balance_loss_mlp": 1.02247739, "epoch": 0.6888321058169247, "flos": 23070199743360.0, "grad_norm": 2.090937721500204, "language_loss": 0.81322861, "learning_rate": 9.32440237584319e-07, "loss": 0.83447444, "num_input_tokens_seen": 247323450, "step": 11457, "time_per_iteration": 2.691455841064453 }, { "auxiliary_loss_clip": 0.01105328, "auxiliary_loss_mlp": 0.00771348, "balance_loss_clip": 1.04052448, "balance_loss_mlp": 1.00017715, "epoch": 0.6888922290695927, "flos": 23368833417600.0, "grad_norm": 1.548184255846192, "language_loss": 0.76552927, "learning_rate": 9.321109198922301e-07, "loss": 0.78429604, "num_input_tokens_seen": 247343845, "step": 11458, "time_per_iteration": 2.6362695693969727 }, { "auxiliary_loss_clip": 0.01113281, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.03937232, "balance_loss_mlp": 1.02138472, "epoch": 0.6889523523222606, "flos": 17631636474240.0, "grad_norm": 2.7369612879197986, "language_loss": 0.67654693, "learning_rate": 9.31781642694603e-07, "loss": 0.69802415, "num_input_tokens_seen": 247356650, "step": 11459, "time_per_iteration": 2.6157007217407227 }, { "auxiliary_loss_clip": 0.01064164, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 1.03582239, "balance_loss_mlp": 1.01958048, "epoch": 0.6890124755749286, "flos": 25228144097280.0, "grad_norm": 1.4844709645177188, "language_loss": 0.68446231, "learning_rate": 9.314524060039221e-07, "loss": 0.70542651, "num_input_tokens_seen": 247377340, "step": 11460, "time_per_iteration": 2.7714388370513916 }, { "auxiliary_loss_clip": 0.01087273, "auxiliary_loss_mlp": 0.01033379, "balance_loss_clip": 1.03934288, "balance_loss_mlp": 1.01844215, "epoch": 0.6890725988275965, "flos": 20230240878720.0, "grad_norm": 1.8579339278918177, "language_loss": 0.77017105, "learning_rate": 9.311232098326731e-07, "loss": 0.7913776, "num_input_tokens_seen": 247395805, "step": 11461, "time_per_iteration": 2.7195050716400146 }, { "auxiliary_loss_clip": 0.01091784, "auxiliary_loss_mlp": 0.01037341, "balance_loss_clip": 1.03789628, "balance_loss_mlp": 1.02331018, "epoch": 0.6891327220802645, "flos": 14535311264640.0, "grad_norm": 1.7919419635412812, "language_loss": 0.6962589, "learning_rate": 9.307940541933401e-07, "loss": 0.71755016, "num_input_tokens_seen": 247413165, "step": 11462, "time_per_iteration": 2.695122718811035 }, { "auxiliary_loss_clip": 0.01105224, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.04118133, "balance_loss_mlp": 1.01500297, "epoch": 0.6891928453329325, "flos": 21139139646720.0, "grad_norm": 1.4465330715019271, "language_loss": 0.8737253, "learning_rate": 9.304649390984034e-07, "loss": 0.89506674, "num_input_tokens_seen": 247433140, "step": 11463, "time_per_iteration": 2.746290922164917 }, { "auxiliary_loss_clip": 0.01064548, "auxiliary_loss_mlp": 0.01030124, "balance_loss_clip": 1.04010975, "balance_loss_mlp": 1.01829851, "epoch": 0.6892529685856005, "flos": 17858520731520.0, "grad_norm": 1.5297822834727555, "language_loss": 0.68426907, "learning_rate": 9.301358645603428e-07, "loss": 0.70521581, "num_input_tokens_seen": 247451265, "step": 11464, "time_per_iteration": 2.8325612545013428 }, { "auxiliary_loss_clip": 0.01102764, "auxiliary_loss_mlp": 0.01040883, "balance_loss_clip": 1.03917408, "balance_loss_mlp": 1.02711463, "epoch": 0.6893130918382685, "flos": 29934811843200.0, "grad_norm": 2.288958108481903, "language_loss": 0.65110016, "learning_rate": 9.298068305916373e-07, "loss": 0.67253661, "num_input_tokens_seen": 247471645, "step": 11465, "time_per_iteration": 2.815046787261963 }, { "auxiliary_loss_clip": 0.01104457, "auxiliary_loss_mlp": 0.01038209, "balance_loss_clip": 1.03854775, "balance_loss_mlp": 1.02463746, "epoch": 0.6893732150909364, "flos": 24388516707840.0, "grad_norm": 1.3495813204241554, "language_loss": 0.72669965, "learning_rate": 9.294778372047649e-07, "loss": 0.74812633, "num_input_tokens_seen": 247491170, "step": 11466, "time_per_iteration": 2.671194314956665 }, { "auxiliary_loss_clip": 0.01114766, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.04005003, "balance_loss_mlp": 1.02122736, "epoch": 0.6894333383436044, "flos": 16982874979200.0, "grad_norm": 1.6856701084963044, "language_loss": 0.71847236, "learning_rate": 9.291488844121995e-07, "loss": 0.73996592, "num_input_tokens_seen": 247509005, "step": 11467, "time_per_iteration": 2.759052276611328 }, { "auxiliary_loss_clip": 0.01096068, "auxiliary_loss_mlp": 0.01036799, "balance_loss_clip": 1.03972626, "balance_loss_mlp": 1.02171886, "epoch": 0.6894934615962723, "flos": 18985540838400.0, "grad_norm": 1.978085572567592, "language_loss": 0.80877995, "learning_rate": 9.288199722264156e-07, "loss": 0.83010864, "num_input_tokens_seen": 247527050, "step": 11468, "time_per_iteration": 2.8261470794677734 }, { "auxiliary_loss_clip": 0.01116061, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.04050148, "balance_loss_mlp": 1.02103066, "epoch": 0.6895535848489404, "flos": 34531664734080.0, "grad_norm": 1.489529294726542, "language_loss": 0.66164148, "learning_rate": 9.284911006598875e-07, "loss": 0.68314791, "num_input_tokens_seen": 247547765, "step": 11469, "time_per_iteration": 5.082685232162476 }, { "auxiliary_loss_clip": 0.01023211, "auxiliary_loss_mlp": 0.01004328, "balance_loss_clip": 1.00959301, "balance_loss_mlp": 1.00309992, "epoch": 0.6896137081016083, "flos": 50075852273280.0, "grad_norm": 0.7983802511717295, "language_loss": 0.55211931, "learning_rate": 9.281622697250824e-07, "loss": 0.57239467, "num_input_tokens_seen": 247603515, "step": 11470, "time_per_iteration": 3.123518228530884 }, { "auxiliary_loss_clip": 0.01098666, "auxiliary_loss_mlp": 0.01034034, "balance_loss_clip": 1.03866851, "balance_loss_mlp": 1.02299523, "epoch": 0.6896738313542763, "flos": 19938215306880.0, "grad_norm": 1.7748421149249738, "language_loss": 0.78111279, "learning_rate": 9.278334794344715e-07, "loss": 0.80243975, "num_input_tokens_seen": 247622110, "step": 11471, "time_per_iteration": 2.6707584857940674 }, { "auxiliary_loss_clip": 0.01088217, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.03501463, "balance_loss_mlp": 1.02104771, "epoch": 0.6897339546069442, "flos": 21725489260800.0, "grad_norm": 1.724757958239778, "language_loss": 0.78451025, "learning_rate": 9.275047298005232e-07, "loss": 0.80573976, "num_input_tokens_seen": 247641905, "step": 11472, "time_per_iteration": 4.256728887557983 }, { "auxiliary_loss_clip": 0.01081643, "auxiliary_loss_mlp": 0.01032054, "balance_loss_clip": 1.03641033, "balance_loss_mlp": 1.0195905, "epoch": 0.6897940778596122, "flos": 19826497031040.0, "grad_norm": 1.5995976978854818, "language_loss": 0.76272285, "learning_rate": 9.271760208357024e-07, "loss": 0.78385979, "num_input_tokens_seen": 247660945, "step": 11473, "time_per_iteration": 4.321485757827759 }, { "auxiliary_loss_clip": 0.01070517, "auxiliary_loss_mlp": 0.0105009, "balance_loss_clip": 1.03430462, "balance_loss_mlp": 1.03352571, "epoch": 0.6898542011122801, "flos": 17310056987520.0, "grad_norm": 1.7861232918293928, "language_loss": 0.75359839, "learning_rate": 9.268473525524751e-07, "loss": 0.77480447, "num_input_tokens_seen": 247678395, "step": 11474, "time_per_iteration": 2.788238525390625 }, { "auxiliary_loss_clip": 0.01068006, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.04364872, "balance_loss_mlp": 1.01921058, "epoch": 0.6899143243649482, "flos": 24754051463040.0, "grad_norm": 1.4663281053614279, "language_loss": 0.74502885, "learning_rate": 9.26518724963303e-07, "loss": 0.76603287, "num_input_tokens_seen": 247698380, "step": 11475, "time_per_iteration": 2.878188371658325 }, { "auxiliary_loss_clip": 0.01084179, "auxiliary_loss_mlp": 0.01035391, "balance_loss_clip": 1.03779638, "balance_loss_mlp": 1.02154493, "epoch": 0.6899744476176161, "flos": 17234536642560.0, "grad_norm": 1.9957028062650322, "language_loss": 0.88603026, "learning_rate": 9.261901380806491e-07, "loss": 0.90722603, "num_input_tokens_seen": 247716370, "step": 11476, "time_per_iteration": 2.7922370433807373 }, { "auxiliary_loss_clip": 0.01112551, "auxiliary_loss_mlp": 0.01037017, "balance_loss_clip": 1.03934443, "balance_loss_mlp": 1.02450645, "epoch": 0.6900345708702841, "flos": 25410678036480.0, "grad_norm": 1.5288697914631357, "language_loss": 0.70166922, "learning_rate": 9.258615919169724e-07, "loss": 0.72316492, "num_input_tokens_seen": 247737335, "step": 11477, "time_per_iteration": 2.780515193939209 }, { "auxiliary_loss_clip": 0.01107191, "auxiliary_loss_mlp": 0.01045383, "balance_loss_clip": 1.03964376, "balance_loss_mlp": 1.03081584, "epoch": 0.6900946941229521, "flos": 23434190213760.0, "grad_norm": 2.1987152086234723, "language_loss": 0.68323863, "learning_rate": 9.255330864847313e-07, "loss": 0.70476437, "num_input_tokens_seen": 247756680, "step": 11478, "time_per_iteration": 4.340089559555054 }, { "auxiliary_loss_clip": 0.01104632, "auxiliary_loss_mlp": 0.0103447, "balance_loss_clip": 1.04020643, "balance_loss_mlp": 1.02203012, "epoch": 0.69015481737562, "flos": 17820096157440.0, "grad_norm": 1.918426525633328, "language_loss": 0.76238775, "learning_rate": 9.252046217963843e-07, "loss": 0.78377873, "num_input_tokens_seen": 247774265, "step": 11479, "time_per_iteration": 2.7662193775177 }, { "auxiliary_loss_clip": 0.01104072, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.03842354, "balance_loss_mlp": 1.01716816, "epoch": 0.690214940628288, "flos": 17456500736640.0, "grad_norm": 1.8031624410020608, "language_loss": 0.78769386, "learning_rate": 9.248761978643856e-07, "loss": 0.8090421, "num_input_tokens_seen": 247792395, "step": 11480, "time_per_iteration": 2.6917519569396973 }, { "auxiliary_loss_clip": 0.01071212, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.03474808, "balance_loss_mlp": 1.01971793, "epoch": 0.6902750638809559, "flos": 29566691308800.0, "grad_norm": 2.1117215547556922, "language_loss": 0.75273913, "learning_rate": 9.245478147011885e-07, "loss": 0.77378535, "num_input_tokens_seen": 247811985, "step": 11481, "time_per_iteration": 2.914005994796753 }, { "auxiliary_loss_clip": 0.01078232, "auxiliary_loss_mlp": 0.01031748, "balance_loss_clip": 1.03950965, "balance_loss_mlp": 1.01795578, "epoch": 0.690335187133624, "flos": 25557121785600.0, "grad_norm": 1.8140875397528662, "language_loss": 0.69146681, "learning_rate": 9.24219472319246e-07, "loss": 0.71256661, "num_input_tokens_seen": 247831880, "step": 11482, "time_per_iteration": 2.888972759246826 }, { "auxiliary_loss_clip": 0.01114892, "auxiliary_loss_mlp": 0.01033455, "balance_loss_clip": 1.04087675, "balance_loss_mlp": 1.02031827, "epoch": 0.6903953103862919, "flos": 22488447070080.0, "grad_norm": 1.4863828280794367, "language_loss": 0.82752049, "learning_rate": 9.238911707310096e-07, "loss": 0.84900403, "num_input_tokens_seen": 247851170, "step": 11483, "time_per_iteration": 2.6664347648620605 }, { "auxiliary_loss_clip": 0.01116625, "auxiliary_loss_mlp": 0.01030991, "balance_loss_clip": 1.0412333, "balance_loss_mlp": 1.01880169, "epoch": 0.6904554336389599, "flos": 26100521712000.0, "grad_norm": 1.9326210731008662, "language_loss": 0.65550387, "learning_rate": 9.235629099489273e-07, "loss": 0.67697996, "num_input_tokens_seen": 247868950, "step": 11484, "time_per_iteration": 2.629709005355835 }, { "auxiliary_loss_clip": 0.01079245, "auxiliary_loss_mlp": 0.01044412, "balance_loss_clip": 1.03618813, "balance_loss_mlp": 1.03035724, "epoch": 0.6905155568916278, "flos": 31171754545920.0, "grad_norm": 1.4648771757692296, "language_loss": 0.7359699, "learning_rate": 9.232346899854479e-07, "loss": 0.75720656, "num_input_tokens_seen": 247889805, "step": 11485, "time_per_iteration": 2.780137300491333 }, { "auxiliary_loss_clip": 0.01100883, "auxiliary_loss_mlp": 0.00771626, "balance_loss_clip": 1.04121161, "balance_loss_mlp": 1.00017738, "epoch": 0.6905756801442958, "flos": 17639681120640.0, "grad_norm": 1.7496856130724467, "language_loss": 0.84967637, "learning_rate": 9.22906510853017e-07, "loss": 0.86840141, "num_input_tokens_seen": 247908585, "step": 11486, "time_per_iteration": 2.6427667140960693 }, { "auxiliary_loss_clip": 0.01053468, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.03498769, "balance_loss_mlp": 1.02395463, "epoch": 0.6906358033969637, "flos": 22343691260160.0, "grad_norm": 1.463253599304318, "language_loss": 0.72599518, "learning_rate": 9.225783725640786e-07, "loss": 0.74690592, "num_input_tokens_seen": 247928480, "step": 11487, "time_per_iteration": 2.8396995067596436 }, { "auxiliary_loss_clip": 0.01016718, "auxiliary_loss_mlp": 0.0100075, "balance_loss_clip": 1.01205254, "balance_loss_mlp": 0.99957544, "epoch": 0.6906959266496318, "flos": 69747789081600.0, "grad_norm": 0.9486957802927981, "language_loss": 0.66587651, "learning_rate": 9.222502751310759e-07, "loss": 0.68605119, "num_input_tokens_seen": 247988855, "step": 11488, "time_per_iteration": 3.256028175354004 }, { "auxiliary_loss_clip": 0.01090242, "auxiliary_loss_mlp": 0.01035444, "balance_loss_clip": 1.039554, "balance_loss_mlp": 1.02100825, "epoch": 0.6907560499022997, "flos": 21434253788160.0, "grad_norm": 1.736123733035723, "language_loss": 0.74721605, "learning_rate": 9.219222185664519e-07, "loss": 0.76847291, "num_input_tokens_seen": 248007685, "step": 11489, "time_per_iteration": 2.6813058853149414 }, { "auxiliary_loss_clip": 0.01102738, "auxiliary_loss_mlp": 0.01041104, "balance_loss_clip": 1.03759074, "balance_loss_mlp": 1.0267272, "epoch": 0.6908161731549677, "flos": 14392207480320.0, "grad_norm": 2.0464811594474006, "language_loss": 0.62228811, "learning_rate": 9.215942028826445e-07, "loss": 0.64372647, "num_input_tokens_seen": 248025145, "step": 11490, "time_per_iteration": 2.7024333477020264 }, { "auxiliary_loss_clip": 0.01090002, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.03779197, "balance_loss_mlp": 1.01960242, "epoch": 0.6908762964076357, "flos": 20010970304640.0, "grad_norm": 1.709286193075208, "language_loss": 0.72809607, "learning_rate": 9.212662280920937e-07, "loss": 0.74932313, "num_input_tokens_seen": 248043750, "step": 11491, "time_per_iteration": 2.746288537979126 }, { "auxiliary_loss_clip": 0.01089559, "auxiliary_loss_mlp": 0.00771788, "balance_loss_clip": 1.03801966, "balance_loss_mlp": 1.00016296, "epoch": 0.6909364196603036, "flos": 28769079853440.0, "grad_norm": 1.39649539646883, "language_loss": 0.70297456, "learning_rate": 9.20938294207235e-07, "loss": 0.72158802, "num_input_tokens_seen": 248065765, "step": 11492, "time_per_iteration": 2.7897520065307617 }, { "auxiliary_loss_clip": 0.010831, "auxiliary_loss_mlp": 0.01033716, "balance_loss_clip": 1.04620051, "balance_loss_mlp": 1.0190773, "epoch": 0.6909965429129716, "flos": 22528128620160.0, "grad_norm": 1.7344123027630052, "language_loss": 0.74773538, "learning_rate": 9.206104012405049e-07, "loss": 0.76890349, "num_input_tokens_seen": 248083810, "step": 11493, "time_per_iteration": 2.9563519954681396 }, { "auxiliary_loss_clip": 0.01114123, "auxiliary_loss_mlp": 0.01030007, "balance_loss_clip": 1.04090369, "balance_loss_mlp": 1.01648879, "epoch": 0.6910566661656395, "flos": 18405942981120.0, "grad_norm": 1.7115108202132974, "language_loss": 0.74647975, "learning_rate": 9.20282549204336e-07, "loss": 0.76792109, "num_input_tokens_seen": 248103185, "step": 11494, "time_per_iteration": 2.606947422027588 }, { "auxiliary_loss_clip": 0.01086005, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.03727913, "balance_loss_mlp": 1.01682854, "epoch": 0.6911167894183076, "flos": 30773972355840.0, "grad_norm": 1.4748735208604244, "language_loss": 0.68749166, "learning_rate": 9.19954738111161e-07, "loss": 0.70865232, "num_input_tokens_seen": 248125665, "step": 11495, "time_per_iteration": 2.768889904022217 }, { "auxiliary_loss_clip": 0.01089976, "auxiliary_loss_mlp": 0.01029957, "balance_loss_clip": 1.03646207, "balance_loss_mlp": 1.01640916, "epoch": 0.6911769126709755, "flos": 13735724561280.0, "grad_norm": 1.8085674885564547, "language_loss": 0.74088383, "learning_rate": 9.196269679734119e-07, "loss": 0.76208317, "num_input_tokens_seen": 248142545, "step": 11496, "time_per_iteration": 2.6374707221984863 }, { "auxiliary_loss_clip": 0.01075882, "auxiliary_loss_mlp": 0.01033532, "balance_loss_clip": 1.03445745, "balance_loss_mlp": 1.02084804, "epoch": 0.6912370359236435, "flos": 17566854295680.0, "grad_norm": 2.1445438478171, "language_loss": 0.80236906, "learning_rate": 9.19299238803515e-07, "loss": 0.82346314, "num_input_tokens_seen": 248160225, "step": 11497, "time_per_iteration": 2.6873879432678223 }, { "auxiliary_loss_clip": 0.01074496, "auxiliary_loss_mlp": 0.01037916, "balance_loss_clip": 1.03591168, "balance_loss_mlp": 1.02401567, "epoch": 0.6912971591763114, "flos": 22090772620800.0, "grad_norm": 1.4245028324847169, "language_loss": 0.8060286, "learning_rate": 9.189715506138993e-07, "loss": 0.82715273, "num_input_tokens_seen": 248180430, "step": 11498, "time_per_iteration": 2.7175493240356445 }, { "auxiliary_loss_clip": 0.01099715, "auxiliary_loss_mlp": 0.01033226, "balance_loss_clip": 1.03920996, "balance_loss_mlp": 1.01955223, "epoch": 0.6913572824289794, "flos": 29971476650880.0, "grad_norm": 1.5050738051892152, "language_loss": 0.86088848, "learning_rate": 9.186439034169915e-07, "loss": 0.88221788, "num_input_tokens_seen": 248202365, "step": 11499, "time_per_iteration": 2.7579431533813477 }, { "auxiliary_loss_clip": 0.01080625, "auxiliary_loss_mlp": 0.00771124, "balance_loss_clip": 1.040236, "balance_loss_mlp": 1.00014019, "epoch": 0.6914174056816473, "flos": 20448936835200.0, "grad_norm": 1.7961404954828535, "language_loss": 0.75816536, "learning_rate": 9.183162972252145e-07, "loss": 0.77668285, "num_input_tokens_seen": 248221750, "step": 11500, "time_per_iteration": 2.658766031265259 }, { "auxiliary_loss_clip": 0.01058615, "auxiliary_loss_mlp": 0.01050016, "balance_loss_clip": 1.03728688, "balance_loss_mlp": 1.03423262, "epoch": 0.6914775289343154, "flos": 21282530739840.0, "grad_norm": 1.8214656574654693, "language_loss": 0.77514184, "learning_rate": 9.179887320509921e-07, "loss": 0.79622817, "num_input_tokens_seen": 248239535, "step": 11501, "time_per_iteration": 2.751330614089966 }, { "auxiliary_loss_clip": 0.01099448, "auxiliary_loss_mlp": 0.01040566, "balance_loss_clip": 1.03807986, "balance_loss_mlp": 1.02625489, "epoch": 0.6915376521869833, "flos": 23878118401920.0, "grad_norm": 1.7335303734743124, "language_loss": 0.73580784, "learning_rate": 9.176612079067458e-07, "loss": 0.75720799, "num_input_tokens_seen": 248259055, "step": 11502, "time_per_iteration": 2.8098790645599365 }, { "auxiliary_loss_clip": 0.01041175, "auxiliary_loss_mlp": 0.01044606, "balance_loss_clip": 1.034199, "balance_loss_mlp": 1.02875125, "epoch": 0.6915977754396513, "flos": 11510268595200.0, "grad_norm": 2.5749254426128743, "language_loss": 0.73368824, "learning_rate": 9.173337248048953e-07, "loss": 0.75454605, "num_input_tokens_seen": 248276765, "step": 11503, "time_per_iteration": 2.747083902359009 }, { "auxiliary_loss_clip": 0.01098455, "auxiliary_loss_mlp": 0.01041122, "balance_loss_clip": 1.03777838, "balance_loss_mlp": 1.02701986, "epoch": 0.6916578986923193, "flos": 22601278667520.0, "grad_norm": 1.7356607503629284, "language_loss": 0.77010226, "learning_rate": 9.170062827578575e-07, "loss": 0.79149806, "num_input_tokens_seen": 248295310, "step": 11504, "time_per_iteration": 2.706209182739258 }, { "auxiliary_loss_clip": 0.01069336, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.0342164, "balance_loss_mlp": 1.02457845, "epoch": 0.6917180219449872, "flos": 23477355383040.0, "grad_norm": 1.7715532639399074, "language_loss": 0.73565066, "learning_rate": 9.166788817780499e-07, "loss": 0.75673938, "num_input_tokens_seen": 248315230, "step": 11505, "time_per_iteration": 2.725203514099121 }, { "auxiliary_loss_clip": 0.01054739, "auxiliary_loss_mlp": 0.00772936, "balance_loss_clip": 1.03434849, "balance_loss_mlp": 1.00009656, "epoch": 0.6917781451976552, "flos": 23732536579200.0, "grad_norm": 1.8090122394504842, "language_loss": 0.88027036, "learning_rate": 9.163515218778886e-07, "loss": 0.89854711, "num_input_tokens_seen": 248332980, "step": 11506, "time_per_iteration": 2.796102285385132 }, { "auxiliary_loss_clip": 0.01086001, "auxiliary_loss_mlp": 0.01030005, "balance_loss_clip": 1.03935504, "balance_loss_mlp": 1.01724994, "epoch": 0.6918382684503231, "flos": 31466760946560.0, "grad_norm": 2.045878343291588, "language_loss": 0.7011205, "learning_rate": 9.160242030697856e-07, "loss": 0.72228056, "num_input_tokens_seen": 248352865, "step": 11507, "time_per_iteration": 2.755439043045044 }, { "auxiliary_loss_clip": 0.01086914, "auxiliary_loss_mlp": 0.01036889, "balance_loss_clip": 1.03763783, "balance_loss_mlp": 1.02344775, "epoch": 0.6918983917029912, "flos": 21650471706240.0, "grad_norm": 1.853503068489786, "language_loss": 0.76915097, "learning_rate": 9.156969253661538e-07, "loss": 0.79038906, "num_input_tokens_seen": 248371125, "step": 11508, "time_per_iteration": 4.521030426025391 }, { "auxiliary_loss_clip": 0.0109627, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.03752184, "balance_loss_mlp": 1.02148128, "epoch": 0.6919585149556591, "flos": 25550082720000.0, "grad_norm": 1.8821969374944694, "language_loss": 0.75171518, "learning_rate": 9.153696887794027e-07, "loss": 0.77302009, "num_input_tokens_seen": 248390455, "step": 11509, "time_per_iteration": 2.69903826713562 }, { "auxiliary_loss_clip": 0.01062313, "auxiliary_loss_mlp": 0.01036661, "balance_loss_clip": 1.03829181, "balance_loss_mlp": 1.02342892, "epoch": 0.6920186382083271, "flos": 23659781581440.0, "grad_norm": 1.9874772496775723, "language_loss": 0.64212132, "learning_rate": 9.150424933219425e-07, "loss": 0.66311103, "num_input_tokens_seen": 248411305, "step": 11510, "time_per_iteration": 2.848520278930664 }, { "auxiliary_loss_clip": 0.0108123, "auxiliary_loss_mlp": 0.01034683, "balance_loss_clip": 1.03798079, "balance_loss_mlp": 1.02002048, "epoch": 0.692078761460995, "flos": 19061959023360.0, "grad_norm": 1.835249008120565, "language_loss": 0.75375962, "learning_rate": 9.147153390061788e-07, "loss": 0.77491868, "num_input_tokens_seen": 248430190, "step": 11511, "time_per_iteration": 4.174523115158081 }, { "auxiliary_loss_clip": 0.01084843, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.04214227, "balance_loss_mlp": 1.01793194, "epoch": 0.692138884713663, "flos": 29023291382400.0, "grad_norm": 1.7047662296255404, "language_loss": 0.62659085, "learning_rate": 9.143882258445184e-07, "loss": 0.64773595, "num_input_tokens_seen": 248450830, "step": 11512, "time_per_iteration": 4.534400224685669 }, { "auxiliary_loss_clip": 0.01080139, "auxiliary_loss_mlp": 0.01036917, "balance_loss_clip": 1.03771234, "balance_loss_mlp": 1.02366054, "epoch": 0.6921990079663309, "flos": 14757849976320.0, "grad_norm": 1.7738066365158425, "language_loss": 0.82885146, "learning_rate": 9.140611538493666e-07, "loss": 0.85002202, "num_input_tokens_seen": 248468585, "step": 11513, "time_per_iteration": 2.744152545928955 }, { "auxiliary_loss_clip": 0.01050332, "auxiliary_loss_mlp": 0.0103433, "balance_loss_clip": 1.03769469, "balance_loss_mlp": 1.02236128, "epoch": 0.692259131218999, "flos": 23841848643840.0, "grad_norm": 1.4085469853389758, "language_loss": 0.78494793, "learning_rate": 9.137341230331233e-07, "loss": 0.8057946, "num_input_tokens_seen": 248490535, "step": 11514, "time_per_iteration": 2.7933335304260254 }, { "auxiliary_loss_clip": 0.0106844, "auxiliary_loss_mlp": 0.01038567, "balance_loss_clip": 1.03552842, "balance_loss_mlp": 1.0250721, "epoch": 0.6923192544716669, "flos": 19135073157120.0, "grad_norm": 2.196765924687951, "language_loss": 0.75278533, "learning_rate": 9.134071334081907e-07, "loss": 0.77385533, "num_input_tokens_seen": 248508575, "step": 11515, "time_per_iteration": 2.7745299339294434 }, { "auxiliary_loss_clip": 0.01070009, "auxiliary_loss_mlp": 0.01033102, "balance_loss_clip": 1.03992462, "balance_loss_mlp": 1.02032304, "epoch": 0.6923793777243349, "flos": 28074639237120.0, "grad_norm": 1.799388111244089, "language_loss": 0.53198493, "learning_rate": 9.130801849869694e-07, "loss": 0.55301601, "num_input_tokens_seen": 248527025, "step": 11516, "time_per_iteration": 2.775190830230713 }, { "auxiliary_loss_clip": 0.01097274, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.03787732, "balance_loss_mlp": 1.02451098, "epoch": 0.6924395009770029, "flos": 16581250033920.0, "grad_norm": 1.6962423082360507, "language_loss": 0.72982675, "learning_rate": 9.127532777818557e-07, "loss": 0.75117755, "num_input_tokens_seen": 248544275, "step": 11517, "time_per_iteration": 2.598116397857666 }, { "auxiliary_loss_clip": 0.0111384, "auxiliary_loss_mlp": 0.01037211, "balance_loss_clip": 1.03925538, "balance_loss_mlp": 1.02354932, "epoch": 0.6924996242296708, "flos": 16655297921280.0, "grad_norm": 1.6338598791065129, "language_loss": 0.76462078, "learning_rate": 9.124264118052465e-07, "loss": 0.78613126, "num_input_tokens_seen": 248561870, "step": 11518, "time_per_iteration": 4.141700983047485 }, { "auxiliary_loss_clip": 0.0110627, "auxiliary_loss_mlp": 0.01040853, "balance_loss_clip": 1.04075885, "balance_loss_mlp": 1.02653027, "epoch": 0.6925597474823388, "flos": 34754167532160.0, "grad_norm": 1.3592469216685072, "language_loss": 0.64467025, "learning_rate": 9.120995870695376e-07, "loss": 0.66614151, "num_input_tokens_seen": 248588190, "step": 11519, "time_per_iteration": 2.8347549438476562 }, { "auxiliary_loss_clip": 0.01080573, "auxiliary_loss_mlp": 0.0103987, "balance_loss_clip": 1.03696394, "balance_loss_mlp": 1.02670944, "epoch": 0.6926198707350067, "flos": 21871717528320.0, "grad_norm": 2.1051263263306805, "language_loss": 0.62538528, "learning_rate": 9.117728035871212e-07, "loss": 0.64658964, "num_input_tokens_seen": 248606460, "step": 11520, "time_per_iteration": 2.7294435501098633 }, { "auxiliary_loss_clip": 0.01075792, "auxiliary_loss_mlp": 0.01037449, "balance_loss_clip": 1.03631949, "balance_loss_mlp": 1.0228461, "epoch": 0.6926799939876748, "flos": 13006271162880.0, "grad_norm": 2.2378150496595013, "language_loss": 0.77924216, "learning_rate": 9.114460613703887e-07, "loss": 0.80037463, "num_input_tokens_seen": 248623715, "step": 11521, "time_per_iteration": 2.717240571975708 }, { "auxiliary_loss_clip": 0.01100684, "auxiliary_loss_mlp": 0.0103794, "balance_loss_clip": 1.03691578, "balance_loss_mlp": 1.02260375, "epoch": 0.6927401172403427, "flos": 16761234107520.0, "grad_norm": 2.442345109030128, "language_loss": 0.81992316, "learning_rate": 9.111193604317304e-07, "loss": 0.84130937, "num_input_tokens_seen": 248640575, "step": 11522, "time_per_iteration": 2.6045098304748535 }, { "auxiliary_loss_clip": 0.01100284, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.04276228, "balance_loss_mlp": 1.02152598, "epoch": 0.6928002404930107, "flos": 25705648523520.0, "grad_norm": 1.8984649858129847, "language_loss": 0.76575756, "learning_rate": 9.107927007835361e-07, "loss": 0.78710121, "num_input_tokens_seen": 248663535, "step": 11523, "time_per_iteration": 2.6705586910247803 }, { "auxiliary_loss_clip": 0.01082858, "auxiliary_loss_mlp": 0.01035266, "balance_loss_clip": 1.03894114, "balance_loss_mlp": 1.02276123, "epoch": 0.6928603637456786, "flos": 18588261438720.0, "grad_norm": 2.087470687803226, "language_loss": 0.68297094, "learning_rate": 9.104660824381915e-07, "loss": 0.70415223, "num_input_tokens_seen": 248681125, "step": 11524, "time_per_iteration": 2.6786375045776367 }, { "auxiliary_loss_clip": 0.0108268, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.03927469, "balance_loss_mlp": 1.01960993, "epoch": 0.6929204869983466, "flos": 22200874784640.0, "grad_norm": 1.782896915319788, "language_loss": 0.64250147, "learning_rate": 9.101395054080815e-07, "loss": 0.66366494, "num_input_tokens_seen": 248700555, "step": 11525, "time_per_iteration": 2.709665536880493 }, { "auxiliary_loss_clip": 0.01076674, "auxiliary_loss_mlp": 0.01040353, "balance_loss_clip": 1.04186177, "balance_loss_mlp": 1.02660835, "epoch": 0.6929806102510145, "flos": 17894754576000.0, "grad_norm": 2.1892792904192366, "language_loss": 0.70518214, "learning_rate": 9.098129697055907e-07, "loss": 0.72635239, "num_input_tokens_seen": 248716095, "step": 11526, "time_per_iteration": 2.7389345169067383 }, { "auxiliary_loss_clip": 0.01089418, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.03708529, "balance_loss_mlp": 1.02210879, "epoch": 0.6930407335036826, "flos": 19755178577280.0, "grad_norm": 2.017152131296503, "language_loss": 0.76394051, "learning_rate": 9.094864753431022e-07, "loss": 0.78518212, "num_input_tokens_seen": 248735330, "step": 11527, "time_per_iteration": 2.675387382507324 }, { "auxiliary_loss_clip": 0.01084801, "auxiliary_loss_mlp": 0.01040813, "balance_loss_clip": 1.03604603, "balance_loss_mlp": 1.02701497, "epoch": 0.6931008567563505, "flos": 21544248211200.0, "grad_norm": 1.6619978055585172, "language_loss": 0.7924946, "learning_rate": 9.091600223329952e-07, "loss": 0.81375074, "num_input_tokens_seen": 248754530, "step": 11528, "time_per_iteration": 2.708937883377075 }, { "auxiliary_loss_clip": 0.01097731, "auxiliary_loss_mlp": 0.01032598, "balance_loss_clip": 1.03879142, "balance_loss_mlp": 1.02049828, "epoch": 0.6931609800090185, "flos": 26250018117120.0, "grad_norm": 1.5147905000718478, "language_loss": 0.76348805, "learning_rate": 9.088336106876491e-07, "loss": 0.78479135, "num_input_tokens_seen": 248775825, "step": 11529, "time_per_iteration": 2.7546539306640625 }, { "auxiliary_loss_clip": 0.01110971, "auxiliary_loss_mlp": 0.00770303, "balance_loss_clip": 1.03999567, "balance_loss_mlp": 1.00013018, "epoch": 0.6932211032616865, "flos": 32343376366080.0, "grad_norm": 1.6406393226660527, "language_loss": 0.7214883, "learning_rate": 9.085072404194436e-07, "loss": 0.74030107, "num_input_tokens_seen": 248796180, "step": 11530, "time_per_iteration": 2.6844561100006104 }, { "auxiliary_loss_clip": 0.01098446, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.04138708, "balance_loss_mlp": 1.02000356, "epoch": 0.6932812265143544, "flos": 22049079909120.0, "grad_norm": 1.6484845997572906, "language_loss": 0.78485453, "learning_rate": 9.081809115407513e-07, "loss": 0.80619252, "num_input_tokens_seen": 248814735, "step": 11531, "time_per_iteration": 2.753316879272461 }, { "auxiliary_loss_clip": 0.010964, "auxiliary_loss_mlp": 0.01038589, "balance_loss_clip": 1.03926003, "balance_loss_mlp": 1.02656698, "epoch": 0.6933413497670224, "flos": 26256626219520.0, "grad_norm": 1.5040049491252714, "language_loss": 0.69552708, "learning_rate": 9.078546240639484e-07, "loss": 0.71687698, "num_input_tokens_seen": 248839140, "step": 11532, "time_per_iteration": 2.7001755237579346 }, { "auxiliary_loss_clip": 0.01087082, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.03650141, "balance_loss_mlp": 1.01820564, "epoch": 0.6934014730196904, "flos": 19573003774080.0, "grad_norm": 1.314927604950551, "language_loss": 0.6689446, "learning_rate": 9.075283780014082e-07, "loss": 0.69013584, "num_input_tokens_seen": 248858300, "step": 11533, "time_per_iteration": 2.761096239089966 }, { "auxiliary_loss_clip": 0.01089563, "auxiliary_loss_mlp": 0.01038191, "balance_loss_clip": 1.04126263, "balance_loss_mlp": 1.02426171, "epoch": 0.6934615962723584, "flos": 22119249127680.0, "grad_norm": 3.125205881661687, "language_loss": 0.58564359, "learning_rate": 9.072021733655007e-07, "loss": 0.60692114, "num_input_tokens_seen": 248876310, "step": 11534, "time_per_iteration": 2.6929404735565186 }, { "auxiliary_loss_clip": 0.01078734, "auxiliary_loss_mlp": 0.01030158, "balance_loss_clip": 1.03795767, "balance_loss_mlp": 1.01613939, "epoch": 0.6935217195250263, "flos": 21360816432000.0, "grad_norm": 2.25045203731707, "language_loss": 0.71212113, "learning_rate": 9.068760101685971e-07, "loss": 0.73321003, "num_input_tokens_seen": 248895650, "step": 11535, "time_per_iteration": 2.68656849861145 }, { "auxiliary_loss_clip": 0.01013917, "auxiliary_loss_mlp": 0.01003832, "balance_loss_clip": 1.00924766, "balance_loss_mlp": 1.00264609, "epoch": 0.6935818427776943, "flos": 64063813115520.0, "grad_norm": 0.7110018734854711, "language_loss": 0.59062427, "learning_rate": 9.065498884230638e-07, "loss": 0.61080176, "num_input_tokens_seen": 248963920, "step": 11536, "time_per_iteration": 3.347024917602539 }, { "auxiliary_loss_clip": 0.0110154, "auxiliary_loss_mlp": 0.00771293, "balance_loss_clip": 1.04176164, "balance_loss_mlp": 1.00036359, "epoch": 0.6936419660303622, "flos": 20302564913280.0, "grad_norm": 1.511578579133692, "language_loss": 0.72917026, "learning_rate": 9.062238081412692e-07, "loss": 0.74789858, "num_input_tokens_seen": 248983380, "step": 11537, "time_per_iteration": 2.7138421535491943 }, { "auxiliary_loss_clip": 0.01022423, "auxiliary_loss_mlp": 0.00751474, "balance_loss_clip": 1.0083034, "balance_loss_mlp": 0.99969625, "epoch": 0.6937020892830302, "flos": 67182581347200.0, "grad_norm": 0.7456734981947979, "language_loss": 0.55525714, "learning_rate": 9.058977693355767e-07, "loss": 0.57299614, "num_input_tokens_seen": 249044680, "step": 11538, "time_per_iteration": 3.1686036586761475 }, { "auxiliary_loss_clip": 0.01097095, "auxiliary_loss_mlp": 0.01037153, "balance_loss_clip": 1.03834343, "balance_loss_mlp": 1.02519631, "epoch": 0.6937622125356981, "flos": 23878190229120.0, "grad_norm": 1.582889813468805, "language_loss": 0.77747178, "learning_rate": 9.055717720183505e-07, "loss": 0.79881424, "num_input_tokens_seen": 249061060, "step": 11539, "time_per_iteration": 2.7487149238586426 }, { "auxiliary_loss_clip": 0.01088793, "auxiliary_loss_mlp": 0.01029242, "balance_loss_clip": 1.03841698, "balance_loss_mlp": 1.01741016, "epoch": 0.6938223357883662, "flos": 28730619365760.0, "grad_norm": 1.696359380020658, "language_loss": 0.63957608, "learning_rate": 9.05245816201953e-07, "loss": 0.66075647, "num_input_tokens_seen": 249081430, "step": 11540, "time_per_iteration": 2.897141456604004 }, { "auxiliary_loss_clip": 0.01064567, "auxiliary_loss_mlp": 0.01031848, "balance_loss_clip": 1.0352695, "balance_loss_mlp": 1.01913404, "epoch": 0.6938824590410341, "flos": 28655027193600.0, "grad_norm": 1.5143087108308135, "language_loss": 0.86776996, "learning_rate": 9.049199018987437e-07, "loss": 0.8887341, "num_input_tokens_seen": 249103020, "step": 11541, "time_per_iteration": 2.790721893310547 }, { "auxiliary_loss_clip": 0.01113533, "auxiliary_loss_mlp": 0.00771014, "balance_loss_clip": 1.04010653, "balance_loss_mlp": 1.00017405, "epoch": 0.6939425822937021, "flos": 18983062800000.0, "grad_norm": 2.0467106914623483, "language_loss": 0.84313244, "learning_rate": 9.04594029121081e-07, "loss": 0.86197793, "num_input_tokens_seen": 249120810, "step": 11542, "time_per_iteration": 2.6897356510162354 }, { "auxiliary_loss_clip": 0.01101602, "auxiliary_loss_mlp": 0.0103373, "balance_loss_clip": 1.03908658, "balance_loss_mlp": 1.01946616, "epoch": 0.6940027055463701, "flos": 23075838178560.0, "grad_norm": 1.712845406510252, "language_loss": 0.75460529, "learning_rate": 9.04268197881323e-07, "loss": 0.7759586, "num_input_tokens_seen": 249138050, "step": 11543, "time_per_iteration": 2.6957714557647705 }, { "auxiliary_loss_clip": 0.01092628, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.03984666, "balance_loss_mlp": 1.01842248, "epoch": 0.694062828799038, "flos": 18186564666240.0, "grad_norm": 1.7601740067431768, "language_loss": 0.76118124, "learning_rate": 9.039424081918241e-07, "loss": 0.782417, "num_input_tokens_seen": 249155570, "step": 11544, "time_per_iteration": 2.6654560565948486 }, { "auxiliary_loss_clip": 0.01059106, "auxiliary_loss_mlp": 0.01041973, "balance_loss_clip": 1.03483558, "balance_loss_mlp": 1.02701259, "epoch": 0.694122952051706, "flos": 17821532701440.0, "grad_norm": 1.7077891138664472, "language_loss": 0.71304005, "learning_rate": 9.036166600649388e-07, "loss": 0.73405087, "num_input_tokens_seen": 249172960, "step": 11545, "time_per_iteration": 2.6869020462036133 }, { "auxiliary_loss_clip": 0.0109854, "auxiliary_loss_mlp": 0.01030108, "balance_loss_clip": 1.04018188, "balance_loss_mlp": 1.01828814, "epoch": 0.694183075304374, "flos": 21215306436480.0, "grad_norm": 1.7532682541368763, "language_loss": 0.79367101, "learning_rate": 9.0329095351302e-07, "loss": 0.8149575, "num_input_tokens_seen": 249192450, "step": 11546, "time_per_iteration": 2.6320011615753174 }, { "auxiliary_loss_clip": 0.01080505, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.03777122, "balance_loss_mlp": 1.01704824, "epoch": 0.694243198557042, "flos": 24060508686720.0, "grad_norm": 1.4008683277346297, "language_loss": 0.78635859, "learning_rate": 9.029652885484194e-07, "loss": 0.80746305, "num_input_tokens_seen": 249214320, "step": 11547, "time_per_iteration": 2.7307076454162598 }, { "auxiliary_loss_clip": 0.010916, "auxiliary_loss_mlp": 0.00771764, "balance_loss_clip": 1.04151332, "balance_loss_mlp": 1.00021195, "epoch": 0.6943033218097099, "flos": 21141869080320.0, "grad_norm": 2.101396590702846, "language_loss": 0.80507267, "learning_rate": 9.026396651834834e-07, "loss": 0.82370633, "num_input_tokens_seen": 249230925, "step": 11548, "time_per_iteration": 4.426462650299072 }, { "auxiliary_loss_clip": 0.01032364, "auxiliary_loss_mlp": 0.0075149, "balance_loss_clip": 1.00922537, "balance_loss_mlp": 0.99970454, "epoch": 0.6943634450623779, "flos": 57812015975040.0, "grad_norm": 0.6903286764237632, "language_loss": 0.53703904, "learning_rate": 9.023140834305613e-07, "loss": 0.55487758, "num_input_tokens_seen": 249293975, "step": 11549, "time_per_iteration": 3.1308066844940186 }, { "auxiliary_loss_clip": 0.01093982, "auxiliary_loss_mlp": 0.01035759, "balance_loss_clip": 1.03542507, "balance_loss_mlp": 1.02189505, "epoch": 0.6944235683150458, "flos": 30590684231040.0, "grad_norm": 1.55426436192092, "language_loss": 0.73198104, "learning_rate": 9.01988543302e-07, "loss": 0.75327837, "num_input_tokens_seen": 249315285, "step": 11550, "time_per_iteration": 5.8028564453125 }, { "auxiliary_loss_clip": 0.010896, "auxiliary_loss_mlp": 0.01039664, "balance_loss_clip": 1.04099548, "balance_loss_mlp": 1.02650332, "epoch": 0.6944836915677138, "flos": 19719447523200.0, "grad_norm": 1.9506864007678324, "language_loss": 0.74081314, "learning_rate": 9.016630448101425e-07, "loss": 0.76210582, "num_input_tokens_seen": 249333505, "step": 11551, "time_per_iteration": 2.665813446044922 }, { "auxiliary_loss_clip": 0.01114588, "auxiliary_loss_mlp": 0.01038306, "balance_loss_clip": 1.0404079, "balance_loss_mlp": 1.0249548, "epoch": 0.6945438148203817, "flos": 24863579009280.0, "grad_norm": 1.5863003603219143, "language_loss": 0.84288925, "learning_rate": 9.01337587967333e-07, "loss": 0.86441821, "num_input_tokens_seen": 249354180, "step": 11552, "time_per_iteration": 2.8407604694366455 }, { "auxiliary_loss_clip": 0.01112485, "auxiliary_loss_mlp": 0.01035825, "balance_loss_clip": 1.03997219, "balance_loss_mlp": 1.02287877, "epoch": 0.6946039380730498, "flos": 33326646243840.0, "grad_norm": 1.6205787984736582, "language_loss": 0.6727165, "learning_rate": 9.010121727859117e-07, "loss": 0.69419956, "num_input_tokens_seen": 249377035, "step": 11553, "time_per_iteration": 2.7572171688079834 }, { "auxiliary_loss_clip": 0.01097133, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.04150629, "balance_loss_mlp": 1.0176357, "epoch": 0.6946640613257177, "flos": 20850956830080.0, "grad_norm": 2.0885031059024017, "language_loss": 0.79817116, "learning_rate": 9.006867992782195e-07, "loss": 0.81946045, "num_input_tokens_seen": 249396155, "step": 11554, "time_per_iteration": 2.721204996109009 }, { "auxiliary_loss_clip": 0.01101639, "auxiliary_loss_mlp": 0.01028417, "balance_loss_clip": 1.03683937, "balance_loss_mlp": 1.01538706, "epoch": 0.6947241845783857, "flos": 19354846521600.0, "grad_norm": 5.498909507177023, "language_loss": 0.72485244, "learning_rate": 9.003614674565934e-07, "loss": 0.746153, "num_input_tokens_seen": 249414555, "step": 11555, "time_per_iteration": 2.5764734745025635 }, { "auxiliary_loss_clip": 0.01075985, "auxiliary_loss_mlp": 0.01033395, "balance_loss_clip": 1.0355674, "balance_loss_mlp": 1.02071118, "epoch": 0.6947843078310536, "flos": 27120240915840.0, "grad_norm": 1.691992683709007, "language_loss": 0.78099442, "learning_rate": 9.000361773333705e-07, "loss": 0.80208826, "num_input_tokens_seen": 249433570, "step": 11556, "time_per_iteration": 2.709371328353882 }, { "auxiliary_loss_clip": 0.01053238, "auxiliary_loss_mlp": 0.01042754, "balance_loss_clip": 1.03567553, "balance_loss_mlp": 1.02977192, "epoch": 0.6948444310837216, "flos": 28585109370240.0, "grad_norm": 2.67608324512941, "language_loss": 0.6078257, "learning_rate": 8.997109289208869e-07, "loss": 0.62878561, "num_input_tokens_seen": 249453735, "step": 11557, "time_per_iteration": 2.802755832672119 }, { "auxiliary_loss_clip": 0.01091412, "auxiliary_loss_mlp": 0.01036617, "balance_loss_clip": 1.04582477, "balance_loss_mlp": 1.02432072, "epoch": 0.6949045543363896, "flos": 15669262696320.0, "grad_norm": 1.8868639353826757, "language_loss": 0.85245895, "learning_rate": 8.993857222314752e-07, "loss": 0.87373924, "num_input_tokens_seen": 249470805, "step": 11558, "time_per_iteration": 4.191239595413208 }, { "auxiliary_loss_clip": 0.01103665, "auxiliary_loss_mlp": 0.01036679, "balance_loss_clip": 1.03848016, "balance_loss_mlp": 1.02259421, "epoch": 0.6949646775890576, "flos": 23259413612160.0, "grad_norm": 1.6011995670577914, "language_loss": 0.70525056, "learning_rate": 8.990605572774664e-07, "loss": 0.72665399, "num_input_tokens_seen": 249491150, "step": 11559, "time_per_iteration": 2.7076830863952637 }, { "auxiliary_loss_clip": 0.01078357, "auxiliary_loss_mlp": 0.01032244, "balance_loss_clip": 1.03816533, "balance_loss_mlp": 1.01998925, "epoch": 0.6950248008417256, "flos": 22382546797440.0, "grad_norm": 2.0259020832909234, "language_loss": 0.78594178, "learning_rate": 8.987354340711921e-07, "loss": 0.80704772, "num_input_tokens_seen": 249511560, "step": 11560, "time_per_iteration": 2.7197508811950684 }, { "auxiliary_loss_clip": 0.01087442, "auxiliary_loss_mlp": 0.01034646, "balance_loss_clip": 1.03931344, "balance_loss_mlp": 1.0221293, "epoch": 0.6950849240943935, "flos": 23477355383040.0, "grad_norm": 1.532648657325296, "language_loss": 0.76758087, "learning_rate": 8.9841035262498e-07, "loss": 0.78880179, "num_input_tokens_seen": 249531910, "step": 11561, "time_per_iteration": 2.707702159881592 }, { "auxiliary_loss_clip": 0.01108982, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.03717422, "balance_loss_mlp": 1.01877272, "epoch": 0.6951450473470615, "flos": 17420554200960.0, "grad_norm": 1.812422200416747, "language_loss": 0.78550988, "learning_rate": 8.980853129511577e-07, "loss": 0.80692589, "num_input_tokens_seen": 249550300, "step": 11562, "time_per_iteration": 2.5765740871429443 }, { "auxiliary_loss_clip": 0.01104346, "auxiliary_loss_mlp": 0.01034539, "balance_loss_clip": 1.0394088, "balance_loss_mlp": 1.02134836, "epoch": 0.6952051705997294, "flos": 20485745297280.0, "grad_norm": 1.9668484309221967, "language_loss": 0.69117391, "learning_rate": 8.977603150620515e-07, "loss": 0.7125628, "num_input_tokens_seen": 249567740, "step": 11563, "time_per_iteration": 2.6727218627929688 }, { "auxiliary_loss_clip": 0.01090765, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.03766811, "balance_loss_mlp": 1.0160023, "epoch": 0.6952652938523974, "flos": 13989541040640.0, "grad_norm": 2.495686990019142, "language_loss": 0.73530227, "learning_rate": 8.974353589699846e-07, "loss": 0.75649452, "num_input_tokens_seen": 249582700, "step": 11564, "time_per_iteration": 2.576385259628296 }, { "auxiliary_loss_clip": 0.01083646, "auxiliary_loss_mlp": 0.01038821, "balance_loss_clip": 1.04269266, "balance_loss_mlp": 1.02250147, "epoch": 0.6953254171050653, "flos": 30953956429440.0, "grad_norm": 1.8121742039667086, "language_loss": 0.71753776, "learning_rate": 8.971104446872785e-07, "loss": 0.73876244, "num_input_tokens_seen": 249602920, "step": 11565, "time_per_iteration": 2.732823133468628 }, { "auxiliary_loss_clip": 0.01016312, "auxiliary_loss_mlp": 0.01000486, "balance_loss_clip": 1.01167345, "balance_loss_mlp": 0.99898958, "epoch": 0.6953855403577334, "flos": 61670257499520.0, "grad_norm": 0.9560441236848968, "language_loss": 0.58358735, "learning_rate": 8.96785572226255e-07, "loss": 0.60375541, "num_input_tokens_seen": 249660400, "step": 11566, "time_per_iteration": 3.0193676948547363 }, { "auxiliary_loss_clip": 0.01081084, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.04008102, "balance_loss_mlp": 1.01653004, "epoch": 0.6954456636104013, "flos": 23039029716480.0, "grad_norm": 1.9855993328717996, "language_loss": 0.7417689, "learning_rate": 8.964607415992338e-07, "loss": 0.76289153, "num_input_tokens_seen": 249679335, "step": 11567, "time_per_iteration": 2.72933030128479 }, { "auxiliary_loss_clip": 0.01081196, "auxiliary_loss_mlp": 0.01034089, "balance_loss_clip": 1.03550458, "balance_loss_mlp": 1.02039182, "epoch": 0.6955057868630693, "flos": 23918518224000.0, "grad_norm": 1.2846819146580761, "language_loss": 0.76948917, "learning_rate": 8.961359528185313e-07, "loss": 0.79064202, "num_input_tokens_seen": 249701805, "step": 11568, "time_per_iteration": 2.715871572494507 }, { "auxiliary_loss_clip": 0.01096832, "auxiliary_loss_mlp": 0.01035387, "balance_loss_clip": 1.04105902, "balance_loss_mlp": 1.02265501, "epoch": 0.6955659101157372, "flos": 22594634651520.0, "grad_norm": 1.619102090378134, "language_loss": 0.72502244, "learning_rate": 8.958112058964649e-07, "loss": 0.74634463, "num_input_tokens_seen": 249720550, "step": 11569, "time_per_iteration": 2.645249366760254 }, { "auxiliary_loss_clip": 0.01091211, "auxiliary_loss_mlp": 0.01033237, "balance_loss_clip": 1.04227805, "balance_loss_mlp": 1.01993299, "epoch": 0.6956260333684052, "flos": 24572523104640.0, "grad_norm": 1.7249170948582337, "language_loss": 0.76852113, "learning_rate": 8.954865008453471e-07, "loss": 0.78976554, "num_input_tokens_seen": 249740325, "step": 11570, "time_per_iteration": 2.7455241680145264 }, { "auxiliary_loss_clip": 0.01102536, "auxiliary_loss_mlp": 0.01035635, "balance_loss_clip": 1.03880751, "balance_loss_mlp": 1.0223434, "epoch": 0.6956861566210732, "flos": 25846058787840.0, "grad_norm": 2.852635379776112, "language_loss": 0.7431376, "learning_rate": 8.95161837677493e-07, "loss": 0.76451933, "num_input_tokens_seen": 249760570, "step": 11571, "time_per_iteration": 2.6448328495025635 }, { "auxiliary_loss_clip": 0.01094888, "auxiliary_loss_mlp": 0.01033197, "balance_loss_clip": 1.03635645, "balance_loss_mlp": 1.01997066, "epoch": 0.6957462798737412, "flos": 15301393557120.0, "grad_norm": 1.759907053555304, "language_loss": 0.74442685, "learning_rate": 8.948372164052118e-07, "loss": 0.76570773, "num_input_tokens_seen": 249778290, "step": 11572, "time_per_iteration": 2.6260786056518555 }, { "auxiliary_loss_clip": 0.01089599, "auxiliary_loss_mlp": 0.01029265, "balance_loss_clip": 1.03659272, "balance_loss_mlp": 1.01614022, "epoch": 0.6958064031264092, "flos": 36246830135040.0, "grad_norm": 1.9436396296550662, "language_loss": 0.7025919, "learning_rate": 8.94512637040814e-07, "loss": 0.72378051, "num_input_tokens_seen": 249800925, "step": 11573, "time_per_iteration": 2.783256769180298 }, { "auxiliary_loss_clip": 0.0109259, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.04023504, "balance_loss_mlp": 1.02252948, "epoch": 0.6958665263790771, "flos": 19208725994880.0, "grad_norm": 2.0706527554899137, "language_loss": 0.75003505, "learning_rate": 8.941880995966095e-07, "loss": 0.77132565, "num_input_tokens_seen": 249820500, "step": 11574, "time_per_iteration": 2.684457540512085 }, { "auxiliary_loss_clip": 0.01077067, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.03447127, "balance_loss_mlp": 1.02117276, "epoch": 0.6959266496317451, "flos": 21795838047360.0, "grad_norm": 1.601224427976484, "language_loss": 0.74403846, "learning_rate": 8.938636040849014e-07, "loss": 0.76515353, "num_input_tokens_seen": 249839845, "step": 11575, "time_per_iteration": 2.7856502532958984 }, { "auxiliary_loss_clip": 0.01102844, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.03945291, "balance_loss_mlp": 1.01965618, "epoch": 0.695986772884413, "flos": 20558248899840.0, "grad_norm": 1.7874437641987468, "language_loss": 0.78887069, "learning_rate": 8.935391505179966e-07, "loss": 0.81022847, "num_input_tokens_seen": 249857400, "step": 11576, "time_per_iteration": 2.6610217094421387 }, { "auxiliary_loss_clip": 0.01068698, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.03670073, "balance_loss_mlp": 1.02041745, "epoch": 0.696046896137081, "flos": 14936217937920.0, "grad_norm": 2.693764444619471, "language_loss": 0.567918, "learning_rate": 8.932147389081985e-07, "loss": 0.58893347, "num_input_tokens_seen": 249871645, "step": 11577, "time_per_iteration": 2.666973114013672 }, { "auxiliary_loss_clip": 0.01034011, "auxiliary_loss_mlp": 0.01034025, "balance_loss_clip": 1.03415358, "balance_loss_mlp": 1.02244925, "epoch": 0.696107019389749, "flos": 30740216549760.0, "grad_norm": 1.3814908758376254, "language_loss": 0.77030635, "learning_rate": 8.928903692678081e-07, "loss": 0.79098672, "num_input_tokens_seen": 249894215, "step": 11578, "time_per_iteration": 2.8858745098114014 }, { "auxiliary_loss_clip": 0.01078498, "auxiliary_loss_mlp": 0.01037884, "balance_loss_clip": 1.03798914, "balance_loss_mlp": 1.02474117, "epoch": 0.696167142642417, "flos": 20776729374720.0, "grad_norm": 1.8210944500658799, "language_loss": 0.79498136, "learning_rate": 8.925660416091254e-07, "loss": 0.81614518, "num_input_tokens_seen": 249912850, "step": 11579, "time_per_iteration": 2.664579153060913 }, { "auxiliary_loss_clip": 0.01072667, "auxiliary_loss_mlp": 0.01030035, "balance_loss_clip": 1.03569424, "balance_loss_mlp": 1.01685047, "epoch": 0.6962272658950849, "flos": 22565152563840.0, "grad_norm": 1.691551451223947, "language_loss": 0.72261667, "learning_rate": 8.922417559444502e-07, "loss": 0.7436437, "num_input_tokens_seen": 249932650, "step": 11580, "time_per_iteration": 2.61865496635437 }, { "auxiliary_loss_clip": 0.01096209, "auxiliary_loss_mlp": 0.0103361, "balance_loss_clip": 1.04078209, "balance_loss_mlp": 1.01977623, "epoch": 0.6962873891477529, "flos": 22200156512640.0, "grad_norm": 2.100362129300219, "language_loss": 0.65822804, "learning_rate": 8.919175122860787e-07, "loss": 0.67952627, "num_input_tokens_seen": 249951205, "step": 11581, "time_per_iteration": 2.559589385986328 }, { "auxiliary_loss_clip": 0.0111328, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.03978491, "balance_loss_mlp": 1.02038205, "epoch": 0.6963475124004208, "flos": 12489695717760.0, "grad_norm": 1.963411305318447, "language_loss": 0.76478052, "learning_rate": 8.915933106463056e-07, "loss": 0.78624392, "num_input_tokens_seen": 249967045, "step": 11582, "time_per_iteration": 2.4627199172973633 }, { "auxiliary_loss_clip": 0.01086826, "auxiliary_loss_mlp": 0.01032933, "balance_loss_clip": 1.03617883, "balance_loss_mlp": 1.0209347, "epoch": 0.6964076356530888, "flos": 17165085696000.0, "grad_norm": 2.040960017039001, "language_loss": 0.69914186, "learning_rate": 8.91269151037425e-07, "loss": 0.72033942, "num_input_tokens_seen": 249984565, "step": 11583, "time_per_iteration": 2.5302255153656006 }, { "auxiliary_loss_clip": 0.01087174, "auxiliary_loss_mlp": 0.01035499, "balance_loss_clip": 1.0447557, "balance_loss_mlp": 1.02192092, "epoch": 0.6964677589057569, "flos": 19937317466880.0, "grad_norm": 4.628034393643301, "language_loss": 0.8247509, "learning_rate": 8.909450334717301e-07, "loss": 0.84597766, "num_input_tokens_seen": 250004235, "step": 11584, "time_per_iteration": 2.6446306705474854 }, { "auxiliary_loss_clip": 0.01064623, "auxiliary_loss_mlp": 0.01039599, "balance_loss_clip": 1.04226518, "balance_loss_mlp": 1.02518678, "epoch": 0.6965278821584248, "flos": 22784064001920.0, "grad_norm": 2.204044356312338, "language_loss": 0.80097932, "learning_rate": 8.906209579615107e-07, "loss": 0.82202154, "num_input_tokens_seen": 250017645, "step": 11585, "time_per_iteration": 2.739288568496704 }, { "auxiliary_loss_clip": 0.01109133, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.03880882, "balance_loss_mlp": 1.02158463, "epoch": 0.6965880054110928, "flos": 20047563285120.0, "grad_norm": 1.6413269485286424, "language_loss": 0.7727071, "learning_rate": 8.90296924519055e-07, "loss": 0.79413539, "num_input_tokens_seen": 250037640, "step": 11586, "time_per_iteration": 2.624624013900757 }, { "auxiliary_loss_clip": 0.01098392, "auxiliary_loss_mlp": 0.01031126, "balance_loss_clip": 1.03904796, "balance_loss_mlp": 1.0188241, "epoch": 0.6966481286637607, "flos": 21908238681600.0, "grad_norm": 1.6313830444330661, "language_loss": 0.78602171, "learning_rate": 8.899729331566519e-07, "loss": 0.8073169, "num_input_tokens_seen": 250056490, "step": 11587, "time_per_iteration": 4.355099439620972 }, { "auxiliary_loss_clip": 0.01088702, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.04040754, "balance_loss_mlp": 1.02037311, "epoch": 0.6967082519164287, "flos": 15633172506240.0, "grad_norm": 1.7929628743992274, "language_loss": 0.72862899, "learning_rate": 8.896489838865857e-07, "loss": 0.74984908, "num_input_tokens_seen": 250074285, "step": 11588, "time_per_iteration": 2.609231472015381 }, { "auxiliary_loss_clip": 0.01084626, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.03783381, "balance_loss_mlp": 1.01608634, "epoch": 0.6967683751690966, "flos": 24024598064640.0, "grad_norm": 1.8203274112017525, "language_loss": 0.75158805, "learning_rate": 8.893250767211413e-07, "loss": 0.77271104, "num_input_tokens_seen": 250093350, "step": 11589, "time_per_iteration": 4.13300895690918 }, { "auxiliary_loss_clip": 0.01092018, "auxiliary_loss_mlp": 0.01029816, "balance_loss_clip": 1.0398705, "balance_loss_mlp": 1.01764512, "epoch": 0.6968284984217646, "flos": 31024700265600.0, "grad_norm": 2.003391247755446, "language_loss": 0.63547194, "learning_rate": 8.890012116726012e-07, "loss": 0.6566903, "num_input_tokens_seen": 250114170, "step": 11590, "time_per_iteration": 4.382747411727905 }, { "auxiliary_loss_clip": 0.0099554, "auxiliary_loss_mlp": 0.01020589, "balance_loss_clip": 1.01743388, "balance_loss_mlp": 1.01875329, "epoch": 0.6968886216744326, "flos": 67622990002560.0, "grad_norm": 0.7509859568380568, "language_loss": 0.61225605, "learning_rate": 8.88677388753248e-07, "loss": 0.63241732, "num_input_tokens_seen": 250178250, "step": 11591, "time_per_iteration": 3.3300459384918213 }, { "auxiliary_loss_clip": 0.01070341, "auxiliary_loss_mlp": 0.00770828, "balance_loss_clip": 1.04759347, "balance_loss_mlp": 1.0002979, "epoch": 0.6969487449271006, "flos": 24863686750080.0, "grad_norm": 3.0435355552778893, "language_loss": 0.69087148, "learning_rate": 8.883536079753582e-07, "loss": 0.70928317, "num_input_tokens_seen": 250198420, "step": 11592, "time_per_iteration": 2.8862390518188477 }, { "auxiliary_loss_clip": 0.010765, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.03645754, "balance_loss_mlp": 1.01758289, "epoch": 0.6970088681797685, "flos": 28767858791040.0, "grad_norm": 1.559625176666528, "language_loss": 0.6217618, "learning_rate": 8.880298693512109e-07, "loss": 0.64283013, "num_input_tokens_seen": 250220650, "step": 11593, "time_per_iteration": 2.743360757827759 }, { "auxiliary_loss_clip": 0.01085759, "auxiliary_loss_mlp": 0.01027148, "balance_loss_clip": 1.0385946, "balance_loss_mlp": 1.01526248, "epoch": 0.6970689914324365, "flos": 27308556944640.0, "grad_norm": 4.989009563072009, "language_loss": 0.54315436, "learning_rate": 8.877061728930832e-07, "loss": 0.56428343, "num_input_tokens_seen": 250241750, "step": 11594, "time_per_iteration": 2.738746404647827 }, { "auxiliary_loss_clip": 0.01100892, "auxiliary_loss_mlp": 0.01029271, "balance_loss_clip": 1.04011106, "balance_loss_mlp": 1.01718903, "epoch": 0.6971291146851044, "flos": 19136258305920.0, "grad_norm": 1.879143273787494, "language_loss": 0.76764494, "learning_rate": 8.87382518613248e-07, "loss": 0.78894663, "num_input_tokens_seen": 250259445, "step": 11595, "time_per_iteration": 2.6188101768493652 }, { "auxiliary_loss_clip": 0.01091633, "auxiliary_loss_mlp": 0.00771425, "balance_loss_clip": 1.04053104, "balance_loss_mlp": 1.00017834, "epoch": 0.6971892379377724, "flos": 14610508387200.0, "grad_norm": 3.031219644590871, "language_loss": 0.71711326, "learning_rate": 8.870589065239793e-07, "loss": 0.73574388, "num_input_tokens_seen": 250275640, "step": 11596, "time_per_iteration": 4.301288843154907 }, { "auxiliary_loss_clip": 0.01114621, "auxiliary_loss_mlp": 0.0103195, "balance_loss_clip": 1.04142773, "balance_loss_mlp": 1.0187242, "epoch": 0.6972493611904405, "flos": 22307457415680.0, "grad_norm": 1.6942281967354775, "language_loss": 0.76373446, "learning_rate": 8.867353366375492e-07, "loss": 0.78520012, "num_input_tokens_seen": 250296435, "step": 11597, "time_per_iteration": 2.6642062664031982 }, { "auxiliary_loss_clip": 0.0110086, "auxiliary_loss_mlp": 0.01034036, "balance_loss_clip": 1.03868103, "balance_loss_mlp": 1.02113771, "epoch": 0.6973094844431084, "flos": 17420374632960.0, "grad_norm": 1.9087026852292102, "language_loss": 0.74286294, "learning_rate": 8.864118089662267e-07, "loss": 0.76421189, "num_input_tokens_seen": 250314035, "step": 11598, "time_per_iteration": 2.6227927207946777 }, { "auxiliary_loss_clip": 0.01097599, "auxiliary_loss_mlp": 0.01035152, "balance_loss_clip": 1.04099619, "balance_loss_mlp": 1.02143049, "epoch": 0.6973696076957764, "flos": 27235370983680.0, "grad_norm": 1.7329159354231054, "language_loss": 0.89613545, "learning_rate": 8.860883235222791e-07, "loss": 0.91746294, "num_input_tokens_seen": 250332995, "step": 11599, "time_per_iteration": 2.6820266246795654 }, { "auxiliary_loss_clip": 0.0111129, "auxiliary_loss_mlp": 0.01041078, "balance_loss_clip": 1.04336691, "balance_loss_mlp": 1.02599812, "epoch": 0.6974297309484443, "flos": 22018089450240.0, "grad_norm": 2.876326408269763, "language_loss": 0.69646597, "learning_rate": 8.85764880317974e-07, "loss": 0.71798968, "num_input_tokens_seen": 250352120, "step": 11600, "time_per_iteration": 2.6357643604278564 }, { "auxiliary_loss_clip": 0.01071835, "auxiliary_loss_mlp": 0.01034549, "balance_loss_clip": 1.03591609, "balance_loss_mlp": 1.02162719, "epoch": 0.6974898542011123, "flos": 28366449327360.0, "grad_norm": 1.8870788782898071, "language_loss": 0.77037942, "learning_rate": 8.854414793655771e-07, "loss": 0.79144335, "num_input_tokens_seen": 250371705, "step": 11601, "time_per_iteration": 2.767747402191162 }, { "auxiliary_loss_clip": 0.01095268, "auxiliary_loss_mlp": 0.00769859, "balance_loss_clip": 1.03727877, "balance_loss_mlp": 1.0001725, "epoch": 0.6975499774537802, "flos": 15232050351360.0, "grad_norm": 1.9951550875439237, "language_loss": 0.7223537, "learning_rate": 8.851181206773508e-07, "loss": 0.74100494, "num_input_tokens_seen": 250390485, "step": 11602, "time_per_iteration": 2.7312278747558594 }, { "auxiliary_loss_clip": 0.0109282, "auxiliary_loss_mlp": 0.00770776, "balance_loss_clip": 1.03932607, "balance_loss_mlp": 1.00030899, "epoch": 0.6976101007064482, "flos": 22157422306560.0, "grad_norm": 2.0827075826583115, "language_loss": 0.76365876, "learning_rate": 8.847948042655567e-07, "loss": 0.78229469, "num_input_tokens_seen": 250407020, "step": 11603, "time_per_iteration": 2.689286231994629 }, { "auxiliary_loss_clip": 0.01062872, "auxiliary_loss_mlp": 0.01031896, "balance_loss_clip": 1.03511834, "balance_loss_mlp": 1.01951063, "epoch": 0.6976702239591162, "flos": 22273522041600.0, "grad_norm": 1.585564011892126, "language_loss": 0.62287712, "learning_rate": 8.844715301424557e-07, "loss": 0.64382482, "num_input_tokens_seen": 250425880, "step": 11604, "time_per_iteration": 2.7053442001342773 }, { "auxiliary_loss_clip": 0.01097384, "auxiliary_loss_mlp": 0.01033963, "balance_loss_clip": 1.0394032, "balance_loss_mlp": 1.01989651, "epoch": 0.6977303472117842, "flos": 25848608653440.0, "grad_norm": 2.5387493951629954, "language_loss": 0.82072401, "learning_rate": 8.841482983203057e-07, "loss": 0.8420375, "num_input_tokens_seen": 250442925, "step": 11605, "time_per_iteration": 2.62129545211792 }, { "auxiliary_loss_clip": 0.01101547, "auxiliary_loss_mlp": 0.01035684, "balance_loss_clip": 1.03945065, "balance_loss_mlp": 1.02364397, "epoch": 0.6977904704644521, "flos": 20959586536320.0, "grad_norm": 1.5446115393296036, "language_loss": 0.70200372, "learning_rate": 8.838251088113638e-07, "loss": 0.72337604, "num_input_tokens_seen": 250461220, "step": 11606, "time_per_iteration": 2.5922529697418213 }, { "auxiliary_loss_clip": 0.01092847, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.03967309, "balance_loss_mlp": 1.01759934, "epoch": 0.6978505937171201, "flos": 22055041566720.0, "grad_norm": 2.256488814952284, "language_loss": 0.82503331, "learning_rate": 8.835019616278856e-07, "loss": 0.84626418, "num_input_tokens_seen": 250480975, "step": 11607, "time_per_iteration": 2.6494314670562744 }, { "auxiliary_loss_clip": 0.0109393, "auxiliary_loss_mlp": 0.01035205, "balance_loss_clip": 1.04016328, "balance_loss_mlp": 1.02121639, "epoch": 0.697910716969788, "flos": 20043720529920.0, "grad_norm": 1.8306370219101196, "language_loss": 0.78975695, "learning_rate": 8.831788567821265e-07, "loss": 0.81104833, "num_input_tokens_seen": 250497980, "step": 11608, "time_per_iteration": 2.6512763500213623 }, { "auxiliary_loss_clip": 0.01095127, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.03882265, "balance_loss_mlp": 1.02155733, "epoch": 0.697970840222456, "flos": 15888245961600.0, "grad_norm": 2.0581522063930104, "language_loss": 0.89782465, "learning_rate": 8.828557942863357e-07, "loss": 0.91912538, "num_input_tokens_seen": 250511910, "step": 11609, "time_per_iteration": 2.608104944229126 }, { "auxiliary_loss_clip": 0.01078996, "auxiliary_loss_mlp": 0.01028801, "balance_loss_clip": 1.03936923, "balance_loss_mlp": 1.0155983, "epoch": 0.698030963475124, "flos": 21215629658880.0, "grad_norm": 1.5744545790773263, "language_loss": 0.63836277, "learning_rate": 8.82532774152765e-07, "loss": 0.65944076, "num_input_tokens_seen": 250531090, "step": 11610, "time_per_iteration": 2.743638038635254 }, { "auxiliary_loss_clip": 0.0108087, "auxiliary_loss_mlp": 0.01031692, "balance_loss_clip": 1.03968239, "balance_loss_mlp": 1.01942515, "epoch": 0.698091086727792, "flos": 33759728524800.0, "grad_norm": 1.9401424134223804, "language_loss": 0.84452772, "learning_rate": 8.822097963936643e-07, "loss": 0.86565328, "num_input_tokens_seen": 250551565, "step": 11611, "time_per_iteration": 2.840013265609741 }, { "auxiliary_loss_clip": 0.01102996, "auxiliary_loss_mlp": 0.01033816, "balance_loss_clip": 1.03944659, "balance_loss_mlp": 1.02075076, "epoch": 0.69815120998046, "flos": 15887850912000.0, "grad_norm": 1.998962318660509, "language_loss": 0.70772141, "learning_rate": 8.818868610212793e-07, "loss": 0.7290895, "num_input_tokens_seen": 250569625, "step": 11612, "time_per_iteration": 2.625783681869507 }, { "auxiliary_loss_clip": 0.01094811, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.03740323, "balance_loss_mlp": 1.01831591, "epoch": 0.6982113332331279, "flos": 18947044437120.0, "grad_norm": 1.6349871835857621, "language_loss": 0.80866611, "learning_rate": 8.815639680478573e-07, "loss": 0.82992887, "num_input_tokens_seen": 250586960, "step": 11613, "time_per_iteration": 2.601461887359619 }, { "auxiliary_loss_clip": 0.0110142, "auxiliary_loss_mlp": 0.01035984, "balance_loss_clip": 1.03952324, "balance_loss_mlp": 1.02403307, "epoch": 0.6982714564857959, "flos": 24389594115840.0, "grad_norm": 1.8644816544131795, "language_loss": 0.75648409, "learning_rate": 8.812411174856411e-07, "loss": 0.77785814, "num_input_tokens_seen": 250605080, "step": 11614, "time_per_iteration": 2.5961225032806396 }, { "auxiliary_loss_clip": 0.01054504, "auxiliary_loss_mlp": 0.01034441, "balance_loss_clip": 1.04426599, "balance_loss_mlp": 1.0214231, "epoch": 0.6983315797384638, "flos": 20083725302400.0, "grad_norm": 2.0397367451391033, "language_loss": 0.77191758, "learning_rate": 8.809183093468746e-07, "loss": 0.79280698, "num_input_tokens_seen": 250623965, "step": 11615, "time_per_iteration": 2.9072482585906982 }, { "auxiliary_loss_clip": 0.01083429, "auxiliary_loss_mlp": 0.01032942, "balance_loss_clip": 1.03927791, "balance_loss_mlp": 1.0201925, "epoch": 0.6983917029911318, "flos": 13512431664000.0, "grad_norm": 1.8809513154033768, "language_loss": 0.73199165, "learning_rate": 8.80595543643797e-07, "loss": 0.75315541, "num_input_tokens_seen": 250640675, "step": 11616, "time_per_iteration": 2.961540937423706 }, { "auxiliary_loss_clip": 0.01114861, "auxiliary_loss_mlp": 0.01037909, "balance_loss_clip": 1.04299331, "balance_loss_mlp": 1.02498102, "epoch": 0.6984518262437998, "flos": 22018412672640.0, "grad_norm": 1.6387107954550246, "language_loss": 0.84313893, "learning_rate": 8.802728203886487e-07, "loss": 0.86466658, "num_input_tokens_seen": 250660295, "step": 11617, "time_per_iteration": 2.586671829223633 }, { "auxiliary_loss_clip": 0.01074632, "auxiliary_loss_mlp": 0.01043262, "balance_loss_clip": 1.03665471, "balance_loss_mlp": 1.02901638, "epoch": 0.6985119494964678, "flos": 18770615809920.0, "grad_norm": 2.7737807419222102, "language_loss": 0.59687322, "learning_rate": 8.799501395936682e-07, "loss": 0.61805212, "num_input_tokens_seen": 250678155, "step": 11618, "time_per_iteration": 2.6617705821990967 }, { "auxiliary_loss_clip": 0.01090766, "auxiliary_loss_mlp": 0.01037543, "balance_loss_clip": 1.04022956, "balance_loss_mlp": 1.02521658, "epoch": 0.6985720727491357, "flos": 22382834106240.0, "grad_norm": 1.6558560034629248, "language_loss": 0.83071142, "learning_rate": 8.796275012710903e-07, "loss": 0.85199451, "num_input_tokens_seen": 250697230, "step": 11619, "time_per_iteration": 2.6859943866729736 }, { "auxiliary_loss_clip": 0.0109875, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 1.04005098, "balance_loss_mlp": 1.01863778, "epoch": 0.6986321960018037, "flos": 39567884785920.0, "grad_norm": 1.8289485555178766, "language_loss": 0.67044997, "learning_rate": 8.793049054331494e-07, "loss": 0.69173336, "num_input_tokens_seen": 250719865, "step": 11620, "time_per_iteration": 2.7397263050079346 }, { "auxiliary_loss_clip": 0.01062849, "auxiliary_loss_mlp": 0.01030294, "balance_loss_clip": 1.03714263, "balance_loss_mlp": 1.01732397, "epoch": 0.6986923192544716, "flos": 17967725055360.0, "grad_norm": 2.0135534332411353, "language_loss": 0.72860134, "learning_rate": 8.789823520920794e-07, "loss": 0.7495327, "num_input_tokens_seen": 250736565, "step": 11621, "time_per_iteration": 2.731579303741455 }, { "auxiliary_loss_clip": 0.01060219, "auxiliary_loss_mlp": 0.01043603, "balance_loss_clip": 1.03684866, "balance_loss_mlp": 1.02971494, "epoch": 0.6987524425071396, "flos": 25594325297280.0, "grad_norm": 2.5508212623422573, "language_loss": 0.68065464, "learning_rate": 8.7865984126011e-07, "loss": 0.70169282, "num_input_tokens_seen": 250757235, "step": 11622, "time_per_iteration": 2.7399957180023193 }, { "auxiliary_loss_clip": 0.01044503, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.03589344, "balance_loss_mlp": 1.01729143, "epoch": 0.6988125657598077, "flos": 17530081747200.0, "grad_norm": 4.399012607705736, "language_loss": 0.62462044, "learning_rate": 8.783373729494721e-07, "loss": 0.64536035, "num_input_tokens_seen": 250775585, "step": 11623, "time_per_iteration": 2.712000846862793 }, { "auxiliary_loss_clip": 0.01115272, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.03893709, "balance_loss_mlp": 1.01590955, "epoch": 0.6988726890124756, "flos": 39165721136640.0, "grad_norm": 1.8283272495674747, "language_loss": 0.6077522, "learning_rate": 8.780149471723932e-07, "loss": 0.62919706, "num_input_tokens_seen": 250795725, "step": 11624, "time_per_iteration": 2.765336275100708 }, { "auxiliary_loss_clip": 0.01103178, "auxiliary_loss_mlp": 0.01043279, "balance_loss_clip": 1.0379796, "balance_loss_mlp": 1.02959347, "epoch": 0.6989328122651436, "flos": 20193468330240.0, "grad_norm": 3.187254618002262, "language_loss": 0.78135574, "learning_rate": 8.776925639411017e-07, "loss": 0.80282032, "num_input_tokens_seen": 250814555, "step": 11625, "time_per_iteration": 2.673025608062744 }, { "auxiliary_loss_clip": 0.01074244, "auxiliary_loss_mlp": 0.01033485, "balance_loss_clip": 1.03602779, "balance_loss_mlp": 1.0217849, "epoch": 0.6989929355178115, "flos": 21834873152640.0, "grad_norm": 2.0895075471451903, "language_loss": 0.65869164, "learning_rate": 8.773702232678188e-07, "loss": 0.67976898, "num_input_tokens_seen": 250833105, "step": 11626, "time_per_iteration": 4.503946542739868 }, { "auxiliary_loss_clip": 0.01092456, "auxiliary_loss_mlp": 0.00770949, "balance_loss_clip": 1.03971887, "balance_loss_mlp": 1.000265, "epoch": 0.6990530587704795, "flos": 26322880855680.0, "grad_norm": 1.5700426870038287, "language_loss": 0.70198143, "learning_rate": 8.770479251647697e-07, "loss": 0.72061551, "num_input_tokens_seen": 250852570, "step": 11627, "time_per_iteration": 2.7615082263946533 }, { "auxiliary_loss_clip": 0.01110072, "auxiliary_loss_mlp": 0.01029931, "balance_loss_clip": 1.04070234, "balance_loss_mlp": 1.01854658, "epoch": 0.6991131820231474, "flos": 19828975069440.0, "grad_norm": 1.7298548393631112, "language_loss": 0.6256994, "learning_rate": 8.767256696441768e-07, "loss": 0.64709944, "num_input_tokens_seen": 250870500, "step": 11628, "time_per_iteration": 2.5734152793884277 }, { "auxiliary_loss_clip": 0.01103325, "auxiliary_loss_mlp": 0.01035699, "balance_loss_clip": 1.03856647, "balance_loss_mlp": 1.02237749, "epoch": 0.6991733052758154, "flos": 33984817102080.0, "grad_norm": 2.290690432267753, "language_loss": 0.67708141, "learning_rate": 8.764034567182581e-07, "loss": 0.69847167, "num_input_tokens_seen": 250892745, "step": 11629, "time_per_iteration": 5.866469621658325 }, { "auxiliary_loss_clip": 0.01112912, "auxiliary_loss_mlp": 0.01036564, "balance_loss_clip": 1.04074121, "balance_loss_mlp": 1.02318311, "epoch": 0.6992334285284834, "flos": 15633136592640.0, "grad_norm": 1.543023133812331, "language_loss": 0.72312945, "learning_rate": 8.760812863992337e-07, "loss": 0.74462426, "num_input_tokens_seen": 250910225, "step": 11630, "time_per_iteration": 2.657487392425537 }, { "auxiliary_loss_clip": 0.01113352, "auxiliary_loss_mlp": 0.01034612, "balance_loss_clip": 1.04170883, "balance_loss_mlp": 1.02198827, "epoch": 0.6992935517811514, "flos": 21726279360000.0, "grad_norm": 1.59875756731347, "language_loss": 0.73932934, "learning_rate": 8.757591586993196e-07, "loss": 0.76080894, "num_input_tokens_seen": 250929715, "step": 11631, "time_per_iteration": 2.5861480236053467 }, { "auxiliary_loss_clip": 0.01104832, "auxiliary_loss_mlp": 0.01034444, "balance_loss_clip": 1.0415566, "balance_loss_mlp": 1.02083039, "epoch": 0.6993536750338193, "flos": 20115254465280.0, "grad_norm": 2.102391376159487, "language_loss": 0.89547968, "learning_rate": 8.7543707363073e-07, "loss": 0.91687244, "num_input_tokens_seen": 250944230, "step": 11632, "time_per_iteration": 2.590348482131958 }, { "auxiliary_loss_clip": 0.01094827, "auxiliary_loss_mlp": 0.01040482, "balance_loss_clip": 1.04256976, "balance_loss_mlp": 1.02795899, "epoch": 0.6994137982864873, "flos": 22010547594240.0, "grad_norm": 1.8034038956889087, "language_loss": 0.80041152, "learning_rate": 8.751150312056792e-07, "loss": 0.82176459, "num_input_tokens_seen": 250961865, "step": 11633, "time_per_iteration": 2.681643486022949 }, { "auxiliary_loss_clip": 0.011161, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.04005051, "balance_loss_mlp": 1.02334523, "epoch": 0.6994739215391552, "flos": 25519020433920.0, "grad_norm": 1.9182761585422314, "language_loss": 0.67464936, "learning_rate": 8.747930314363794e-07, "loss": 0.69618672, "num_input_tokens_seen": 250982025, "step": 11634, "time_per_iteration": 2.604487419128418 }, { "auxiliary_loss_clip": 0.01010044, "auxiliary_loss_mlp": 0.01002813, "balance_loss_clip": 1.01407039, "balance_loss_mlp": 1.00143051, "epoch": 0.6995340447918232, "flos": 59128357691520.0, "grad_norm": 0.6847666166760457, "language_loss": 0.53152555, "learning_rate": 8.744710743350412e-07, "loss": 0.5516541, "num_input_tokens_seen": 251046900, "step": 11635, "time_per_iteration": 4.9191200733184814 }, { "auxiliary_loss_clip": 0.01086524, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.03990149, "balance_loss_mlp": 1.02167284, "epoch": 0.6995941680444913, "flos": 17967832796160.0, "grad_norm": 1.5113357083257617, "language_loss": 0.81950343, "learning_rate": 8.741491599138726e-07, "loss": 0.84071505, "num_input_tokens_seen": 251065050, "step": 11636, "time_per_iteration": 2.6814749240875244 }, { "auxiliary_loss_clip": 0.01114034, "auxiliary_loss_mlp": 0.01030856, "balance_loss_clip": 1.04003048, "balance_loss_mlp": 1.01799953, "epoch": 0.6996542912971592, "flos": 21980095839360.0, "grad_norm": 4.210307970710786, "language_loss": 0.83255941, "learning_rate": 8.738272881850801e-07, "loss": 0.85400826, "num_input_tokens_seen": 251083355, "step": 11637, "time_per_iteration": 2.6101019382476807 }, { "auxiliary_loss_clip": 0.01063351, "auxiliary_loss_mlp": 0.01039835, "balance_loss_clip": 1.0357244, "balance_loss_mlp": 1.0266149, "epoch": 0.6997144145498272, "flos": 11686158518400.0, "grad_norm": 1.88143316282286, "language_loss": 0.68318653, "learning_rate": 8.735054591608704e-07, "loss": 0.70421839, "num_input_tokens_seen": 251096420, "step": 11638, "time_per_iteration": 2.757967233657837 }, { "auxiliary_loss_clip": 0.0110744, "auxiliary_loss_mlp": 0.01034716, "balance_loss_clip": 1.04054809, "balance_loss_mlp": 1.02038121, "epoch": 0.6997745378024951, "flos": 29607162958080.0, "grad_norm": 2.0225103573047334, "language_loss": 0.77908248, "learning_rate": 8.731836728534459e-07, "loss": 0.80050403, "num_input_tokens_seen": 251115410, "step": 11639, "time_per_iteration": 2.7171573638916016 }, { "auxiliary_loss_clip": 0.01088431, "auxiliary_loss_mlp": 0.01044388, "balance_loss_clip": 1.03905129, "balance_loss_mlp": 1.03095889, "epoch": 0.6998346610551631, "flos": 20886616056960.0, "grad_norm": 2.0145862528244542, "language_loss": 0.82033116, "learning_rate": 8.728619292750093e-07, "loss": 0.84165937, "num_input_tokens_seen": 251133530, "step": 11640, "time_per_iteration": 2.746412515640259 }, { "auxiliary_loss_clip": 0.01079412, "auxiliary_loss_mlp": 0.01033499, "balance_loss_clip": 1.03800678, "balance_loss_mlp": 1.02089286, "epoch": 0.699894784307831, "flos": 27163046949120.0, "grad_norm": 1.988818239892088, "language_loss": 0.75212121, "learning_rate": 8.725402284377619e-07, "loss": 0.77325034, "num_input_tokens_seen": 251153985, "step": 11641, "time_per_iteration": 2.789306879043579 }, { "auxiliary_loss_clip": 0.01089337, "auxiliary_loss_mlp": 0.01024848, "balance_loss_clip": 1.03791595, "balance_loss_mlp": 1.01126993, "epoch": 0.699954907560499, "flos": 20923640000640.0, "grad_norm": 1.9133228013475547, "language_loss": 0.77589947, "learning_rate": 8.722185703539022e-07, "loss": 0.79704136, "num_input_tokens_seen": 251173225, "step": 11642, "time_per_iteration": 2.6469504833221436 }, { "auxiliary_loss_clip": 0.01110134, "auxiliary_loss_mlp": 0.01039612, "balance_loss_clip": 1.04202175, "balance_loss_mlp": 1.02436519, "epoch": 0.700015030813167, "flos": 28657792540800.0, "grad_norm": 1.9777967243613577, "language_loss": 0.74846154, "learning_rate": 8.718969550356266e-07, "loss": 0.76995897, "num_input_tokens_seen": 251192485, "step": 11643, "time_per_iteration": 2.6794352531433105 }, { "auxiliary_loss_clip": 0.01079698, "auxiliary_loss_mlp": 0.01030119, "balance_loss_clip": 1.03809059, "balance_loss_mlp": 1.01665401, "epoch": 0.700075154065835, "flos": 29205286617600.0, "grad_norm": 1.5319096526083835, "language_loss": 0.60467082, "learning_rate": 8.715753824951315e-07, "loss": 0.62576902, "num_input_tokens_seen": 251214965, "step": 11644, "time_per_iteration": 2.7573509216308594 }, { "auxiliary_loss_clip": 0.01098759, "auxiliary_loss_mlp": 0.0103069, "balance_loss_clip": 1.03691936, "balance_loss_mlp": 1.01848316, "epoch": 0.7001352773185029, "flos": 23112431159040.0, "grad_norm": 2.3431210800913203, "language_loss": 0.81582069, "learning_rate": 8.712538527446119e-07, "loss": 0.83711517, "num_input_tokens_seen": 251234500, "step": 11645, "time_per_iteration": 2.6731204986572266 }, { "auxiliary_loss_clip": 0.01102676, "auxiliary_loss_mlp": 0.01031812, "balance_loss_clip": 1.03974915, "balance_loss_mlp": 1.01880574, "epoch": 0.7001954005711709, "flos": 21322858734720.0, "grad_norm": 2.1357575492399143, "language_loss": 0.68504727, "learning_rate": 8.709323657962584e-07, "loss": 0.70639217, "num_input_tokens_seen": 251254360, "step": 11646, "time_per_iteration": 2.622621774673462 }, { "auxiliary_loss_clip": 0.01096745, "auxiliary_loss_mlp": 0.01045056, "balance_loss_clip": 1.03817129, "balance_loss_mlp": 1.03125119, "epoch": 0.7002555238238388, "flos": 24535822383360.0, "grad_norm": 1.6406688140332686, "language_loss": 0.71264708, "learning_rate": 8.706109216622635e-07, "loss": 0.73406506, "num_input_tokens_seen": 251274790, "step": 11647, "time_per_iteration": 2.627837896347046 }, { "auxiliary_loss_clip": 0.01105019, "auxiliary_loss_mlp": 0.01036652, "balance_loss_clip": 1.041466, "balance_loss_mlp": 1.02333069, "epoch": 0.7003156470765068, "flos": 39056552726400.0, "grad_norm": 1.5459607229268986, "language_loss": 0.71446347, "learning_rate": 8.702895203548155e-07, "loss": 0.7358802, "num_input_tokens_seen": 251296275, "step": 11648, "time_per_iteration": 2.753802537918091 }, { "auxiliary_loss_clip": 0.01057301, "auxiliary_loss_mlp": 0.01037599, "balance_loss_clip": 1.03298402, "balance_loss_mlp": 1.02418768, "epoch": 0.7003757703291749, "flos": 28804092635520.0, "grad_norm": 1.667578697289853, "language_loss": 0.77163005, "learning_rate": 8.699681618861014e-07, "loss": 0.79257905, "num_input_tokens_seen": 251317375, "step": 11649, "time_per_iteration": 2.7761147022247314 }, { "auxiliary_loss_clip": 0.01090081, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.03838015, "balance_loss_mlp": 1.02049243, "epoch": 0.7004358935818428, "flos": 15953854152960.0, "grad_norm": 2.229815779289175, "language_loss": 0.787054, "learning_rate": 8.69646846268308e-07, "loss": 0.80828524, "num_input_tokens_seen": 251333570, "step": 11650, "time_per_iteration": 2.651338815689087 }, { "auxiliary_loss_clip": 0.01087246, "auxiliary_loss_mlp": 0.01026359, "balance_loss_clip": 1.03717887, "balance_loss_mlp": 1.01436639, "epoch": 0.7004960168345108, "flos": 20411984718720.0, "grad_norm": 2.805466583174802, "language_loss": 0.78653586, "learning_rate": 8.693255735136194e-07, "loss": 0.8076719, "num_input_tokens_seen": 251351070, "step": 11651, "time_per_iteration": 2.650684118270874 }, { "auxiliary_loss_clip": 0.01078764, "auxiliary_loss_mlp": 0.01048293, "balance_loss_clip": 1.03785074, "balance_loss_mlp": 1.03431594, "epoch": 0.7005561400871787, "flos": 17347547808000.0, "grad_norm": 1.6343941081799256, "language_loss": 0.69484842, "learning_rate": 8.690043436342198e-07, "loss": 0.71611905, "num_input_tokens_seen": 251370005, "step": 11652, "time_per_iteration": 2.807304859161377 }, { "auxiliary_loss_clip": 0.01104104, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.04046094, "balance_loss_mlp": 1.01811981, "epoch": 0.7006162633398467, "flos": 25302120157440.0, "grad_norm": 1.3561275324415532, "language_loss": 0.74232221, "learning_rate": 8.686831566422874e-07, "loss": 0.7636776, "num_input_tokens_seen": 251391210, "step": 11653, "time_per_iteration": 2.696967601776123 }, { "auxiliary_loss_clip": 0.01087115, "auxiliary_loss_mlp": 0.01035907, "balance_loss_clip": 1.03951633, "balance_loss_mlp": 1.02182245, "epoch": 0.7006763865925146, "flos": 20668997508480.0, "grad_norm": 2.1100512473261808, "language_loss": 0.70994234, "learning_rate": 8.68362012550003e-07, "loss": 0.73117256, "num_input_tokens_seen": 251411505, "step": 11654, "time_per_iteration": 2.6838555335998535 }, { "auxiliary_loss_clip": 0.01066217, "auxiliary_loss_mlp": 0.01033137, "balance_loss_clip": 1.03629136, "balance_loss_mlp": 1.0182364, "epoch": 0.7007365098451827, "flos": 20046449963520.0, "grad_norm": 2.5073875946500093, "language_loss": 0.73771894, "learning_rate": 8.680409113695453e-07, "loss": 0.75871241, "num_input_tokens_seen": 251428975, "step": 11655, "time_per_iteration": 2.7410359382629395 }, { "auxiliary_loss_clip": 0.01111257, "auxiliary_loss_mlp": 0.01039302, "balance_loss_clip": 1.04240656, "balance_loss_mlp": 1.02404356, "epoch": 0.7007966330978506, "flos": 20777375819520.0, "grad_norm": 1.9005607339243875, "language_loss": 0.70418394, "learning_rate": 8.677198531130889e-07, "loss": 0.72568953, "num_input_tokens_seen": 251446940, "step": 11656, "time_per_iteration": 2.731491804122925 }, { "auxiliary_loss_clip": 0.01066256, "auxiliary_loss_mlp": 0.01032163, "balance_loss_clip": 1.03605461, "balance_loss_mlp": 1.0202893, "epoch": 0.7008567563505186, "flos": 29638189330560.0, "grad_norm": 1.6373792531081708, "language_loss": 0.7814554, "learning_rate": 8.673988377928092e-07, "loss": 0.80243957, "num_input_tokens_seen": 251466205, "step": 11657, "time_per_iteration": 2.77717924118042 }, { "auxiliary_loss_clip": 0.01118749, "auxiliary_loss_mlp": 0.010372, "balance_loss_clip": 1.04163647, "balance_loss_mlp": 1.02257895, "epoch": 0.7009168796031865, "flos": 17092007475840.0, "grad_norm": 1.9768682726826163, "language_loss": 0.78330362, "learning_rate": 8.670778654208797e-07, "loss": 0.8048631, "num_input_tokens_seen": 251484820, "step": 11658, "time_per_iteration": 2.6049365997314453 }, { "auxiliary_loss_clip": 0.01086248, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.03589261, "balance_loss_mlp": 1.01928139, "epoch": 0.7009770028558545, "flos": 20448972748800.0, "grad_norm": 5.565674692031433, "language_loss": 0.82623971, "learning_rate": 8.667569360094713e-07, "loss": 0.84742308, "num_input_tokens_seen": 251502670, "step": 11659, "time_per_iteration": 2.686923027038574 }, { "auxiliary_loss_clip": 0.01069607, "auxiliary_loss_mlp": 0.01030844, "balance_loss_clip": 1.03661668, "balance_loss_mlp": 1.0180831, "epoch": 0.7010371261085224, "flos": 19245139407360.0, "grad_norm": 1.865164954565192, "language_loss": 0.6914413, "learning_rate": 8.664360495707526e-07, "loss": 0.71244586, "num_input_tokens_seen": 251521630, "step": 11660, "time_per_iteration": 2.6798696517944336 }, { "auxiliary_loss_clip": 0.01114876, "auxiliary_loss_mlp": 0.01039123, "balance_loss_clip": 1.03920841, "balance_loss_mlp": 1.02499104, "epoch": 0.7010972493611904, "flos": 22127581082880.0, "grad_norm": 1.6974567931874, "language_loss": 0.81309623, "learning_rate": 8.661152061168924e-07, "loss": 0.83463621, "num_input_tokens_seen": 251540105, "step": 11661, "time_per_iteration": 2.665506601333618 }, { "auxiliary_loss_clip": 0.01100544, "auxiliary_loss_mlp": 0.01036779, "balance_loss_clip": 1.03771257, "balance_loss_mlp": 1.02428651, "epoch": 0.7011573726138585, "flos": 31391132860800.0, "grad_norm": 1.6549167062780274, "language_loss": 0.79250038, "learning_rate": 8.657944056600579e-07, "loss": 0.81387359, "num_input_tokens_seen": 251560530, "step": 11662, "time_per_iteration": 2.747738838195801 }, { "auxiliary_loss_clip": 0.01099278, "auxiliary_loss_mlp": 0.01034891, "balance_loss_clip": 1.03757107, "balance_loss_mlp": 1.02009749, "epoch": 0.7012174958665264, "flos": 18150582216960.0, "grad_norm": 1.8518490996849224, "language_loss": 0.83547205, "learning_rate": 8.654736482124134e-07, "loss": 0.85681379, "num_input_tokens_seen": 251577930, "step": 11663, "time_per_iteration": 2.631399631500244 }, { "auxiliary_loss_clip": 0.01021926, "auxiliary_loss_mlp": 0.00999736, "balance_loss_clip": 1.00981212, "balance_loss_mlp": 0.99871653, "epoch": 0.7012776191191944, "flos": 60651256567680.0, "grad_norm": 0.8199034033651936, "language_loss": 0.5377062, "learning_rate": 8.651529337861209e-07, "loss": 0.55792284, "num_input_tokens_seen": 251638820, "step": 11664, "time_per_iteration": 3.219939708709717 }, { "auxiliary_loss_clip": 0.01091352, "auxiliary_loss_mlp": 0.01036957, "balance_loss_clip": 1.03675961, "balance_loss_mlp": 1.02283645, "epoch": 0.7013377423718623, "flos": 27198598435200.0, "grad_norm": 25.234897895477353, "language_loss": 0.78593969, "learning_rate": 8.64832262393344e-07, "loss": 0.80722272, "num_input_tokens_seen": 251658070, "step": 11665, "time_per_iteration": 2.7333061695098877 }, { "auxiliary_loss_clip": 0.01097626, "auxiliary_loss_mlp": 0.01033659, "balance_loss_clip": 1.03675759, "balance_loss_mlp": 1.02039695, "epoch": 0.7013978656245303, "flos": 16543543731840.0, "grad_norm": 3.3099112213098576, "language_loss": 0.76706922, "learning_rate": 8.645116340462404e-07, "loss": 0.78838205, "num_input_tokens_seen": 251671575, "step": 11666, "time_per_iteration": 4.164456844329834 }, { "auxiliary_loss_clip": 0.0109964, "auxiliary_loss_mlp": 0.01033563, "balance_loss_clip": 1.03881526, "balance_loss_mlp": 1.02059937, "epoch": 0.7014579888771982, "flos": 23143780753920.0, "grad_norm": 1.9172711089554313, "language_loss": 0.81507015, "learning_rate": 8.641910487569695e-07, "loss": 0.83640218, "num_input_tokens_seen": 251689350, "step": 11667, "time_per_iteration": 2.6507012844085693 }, { "auxiliary_loss_clip": 0.01080493, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.03758526, "balance_loss_mlp": 1.02586019, "epoch": 0.7015181121298663, "flos": 25082095397760.0, "grad_norm": 4.617945846615331, "language_loss": 0.65072989, "learning_rate": 8.638705065376879e-07, "loss": 0.67193091, "num_input_tokens_seen": 251704635, "step": 11668, "time_per_iteration": 4.238234758377075 }, { "auxiliary_loss_clip": 0.01094365, "auxiliary_loss_mlp": 0.0102871, "balance_loss_clip": 1.03865385, "balance_loss_mlp": 1.01505494, "epoch": 0.7015782353825342, "flos": 23327894891520.0, "grad_norm": 2.259598520998094, "language_loss": 0.7661069, "learning_rate": 8.635500074005519e-07, "loss": 0.78733766, "num_input_tokens_seen": 251723035, "step": 11669, "time_per_iteration": 4.344635248184204 }, { "auxiliary_loss_clip": 0.01013949, "auxiliary_loss_mlp": 0.0100684, "balance_loss_clip": 1.00989032, "balance_loss_mlp": 1.00561166, "epoch": 0.7016383586352022, "flos": 70397161107840.0, "grad_norm": 0.6970325216386312, "language_loss": 0.54508567, "learning_rate": 8.632295513577122e-07, "loss": 0.56529355, "num_input_tokens_seen": 251791630, "step": 11670, "time_per_iteration": 3.3269011974334717 }, { "auxiliary_loss_clip": 0.01088398, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.04069412, "balance_loss_mlp": 1.03119707, "epoch": 0.7016984818878701, "flos": 19792274348160.0, "grad_norm": 1.7508124659386841, "language_loss": 0.81738812, "learning_rate": 8.629091384213218e-07, "loss": 0.83871627, "num_input_tokens_seen": 251809840, "step": 11671, "time_per_iteration": 2.622065544128418 }, { "auxiliary_loss_clip": 0.0110729, "auxiliary_loss_mlp": 0.01033835, "balance_loss_clip": 1.04194474, "balance_loss_mlp": 1.02070975, "epoch": 0.7017586051405381, "flos": 12896923184640.0, "grad_norm": 2.274862917429984, "language_loss": 0.75504148, "learning_rate": 8.625887686035313e-07, "loss": 0.77645272, "num_input_tokens_seen": 251827550, "step": 11672, "time_per_iteration": 2.6540980339050293 }, { "auxiliary_loss_clip": 0.01096652, "auxiliary_loss_mlp": 0.01034604, "balance_loss_clip": 1.03700793, "balance_loss_mlp": 1.02045953, "epoch": 0.701818728393206, "flos": 18332828847360.0, "grad_norm": 1.5989851774558104, "language_loss": 0.87145984, "learning_rate": 8.622684419164883e-07, "loss": 0.8927725, "num_input_tokens_seen": 251844880, "step": 11673, "time_per_iteration": 2.8490025997161865 }, { "auxiliary_loss_clip": 0.01096229, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.0356524, "balance_loss_mlp": 1.01583493, "epoch": 0.701878851645874, "flos": 17384212615680.0, "grad_norm": 1.8214270702877817, "language_loss": 0.73174304, "learning_rate": 8.619481583723399e-07, "loss": 0.75299847, "num_input_tokens_seen": 251861025, "step": 11674, "time_per_iteration": 2.679823160171509 }, { "auxiliary_loss_clip": 0.01096759, "auxiliary_loss_mlp": 0.00769911, "balance_loss_clip": 1.04201114, "balance_loss_mlp": 1.00022173, "epoch": 0.701938974898542, "flos": 23915501481600.0, "grad_norm": 1.622926679018359, "language_loss": 0.72171724, "learning_rate": 8.616279179832329e-07, "loss": 0.74038392, "num_input_tokens_seen": 251880175, "step": 11675, "time_per_iteration": 4.312408447265625 }, { "auxiliary_loss_clip": 0.01074264, "auxiliary_loss_mlp": 0.01030849, "balance_loss_clip": 1.03631043, "balance_loss_mlp": 1.01713443, "epoch": 0.70199909815121, "flos": 21795586652160.0, "grad_norm": 2.2219041729549143, "language_loss": 0.51501888, "learning_rate": 8.613077207613078e-07, "loss": 0.53606999, "num_input_tokens_seen": 251899005, "step": 11676, "time_per_iteration": 2.6962332725524902 }, { "auxiliary_loss_clip": 0.01010504, "auxiliary_loss_mlp": 0.0075156, "balance_loss_clip": 1.00856614, "balance_loss_mlp": 0.99960405, "epoch": 0.702059221403878, "flos": 71715047109120.0, "grad_norm": 0.7294989296672769, "language_loss": 0.59194738, "learning_rate": 8.609875667187079e-07, "loss": 0.60956806, "num_input_tokens_seen": 251966790, "step": 11677, "time_per_iteration": 3.283904552459717 }, { "auxiliary_loss_clip": 0.01100162, "auxiliary_loss_mlp": 0.01038032, "balance_loss_clip": 1.03780138, "balance_loss_mlp": 1.02260053, "epoch": 0.7021193446565459, "flos": 28111052649600.0, "grad_norm": 2.3043069619356333, "language_loss": 0.62869537, "learning_rate": 8.606674558675737e-07, "loss": 0.65007722, "num_input_tokens_seen": 251989315, "step": 11678, "time_per_iteration": 2.683986186981201 }, { "auxiliary_loss_clip": 0.01114626, "auxiliary_loss_mlp": 0.01034461, "balance_loss_clip": 1.04092705, "balance_loss_mlp": 1.02130055, "epoch": 0.7021794679092139, "flos": 22924905229440.0, "grad_norm": 1.616922175472628, "language_loss": 0.79195565, "learning_rate": 8.603473882200444e-07, "loss": 0.81344652, "num_input_tokens_seen": 252006620, "step": 11679, "time_per_iteration": 2.6574866771698 }, { "auxiliary_loss_clip": 0.01084266, "auxiliary_loss_mlp": 0.01048047, "balance_loss_clip": 1.03683782, "balance_loss_mlp": 1.03429675, "epoch": 0.7022395911618818, "flos": 18077827219200.0, "grad_norm": 2.0679583940680746, "language_loss": 0.70934772, "learning_rate": 8.600273637882567e-07, "loss": 0.73067081, "num_input_tokens_seen": 252024570, "step": 11680, "time_per_iteration": 2.7358908653259277 }, { "auxiliary_loss_clip": 0.01074807, "auxiliary_loss_mlp": 0.010398, "balance_loss_clip": 1.03587687, "balance_loss_mlp": 1.02517891, "epoch": 0.7022997144145499, "flos": 16034294661120.0, "grad_norm": 1.6247051825758914, "language_loss": 0.74976349, "learning_rate": 8.597073825843446e-07, "loss": 0.77090955, "num_input_tokens_seen": 252042775, "step": 11681, "time_per_iteration": 2.774574041366577 }, { "auxiliary_loss_clip": 0.01094616, "auxiliary_loss_mlp": 0.0103624, "balance_loss_clip": 1.03856039, "balance_loss_mlp": 1.0238483, "epoch": 0.7023598376672178, "flos": 26468678160000.0, "grad_norm": 1.575109913537797, "language_loss": 0.76865822, "learning_rate": 8.593874446204434e-07, "loss": 0.78996682, "num_input_tokens_seen": 252063690, "step": 11682, "time_per_iteration": 2.7486395835876465 }, { "auxiliary_loss_clip": 0.01082555, "auxiliary_loss_mlp": 0.00772032, "balance_loss_clip": 1.03884804, "balance_loss_mlp": 1.00019991, "epoch": 0.7024199609198858, "flos": 17055917285760.0, "grad_norm": 2.008069790408466, "language_loss": 0.737746, "learning_rate": 8.590675499086841e-07, "loss": 0.75629187, "num_input_tokens_seen": 252080335, "step": 11683, "time_per_iteration": 2.744171142578125 }, { "auxiliary_loss_clip": 0.01079915, "auxiliary_loss_mlp": 0.01035548, "balance_loss_clip": 1.03894246, "balance_loss_mlp": 1.02157617, "epoch": 0.7024800841725537, "flos": 25849039616640.0, "grad_norm": 1.8760496906578064, "language_loss": 0.71592307, "learning_rate": 8.587476984611976e-07, "loss": 0.73707771, "num_input_tokens_seen": 252101075, "step": 11684, "time_per_iteration": 2.88992977142334 }, { "auxiliary_loss_clip": 0.01104368, "auxiliary_loss_mlp": 0.01036296, "balance_loss_clip": 1.03960109, "balance_loss_mlp": 1.02242017, "epoch": 0.7025402074252217, "flos": 23513014609920.0, "grad_norm": 1.7667874173043265, "language_loss": 0.71676773, "learning_rate": 8.584278902901128e-07, "loss": 0.73817438, "num_input_tokens_seen": 252120510, "step": 11685, "time_per_iteration": 2.7373046875 }, { "auxiliary_loss_clip": 0.01101099, "auxiliary_loss_mlp": 0.01033615, "balance_loss_clip": 1.03761411, "balance_loss_mlp": 1.02074599, "epoch": 0.7026003306778896, "flos": 20150985519360.0, "grad_norm": 2.3305068980612695, "language_loss": 0.84660101, "learning_rate": 8.581081254075582e-07, "loss": 0.86794817, "num_input_tokens_seen": 252137590, "step": 11686, "time_per_iteration": 2.6728014945983887 }, { "auxiliary_loss_clip": 0.0101853, "auxiliary_loss_mlp": 0.01001761, "balance_loss_clip": 1.00980115, "balance_loss_mlp": 1.00045574, "epoch": 0.7026604539305576, "flos": 64772400712320.0, "grad_norm": 1.2905405547920359, "language_loss": 0.69901091, "learning_rate": 8.577884038256566e-07, "loss": 0.71921384, "num_input_tokens_seen": 252199830, "step": 11687, "time_per_iteration": 3.3107638359069824 }, { "auxiliary_loss_clip": 0.01076554, "auxiliary_loss_mlp": 0.01032291, "balance_loss_clip": 1.03496408, "balance_loss_mlp": 1.01832569, "epoch": 0.7027205771832256, "flos": 21871466133120.0, "grad_norm": 1.932037995437553, "language_loss": 0.7684707, "learning_rate": 8.574687255565329e-07, "loss": 0.78955913, "num_input_tokens_seen": 252217200, "step": 11688, "time_per_iteration": 2.7459444999694824 }, { "auxiliary_loss_clip": 0.0111428, "auxiliary_loss_mlp": 0.0103517, "balance_loss_clip": 1.04030085, "balance_loss_mlp": 1.02199149, "epoch": 0.7027807004358936, "flos": 23367791923200.0, "grad_norm": 2.3717928399741117, "language_loss": 0.68631124, "learning_rate": 8.571490906123107e-07, "loss": 0.70780575, "num_input_tokens_seen": 252236105, "step": 11689, "time_per_iteration": 2.615769624710083 }, { "auxiliary_loss_clip": 0.01092147, "auxiliary_loss_mlp": 0.01039659, "balance_loss_clip": 1.03881717, "balance_loss_mlp": 1.02628398, "epoch": 0.7028408236885616, "flos": 15304266645120.0, "grad_norm": 2.2678896966391293, "language_loss": 0.79724276, "learning_rate": 8.568294990051086e-07, "loss": 0.81856084, "num_input_tokens_seen": 252253315, "step": 11690, "time_per_iteration": 2.752448081970215 }, { "auxiliary_loss_clip": 0.01114987, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.04160511, "balance_loss_mlp": 1.02232075, "epoch": 0.7029009469412295, "flos": 22018197191040.0, "grad_norm": 1.8737745525801948, "language_loss": 0.76049984, "learning_rate": 8.56509950747047e-07, "loss": 0.78200579, "num_input_tokens_seen": 252272765, "step": 11691, "time_per_iteration": 2.6119184494018555 }, { "auxiliary_loss_clip": 0.0108875, "auxiliary_loss_mlp": 0.01032492, "balance_loss_clip": 1.03811002, "balance_loss_mlp": 1.01972437, "epoch": 0.7029610701938975, "flos": 21835519597440.0, "grad_norm": 9.903733322151682, "language_loss": 0.81749791, "learning_rate": 8.561904458502429e-07, "loss": 0.83871031, "num_input_tokens_seen": 252290510, "step": 11692, "time_per_iteration": 2.69521427154541 }, { "auxiliary_loss_clip": 0.0108957, "auxiliary_loss_mlp": 0.01032565, "balance_loss_clip": 1.03853154, "balance_loss_mlp": 1.01875424, "epoch": 0.7030211934465654, "flos": 19135647774720.0, "grad_norm": 1.5579267825971022, "language_loss": 0.76325333, "learning_rate": 8.558709843268111e-07, "loss": 0.78447467, "num_input_tokens_seen": 252309365, "step": 11693, "time_per_iteration": 2.6727678775787354 }, { "auxiliary_loss_clip": 0.01089511, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.04170704, "balance_loss_mlp": 1.02457595, "epoch": 0.7030813166992335, "flos": 38546010766080.0, "grad_norm": 1.4223920880081815, "language_loss": 0.68617809, "learning_rate": 8.55551566188866e-07, "loss": 0.70745289, "num_input_tokens_seen": 252333010, "step": 11694, "time_per_iteration": 2.858931541442871 }, { "auxiliary_loss_clip": 0.0111374, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.03941369, "balance_loss_mlp": 1.02133918, "epoch": 0.7031414399519014, "flos": 14720897859840.0, "grad_norm": 2.09445306191262, "language_loss": 0.75264633, "learning_rate": 8.552321914485203e-07, "loss": 0.77412868, "num_input_tokens_seen": 252351330, "step": 11695, "time_per_iteration": 2.631002902984619 }, { "auxiliary_loss_clip": 0.01092725, "auxiliary_loss_mlp": 0.01042661, "balance_loss_clip": 1.04262757, "balance_loss_mlp": 1.02838016, "epoch": 0.7032015632045694, "flos": 14027247342720.0, "grad_norm": 2.095793582645169, "language_loss": 0.73874116, "learning_rate": 8.549128601178852e-07, "loss": 0.760095, "num_input_tokens_seen": 252369580, "step": 11696, "time_per_iteration": 2.7669694423675537 }, { "auxiliary_loss_clip": 0.01097157, "auxiliary_loss_mlp": 0.01032154, "balance_loss_clip": 1.0388881, "balance_loss_mlp": 1.01825988, "epoch": 0.7032616864572373, "flos": 27637175496960.0, "grad_norm": 1.7096974428239413, "language_loss": 0.75290072, "learning_rate": 8.545935722090693e-07, "loss": 0.77419376, "num_input_tokens_seen": 252390525, "step": 11697, "time_per_iteration": 2.763500928878784 }, { "auxiliary_loss_clip": 0.01063183, "auxiliary_loss_mlp": 0.01044698, "balance_loss_clip": 1.03909528, "balance_loss_mlp": 1.02815211, "epoch": 0.7033218097099053, "flos": 17967294092160.0, "grad_norm": 1.8043330597848055, "language_loss": 0.81064868, "learning_rate": 8.542743277341793e-07, "loss": 0.8317275, "num_input_tokens_seen": 252407470, "step": 11698, "time_per_iteration": 2.869485378265381 }, { "auxiliary_loss_clip": 0.01087007, "auxiliary_loss_mlp": 0.01041792, "balance_loss_clip": 1.03696036, "balance_loss_mlp": 1.02600873, "epoch": 0.7033819329625732, "flos": 19501721233920.0, "grad_norm": 1.788545707110732, "language_loss": 0.84702611, "learning_rate": 8.539551267053222e-07, "loss": 0.86831409, "num_input_tokens_seen": 252427025, "step": 11699, "time_per_iteration": 2.664696216583252 }, { "auxiliary_loss_clip": 0.01097664, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.03910065, "balance_loss_mlp": 1.02029788, "epoch": 0.7034420562152413, "flos": 23987645948160.0, "grad_norm": 2.037813318278341, "language_loss": 0.78878331, "learning_rate": 8.53635969134601e-07, "loss": 0.81010973, "num_input_tokens_seen": 252445410, "step": 11700, "time_per_iteration": 2.6491341590881348 }, { "auxiliary_loss_clip": 0.01104199, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.04006886, "balance_loss_mlp": 1.01655197, "epoch": 0.7035021794679092, "flos": 35043427756800.0, "grad_norm": 1.9483737724471917, "language_loss": 0.74603212, "learning_rate": 8.533168550341186e-07, "loss": 0.76737982, "num_input_tokens_seen": 252463905, "step": 11701, "time_per_iteration": 2.75663423538208 }, { "auxiliary_loss_clip": 0.0110842, "auxiliary_loss_mlp": 0.01031935, "balance_loss_clip": 1.04136193, "balance_loss_mlp": 1.01701057, "epoch": 0.7035623027205772, "flos": 10997428164480.0, "grad_norm": 2.4684106998326913, "language_loss": 0.84602612, "learning_rate": 8.529977844159769e-07, "loss": 0.86742967, "num_input_tokens_seen": 252478655, "step": 11702, "time_per_iteration": 2.691843032836914 }, { "auxiliary_loss_clip": 0.01114954, "auxiliary_loss_mlp": 0.01040813, "balance_loss_clip": 1.03995621, "balance_loss_mlp": 1.02679968, "epoch": 0.7036224259732452, "flos": 23623727304960.0, "grad_norm": 2.401456792119983, "language_loss": 0.61207104, "learning_rate": 8.526787572922738e-07, "loss": 0.63362873, "num_input_tokens_seen": 252498740, "step": 11703, "time_per_iteration": 2.6257216930389404 }, { "auxiliary_loss_clip": 0.01112246, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.03811026, "balance_loss_mlp": 1.01622462, "epoch": 0.7036825492259131, "flos": 31686175175040.0, "grad_norm": 1.8586997056929888, "language_loss": 0.61509585, "learning_rate": 8.523597736751067e-07, "loss": 0.63652194, "num_input_tokens_seen": 252517800, "step": 11704, "time_per_iteration": 2.6846559047698975 }, { "auxiliary_loss_clip": 0.01096047, "auxiliary_loss_mlp": 0.01032429, "balance_loss_clip": 1.0392369, "balance_loss_mlp": 1.02010882, "epoch": 0.7037426724785811, "flos": 30192866127360.0, "grad_norm": 1.6136715073341366, "language_loss": 0.70614809, "learning_rate": 8.520408335765719e-07, "loss": 0.72743285, "num_input_tokens_seen": 252539620, "step": 11705, "time_per_iteration": 4.335879564285278 }, { "auxiliary_loss_clip": 0.01103119, "auxiliary_loss_mlp": 0.01036985, "balance_loss_clip": 1.04067218, "balance_loss_mlp": 1.02324617, "epoch": 0.703802795731249, "flos": 24311523905280.0, "grad_norm": 1.8826981498859905, "language_loss": 0.61822981, "learning_rate": 8.517219370087645e-07, "loss": 0.63963085, "num_input_tokens_seen": 252557300, "step": 11706, "time_per_iteration": 2.593494176864624 }, { "auxiliary_loss_clip": 0.01106671, "auxiliary_loss_mlp": 0.01030439, "balance_loss_clip": 1.04123783, "balance_loss_mlp": 1.01777911, "epoch": 0.7038629189839171, "flos": 22528954632960.0, "grad_norm": 2.04643987571859, "language_loss": 0.67915642, "learning_rate": 8.514030839837756e-07, "loss": 0.70052749, "num_input_tokens_seen": 252576715, "step": 11707, "time_per_iteration": 2.7215147018432617 }, { "auxiliary_loss_clip": 0.01112969, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.04011774, "balance_loss_mlp": 1.01814246, "epoch": 0.703923042236585, "flos": 26250484993920.0, "grad_norm": 1.7862705599659854, "language_loss": 0.76583481, "learning_rate": 8.510842745136974e-07, "loss": 0.78727371, "num_input_tokens_seen": 252596190, "step": 11708, "time_per_iteration": 4.144139051437378 }, { "auxiliary_loss_clip": 0.01090944, "auxiliary_loss_mlp": 0.01034476, "balance_loss_clip": 1.03818655, "balance_loss_mlp": 1.02149391, "epoch": 0.703983165489253, "flos": 19390254353280.0, "grad_norm": 1.9126700939118615, "language_loss": 0.72069716, "learning_rate": 8.50765508610619e-07, "loss": 0.74195135, "num_input_tokens_seen": 252613410, "step": 11709, "time_per_iteration": 4.317174911499023 }, { "auxiliary_loss_clip": 0.01103216, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.04039192, "balance_loss_mlp": 1.01939988, "epoch": 0.7040432887419209, "flos": 16683630773760.0, "grad_norm": 2.4201158088388337, "language_loss": 0.78757358, "learning_rate": 8.504467862866267e-07, "loss": 0.80892646, "num_input_tokens_seen": 252629150, "step": 11710, "time_per_iteration": 2.6521589756011963 }, { "auxiliary_loss_clip": 0.01106607, "auxiliary_loss_mlp": 0.01035061, "balance_loss_clip": 1.04086101, "balance_loss_mlp": 1.02094674, "epoch": 0.7041034119945889, "flos": 21141402203520.0, "grad_norm": 1.5478201961623483, "language_loss": 0.77396274, "learning_rate": 8.501281075538076e-07, "loss": 0.7953794, "num_input_tokens_seen": 252648225, "step": 11711, "time_per_iteration": 2.673774242401123 }, { "auxiliary_loss_clip": 0.01077655, "auxiliary_loss_mlp": 0.01031566, "balance_loss_clip": 1.03709733, "balance_loss_mlp": 1.01935935, "epoch": 0.7041635352472568, "flos": 16910299549440.0, "grad_norm": 2.4253841205918945, "language_loss": 0.74240053, "learning_rate": 8.498094724242457e-07, "loss": 0.7634927, "num_input_tokens_seen": 252665380, "step": 11712, "time_per_iteration": 2.7232208251953125 }, { "auxiliary_loss_clip": 0.00994093, "auxiliary_loss_mlp": 0.01000365, "balance_loss_clip": 1.0117116, "balance_loss_mlp": 0.99926871, "epoch": 0.7042236584999249, "flos": 71681219475840.0, "grad_norm": 0.8868310854542714, "language_loss": 0.64613295, "learning_rate": 8.494908809100247e-07, "loss": 0.66607749, "num_input_tokens_seen": 252727950, "step": 11713, "time_per_iteration": 3.285946846008301 }, { "auxiliary_loss_clip": 0.01098435, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.03652644, "balance_loss_mlp": 1.01949096, "epoch": 0.7042837817525928, "flos": 28658187590400.0, "grad_norm": 2.013493705916732, "language_loss": 0.73046267, "learning_rate": 8.49172333023225e-07, "loss": 0.75177109, "num_input_tokens_seen": 252746770, "step": 11714, "time_per_iteration": 4.236090898513794 }, { "auxiliary_loss_clip": 0.01087938, "auxiliary_loss_mlp": 0.0077181, "balance_loss_clip": 1.03839374, "balance_loss_mlp": 1.00026381, "epoch": 0.7043439050052608, "flos": 19753562465280.0, "grad_norm": 1.7093963248736088, "language_loss": 0.79507661, "learning_rate": 8.488538287759248e-07, "loss": 0.81367409, "num_input_tokens_seen": 252765610, "step": 11715, "time_per_iteration": 2.6581244468688965 }, { "auxiliary_loss_clip": 0.01084772, "auxiliary_loss_mlp": 0.0104235, "balance_loss_clip": 1.03780401, "balance_loss_mlp": 1.02811062, "epoch": 0.7044040282579288, "flos": 11538529620480.0, "grad_norm": 2.525582703515887, "language_loss": 0.71628869, "learning_rate": 8.485353681802037e-07, "loss": 0.73755985, "num_input_tokens_seen": 252781610, "step": 11716, "time_per_iteration": 2.6553633213043213 }, { "auxiliary_loss_clip": 0.0108292, "auxiliary_loss_mlp": 0.01035705, "balance_loss_clip": 1.04328799, "balance_loss_mlp": 1.02210903, "epoch": 0.7044641515105967, "flos": 33656126722560.0, "grad_norm": 1.9767167849127631, "language_loss": 0.66507739, "learning_rate": 8.482169512481358e-07, "loss": 0.68626368, "num_input_tokens_seen": 252800600, "step": 11717, "time_per_iteration": 2.8526103496551514 }, { "auxiliary_loss_clip": 0.01115695, "auxiliary_loss_mlp": 0.01028921, "balance_loss_clip": 1.04154098, "balance_loss_mlp": 1.01575446, "epoch": 0.7045242747632647, "flos": 26723859356160.0, "grad_norm": 1.3833751412057287, "language_loss": 0.74381793, "learning_rate": 8.478985779917967e-07, "loss": 0.76526403, "num_input_tokens_seen": 252822310, "step": 11718, "time_per_iteration": 2.6526429653167725 }, { "auxiliary_loss_clip": 0.0110132, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.039011, "balance_loss_mlp": 1.02069247, "epoch": 0.7045843980159326, "flos": 26797655848320.0, "grad_norm": 2.395703855617911, "language_loss": 0.79719883, "learning_rate": 8.475802484232606e-07, "loss": 0.81854498, "num_input_tokens_seen": 252842355, "step": 11719, "time_per_iteration": 2.66690731048584 }, { "auxiliary_loss_clip": 0.01105187, "auxiliary_loss_mlp": 0.01041375, "balance_loss_clip": 1.04220223, "balance_loss_mlp": 1.02782118, "epoch": 0.7046445212686007, "flos": 41574824363520.0, "grad_norm": 1.733178571450649, "language_loss": 0.65760505, "learning_rate": 8.472619625545951e-07, "loss": 0.67907059, "num_input_tokens_seen": 252866785, "step": 11720, "time_per_iteration": 2.808574914932251 }, { "auxiliary_loss_clip": 0.0109618, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.04084325, "balance_loss_mlp": 1.01776671, "epoch": 0.7047046445212686, "flos": 15560166113280.0, "grad_norm": 2.168655366214879, "language_loss": 0.80443633, "learning_rate": 8.46943720397872e-07, "loss": 0.8257128, "num_input_tokens_seen": 252881870, "step": 11721, "time_per_iteration": 2.7525858879089355 }, { "auxiliary_loss_clip": 0.01001442, "auxiliary_loss_mlp": 0.00998843, "balance_loss_clip": 1.0093838, "balance_loss_mlp": 0.99760932, "epoch": 0.7047647677739366, "flos": 70410269571840.0, "grad_norm": 0.7632832348458642, "language_loss": 0.64800274, "learning_rate": 8.466255219651582e-07, "loss": 0.66800559, "num_input_tokens_seen": 252951300, "step": 11722, "time_per_iteration": 3.413194179534912 }, { "auxiliary_loss_clip": 0.010923, "auxiliary_loss_mlp": 0.01034777, "balance_loss_clip": 1.03915524, "balance_loss_mlp": 1.02249277, "epoch": 0.7048248910266045, "flos": 23660032976640.0, "grad_norm": 1.7290381066238394, "language_loss": 0.65823722, "learning_rate": 8.463073672685211e-07, "loss": 0.67950797, "num_input_tokens_seen": 252971400, "step": 11723, "time_per_iteration": 2.668208360671997 }, { "auxiliary_loss_clip": 0.01083668, "auxiliary_loss_mlp": 0.01031266, "balance_loss_clip": 1.03798199, "balance_loss_mlp": 1.01790833, "epoch": 0.7048850142792725, "flos": 21397158017280.0, "grad_norm": 1.6832847250880896, "language_loss": 0.80916411, "learning_rate": 8.459892563200235e-07, "loss": 0.8303135, "num_input_tokens_seen": 252989475, "step": 11724, "time_per_iteration": 2.7347311973571777 }, { "auxiliary_loss_clip": 0.01104162, "auxiliary_loss_mlp": 0.01035842, "balance_loss_clip": 1.03983295, "balance_loss_mlp": 1.02229953, "epoch": 0.7049451375319404, "flos": 21648101408640.0, "grad_norm": 1.7664791855809618, "language_loss": 0.7323097, "learning_rate": 8.456711891317296e-07, "loss": 0.75370979, "num_input_tokens_seen": 253007220, "step": 11725, "time_per_iteration": 2.654641628265381 }, { "auxiliary_loss_clip": 0.01066947, "auxiliary_loss_mlp": 0.01039139, "balance_loss_clip": 1.03276384, "balance_loss_mlp": 1.02378523, "epoch": 0.7050052607846085, "flos": 14866802904960.0, "grad_norm": 2.572506179811176, "language_loss": 0.78501201, "learning_rate": 8.453531657156998e-07, "loss": 0.80607283, "num_input_tokens_seen": 253025410, "step": 11726, "time_per_iteration": 2.7266314029693604 }, { "auxiliary_loss_clip": 0.01093418, "auxiliary_loss_mlp": 0.01038851, "balance_loss_clip": 1.03780484, "balance_loss_mlp": 1.02567255, "epoch": 0.7050653840372764, "flos": 19241763528960.0, "grad_norm": 2.180783221878792, "language_loss": 0.70615113, "learning_rate": 8.450351860839931e-07, "loss": 0.72747386, "num_input_tokens_seen": 253043305, "step": 11727, "time_per_iteration": 2.6545214653015137 }, { "auxiliary_loss_clip": 0.0110651, "auxiliary_loss_mlp": 0.00770675, "balance_loss_clip": 1.03787398, "balance_loss_mlp": 1.00010693, "epoch": 0.7051255072899444, "flos": 27780422935680.0, "grad_norm": 1.6658028000538345, "language_loss": 0.68843293, "learning_rate": 8.44717250248668e-07, "loss": 0.7072047, "num_input_tokens_seen": 253062790, "step": 11728, "time_per_iteration": 2.7480993270874023 }, { "auxiliary_loss_clip": 0.01080875, "auxiliary_loss_mlp": 0.00771073, "balance_loss_clip": 1.03778076, "balance_loss_mlp": 1.00025976, "epoch": 0.7051856305426124, "flos": 27892033470720.0, "grad_norm": 1.6755629992737011, "language_loss": 0.73434365, "learning_rate": 8.443993582217803e-07, "loss": 0.75286305, "num_input_tokens_seen": 253082055, "step": 11729, "time_per_iteration": 2.762924909591675 }, { "auxiliary_loss_clip": 0.01101913, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.04192829, "balance_loss_mlp": 1.01929784, "epoch": 0.7052457537952803, "flos": 25043563082880.0, "grad_norm": 1.5899274462099697, "language_loss": 0.77899098, "learning_rate": 8.440815100153862e-07, "loss": 0.80034494, "num_input_tokens_seen": 253102575, "step": 11730, "time_per_iteration": 2.6950738430023193 }, { "auxiliary_loss_clip": 0.01113225, "auxiliary_loss_mlp": 0.01040637, "balance_loss_clip": 1.0385406, "balance_loss_mlp": 1.02747023, "epoch": 0.7053058770479483, "flos": 21871717528320.0, "grad_norm": 1.9830603646297307, "language_loss": 0.62745416, "learning_rate": 8.437637056415359e-07, "loss": 0.64899278, "num_input_tokens_seen": 253121290, "step": 11731, "time_per_iteration": 2.588109016418457 }, { "auxiliary_loss_clip": 0.01058245, "auxiliary_loss_mlp": 0.01032885, "balance_loss_clip": 1.0364511, "balance_loss_mlp": 1.01818633, "epoch": 0.7053660003006162, "flos": 16398716094720.0, "grad_norm": 1.9950553391193862, "language_loss": 0.74299359, "learning_rate": 8.434459451122815e-07, "loss": 0.76390493, "num_input_tokens_seen": 253139720, "step": 11732, "time_per_iteration": 2.6930272579193115 }, { "auxiliary_loss_clip": 0.01102907, "auxiliary_loss_mlp": 0.01034149, "balance_loss_clip": 1.04072869, "balance_loss_mlp": 1.0211252, "epoch": 0.7054261235532843, "flos": 22711560399360.0, "grad_norm": 1.5877435619523543, "language_loss": 0.71181738, "learning_rate": 8.431282284396735e-07, "loss": 0.73318791, "num_input_tokens_seen": 253160250, "step": 11733, "time_per_iteration": 2.6677498817443848 }, { "auxiliary_loss_clip": 0.01077175, "auxiliary_loss_mlp": 0.01034316, "balance_loss_clip": 1.03793108, "balance_loss_mlp": 1.02100611, "epoch": 0.7054862468059522, "flos": 13589711775360.0, "grad_norm": 1.9036570704847118, "language_loss": 0.73538595, "learning_rate": 8.428105556357583e-07, "loss": 0.75650084, "num_input_tokens_seen": 253178710, "step": 11734, "time_per_iteration": 2.660600185394287 }, { "auxiliary_loss_clip": 0.01080202, "auxiliary_loss_mlp": 0.01045919, "balance_loss_clip": 1.03852844, "balance_loss_mlp": 1.02982593, "epoch": 0.7055463700586202, "flos": 15880704105600.0, "grad_norm": 2.4268970887811685, "language_loss": 0.6969564, "learning_rate": 8.424929267125829e-07, "loss": 0.71821761, "num_input_tokens_seen": 253194805, "step": 11735, "time_per_iteration": 2.6809842586517334 }, { "auxiliary_loss_clip": 0.01084684, "auxiliary_loss_mlp": 0.01039756, "balance_loss_clip": 1.03542256, "balance_loss_mlp": 1.02454448, "epoch": 0.7056064933112881, "flos": 23076161400960.0, "grad_norm": 2.1216259487349918, "language_loss": 0.72249383, "learning_rate": 8.421753416821933e-07, "loss": 0.74373823, "num_input_tokens_seen": 253213895, "step": 11736, "time_per_iteration": 2.7173984050750732 }, { "auxiliary_loss_clip": 0.01093397, "auxiliary_loss_mlp": 0.01028697, "balance_loss_clip": 1.03913403, "balance_loss_mlp": 1.01618576, "epoch": 0.7056666165639561, "flos": 24057168721920.0, "grad_norm": 1.8303036629192204, "language_loss": 0.68867785, "learning_rate": 8.41857800556629e-07, "loss": 0.70989877, "num_input_tokens_seen": 253231620, "step": 11737, "time_per_iteration": 2.7358338832855225 }, { "auxiliary_loss_clip": 0.01082807, "auxiliary_loss_mlp": 0.01039951, "balance_loss_clip": 1.04143405, "balance_loss_mlp": 1.02608716, "epoch": 0.705726739816624, "flos": 17493237371520.0, "grad_norm": 2.0769803991045848, "language_loss": 0.67978764, "learning_rate": 8.415403033479332e-07, "loss": 0.70101517, "num_input_tokens_seen": 253249590, "step": 11738, "time_per_iteration": 2.7042016983032227 }, { "auxiliary_loss_clip": 0.01114904, "auxiliary_loss_mlp": 0.01037563, "balance_loss_clip": 1.04074073, "balance_loss_mlp": 1.02349627, "epoch": 0.7057868630692921, "flos": 51350426472960.0, "grad_norm": 1.923264037015028, "language_loss": 0.75011027, "learning_rate": 8.41222850068145e-07, "loss": 0.77163494, "num_input_tokens_seen": 253273870, "step": 11739, "time_per_iteration": 2.9135007858276367 }, { "auxiliary_loss_clip": 0.01084303, "auxiliary_loss_mlp": 0.00770885, "balance_loss_clip": 1.03611875, "balance_loss_mlp": 1.00016105, "epoch": 0.70584698632196, "flos": 26102963836800.0, "grad_norm": 1.6688083494293096, "language_loss": 0.71504521, "learning_rate": 8.409054407293032e-07, "loss": 0.7335971, "num_input_tokens_seen": 253293720, "step": 11740, "time_per_iteration": 2.7146854400634766 }, { "auxiliary_loss_clip": 0.01081608, "auxiliary_loss_mlp": 0.01029234, "balance_loss_clip": 1.03897929, "balance_loss_mlp": 1.01712787, "epoch": 0.705907109574628, "flos": 21543134889600.0, "grad_norm": 1.6427960474243053, "language_loss": 0.81782758, "learning_rate": 8.405880753434434e-07, "loss": 0.83893597, "num_input_tokens_seen": 253313700, "step": 11741, "time_per_iteration": 2.7265563011169434 }, { "auxiliary_loss_clip": 0.01091272, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.03843784, "balance_loss_mlp": 1.01918101, "epoch": 0.705967232827296, "flos": 22710842127360.0, "grad_norm": 1.7910600093045685, "language_loss": 0.77970088, "learning_rate": 8.402707539225993e-07, "loss": 0.80094415, "num_input_tokens_seen": 253332425, "step": 11742, "time_per_iteration": 2.744617462158203 }, { "auxiliary_loss_clip": 0.01119104, "auxiliary_loss_mlp": 0.01034721, "balance_loss_clip": 1.04196119, "balance_loss_mlp": 1.02049327, "epoch": 0.7060273560799639, "flos": 28691225124480.0, "grad_norm": 1.5236916078434313, "language_loss": 0.64199877, "learning_rate": 8.39953476478805e-07, "loss": 0.66353697, "num_input_tokens_seen": 253353620, "step": 11743, "time_per_iteration": 2.6587469577789307 }, { "auxiliary_loss_clip": 0.01087403, "auxiliary_loss_mlp": 0.01037922, "balance_loss_clip": 1.03589988, "balance_loss_mlp": 1.02340233, "epoch": 0.7060874793326319, "flos": 15706178899200.0, "grad_norm": 1.87431643891437, "language_loss": 0.65725398, "learning_rate": 8.396362430240902e-07, "loss": 0.67850721, "num_input_tokens_seen": 253370930, "step": 11744, "time_per_iteration": 2.651118278503418 }, { "auxiliary_loss_clip": 0.01100616, "auxiliary_loss_mlp": 0.01034598, "balance_loss_clip": 1.03792346, "balance_loss_mlp": 1.02068591, "epoch": 0.7061476025852998, "flos": 21506757390720.0, "grad_norm": 1.7030797660453072, "language_loss": 0.63694406, "learning_rate": 8.393190535704857e-07, "loss": 0.65829617, "num_input_tokens_seen": 253389810, "step": 11745, "time_per_iteration": 4.299595832824707 }, { "auxiliary_loss_clip": 0.01077796, "auxiliary_loss_mlp": 0.01034157, "balance_loss_clip": 1.03395259, "balance_loss_mlp": 1.02075148, "epoch": 0.7062077258379679, "flos": 28181832399360.0, "grad_norm": 1.8209890328260383, "language_loss": 0.71859854, "learning_rate": 8.390019081300188e-07, "loss": 0.73971808, "num_input_tokens_seen": 253408685, "step": 11746, "time_per_iteration": 2.736166000366211 }, { "auxiliary_loss_clip": 0.01057236, "auxiliary_loss_mlp": 0.01033707, "balance_loss_clip": 1.03863013, "balance_loss_mlp": 1.02068353, "epoch": 0.7062678490906358, "flos": 27853680723840.0, "grad_norm": 1.467695639044188, "language_loss": 0.79042476, "learning_rate": 8.386848067147175e-07, "loss": 0.81133419, "num_input_tokens_seen": 253429685, "step": 11747, "time_per_iteration": 4.4075751304626465 }, { "auxiliary_loss_clip": 0.01099667, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.03842044, "balance_loss_mlp": 1.02088141, "epoch": 0.7063279723433038, "flos": 23184862934400.0, "grad_norm": 1.8618400783249733, "language_loss": 0.65024734, "learning_rate": 8.383677493366031e-07, "loss": 0.67157471, "num_input_tokens_seen": 253448260, "step": 11748, "time_per_iteration": 4.207107305526733 }, { "auxiliary_loss_clip": 0.01067203, "auxiliary_loss_mlp": 0.01036794, "balance_loss_clip": 1.03578413, "balance_loss_mlp": 1.02373433, "epoch": 0.7063880955959717, "flos": 20188655907840.0, "grad_norm": 2.0013045839241337, "language_loss": 0.79624116, "learning_rate": 8.380507360077003e-07, "loss": 0.81728113, "num_input_tokens_seen": 253467725, "step": 11749, "time_per_iteration": 2.8175888061523438 }, { "auxiliary_loss_clip": 0.01033129, "auxiliary_loss_mlp": 0.01002303, "balance_loss_clip": 1.01007652, "balance_loss_mlp": 1.001194, "epoch": 0.7064482188486397, "flos": 63668182763520.0, "grad_norm": 0.7992789353290359, "language_loss": 0.54014421, "learning_rate": 8.377337667400304e-07, "loss": 0.56049848, "num_input_tokens_seen": 253526940, "step": 11750, "time_per_iteration": 3.154975175857544 }, { "auxiliary_loss_clip": 0.01092158, "auxiliary_loss_mlp": 0.01037282, "balance_loss_clip": 1.03940368, "balance_loss_mlp": 1.0240972, "epoch": 0.7065083421013076, "flos": 25191227894400.0, "grad_norm": 1.6870361976430077, "language_loss": 0.78464556, "learning_rate": 8.37416841545612e-07, "loss": 0.80593991, "num_input_tokens_seen": 253546160, "step": 11751, "time_per_iteration": 2.732318878173828 }, { "auxiliary_loss_clip": 0.01074241, "auxiliary_loss_mlp": 0.01032324, "balance_loss_clip": 1.03658986, "balance_loss_mlp": 1.02024841, "epoch": 0.7065684653539757, "flos": 22893699288960.0, "grad_norm": 2.1027740219742928, "language_loss": 0.67992324, "learning_rate": 8.370999604364634e-07, "loss": 0.70098889, "num_input_tokens_seen": 253565505, "step": 11752, "time_per_iteration": 2.810976505279541 }, { "auxiliary_loss_clip": 0.01058629, "auxiliary_loss_mlp": 0.00771253, "balance_loss_clip": 1.03738487, "balance_loss_mlp": 1.00034094, "epoch": 0.7066285886066436, "flos": 23550254035200.0, "grad_norm": 2.005268773121489, "language_loss": 0.7630111, "learning_rate": 8.367831234246025e-07, "loss": 0.7813099, "num_input_tokens_seen": 253585125, "step": 11753, "time_per_iteration": 2.7646682262420654 }, { "auxiliary_loss_clip": 0.01082791, "auxiliary_loss_mlp": 0.00770025, "balance_loss_clip": 1.0389396, "balance_loss_mlp": 1.00026608, "epoch": 0.7066887118593116, "flos": 21069293650560.0, "grad_norm": 1.5279654497487496, "language_loss": 0.70828259, "learning_rate": 8.364663305220405e-07, "loss": 0.72681069, "num_input_tokens_seen": 253604815, "step": 11754, "time_per_iteration": 4.359536170959473 }, { "auxiliary_loss_clip": 0.01072435, "auxiliary_loss_mlp": 0.01043933, "balance_loss_clip": 1.03650284, "balance_loss_mlp": 1.02949619, "epoch": 0.7067488351119796, "flos": 21176307244800.0, "grad_norm": 2.2420199206717104, "language_loss": 0.89593709, "learning_rate": 8.361495817407919e-07, "loss": 0.91710079, "num_input_tokens_seen": 253622855, "step": 11755, "time_per_iteration": 2.682011365890503 }, { "auxiliary_loss_clip": 0.01088944, "auxiliary_loss_mlp": 0.00770375, "balance_loss_clip": 1.03828812, "balance_loss_mlp": 1.00012851, "epoch": 0.7068089583646475, "flos": 20449224144000.0, "grad_norm": 1.7866584888812729, "language_loss": 0.79776525, "learning_rate": 8.358328770928678e-07, "loss": 0.81635845, "num_input_tokens_seen": 253642760, "step": 11756, "time_per_iteration": 2.7065372467041016 }, { "auxiliary_loss_clip": 0.00998647, "auxiliary_loss_mlp": 0.01001672, "balance_loss_clip": 1.01305819, "balance_loss_mlp": 1.00066495, "epoch": 0.7068690816173155, "flos": 59109179829120.0, "grad_norm": 1.037725439626294, "language_loss": 0.60347986, "learning_rate": 8.355162165902785e-07, "loss": 0.62348306, "num_input_tokens_seen": 253695685, "step": 11757, "time_per_iteration": 3.031812906265259 }, { "auxiliary_loss_clip": 0.0107753, "auxiliary_loss_mlp": 0.01034781, "balance_loss_clip": 1.03763008, "balance_loss_mlp": 1.02182341, "epoch": 0.7069292048699835, "flos": 16251554073600.0, "grad_norm": 1.6437193466007092, "language_loss": 0.80430943, "learning_rate": 8.351996002450307e-07, "loss": 0.82543254, "num_input_tokens_seen": 253713305, "step": 11758, "time_per_iteration": 2.655449628829956 }, { "auxiliary_loss_clip": 0.01071031, "auxiliary_loss_mlp": 0.00770922, "balance_loss_clip": 1.03758502, "balance_loss_mlp": 1.000157, "epoch": 0.7069893281226515, "flos": 41172768455040.0, "grad_norm": 2.774739921576683, "language_loss": 0.77785885, "learning_rate": 8.348830280691304e-07, "loss": 0.79627836, "num_input_tokens_seen": 253736100, "step": 11759, "time_per_iteration": 2.8444526195526123 }, { "auxiliary_loss_clip": 0.01101914, "auxiliary_loss_mlp": 0.01031651, "balance_loss_clip": 1.03790534, "balance_loss_mlp": 1.01810324, "epoch": 0.7070494513753194, "flos": 24207275658240.0, "grad_norm": 1.9103417618984617, "language_loss": 0.67560184, "learning_rate": 8.34566500074583e-07, "loss": 0.69693744, "num_input_tokens_seen": 253757350, "step": 11760, "time_per_iteration": 2.715236186981201 }, { "auxiliary_loss_clip": 0.01076213, "auxiliary_loss_mlp": 0.01033273, "balance_loss_clip": 1.03878856, "balance_loss_mlp": 1.02084506, "epoch": 0.7071095746279874, "flos": 20185675079040.0, "grad_norm": 1.9224750289593278, "language_loss": 0.80442196, "learning_rate": 8.342500162733899e-07, "loss": 0.82551688, "num_input_tokens_seen": 253772855, "step": 11761, "time_per_iteration": 2.6564581394195557 }, { "auxiliary_loss_clip": 0.01086413, "auxiliary_loss_mlp": 0.01043874, "balance_loss_clip": 1.03556442, "balance_loss_mlp": 1.02777457, "epoch": 0.7071696978806553, "flos": 18183045133440.0, "grad_norm": 2.457600250400544, "language_loss": 0.75026697, "learning_rate": 8.33933576677553e-07, "loss": 0.77156985, "num_input_tokens_seen": 253790360, "step": 11762, "time_per_iteration": 2.6615617275238037 }, { "auxiliary_loss_clip": 0.01087856, "auxiliary_loss_mlp": 0.01032971, "balance_loss_clip": 1.0366205, "balance_loss_mlp": 1.0203824, "epoch": 0.7072298211333233, "flos": 24131719399680.0, "grad_norm": 1.9130475183821334, "language_loss": 0.76827163, "learning_rate": 8.336171812990724e-07, "loss": 0.78947991, "num_input_tokens_seen": 253810585, "step": 11763, "time_per_iteration": 2.7182300090789795 }, { "auxiliary_loss_clip": 0.01083565, "auxiliary_loss_mlp": 0.00771257, "balance_loss_clip": 1.03937089, "balance_loss_mlp": 1.0002048, "epoch": 0.7072899443859912, "flos": 27198418867200.0, "grad_norm": 2.255368812760523, "language_loss": 0.78756404, "learning_rate": 8.333008301499453e-07, "loss": 0.80611229, "num_input_tokens_seen": 253829080, "step": 11764, "time_per_iteration": 2.7578113079071045 }, { "auxiliary_loss_clip": 0.01064836, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.03533673, "balance_loss_mlp": 1.02406919, "epoch": 0.7073500676386593, "flos": 16435596384000.0, "grad_norm": 1.727630740916822, "language_loss": 0.79465842, "learning_rate": 8.32984523242167e-07, "loss": 0.81568348, "num_input_tokens_seen": 253846780, "step": 11765, "time_per_iteration": 2.7866904735565186 }, { "auxiliary_loss_clip": 0.01109005, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.03910017, "balance_loss_mlp": 1.017102, "epoch": 0.7074101908913272, "flos": 27673732563840.0, "grad_norm": 1.7516856530265033, "language_loss": 0.68398869, "learning_rate": 8.326682605877324e-07, "loss": 0.70536023, "num_input_tokens_seen": 253867075, "step": 11766, "time_per_iteration": 2.701338768005371 }, { "auxiliary_loss_clip": 0.01090324, "auxiliary_loss_mlp": 0.01038357, "balance_loss_clip": 1.03629494, "balance_loss_mlp": 1.02530944, "epoch": 0.7074703141439952, "flos": 22238078296320.0, "grad_norm": 2.1562679089059245, "language_loss": 0.63844228, "learning_rate": 8.323520421986352e-07, "loss": 0.65972912, "num_input_tokens_seen": 253885790, "step": 11767, "time_per_iteration": 2.682774543762207 }, { "auxiliary_loss_clip": 0.01101727, "auxiliary_loss_mlp": 0.01027229, "balance_loss_clip": 1.03870296, "balance_loss_mlp": 1.01452112, "epoch": 0.7075304373966632, "flos": 29643217234560.0, "grad_norm": 1.942959612416706, "language_loss": 0.5247106, "learning_rate": 8.320358680868646e-07, "loss": 0.54600012, "num_input_tokens_seen": 253907070, "step": 11768, "time_per_iteration": 2.753188133239746 }, { "auxiliary_loss_clip": 0.01088433, "auxiliary_loss_mlp": 0.00770686, "balance_loss_clip": 1.03836966, "balance_loss_mlp": 1.00015306, "epoch": 0.7075905606493311, "flos": 19755214490880.0, "grad_norm": 1.6006404938341818, "language_loss": 0.75532562, "learning_rate": 8.317197382644119e-07, "loss": 0.77391684, "num_input_tokens_seen": 253927290, "step": 11769, "time_per_iteration": 2.7288544178009033 }, { "auxiliary_loss_clip": 0.01013903, "auxiliary_loss_mlp": 0.01015517, "balance_loss_clip": 1.00881553, "balance_loss_mlp": 1.01409209, "epoch": 0.7076506839019991, "flos": 65716132694400.0, "grad_norm": 0.8537079866047984, "language_loss": 0.61981726, "learning_rate": 8.314036527432637e-07, "loss": 0.64011145, "num_input_tokens_seen": 253983440, "step": 11770, "time_per_iteration": 3.14176607131958 }, { "auxiliary_loss_clip": 0.01078902, "auxiliary_loss_mlp": 0.01036173, "balance_loss_clip": 1.03566778, "balance_loss_mlp": 1.02311409, "epoch": 0.707710807154667, "flos": 23765286804480.0, "grad_norm": 1.7412518327218227, "language_loss": 0.76630473, "learning_rate": 8.310876115354055e-07, "loss": 0.7874555, "num_input_tokens_seen": 254003825, "step": 11771, "time_per_iteration": 2.8118982315063477 }, { "auxiliary_loss_clip": 0.01097524, "auxiliary_loss_mlp": 0.01028735, "balance_loss_clip": 1.03753102, "balance_loss_mlp": 1.01656938, "epoch": 0.7077709304073351, "flos": 21251360712960.0, "grad_norm": 1.6484169639122062, "language_loss": 0.70931244, "learning_rate": 8.307716146528221e-07, "loss": 0.73057497, "num_input_tokens_seen": 254023345, "step": 11772, "time_per_iteration": 2.6268296241760254 }, { "auxiliary_loss_clip": 0.01063148, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.03564882, "balance_loss_mlp": 1.01975262, "epoch": 0.707831053660003, "flos": 20740746925440.0, "grad_norm": 1.8398984713825175, "language_loss": 0.69417477, "learning_rate": 8.30455662107496e-07, "loss": 0.71513855, "num_input_tokens_seen": 254041815, "step": 11773, "time_per_iteration": 2.7778313159942627 }, { "auxiliary_loss_clip": 0.01104178, "auxiliary_loss_mlp": 0.01034694, "balance_loss_clip": 1.03965759, "balance_loss_mlp": 1.02178395, "epoch": 0.707891176912671, "flos": 21980993679360.0, "grad_norm": 1.4556965212861144, "language_loss": 0.7014932, "learning_rate": 8.301397539114095e-07, "loss": 0.72288191, "num_input_tokens_seen": 254062065, "step": 11774, "time_per_iteration": 2.68330979347229 }, { "auxiliary_loss_clip": 0.01081938, "auxiliary_loss_mlp": 0.01028977, "balance_loss_clip": 1.04048491, "balance_loss_mlp": 1.01713347, "epoch": 0.7079513001653389, "flos": 21068970428160.0, "grad_norm": 1.5670559751490778, "language_loss": 0.74400485, "learning_rate": 8.298238900765407e-07, "loss": 0.76511401, "num_input_tokens_seen": 254080605, "step": 11775, "time_per_iteration": 2.672057628631592 }, { "auxiliary_loss_clip": 0.01074662, "auxiliary_loss_mlp": 0.00770567, "balance_loss_clip": 1.03893805, "balance_loss_mlp": 1.00032187, "epoch": 0.7080114234180069, "flos": 18040659621120.0, "grad_norm": 1.9150110614912736, "language_loss": 0.86714977, "learning_rate": 8.295080706148665e-07, "loss": 0.88560206, "num_input_tokens_seen": 254098710, "step": 11776, "time_per_iteration": 2.68167781829834 }, { "auxiliary_loss_clip": 0.01093973, "auxiliary_loss_mlp": 0.01034049, "balance_loss_clip": 1.03666544, "balance_loss_mlp": 1.02201438, "epoch": 0.7080715466706748, "flos": 15122271409920.0, "grad_norm": 1.4933339123942304, "language_loss": 0.75204122, "learning_rate": 8.291922955383641e-07, "loss": 0.77332139, "num_input_tokens_seen": 254117200, "step": 11777, "time_per_iteration": 2.617124319076538 }, { "auxiliary_loss_clip": 0.0109467, "auxiliary_loss_mlp": 0.01033372, "balance_loss_clip": 1.04061341, "balance_loss_mlp": 1.02046156, "epoch": 0.7081316699233429, "flos": 14422802889600.0, "grad_norm": 2.468930422112918, "language_loss": 0.8228538, "learning_rate": 8.288765648590066e-07, "loss": 0.84413421, "num_input_tokens_seen": 254132115, "step": 11778, "time_per_iteration": 2.7087488174438477 }, { "auxiliary_loss_clip": 0.01082719, "auxiliary_loss_mlp": 0.0103363, "balance_loss_clip": 1.03583169, "balance_loss_mlp": 1.02246594, "epoch": 0.7081917931760108, "flos": 23222389668480.0, "grad_norm": 1.6098616985666978, "language_loss": 0.85021019, "learning_rate": 8.285608785887673e-07, "loss": 0.87137371, "num_input_tokens_seen": 254152285, "step": 11779, "time_per_iteration": 2.6744067668914795 }, { "auxiliary_loss_clip": 0.0108855, "auxiliary_loss_mlp": 0.01032676, "balance_loss_clip": 1.03944063, "balance_loss_mlp": 1.01993263, "epoch": 0.7082519164286788, "flos": 39308429871360.0, "grad_norm": 2.2882732237177326, "language_loss": 0.72005677, "learning_rate": 8.28245236739618e-07, "loss": 0.74126905, "num_input_tokens_seen": 254172805, "step": 11780, "time_per_iteration": 2.8406546115875244 }, { "auxiliary_loss_clip": 0.0105972, "auxiliary_loss_mlp": 0.010316, "balance_loss_clip": 1.03477693, "balance_loss_mlp": 1.01896429, "epoch": 0.7083120396813467, "flos": 21651154064640.0, "grad_norm": 1.4192183070754045, "language_loss": 0.73349321, "learning_rate": 8.279296393235256e-07, "loss": 0.75440645, "num_input_tokens_seen": 254191890, "step": 11781, "time_per_iteration": 2.8251590728759766 }, { "auxiliary_loss_clip": 0.01099337, "auxiliary_loss_mlp": 0.01032456, "balance_loss_clip": 1.0398531, "balance_loss_mlp": 1.02066612, "epoch": 0.7083721629340147, "flos": 17567033863680.0, "grad_norm": 1.678957129171523, "language_loss": 0.77408248, "learning_rate": 8.276140863524585e-07, "loss": 0.79540044, "num_input_tokens_seen": 254210150, "step": 11782, "time_per_iteration": 2.6758499145507812 }, { "auxiliary_loss_clip": 0.0108554, "auxiliary_loss_mlp": 0.01029552, "balance_loss_clip": 1.03717136, "balance_loss_mlp": 1.01893663, "epoch": 0.7084322861866827, "flos": 29350509304320.0, "grad_norm": 1.5187238607788938, "language_loss": 0.69871926, "learning_rate": 8.272985778383828e-07, "loss": 0.71987015, "num_input_tokens_seen": 254233015, "step": 11783, "time_per_iteration": 2.8378536701202393 }, { "auxiliary_loss_clip": 0.01073688, "auxiliary_loss_mlp": 0.01028822, "balance_loss_clip": 1.03804398, "balance_loss_mlp": 1.01622784, "epoch": 0.7084924094393507, "flos": 20194294343040.0, "grad_norm": 1.5952065311243613, "language_loss": 0.78930736, "learning_rate": 8.269831137932632e-07, "loss": 0.81033248, "num_input_tokens_seen": 254251345, "step": 11784, "time_per_iteration": 4.362036943435669 }, { "auxiliary_loss_clip": 0.01111276, "auxiliary_loss_mlp": 0.0103335, "balance_loss_clip": 1.04004228, "balance_loss_mlp": 1.02080894, "epoch": 0.7085525326920187, "flos": 23477211728640.0, "grad_norm": 2.08544088937736, "language_loss": 0.77696943, "learning_rate": 8.266676942290609e-07, "loss": 0.79841572, "num_input_tokens_seen": 254269905, "step": 11785, "time_per_iteration": 2.5937209129333496 }, { "auxiliary_loss_clip": 0.01085039, "auxiliary_loss_mlp": 0.01034205, "balance_loss_clip": 1.0364778, "balance_loss_mlp": 1.02091932, "epoch": 0.7086126559446866, "flos": 25958818558080.0, "grad_norm": 1.9412182789537995, "language_loss": 0.78004217, "learning_rate": 8.26352319157738e-07, "loss": 0.80123466, "num_input_tokens_seen": 254289990, "step": 11786, "time_per_iteration": 4.211290121078491 }, { "auxiliary_loss_clip": 0.01113302, "auxiliary_loss_mlp": 0.01030804, "balance_loss_clip": 1.03969085, "balance_loss_mlp": 1.01798928, "epoch": 0.7086727791973546, "flos": 26724793109760.0, "grad_norm": 1.9518945204498503, "language_loss": 0.78987539, "learning_rate": 8.260369885912526e-07, "loss": 0.81131643, "num_input_tokens_seen": 254309085, "step": 11787, "time_per_iteration": 4.215709447860718 }, { "auxiliary_loss_clip": 0.01100936, "auxiliary_loss_mlp": 0.01032335, "balance_loss_clip": 1.0393877, "balance_loss_mlp": 1.02004457, "epoch": 0.7087329024500225, "flos": 21683365585920.0, "grad_norm": 1.963463516719362, "language_loss": 0.76586342, "learning_rate": 8.257217025415615e-07, "loss": 0.7871961, "num_input_tokens_seen": 254327045, "step": 11788, "time_per_iteration": 2.6296236515045166 }, { "auxiliary_loss_clip": 0.0107305, "auxiliary_loss_mlp": 0.01037785, "balance_loss_clip": 1.03411317, "balance_loss_mlp": 1.02229357, "epoch": 0.7087930257026905, "flos": 17931060247680.0, "grad_norm": 1.8171493958934544, "language_loss": 0.67838019, "learning_rate": 8.254064610206212e-07, "loss": 0.69948852, "num_input_tokens_seen": 254344585, "step": 11789, "time_per_iteration": 2.664304733276367 }, { "auxiliary_loss_clip": 0.0105779, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.03825188, "balance_loss_mlp": 1.02094257, "epoch": 0.7088531489553584, "flos": 18911528864640.0, "grad_norm": 1.6253389079610685, "language_loss": 0.77915251, "learning_rate": 8.250912640403858e-07, "loss": 0.80007434, "num_input_tokens_seen": 254362470, "step": 11790, "time_per_iteration": 2.745398759841919 }, { "auxiliary_loss_clip": 0.01093327, "auxiliary_loss_mlp": 0.01033055, "balance_loss_clip": 1.03802967, "balance_loss_mlp": 1.01917279, "epoch": 0.7089132722080265, "flos": 27380880979200.0, "grad_norm": 2.6743877386072046, "language_loss": 0.70789683, "learning_rate": 8.247761116128085e-07, "loss": 0.72916067, "num_input_tokens_seen": 254383190, "step": 11791, "time_per_iteration": 2.7536044120788574 }, { "auxiliary_loss_clip": 0.0110278, "auxiliary_loss_mlp": 0.01035298, "balance_loss_clip": 1.04042172, "balance_loss_mlp": 1.02178025, "epoch": 0.7089733954606944, "flos": 22162917087360.0, "grad_norm": 1.574032400084743, "language_loss": 0.82329011, "learning_rate": 8.244610037498376e-07, "loss": 0.84467089, "num_input_tokens_seen": 254403115, "step": 11792, "time_per_iteration": 2.658579111099243 }, { "auxiliary_loss_clip": 0.01071076, "auxiliary_loss_mlp": 0.01032048, "balance_loss_clip": 1.03814042, "balance_loss_mlp": 1.01898229, "epoch": 0.7090335187133624, "flos": 24425827960320.0, "grad_norm": 2.4251661207172406, "language_loss": 0.64878172, "learning_rate": 8.241459404634232e-07, "loss": 0.66981292, "num_input_tokens_seen": 254421875, "step": 11793, "time_per_iteration": 4.261074066162109 }, { "auxiliary_loss_clip": 0.01097375, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.03896296, "balance_loss_mlp": 1.02244079, "epoch": 0.7090936419660303, "flos": 21835232288640.0, "grad_norm": 5.81708329493613, "language_loss": 0.70618987, "learning_rate": 8.238309217655133e-07, "loss": 0.72752333, "num_input_tokens_seen": 254440765, "step": 11794, "time_per_iteration": 2.6876423358917236 }, { "auxiliary_loss_clip": 0.01091573, "auxiliary_loss_mlp": 0.01037583, "balance_loss_clip": 1.04035616, "balance_loss_mlp": 1.02522707, "epoch": 0.7091537652186983, "flos": 20082360585600.0, "grad_norm": 1.8634208156139904, "language_loss": 0.76080108, "learning_rate": 8.23515947668052e-07, "loss": 0.78209263, "num_input_tokens_seen": 254459480, "step": 11795, "time_per_iteration": 2.63566255569458 }, { "auxiliary_loss_clip": 0.01075226, "auxiliary_loss_mlp": 0.01033753, "balance_loss_clip": 1.03935122, "balance_loss_mlp": 1.02176011, "epoch": 0.7092138884713663, "flos": 13151565676800.0, "grad_norm": 2.3261568456734816, "language_loss": 0.75312549, "learning_rate": 8.232010181829838e-07, "loss": 0.77421528, "num_input_tokens_seen": 254473985, "step": 11796, "time_per_iteration": 2.716097116470337 }, { "auxiliary_loss_clip": 0.01103014, "auxiliary_loss_mlp": 0.01042156, "balance_loss_clip": 1.03999233, "balance_loss_mlp": 1.02640212, "epoch": 0.7092740117240343, "flos": 21645982506240.0, "grad_norm": 1.559472378141648, "language_loss": 0.74076355, "learning_rate": 8.228861333222523e-07, "loss": 0.76221526, "num_input_tokens_seen": 254492135, "step": 11797, "time_per_iteration": 2.6320409774780273 }, { "auxiliary_loss_clip": 0.0106907, "auxiliary_loss_mlp": 0.01035432, "balance_loss_clip": 1.03879786, "balance_loss_mlp": 1.02290332, "epoch": 0.7093341349767023, "flos": 21032521102080.0, "grad_norm": 1.5130058981356065, "language_loss": 0.79285604, "learning_rate": 8.225712930977953e-07, "loss": 0.81390107, "num_input_tokens_seen": 254512865, "step": 11798, "time_per_iteration": 2.7878127098083496 }, { "auxiliary_loss_clip": 0.01079128, "auxiliary_loss_mlp": 0.01040993, "balance_loss_clip": 1.03444886, "balance_loss_mlp": 1.02752233, "epoch": 0.7093942582293702, "flos": 22017658487040.0, "grad_norm": 1.8215442061382334, "language_loss": 0.6698848, "learning_rate": 8.222564975215529e-07, "loss": 0.69108605, "num_input_tokens_seen": 254532605, "step": 11799, "time_per_iteration": 2.6869001388549805 }, { "auxiliary_loss_clip": 0.01112483, "auxiliary_loss_mlp": 0.01028449, "balance_loss_clip": 1.03966284, "balance_loss_mlp": 1.01535368, "epoch": 0.7094543814820382, "flos": 27235586465280.0, "grad_norm": 1.576526567313424, "language_loss": 0.81716406, "learning_rate": 8.219417466054622e-07, "loss": 0.8385734, "num_input_tokens_seen": 254553780, "step": 11800, "time_per_iteration": 2.658963680267334 }, { "auxiliary_loss_clip": 0.01088302, "auxiliary_loss_mlp": 0.01033764, "balance_loss_clip": 1.03797638, "balance_loss_mlp": 1.02189112, "epoch": 0.7095145047347061, "flos": 12089148180480.0, "grad_norm": 1.9061714801555572, "language_loss": 0.86517024, "learning_rate": 8.21627040361459e-07, "loss": 0.88639092, "num_input_tokens_seen": 254567510, "step": 11801, "time_per_iteration": 2.6748046875 }, { "auxiliary_loss_clip": 0.0111264, "auxiliary_loss_mlp": 0.0103384, "balance_loss_clip": 1.03984725, "balance_loss_mlp": 1.02127492, "epoch": 0.7095746279873741, "flos": 19383789905280.0, "grad_norm": 1.9051932445720021, "language_loss": 0.7623291, "learning_rate": 8.213123788014758e-07, "loss": 0.78379387, "num_input_tokens_seen": 254585565, "step": 11802, "time_per_iteration": 2.618805170059204 }, { "auxiliary_loss_clip": 0.01097308, "auxiliary_loss_mlp": 0.01046076, "balance_loss_clip": 1.03798604, "balance_loss_mlp": 1.03312433, "epoch": 0.709634751240042, "flos": 21360600950400.0, "grad_norm": 1.6390429241877804, "language_loss": 0.81943619, "learning_rate": 8.209977619374462e-07, "loss": 0.84087008, "num_input_tokens_seen": 254603465, "step": 11803, "time_per_iteration": 2.68537974357605 }, { "auxiliary_loss_clip": 0.01112366, "auxiliary_loss_mlp": 0.01034568, "balance_loss_clip": 1.03814209, "balance_loss_mlp": 1.02085912, "epoch": 0.7096948744927101, "flos": 13917037438080.0, "grad_norm": 2.293538036404514, "language_loss": 0.67322147, "learning_rate": 8.206831897812995e-07, "loss": 0.69469082, "num_input_tokens_seen": 254620500, "step": 11804, "time_per_iteration": 2.642585277557373 }, { "auxiliary_loss_clip": 0.01097953, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.03816247, "balance_loss_mlp": 1.01730967, "epoch": 0.709754997745378, "flos": 30298335436800.0, "grad_norm": 1.964223724359439, "language_loss": 0.78081644, "learning_rate": 8.203686623449637e-07, "loss": 0.80208147, "num_input_tokens_seen": 254638565, "step": 11805, "time_per_iteration": 2.720667600631714 }, { "auxiliary_loss_clip": 0.01091353, "auxiliary_loss_mlp": 0.00771338, "balance_loss_clip": 1.03825855, "balance_loss_mlp": 1.00015116, "epoch": 0.709815120998046, "flos": 18515147304960.0, "grad_norm": 9.192055527956402, "language_loss": 0.79064679, "learning_rate": 8.200541796403667e-07, "loss": 0.80927366, "num_input_tokens_seen": 254657505, "step": 11806, "time_per_iteration": 2.681230306625366 }, { "auxiliary_loss_clip": 0.01083674, "auxiliary_loss_mlp": 0.01041523, "balance_loss_clip": 1.03755128, "balance_loss_mlp": 1.02857089, "epoch": 0.7098752442507139, "flos": 22272588288000.0, "grad_norm": 2.0898066634684573, "language_loss": 0.56422603, "learning_rate": 8.197397416794332e-07, "loss": 0.58547801, "num_input_tokens_seen": 254674730, "step": 11807, "time_per_iteration": 2.734550714492798 }, { "auxiliary_loss_clip": 0.01114828, "auxiliary_loss_mlp": 0.01043868, "balance_loss_clip": 1.03833497, "balance_loss_mlp": 1.03099334, "epoch": 0.7099353675033819, "flos": 19275447507840.0, "grad_norm": 2.0105184465729464, "language_loss": 0.68802261, "learning_rate": 8.194253484740882e-07, "loss": 0.70960963, "num_input_tokens_seen": 254691665, "step": 11808, "time_per_iteration": 2.6423966884613037 }, { "auxiliary_loss_clip": 0.01098171, "auxiliary_loss_mlp": 0.01032542, "balance_loss_clip": 1.03855026, "balance_loss_mlp": 1.02025712, "epoch": 0.70999549075605, "flos": 21908525990400.0, "grad_norm": 2.1402280143316834, "language_loss": 0.71625412, "learning_rate": 8.191110000362513e-07, "loss": 0.73756123, "num_input_tokens_seen": 254711610, "step": 11809, "time_per_iteration": 2.627044200897217 }, { "auxiliary_loss_clip": 0.01031591, "auxiliary_loss_mlp": 0.0100231, "balance_loss_clip": 1.00862455, "balance_loss_mlp": 1.00124347, "epoch": 0.7100556140087179, "flos": 70456053456000.0, "grad_norm": 0.7494928075068129, "language_loss": 0.5943656, "learning_rate": 8.187966963778435e-07, "loss": 0.61470461, "num_input_tokens_seen": 254772615, "step": 11810, "time_per_iteration": 3.2029061317443848 }, { "auxiliary_loss_clip": 0.01033991, "auxiliary_loss_mlp": 0.01048321, "balance_loss_clip": 1.03268588, "balance_loss_mlp": 1.03488612, "epoch": 0.7101157372613859, "flos": 23039568420480.0, "grad_norm": 2.72885983888825, "language_loss": 0.74159658, "learning_rate": 8.18482437510784e-07, "loss": 0.7624197, "num_input_tokens_seen": 254791375, "step": 11811, "time_per_iteration": 2.8374974727630615 }, { "auxiliary_loss_clip": 0.01073985, "auxiliary_loss_mlp": 0.01027518, "balance_loss_clip": 1.03985, "balance_loss_mlp": 1.015275, "epoch": 0.7101758605140538, "flos": 23185329811200.0, "grad_norm": 2.0242190076468223, "language_loss": 0.83632278, "learning_rate": 8.181682234469882e-07, "loss": 0.85733783, "num_input_tokens_seen": 254809300, "step": 11812, "time_per_iteration": 2.757760763168335 }, { "auxiliary_loss_clip": 0.01114938, "auxiliary_loss_mlp": 0.01031671, "balance_loss_clip": 1.04106176, "balance_loss_mlp": 1.01833153, "epoch": 0.7102359837667218, "flos": 23696123166720.0, "grad_norm": 1.5968911597601785, "language_loss": 0.6982094, "learning_rate": 8.178540541983716e-07, "loss": 0.71967542, "num_input_tokens_seen": 254829325, "step": 11813, "time_per_iteration": 2.593907594680786 }, { "auxiliary_loss_clip": 0.01109186, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.03841877, "balance_loss_mlp": 1.01689565, "epoch": 0.7102961070193897, "flos": 19391116279680.0, "grad_norm": 1.9874956852039145, "language_loss": 0.81565011, "learning_rate": 8.175399297768495e-07, "loss": 0.83703148, "num_input_tokens_seen": 254847690, "step": 11814, "time_per_iteration": 2.5443472862243652 }, { "auxiliary_loss_clip": 0.01112342, "auxiliary_loss_mlp": 0.01033274, "balance_loss_clip": 1.04030275, "balance_loss_mlp": 1.02032149, "epoch": 0.7103562302720577, "flos": 21507511576320.0, "grad_norm": 2.7298772348158074, "language_loss": 0.75506926, "learning_rate": 8.172258501943301e-07, "loss": 0.77652538, "num_input_tokens_seen": 254865960, "step": 11815, "time_per_iteration": 2.5428481101989746 }, { "auxiliary_loss_clip": 0.01067291, "auxiliary_loss_mlp": 0.01031233, "balance_loss_clip": 1.03749645, "balance_loss_mlp": 1.01881731, "epoch": 0.7104163535247257, "flos": 14535059869440.0, "grad_norm": 1.7974303130693923, "language_loss": 0.78488684, "learning_rate": 8.16911815462725e-07, "loss": 0.80587208, "num_input_tokens_seen": 254882815, "step": 11816, "time_per_iteration": 2.7543234825134277 }, { "auxiliary_loss_clip": 0.01085859, "auxiliary_loss_mlp": 0.01038436, "balance_loss_clip": 1.03845906, "balance_loss_mlp": 1.02579379, "epoch": 0.7104764767773937, "flos": 11400310085760.0, "grad_norm": 1.72092645420432, "language_loss": 0.86654431, "learning_rate": 8.165978255939426e-07, "loss": 0.88778722, "num_input_tokens_seen": 254898705, "step": 11817, "time_per_iteration": 2.6052346229553223 }, { "auxiliary_loss_clip": 0.01064818, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.0393579, "balance_loss_mlp": 1.01749921, "epoch": 0.7105366000300616, "flos": 11690432236800.0, "grad_norm": 2.3052427315849964, "language_loss": 0.848396, "learning_rate": 8.162838805998897e-07, "loss": 0.86933911, "num_input_tokens_seen": 254913665, "step": 11818, "time_per_iteration": 2.664659023284912 }, { "auxiliary_loss_clip": 0.01111214, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.03756952, "balance_loss_mlp": 1.01640534, "epoch": 0.7105967232827296, "flos": 19354020508800.0, "grad_norm": 2.1251714303337006, "language_loss": 0.76013577, "learning_rate": 8.159699804924709e-07, "loss": 0.78154367, "num_input_tokens_seen": 254932140, "step": 11819, "time_per_iteration": 2.5721442699432373 }, { "auxiliary_loss_clip": 0.01069448, "auxiliary_loss_mlp": 0.01034158, "balance_loss_clip": 1.0366652, "balance_loss_mlp": 1.01895833, "epoch": 0.7106568465353975, "flos": 22930400010240.0, "grad_norm": 1.554661416155005, "language_loss": 0.70843577, "learning_rate": 8.156561252835883e-07, "loss": 0.7294718, "num_input_tokens_seen": 254951580, "step": 11820, "time_per_iteration": 2.7395031452178955 }, { "auxiliary_loss_clip": 0.01101119, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.03955543, "balance_loss_mlp": 1.01675773, "epoch": 0.7107169697880655, "flos": 19099665325440.0, "grad_norm": 1.8332126356579708, "language_loss": 0.75666863, "learning_rate": 8.153423149851449e-07, "loss": 0.7779721, "num_input_tokens_seen": 254969425, "step": 11821, "time_per_iteration": 2.6001696586608887 }, { "auxiliary_loss_clip": 0.00987426, "auxiliary_loss_mlp": 0.00999944, "balance_loss_clip": 1.01348448, "balance_loss_mlp": 0.99880552, "epoch": 0.7107770930407336, "flos": 63638054231040.0, "grad_norm": 0.7717757980179868, "language_loss": 0.5505957, "learning_rate": 8.150285496090388e-07, "loss": 0.57046944, "num_input_tokens_seen": 255032680, "step": 11822, "time_per_iteration": 3.295065402984619 }, { "auxiliary_loss_clip": 0.0109566, "auxiliary_loss_mlp": 0.01026855, "balance_loss_clip": 1.03837609, "balance_loss_mlp": 1.01429629, "epoch": 0.7108372162934015, "flos": 22054466949120.0, "grad_norm": 1.928380251227047, "language_loss": 0.60496062, "learning_rate": 8.147148291671688e-07, "loss": 0.62618577, "num_input_tokens_seen": 255054400, "step": 11823, "time_per_iteration": 2.6415092945098877 }, { "auxiliary_loss_clip": 0.01099793, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.03883433, "balance_loss_mlp": 1.02019811, "epoch": 0.7108973395460695, "flos": 19135144984320.0, "grad_norm": 2.0421558606422434, "language_loss": 0.71593511, "learning_rate": 8.144011536714322e-07, "loss": 0.73725778, "num_input_tokens_seen": 255072785, "step": 11824, "time_per_iteration": 4.298635244369507 }, { "auxiliary_loss_clip": 0.01077795, "auxiliary_loss_mlp": 0.00772624, "balance_loss_clip": 1.03366399, "balance_loss_mlp": 1.00021195, "epoch": 0.7109574627987374, "flos": 17894431353600.0, "grad_norm": 2.7239449344013322, "language_loss": 0.72674167, "learning_rate": 8.140875231337223e-07, "loss": 0.74524581, "num_input_tokens_seen": 255091820, "step": 11825, "time_per_iteration": 2.652414083480835 }, { "auxiliary_loss_clip": 0.01081872, "auxiliary_loss_mlp": 0.01031095, "balance_loss_clip": 1.03761208, "balance_loss_mlp": 1.01838112, "epoch": 0.7110175860514054, "flos": 28979623422720.0, "grad_norm": 1.6201501744547915, "language_loss": 0.79405123, "learning_rate": 8.137739375659321e-07, "loss": 0.8151809, "num_input_tokens_seen": 255111720, "step": 11826, "time_per_iteration": 4.22081995010376 }, { "auxiliary_loss_clip": 0.01098598, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 1.03932214, "balance_loss_mlp": 1.02239263, "epoch": 0.7110777093040733, "flos": 26173312623360.0, "grad_norm": 1.462780118765175, "language_loss": 0.8310101, "learning_rate": 8.134603969799527e-07, "loss": 0.85234201, "num_input_tokens_seen": 255133495, "step": 11827, "time_per_iteration": 4.226747512817383 }, { "auxiliary_loss_clip": 0.01079454, "auxiliary_loss_mlp": 0.01032813, "balance_loss_clip": 1.03688717, "balance_loss_mlp": 1.01972437, "epoch": 0.7111378325567413, "flos": 26869943969280.0, "grad_norm": 1.4792451308451542, "language_loss": 0.6237936, "learning_rate": 8.131469013876748e-07, "loss": 0.6449163, "num_input_tokens_seen": 255156880, "step": 11828, "time_per_iteration": 2.7983498573303223 }, { "auxiliary_loss_clip": 0.01111659, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.03956318, "balance_loss_mlp": 1.02155936, "epoch": 0.7111979558094093, "flos": 27271820309760.0, "grad_norm": 1.434450077194213, "language_loss": 0.72024685, "learning_rate": 8.128334508009846e-07, "loss": 0.7417078, "num_input_tokens_seen": 255178920, "step": 11829, "time_per_iteration": 2.6990365982055664 }, { "auxiliary_loss_clip": 0.01111652, "auxiliary_loss_mlp": 0.01034605, "balance_loss_clip": 1.04012764, "balance_loss_mlp": 1.02254748, "epoch": 0.7112580790620773, "flos": 25046938961280.0, "grad_norm": 1.7220593364674301, "language_loss": 0.80250454, "learning_rate": 8.125200452317697e-07, "loss": 0.8239671, "num_input_tokens_seen": 255198095, "step": 11830, "time_per_iteration": 2.573199987411499 }, { "auxiliary_loss_clip": 0.01099477, "auxiliary_loss_mlp": 0.0103532, "balance_loss_clip": 1.0376153, "balance_loss_mlp": 1.02277327, "epoch": 0.7113182023147452, "flos": 21646628951040.0, "grad_norm": 1.7248457668834243, "language_loss": 0.84357107, "learning_rate": 8.122066846919138e-07, "loss": 0.86491901, "num_input_tokens_seen": 255215860, "step": 11831, "time_per_iteration": 2.6142139434814453 }, { "auxiliary_loss_clip": 0.01088822, "auxiliary_loss_mlp": 0.01032858, "balance_loss_clip": 1.0360502, "balance_loss_mlp": 1.02048969, "epoch": 0.7113783255674132, "flos": 20996287257600.0, "grad_norm": 3.6930845590637986, "language_loss": 0.77417958, "learning_rate": 8.118933691932985e-07, "loss": 0.79539645, "num_input_tokens_seen": 255235425, "step": 11832, "time_per_iteration": 2.6712517738342285 }, { "auxiliary_loss_clip": 0.01020951, "auxiliary_loss_mlp": 0.01006539, "balance_loss_clip": 1.00784588, "balance_loss_mlp": 1.00549638, "epoch": 0.7114384488200811, "flos": 66771080161920.0, "grad_norm": 0.7440594945981316, "language_loss": 0.56577992, "learning_rate": 8.115800987478059e-07, "loss": 0.5860548, "num_input_tokens_seen": 255291680, "step": 11833, "time_per_iteration": 4.970557689666748 }, { "auxiliary_loss_clip": 0.01063684, "auxiliary_loss_mlp": 0.01035215, "balance_loss_clip": 1.03803515, "balance_loss_mlp": 1.02324665, "epoch": 0.7114985720727491, "flos": 25010058672000.0, "grad_norm": 1.530403195160948, "language_loss": 0.70702851, "learning_rate": 8.11266873367315e-07, "loss": 0.72801757, "num_input_tokens_seen": 255313880, "step": 11834, "time_per_iteration": 2.814005136489868 }, { "auxiliary_loss_clip": 0.0111468, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.04097462, "balance_loss_mlp": 1.02278614, "epoch": 0.7115586953254172, "flos": 21470128496640.0, "grad_norm": 2.039356893023201, "language_loss": 0.79006612, "learning_rate": 8.10953693063704e-07, "loss": 0.81157267, "num_input_tokens_seen": 255332390, "step": 11835, "time_per_iteration": 2.6193342208862305 }, { "auxiliary_loss_clip": 0.01098428, "auxiliary_loss_mlp": 0.01030957, "balance_loss_clip": 1.0383265, "balance_loss_mlp": 1.01929832, "epoch": 0.7116188185780851, "flos": 28622600190720.0, "grad_norm": 1.4832382343509314, "language_loss": 0.75895661, "learning_rate": 8.10640557848848e-07, "loss": 0.78025043, "num_input_tokens_seen": 255354025, "step": 11836, "time_per_iteration": 2.796912670135498 }, { "auxiliary_loss_clip": 0.01041174, "auxiliary_loss_mlp": 0.01035442, "balance_loss_clip": 1.03577304, "balance_loss_mlp": 1.02302051, "epoch": 0.7116789418307531, "flos": 25293608634240.0, "grad_norm": 1.738152097420041, "language_loss": 0.69952178, "learning_rate": 8.103274677346208e-07, "loss": 0.72028792, "num_input_tokens_seen": 255371400, "step": 11837, "time_per_iteration": 2.850287914276123 }, { "auxiliary_loss_clip": 0.01104188, "auxiliary_loss_mlp": 0.01038013, "balance_loss_clip": 1.04023147, "balance_loss_mlp": 1.02389264, "epoch": 0.711739065083421, "flos": 25557301353600.0, "grad_norm": 1.8562944097025111, "language_loss": 0.61769348, "learning_rate": 8.100144227328958e-07, "loss": 0.63911551, "num_input_tokens_seen": 255390710, "step": 11838, "time_per_iteration": 2.6722800731658936 }, { "auxiliary_loss_clip": 0.01103036, "auxiliary_loss_mlp": 0.01032537, "balance_loss_clip": 1.04213476, "balance_loss_mlp": 1.02000856, "epoch": 0.711799188336089, "flos": 26140993361280.0, "grad_norm": 2.198172519021995, "language_loss": 0.67758644, "learning_rate": 8.097014228555426e-07, "loss": 0.69894218, "num_input_tokens_seen": 255408790, "step": 11839, "time_per_iteration": 2.700693130493164 }, { "auxiliary_loss_clip": 0.01113567, "auxiliary_loss_mlp": 0.01032497, "balance_loss_clip": 1.04118514, "balance_loss_mlp": 1.02025414, "epoch": 0.7118593115887569, "flos": 21140648017920.0, "grad_norm": 2.000863214598685, "language_loss": 0.84081334, "learning_rate": 8.093884681144305e-07, "loss": 0.86227405, "num_input_tokens_seen": 255426280, "step": 11840, "time_per_iteration": 2.6260175704956055 }, { "auxiliary_loss_clip": 0.01089291, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.03793836, "balance_loss_mlp": 1.01938784, "epoch": 0.711919434841425, "flos": 14975684006400.0, "grad_norm": 1.8693362232508501, "language_loss": 0.76592988, "learning_rate": 8.090755585214277e-07, "loss": 0.78714442, "num_input_tokens_seen": 255442935, "step": 11841, "time_per_iteration": 2.7380130290985107 }, { "auxiliary_loss_clip": 0.01097544, "auxiliary_loss_mlp": 0.01031388, "balance_loss_clip": 1.0421263, "balance_loss_mlp": 1.01840663, "epoch": 0.7119795580940929, "flos": 16508997826560.0, "grad_norm": 2.0423814070424546, "language_loss": 0.75526315, "learning_rate": 8.087626940883994e-07, "loss": 0.77655244, "num_input_tokens_seen": 255460925, "step": 11842, "time_per_iteration": 2.7132010459899902 }, { "auxiliary_loss_clip": 0.01025805, "auxiliary_loss_mlp": 0.01005384, "balance_loss_clip": 1.01843143, "balance_loss_mlp": 1.00406706, "epoch": 0.7120396813467609, "flos": 66570736055040.0, "grad_norm": 0.7903612051800185, "language_loss": 0.61607522, "learning_rate": 8.084498748272082e-07, "loss": 0.63638717, "num_input_tokens_seen": 255521360, "step": 11843, "time_per_iteration": 3.199335813522339 }, { "auxiliary_loss_clip": 0.01110982, "auxiliary_loss_mlp": 0.01027269, "balance_loss_clip": 1.04004669, "balance_loss_mlp": 1.01506245, "epoch": 0.7120998045994288, "flos": 26432731624320.0, "grad_norm": 2.817805590014094, "language_loss": 0.80302823, "learning_rate": 8.081371007497171e-07, "loss": 0.82441074, "num_input_tokens_seen": 255541435, "step": 11844, "time_per_iteration": 2.7244338989257812 }, { "auxiliary_loss_clip": 0.010573, "auxiliary_loss_mlp": 0.01034033, "balance_loss_clip": 1.03133631, "balance_loss_mlp": 1.02053213, "epoch": 0.7121599278520968, "flos": 16427982700800.0, "grad_norm": 2.6971267365188565, "language_loss": 0.79268605, "learning_rate": 8.078243718677873e-07, "loss": 0.81359935, "num_input_tokens_seen": 255558505, "step": 11845, "time_per_iteration": 2.7217719554901123 }, { "auxiliary_loss_clip": 0.01094755, "auxiliary_loss_mlp": 0.0103426, "balance_loss_clip": 1.03866315, "balance_loss_mlp": 1.02122474, "epoch": 0.7122200511047647, "flos": 28949889939840.0, "grad_norm": 2.005574335935101, "language_loss": 0.77602625, "learning_rate": 8.075116881932762e-07, "loss": 0.79731637, "num_input_tokens_seen": 255577815, "step": 11846, "time_per_iteration": 2.64569354057312 }, { "auxiliary_loss_clip": 0.01101916, "auxiliary_loss_mlp": 0.01033988, "balance_loss_clip": 1.03883851, "balance_loss_mlp": 1.0209887, "epoch": 0.7122801743574327, "flos": 16471866142080.0, "grad_norm": 1.8418760265221825, "language_loss": 0.58981413, "learning_rate": 8.071990497380421e-07, "loss": 0.61117315, "num_input_tokens_seen": 255595885, "step": 11847, "time_per_iteration": 2.626909017562866 }, { "auxiliary_loss_clip": 0.01095645, "auxiliary_loss_mlp": 0.00769201, "balance_loss_clip": 1.03844142, "balance_loss_mlp": 1.00012553, "epoch": 0.7123402976101008, "flos": 20631039811200.0, "grad_norm": 2.0944282784493353, "language_loss": 0.71676862, "learning_rate": 8.068864565139395e-07, "loss": 0.73541707, "num_input_tokens_seen": 255616750, "step": 11848, "time_per_iteration": 2.7139625549316406 }, { "auxiliary_loss_clip": 0.01023376, "auxiliary_loss_mlp": 0.01000864, "balance_loss_clip": 1.00891995, "balance_loss_mlp": 0.99969596, "epoch": 0.7124004208627687, "flos": 62325734837760.0, "grad_norm": 0.8463803916761699, "language_loss": 0.62977934, "learning_rate": 8.065739085328211e-07, "loss": 0.65002173, "num_input_tokens_seen": 255677900, "step": 11849, "time_per_iteration": 3.1411380767822266 }, { "auxiliary_loss_clip": 0.01083662, "auxiliary_loss_mlp": 0.01037172, "balance_loss_clip": 1.03620315, "balance_loss_mlp": 1.0243752, "epoch": 0.7124605441154367, "flos": 39675975788160.0, "grad_norm": 2.803971224135637, "language_loss": 0.63841069, "learning_rate": 8.0626140580654e-07, "loss": 0.65961903, "num_input_tokens_seen": 255699140, "step": 11850, "time_per_iteration": 2.923384428024292 }, { "auxiliary_loss_clip": 0.01102405, "auxiliary_loss_mlp": 0.01032425, "balance_loss_clip": 1.03889465, "balance_loss_mlp": 1.01953292, "epoch": 0.7125206673681046, "flos": 28181868312960.0, "grad_norm": 1.6311022275087306, "language_loss": 0.69985723, "learning_rate": 8.05948948346946e-07, "loss": 0.72120547, "num_input_tokens_seen": 255719640, "step": 11851, "time_per_iteration": 2.7820382118225098 }, { "auxiliary_loss_clip": 0.0110311, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.04154539, "balance_loss_mlp": 1.02258956, "epoch": 0.7125807906207726, "flos": 26176939896960.0, "grad_norm": 1.8696019158411576, "language_loss": 0.83187509, "learning_rate": 8.056365361658882e-07, "loss": 0.8532483, "num_input_tokens_seen": 255740450, "step": 11852, "time_per_iteration": 2.6444952487945557 }, { "auxiliary_loss_clip": 0.01100225, "auxiliary_loss_mlp": 0.00771762, "balance_loss_clip": 1.03667736, "balance_loss_mlp": 1.00029016, "epoch": 0.7126409138734405, "flos": 17157328358400.0, "grad_norm": 2.349353252161211, "language_loss": 0.73249555, "learning_rate": 8.053241692752126e-07, "loss": 0.75121546, "num_input_tokens_seen": 255758070, "step": 11853, "time_per_iteration": 2.637211799621582 }, { "auxiliary_loss_clip": 0.0107018, "auxiliary_loss_mlp": 0.0103944, "balance_loss_clip": 1.03552818, "balance_loss_mlp": 1.02707744, "epoch": 0.7127010371261085, "flos": 18769933451520.0, "grad_norm": 1.913315807088991, "language_loss": 0.92358422, "learning_rate": 8.050118476867635e-07, "loss": 0.94468045, "num_input_tokens_seen": 255775685, "step": 11854, "time_per_iteration": 2.7072691917419434 }, { "auxiliary_loss_clip": 0.01098797, "auxiliary_loss_mlp": 0.01033688, "balance_loss_clip": 1.03843451, "balance_loss_mlp": 1.02162969, "epoch": 0.7127611603787765, "flos": 20376433232640.0, "grad_norm": 1.737694299359858, "language_loss": 0.7940923, "learning_rate": 8.046995714123856e-07, "loss": 0.81541711, "num_input_tokens_seen": 255794750, "step": 11855, "time_per_iteration": 2.6459240913391113 }, { "auxiliary_loss_clip": 0.01062363, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.03427172, "balance_loss_mlp": 1.02277398, "epoch": 0.7128212836314445, "flos": 20449008662400.0, "grad_norm": 1.6847297518773263, "language_loss": 0.72626299, "learning_rate": 8.043873404639192e-07, "loss": 0.74725342, "num_input_tokens_seen": 255813325, "step": 11856, "time_per_iteration": 2.798802614212036 }, { "auxiliary_loss_clip": 0.01105236, "auxiliary_loss_mlp": 0.01030789, "balance_loss_clip": 1.0418961, "balance_loss_mlp": 1.01811683, "epoch": 0.7128814068841124, "flos": 23440834229760.0, "grad_norm": 1.7617515399603183, "language_loss": 0.70205921, "learning_rate": 8.040751548532046e-07, "loss": 0.72341949, "num_input_tokens_seen": 255832470, "step": 11857, "time_per_iteration": 2.7193527221679688 }, { "auxiliary_loss_clip": 0.01097533, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.03706014, "balance_loss_mlp": 1.01644111, "epoch": 0.7129415301367804, "flos": 18222942165120.0, "grad_norm": 2.6735250437319684, "language_loss": 0.85148036, "learning_rate": 8.03763014592081e-07, "loss": 0.87274927, "num_input_tokens_seen": 255849740, "step": 11858, "time_per_iteration": 2.640803813934326 }, { "auxiliary_loss_clip": 0.01116792, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.04105759, "balance_loss_mlp": 1.020136, "epoch": 0.7130016533894483, "flos": 15523896355200.0, "grad_norm": 1.6211685141896377, "language_loss": 0.80374736, "learning_rate": 8.034509196923829e-07, "loss": 0.82524627, "num_input_tokens_seen": 255866975, "step": 11859, "time_per_iteration": 2.600557565689087 }, { "auxiliary_loss_clip": 0.01088815, "auxiliary_loss_mlp": 0.01031931, "balance_loss_clip": 1.03991199, "balance_loss_mlp": 1.01981378, "epoch": 0.7130617766421163, "flos": 57115668960000.0, "grad_norm": 1.670734379003671, "language_loss": 0.68986422, "learning_rate": 8.031388701659456e-07, "loss": 0.71107167, "num_input_tokens_seen": 255892915, "step": 11860, "time_per_iteration": 3.0131988525390625 }, { "auxiliary_loss_clip": 0.01101154, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.03928113, "balance_loss_mlp": 1.01791143, "epoch": 0.7131218998947844, "flos": 19788252024960.0, "grad_norm": 1.6914333481475103, "language_loss": 0.64537835, "learning_rate": 8.028268660246023e-07, "loss": 0.66670012, "num_input_tokens_seen": 255911480, "step": 11861, "time_per_iteration": 2.609196424484253 }, { "auxiliary_loss_clip": 0.01095274, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.04040623, "balance_loss_mlp": 1.01967335, "epoch": 0.7131820231474523, "flos": 26651894457600.0, "grad_norm": 1.5298656478489163, "language_loss": 0.66931856, "learning_rate": 8.025149072801849e-07, "loss": 0.69060439, "num_input_tokens_seen": 255931140, "step": 11862, "time_per_iteration": 2.7272536754608154 }, { "auxiliary_loss_clip": 0.01084067, "auxiliary_loss_mlp": 0.01040707, "balance_loss_clip": 1.03703427, "balance_loss_mlp": 1.02913761, "epoch": 0.7132421464001203, "flos": 29205609840000.0, "grad_norm": 1.958177792409317, "language_loss": 0.66627884, "learning_rate": 8.022029939445214e-07, "loss": 0.68752658, "num_input_tokens_seen": 255951665, "step": 11863, "time_per_iteration": 4.389364957809448 }, { "auxiliary_loss_clip": 0.01071831, "auxiliary_loss_mlp": 0.01047442, "balance_loss_clip": 1.03993106, "balance_loss_mlp": 1.03224277, "epoch": 0.7133022696527882, "flos": 23073611535360.0, "grad_norm": 1.9615071733998684, "language_loss": 0.65745306, "learning_rate": 8.018911260294414e-07, "loss": 0.67864573, "num_input_tokens_seen": 255970055, "step": 11864, "time_per_iteration": 2.7246596813201904 }, { "auxiliary_loss_clip": 0.01101997, "auxiliary_loss_mlp": 0.01032961, "balance_loss_clip": 1.03820133, "balance_loss_mlp": 1.01960993, "epoch": 0.7133623929054562, "flos": 17457111267840.0, "grad_norm": 1.8809252452747804, "language_loss": 0.86299706, "learning_rate": 8.015793035467697e-07, "loss": 0.8843466, "num_input_tokens_seen": 255987720, "step": 11865, "time_per_iteration": 4.186030149459839 }, { "auxiliary_loss_clip": 0.01071299, "auxiliary_loss_mlp": 0.01037479, "balance_loss_clip": 1.0331533, "balance_loss_mlp": 1.02338219, "epoch": 0.7134225161581241, "flos": 19536554448000.0, "grad_norm": 4.424017900151165, "language_loss": 0.75215453, "learning_rate": 8.012675265083304e-07, "loss": 0.77324229, "num_input_tokens_seen": 256005490, "step": 11866, "time_per_iteration": 4.38300085067749 }, { "auxiliary_loss_clip": 0.01075897, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.03858542, "balance_loss_mlp": 1.02196276, "epoch": 0.7134826394107922, "flos": 26250089944320.0, "grad_norm": 1.787273089908781, "language_loss": 0.70222098, "learning_rate": 8.009557949259464e-07, "loss": 0.72333616, "num_input_tokens_seen": 256026030, "step": 11867, "time_per_iteration": 2.7252299785614014 }, { "auxiliary_loss_clip": 0.0109972, "auxiliary_loss_mlp": 0.01030726, "balance_loss_clip": 1.03978539, "balance_loss_mlp": 1.01921654, "epoch": 0.7135427626634601, "flos": 15815311395840.0, "grad_norm": 2.323638504940392, "language_loss": 0.72056556, "learning_rate": 8.006441088114397e-07, "loss": 0.74186999, "num_input_tokens_seen": 256043680, "step": 11868, "time_per_iteration": 2.6166346073150635 }, { "auxiliary_loss_clip": 0.01063174, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.03661656, "balance_loss_mlp": 1.02014136, "epoch": 0.7136028859161281, "flos": 18223409041920.0, "grad_norm": 2.386444069797043, "language_loss": 0.66029108, "learning_rate": 8.003324681766286e-07, "loss": 0.68127489, "num_input_tokens_seen": 256059705, "step": 11869, "time_per_iteration": 2.6557157039642334 }, { "auxiliary_loss_clip": 0.01086038, "auxiliary_loss_mlp": 0.01027932, "balance_loss_clip": 1.03453624, "balance_loss_mlp": 1.01540327, "epoch": 0.713663009168796, "flos": 24314827956480.0, "grad_norm": 1.5122287108134371, "language_loss": 0.77901238, "learning_rate": 8.000208730333298e-07, "loss": 0.80015206, "num_input_tokens_seen": 256079785, "step": 11870, "time_per_iteration": 2.767284870147705 }, { "auxiliary_loss_clip": 0.01062535, "auxiliary_loss_mlp": 0.01035444, "balance_loss_clip": 1.03716147, "balance_loss_mlp": 1.02176499, "epoch": 0.713723132421464, "flos": 26538488242560.0, "grad_norm": 1.7572988726243002, "language_loss": 0.81102479, "learning_rate": 7.997093233933597e-07, "loss": 0.83200461, "num_input_tokens_seen": 256099000, "step": 11871, "time_per_iteration": 2.7799062728881836 }, { "auxiliary_loss_clip": 0.01081304, "auxiliary_loss_mlp": 0.01037741, "balance_loss_clip": 1.03814363, "balance_loss_mlp": 1.02452111, "epoch": 0.7137832556741319, "flos": 19865675790720.0, "grad_norm": 1.5739267518019031, "language_loss": 0.78791887, "learning_rate": 7.993978192685331e-07, "loss": 0.80910927, "num_input_tokens_seen": 256117985, "step": 11872, "time_per_iteration": 4.27405309677124 }, { "auxiliary_loss_clip": 0.01104458, "auxiliary_loss_mlp": 0.01029654, "balance_loss_clip": 1.04009414, "balance_loss_mlp": 1.0162369, "epoch": 0.7138433789267999, "flos": 21688932193920.0, "grad_norm": 2.3871550053143893, "language_loss": 0.84496498, "learning_rate": 7.990863606706606e-07, "loss": 0.86630619, "num_input_tokens_seen": 256134350, "step": 11873, "time_per_iteration": 2.6260197162628174 }, { "auxiliary_loss_clip": 0.01073276, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.03462076, "balance_loss_mlp": 1.02040982, "epoch": 0.713903502179468, "flos": 17602729004160.0, "grad_norm": 2.5229231415013116, "language_loss": 0.86355793, "learning_rate": 7.987749476115539e-07, "loss": 0.88461399, "num_input_tokens_seen": 256150610, "step": 11874, "time_per_iteration": 2.680554151535034 }, { "auxiliary_loss_clip": 0.01103576, "auxiliary_loss_mlp": 0.01031521, "balance_loss_clip": 1.0389179, "balance_loss_mlp": 1.01873553, "epoch": 0.7139636254321359, "flos": 18040336398720.0, "grad_norm": 1.760053080674637, "language_loss": 0.8337326, "learning_rate": 7.984635801030228e-07, "loss": 0.85508358, "num_input_tokens_seen": 256168620, "step": 11875, "time_per_iteration": 2.597926616668701 }, { "auxiliary_loss_clip": 0.01091056, "auxiliary_loss_mlp": 0.01038348, "balance_loss_clip": 1.03766298, "balance_loss_mlp": 1.02233863, "epoch": 0.7140237486848039, "flos": 23331127115520.0, "grad_norm": 1.7238625463047035, "language_loss": 0.69539726, "learning_rate": 7.981522581568721e-07, "loss": 0.71669132, "num_input_tokens_seen": 256186700, "step": 11876, "time_per_iteration": 2.7075090408325195 }, { "auxiliary_loss_clip": 0.01115515, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.04096556, "balance_loss_mlp": 1.02292919, "epoch": 0.7140838719374718, "flos": 16837077674880.0, "grad_norm": 1.7495986259479304, "language_loss": 0.78027952, "learning_rate": 7.978409817849079e-07, "loss": 0.80179715, "num_input_tokens_seen": 256205390, "step": 11877, "time_per_iteration": 2.579984188079834 }, { "auxiliary_loss_clip": 0.01100542, "auxiliary_loss_mlp": 0.01039441, "balance_loss_clip": 1.03897512, "balance_loss_mlp": 1.02755046, "epoch": 0.7141439951901398, "flos": 21142012734720.0, "grad_norm": 2.00893168794746, "language_loss": 0.69702816, "learning_rate": 7.97529750998934e-07, "loss": 0.71842802, "num_input_tokens_seen": 256224575, "step": 11878, "time_per_iteration": 2.7117369174957275 }, { "auxiliary_loss_clip": 0.01075067, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.03836131, "balance_loss_mlp": 1.024194, "epoch": 0.7142041184428077, "flos": 24717709877760.0, "grad_norm": 1.9345471164629369, "language_loss": 0.67564619, "learning_rate": 7.972185658107535e-07, "loss": 0.69675231, "num_input_tokens_seen": 256242130, "step": 11879, "time_per_iteration": 2.781487226486206 }, { "auxiliary_loss_clip": 0.01052886, "auxiliary_loss_mlp": 0.01039936, "balance_loss_clip": 1.03587782, "balance_loss_mlp": 1.02522612, "epoch": 0.7142642416954758, "flos": 21908202768000.0, "grad_norm": 2.4025708755379136, "language_loss": 0.68949473, "learning_rate": 7.969074262321646e-07, "loss": 0.71042299, "num_input_tokens_seen": 256261920, "step": 11880, "time_per_iteration": 2.7956559658050537 }, { "auxiliary_loss_clip": 0.01085326, "auxiliary_loss_mlp": 0.01038627, "balance_loss_clip": 1.03614664, "balance_loss_mlp": 1.02517402, "epoch": 0.7143243649481437, "flos": 20805636844800.0, "grad_norm": 2.7211845383040263, "language_loss": 0.80758023, "learning_rate": 7.965963322749674e-07, "loss": 0.82881975, "num_input_tokens_seen": 256277970, "step": 11881, "time_per_iteration": 2.7760164737701416 }, { "auxiliary_loss_clip": 0.01069489, "auxiliary_loss_mlp": 0.01031682, "balance_loss_clip": 1.03435218, "balance_loss_mlp": 1.01974893, "epoch": 0.7143844882008117, "flos": 27235011847680.0, "grad_norm": 1.9142544481843773, "language_loss": 0.63496864, "learning_rate": 7.962852839509579e-07, "loss": 0.65598035, "num_input_tokens_seen": 256298205, "step": 11882, "time_per_iteration": 2.8055615425109863 }, { "auxiliary_loss_clip": 0.01115484, "auxiliary_loss_mlp": 0.01033467, "balance_loss_clip": 1.0405947, "balance_loss_mlp": 1.02086067, "epoch": 0.7144446114534796, "flos": 17929623703680.0, "grad_norm": 1.6563668139876793, "language_loss": 0.68799591, "learning_rate": 7.959742812719304e-07, "loss": 0.70948541, "num_input_tokens_seen": 256316685, "step": 11883, "time_per_iteration": 2.6891119480133057 }, { "auxiliary_loss_clip": 0.0110208, "auxiliary_loss_mlp": 0.01037358, "balance_loss_clip": 1.04018784, "balance_loss_mlp": 1.02402401, "epoch": 0.7145047347061476, "flos": 20740962407040.0, "grad_norm": 1.7218148321096673, "language_loss": 0.77569342, "learning_rate": 7.956633242496788e-07, "loss": 0.79708779, "num_input_tokens_seen": 256334205, "step": 11884, "time_per_iteration": 2.6530849933624268 }, { "auxiliary_loss_clip": 0.01107156, "auxiliary_loss_mlp": 0.01036925, "balance_loss_clip": 1.0385685, "balance_loss_mlp": 1.02221453, "epoch": 0.7145648579588155, "flos": 21178605715200.0, "grad_norm": 4.109766479944614, "language_loss": 0.73748314, "learning_rate": 7.953524128959954e-07, "loss": 0.75892401, "num_input_tokens_seen": 256353340, "step": 11885, "time_per_iteration": 2.8627066612243652 }, { "auxiliary_loss_clip": 0.01014823, "auxiliary_loss_mlp": 0.00999083, "balance_loss_clip": 1.01118517, "balance_loss_mlp": 0.9980278, "epoch": 0.7146249812114835, "flos": 64784539509120.0, "grad_norm": 0.8971094917641942, "language_loss": 0.66321898, "learning_rate": 7.95041547222669e-07, "loss": 0.68335795, "num_input_tokens_seen": 256411550, "step": 11886, "time_per_iteration": 3.2624523639678955 }, { "auxiliary_loss_clip": 0.01068235, "auxiliary_loss_mlp": 0.01029631, "balance_loss_clip": 1.03730834, "balance_loss_mlp": 1.01627326, "epoch": 0.7146851044641516, "flos": 18113881495680.0, "grad_norm": 1.637061044438449, "language_loss": 0.74940675, "learning_rate": 7.947307272414874e-07, "loss": 0.77038538, "num_input_tokens_seen": 256430360, "step": 11887, "time_per_iteration": 2.951922655105591 }, { "auxiliary_loss_clip": 0.0110054, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.03856289, "balance_loss_mlp": 1.01542068, "epoch": 0.7147452277168195, "flos": 19243846517760.0, "grad_norm": 1.834582654468692, "language_loss": 0.71475005, "learning_rate": 7.944199529642372e-07, "loss": 0.73603028, "num_input_tokens_seen": 256449750, "step": 11888, "time_per_iteration": 2.7142348289489746 }, { "auxiliary_loss_clip": 0.01097744, "auxiliary_loss_mlp": 0.0103845, "balance_loss_clip": 1.03603697, "balance_loss_mlp": 1.02444923, "epoch": 0.7148053509694875, "flos": 23764712186880.0, "grad_norm": 1.9131125822464334, "language_loss": 0.84173727, "learning_rate": 7.941092244027041e-07, "loss": 0.86309922, "num_input_tokens_seen": 256467330, "step": 11889, "time_per_iteration": 2.7939958572387695 }, { "auxiliary_loss_clip": 0.01066177, "auxiliary_loss_mlp": 0.01028337, "balance_loss_clip": 1.04017806, "balance_loss_mlp": 1.01598644, "epoch": 0.7148654742221554, "flos": 22485322586880.0, "grad_norm": 1.7213621841277236, "language_loss": 0.76025808, "learning_rate": 7.937985415686695e-07, "loss": 0.78120321, "num_input_tokens_seen": 256485705, "step": 11890, "time_per_iteration": 2.909778594970703 }, { "auxiliary_loss_clip": 0.0106853, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.03322911, "balance_loss_mlp": 1.0240227, "epoch": 0.7149255974748234, "flos": 24679213476480.0, "grad_norm": 1.510956653160521, "language_loss": 0.74061215, "learning_rate": 7.934879044739147e-07, "loss": 0.76166284, "num_input_tokens_seen": 256504755, "step": 11891, "time_per_iteration": 2.870742082595825 }, { "auxiliary_loss_clip": 0.01069165, "auxiliary_loss_mlp": 0.01036831, "balance_loss_clip": 1.03776526, "balance_loss_mlp": 1.0234617, "epoch": 0.7149857207274913, "flos": 18405583845120.0, "grad_norm": 2.1855656268859565, "language_loss": 0.67586207, "learning_rate": 7.931773131302211e-07, "loss": 0.69692206, "num_input_tokens_seen": 256523670, "step": 11892, "time_per_iteration": 2.879074811935425 }, { "auxiliary_loss_clip": 0.01078901, "auxiliary_loss_mlp": 0.01034356, "balance_loss_clip": 1.03972173, "balance_loss_mlp": 1.02015805, "epoch": 0.7150458439801594, "flos": 24969515195520.0, "grad_norm": 1.7990304927260297, "language_loss": 0.737535, "learning_rate": 7.928667675493632e-07, "loss": 0.75866759, "num_input_tokens_seen": 256542225, "step": 11893, "time_per_iteration": 2.797793388366699 }, { "auxiliary_loss_clip": 0.01118028, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.04243374, "balance_loss_mlp": 1.01739264, "epoch": 0.7151059672328273, "flos": 16690777580160.0, "grad_norm": 2.922265419299714, "language_loss": 0.67378318, "learning_rate": 7.925562677431185e-07, "loss": 0.69527477, "num_input_tokens_seen": 256560730, "step": 11894, "time_per_iteration": 2.6411194801330566 }, { "auxiliary_loss_clip": 0.01079135, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.04023933, "balance_loss_mlp": 1.01957238, "epoch": 0.7151660904854953, "flos": 27271820309760.0, "grad_norm": 1.6674046722753406, "language_loss": 0.77498591, "learning_rate": 7.922458137232613e-07, "loss": 0.7960974, "num_input_tokens_seen": 256580505, "step": 11895, "time_per_iteration": 2.9311444759368896 }, { "auxiliary_loss_clip": 0.01102223, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.03921759, "balance_loss_mlp": 1.0176903, "epoch": 0.7152262137381632, "flos": 18332254229760.0, "grad_norm": 1.8566798780150704, "language_loss": 0.69233418, "learning_rate": 7.919354055015643e-07, "loss": 0.71367466, "num_input_tokens_seen": 256597330, "step": 11896, "time_per_iteration": 2.708909034729004 }, { "auxiliary_loss_clip": 0.010908, "auxiliary_loss_mlp": 0.01041603, "balance_loss_clip": 1.03788733, "balance_loss_mlp": 1.02761424, "epoch": 0.7152863369908312, "flos": 21799285752960.0, "grad_norm": 2.0196702259188952, "language_loss": 0.86874604, "learning_rate": 7.91625043089798e-07, "loss": 0.89007008, "num_input_tokens_seen": 256616030, "step": 11897, "time_per_iteration": 2.8452200889587402 }, { "auxiliary_loss_clip": 0.01091656, "auxiliary_loss_mlp": 0.01035463, "balance_loss_clip": 1.03988373, "balance_loss_mlp": 1.0220046, "epoch": 0.7153464602434991, "flos": 22158427887360.0, "grad_norm": 3.4189922155736965, "language_loss": 0.7799052, "learning_rate": 7.913147264997304e-07, "loss": 0.80117643, "num_input_tokens_seen": 256635570, "step": 11898, "time_per_iteration": 2.73362398147583 }, { "auxiliary_loss_clip": 0.01089871, "auxiliary_loss_mlp": 0.01033056, "balance_loss_clip": 1.03692102, "balance_loss_mlp": 1.01879835, "epoch": 0.7154065834961671, "flos": 24716057852160.0, "grad_norm": 2.2668196785220895, "language_loss": 0.73072803, "learning_rate": 7.910044557431302e-07, "loss": 0.7519573, "num_input_tokens_seen": 256655290, "step": 11899, "time_per_iteration": 2.7390663623809814 }, { "auxiliary_loss_clip": 0.01101493, "auxiliary_loss_mlp": 0.01034602, "balance_loss_clip": 1.03773189, "balance_loss_mlp": 1.02130437, "epoch": 0.7154667067488351, "flos": 22601494149120.0, "grad_norm": 5.969579255187867, "language_loss": 0.75829309, "learning_rate": 7.906942308317614e-07, "loss": 0.77965403, "num_input_tokens_seen": 256671605, "step": 11900, "time_per_iteration": 2.6649601459503174 }, { "auxiliary_loss_clip": 0.01103632, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.04124916, "balance_loss_mlp": 1.01839614, "epoch": 0.7155268300015031, "flos": 18771154513920.0, "grad_norm": 1.8695849033514778, "language_loss": 0.80723226, "learning_rate": 7.903840517773886e-07, "loss": 0.828578, "num_input_tokens_seen": 256689680, "step": 11901, "time_per_iteration": 2.7060022354125977 }, { "auxiliary_loss_clip": 0.01080211, "auxiliary_loss_mlp": 0.01038068, "balance_loss_clip": 1.03678465, "balance_loss_mlp": 1.02424598, "epoch": 0.7155869532541711, "flos": 18296343607680.0, "grad_norm": 1.8343268513832525, "language_loss": 0.81889194, "learning_rate": 7.900739185917744e-07, "loss": 0.84007472, "num_input_tokens_seen": 256707760, "step": 11902, "time_per_iteration": 2.7816693782806396 }, { "auxiliary_loss_clip": 0.01069017, "auxiliary_loss_mlp": 0.01030025, "balance_loss_clip": 1.03530717, "balance_loss_mlp": 1.01750159, "epoch": 0.715647076506839, "flos": 11980805783040.0, "grad_norm": 1.7267279020747466, "language_loss": 0.68092871, "learning_rate": 7.897638312866785e-07, "loss": 0.70191914, "num_input_tokens_seen": 256724150, "step": 11903, "time_per_iteration": 4.382705926895142 }, { "auxiliary_loss_clip": 0.0106915, "auxiliary_loss_mlp": 0.01031243, "balance_loss_clip": 1.03447473, "balance_loss_mlp": 1.01918483, "epoch": 0.715707199759507, "flos": 18951641377920.0, "grad_norm": 5.343255365048286, "language_loss": 0.75641096, "learning_rate": 7.894537898738589e-07, "loss": 0.77741492, "num_input_tokens_seen": 256742780, "step": 11904, "time_per_iteration": 4.288340330123901 }, { "auxiliary_loss_clip": 0.01091072, "auxiliary_loss_mlp": 0.01039419, "balance_loss_clip": 1.03938174, "balance_loss_mlp": 1.02566779, "epoch": 0.7157673230121749, "flos": 15304410299520.0, "grad_norm": 2.088773074445301, "language_loss": 0.72025734, "learning_rate": 7.891437943650727e-07, "loss": 0.74156225, "num_input_tokens_seen": 256761355, "step": 11905, "time_per_iteration": 4.343631267547607 }, { "auxiliary_loss_clip": 0.01077244, "auxiliary_loss_mlp": 0.0103248, "balance_loss_clip": 1.03842819, "balance_loss_mlp": 1.02001703, "epoch": 0.715827446264843, "flos": 23221850964480.0, "grad_norm": 1.657779728748099, "language_loss": 0.779338, "learning_rate": 7.88833844772076e-07, "loss": 0.8004353, "num_input_tokens_seen": 256781335, "step": 11906, "time_per_iteration": 2.8104159832000732 }, { "auxiliary_loss_clip": 0.01014211, "auxiliary_loss_mlp": 0.0099711, "balance_loss_clip": 1.01162815, "balance_loss_mlp": 0.99602473, "epoch": 0.7158875695175109, "flos": 60975421833600.0, "grad_norm": 0.7366961855147857, "language_loss": 0.55325353, "learning_rate": 7.885239411066205e-07, "loss": 0.57336664, "num_input_tokens_seen": 256838890, "step": 11907, "time_per_iteration": 3.1521129608154297 }, { "auxiliary_loss_clip": 0.01094066, "auxiliary_loss_mlp": 0.01039845, "balance_loss_clip": 1.03540492, "balance_loss_mlp": 1.02677381, "epoch": 0.7159476927701789, "flos": 17128780024320.0, "grad_norm": 1.89939740443007, "language_loss": 0.69593656, "learning_rate": 7.882140833804593e-07, "loss": 0.71727568, "num_input_tokens_seen": 256858145, "step": 11908, "time_per_iteration": 2.6724538803100586 }, { "auxiliary_loss_clip": 0.01059783, "auxiliary_loss_mlp": 0.01036603, "balance_loss_clip": 1.03303337, "balance_loss_mlp": 1.02254832, "epoch": 0.7160078160228468, "flos": 22490601886080.0, "grad_norm": 1.6751841094057447, "language_loss": 0.71237969, "learning_rate": 7.879042716053415e-07, "loss": 0.7333436, "num_input_tokens_seen": 256878545, "step": 11909, "time_per_iteration": 2.779273509979248 }, { "auxiliary_loss_clip": 0.01099917, "auxiliary_loss_mlp": 0.01030028, "balance_loss_clip": 1.0387938, "balance_loss_mlp": 1.01755881, "epoch": 0.7160679392755148, "flos": 30590935626240.0, "grad_norm": 1.4959151522362304, "language_loss": 0.75010902, "learning_rate": 7.875945057930144e-07, "loss": 0.7714085, "num_input_tokens_seen": 256899920, "step": 11910, "time_per_iteration": 2.7424912452697754 }, { "auxiliary_loss_clip": 0.01085268, "auxiliary_loss_mlp": 0.01034213, "balance_loss_clip": 1.0382638, "balance_loss_mlp": 1.02263737, "epoch": 0.7161280625281827, "flos": 21323648833920.0, "grad_norm": 1.5302691845486787, "language_loss": 0.76587963, "learning_rate": 7.872847859552251e-07, "loss": 0.78707445, "num_input_tokens_seen": 256918460, "step": 11911, "time_per_iteration": 4.259274244308472 }, { "auxiliary_loss_clip": 0.01069944, "auxiliary_loss_mlp": 0.01043229, "balance_loss_clip": 1.03755224, "balance_loss_mlp": 1.02831018, "epoch": 0.7161881857808508, "flos": 61860078921600.0, "grad_norm": 1.649161828071685, "language_loss": 0.58413363, "learning_rate": 7.869751121037192e-07, "loss": 0.60526532, "num_input_tokens_seen": 256942015, "step": 11912, "time_per_iteration": 3.1699318885803223 }, { "auxiliary_loss_clip": 0.01101612, "auxiliary_loss_mlp": 0.01031486, "balance_loss_clip": 1.04070008, "balance_loss_mlp": 1.01849806, "epoch": 0.7162483090335187, "flos": 20812101292800.0, "grad_norm": 1.859500164824888, "language_loss": 0.7810173, "learning_rate": 7.866654842502376e-07, "loss": 0.80234826, "num_input_tokens_seen": 256961065, "step": 11913, "time_per_iteration": 2.704882860183716 }, { "auxiliary_loss_clip": 0.01087765, "auxiliary_loss_mlp": 0.0102754, "balance_loss_clip": 1.03807175, "balance_loss_mlp": 1.01646566, "epoch": 0.7163084322861867, "flos": 24097532630400.0, "grad_norm": 1.6076637682641197, "language_loss": 0.74075729, "learning_rate": 7.863559024065234e-07, "loss": 0.76191038, "num_input_tokens_seen": 256982165, "step": 11914, "time_per_iteration": 2.7636988162994385 }, { "auxiliary_loss_clip": 0.01075409, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.036044, "balance_loss_mlp": 1.02074111, "epoch": 0.7163685555388547, "flos": 20080888128000.0, "grad_norm": 1.6922973692533387, "language_loss": 0.74138194, "learning_rate": 7.860463665843143e-07, "loss": 0.76246703, "num_input_tokens_seen": 256999825, "step": 11915, "time_per_iteration": 2.816134452819824 }, { "auxiliary_loss_clip": 0.01111475, "auxiliary_loss_mlp": 0.01032503, "balance_loss_clip": 1.0383029, "balance_loss_mlp": 1.02015853, "epoch": 0.7164286787915226, "flos": 17456967613440.0, "grad_norm": 2.8016306362793353, "language_loss": 0.80886412, "learning_rate": 7.85736876795349e-07, "loss": 0.83030391, "num_input_tokens_seen": 257017450, "step": 11916, "time_per_iteration": 2.666930675506592 }, { "auxiliary_loss_clip": 0.01033862, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.03228307, "balance_loss_mlp": 1.0218699, "epoch": 0.7164888020441906, "flos": 19718908819200.0, "grad_norm": 1.9058816458994292, "language_loss": 0.6875428, "learning_rate": 7.854274330513626e-07, "loss": 0.70822579, "num_input_tokens_seen": 257035465, "step": 11917, "time_per_iteration": 3.0043599605560303 }, { "auxiliary_loss_clip": 0.0108964, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.03826666, "balance_loss_mlp": 1.0224905, "epoch": 0.7165489252968585, "flos": 21470523546240.0, "grad_norm": 2.1418903876984614, "language_loss": 0.75930321, "learning_rate": 7.851180353640896e-07, "loss": 0.78055626, "num_input_tokens_seen": 257053750, "step": 11918, "time_per_iteration": 2.8666863441467285 }, { "auxiliary_loss_clip": 0.01012914, "auxiliary_loss_mlp": 0.0100742, "balance_loss_clip": 1.00994635, "balance_loss_mlp": 1.00643027, "epoch": 0.7166090485495266, "flos": 69928060464000.0, "grad_norm": 0.6290817017745445, "language_loss": 0.53839982, "learning_rate": 7.848086837452639e-07, "loss": 0.55860317, "num_input_tokens_seen": 257121215, "step": 11919, "time_per_iteration": 3.3189728260040283 }, { "auxiliary_loss_clip": 0.01090721, "auxiliary_loss_mlp": 0.0103132, "balance_loss_clip": 1.03968215, "balance_loss_mlp": 1.01944053, "epoch": 0.7166691718021945, "flos": 27343892949120.0, "grad_norm": 2.4558245905246188, "language_loss": 0.68792629, "learning_rate": 7.844993782066132e-07, "loss": 0.70914674, "num_input_tokens_seen": 257143370, "step": 11920, "time_per_iteration": 2.7760236263275146 }, { "auxiliary_loss_clip": 0.01093244, "auxiliary_loss_mlp": 0.01042352, "balance_loss_clip": 1.03837049, "balance_loss_mlp": 1.02936387, "epoch": 0.7167292950548625, "flos": 30408868563840.0, "grad_norm": 1.7838996304195904, "language_loss": 0.75269383, "learning_rate": 7.841901187598678e-07, "loss": 0.77404976, "num_input_tokens_seen": 257162160, "step": 11921, "time_per_iteration": 2.775209426879883 }, { "auxiliary_loss_clip": 0.01081729, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.04076838, "balance_loss_mlp": 1.02052867, "epoch": 0.7167894183075304, "flos": 14571257800320.0, "grad_norm": 2.2701090477680546, "language_loss": 0.75837505, "learning_rate": 7.83880905416755e-07, "loss": 0.77955317, "num_input_tokens_seen": 257179300, "step": 11922, "time_per_iteration": 2.7607452869415283 }, { "auxiliary_loss_clip": 0.01014406, "auxiliary_loss_mlp": 0.01014898, "balance_loss_clip": 1.00970268, "balance_loss_mlp": 1.01383746, "epoch": 0.7168495415601984, "flos": 64110674407680.0, "grad_norm": 0.7523286809102585, "language_loss": 0.55089313, "learning_rate": 7.83571738189001e-07, "loss": 0.57118618, "num_input_tokens_seen": 257235470, "step": 11923, "time_per_iteration": 3.0676429271698 }, { "auxiliary_loss_clip": 0.01080014, "auxiliary_loss_mlp": 0.01037915, "balance_loss_clip": 1.03623641, "balance_loss_mlp": 1.024611, "epoch": 0.7169096648128663, "flos": 24681440119680.0, "grad_norm": 1.4525334689031153, "language_loss": 0.7698282, "learning_rate": 7.832626170883279e-07, "loss": 0.79100752, "num_input_tokens_seen": 257255850, "step": 11924, "time_per_iteration": 2.823679208755493 }, { "auxiliary_loss_clip": 0.01078337, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.0381155, "balance_loss_mlp": 1.02288294, "epoch": 0.7169697880655344, "flos": 20667525050880.0, "grad_norm": 1.7352538037253364, "language_loss": 0.68109524, "learning_rate": 7.829535421264588e-07, "loss": 0.70222354, "num_input_tokens_seen": 257275425, "step": 11925, "time_per_iteration": 2.7586591243743896 }, { "auxiliary_loss_clip": 0.01080533, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.03722239, "balance_loss_mlp": 1.02085745, "epoch": 0.7170299113182023, "flos": 21032700670080.0, "grad_norm": 1.565689357795704, "language_loss": 0.77380347, "learning_rate": 7.826445133151133e-07, "loss": 0.79493719, "num_input_tokens_seen": 257295740, "step": 11926, "time_per_iteration": 2.777597188949585 }, { "auxiliary_loss_clip": 0.01099959, "auxiliary_loss_mlp": 0.00771085, "balance_loss_clip": 1.03791356, "balance_loss_mlp": 1.00019264, "epoch": 0.7170900345708703, "flos": 22893304239360.0, "grad_norm": 1.9891447928832446, "language_loss": 0.77106082, "learning_rate": 7.823355306660093e-07, "loss": 0.78977132, "num_input_tokens_seen": 257315970, "step": 11927, "time_per_iteration": 2.722008228302002 }, { "auxiliary_loss_clip": 0.01103176, "auxiliary_loss_mlp": 0.01032942, "balance_loss_clip": 1.04161656, "balance_loss_mlp": 1.01948345, "epoch": 0.7171501578235383, "flos": 15518688883200.0, "grad_norm": 1.5109458575320354, "language_loss": 0.69240952, "learning_rate": 7.820265941908642e-07, "loss": 0.71377075, "num_input_tokens_seen": 257334230, "step": 11928, "time_per_iteration": 2.685173511505127 }, { "auxiliary_loss_clip": 0.01063212, "auxiliary_loss_mlp": 0.01033233, "balance_loss_clip": 1.03615737, "balance_loss_mlp": 1.02093053, "epoch": 0.7172102810762062, "flos": 26104292640000.0, "grad_norm": 1.8437632186543573, "language_loss": 0.64895999, "learning_rate": 7.817177039013931e-07, "loss": 0.66992444, "num_input_tokens_seen": 257352145, "step": 11929, "time_per_iteration": 2.811458110809326 }, { "auxiliary_loss_clip": 0.01084354, "auxiliary_loss_mlp": 0.0103302, "balance_loss_clip": 1.03474772, "balance_loss_mlp": 1.0201571, "epoch": 0.7172704043288742, "flos": 21506649649920.0, "grad_norm": 4.025535473729134, "language_loss": 0.70036447, "learning_rate": 7.81408859809308e-07, "loss": 0.72153819, "num_input_tokens_seen": 257371460, "step": 11930, "time_per_iteration": 2.7018861770629883 }, { "auxiliary_loss_clip": 0.01073615, "auxiliary_loss_mlp": 0.01032596, "balance_loss_clip": 1.03261399, "balance_loss_mlp": 1.01994824, "epoch": 0.7173305275815421, "flos": 18770939032320.0, "grad_norm": 1.865130875534894, "language_loss": 0.80753005, "learning_rate": 7.811000619263219e-07, "loss": 0.82859218, "num_input_tokens_seen": 257390800, "step": 11931, "time_per_iteration": 2.814512252807617 }, { "auxiliary_loss_clip": 0.01099893, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.03860784, "balance_loss_mlp": 1.02030206, "epoch": 0.7173906508342102, "flos": 16179876483840.0, "grad_norm": 2.1237811167102967, "language_loss": 0.77989686, "learning_rate": 7.80791310264143e-07, "loss": 0.80121714, "num_input_tokens_seen": 257407495, "step": 11932, "time_per_iteration": 2.643590211868286 }, { "auxiliary_loss_clip": 0.01094325, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.03725207, "balance_loss_mlp": 1.01856303, "epoch": 0.7174507740868781, "flos": 26613864933120.0, "grad_norm": 1.4329540611911684, "language_loss": 0.75208265, "learning_rate": 7.804826048344803e-07, "loss": 0.77333677, "num_input_tokens_seen": 257429675, "step": 11933, "time_per_iteration": 2.73256254196167 }, { "auxiliary_loss_clip": 0.01118631, "auxiliary_loss_mlp": 0.01038608, "balance_loss_clip": 1.04044771, "balance_loss_mlp": 1.02359951, "epoch": 0.7175108973395461, "flos": 18432911116800.0, "grad_norm": 2.5273912434143537, "language_loss": 0.69165599, "learning_rate": 7.801739456490388e-07, "loss": 0.71322834, "num_input_tokens_seen": 257442765, "step": 11934, "time_per_iteration": 2.63053822517395 }, { "auxiliary_loss_clip": 0.01101966, "auxiliary_loss_mlp": 0.01034522, "balance_loss_clip": 1.03851914, "balance_loss_mlp": 1.02134395, "epoch": 0.717571020592214, "flos": 23914962777600.0, "grad_norm": 2.3786346670781886, "language_loss": 0.86663944, "learning_rate": 7.798653327195237e-07, "loss": 0.88800436, "num_input_tokens_seen": 257459310, "step": 11935, "time_per_iteration": 2.7059433460235596 }, { "auxiliary_loss_clip": 0.01068502, "auxiliary_loss_mlp": 0.01030899, "balance_loss_clip": 1.03335261, "balance_loss_mlp": 1.01750588, "epoch": 0.717631143844882, "flos": 38256930109440.0, "grad_norm": 1.5650811923001593, "language_loss": 0.73900878, "learning_rate": 7.795567660576388e-07, "loss": 0.76000285, "num_input_tokens_seen": 257484750, "step": 11936, "time_per_iteration": 2.8850317001342773 }, { "auxiliary_loss_clip": 0.01029429, "auxiliary_loss_mlp": 0.01001743, "balance_loss_clip": 1.00656271, "balance_loss_mlp": 1.00076544, "epoch": 0.7176912670975499, "flos": 65515896328320.0, "grad_norm": 0.7545285974494826, "language_loss": 0.55848956, "learning_rate": 7.79248245675082e-07, "loss": 0.57880127, "num_input_tokens_seen": 257543110, "step": 11937, "time_per_iteration": 3.1446144580841064 }, { "auxiliary_loss_clip": 0.01104456, "auxiliary_loss_mlp": 0.01037308, "balance_loss_clip": 1.03975892, "balance_loss_mlp": 1.02318776, "epoch": 0.717751390350218, "flos": 31281066610560.0, "grad_norm": 1.8325127153814165, "language_loss": 0.54673332, "learning_rate": 7.789397715835542e-07, "loss": 0.568151, "num_input_tokens_seen": 257567410, "step": 11938, "time_per_iteration": 2.7281179428100586 }, { "auxiliary_loss_clip": 0.01098499, "auxiliary_loss_mlp": 0.01031442, "balance_loss_clip": 1.03886163, "balance_loss_mlp": 1.01891303, "epoch": 0.7178115136028859, "flos": 19859031774720.0, "grad_norm": 1.5418999350026745, "language_loss": 0.76693535, "learning_rate": 7.786313437947527e-07, "loss": 0.78823477, "num_input_tokens_seen": 257586270, "step": 11939, "time_per_iteration": 2.681007146835327 }, { "auxiliary_loss_clip": 0.01013928, "auxiliary_loss_mlp": 0.01000101, "balance_loss_clip": 1.01088846, "balance_loss_mlp": 0.99894488, "epoch": 0.7178716368555539, "flos": 64348655967360.0, "grad_norm": 0.7513743107787466, "language_loss": 0.61356354, "learning_rate": 7.783229623203738e-07, "loss": 0.63370389, "num_input_tokens_seen": 257647415, "step": 11940, "time_per_iteration": 3.202899694442749 }, { "auxiliary_loss_clip": 0.01071936, "auxiliary_loss_mlp": 0.01033376, "balance_loss_clip": 1.03435445, "balance_loss_mlp": 1.02100182, "epoch": 0.7179317601082219, "flos": 26762607152640.0, "grad_norm": 1.8000940083228283, "language_loss": 0.58835107, "learning_rate": 7.780146271721097e-07, "loss": 0.60940421, "num_input_tokens_seen": 257669795, "step": 11941, "time_per_iteration": 2.8157269954681396 }, { "auxiliary_loss_clip": 0.01090967, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.03997254, "balance_loss_mlp": 1.02213192, "epoch": 0.7179918833608898, "flos": 23513804709120.0, "grad_norm": 1.9761608738591345, "language_loss": 0.79027683, "learning_rate": 7.777063383616543e-07, "loss": 0.8115406, "num_input_tokens_seen": 257687415, "step": 11942, "time_per_iteration": 4.7641441822052 }, { "auxiliary_loss_clip": 0.01101717, "auxiliary_loss_mlp": 0.01043851, "balance_loss_clip": 1.03940737, "balance_loss_mlp": 1.03082132, "epoch": 0.7180520066135578, "flos": 17165588486400.0, "grad_norm": 2.14920903348502, "language_loss": 0.66369361, "learning_rate": 7.773980959006968e-07, "loss": 0.68514931, "num_input_tokens_seen": 257706215, "step": 11943, "time_per_iteration": 4.182480335235596 }, { "auxiliary_loss_clip": 0.01111064, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.03972828, "balance_loss_mlp": 1.01943135, "epoch": 0.7181121298662257, "flos": 17566638814080.0, "grad_norm": 1.806449010910671, "language_loss": 0.79078984, "learning_rate": 7.770898998009254e-07, "loss": 0.81222498, "num_input_tokens_seen": 257724740, "step": 11944, "time_per_iteration": 2.5949878692626953 }, { "auxiliary_loss_clip": 0.01088381, "auxiliary_loss_mlp": 0.00771584, "balance_loss_clip": 1.0390811, "balance_loss_mlp": 1.00018096, "epoch": 0.7181722531188938, "flos": 11947660508160.0, "grad_norm": 2.453862625605413, "language_loss": 0.63021427, "learning_rate": 7.767817500740277e-07, "loss": 0.64881396, "num_input_tokens_seen": 257742060, "step": 11945, "time_per_iteration": 4.4570722579956055 }, { "auxiliary_loss_clip": 0.01016433, "auxiliary_loss_mlp": 0.01004566, "balance_loss_clip": 1.00740266, "balance_loss_mlp": 1.00340927, "epoch": 0.7182323763715617, "flos": 65503649790720.0, "grad_norm": 0.7009639524775984, "language_loss": 0.51063281, "learning_rate": 7.76473646731689e-07, "loss": 0.53084278, "num_input_tokens_seen": 257802250, "step": 11946, "time_per_iteration": 3.083326816558838 }, { "auxiliary_loss_clip": 0.01082274, "auxiliary_loss_mlp": 0.01035503, "balance_loss_clip": 1.03858232, "balance_loss_mlp": 1.02061403, "epoch": 0.7182924996242297, "flos": 20630932070400.0, "grad_norm": 1.6221308546961208, "language_loss": 0.74305403, "learning_rate": 7.761655897855925e-07, "loss": 0.7642318, "num_input_tokens_seen": 257821155, "step": 11947, "time_per_iteration": 2.690142869949341 }, { "auxiliary_loss_clip": 0.01063215, "auxiliary_loss_mlp": 0.00770856, "balance_loss_clip": 1.03264832, "balance_loss_mlp": 1.0000999, "epoch": 0.7183526228768976, "flos": 16216433550720.0, "grad_norm": 1.4641702489475559, "language_loss": 0.72301382, "learning_rate": 7.758575792474187e-07, "loss": 0.74135453, "num_input_tokens_seen": 257839905, "step": 11948, "time_per_iteration": 2.722843647003174 }, { "auxiliary_loss_clip": 0.01090958, "auxiliary_loss_mlp": 0.01044843, "balance_loss_clip": 1.03650224, "balance_loss_mlp": 1.0302515, "epoch": 0.7184127461295656, "flos": 22232655342720.0, "grad_norm": 1.5800605567869153, "language_loss": 0.71426845, "learning_rate": 7.755496151288483e-07, "loss": 0.73562646, "num_input_tokens_seen": 257860055, "step": 11949, "time_per_iteration": 2.6724255084991455 }, { "auxiliary_loss_clip": 0.01110775, "auxiliary_loss_mlp": 0.00770919, "balance_loss_clip": 1.03964746, "balance_loss_mlp": 1.00022686, "epoch": 0.7184728693822335, "flos": 27344503480320.0, "grad_norm": 2.2408917866135116, "language_loss": 0.76207352, "learning_rate": 7.752416974415598e-07, "loss": 0.78089041, "num_input_tokens_seen": 257879315, "step": 11950, "time_per_iteration": 4.192263603210449 }, { "auxiliary_loss_clip": 0.011156, "auxiliary_loss_mlp": 0.01034636, "balance_loss_clip": 1.04076946, "balance_loss_mlp": 1.02039647, "epoch": 0.7185329926349016, "flos": 16508530949760.0, "grad_norm": 2.236939541243443, "language_loss": 0.67911047, "learning_rate": 7.749338261972282e-07, "loss": 0.70061278, "num_input_tokens_seen": 257896570, "step": 11951, "time_per_iteration": 2.506354808807373 }, { "auxiliary_loss_clip": 0.01093328, "auxiliary_loss_mlp": 0.01038497, "balance_loss_clip": 1.03931642, "balance_loss_mlp": 1.02329814, "epoch": 0.7185931158875695, "flos": 23951052967680.0, "grad_norm": 1.74286410335133, "language_loss": 0.78158391, "learning_rate": 7.746260014075286e-07, "loss": 0.8029021, "num_input_tokens_seen": 257916855, "step": 11952, "time_per_iteration": 2.660937547683716 }, { "auxiliary_loss_clip": 0.01106031, "auxiliary_loss_mlp": 0.01036025, "balance_loss_clip": 1.03961015, "balance_loss_mlp": 1.02241182, "epoch": 0.7186532391402375, "flos": 26542007775360.0, "grad_norm": 1.8142092778297234, "language_loss": 0.74966663, "learning_rate": 7.743182230841352e-07, "loss": 0.77108717, "num_input_tokens_seen": 257937140, "step": 11953, "time_per_iteration": 2.64990234375 }, { "auxiliary_loss_clip": 0.01104406, "auxiliary_loss_mlp": 0.010347, "balance_loss_clip": 1.03859532, "balance_loss_mlp": 1.0209074, "epoch": 0.7187133623929055, "flos": 22383049587840.0, "grad_norm": 1.8633986860843366, "language_loss": 0.73231012, "learning_rate": 7.740104912387164e-07, "loss": 0.75370121, "num_input_tokens_seen": 257956785, "step": 11954, "time_per_iteration": 2.667728900909424 }, { "auxiliary_loss_clip": 0.01092336, "auxiliary_loss_mlp": 0.01037743, "balance_loss_clip": 1.04056668, "balance_loss_mlp": 1.02468944, "epoch": 0.7187734856455734, "flos": 15779580341760.0, "grad_norm": 1.6371467452088548, "language_loss": 0.7436921, "learning_rate": 7.737028058829425e-07, "loss": 0.76499295, "num_input_tokens_seen": 257975455, "step": 11955, "time_per_iteration": 2.750943660736084 }, { "auxiliary_loss_clip": 0.01077053, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.03667569, "balance_loss_mlp": 1.02145171, "epoch": 0.7188336088982414, "flos": 31759612531200.0, "grad_norm": 1.63362456002572, "language_loss": 0.73112231, "learning_rate": 7.733951670284817e-07, "loss": 0.75224108, "num_input_tokens_seen": 257996850, "step": 11956, "time_per_iteration": 2.7964000701904297 }, { "auxiliary_loss_clip": 0.01027108, "auxiliary_loss_mlp": 0.01054242, "balance_loss_clip": 1.0295012, "balance_loss_mlp": 1.0388875, "epoch": 0.7188937321509093, "flos": 21465208333440.0, "grad_norm": 1.634055582279059, "language_loss": 0.71066529, "learning_rate": 7.730875746869987e-07, "loss": 0.73147881, "num_input_tokens_seen": 258016145, "step": 11957, "time_per_iteration": 2.920449733734131 }, { "auxiliary_loss_clip": 0.01066083, "auxiliary_loss_mlp": 0.01046033, "balance_loss_clip": 1.03746307, "balance_loss_mlp": 1.03144193, "epoch": 0.7189538554035774, "flos": 27271497087360.0, "grad_norm": 1.9298649142974575, "language_loss": 0.73817873, "learning_rate": 7.727800288701582e-07, "loss": 0.75929987, "num_input_tokens_seen": 258035420, "step": 11958, "time_per_iteration": 2.8204050064086914 }, { "auxiliary_loss_clip": 0.01097894, "auxiliary_loss_mlp": 0.01043657, "balance_loss_clip": 1.03673959, "balance_loss_mlp": 1.03006124, "epoch": 0.7190139786562453, "flos": 21580625710080.0, "grad_norm": 1.5794968369614186, "language_loss": 0.83998394, "learning_rate": 7.724725295896215e-07, "loss": 0.86139941, "num_input_tokens_seen": 258053520, "step": 11959, "time_per_iteration": 2.7135143280029297 }, { "auxiliary_loss_clip": 0.01118944, "auxiliary_loss_mlp": 0.01033809, "balance_loss_clip": 1.04263496, "balance_loss_mlp": 1.0193491, "epoch": 0.7190741019089133, "flos": 26721237663360.0, "grad_norm": 1.6672676962556263, "language_loss": 0.81917083, "learning_rate": 7.7216507685705e-07, "loss": 0.84069836, "num_input_tokens_seen": 258073020, "step": 11960, "time_per_iteration": 2.6510887145996094 }, { "auxiliary_loss_clip": 0.01085237, "auxiliary_loss_mlp": 0.01040267, "balance_loss_clip": 1.03664184, "balance_loss_mlp": 1.02624774, "epoch": 0.7191342251615812, "flos": 26104759516800.0, "grad_norm": 1.541177269995967, "language_loss": 0.77309084, "learning_rate": 7.718576706841013e-07, "loss": 0.79434586, "num_input_tokens_seen": 258093155, "step": 11961, "time_per_iteration": 2.720644950866699 }, { "auxiliary_loss_clip": 0.01093865, "auxiliary_loss_mlp": 0.01034927, "balance_loss_clip": 1.03698349, "balance_loss_mlp": 1.02280951, "epoch": 0.7191943484142492, "flos": 22967028904320.0, "grad_norm": 1.422930099710146, "language_loss": 0.75150669, "learning_rate": 7.715503110824326e-07, "loss": 0.7727946, "num_input_tokens_seen": 258113905, "step": 11962, "time_per_iteration": 2.602642774581909 }, { "auxiliary_loss_clip": 0.01101563, "auxiliary_loss_mlp": 0.01033205, "balance_loss_clip": 1.03852582, "balance_loss_mlp": 1.01952553, "epoch": 0.7192544716669171, "flos": 22565332131840.0, "grad_norm": 1.6830971031616218, "language_loss": 0.74998534, "learning_rate": 7.712429980637001e-07, "loss": 0.77133304, "num_input_tokens_seen": 258132820, "step": 11963, "time_per_iteration": 2.6065595149993896 }, { "auxiliary_loss_clip": 0.01076507, "auxiliary_loss_mlp": 0.0103598, "balance_loss_clip": 1.03903389, "balance_loss_mlp": 1.02130532, "epoch": 0.7193145949195852, "flos": 18982200873600.0, "grad_norm": 2.2290722742706253, "language_loss": 0.80742419, "learning_rate": 7.709357316395564e-07, "loss": 0.82854903, "num_input_tokens_seen": 258148055, "step": 11964, "time_per_iteration": 2.623037338256836 }, { "auxiliary_loss_clip": 0.0110166, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.03931797, "balance_loss_mlp": 1.02267718, "epoch": 0.7193747181722531, "flos": 18004246208640.0, "grad_norm": 1.8511533341084931, "language_loss": 0.74847329, "learning_rate": 7.70628511821652e-07, "loss": 0.76984644, "num_input_tokens_seen": 258165995, "step": 11965, "time_per_iteration": 2.6308131217956543 }, { "auxiliary_loss_clip": 0.01088669, "auxiliary_loss_mlp": 0.0103597, "balance_loss_clip": 1.04116011, "balance_loss_mlp": 1.02225494, "epoch": 0.7194348414249211, "flos": 24389414547840.0, "grad_norm": 1.5072398598153138, "language_loss": 0.77484959, "learning_rate": 7.703213386216377e-07, "loss": 0.79609603, "num_input_tokens_seen": 258186165, "step": 11966, "time_per_iteration": 2.7064943313598633 }, { "auxiliary_loss_clip": 0.0108693, "auxiliary_loss_mlp": 0.01040354, "balance_loss_clip": 1.03570664, "balance_loss_mlp": 1.02598929, "epoch": 0.7194949646775891, "flos": 22163455791360.0, "grad_norm": 2.094523780207328, "language_loss": 0.72974217, "learning_rate": 7.700142120511619e-07, "loss": 0.75101507, "num_input_tokens_seen": 258204595, "step": 11967, "time_per_iteration": 2.6798341274261475 }, { "auxiliary_loss_clip": 0.01084414, "auxiliary_loss_mlp": 0.01030462, "balance_loss_clip": 1.03810835, "balance_loss_mlp": 1.01876187, "epoch": 0.719555087930257, "flos": 20266366982400.0, "grad_norm": 1.6400995747939784, "language_loss": 0.81876254, "learning_rate": 7.6970713212187e-07, "loss": 0.83991134, "num_input_tokens_seen": 258223110, "step": 11968, "time_per_iteration": 2.5945241451263428 }, { "auxiliary_loss_clip": 0.01090809, "auxiliary_loss_mlp": 0.01030819, "balance_loss_clip": 1.03921008, "balance_loss_mlp": 1.01730037, "epoch": 0.719615211182925, "flos": 24716309247360.0, "grad_norm": 6.059293757732166, "language_loss": 0.76039946, "learning_rate": 7.69400098845407e-07, "loss": 0.78161573, "num_input_tokens_seen": 258242660, "step": 11969, "time_per_iteration": 2.669769763946533 }, { "auxiliary_loss_clip": 0.01071764, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.03422332, "balance_loss_mlp": 1.02085924, "epoch": 0.719675334435593, "flos": 20009641501440.0, "grad_norm": 1.7540280095141672, "language_loss": 0.71060121, "learning_rate": 7.69093112233417e-07, "loss": 0.7316696, "num_input_tokens_seen": 258261850, "step": 11970, "time_per_iteration": 2.679556131362915 }, { "auxiliary_loss_clip": 0.01013659, "auxiliary_loss_mlp": 0.01009131, "balance_loss_clip": 1.00968122, "balance_loss_mlp": 1.00800419, "epoch": 0.719735457688261, "flos": 44199861177600.0, "grad_norm": 0.9164669258671052, "language_loss": 0.60825729, "learning_rate": 7.68786172297538e-07, "loss": 0.6284852, "num_input_tokens_seen": 258312570, "step": 11971, "time_per_iteration": 3.07918381690979 }, { "auxiliary_loss_clip": 0.01119878, "auxiliary_loss_mlp": 0.0103657, "balance_loss_clip": 1.04122591, "balance_loss_mlp": 1.02223504, "epoch": 0.7197955809409289, "flos": 16802890905600.0, "grad_norm": 2.0890119632055772, "language_loss": 0.80200607, "learning_rate": 7.684792790494105e-07, "loss": 0.82357055, "num_input_tokens_seen": 258331600, "step": 11972, "time_per_iteration": 2.6157615184783936 }, { "auxiliary_loss_clip": 0.01094231, "auxiliary_loss_mlp": 0.01036827, "balance_loss_clip": 1.03909624, "balance_loss_mlp": 1.02286744, "epoch": 0.7198557041935969, "flos": 24535391420160.0, "grad_norm": 1.4459296159534718, "language_loss": 0.75361621, "learning_rate": 7.681724325006733e-07, "loss": 0.77492678, "num_input_tokens_seen": 258351785, "step": 11973, "time_per_iteration": 2.7092697620391846 }, { "auxiliary_loss_clip": 0.00998126, "auxiliary_loss_mlp": 0.01000341, "balance_loss_clip": 1.01353586, "balance_loss_mlp": 0.99922049, "epoch": 0.7199158274462648, "flos": 70710839602560.0, "grad_norm": 0.8513128948563679, "language_loss": 0.5708431, "learning_rate": 7.6786563266296e-07, "loss": 0.5908277, "num_input_tokens_seen": 258404035, "step": 11974, "time_per_iteration": 3.085857391357422 }, { "auxiliary_loss_clip": 0.01087282, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.03747392, "balance_loss_mlp": 1.02043021, "epoch": 0.7199759506989328, "flos": 29347995352320.0, "grad_norm": 2.3096725812803225, "language_loss": 0.61059892, "learning_rate": 7.675588795479062e-07, "loss": 0.6318143, "num_input_tokens_seen": 258424850, "step": 11975, "time_per_iteration": 2.7332818508148193 }, { "auxiliary_loss_clip": 0.01100807, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.03652167, "balance_loss_mlp": 1.02041817, "epoch": 0.7200360739516007, "flos": 24640465680000.0, "grad_norm": 2.671508087455807, "language_loss": 0.67916059, "learning_rate": 7.672521731671425e-07, "loss": 0.7005074, "num_input_tokens_seen": 258445485, "step": 11976, "time_per_iteration": 2.6940202713012695 }, { "auxiliary_loss_clip": 0.0108397, "auxiliary_loss_mlp": 0.01030323, "balance_loss_clip": 1.03955865, "balance_loss_mlp": 1.0175494, "epoch": 0.7200961972042688, "flos": 20812855478400.0, "grad_norm": 1.8443077153848637, "language_loss": 0.67261469, "learning_rate": 7.669455135323004e-07, "loss": 0.69375765, "num_input_tokens_seen": 258464505, "step": 11977, "time_per_iteration": 2.6581647396087646 }, { "auxiliary_loss_clip": 0.01091707, "auxiliary_loss_mlp": 0.01036372, "balance_loss_clip": 1.03710294, "balance_loss_mlp": 1.02315187, "epoch": 0.7201563204569367, "flos": 31245910174080.0, "grad_norm": 1.5443170627433962, "language_loss": 0.75495118, "learning_rate": 7.666389006550074e-07, "loss": 0.776232, "num_input_tokens_seen": 258487190, "step": 11978, "time_per_iteration": 2.8164350986480713 }, { "auxiliary_loss_clip": 0.0111045, "auxiliary_loss_mlp": 0.01033615, "balance_loss_clip": 1.03798056, "balance_loss_mlp": 1.02009642, "epoch": 0.7202164437096047, "flos": 26651391667200.0, "grad_norm": 2.011628151794158, "language_loss": 0.78906727, "learning_rate": 7.663323345468908e-07, "loss": 0.81050789, "num_input_tokens_seen": 258503790, "step": 11979, "time_per_iteration": 2.603609323501587 }, { "auxiliary_loss_clip": 0.01100805, "auxiliary_loss_mlp": 0.01032476, "balance_loss_clip": 1.03782308, "balance_loss_mlp": 1.01863027, "epoch": 0.7202765669622727, "flos": 25959608657280.0, "grad_norm": 1.489458439869756, "language_loss": 0.64516908, "learning_rate": 7.660258152195767e-07, "loss": 0.66650194, "num_input_tokens_seen": 258527335, "step": 11980, "time_per_iteration": 2.6712260246276855 }, { "auxiliary_loss_clip": 0.01106474, "auxiliary_loss_mlp": 0.01037898, "balance_loss_clip": 1.04096806, "balance_loss_mlp": 1.02322936, "epoch": 0.7203366902149406, "flos": 28512354372480.0, "grad_norm": 3.283132344520263, "language_loss": 0.67034644, "learning_rate": 7.657193426846871e-07, "loss": 0.69179016, "num_input_tokens_seen": 258546690, "step": 11981, "time_per_iteration": 4.248534202575684 }, { "auxiliary_loss_clip": 0.01080413, "auxiliary_loss_mlp": 0.01035174, "balance_loss_clip": 1.03540182, "balance_loss_mlp": 1.02077293, "epoch": 0.7203968134676086, "flos": 21106030285440.0, "grad_norm": 1.9200957279106055, "language_loss": 0.74228042, "learning_rate": 7.65412916953843e-07, "loss": 0.76343632, "num_input_tokens_seen": 258566340, "step": 11982, "time_per_iteration": 2.6612656116485596 }, { "auxiliary_loss_clip": 0.01082612, "auxiliary_loss_mlp": 0.00771666, "balance_loss_clip": 1.03610659, "balance_loss_mlp": 1.00010824, "epoch": 0.7204569367202766, "flos": 18332146488960.0, "grad_norm": 1.9444187102114145, "language_loss": 0.65890288, "learning_rate": 7.65106538038665e-07, "loss": 0.67744565, "num_input_tokens_seen": 258584455, "step": 11983, "time_per_iteration": 5.959589004516602 }, { "auxiliary_loss_clip": 0.01084437, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.04208398, "balance_loss_mlp": 1.02224469, "epoch": 0.7205170599729446, "flos": 23255103980160.0, "grad_norm": 1.5232420204646802, "language_loss": 0.66515326, "learning_rate": 7.648002059507715e-07, "loss": 0.68635398, "num_input_tokens_seen": 258604725, "step": 11984, "time_per_iteration": 2.6606063842773438 }, { "auxiliary_loss_clip": 0.01102672, "auxiliary_loss_mlp": 0.01035615, "balance_loss_clip": 1.03870726, "balance_loss_mlp": 1.02119064, "epoch": 0.7205771832256125, "flos": 20120892900480.0, "grad_norm": 1.688320312491579, "language_loss": 0.74081761, "learning_rate": 7.644939207017771e-07, "loss": 0.76220047, "num_input_tokens_seen": 258622885, "step": 11985, "time_per_iteration": 2.6758813858032227 }, { "auxiliary_loss_clip": 0.01100706, "auxiliary_loss_mlp": 0.01032026, "balance_loss_clip": 1.03882444, "balance_loss_mlp": 1.01896691, "epoch": 0.7206373064782805, "flos": 27703250565120.0, "grad_norm": 2.1824579845147287, "language_loss": 0.62681192, "learning_rate": 7.641876823032977e-07, "loss": 0.64813924, "num_input_tokens_seen": 258644305, "step": 11986, "time_per_iteration": 2.6787214279174805 }, { "auxiliary_loss_clip": 0.01094506, "auxiliary_loss_mlp": 0.01035959, "balance_loss_clip": 1.0400337, "balance_loss_mlp": 1.02129614, "epoch": 0.7206974297309484, "flos": 17968156018560.0, "grad_norm": 1.6774381581209574, "language_loss": 0.72387213, "learning_rate": 7.638814907669455e-07, "loss": 0.74517679, "num_input_tokens_seen": 258661775, "step": 11987, "time_per_iteration": 2.6494300365448 }, { "auxiliary_loss_clip": 0.01091554, "auxiliary_loss_mlp": 0.01036778, "balance_loss_clip": 1.03807402, "balance_loss_mlp": 1.0230689, "epoch": 0.7207575529836164, "flos": 16983162288000.0, "grad_norm": 2.0154158747708886, "language_loss": 0.78542352, "learning_rate": 7.635753461043301e-07, "loss": 0.80670691, "num_input_tokens_seen": 258679830, "step": 11988, "time_per_iteration": 2.7818825244903564 }, { "auxiliary_loss_clip": 0.01112006, "auxiliary_loss_mlp": 0.0103674, "balance_loss_clip": 1.03854907, "balance_loss_mlp": 1.02319229, "epoch": 0.7208176762362843, "flos": 18727594295040.0, "grad_norm": 2.5683487455576013, "language_loss": 0.78912222, "learning_rate": 7.632692483270618e-07, "loss": 0.8106097, "num_input_tokens_seen": 258697415, "step": 11989, "time_per_iteration": 4.105331659317017 }, { "auxiliary_loss_clip": 0.01110244, "auxiliary_loss_mlp": 0.01035931, "balance_loss_clip": 1.03845143, "balance_loss_mlp": 1.02281189, "epoch": 0.7208777994889524, "flos": 18734489706240.0, "grad_norm": 1.667538370245498, "language_loss": 0.8218925, "learning_rate": 7.629631974467481e-07, "loss": 0.84335428, "num_input_tokens_seen": 258716755, "step": 11990, "time_per_iteration": 2.59250545501709 }, { "auxiliary_loss_clip": 0.01084798, "auxiliary_loss_mlp": 0.01039501, "balance_loss_clip": 1.03765297, "balance_loss_mlp": 1.0263406, "epoch": 0.7209379227416203, "flos": 14793437376000.0, "grad_norm": 2.0017944237848146, "language_loss": 0.76018798, "learning_rate": 7.626571934749931e-07, "loss": 0.78143102, "num_input_tokens_seen": 258733270, "step": 11991, "time_per_iteration": 2.6581742763519287 }, { "auxiliary_loss_clip": 0.01069068, "auxiliary_loss_mlp": 0.01036802, "balance_loss_clip": 1.03637481, "balance_loss_mlp": 1.02277708, "epoch": 0.7209980459942883, "flos": 29636860527360.0, "grad_norm": 1.4417836781723634, "language_loss": 0.7278806, "learning_rate": 7.623512364234022e-07, "loss": 0.74893934, "num_input_tokens_seen": 258755270, "step": 11992, "time_per_iteration": 2.762066602706909 }, { "auxiliary_loss_clip": 0.01101853, "auxiliary_loss_mlp": 0.01035181, "balance_loss_clip": 1.03684831, "balance_loss_mlp": 1.0217396, "epoch": 0.7210581692469563, "flos": 23477175815040.0, "grad_norm": 1.590664380995942, "language_loss": 0.66213107, "learning_rate": 7.620453263035755e-07, "loss": 0.68350136, "num_input_tokens_seen": 258775340, "step": 11993, "time_per_iteration": 2.669746160507202 }, { "auxiliary_loss_clip": 0.01103083, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.03803623, "balance_loss_mlp": 1.02193534, "epoch": 0.7211182924996242, "flos": 26099839353600.0, "grad_norm": 3.884072112544962, "language_loss": 0.65876019, "learning_rate": 7.61739463127115e-07, "loss": 0.68013895, "num_input_tokens_seen": 258794580, "step": 11994, "time_per_iteration": 2.6249778270721436 }, { "auxiliary_loss_clip": 0.01103021, "auxiliary_loss_mlp": 0.01036805, "balance_loss_clip": 1.03799295, "balance_loss_mlp": 1.02208841, "epoch": 0.7211784157522922, "flos": 17712076982400.0, "grad_norm": 2.8589170011893006, "language_loss": 0.67324853, "learning_rate": 7.614336469056172e-07, "loss": 0.69464678, "num_input_tokens_seen": 258812330, "step": 11995, "time_per_iteration": 2.5577452182769775 }, { "auxiliary_loss_clip": 0.01084316, "auxiliary_loss_mlp": 0.01033821, "balance_loss_clip": 1.03543901, "balance_loss_mlp": 1.01986206, "epoch": 0.7212385390049602, "flos": 24423637230720.0, "grad_norm": 2.2331481184505537, "language_loss": 0.79888833, "learning_rate": 7.6112787765068e-07, "loss": 0.82006973, "num_input_tokens_seen": 258831770, "step": 11996, "time_per_iteration": 2.6798765659332275 }, { "auxiliary_loss_clip": 0.01112754, "auxiliary_loss_mlp": 0.01038784, "balance_loss_clip": 1.03908491, "balance_loss_mlp": 1.02556992, "epoch": 0.7212986622576282, "flos": 28147250580480.0, "grad_norm": 1.9315796052948224, "language_loss": 0.81023175, "learning_rate": 7.60822155373899e-07, "loss": 0.83174717, "num_input_tokens_seen": 258849090, "step": 11997, "time_per_iteration": 2.656759023666382 }, { "auxiliary_loss_clip": 0.01114647, "auxiliary_loss_mlp": 0.0103518, "balance_loss_clip": 1.03915894, "balance_loss_mlp": 1.02126861, "epoch": 0.7213587855102961, "flos": 21835770992640.0, "grad_norm": 1.8930751745760046, "language_loss": 0.67190164, "learning_rate": 7.605164800868646e-07, "loss": 0.69339991, "num_input_tokens_seen": 258868230, "step": 11998, "time_per_iteration": 2.6269752979278564 }, { "auxiliary_loss_clip": 0.01113247, "auxiliary_loss_mlp": 0.01032004, "balance_loss_clip": 1.0402633, "balance_loss_mlp": 1.01999927, "epoch": 0.7214189087629641, "flos": 14611549881600.0, "grad_norm": 2.2123816168992287, "language_loss": 0.72197175, "learning_rate": 7.602108518011696e-07, "loss": 0.74342418, "num_input_tokens_seen": 258885525, "step": 11999, "time_per_iteration": 2.7030436992645264 }, { "auxiliary_loss_clip": 0.01095225, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.03975248, "balance_loss_mlp": 1.01632595, "epoch": 0.721479032015632, "flos": 19390864884480.0, "grad_norm": 2.1896556782870986, "language_loss": 0.82891619, "learning_rate": 7.599052705284039e-07, "loss": 0.85016865, "num_input_tokens_seen": 258903245, "step": 12000, "time_per_iteration": 2.72419810295105 }, { "auxiliary_loss_clip": 0.0110488, "auxiliary_loss_mlp": 0.01036877, "balance_loss_clip": 1.04077649, "balance_loss_mlp": 1.02337074, "epoch": 0.7215391552683, "flos": 18512884748160.0, "grad_norm": 2.238210081957985, "language_loss": 0.77015889, "learning_rate": 7.59599736280154e-07, "loss": 0.79157639, "num_input_tokens_seen": 258921245, "step": 12001, "time_per_iteration": 2.6786983013153076 }, { "auxiliary_loss_clip": 0.01096613, "auxiliary_loss_mlp": 0.01041613, "balance_loss_clip": 1.03922153, "balance_loss_mlp": 1.02826142, "epoch": 0.721599278520968, "flos": 23258731253760.0, "grad_norm": 1.7647688561278618, "language_loss": 0.81434, "learning_rate": 7.592942490680066e-07, "loss": 0.83572221, "num_input_tokens_seen": 258939425, "step": 12002, "time_per_iteration": 2.766787052154541 }, { "auxiliary_loss_clip": 0.01103657, "auxiliary_loss_mlp": 0.0102914, "balance_loss_clip": 1.03956521, "balance_loss_mlp": 1.01506686, "epoch": 0.721659401773636, "flos": 39199045979520.0, "grad_norm": 1.90156490746599, "language_loss": 0.62442046, "learning_rate": 7.589888089035462e-07, "loss": 0.64574844, "num_input_tokens_seen": 258960710, "step": 12003, "time_per_iteration": 2.7572412490844727 }, { "auxiliary_loss_clip": 0.01114647, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.0397718, "balance_loss_mlp": 1.019418, "epoch": 0.7217195250263039, "flos": 14939917038720.0, "grad_norm": 2.6609118210523146, "language_loss": 0.6843828, "learning_rate": 7.586834157983544e-07, "loss": 0.70586002, "num_input_tokens_seen": 258978475, "step": 12004, "time_per_iteration": 2.553619623184204 }, { "auxiliary_loss_clip": 0.01013578, "auxiliary_loss_mlp": 0.01003303, "balance_loss_clip": 1.01591694, "balance_loss_mlp": 1.0020926, "epoch": 0.7217796482789719, "flos": 70869206666880.0, "grad_norm": 0.858251890961465, "language_loss": 0.54091179, "learning_rate": 7.583780697640112e-07, "loss": 0.56108057, "num_input_tokens_seen": 259037520, "step": 12005, "time_per_iteration": 3.186676502227783 }, { "auxiliary_loss_clip": 0.0107998, "auxiliary_loss_mlp": 0.01033092, "balance_loss_clip": 1.03859079, "balance_loss_mlp": 1.0192821, "epoch": 0.7218397715316398, "flos": 37451525402880.0, "grad_norm": 1.66711169237072, "language_loss": 0.63384253, "learning_rate": 7.580727708120962e-07, "loss": 0.65497327, "num_input_tokens_seen": 259061325, "step": 12006, "time_per_iteration": 2.8096885681152344 }, { "auxiliary_loss_clip": 0.01084341, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.03541422, "balance_loss_mlp": 1.02141964, "epoch": 0.7218998947843078, "flos": 22710662559360.0, "grad_norm": 1.8415091001444905, "language_loss": 0.91831303, "learning_rate": 7.577675189541865e-07, "loss": 0.93950289, "num_input_tokens_seen": 259078135, "step": 12007, "time_per_iteration": 2.636061668395996 }, { "auxiliary_loss_clip": 0.01074819, "auxiliary_loss_mlp": 0.01038291, "balance_loss_clip": 1.03386235, "balance_loss_mlp": 1.02249599, "epoch": 0.7219600180369758, "flos": 12167182477440.0, "grad_norm": 1.9560042300828953, "language_loss": 0.64139968, "learning_rate": 7.574623142018568e-07, "loss": 0.66253078, "num_input_tokens_seen": 259095910, "step": 12008, "time_per_iteration": 2.6658670902252197 }, { "auxiliary_loss_clip": 0.0110234, "auxiliary_loss_mlp": 0.0103902, "balance_loss_clip": 1.03860354, "balance_loss_mlp": 1.02491176, "epoch": 0.7220201412896438, "flos": 22596573985920.0, "grad_norm": 1.9949931171952824, "language_loss": 0.78768408, "learning_rate": 7.57157156566681e-07, "loss": 0.80909771, "num_input_tokens_seen": 259114225, "step": 12009, "time_per_iteration": 2.6496176719665527 }, { "auxiliary_loss_clip": 0.01103715, "auxiliary_loss_mlp": 0.01040084, "balance_loss_clip": 1.04009509, "balance_loss_mlp": 1.02490854, "epoch": 0.7220802645423118, "flos": 26718651884160.0, "grad_norm": 1.8397913257632763, "language_loss": 0.64088428, "learning_rate": 7.568520460602297e-07, "loss": 0.66232234, "num_input_tokens_seen": 259134660, "step": 12010, "time_per_iteration": 2.7039434909820557 }, { "auxiliary_loss_clip": 0.01112341, "auxiliary_loss_mlp": 0.01028267, "balance_loss_clip": 1.0384059, "balance_loss_mlp": 1.01517224, "epoch": 0.7221403877949797, "flos": 24420548661120.0, "grad_norm": 2.031062192481546, "language_loss": 0.7745133, "learning_rate": 7.565469826940742e-07, "loss": 0.79591942, "num_input_tokens_seen": 259153300, "step": 12011, "time_per_iteration": 2.6566684246063232 }, { "auxiliary_loss_clip": 0.01095954, "auxiliary_loss_mlp": 0.01036039, "balance_loss_clip": 1.03788853, "balance_loss_mlp": 1.02336133, "epoch": 0.7222005110476477, "flos": 23514379326720.0, "grad_norm": 2.0943143808042617, "language_loss": 0.78936207, "learning_rate": 7.56241966479781e-07, "loss": 0.81068206, "num_input_tokens_seen": 259172115, "step": 12012, "time_per_iteration": 2.6651875972747803 }, { "auxiliary_loss_clip": 0.0109279, "auxiliary_loss_mlp": 0.01031271, "balance_loss_clip": 1.03982329, "balance_loss_mlp": 1.01809883, "epoch": 0.7222606343003156, "flos": 23112538899840.0, "grad_norm": 1.7259096547472548, "language_loss": 0.75816202, "learning_rate": 7.559369974289171e-07, "loss": 0.77940267, "num_input_tokens_seen": 259191345, "step": 12013, "time_per_iteration": 2.6666300296783447 }, { "auxiliary_loss_clip": 0.01112282, "auxiliary_loss_mlp": 0.01027778, "balance_loss_clip": 1.03951406, "balance_loss_mlp": 1.01493895, "epoch": 0.7223207575529836, "flos": 24351169541760.0, "grad_norm": 1.5900887482073394, "language_loss": 0.76009625, "learning_rate": 7.556320755530484e-07, "loss": 0.78149676, "num_input_tokens_seen": 259211700, "step": 12014, "time_per_iteration": 2.8077309131622314 }, { "auxiliary_loss_clip": 0.01103939, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.03792763, "balance_loss_mlp": 1.01870835, "epoch": 0.7223808808056515, "flos": 28330179569280.0, "grad_norm": 1.5772479389327612, "language_loss": 0.86851835, "learning_rate": 7.553272008637346e-07, "loss": 0.88987738, "num_input_tokens_seen": 259233825, "step": 12015, "time_per_iteration": 2.658083915710449 }, { "auxiliary_loss_clip": 0.01099282, "auxiliary_loss_mlp": 0.01033999, "balance_loss_clip": 1.0388813, "balance_loss_mlp": 1.02105308, "epoch": 0.7224410040583196, "flos": 21069437304960.0, "grad_norm": 1.834690814791336, "language_loss": 0.7801137, "learning_rate": 7.55022373372538e-07, "loss": 0.80144656, "num_input_tokens_seen": 259253055, "step": 12016, "time_per_iteration": 2.623483180999756 }, { "auxiliary_loss_clip": 0.01067391, "auxiliary_loss_mlp": 0.0105171, "balance_loss_clip": 1.03403831, "balance_loss_mlp": 1.03612971, "epoch": 0.7225011273109875, "flos": 26795429205120.0, "grad_norm": 1.3753282936745013, "language_loss": 0.77807558, "learning_rate": 7.547175930910186e-07, "loss": 0.79926664, "num_input_tokens_seen": 259273420, "step": 12017, "time_per_iteration": 2.7652459144592285 }, { "auxiliary_loss_clip": 0.01109706, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.03881669, "balance_loss_mlp": 1.01943493, "epoch": 0.7225612505636555, "flos": 23583578878080.0, "grad_norm": 1.9142448581528158, "language_loss": 0.73780286, "learning_rate": 7.54412860030732e-07, "loss": 0.75921661, "num_input_tokens_seen": 259291000, "step": 12018, "time_per_iteration": 2.640007495880127 }, { "auxiliary_loss_clip": 0.01084854, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.04522383, "balance_loss_mlp": 1.02281451, "epoch": 0.7226213738163234, "flos": 20777627214720.0, "grad_norm": 4.152096025533445, "language_loss": 0.77579439, "learning_rate": 7.541081742032347e-07, "loss": 0.79699075, "num_input_tokens_seen": 259312390, "step": 12019, "time_per_iteration": 2.6887192726135254 }, { "auxiliary_loss_clip": 0.01087897, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.03979766, "balance_loss_mlp": 1.01615798, "epoch": 0.7226814970689914, "flos": 32635832901120.0, "grad_norm": 1.8249624922907017, "language_loss": 0.73749167, "learning_rate": 7.53803535620081e-07, "loss": 0.75866961, "num_input_tokens_seen": 259332645, "step": 12020, "time_per_iteration": 2.714838743209839 }, { "auxiliary_loss_clip": 0.01096548, "auxiliary_loss_mlp": 0.01033021, "balance_loss_clip": 1.03796768, "balance_loss_mlp": 1.0203011, "epoch": 0.7227416203216595, "flos": 22454368041600.0, "grad_norm": 1.8291980950612234, "language_loss": 0.77410042, "learning_rate": 7.534989442928219e-07, "loss": 0.79539609, "num_input_tokens_seen": 259353810, "step": 12021, "time_per_iteration": 4.313388347625732 }, { "auxiliary_loss_clip": 0.01074387, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.03570378, "balance_loss_mlp": 1.02155018, "epoch": 0.7228017435743274, "flos": 21652303299840.0, "grad_norm": 1.8872518659613802, "language_loss": 0.68324184, "learning_rate": 7.531944002330073e-07, "loss": 0.70433629, "num_input_tokens_seen": 259372460, "step": 12022, "time_per_iteration": 2.7648468017578125 }, { "auxiliary_loss_clip": 0.01102722, "auxiliary_loss_mlp": 0.0103106, "balance_loss_clip": 1.03769839, "balance_loss_mlp": 1.01741076, "epoch": 0.7228618668269954, "flos": 29533474206720.0, "grad_norm": 1.7890580535020497, "language_loss": 0.69560903, "learning_rate": 7.528899034521858e-07, "loss": 0.71694684, "num_input_tokens_seen": 259393275, "step": 12023, "time_per_iteration": 5.942451000213623 }, { "auxiliary_loss_clip": 0.01082247, "auxiliary_loss_mlp": 0.01030033, "balance_loss_clip": 1.03305829, "balance_loss_mlp": 1.0162704, "epoch": 0.7229219900796633, "flos": 27453815544960.0, "grad_norm": 1.630981256405689, "language_loss": 0.71236169, "learning_rate": 7.525854539619052e-07, "loss": 0.73348451, "num_input_tokens_seen": 259416205, "step": 12024, "time_per_iteration": 2.673879861831665 }, { "auxiliary_loss_clip": 0.01079579, "auxiliary_loss_mlp": 0.01035111, "balance_loss_clip": 1.0382725, "balance_loss_mlp": 1.02249229, "epoch": 0.7229821133323313, "flos": 16289368116480.0, "grad_norm": 2.2051730456809655, "language_loss": 0.75628078, "learning_rate": 7.522810517737089e-07, "loss": 0.77742761, "num_input_tokens_seen": 259433115, "step": 12025, "time_per_iteration": 2.7355802059173584 }, { "auxiliary_loss_clip": 0.01099666, "auxiliary_loss_mlp": 0.01030116, "balance_loss_clip": 1.03707576, "balance_loss_mlp": 1.01740193, "epoch": 0.7230422365849992, "flos": 20412343854720.0, "grad_norm": 2.068852797373043, "language_loss": 0.76397157, "learning_rate": 7.519766968991395e-07, "loss": 0.78526938, "num_input_tokens_seen": 259450475, "step": 12026, "time_per_iteration": 2.6082088947296143 }, { "auxiliary_loss_clip": 0.01102144, "auxiliary_loss_mlp": 0.01042375, "balance_loss_clip": 1.0383482, "balance_loss_mlp": 1.02952373, "epoch": 0.7231023598376672, "flos": 25593499284480.0, "grad_norm": 1.9477752433448912, "language_loss": 0.6773926, "learning_rate": 7.516723893497388e-07, "loss": 0.69883776, "num_input_tokens_seen": 259469355, "step": 12027, "time_per_iteration": 2.6620283126831055 }, { "auxiliary_loss_clip": 0.01062411, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.0401032, "balance_loss_mlp": 1.0175457, "epoch": 0.7231624830903352, "flos": 25149607009920.0, "grad_norm": 2.2693920109033403, "language_loss": 0.79310131, "learning_rate": 7.513681291370469e-07, "loss": 0.81403315, "num_input_tokens_seen": 259486565, "step": 12028, "time_per_iteration": 4.312790870666504 }, { "auxiliary_loss_clip": 0.01071831, "auxiliary_loss_mlp": 0.01030546, "balance_loss_clip": 1.03564012, "balance_loss_mlp": 1.01683056, "epoch": 0.7232226063430032, "flos": 21725740656000.0, "grad_norm": 1.7649088716190047, "language_loss": 0.8226198, "learning_rate": 7.510639162726e-07, "loss": 0.84364355, "num_input_tokens_seen": 259505070, "step": 12029, "time_per_iteration": 2.6882169246673584 }, { "auxiliary_loss_clip": 0.01012512, "auxiliary_loss_mlp": 0.01001695, "balance_loss_clip": 1.01107883, "balance_loss_mlp": 1.00054455, "epoch": 0.7232827295956711, "flos": 68436798491520.0, "grad_norm": 0.8099058839034723, "language_loss": 0.61733758, "learning_rate": 7.507597507679347e-07, "loss": 0.63747966, "num_input_tokens_seen": 259569135, "step": 12030, "time_per_iteration": 3.252488136291504 }, { "auxiliary_loss_clip": 0.01094272, "auxiliary_loss_mlp": 0.01037532, "balance_loss_clip": 1.03575993, "balance_loss_mlp": 1.02277446, "epoch": 0.7233428528483391, "flos": 20192642317440.0, "grad_norm": 1.6655467134622794, "language_loss": 0.77807963, "learning_rate": 7.504556326345859e-07, "loss": 0.79939759, "num_input_tokens_seen": 259587035, "step": 12031, "time_per_iteration": 2.6133508682250977 }, { "auxiliary_loss_clip": 0.01102197, "auxiliary_loss_mlp": 0.01030343, "balance_loss_clip": 1.0374577, "balance_loss_mlp": 1.01696777, "epoch": 0.723402976101007, "flos": 23949472769280.0, "grad_norm": 1.9738785195921462, "language_loss": 0.81575108, "learning_rate": 7.501515618840834e-07, "loss": 0.83707643, "num_input_tokens_seen": 259606140, "step": 12032, "time_per_iteration": 2.7112133502960205 }, { "auxiliary_loss_clip": 0.01075376, "auxiliary_loss_mlp": 0.01037925, "balance_loss_clip": 1.03567076, "balance_loss_mlp": 1.02435255, "epoch": 0.723463099353675, "flos": 20813394182400.0, "grad_norm": 1.776312475495692, "language_loss": 0.75339031, "learning_rate": 7.498475385279592e-07, "loss": 0.77452338, "num_input_tokens_seen": 259624275, "step": 12033, "time_per_iteration": 2.718799114227295 }, { "auxiliary_loss_clip": 0.01077923, "auxiliary_loss_mlp": 0.01029541, "balance_loss_clip": 1.03677177, "balance_loss_mlp": 1.01704192, "epoch": 0.723523222606343, "flos": 19098013299840.0, "grad_norm": 1.7129862588080287, "language_loss": 0.75157291, "learning_rate": 7.495435625777423e-07, "loss": 0.7726475, "num_input_tokens_seen": 259643465, "step": 12034, "time_per_iteration": 2.6831793785095215 }, { "auxiliary_loss_clip": 0.01089243, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.03759241, "balance_loss_mlp": 1.01996899, "epoch": 0.723583345859011, "flos": 26506994993280.0, "grad_norm": 1.842898991016843, "language_loss": 0.80809641, "learning_rate": 7.492396340449578e-07, "loss": 0.82931113, "num_input_tokens_seen": 259662500, "step": 12035, "time_per_iteration": 2.695371627807617 }, { "auxiliary_loss_clip": 0.01050925, "auxiliary_loss_mlp": 0.01037786, "balance_loss_clip": 1.03530586, "balance_loss_mlp": 1.0243392, "epoch": 0.723643469111679, "flos": 16033863697920.0, "grad_norm": 2.241481195422046, "language_loss": 0.61241198, "learning_rate": 7.489357529411326e-07, "loss": 0.63329911, "num_input_tokens_seen": 259680140, "step": 12036, "time_per_iteration": 2.809441566467285 }, { "auxiliary_loss_clip": 0.01095223, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.03603697, "balance_loss_mlp": 1.02554715, "epoch": 0.7237035923643469, "flos": 21945549934080.0, "grad_norm": 1.6262385259954, "language_loss": 0.67594683, "learning_rate": 7.486319192777883e-07, "loss": 0.69727111, "num_input_tokens_seen": 259700160, "step": 12037, "time_per_iteration": 2.7354328632354736 }, { "auxiliary_loss_clip": 0.0111287, "auxiliary_loss_mlp": 0.01037592, "balance_loss_clip": 1.03997326, "balance_loss_mlp": 1.02422309, "epoch": 0.7237637156170149, "flos": 23583112001280.0, "grad_norm": 2.066772048559837, "language_loss": 0.72353923, "learning_rate": 7.483281330664479e-07, "loss": 0.74504387, "num_input_tokens_seen": 259720525, "step": 12038, "time_per_iteration": 2.704622983932495 }, { "auxiliary_loss_clip": 0.01111581, "auxiliary_loss_mlp": 0.01034396, "balance_loss_clip": 1.0390476, "balance_loss_mlp": 1.02059746, "epoch": 0.7238238388696828, "flos": 20594698225920.0, "grad_norm": 1.734011651040034, "language_loss": 0.72293609, "learning_rate": 7.480243943186293e-07, "loss": 0.74439585, "num_input_tokens_seen": 259738680, "step": 12039, "time_per_iteration": 2.6200029850006104 }, { "auxiliary_loss_clip": 0.01112988, "auxiliary_loss_mlp": 0.01033651, "balance_loss_clip": 1.03924608, "balance_loss_mlp": 1.02135432, "epoch": 0.7238839621223508, "flos": 24207024263040.0, "grad_norm": 1.7505923285041294, "language_loss": 0.76183081, "learning_rate": 7.477207030458513e-07, "loss": 0.78329718, "num_input_tokens_seen": 259758790, "step": 12040, "time_per_iteration": 2.560269832611084 }, { "auxiliary_loss_clip": 0.01079576, "auxiliary_loss_mlp": 0.01035383, "balance_loss_clip": 1.03573811, "balance_loss_mlp": 1.0221684, "epoch": 0.7239440853750188, "flos": 14209745368320.0, "grad_norm": 2.0682435916617075, "language_loss": 0.7625649, "learning_rate": 7.474170592596301e-07, "loss": 0.78371453, "num_input_tokens_seen": 259777370, "step": 12041, "time_per_iteration": 2.714940309524536 }, { "auxiliary_loss_clip": 0.01102621, "auxiliary_loss_mlp": 0.01029545, "balance_loss_clip": 1.0374378, "balance_loss_mlp": 1.01699817, "epoch": 0.7240042086276868, "flos": 21614812479360.0, "grad_norm": 2.6117170122590636, "language_loss": 0.63805127, "learning_rate": 7.471134629714797e-07, "loss": 0.65937293, "num_input_tokens_seen": 259794665, "step": 12042, "time_per_iteration": 2.6314237117767334 }, { "auxiliary_loss_clip": 0.01075777, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.03741169, "balance_loss_mlp": 1.02075338, "epoch": 0.7240643318803547, "flos": 23331450337920.0, "grad_norm": 1.8128616053031077, "language_loss": 0.83376384, "learning_rate": 7.468099141929116e-07, "loss": 0.85486257, "num_input_tokens_seen": 259811110, "step": 12043, "time_per_iteration": 2.676255226135254 }, { "auxiliary_loss_clip": 0.01079486, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.03760707, "balance_loss_mlp": 1.01697433, "epoch": 0.7241244551330227, "flos": 24024849459840.0, "grad_norm": 1.7443833351104767, "language_loss": 0.64167023, "learning_rate": 7.465064129354379e-07, "loss": 0.66276932, "num_input_tokens_seen": 259831080, "step": 12044, "time_per_iteration": 2.7761828899383545 }, { "auxiliary_loss_clip": 0.0111317, "auxiliary_loss_mlp": 0.01032715, "balance_loss_clip": 1.04010856, "balance_loss_mlp": 1.01904798, "epoch": 0.7241845783856906, "flos": 18730323728640.0, "grad_norm": 1.9383242043113957, "language_loss": 0.81468868, "learning_rate": 7.462029592105658e-07, "loss": 0.83614755, "num_input_tokens_seen": 259850135, "step": 12045, "time_per_iteration": 2.5996835231781006 }, { "auxiliary_loss_clip": 0.01108154, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.03746927, "balance_loss_mlp": 1.01956022, "epoch": 0.7242447016383586, "flos": 19498668577920.0, "grad_norm": 1.5954644621567537, "language_loss": 0.71763444, "learning_rate": 7.458995530298034e-07, "loss": 0.73903888, "num_input_tokens_seen": 259868185, "step": 12046, "time_per_iteration": 2.5615580081939697 }, { "auxiliary_loss_clip": 0.01075175, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.03313971, "balance_loss_mlp": 1.01897645, "epoch": 0.7243048248910267, "flos": 22163491704960.0, "grad_norm": 2.0910154490498125, "language_loss": 0.71177173, "learning_rate": 7.455961944046553e-07, "loss": 0.73285818, "num_input_tokens_seen": 259887055, "step": 12047, "time_per_iteration": 2.700878381729126 }, { "auxiliary_loss_clip": 0.01086391, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.03794575, "balance_loss_mlp": 1.02800667, "epoch": 0.7243649481436946, "flos": 27672762896640.0, "grad_norm": 1.5839782384796177, "language_loss": 0.70204568, "learning_rate": 7.45292883346627e-07, "loss": 0.72332752, "num_input_tokens_seen": 259908295, "step": 12048, "time_per_iteration": 2.690060615539551 }, { "auxiliary_loss_clip": 0.01011684, "auxiliary_loss_mlp": 0.01004259, "balance_loss_clip": 1.00705278, "balance_loss_mlp": 1.0028162, "epoch": 0.7244250713963626, "flos": 63244545759360.0, "grad_norm": 0.8298796504425336, "language_loss": 0.53679693, "learning_rate": 7.449896198672168e-07, "loss": 0.55695641, "num_input_tokens_seen": 259968475, "step": 12049, "time_per_iteration": 3.2119057178497314 }, { "auxiliary_loss_clip": 0.01088982, "auxiliary_loss_mlp": 0.01032865, "balance_loss_clip": 1.03676033, "balance_loss_mlp": 1.01766598, "epoch": 0.7244851946490305, "flos": 17967114524160.0, "grad_norm": 2.0687483221381897, "language_loss": 0.59396434, "learning_rate": 7.446864039779258e-07, "loss": 0.61518282, "num_input_tokens_seen": 259984865, "step": 12050, "time_per_iteration": 2.632354736328125 }, { "auxiliary_loss_clip": 0.0099629, "auxiliary_loss_mlp": 0.01011839, "balance_loss_clip": 1.0111258, "balance_loss_mlp": 1.01062906, "epoch": 0.7245453179016985, "flos": 70943649603840.0, "grad_norm": 0.7230865999860119, "language_loss": 0.53218287, "learning_rate": 7.443832356902528e-07, "loss": 0.55226415, "num_input_tokens_seen": 260046735, "step": 12051, "time_per_iteration": 3.2180604934692383 }, { "auxiliary_loss_clip": 0.01097618, "auxiliary_loss_mlp": 0.01032159, "balance_loss_clip": 1.03679287, "balance_loss_mlp": 1.02010143, "epoch": 0.7246054411543664, "flos": 24568464867840.0, "grad_norm": 1.7070120237628115, "language_loss": 0.72170782, "learning_rate": 7.440801150156927e-07, "loss": 0.74300563, "num_input_tokens_seen": 260067950, "step": 12052, "time_per_iteration": 2.6380202770233154 }, { "auxiliary_loss_clip": 0.01099407, "auxiliary_loss_mlp": 0.01034919, "balance_loss_clip": 1.03736925, "balance_loss_mlp": 1.01992285, "epoch": 0.7246655644070344, "flos": 32338312548480.0, "grad_norm": 1.8571757187229716, "language_loss": 0.74080825, "learning_rate": 7.437770419657415e-07, "loss": 0.76215148, "num_input_tokens_seen": 260087730, "step": 12053, "time_per_iteration": 2.691523790359497 }, { "auxiliary_loss_clip": 0.01072566, "auxiliary_loss_mlp": 0.0103532, "balance_loss_clip": 1.03622317, "balance_loss_mlp": 1.02119958, "epoch": 0.7247256876597024, "flos": 21872471713920.0, "grad_norm": 1.7294141781477532, "language_loss": 0.78110063, "learning_rate": 7.434740165518898e-07, "loss": 0.80217946, "num_input_tokens_seen": 260107760, "step": 12054, "time_per_iteration": 2.658952236175537 }, { "auxiliary_loss_clip": 0.01077648, "auxiliary_loss_mlp": 0.01035486, "balance_loss_clip": 1.03661764, "balance_loss_mlp": 1.02215791, "epoch": 0.7247858109123704, "flos": 16213093585920.0, "grad_norm": 2.4200013582642437, "language_loss": 0.67830694, "learning_rate": 7.431710387856301e-07, "loss": 0.69943827, "num_input_tokens_seen": 260123660, "step": 12055, "time_per_iteration": 2.646244525909424 }, { "auxiliary_loss_clip": 0.01080369, "auxiliary_loss_mlp": 0.01036731, "balance_loss_clip": 1.03789568, "balance_loss_mlp": 1.02451193, "epoch": 0.7248459341650383, "flos": 20850705434880.0, "grad_norm": 1.6702264045613682, "language_loss": 0.74097568, "learning_rate": 7.428681086784496e-07, "loss": 0.76214665, "num_input_tokens_seen": 260142690, "step": 12056, "time_per_iteration": 2.7628982067108154 }, { "auxiliary_loss_clip": 0.01108663, "auxiliary_loss_mlp": 0.010276, "balance_loss_clip": 1.03835511, "balance_loss_mlp": 1.01454699, "epoch": 0.7249060574177063, "flos": 25921794614400.0, "grad_norm": 1.66863868022831, "language_loss": 0.70870286, "learning_rate": 7.425652262418368e-07, "loss": 0.73006552, "num_input_tokens_seen": 260162590, "step": 12057, "time_per_iteration": 2.71063232421875 }, { "auxiliary_loss_clip": 0.01058179, "auxiliary_loss_mlp": 0.01044744, "balance_loss_clip": 1.03556621, "balance_loss_mlp": 1.03009939, "epoch": 0.7249661806703742, "flos": 17345536646400.0, "grad_norm": 1.8439836916669041, "language_loss": 0.6237672, "learning_rate": 7.42262391487277e-07, "loss": 0.64479643, "num_input_tokens_seen": 260181065, "step": 12058, "time_per_iteration": 2.8430051803588867 }, { "auxiliary_loss_clip": 0.01070122, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.03506172, "balance_loss_mlp": 1.01852131, "epoch": 0.7250263039230422, "flos": 19574153009280.0, "grad_norm": 1.8897334856820058, "language_loss": 0.74905157, "learning_rate": 7.419596044262535e-07, "loss": 0.77007163, "num_input_tokens_seen": 260200330, "step": 12059, "time_per_iteration": 2.832826614379883 }, { "auxiliary_loss_clip": 0.01098356, "auxiliary_loss_mlp": 0.01033461, "balance_loss_clip": 1.03746486, "balance_loss_mlp": 1.02145672, "epoch": 0.7250864271757103, "flos": 21976648133760.0, "grad_norm": 1.8419617438371911, "language_loss": 0.79300022, "learning_rate": 7.416568650702472e-07, "loss": 0.81431836, "num_input_tokens_seen": 260219975, "step": 12060, "time_per_iteration": 4.281320095062256 }, { "auxiliary_loss_clip": 0.01100606, "auxiliary_loss_mlp": 0.01026628, "balance_loss_clip": 1.03860307, "balance_loss_mlp": 1.01334846, "epoch": 0.7251465504283782, "flos": 25012608537600.0, "grad_norm": 1.7927785216248016, "language_loss": 0.76260906, "learning_rate": 7.413541734307393e-07, "loss": 0.78388143, "num_input_tokens_seen": 260242025, "step": 12061, "time_per_iteration": 2.748656749725342 }, { "auxiliary_loss_clip": 0.01108857, "auxiliary_loss_mlp": 0.00769754, "balance_loss_clip": 1.03873777, "balance_loss_mlp": 1.00011206, "epoch": 0.7252066736810462, "flos": 16690131135360.0, "grad_norm": 1.7879167066361221, "language_loss": 0.81589133, "learning_rate": 7.410515295192068e-07, "loss": 0.83467746, "num_input_tokens_seen": 260260015, "step": 12062, "time_per_iteration": 4.1720802783966064 }, { "auxiliary_loss_clip": 0.0106197, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.03478372, "balance_loss_mlp": 1.0198977, "epoch": 0.7252667969337141, "flos": 25703026830720.0, "grad_norm": 2.017910234455411, "language_loss": 0.69402146, "learning_rate": 7.407489333471262e-07, "loss": 0.71499324, "num_input_tokens_seen": 260278635, "step": 12063, "time_per_iteration": 4.450777769088745 }, { "auxiliary_loss_clip": 0.01076449, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.03741121, "balance_loss_mlp": 1.02178848, "epoch": 0.7253269201863821, "flos": 18259930195200.0, "grad_norm": 1.4900878050946833, "language_loss": 0.69918656, "learning_rate": 7.40446384925973e-07, "loss": 0.72029793, "num_input_tokens_seen": 260298510, "step": 12064, "time_per_iteration": 2.7114603519439697 }, { "auxiliary_loss_clip": 0.01091634, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.03896451, "balance_loss_mlp": 1.02210331, "epoch": 0.72538704343905, "flos": 20411805150720.0, "grad_norm": 1.7705588276559046, "language_loss": 0.90465009, "learning_rate": 7.401438842672192e-07, "loss": 0.92592084, "num_input_tokens_seen": 260317405, "step": 12065, "time_per_iteration": 2.723996877670288 }, { "auxiliary_loss_clip": 0.01020643, "auxiliary_loss_mlp": 0.01001515, "balance_loss_clip": 1.00699556, "balance_loss_mlp": 1.00026369, "epoch": 0.725447166691718, "flos": 70151209706880.0, "grad_norm": 0.6554583314348987, "language_loss": 0.56083691, "learning_rate": 7.398414313823349e-07, "loss": 0.58105844, "num_input_tokens_seen": 260388085, "step": 12066, "time_per_iteration": 3.332350254058838 }, { "auxiliary_loss_clip": 0.01062291, "auxiliary_loss_mlp": 0.01030469, "balance_loss_clip": 1.03549218, "balance_loss_mlp": 1.01799369, "epoch": 0.725507289944386, "flos": 27052334254080.0, "grad_norm": 1.7495752784177439, "language_loss": 0.76740146, "learning_rate": 7.395390262827897e-07, "loss": 0.78832901, "num_input_tokens_seen": 260406165, "step": 12067, "time_per_iteration": 2.815978765487671 }, { "auxiliary_loss_clip": 0.0101369, "auxiliary_loss_mlp": 0.01006237, "balance_loss_clip": 1.01036, "balance_loss_mlp": 1.0050863, "epoch": 0.725567413197054, "flos": 62921924778240.0, "grad_norm": 0.722755917983848, "language_loss": 0.56971467, "learning_rate": 7.392366689800515e-07, "loss": 0.58991396, "num_input_tokens_seen": 260461365, "step": 12068, "time_per_iteration": 4.744567394256592 }, { "auxiliary_loss_clip": 0.0099354, "auxiliary_loss_mlp": 0.01007822, "balance_loss_clip": 1.00846553, "balance_loss_mlp": 1.00654685, "epoch": 0.7256275364497219, "flos": 60295957188480.0, "grad_norm": 0.663737486882956, "language_loss": 0.55370045, "learning_rate": 7.389343594855848e-07, "loss": 0.57371408, "num_input_tokens_seen": 260523795, "step": 12069, "time_per_iteration": 3.275995969772339 }, { "auxiliary_loss_clip": 0.01077438, "auxiliary_loss_mlp": 0.01027102, "balance_loss_clip": 1.03855562, "balance_loss_mlp": 1.01507938, "epoch": 0.7256876597023899, "flos": 24498511130880.0, "grad_norm": 1.6562852272905184, "language_loss": 0.79984176, "learning_rate": 7.38632097810854e-07, "loss": 0.82088709, "num_input_tokens_seen": 260544765, "step": 12070, "time_per_iteration": 2.806398391723633 }, { "auxiliary_loss_clip": 0.01083416, "auxiliary_loss_mlp": 0.01036302, "balance_loss_clip": 1.03607607, "balance_loss_mlp": 1.02395165, "epoch": 0.7257477829550578, "flos": 24352749740160.0, "grad_norm": 1.8683198427961691, "language_loss": 0.71817708, "learning_rate": 7.383298839673197e-07, "loss": 0.73937428, "num_input_tokens_seen": 260564340, "step": 12071, "time_per_iteration": 2.7380881309509277 }, { "auxiliary_loss_clip": 0.01108781, "auxiliary_loss_mlp": 0.01039283, "balance_loss_clip": 1.03857553, "balance_loss_mlp": 1.02693939, "epoch": 0.7258079062077258, "flos": 17202217380480.0, "grad_norm": 2.1132155235444183, "language_loss": 0.70214903, "learning_rate": 7.380277179664436e-07, "loss": 0.72362965, "num_input_tokens_seen": 260582565, "step": 12072, "time_per_iteration": 2.639300584793091 }, { "auxiliary_loss_clip": 0.01075383, "auxiliary_loss_mlp": 0.01033628, "balance_loss_clip": 1.03398466, "balance_loss_mlp": 1.01966858, "epoch": 0.7258680294603939, "flos": 21580338401280.0, "grad_norm": 1.7211132025466964, "language_loss": 0.78522944, "learning_rate": 7.377255998196821e-07, "loss": 0.80631953, "num_input_tokens_seen": 260601700, "step": 12073, "time_per_iteration": 2.707505226135254 }, { "auxiliary_loss_clip": 0.01089188, "auxiliary_loss_mlp": 0.01031416, "balance_loss_clip": 1.03761029, "balance_loss_mlp": 1.0188278, "epoch": 0.7259281527130618, "flos": 34855399036800.0, "grad_norm": 1.5601813308837964, "language_loss": 0.70586532, "learning_rate": 7.374235295384923e-07, "loss": 0.72707134, "num_input_tokens_seen": 260623040, "step": 12074, "time_per_iteration": 2.7605321407318115 }, { "auxiliary_loss_clip": 0.01089374, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.03541577, "balance_loss_mlp": 1.01786137, "epoch": 0.7259882759657298, "flos": 25404644551680.0, "grad_norm": 1.7787306902519031, "language_loss": 0.74126077, "learning_rate": 7.371215071343302e-07, "loss": 0.76246876, "num_input_tokens_seen": 260642735, "step": 12075, "time_per_iteration": 2.809924840927124 }, { "auxiliary_loss_clip": 0.01102235, "auxiliary_loss_mlp": 0.01037145, "balance_loss_clip": 1.03854585, "balance_loss_mlp": 1.02345967, "epoch": 0.7260483992183977, "flos": 62953630531200.0, "grad_norm": 2.761502875821282, "language_loss": 0.63991046, "learning_rate": 7.368195326186458e-07, "loss": 0.6613043, "num_input_tokens_seen": 260669935, "step": 12076, "time_per_iteration": 3.073396921157837 }, { "auxiliary_loss_clip": 0.01073377, "auxiliary_loss_mlp": 0.01030745, "balance_loss_clip": 1.03426909, "balance_loss_mlp": 1.01711977, "epoch": 0.7261085224710657, "flos": 26467528924800.0, "grad_norm": 1.967529180708395, "language_loss": 0.78661555, "learning_rate": 7.365176060028912e-07, "loss": 0.80765676, "num_input_tokens_seen": 260689605, "step": 12077, "time_per_iteration": 2.748734712600708 }, { "auxiliary_loss_clip": 0.01030217, "auxiliary_loss_mlp": 0.00751512, "balance_loss_clip": 1.00731969, "balance_loss_mlp": 0.99968779, "epoch": 0.7261686457237336, "flos": 66772732187520.0, "grad_norm": 0.8834354289567558, "language_loss": 0.64973843, "learning_rate": 7.362157272985163e-07, "loss": 0.66755569, "num_input_tokens_seen": 260748265, "step": 12078, "time_per_iteration": 3.1502130031585693 }, { "auxiliary_loss_clip": 0.01023011, "auxiliary_loss_mlp": 0.01002876, "balance_loss_clip": 1.00983262, "balance_loss_mlp": 1.00162983, "epoch": 0.7262287689764017, "flos": 69999594399360.0, "grad_norm": 0.7148369654201937, "language_loss": 0.59227604, "learning_rate": 7.359138965169671e-07, "loss": 0.61253494, "num_input_tokens_seen": 260816715, "step": 12079, "time_per_iteration": 3.2680857181549072 }, { "auxiliary_loss_clip": 0.01064199, "auxiliary_loss_mlp": 0.01033019, "balance_loss_clip": 1.03485882, "balance_loss_mlp": 1.01984644, "epoch": 0.7262888922290696, "flos": 23805435231360.0, "grad_norm": 2.2028126662157383, "language_loss": 0.64762789, "learning_rate": 7.356121136696895e-07, "loss": 0.66860008, "num_input_tokens_seen": 260836765, "step": 12080, "time_per_iteration": 2.718738317489624 }, { "auxiliary_loss_clip": 0.01064639, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.03282523, "balance_loss_mlp": 1.01555538, "epoch": 0.7263490154817376, "flos": 19500320603520.0, "grad_norm": 2.4686396191281235, "language_loss": 0.69309068, "learning_rate": 7.35310378768128e-07, "loss": 0.71402919, "num_input_tokens_seen": 260854610, "step": 12081, "time_per_iteration": 2.869288444519043 }, { "auxiliary_loss_clip": 0.01114886, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.04031432, "balance_loss_mlp": 1.01794684, "epoch": 0.7264091387344055, "flos": 16286243633280.0, "grad_norm": 1.842145300936274, "language_loss": 0.81440926, "learning_rate": 7.350086918237237e-07, "loss": 0.83586842, "num_input_tokens_seen": 260871620, "step": 12082, "time_per_iteration": 2.558000087738037 }, { "auxiliary_loss_clip": 0.01104122, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.0367763, "balance_loss_mlp": 1.02259684, "epoch": 0.7264692619870735, "flos": 24352031468160.0, "grad_norm": 1.7329186007952004, "language_loss": 0.77324694, "learning_rate": 7.347070528479158e-07, "loss": 0.79466033, "num_input_tokens_seen": 260890490, "step": 12083, "time_per_iteration": 2.707674741744995 }, { "auxiliary_loss_clip": 0.01114141, "auxiliary_loss_mlp": 0.01032066, "balance_loss_clip": 1.04018736, "balance_loss_mlp": 1.01889968, "epoch": 0.7265293852397414, "flos": 25119478477440.0, "grad_norm": 1.8409046940193436, "language_loss": 0.73034543, "learning_rate": 7.344054618521433e-07, "loss": 0.75180745, "num_input_tokens_seen": 260909700, "step": 12084, "time_per_iteration": 2.656688928604126 }, { "auxiliary_loss_clip": 0.01114376, "auxiliary_loss_mlp": 0.01036848, "balance_loss_clip": 1.03960419, "balance_loss_mlp": 1.02362156, "epoch": 0.7265895084924094, "flos": 22638230784000.0, "grad_norm": 3.047460171891373, "language_loss": 0.7778368, "learning_rate": 7.34103918847843e-07, "loss": 0.79934901, "num_input_tokens_seen": 260929090, "step": 12085, "time_per_iteration": 2.645911693572998 }, { "auxiliary_loss_clip": 0.01099641, "auxiliary_loss_mlp": 0.01034439, "balance_loss_clip": 1.03661323, "balance_loss_mlp": 1.02154636, "epoch": 0.7266496317450775, "flos": 23368222886400.0, "grad_norm": 1.5977221637963412, "language_loss": 0.72068805, "learning_rate": 7.338024238464493e-07, "loss": 0.74202883, "num_input_tokens_seen": 260946615, "step": 12086, "time_per_iteration": 2.6855533123016357 }, { "auxiliary_loss_clip": 0.01073096, "auxiliary_loss_mlp": 0.01041748, "balance_loss_clip": 1.03401077, "balance_loss_mlp": 1.02729964, "epoch": 0.7267097549977454, "flos": 28074603323520.0, "grad_norm": 1.6297510133590103, "language_loss": 0.6963405, "learning_rate": 7.335009768593938e-07, "loss": 0.71748894, "num_input_tokens_seen": 260968515, "step": 12087, "time_per_iteration": 2.8121585845947266 }, { "auxiliary_loss_clip": 0.01115392, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.04074097, "balance_loss_mlp": 1.02153099, "epoch": 0.7267698782504134, "flos": 22195523658240.0, "grad_norm": 5.160414648565969, "language_loss": 0.79164052, "learning_rate": 7.331995778981088e-07, "loss": 0.81314927, "num_input_tokens_seen": 260986790, "step": 12088, "time_per_iteration": 2.563143491744995 }, { "auxiliary_loss_clip": 0.01097059, "auxiliary_loss_mlp": 0.01037688, "balance_loss_clip": 1.03751171, "balance_loss_mlp": 1.02490282, "epoch": 0.7268300015030813, "flos": 18514859996160.0, "grad_norm": 1.723527946831352, "language_loss": 0.73941064, "learning_rate": 7.328982269740221e-07, "loss": 0.76075816, "num_input_tokens_seen": 261004925, "step": 12089, "time_per_iteration": 2.6264712810516357 }, { "auxiliary_loss_clip": 0.01088906, "auxiliary_loss_mlp": 0.01035753, "balance_loss_clip": 1.03559196, "balance_loss_mlp": 1.02308106, "epoch": 0.7268901247557493, "flos": 23986029836160.0, "grad_norm": 1.6147699540484286, "language_loss": 0.70883548, "learning_rate": 7.325969240985616e-07, "loss": 0.73008209, "num_input_tokens_seen": 261023895, "step": 12090, "time_per_iteration": 2.674154281616211 }, { "auxiliary_loss_clip": 0.01057949, "auxiliary_loss_mlp": 0.01033382, "balance_loss_clip": 1.03447902, "balance_loss_mlp": 1.01989388, "epoch": 0.7269502480084172, "flos": 32088087429120.0, "grad_norm": 1.7900263733785062, "language_loss": 0.7724641, "learning_rate": 7.322956692831528e-07, "loss": 0.7933774, "num_input_tokens_seen": 261045445, "step": 12091, "time_per_iteration": 2.837162494659424 }, { "auxiliary_loss_clip": 0.0109404, "auxiliary_loss_mlp": 0.00771553, "balance_loss_clip": 1.03523159, "balance_loss_mlp": 1.00019574, "epoch": 0.7270103712610853, "flos": 19062785036160.0, "grad_norm": 2.0691872442271415, "language_loss": 0.71682477, "learning_rate": 7.319944625392205e-07, "loss": 0.73548067, "num_input_tokens_seen": 261064275, "step": 12092, "time_per_iteration": 2.6305599212646484 }, { "auxiliary_loss_clip": 0.01101746, "auxiliary_loss_mlp": 0.01033427, "balance_loss_clip": 1.03929043, "balance_loss_mlp": 1.02035582, "epoch": 0.7270704945137532, "flos": 34532921710080.0, "grad_norm": 2.2398684774576156, "language_loss": 0.61100423, "learning_rate": 7.31693303878184e-07, "loss": 0.63235605, "num_input_tokens_seen": 261083310, "step": 12093, "time_per_iteration": 2.750157117843628 }, { "auxiliary_loss_clip": 0.01090608, "auxiliary_loss_mlp": 0.01037448, "balance_loss_clip": 1.03955996, "balance_loss_mlp": 1.02412009, "epoch": 0.7271306177664212, "flos": 21507583403520.0, "grad_norm": 1.6663796185948798, "language_loss": 0.75200593, "learning_rate": 7.313921933114644e-07, "loss": 0.77328646, "num_input_tokens_seen": 261103460, "step": 12094, "time_per_iteration": 2.63088059425354 }, { "auxiliary_loss_clip": 0.01076646, "auxiliary_loss_mlp": 0.01031624, "balance_loss_clip": 1.03417659, "balance_loss_mlp": 1.01941681, "epoch": 0.7271907410190891, "flos": 22272444633600.0, "grad_norm": 1.8443350683921131, "language_loss": 0.84625936, "learning_rate": 7.310911308504808e-07, "loss": 0.867342, "num_input_tokens_seen": 261121375, "step": 12095, "time_per_iteration": 2.7300918102264404 }, { "auxiliary_loss_clip": 0.010978, "auxiliary_loss_mlp": 0.01037417, "balance_loss_clip": 1.03561294, "balance_loss_mlp": 1.02383316, "epoch": 0.7272508642717571, "flos": 22893124671360.0, "grad_norm": 2.27024179817087, "language_loss": 0.77610254, "learning_rate": 7.307901165066479e-07, "loss": 0.79745466, "num_input_tokens_seen": 261141105, "step": 12096, "time_per_iteration": 2.754016399383545 }, { "auxiliary_loss_clip": 0.01113914, "auxiliary_loss_mlp": 0.01037152, "balance_loss_clip": 1.04082382, "balance_loss_mlp": 1.02434897, "epoch": 0.727310987524425, "flos": 11655886331520.0, "grad_norm": 1.96001611615308, "language_loss": 0.72508037, "learning_rate": 7.30489150291381e-07, "loss": 0.74659109, "num_input_tokens_seen": 261159255, "step": 12097, "time_per_iteration": 2.57547664642334 }, { "auxiliary_loss_clip": 0.01101296, "auxiliary_loss_mlp": 0.00771623, "balance_loss_clip": 1.03833079, "balance_loss_mlp": 1.00024211, "epoch": 0.727371110777093, "flos": 24535319592960.0, "grad_norm": 1.744636039852928, "language_loss": 0.77178752, "learning_rate": 7.301882322160935e-07, "loss": 0.79051673, "num_input_tokens_seen": 261177960, "step": 12098, "time_per_iteration": 2.697739601135254 }, { "auxiliary_loss_clip": 0.01090376, "auxiliary_loss_mlp": 0.01033826, "balance_loss_clip": 1.03530288, "balance_loss_mlp": 1.02023625, "epoch": 0.7274312340297611, "flos": 74739835405440.0, "grad_norm": 1.6470814614885703, "language_loss": 0.67452812, "learning_rate": 7.298873622921952e-07, "loss": 0.69577014, "num_input_tokens_seen": 261205660, "step": 12099, "time_per_iteration": 4.734724283218384 }, { "auxiliary_loss_clip": 0.01100384, "auxiliary_loss_mlp": 0.01040178, "balance_loss_clip": 1.0354315, "balance_loss_mlp": 1.02401924, "epoch": 0.727491357282429, "flos": 22342865247360.0, "grad_norm": 1.6470477467852347, "language_loss": 0.72511584, "learning_rate": 7.29586540531095e-07, "loss": 0.74652147, "num_input_tokens_seen": 261225185, "step": 12100, "time_per_iteration": 2.6307733058929443 }, { "auxiliary_loss_clip": 0.01101803, "auxiliary_loss_mlp": 0.01038394, "balance_loss_clip": 1.03856468, "balance_loss_mlp": 1.02577031, "epoch": 0.727551480535097, "flos": 23297550877440.0, "grad_norm": 1.4604095726641635, "language_loss": 0.74780536, "learning_rate": 7.292857669442005e-07, "loss": 0.76920736, "num_input_tokens_seen": 261247965, "step": 12101, "time_per_iteration": 2.6731035709381104 }, { "auxiliary_loss_clip": 0.01070063, "auxiliary_loss_mlp": 0.01029767, "balance_loss_clip": 1.03622627, "balance_loss_mlp": 1.01775718, "epoch": 0.7276116037877649, "flos": 21470559459840.0, "grad_norm": 1.7882931756264577, "language_loss": 0.82550085, "learning_rate": 7.289850415429177e-07, "loss": 0.8464992, "num_input_tokens_seen": 261267585, "step": 12102, "time_per_iteration": 5.8568243980407715 }, { "auxiliary_loss_clip": 0.01100092, "auxiliary_loss_mlp": 0.01035417, "balance_loss_clip": 1.03823566, "balance_loss_mlp": 1.02270937, "epoch": 0.7276717270404329, "flos": 21464059098240.0, "grad_norm": 2.5021196197746396, "language_loss": 0.81821334, "learning_rate": 7.286843643386495e-07, "loss": 0.83956838, "num_input_tokens_seen": 261285200, "step": 12103, "time_per_iteration": 2.619070291519165 }, { "auxiliary_loss_clip": 0.0109026, "auxiliary_loss_mlp": 0.01027412, "balance_loss_clip": 1.03774977, "balance_loss_mlp": 1.01372027, "epoch": 0.7277318502931008, "flos": 16837221329280.0, "grad_norm": 1.6323298348226507, "language_loss": 0.66439486, "learning_rate": 7.283837353427968e-07, "loss": 0.68557155, "num_input_tokens_seen": 261303645, "step": 12104, "time_per_iteration": 2.7373523712158203 }, { "auxiliary_loss_clip": 0.01079506, "auxiliary_loss_mlp": 0.01033638, "balance_loss_clip": 1.03706837, "balance_loss_mlp": 1.02034616, "epoch": 0.7277919735457689, "flos": 33400550476800.0, "grad_norm": 3.4364169718839324, "language_loss": 0.66114849, "learning_rate": 7.280831545667611e-07, "loss": 0.68227994, "num_input_tokens_seen": 261323265, "step": 12105, "time_per_iteration": 2.767533302307129 }, { "auxiliary_loss_clip": 0.01115684, "auxiliary_loss_mlp": 0.01034833, "balance_loss_clip": 1.04181576, "balance_loss_mlp": 1.02132106, "epoch": 0.7278520967984368, "flos": 19206499351680.0, "grad_norm": 3.014598256639034, "language_loss": 0.75495023, "learning_rate": 7.27782622021939e-07, "loss": 0.7764554, "num_input_tokens_seen": 261339745, "step": 12106, "time_per_iteration": 2.595414161682129 }, { "auxiliary_loss_clip": 0.01103034, "auxiliary_loss_mlp": 0.01033288, "balance_loss_clip": 1.03735209, "balance_loss_mlp": 1.01898909, "epoch": 0.7279122200511048, "flos": 34094667870720.0, "grad_norm": 2.1351092676673162, "language_loss": 0.70326072, "learning_rate": 7.274821377197273e-07, "loss": 0.72462392, "num_input_tokens_seen": 261359310, "step": 12107, "time_per_iteration": 4.187346935272217 }, { "auxiliary_loss_clip": 0.01094591, "auxiliary_loss_mlp": 0.0103929, "balance_loss_clip": 1.03660846, "balance_loss_mlp": 1.02583683, "epoch": 0.7279723433037727, "flos": 54599049348480.0, "grad_norm": 1.7543215604249431, "language_loss": 0.75391257, "learning_rate": 7.271817016715205e-07, "loss": 0.77525139, "num_input_tokens_seen": 261384640, "step": 12108, "time_per_iteration": 2.922069549560547 }, { "auxiliary_loss_clip": 0.01111137, "auxiliary_loss_mlp": 0.01031166, "balance_loss_clip": 1.03809679, "balance_loss_mlp": 1.01802313, "epoch": 0.7280324665564407, "flos": 36137482156800.0, "grad_norm": 1.5176447474285724, "language_loss": 0.67057818, "learning_rate": 7.268813138887124e-07, "loss": 0.69200122, "num_input_tokens_seen": 261405290, "step": 12109, "time_per_iteration": 2.691226005554199 }, { "auxiliary_loss_clip": 0.01073593, "auxiliary_loss_mlp": 0.01033469, "balance_loss_clip": 1.03573251, "balance_loss_mlp": 1.01958656, "epoch": 0.7280925898091086, "flos": 11618539165440.0, "grad_norm": 2.3584964062920646, "language_loss": 0.63489443, "learning_rate": 7.265809743826912e-07, "loss": 0.65596509, "num_input_tokens_seen": 261419710, "step": 12110, "time_per_iteration": 2.7957284450531006 }, { "auxiliary_loss_clip": 0.01079859, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.03503799, "balance_loss_mlp": 1.01581717, "epoch": 0.7281527130617766, "flos": 34277094069120.0, "grad_norm": 2.403450287181842, "language_loss": 0.58412719, "learning_rate": 7.26280683164847e-07, "loss": 0.60522431, "num_input_tokens_seen": 261442385, "step": 12111, "time_per_iteration": 2.8229284286499023 }, { "auxiliary_loss_clip": 0.01063232, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.03874135, "balance_loss_mlp": 1.01915908, "epoch": 0.7282128363144446, "flos": 13918043018880.0, "grad_norm": 2.038144887813222, "language_loss": 0.73754865, "learning_rate": 7.259804402465677e-07, "loss": 0.75851005, "num_input_tokens_seen": 261459805, "step": 12112, "time_per_iteration": 2.7780139446258545 }, { "auxiliary_loss_clip": 0.01098263, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.03572726, "balance_loss_mlp": 1.01943743, "epoch": 0.7282729595671126, "flos": 20777627214720.0, "grad_norm": 2.316952642046255, "language_loss": 0.66911846, "learning_rate": 7.25680245639237e-07, "loss": 0.69041795, "num_input_tokens_seen": 261477175, "step": 12113, "time_per_iteration": 2.6054317951202393 }, { "auxiliary_loss_clip": 0.01073794, "auxiliary_loss_mlp": 0.01034736, "balance_loss_clip": 1.03603506, "balance_loss_mlp": 1.02081203, "epoch": 0.7283330828197806, "flos": 16325422392960.0, "grad_norm": 2.2071094228181716, "language_loss": 0.73312247, "learning_rate": 7.253800993542399e-07, "loss": 0.75420773, "num_input_tokens_seen": 261494990, "step": 12114, "time_per_iteration": 2.779949188232422 }, { "auxiliary_loss_clip": 0.0108015, "auxiliary_loss_mlp": 0.01031985, "balance_loss_clip": 1.03596735, "balance_loss_mlp": 1.01860976, "epoch": 0.7283932060724485, "flos": 27490193043840.0, "grad_norm": 2.0284186088728604, "language_loss": 0.68312764, "learning_rate": 7.250800014029564e-07, "loss": 0.70424896, "num_input_tokens_seen": 261514445, "step": 12115, "time_per_iteration": 2.7396066188812256 }, { "auxiliary_loss_clip": 0.01112838, "auxiliary_loss_mlp": 0.01035969, "balance_loss_clip": 1.03786767, "balance_loss_mlp": 1.02284992, "epoch": 0.7284533293251165, "flos": 18367877543040.0, "grad_norm": 1.7392304859469863, "language_loss": 0.60055017, "learning_rate": 7.247799517967674e-07, "loss": 0.62203836, "num_input_tokens_seen": 261533565, "step": 12116, "time_per_iteration": 2.6416893005371094 }, { "auxiliary_loss_clip": 0.01101571, "auxiliary_loss_mlp": 0.010328, "balance_loss_clip": 1.03989601, "balance_loss_mlp": 1.01943648, "epoch": 0.7285134525777844, "flos": 21725525174400.0, "grad_norm": 1.8456050280461243, "language_loss": 0.73165786, "learning_rate": 7.2447995054705e-07, "loss": 0.75300157, "num_input_tokens_seen": 261553795, "step": 12117, "time_per_iteration": 2.680856704711914 }, { "auxiliary_loss_clip": 0.01096697, "auxiliary_loss_mlp": 0.01032842, "balance_loss_clip": 1.03561711, "balance_loss_mlp": 1.01907897, "epoch": 0.7285735758304525, "flos": 20741357456640.0, "grad_norm": 1.892233976782661, "language_loss": 0.69420332, "learning_rate": 7.241799976651807e-07, "loss": 0.71549869, "num_input_tokens_seen": 261572565, "step": 12118, "time_per_iteration": 2.689328908920288 }, { "auxiliary_loss_clip": 0.01054191, "auxiliary_loss_mlp": 0.01039747, "balance_loss_clip": 1.03333414, "balance_loss_mlp": 1.026968, "epoch": 0.7286336990831204, "flos": 17310954827520.0, "grad_norm": 6.128645472594502, "language_loss": 0.84134108, "learning_rate": 7.238800931625346e-07, "loss": 0.86228043, "num_input_tokens_seen": 261590910, "step": 12119, "time_per_iteration": 2.811901330947876 }, { "auxiliary_loss_clip": 0.01112084, "auxiliary_loss_mlp": 0.01029087, "balance_loss_clip": 1.03825903, "balance_loss_mlp": 1.01655173, "epoch": 0.7286938223357884, "flos": 19787390098560.0, "grad_norm": 2.0681771064873544, "language_loss": 0.81878972, "learning_rate": 7.235802370504831e-07, "loss": 0.84020138, "num_input_tokens_seen": 261606005, "step": 12120, "time_per_iteration": 2.6672909259796143 }, { "auxiliary_loss_clip": 0.01072804, "auxiliary_loss_mlp": 0.01040557, "balance_loss_clip": 1.03617036, "balance_loss_mlp": 1.02706861, "epoch": 0.7287539455884563, "flos": 15340859625600.0, "grad_norm": 1.933953511288546, "language_loss": 0.7878201, "learning_rate": 7.232804293403963e-07, "loss": 0.8089537, "num_input_tokens_seen": 261622305, "step": 12121, "time_per_iteration": 2.6193950176239014 }, { "auxiliary_loss_clip": 0.01111609, "auxiliary_loss_mlp": 0.01036655, "balance_loss_clip": 1.0360496, "balance_loss_mlp": 1.02327943, "epoch": 0.7288140688411243, "flos": 25192484870400.0, "grad_norm": 1.533681893436525, "language_loss": 0.69097638, "learning_rate": 7.229806700436441e-07, "loss": 0.71245903, "num_input_tokens_seen": 261642465, "step": 12122, "time_per_iteration": 2.650777578353882 }, { "auxiliary_loss_clip": 0.01064636, "auxiliary_loss_mlp": 0.01033566, "balance_loss_clip": 1.03321254, "balance_loss_mlp": 1.02150214, "epoch": 0.7288741920937922, "flos": 23984162328960.0, "grad_norm": 1.9841747121514857, "language_loss": 0.87224233, "learning_rate": 7.226809591715923e-07, "loss": 0.89322436, "num_input_tokens_seen": 261661420, "step": 12123, "time_per_iteration": 2.767803907394409 }, { "auxiliary_loss_clip": 0.01077874, "auxiliary_loss_mlp": 0.01035309, "balance_loss_clip": 1.0370611, "balance_loss_mlp": 1.02279758, "epoch": 0.7289343153464602, "flos": 22744921155840.0, "grad_norm": 19.006549121525065, "language_loss": 0.8255595, "learning_rate": 7.223812967356065e-07, "loss": 0.84669125, "num_input_tokens_seen": 261680865, "step": 12124, "time_per_iteration": 2.7401580810546875 }, { "auxiliary_loss_clip": 0.01082733, "auxiliary_loss_mlp": 0.01032589, "balance_loss_clip": 1.03729665, "balance_loss_mlp": 1.01955354, "epoch": 0.7289944385991282, "flos": 24900028335360.0, "grad_norm": 2.2469511782017726, "language_loss": 0.67069578, "learning_rate": 7.220816827470499e-07, "loss": 0.69184899, "num_input_tokens_seen": 261701455, "step": 12125, "time_per_iteration": 2.681535243988037 }, { "auxiliary_loss_clip": 0.01104267, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.03742492, "balance_loss_mlp": 1.02412462, "epoch": 0.7290545618517962, "flos": 22967064817920.0, "grad_norm": 2.039401823737763, "language_loss": 0.74920547, "learning_rate": 7.217821172172855e-07, "loss": 0.77062899, "num_input_tokens_seen": 261721260, "step": 12126, "time_per_iteration": 2.6920571327209473 }, { "auxiliary_loss_clip": 0.01016131, "auxiliary_loss_mlp": 0.00997812, "balance_loss_clip": 1.01327682, "balance_loss_mlp": 0.99669784, "epoch": 0.7291146851044642, "flos": 61901523216000.0, "grad_norm": 0.8366377087030958, "language_loss": 0.5864383, "learning_rate": 7.2148260015767e-07, "loss": 0.60657775, "num_input_tokens_seen": 261779370, "step": 12127, "time_per_iteration": 3.1948511600494385 }, { "auxiliary_loss_clip": 0.01076598, "auxiliary_loss_mlp": 0.01031063, "balance_loss_clip": 1.03621507, "balance_loss_mlp": 1.01911807, "epoch": 0.7291748083571321, "flos": 23330947547520.0, "grad_norm": 2.1989684199567376, "language_loss": 0.68995476, "learning_rate": 7.21183131579562e-07, "loss": 0.71103132, "num_input_tokens_seen": 261798050, "step": 12128, "time_per_iteration": 2.761828899383545 }, { "auxiliary_loss_clip": 0.01085147, "auxiliary_loss_mlp": 0.01035559, "balance_loss_clip": 1.03663111, "balance_loss_mlp": 1.02137899, "epoch": 0.7292349316098001, "flos": 28330000001280.0, "grad_norm": 1.8229974773388113, "language_loss": 0.65319067, "learning_rate": 7.20883711494319e-07, "loss": 0.67439777, "num_input_tokens_seen": 261817660, "step": 12129, "time_per_iteration": 2.7223851680755615 }, { "auxiliary_loss_clip": 0.01108826, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.03813577, "balance_loss_mlp": 1.01728415, "epoch": 0.729295054862468, "flos": 24132222190080.0, "grad_norm": 1.987746290779436, "language_loss": 0.74474001, "learning_rate": 7.205843399132927e-07, "loss": 0.7661339, "num_input_tokens_seen": 261837935, "step": 12130, "time_per_iteration": 2.624861001968384 }, { "auxiliary_loss_clip": 0.01084684, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.03371596, "balance_loss_mlp": 1.02260351, "epoch": 0.7293551781151361, "flos": 22816239609600.0, "grad_norm": 1.9230016702733295, "language_loss": 0.69777483, "learning_rate": 7.202850168478374e-07, "loss": 0.71898174, "num_input_tokens_seen": 261857575, "step": 12131, "time_per_iteration": 2.686483383178711 }, { "auxiliary_loss_clip": 0.01075038, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.03706694, "balance_loss_mlp": 1.02072525, "epoch": 0.729415301367804, "flos": 22126683242880.0, "grad_norm": 1.5997534699121376, "language_loss": 0.77348047, "learning_rate": 7.199857423093025e-07, "loss": 0.79455858, "num_input_tokens_seen": 261877265, "step": 12132, "time_per_iteration": 2.7391042709350586 }, { "auxiliary_loss_clip": 0.0110301, "auxiliary_loss_mlp": 0.01038259, "balance_loss_clip": 1.03978968, "balance_loss_mlp": 1.02559876, "epoch": 0.729475424620472, "flos": 12349608675840.0, "grad_norm": 2.2281458510507797, "language_loss": 0.78860861, "learning_rate": 7.196865163090358e-07, "loss": 0.81002128, "num_input_tokens_seen": 261893695, "step": 12133, "time_per_iteration": 2.5943353176116943 }, { "auxiliary_loss_clip": 0.01060968, "auxiliary_loss_mlp": 0.01032725, "balance_loss_clip": 1.03212547, "balance_loss_mlp": 1.01933742, "epoch": 0.7295355478731399, "flos": 22195308176640.0, "grad_norm": 2.7553273898402333, "language_loss": 0.72054434, "learning_rate": 7.193873388583846e-07, "loss": 0.7414813, "num_input_tokens_seen": 261911825, "step": 12134, "time_per_iteration": 2.764251470565796 }, { "auxiliary_loss_clip": 0.01091285, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.03840399, "balance_loss_mlp": 1.02753675, "epoch": 0.7295956711258079, "flos": 23222030532480.0, "grad_norm": 2.1447336349614203, "language_loss": 0.71251649, "learning_rate": 7.190882099686939e-07, "loss": 0.73383397, "num_input_tokens_seen": 261931190, "step": 12135, "time_per_iteration": 2.7322559356689453 }, { "auxiliary_loss_clip": 0.01077251, "auxiliary_loss_mlp": 0.01035683, "balance_loss_clip": 1.03486896, "balance_loss_mlp": 1.02259374, "epoch": 0.7296557943784758, "flos": 31869104163840.0, "grad_norm": 2.309450763309982, "language_loss": 0.61924529, "learning_rate": 7.187891296513075e-07, "loss": 0.64037454, "num_input_tokens_seen": 261951240, "step": 12136, "time_per_iteration": 2.7608072757720947 }, { "auxiliary_loss_clip": 0.01094465, "auxiliary_loss_mlp": 0.00770512, "balance_loss_clip": 1.03708506, "balance_loss_mlp": 1.00022686, "epoch": 0.7297159176311439, "flos": 26651714889600.0, "grad_norm": 1.8756317332834676, "language_loss": 0.74414635, "learning_rate": 7.184900979175654e-07, "loss": 0.76279616, "num_input_tokens_seen": 261971605, "step": 12137, "time_per_iteration": 2.6699535846710205 }, { "auxiliary_loss_clip": 0.01104052, "auxiliary_loss_mlp": 0.00771068, "balance_loss_clip": 1.04109406, "balance_loss_mlp": 1.00024545, "epoch": 0.7297760408838118, "flos": 24749562263040.0, "grad_norm": 1.6416252206910797, "language_loss": 0.74556518, "learning_rate": 7.181911147788069e-07, "loss": 0.76431638, "num_input_tokens_seen": 261990830, "step": 12138, "time_per_iteration": 2.6462252140045166 }, { "auxiliary_loss_clip": 0.01073993, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.03440869, "balance_loss_mlp": 1.01832712, "epoch": 0.7298361641364798, "flos": 18073768982400.0, "grad_norm": 2.2048130444672527, "language_loss": 0.71792364, "learning_rate": 7.178921802463702e-07, "loss": 0.73896933, "num_input_tokens_seen": 262008190, "step": 12139, "time_per_iteration": 2.637579917907715 }, { "auxiliary_loss_clip": 0.01094798, "auxiliary_loss_mlp": 0.01029337, "balance_loss_clip": 1.03654766, "balance_loss_mlp": 1.01727343, "epoch": 0.7298962873891478, "flos": 29895597169920.0, "grad_norm": 1.5727231241692394, "language_loss": 0.73340857, "learning_rate": 7.175932943315898e-07, "loss": 0.75464988, "num_input_tokens_seen": 262030460, "step": 12140, "time_per_iteration": 4.322738170623779 }, { "auxiliary_loss_clip": 0.01086242, "auxiliary_loss_mlp": 0.01033553, "balance_loss_clip": 1.03733993, "balance_loss_mlp": 1.02028465, "epoch": 0.7299564106418157, "flos": 32266096254720.0, "grad_norm": 2.108634462016176, "language_loss": 0.55439997, "learning_rate": 7.172944570458003e-07, "loss": 0.57559788, "num_input_tokens_seen": 262050830, "step": 12141, "time_per_iteration": 4.280510425567627 }, { "auxiliary_loss_clip": 0.01072661, "auxiliary_loss_mlp": 0.01030923, "balance_loss_clip": 1.03414416, "balance_loss_mlp": 1.0185132, "epoch": 0.7300165338944837, "flos": 22930292269440.0, "grad_norm": 1.6200088413354243, "language_loss": 0.72661757, "learning_rate": 7.169956684003342e-07, "loss": 0.74765337, "num_input_tokens_seen": 262071245, "step": 12142, "time_per_iteration": 4.36347508430481 }, { "auxiliary_loss_clip": 0.01109011, "auxiliary_loss_mlp": 0.01039998, "balance_loss_clip": 1.03754866, "balance_loss_mlp": 1.02798176, "epoch": 0.7300766571471516, "flos": 19828795501440.0, "grad_norm": 1.8395683964833187, "language_loss": 0.73354667, "learning_rate": 7.16696928406521e-07, "loss": 0.75503671, "num_input_tokens_seen": 262087525, "step": 12143, "time_per_iteration": 2.562661647796631 }, { "auxiliary_loss_clip": 0.01072117, "auxiliary_loss_mlp": 0.01036251, "balance_loss_clip": 1.0354147, "balance_loss_mlp": 1.02270293, "epoch": 0.7301367803998197, "flos": 24347829576960.0, "grad_norm": 11.58693755368333, "language_loss": 0.67069697, "learning_rate": 7.163982370756882e-07, "loss": 0.69178069, "num_input_tokens_seen": 262107355, "step": 12144, "time_per_iteration": 2.7019169330596924 }, { "auxiliary_loss_clip": 0.01087218, "auxiliary_loss_mlp": 0.01031157, "balance_loss_clip": 1.03756452, "balance_loss_mlp": 1.01808596, "epoch": 0.7301969036524876, "flos": 15304518040320.0, "grad_norm": 2.004686825288867, "language_loss": 0.79088622, "learning_rate": 7.160995944191627e-07, "loss": 0.81206995, "num_input_tokens_seen": 262125645, "step": 12145, "time_per_iteration": 2.609962224960327 }, { "auxiliary_loss_clip": 0.01071068, "auxiliary_loss_mlp": 0.01038463, "balance_loss_clip": 1.03582478, "balance_loss_mlp": 1.02542722, "epoch": 0.7302570269051556, "flos": 23507268433920.0, "grad_norm": 2.189602190838667, "language_loss": 0.91191077, "learning_rate": 7.158010004482702e-07, "loss": 0.93300605, "num_input_tokens_seen": 262144075, "step": 12146, "time_per_iteration": 4.17360258102417 }, { "auxiliary_loss_clip": 0.01107983, "auxiliary_loss_mlp": 0.01027437, "balance_loss_clip": 1.03820586, "balance_loss_mlp": 1.01547432, "epoch": 0.7303171501578235, "flos": 20523056549760.0, "grad_norm": 1.801228566583195, "language_loss": 0.62361127, "learning_rate": 7.155024551743316e-07, "loss": 0.64496547, "num_input_tokens_seen": 262165940, "step": 12147, "time_per_iteration": 2.7316384315490723 }, { "auxiliary_loss_clip": 0.01113892, "auxiliary_loss_mlp": 0.01039081, "balance_loss_clip": 1.0402323, "balance_loss_mlp": 1.02578294, "epoch": 0.7303772734104915, "flos": 18332613365760.0, "grad_norm": 1.9466892860385239, "language_loss": 0.75526571, "learning_rate": 7.152039586086693e-07, "loss": 0.77679539, "num_input_tokens_seen": 262184520, "step": 12148, "time_per_iteration": 2.55757999420166 }, { "auxiliary_loss_clip": 0.01010613, "auxiliary_loss_mlp": 0.0075184, "balance_loss_clip": 1.00818348, "balance_loss_mlp": 0.99964029, "epoch": 0.7304373966631594, "flos": 60654776100480.0, "grad_norm": 0.6918687189528673, "language_loss": 0.56630087, "learning_rate": 7.149055107626017e-07, "loss": 0.58392537, "num_input_tokens_seen": 262247070, "step": 12149, "time_per_iteration": 3.1780648231506348 }, { "auxiliary_loss_clip": 0.01090981, "auxiliary_loss_mlp": 0.01036437, "balance_loss_clip": 1.03665161, "balance_loss_mlp": 1.02352667, "epoch": 0.7304975199158275, "flos": 19828077229440.0, "grad_norm": 1.6617515713368272, "language_loss": 0.73949683, "learning_rate": 7.146071116474451e-07, "loss": 0.76077104, "num_input_tokens_seen": 262266605, "step": 12150, "time_per_iteration": 2.6775600910186768 }, { "auxiliary_loss_clip": 0.0111323, "auxiliary_loss_mlp": 0.01034483, "balance_loss_clip": 1.03854418, "balance_loss_mlp": 1.02156699, "epoch": 0.7305576431684954, "flos": 13223997452160.0, "grad_norm": 2.052406638174018, "language_loss": 0.84060204, "learning_rate": 7.143087612745158e-07, "loss": 0.86207914, "num_input_tokens_seen": 262283880, "step": 12151, "time_per_iteration": 2.589292049407959 }, { "auxiliary_loss_clip": 0.01072466, "auxiliary_loss_mlp": 0.01040374, "balance_loss_clip": 1.03497267, "balance_loss_mlp": 1.02686191, "epoch": 0.7306177664211634, "flos": 24060472773120.0, "grad_norm": 1.844893248025129, "language_loss": 0.78079808, "learning_rate": 7.14010459655127e-07, "loss": 0.80192649, "num_input_tokens_seen": 262304155, "step": 12152, "time_per_iteration": 2.7783727645874023 }, { "auxiliary_loss_clip": 0.01075382, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.03711772, "balance_loss_mlp": 1.01889646, "epoch": 0.7306778896738314, "flos": 27089106802560.0, "grad_norm": 2.295487047202377, "language_loss": 0.79554176, "learning_rate": 7.137122068005919e-07, "loss": 0.81661606, "num_input_tokens_seen": 262325660, "step": 12153, "time_per_iteration": 2.773252010345459 }, { "auxiliary_loss_clip": 0.01100913, "auxiliary_loss_mlp": 0.01037363, "balance_loss_clip": 1.03726029, "balance_loss_mlp": 1.02455413, "epoch": 0.7307380129264993, "flos": 16690669839360.0, "grad_norm": 1.708854446027603, "language_loss": 0.67438841, "learning_rate": 7.134140027222173e-07, "loss": 0.69577122, "num_input_tokens_seen": 262344075, "step": 12154, "time_per_iteration": 2.657804489135742 }, { "auxiliary_loss_clip": 0.01064569, "auxiliary_loss_mlp": 0.01032574, "balance_loss_clip": 1.03754902, "balance_loss_mlp": 1.01900196, "epoch": 0.7307981361791673, "flos": 21725740656000.0, "grad_norm": 1.7409892720521978, "language_loss": 0.6598506, "learning_rate": 7.131158474313128e-07, "loss": 0.68082201, "num_input_tokens_seen": 262363305, "step": 12155, "time_per_iteration": 2.727818012237549 }, { "auxiliary_loss_clip": 0.01090955, "auxiliary_loss_mlp": 0.01028633, "balance_loss_clip": 1.03944302, "balance_loss_mlp": 1.01606798, "epoch": 0.7308582594318352, "flos": 18040659621120.0, "grad_norm": 2.059846064937341, "language_loss": 0.81401372, "learning_rate": 7.128177409391851e-07, "loss": 0.83520961, "num_input_tokens_seen": 262380730, "step": 12156, "time_per_iteration": 2.6713905334472656 }, { "auxiliary_loss_clip": 0.01069178, "auxiliary_loss_mlp": 0.01038604, "balance_loss_clip": 1.03357935, "balance_loss_mlp": 1.02677894, "epoch": 0.7309183826845033, "flos": 13844964798720.0, "grad_norm": 2.368813587947745, "language_loss": 0.7572211, "learning_rate": 7.125196832571367e-07, "loss": 0.77829891, "num_input_tokens_seen": 262395480, "step": 12157, "time_per_iteration": 2.6478710174560547 }, { "auxiliary_loss_clip": 0.01097661, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.03817534, "balance_loss_mlp": 1.01818156, "epoch": 0.7309785059371712, "flos": 17019216564480.0, "grad_norm": 2.20999557197409, "language_loss": 0.72660947, "learning_rate": 7.122216743964713e-07, "loss": 0.74787986, "num_input_tokens_seen": 262413340, "step": 12158, "time_per_iteration": 2.6752305030822754 }, { "auxiliary_loss_clip": 0.01090002, "auxiliary_loss_mlp": 0.01036269, "balance_loss_clip": 1.03874135, "balance_loss_mlp": 1.02343071, "epoch": 0.7310386291898392, "flos": 26502398052480.0, "grad_norm": 1.5980086656224926, "language_loss": 0.85433125, "learning_rate": 7.119237143684896e-07, "loss": 0.87559396, "num_input_tokens_seen": 262433455, "step": 12159, "time_per_iteration": 2.722282886505127 }, { "auxiliary_loss_clip": 0.01090808, "auxiliary_loss_mlp": 0.01033782, "balance_loss_clip": 1.0357151, "balance_loss_mlp": 1.01996553, "epoch": 0.7310987524425071, "flos": 16945922862720.0, "grad_norm": 2.240373926166887, "language_loss": 0.73471999, "learning_rate": 7.116258031844895e-07, "loss": 0.75596595, "num_input_tokens_seen": 262450335, "step": 12160, "time_per_iteration": 2.6522862911224365 }, { "auxiliary_loss_clip": 0.01103069, "auxiliary_loss_mlp": 0.01035667, "balance_loss_clip": 1.0388577, "balance_loss_mlp": 1.0220058, "epoch": 0.7311588756951751, "flos": 13845288021120.0, "grad_norm": 1.9039689153632533, "language_loss": 0.72493577, "learning_rate": 7.113279408557675e-07, "loss": 0.74632311, "num_input_tokens_seen": 262468240, "step": 12161, "time_per_iteration": 2.5589683055877686 }, { "auxiliary_loss_clip": 0.01083193, "auxiliary_loss_mlp": 0.00772186, "balance_loss_clip": 1.03667367, "balance_loss_mlp": 1.00028253, "epoch": 0.731218998947843, "flos": 28767894704640.0, "grad_norm": 1.765712961659712, "language_loss": 0.69565916, "learning_rate": 7.110301273936192e-07, "loss": 0.71421289, "num_input_tokens_seen": 262487045, "step": 12162, "time_per_iteration": 2.8083322048187256 }, { "auxiliary_loss_clip": 0.01102238, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.03934407, "balance_loss_mlp": 1.01765895, "epoch": 0.7312791222005111, "flos": 27088783580160.0, "grad_norm": 1.79396916880486, "language_loss": 0.66982478, "learning_rate": 7.107323628093382e-07, "loss": 0.69115686, "num_input_tokens_seen": 262504855, "step": 12163, "time_per_iteration": 2.664005756378174 }, { "auxiliary_loss_clip": 0.01088818, "auxiliary_loss_mlp": 0.01029215, "balance_loss_clip": 1.03657246, "balance_loss_mlp": 1.01618505, "epoch": 0.731339245453179, "flos": 20924035050240.0, "grad_norm": 1.4858782021210455, "language_loss": 0.68422931, "learning_rate": 7.104346471142153e-07, "loss": 0.70540965, "num_input_tokens_seen": 262524920, "step": 12164, "time_per_iteration": 2.730407953262329 }, { "auxiliary_loss_clip": 0.01064444, "auxiliary_loss_mlp": 0.01035496, "balance_loss_clip": 1.03925169, "balance_loss_mlp": 1.02344418, "epoch": 0.731399368705847, "flos": 23075694524160.0, "grad_norm": 1.621904213104564, "language_loss": 0.73121232, "learning_rate": 7.101369803195391e-07, "loss": 0.75221169, "num_input_tokens_seen": 262545725, "step": 12165, "time_per_iteration": 2.745304584503174 }, { "auxiliary_loss_clip": 0.01104061, "auxiliary_loss_mlp": 0.0103506, "balance_loss_clip": 1.03919411, "balance_loss_mlp": 1.02191114, "epoch": 0.731459491958515, "flos": 23582681038080.0, "grad_norm": 1.959130136013477, "language_loss": 0.7631768, "learning_rate": 7.098393624365988e-07, "loss": 0.78456795, "num_input_tokens_seen": 262565480, "step": 12166, "time_per_iteration": 2.655210256576538 }, { "auxiliary_loss_clip": 0.01083193, "auxiliary_loss_mlp": 0.01031215, "balance_loss_clip": 1.03837287, "balance_loss_mlp": 1.01877546, "epoch": 0.7315196152111829, "flos": 22379278659840.0, "grad_norm": 1.7735251016583573, "language_loss": 0.79791737, "learning_rate": 7.095417934766781e-07, "loss": 0.81906146, "num_input_tokens_seen": 262584145, "step": 12167, "time_per_iteration": 2.686013698577881 }, { "auxiliary_loss_clip": 0.01099781, "auxiliary_loss_mlp": 0.01043597, "balance_loss_clip": 1.03856659, "balance_loss_mlp": 1.03108573, "epoch": 0.7315797384638509, "flos": 26177047637760.0, "grad_norm": 1.6689116898679521, "language_loss": 0.76710904, "learning_rate": 7.092442734510622e-07, "loss": 0.78854281, "num_input_tokens_seen": 262604045, "step": 12168, "time_per_iteration": 2.6875557899475098 }, { "auxiliary_loss_clip": 0.0109665, "auxiliary_loss_mlp": 0.01043712, "balance_loss_clip": 1.03574252, "balance_loss_mlp": 1.02774954, "epoch": 0.7316398617165188, "flos": 21506326427520.0, "grad_norm": 2.5442709815389684, "language_loss": 0.81822222, "learning_rate": 7.089468023710326e-07, "loss": 0.83962584, "num_input_tokens_seen": 262624540, "step": 12169, "time_per_iteration": 2.592453718185425 }, { "auxiliary_loss_clip": 0.01097824, "auxiliary_loss_mlp": 0.01039563, "balance_loss_clip": 1.03882432, "balance_loss_mlp": 1.0264802, "epoch": 0.7316999849691869, "flos": 30482557315200.0, "grad_norm": 1.9915594425883627, "language_loss": 0.69992799, "learning_rate": 7.08649380247871e-07, "loss": 0.72130191, "num_input_tokens_seen": 262644545, "step": 12170, "time_per_iteration": 2.7040326595306396 }, { "auxiliary_loss_clip": 0.01109905, "auxiliary_loss_mlp": 0.01032057, "balance_loss_clip": 1.03831005, "balance_loss_mlp": 1.01799059, "epoch": 0.7317601082218548, "flos": 21543781334400.0, "grad_norm": 15.0863481947429, "language_loss": 0.69820881, "learning_rate": 7.083520070928533e-07, "loss": 0.71962845, "num_input_tokens_seen": 262662570, "step": 12171, "time_per_iteration": 2.5760347843170166 }, { "auxiliary_loss_clip": 0.01111903, "auxiliary_loss_mlp": 0.0104052, "balance_loss_clip": 1.03991163, "balance_loss_mlp": 1.0280571, "epoch": 0.7318202314745228, "flos": 33251592775680.0, "grad_norm": 4.139375107953077, "language_loss": 0.65600061, "learning_rate": 7.080546829172564e-07, "loss": 0.67752481, "num_input_tokens_seen": 262683245, "step": 12172, "time_per_iteration": 2.629512071609497 }, { "auxiliary_loss_clip": 0.01112155, "auxiliary_loss_mlp": 0.01027678, "balance_loss_clip": 1.03968287, "balance_loss_mlp": 1.01504803, "epoch": 0.7318803547271907, "flos": 20157054917760.0, "grad_norm": 2.4544456450965577, "language_loss": 0.6181004, "learning_rate": 7.077574077323564e-07, "loss": 0.63949871, "num_input_tokens_seen": 262701585, "step": 12173, "time_per_iteration": 2.714617967605591 }, { "auxiliary_loss_clip": 0.01056565, "auxiliary_loss_mlp": 0.01030506, "balance_loss_clip": 1.03468084, "balance_loss_mlp": 1.01789331, "epoch": 0.7319404779798587, "flos": 20558536208640.0, "grad_norm": 3.4474002403228714, "language_loss": 0.74141943, "learning_rate": 7.074601815494243e-07, "loss": 0.76229018, "num_input_tokens_seen": 262719295, "step": 12174, "time_per_iteration": 2.691361427307129 }, { "auxiliary_loss_clip": 0.0110738, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.03786492, "balance_loss_mlp": 1.01689529, "epoch": 0.7320006012325266, "flos": 28695391102080.0, "grad_norm": 1.70169272855857, "language_loss": 0.80771077, "learning_rate": 7.071630043797317e-07, "loss": 0.82907599, "num_input_tokens_seen": 262739995, "step": 12175, "time_per_iteration": 2.6333701610565186 }, { "auxiliary_loss_clip": 0.01091186, "auxiliary_loss_mlp": 0.01029927, "balance_loss_clip": 1.03785181, "balance_loss_mlp": 1.01719511, "epoch": 0.7320607244851947, "flos": 16362697731840.0, "grad_norm": 2.2994636661960777, "language_loss": 0.76175666, "learning_rate": 7.068658762345488e-07, "loss": 0.78296781, "num_input_tokens_seen": 262757680, "step": 12176, "time_per_iteration": 2.6684181690216064 }, { "auxiliary_loss_clip": 0.01099222, "auxiliary_loss_mlp": 0.01033517, "balance_loss_clip": 1.03950393, "balance_loss_mlp": 1.02143455, "epoch": 0.7321208477378626, "flos": 20955097336320.0, "grad_norm": 1.7266339084119442, "language_loss": 0.76393938, "learning_rate": 7.065687971251399e-07, "loss": 0.78526676, "num_input_tokens_seen": 262776990, "step": 12177, "time_per_iteration": 2.5895602703094482 }, { "auxiliary_loss_clip": 0.01076316, "auxiliary_loss_mlp": 0.0103859, "balance_loss_clip": 1.03529096, "balance_loss_mlp": 1.02638888, "epoch": 0.7321809709905306, "flos": 13845072539520.0, "grad_norm": 2.2196900974647003, "language_loss": 0.74673522, "learning_rate": 7.06271767062772e-07, "loss": 0.76788431, "num_input_tokens_seen": 262795440, "step": 12178, "time_per_iteration": 2.6741504669189453 }, { "auxiliary_loss_clip": 0.01091987, "auxiliary_loss_mlp": 0.01034757, "balance_loss_clip": 1.03604901, "balance_loss_mlp": 1.02187705, "epoch": 0.7322410942431986, "flos": 26979938392320.0, "grad_norm": 2.2839200958654584, "language_loss": 0.82424951, "learning_rate": 7.059747860587084e-07, "loss": 0.84551692, "num_input_tokens_seen": 262816385, "step": 12179, "time_per_iteration": 4.333508253097534 }, { "auxiliary_loss_clip": 0.01073556, "auxiliary_loss_mlp": 0.01040091, "balance_loss_clip": 1.03531742, "balance_loss_mlp": 1.02663827, "epoch": 0.7323012174958665, "flos": 17639717034240.0, "grad_norm": 4.252835567274656, "language_loss": 0.74462938, "learning_rate": 7.056778541242115e-07, "loss": 0.76576585, "num_input_tokens_seen": 262834955, "step": 12180, "time_per_iteration": 2.64694881439209 }, { "auxiliary_loss_clip": 0.01100626, "auxiliary_loss_mlp": 0.00770628, "balance_loss_clip": 1.03525329, "balance_loss_mlp": 1.00013947, "epoch": 0.7323613407485345, "flos": 32342765834880.0, "grad_norm": 2.118039690946721, "language_loss": 0.79425126, "learning_rate": 7.053809712705396e-07, "loss": 0.81296378, "num_input_tokens_seen": 262853555, "step": 12181, "time_per_iteration": 5.950862407684326 }, { "auxiliary_loss_clip": 0.01104749, "auxiliary_loss_mlp": 0.00770994, "balance_loss_clip": 1.0405333, "balance_loss_mlp": 1.00015044, "epoch": 0.7324214640012024, "flos": 18362777811840.0, "grad_norm": 3.5037562339731343, "language_loss": 0.72006238, "learning_rate": 7.050841375089506e-07, "loss": 0.73881984, "num_input_tokens_seen": 262870975, "step": 12182, "time_per_iteration": 2.60955810546875 }, { "auxiliary_loss_clip": 0.01113664, "auxiliary_loss_mlp": 0.01031542, "balance_loss_clip": 1.04023218, "balance_loss_mlp": 1.01922774, "epoch": 0.7324815872538705, "flos": 30812289189120.0, "grad_norm": 1.455017822583619, "language_loss": 0.7080251, "learning_rate": 7.047873528507015e-07, "loss": 0.72947717, "num_input_tokens_seen": 262892635, "step": 12183, "time_per_iteration": 2.651121139526367 }, { "auxiliary_loss_clip": 0.01100782, "auxiliary_loss_mlp": 0.01035961, "balance_loss_clip": 1.04088736, "balance_loss_mlp": 1.02230549, "epoch": 0.7325417105065384, "flos": 21505069451520.0, "grad_norm": 1.9960836350213491, "language_loss": 0.73006004, "learning_rate": 7.04490617307045e-07, "loss": 0.75142741, "num_input_tokens_seen": 262910725, "step": 12184, "time_per_iteration": 4.158590078353882 }, { "auxiliary_loss_clip": 0.01011352, "auxiliary_loss_mlp": 0.01007926, "balance_loss_clip": 1.00717974, "balance_loss_mlp": 1.0068059, "epoch": 0.7326018337592064, "flos": 67257742556160.0, "grad_norm": 0.7629811613061157, "language_loss": 0.65181279, "learning_rate": 7.041939308892344e-07, "loss": 0.67200553, "num_input_tokens_seen": 262974150, "step": 12185, "time_per_iteration": 3.1753084659576416 }, { "auxiliary_loss_clip": 0.01110902, "auxiliary_loss_mlp": 0.01027525, "balance_loss_clip": 1.03751791, "balance_loss_mlp": 1.01419187, "epoch": 0.7326619570118743, "flos": 22857070394880.0, "grad_norm": 1.8466605492768327, "language_loss": 0.80407894, "learning_rate": 7.038972936085197e-07, "loss": 0.82546324, "num_input_tokens_seen": 262993370, "step": 12186, "time_per_iteration": 2.7113280296325684 }, { "auxiliary_loss_clip": 0.01095897, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.03822923, "balance_loss_mlp": 1.02185869, "epoch": 0.7327220802645423, "flos": 23327499841920.0, "grad_norm": 1.6891374777680592, "language_loss": 0.73376352, "learning_rate": 7.036007054761508e-07, "loss": 0.75508153, "num_input_tokens_seen": 263012665, "step": 12187, "time_per_iteration": 2.6341447830200195 }, { "auxiliary_loss_clip": 0.01113144, "auxiliary_loss_mlp": 0.01032976, "balance_loss_clip": 1.03975987, "balance_loss_mlp": 1.020298, "epoch": 0.7327822035172102, "flos": 23180661043200.0, "grad_norm": 1.849813706667638, "language_loss": 0.88717717, "learning_rate": 7.033041665033716e-07, "loss": 0.90863836, "num_input_tokens_seen": 263031475, "step": 12188, "time_per_iteration": 2.5466268062591553 }, { "auxiliary_loss_clip": 0.01068599, "auxiliary_loss_mlp": 0.01036205, "balance_loss_clip": 1.03427935, "balance_loss_mlp": 1.02241302, "epoch": 0.7328423267698783, "flos": 21066600130560.0, "grad_norm": 2.0499334322207856, "language_loss": 0.74851215, "learning_rate": 7.030076767014284e-07, "loss": 0.76956022, "num_input_tokens_seen": 263051445, "step": 12189, "time_per_iteration": 2.7621939182281494 }, { "auxiliary_loss_clip": 0.01078663, "auxiliary_loss_mlp": 0.0103229, "balance_loss_clip": 1.03718972, "balance_loss_mlp": 1.01898003, "epoch": 0.7329024500225462, "flos": 21689578638720.0, "grad_norm": 1.96321719925377, "language_loss": 0.82236755, "learning_rate": 7.027112360815648e-07, "loss": 0.84347707, "num_input_tokens_seen": 263070835, "step": 12190, "time_per_iteration": 2.701537609100342 }, { "auxiliary_loss_clip": 0.01073099, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.03755641, "balance_loss_mlp": 1.02225447, "epoch": 0.7329625732752142, "flos": 24164038661760.0, "grad_norm": 1.6849977085368404, "language_loss": 0.71588874, "learning_rate": 7.024148446550204e-07, "loss": 0.73698092, "num_input_tokens_seen": 263090070, "step": 12191, "time_per_iteration": 2.72813081741333 }, { "auxiliary_loss_clip": 0.01112512, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.03892088, "balance_loss_mlp": 1.02245009, "epoch": 0.7330226965278822, "flos": 30077915627520.0, "grad_norm": 1.5354384218805013, "language_loss": 0.69254857, "learning_rate": 7.021185024330361e-07, "loss": 0.71403152, "num_input_tokens_seen": 263110030, "step": 12192, "time_per_iteration": 2.6177656650543213 }, { "auxiliary_loss_clip": 0.01099104, "auxiliary_loss_mlp": 0.01030904, "balance_loss_clip": 1.0388236, "balance_loss_mlp": 1.01836967, "epoch": 0.7330828197805501, "flos": 23368294713600.0, "grad_norm": 1.627423362173816, "language_loss": 0.73143125, "learning_rate": 7.01822209426848e-07, "loss": 0.75273132, "num_input_tokens_seen": 263129735, "step": 12193, "time_per_iteration": 2.6829118728637695 }, { "auxiliary_loss_clip": 0.01094199, "auxiliary_loss_mlp": 0.01034877, "balance_loss_clip": 1.03632629, "balance_loss_mlp": 1.02171612, "epoch": 0.7331429430332181, "flos": 21032808410880.0, "grad_norm": 2.400736232898333, "language_loss": 0.76939815, "learning_rate": 7.015259656476911e-07, "loss": 0.79068899, "num_input_tokens_seen": 263149100, "step": 12194, "time_per_iteration": 2.589165687561035 }, { "auxiliary_loss_clip": 0.01100113, "auxiliary_loss_mlp": 0.01029986, "balance_loss_clip": 1.03972054, "balance_loss_mlp": 1.01695681, "epoch": 0.733203066285886, "flos": 14647891466880.0, "grad_norm": 1.9190061960430176, "language_loss": 0.70403659, "learning_rate": 7.012297711067998e-07, "loss": 0.72533756, "num_input_tokens_seen": 263166620, "step": 12195, "time_per_iteration": 2.550752639770508 }, { "auxiliary_loss_clip": 0.01111325, "auxiliary_loss_mlp": 0.01036105, "balance_loss_clip": 1.03835511, "balance_loss_mlp": 1.02386189, "epoch": 0.7332631895385541, "flos": 17165301177600.0, "grad_norm": 1.958340476490106, "language_loss": 0.72090805, "learning_rate": 7.009336258154057e-07, "loss": 0.74238235, "num_input_tokens_seen": 263184780, "step": 12196, "time_per_iteration": 2.540836811065674 }, { "auxiliary_loss_clip": 0.01111546, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.04016924, "balance_loss_mlp": 1.01791, "epoch": 0.733323312791222, "flos": 28658151676800.0, "grad_norm": 1.92503318264866, "language_loss": 0.71952534, "learning_rate": 7.006375297847394e-07, "loss": 0.7409488, "num_input_tokens_seen": 263204625, "step": 12197, "time_per_iteration": 2.6192398071289062 }, { "auxiliary_loss_clip": 0.01058905, "auxiliary_loss_mlp": 0.00771452, "balance_loss_clip": 1.03431988, "balance_loss_mlp": 1.00020027, "epoch": 0.73338343604389, "flos": 16618417632000.0, "grad_norm": 3.2701178801425983, "language_loss": 0.77824599, "learning_rate": 7.003414830260282e-07, "loss": 0.79654956, "num_input_tokens_seen": 263221565, "step": 12198, "time_per_iteration": 2.751495599746704 }, { "auxiliary_loss_clip": 0.0105527, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.0351963, "balance_loss_mlp": 1.02071261, "epoch": 0.7334435592965579, "flos": 21142084561920.0, "grad_norm": 1.9440363866172514, "language_loss": 0.74263847, "learning_rate": 7.000454855504974e-07, "loss": 0.76352453, "num_input_tokens_seen": 263240620, "step": 12199, "time_per_iteration": 2.767896890640259 }, { "auxiliary_loss_clip": 0.01094013, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.03940797, "balance_loss_mlp": 1.01919568, "epoch": 0.7335036825492259, "flos": 17125332318720.0, "grad_norm": 2.5044351330443377, "language_loss": 0.76926482, "learning_rate": 6.997495373693729e-07, "loss": 0.79052913, "num_input_tokens_seen": 263254365, "step": 12200, "time_per_iteration": 2.6367027759552 }, { "auxiliary_loss_clip": 0.01074082, "auxiliary_loss_mlp": 0.01027226, "balance_loss_clip": 1.03776014, "balance_loss_mlp": 1.01524007, "epoch": 0.7335638058018938, "flos": 23731818307200.0, "grad_norm": 2.389152390847936, "language_loss": 0.61618876, "learning_rate": 6.994536384938754e-07, "loss": 0.63720185, "num_input_tokens_seen": 263275880, "step": 12201, "time_per_iteration": 2.6798954010009766 }, { "auxiliary_loss_clip": 0.0107342, "auxiliary_loss_mlp": 0.00770019, "balance_loss_clip": 1.03417397, "balance_loss_mlp": 1.00014138, "epoch": 0.7336239290545619, "flos": 34933289679360.0, "grad_norm": 2.0307356501592526, "language_loss": 0.52253979, "learning_rate": 6.991577889352264e-07, "loss": 0.5409742, "num_input_tokens_seen": 263298315, "step": 12202, "time_per_iteration": 2.8340702056884766 }, { "auxiliary_loss_clip": 0.01087087, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.03677177, "balance_loss_mlp": 1.017923, "epoch": 0.7336840523072298, "flos": 21103049456640.0, "grad_norm": 1.7212231979753123, "language_loss": 0.68485624, "learning_rate": 6.98861988704645e-07, "loss": 0.70602876, "num_input_tokens_seen": 263318615, "step": 12203, "time_per_iteration": 2.642812967300415 }, { "auxiliary_loss_clip": 0.01088423, "auxiliary_loss_mlp": 0.01037493, "balance_loss_clip": 1.03938603, "balance_loss_mlp": 1.02476776, "epoch": 0.7337441755598978, "flos": 24024418496640.0, "grad_norm": 2.034834601717817, "language_loss": 0.6607222, "learning_rate": 6.985662378133474e-07, "loss": 0.68198133, "num_input_tokens_seen": 263336705, "step": 12204, "time_per_iteration": 2.74241042137146 }, { "auxiliary_loss_clip": 0.01089625, "auxiliary_loss_mlp": 0.01034455, "balance_loss_clip": 1.04081655, "balance_loss_mlp": 1.02211094, "epoch": 0.7338042988125658, "flos": 22711309004160.0, "grad_norm": 1.8580582529828333, "language_loss": 0.77225935, "learning_rate": 6.982705362725479e-07, "loss": 0.79350007, "num_input_tokens_seen": 263355065, "step": 12205, "time_per_iteration": 2.6422648429870605 }, { "auxiliary_loss_clip": 0.01058875, "auxiliary_loss_mlp": 0.01032513, "balance_loss_clip": 1.03662992, "balance_loss_mlp": 1.02064013, "epoch": 0.7338644220652337, "flos": 21360996000000.0, "grad_norm": 2.159301504218906, "language_loss": 0.79434526, "learning_rate": 6.979748840934601e-07, "loss": 0.8152591, "num_input_tokens_seen": 263374460, "step": 12206, "time_per_iteration": 2.722921848297119 }, { "auxiliary_loss_clip": 0.01071317, "auxiliary_loss_mlp": 0.01031287, "balance_loss_clip": 1.03451514, "balance_loss_mlp": 1.01825154, "epoch": 0.7339245453179017, "flos": 30920236536960.0, "grad_norm": 2.0535884600804817, "language_loss": 0.71176481, "learning_rate": 6.976792812872958e-07, "loss": 0.73279089, "num_input_tokens_seen": 263393610, "step": 12207, "time_per_iteration": 2.9302005767822266 }, { "auxiliary_loss_clip": 0.01014266, "auxiliary_loss_mlp": 0.01003684, "balance_loss_clip": 1.01024389, "balance_loss_mlp": 1.00252759, "epoch": 0.7339846685705697, "flos": 67899429072000.0, "grad_norm": 0.7780632600453249, "language_loss": 0.54746544, "learning_rate": 6.97383727865263e-07, "loss": 0.56764495, "num_input_tokens_seen": 263450340, "step": 12208, "time_per_iteration": 3.267242431640625 }, { "auxiliary_loss_clip": 0.01111313, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.03991294, "balance_loss_mlp": 1.01901555, "epoch": 0.7340447918232377, "flos": 22236749493120.0, "grad_norm": 1.4520136816915177, "language_loss": 0.8051306, "learning_rate": 6.970882238385703e-07, "loss": 0.82654339, "num_input_tokens_seen": 263471735, "step": 12209, "time_per_iteration": 2.6250216960906982 }, { "auxiliary_loss_clip": 0.01108587, "auxiliary_loss_mlp": 0.01033248, "balance_loss_clip": 1.0370816, "balance_loss_mlp": 1.02134514, "epoch": 0.7341049150759056, "flos": 23764784014080.0, "grad_norm": 1.461722216284673, "language_loss": 0.79026657, "learning_rate": 6.96792769218423e-07, "loss": 0.81168497, "num_input_tokens_seen": 263493245, "step": 12210, "time_per_iteration": 2.5592970848083496 }, { "auxiliary_loss_clip": 0.01108387, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.03799284, "balance_loss_mlp": 1.01709008, "epoch": 0.7341650383285736, "flos": 17236547804160.0, "grad_norm": 1.73695170749579, "language_loss": 0.76122808, "learning_rate": 6.964973640160236e-07, "loss": 0.78260869, "num_input_tokens_seen": 263511660, "step": 12211, "time_per_iteration": 2.571751117706299 }, { "auxiliary_loss_clip": 0.01087498, "auxiliary_loss_mlp": 0.01031413, "balance_loss_clip": 1.03891158, "balance_loss_mlp": 1.018592, "epoch": 0.7342251615812415, "flos": 23403953940480.0, "grad_norm": 6.531715121329498, "language_loss": 0.71997905, "learning_rate": 6.962020082425748e-07, "loss": 0.74116814, "num_input_tokens_seen": 263530875, "step": 12212, "time_per_iteration": 2.6509475708007812 }, { "auxiliary_loss_clip": 0.01112722, "auxiliary_loss_mlp": 0.01033281, "balance_loss_clip": 1.04100943, "balance_loss_mlp": 1.02054381, "epoch": 0.7342852848339095, "flos": 22747183712640.0, "grad_norm": 1.5833725401172443, "language_loss": 0.68744397, "learning_rate": 6.959067019092766e-07, "loss": 0.70890403, "num_input_tokens_seen": 263551585, "step": 12213, "time_per_iteration": 2.5494189262390137 }, { "auxiliary_loss_clip": 0.010305, "auxiliary_loss_mlp": 0.01005419, "balance_loss_clip": 1.00768566, "balance_loss_mlp": 1.004251, "epoch": 0.7343454080865774, "flos": 53942353925760.0, "grad_norm": 0.7305513742092771, "language_loss": 0.54231656, "learning_rate": 6.956114450273276e-07, "loss": 0.56267571, "num_input_tokens_seen": 263609545, "step": 12214, "time_per_iteration": 3.0239064693450928 }, { "auxiliary_loss_clip": 0.01112827, "auxiliary_loss_mlp": 0.01031447, "balance_loss_clip": 1.03797483, "balance_loss_mlp": 1.01904964, "epoch": 0.7344055313392455, "flos": 12166859255040.0, "grad_norm": 1.9946109817082227, "language_loss": 0.70621991, "learning_rate": 6.953162376079233e-07, "loss": 0.72766268, "num_input_tokens_seen": 263627880, "step": 12215, "time_per_iteration": 2.5570547580718994 }, { "auxiliary_loss_clip": 0.01082063, "auxiliary_loss_mlp": 0.01033308, "balance_loss_clip": 1.03650701, "balance_loss_mlp": 1.02130389, "epoch": 0.7344656545919134, "flos": 18550052346240.0, "grad_norm": 1.5883175175393598, "language_loss": 0.72867477, "learning_rate": 6.950210796622573e-07, "loss": 0.74982846, "num_input_tokens_seen": 263645665, "step": 12216, "time_per_iteration": 2.621229887008667 }, { "auxiliary_loss_clip": 0.0111704, "auxiliary_loss_mlp": 0.01039831, "balance_loss_clip": 1.0392859, "balance_loss_mlp": 1.02483487, "epoch": 0.7345257778445814, "flos": 23661649088640.0, "grad_norm": 1.6902289453280186, "language_loss": 0.78386879, "learning_rate": 6.947259712015236e-07, "loss": 0.80543745, "num_input_tokens_seen": 263668170, "step": 12217, "time_per_iteration": 2.594928503036499 }, { "auxiliary_loss_clip": 0.01072057, "auxiliary_loss_mlp": 0.01027279, "balance_loss_clip": 1.03669691, "balance_loss_mlp": 1.01602566, "epoch": 0.7345859010972494, "flos": 13808659127040.0, "grad_norm": 1.9223508730662753, "language_loss": 0.77991557, "learning_rate": 6.94430912236911e-07, "loss": 0.80090904, "num_input_tokens_seen": 263684190, "step": 12218, "time_per_iteration": 4.173985958099365 }, { "auxiliary_loss_clip": 0.01060122, "auxiliary_loss_mlp": 0.01038245, "balance_loss_clip": 1.03246057, "balance_loss_mlp": 1.02410722, "epoch": 0.7346460243499173, "flos": 22272731942400.0, "grad_norm": 1.7300149246142222, "language_loss": 0.71998847, "learning_rate": 6.941359027796092e-07, "loss": 0.74097216, "num_input_tokens_seen": 263702095, "step": 12219, "time_per_iteration": 2.7360141277313232 }, { "auxiliary_loss_clip": 0.01084965, "auxiliary_loss_mlp": 0.01031146, "balance_loss_clip": 1.03496408, "balance_loss_mlp": 1.01936817, "epoch": 0.7347061476025853, "flos": 23255247634560.0, "grad_norm": 6.086208044404794, "language_loss": 0.74677491, "learning_rate": 6.938409428408061e-07, "loss": 0.76793599, "num_input_tokens_seen": 263721385, "step": 12220, "time_per_iteration": 4.237574577331543 }, { "auxiliary_loss_clip": 0.01101059, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 1.03634357, "balance_loss_mlp": 1.02002692, "epoch": 0.7347662708552533, "flos": 15267565923840.0, "grad_norm": 1.7582091320116324, "language_loss": 0.65720487, "learning_rate": 6.93546032431684e-07, "loss": 0.67854166, "num_input_tokens_seen": 263737835, "step": 12221, "time_per_iteration": 4.174748182296753 }, { "auxiliary_loss_clip": 0.0108489, "auxiliary_loss_mlp": 0.01039185, "balance_loss_clip": 1.0352186, "balance_loss_mlp": 1.02567315, "epoch": 0.7348263941079213, "flos": 24859987649280.0, "grad_norm": 1.907694939441604, "language_loss": 0.69323444, "learning_rate": 6.932511715634273e-07, "loss": 0.71447521, "num_input_tokens_seen": 263756480, "step": 12222, "time_per_iteration": 2.704784393310547 }, { "auxiliary_loss_clip": 0.01063424, "auxiliary_loss_mlp": 0.01030995, "balance_loss_clip": 1.03514957, "balance_loss_mlp": 1.01988506, "epoch": 0.7348865173605892, "flos": 24352103295360.0, "grad_norm": 1.9184398882939155, "language_loss": 0.66062474, "learning_rate": 6.92956360247217e-07, "loss": 0.68156886, "num_input_tokens_seen": 263776440, "step": 12223, "time_per_iteration": 2.8198130130767822 }, { "auxiliary_loss_clip": 0.01094086, "auxiliary_loss_mlp": 0.01029505, "balance_loss_clip": 1.03635502, "balance_loss_mlp": 1.01708925, "epoch": 0.7349466406132572, "flos": 20004613597440.0, "grad_norm": 1.6947927626477597, "language_loss": 0.72573948, "learning_rate": 6.926615984942332e-07, "loss": 0.7469753, "num_input_tokens_seen": 263793700, "step": 12224, "time_per_iteration": 4.08525276184082 }, { "auxiliary_loss_clip": 0.01085057, "auxiliary_loss_mlp": 0.01029564, "balance_loss_clip": 1.04095888, "balance_loss_mlp": 1.01713049, "epoch": 0.7350067638659251, "flos": 29825068815360.0, "grad_norm": 1.830057292997908, "language_loss": 0.72199714, "learning_rate": 6.92366886315652e-07, "loss": 0.74314332, "num_input_tokens_seen": 263814620, "step": 12225, "time_per_iteration": 2.736055850982666 }, { "auxiliary_loss_clip": 0.0111514, "auxiliary_loss_mlp": 0.01035041, "balance_loss_clip": 1.03917527, "balance_loss_mlp": 1.02134943, "epoch": 0.7350668871185931, "flos": 21866150920320.0, "grad_norm": 1.7365051701265057, "language_loss": 0.76401973, "learning_rate": 6.920722237226501e-07, "loss": 0.78552151, "num_input_tokens_seen": 263832725, "step": 12226, "time_per_iteration": 2.578805446624756 }, { "auxiliary_loss_clip": 0.01085278, "auxiliary_loss_mlp": 0.0103433, "balance_loss_clip": 1.03646374, "balance_loss_mlp": 1.01977456, "epoch": 0.735127010371261, "flos": 22566122231040.0, "grad_norm": 1.442598448518307, "language_loss": 0.6717149, "learning_rate": 6.917776107264008e-07, "loss": 0.69291103, "num_input_tokens_seen": 263853850, "step": 12227, "time_per_iteration": 2.638720989227295 }, { "auxiliary_loss_clip": 0.01101144, "auxiliary_loss_mlp": 0.0103552, "balance_loss_clip": 1.03755474, "balance_loss_mlp": 1.02331293, "epoch": 0.7351871336239291, "flos": 25884339707520.0, "grad_norm": 2.1955172179062536, "language_loss": 0.63554502, "learning_rate": 6.914830473380749e-07, "loss": 0.65691161, "num_input_tokens_seen": 263874760, "step": 12228, "time_per_iteration": 2.646679162979126 }, { "auxiliary_loss_clip": 0.0109047, "auxiliary_loss_mlp": 0.01036115, "balance_loss_clip": 1.03838301, "balance_loss_mlp": 1.02450967, "epoch": 0.735247256876597, "flos": 17932173569280.0, "grad_norm": 1.6447533892101769, "language_loss": 0.63384873, "learning_rate": 6.911885335688427e-07, "loss": 0.65511459, "num_input_tokens_seen": 263893390, "step": 12229, "time_per_iteration": 2.626433849334717 }, { "auxiliary_loss_clip": 0.01087319, "auxiliary_loss_mlp": 0.01037821, "balance_loss_clip": 1.03916466, "balance_loss_mlp": 1.02470779, "epoch": 0.735307380129265, "flos": 28875159694080.0, "grad_norm": 1.6569871387550634, "language_loss": 0.73374206, "learning_rate": 6.908940694298726e-07, "loss": 0.75499344, "num_input_tokens_seen": 263911180, "step": 12230, "time_per_iteration": 2.719008207321167 }, { "auxiliary_loss_clip": 0.01058297, "auxiliary_loss_mlp": 0.01032553, "balance_loss_clip": 1.03558922, "balance_loss_mlp": 1.0192132, "epoch": 0.7353675033819329, "flos": 13625658311040.0, "grad_norm": 2.410798964065256, "language_loss": 0.72446096, "learning_rate": 6.90599654932332e-07, "loss": 0.74536955, "num_input_tokens_seen": 263928975, "step": 12231, "time_per_iteration": 2.7233800888061523 }, { "auxiliary_loss_clip": 0.01102609, "auxiliary_loss_mlp": 0.01037553, "balance_loss_clip": 1.0392592, "balance_loss_mlp": 1.0230689, "epoch": 0.7354276266346009, "flos": 19463081178240.0, "grad_norm": 2.5985105749536332, "language_loss": 0.63813508, "learning_rate": 6.903052900873823e-07, "loss": 0.65953672, "num_input_tokens_seen": 263944495, "step": 12232, "time_per_iteration": 2.626089334487915 }, { "auxiliary_loss_clip": 0.0109166, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.03764665, "balance_loss_mlp": 1.01987481, "epoch": 0.735487749887269, "flos": 15771858917760.0, "grad_norm": 1.7852756816189446, "language_loss": 0.75511599, "learning_rate": 6.900109749061874e-07, "loss": 0.77635431, "num_input_tokens_seen": 263961325, "step": 12233, "time_per_iteration": 2.614691972732544 }, { "auxiliary_loss_clip": 0.01112187, "auxiliary_loss_mlp": 0.01028949, "balance_loss_clip": 1.03919733, "balance_loss_mlp": 1.01619315, "epoch": 0.7355478731399369, "flos": 18260648467200.0, "grad_norm": 4.244761548872676, "language_loss": 0.73351365, "learning_rate": 6.897167093999079e-07, "loss": 0.75492501, "num_input_tokens_seen": 263980445, "step": 12234, "time_per_iteration": 2.5742101669311523 }, { "auxiliary_loss_clip": 0.01099473, "auxiliary_loss_mlp": 0.0103086, "balance_loss_clip": 1.03804564, "balance_loss_mlp": 1.01721096, "epoch": 0.7356079963926049, "flos": 26542043688960.0, "grad_norm": 2.1824026453384078, "language_loss": 0.59852672, "learning_rate": 6.894224935797017e-07, "loss": 0.61983013, "num_input_tokens_seen": 263999330, "step": 12235, "time_per_iteration": 2.661247730255127 }, { "auxiliary_loss_clip": 0.01088694, "auxiliary_loss_mlp": 0.01027233, "balance_loss_clip": 1.03844726, "balance_loss_mlp": 1.01487708, "epoch": 0.7356681196452728, "flos": 10778624467200.0, "grad_norm": 2.763935396627176, "language_loss": 0.85834122, "learning_rate": 6.891283274567259e-07, "loss": 0.87950051, "num_input_tokens_seen": 264014150, "step": 12236, "time_per_iteration": 2.589035749435425 }, { "auxiliary_loss_clip": 0.0110083, "auxiliary_loss_mlp": 0.00769741, "balance_loss_clip": 1.03816271, "balance_loss_mlp": 1.00019503, "epoch": 0.7357282428979408, "flos": 19718693337600.0, "grad_norm": 5.1654234015242215, "language_loss": 0.69555867, "learning_rate": 6.888342110421364e-07, "loss": 0.71426433, "num_input_tokens_seen": 264033140, "step": 12237, "time_per_iteration": 2.652851104736328 }, { "auxiliary_loss_clip": 0.01022711, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.02870941, "balance_loss_mlp": 1.01868236, "epoch": 0.7357883661506087, "flos": 19464014931840.0, "grad_norm": 1.6842160267600648, "language_loss": 0.72287041, "learning_rate": 6.885401443470839e-07, "loss": 0.74341154, "num_input_tokens_seen": 264052105, "step": 12238, "time_per_iteration": 2.887967586517334 }, { "auxiliary_loss_clip": 0.0108237, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.03519797, "balance_loss_mlp": 1.01672542, "epoch": 0.7358484894032767, "flos": 27123006263040.0, "grad_norm": 2.119394608491001, "language_loss": 0.72818553, "learning_rate": 6.882461273827205e-07, "loss": 0.74930996, "num_input_tokens_seen": 264070690, "step": 12239, "time_per_iteration": 3.308215618133545 }, { "auxiliary_loss_clip": 0.01079481, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.03759682, "balance_loss_mlp": 1.01827478, "epoch": 0.7359086126559446, "flos": 24502282058880.0, "grad_norm": 1.656407411551667, "language_loss": 0.78889048, "learning_rate": 6.879521601601954e-07, "loss": 0.80998993, "num_input_tokens_seen": 264094225, "step": 12240, "time_per_iteration": 2.6716065406799316 }, { "auxiliary_loss_clip": 0.01101629, "auxiliary_loss_mlp": 0.01037535, "balance_loss_clip": 1.03955805, "balance_loss_mlp": 1.02480888, "epoch": 0.7359687359086127, "flos": 23331270769920.0, "grad_norm": 1.888852774104125, "language_loss": 0.82579136, "learning_rate": 6.876582426906565e-07, "loss": 0.84718299, "num_input_tokens_seen": 264113190, "step": 12241, "time_per_iteration": 2.687603712081909 }, { "auxiliary_loss_clip": 0.01097273, "auxiliary_loss_mlp": 0.01025951, "balance_loss_clip": 1.03536153, "balance_loss_mlp": 1.01373816, "epoch": 0.7360288591612806, "flos": 20193396503040.0, "grad_norm": 1.823724311239111, "language_loss": 0.78747702, "learning_rate": 6.873643749852484e-07, "loss": 0.80870926, "num_input_tokens_seen": 264132050, "step": 12242, "time_per_iteration": 2.6332826614379883 }, { "auxiliary_loss_clip": 0.01062855, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.03485787, "balance_loss_mlp": 1.01942182, "epoch": 0.7360889824139486, "flos": 24972783333120.0, "grad_norm": 1.7248872165867588, "language_loss": 0.79574555, "learning_rate": 6.870705570551145e-07, "loss": 0.81669545, "num_input_tokens_seen": 264152800, "step": 12243, "time_per_iteration": 2.6513876914978027 }, { "auxiliary_loss_clip": 0.01101249, "auxiliary_loss_mlp": 0.01032749, "balance_loss_clip": 1.03733206, "balance_loss_mlp": 1.01998186, "epoch": 0.7361491056666165, "flos": 15012312900480.0, "grad_norm": 2.291279589424139, "language_loss": 0.74445826, "learning_rate": 6.867767889113969e-07, "loss": 0.76579821, "num_input_tokens_seen": 264169650, "step": 12244, "time_per_iteration": 2.4683594703674316 }, { "auxiliary_loss_clip": 0.01094664, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.03583598, "balance_loss_mlp": 1.02007556, "epoch": 0.7362092289192845, "flos": 22930400010240.0, "grad_norm": 1.867590406442262, "language_loss": 0.69203222, "learning_rate": 6.864830705652347e-07, "loss": 0.7133069, "num_input_tokens_seen": 264190530, "step": 12245, "time_per_iteration": 2.687621831893921 }, { "auxiliary_loss_clip": 0.01072242, "auxiliary_loss_mlp": 0.01034229, "balance_loss_clip": 1.03500962, "balance_loss_mlp": 1.02093101, "epoch": 0.7362693521719526, "flos": 20702681487360.0, "grad_norm": 1.5504904420549481, "language_loss": 0.73484623, "learning_rate": 6.861894020277658e-07, "loss": 0.75591099, "num_input_tokens_seen": 264210820, "step": 12246, "time_per_iteration": 2.73628568649292 }, { "auxiliary_loss_clip": 0.01084679, "auxiliary_loss_mlp": 0.01025875, "balance_loss_clip": 1.03512716, "balance_loss_mlp": 1.01378119, "epoch": 0.7363294754246205, "flos": 13111381336320.0, "grad_norm": 2.1569575321455163, "language_loss": 0.73685145, "learning_rate": 6.858957833101266e-07, "loss": 0.75795692, "num_input_tokens_seen": 264227430, "step": 12247, "time_per_iteration": 2.5930237770080566 }, { "auxiliary_loss_clip": 0.01101325, "auxiliary_loss_mlp": 0.01032427, "balance_loss_clip": 1.04162931, "balance_loss_mlp": 1.02031505, "epoch": 0.7363895986772885, "flos": 14027426910720.0, "grad_norm": 1.6102027523975817, "language_loss": 0.7423265, "learning_rate": 6.856022144234526e-07, "loss": 0.76366401, "num_input_tokens_seen": 264245230, "step": 12248, "time_per_iteration": 2.5792789459228516 }, { "auxiliary_loss_clip": 0.0109033, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.03816319, "balance_loss_mlp": 1.02057934, "epoch": 0.7364497219299564, "flos": 19719986227200.0, "grad_norm": 1.8750204418443517, "language_loss": 0.72477007, "learning_rate": 6.853086953788727e-07, "loss": 0.7460084, "num_input_tokens_seen": 264263945, "step": 12249, "time_per_iteration": 2.624386787414551 }, { "auxiliary_loss_clip": 0.01089724, "auxiliary_loss_mlp": 0.01033559, "balance_loss_clip": 1.03801394, "balance_loss_mlp": 1.02015996, "epoch": 0.7365098451826244, "flos": 21361391049600.0, "grad_norm": 2.586847113789983, "language_loss": 0.77382159, "learning_rate": 6.850152261875189e-07, "loss": 0.7950545, "num_input_tokens_seen": 264281500, "step": 12250, "time_per_iteration": 2.6388142108917236 }, { "auxiliary_loss_clip": 0.01066882, "auxiliary_loss_mlp": 0.01031238, "balance_loss_clip": 1.03667164, "balance_loss_mlp": 1.01857233, "epoch": 0.7365699684352923, "flos": 23368222886400.0, "grad_norm": 1.6519467305081468, "language_loss": 0.71352232, "learning_rate": 6.8472180686052e-07, "loss": 0.73450345, "num_input_tokens_seen": 264301625, "step": 12251, "time_per_iteration": 2.7391629219055176 }, { "auxiliary_loss_clip": 0.01095208, "auxiliary_loss_mlp": 0.01035371, "balance_loss_clip": 1.03801441, "balance_loss_mlp": 1.0229789, "epoch": 0.7366300916879603, "flos": 59524879927680.0, "grad_norm": 1.575545988693255, "language_loss": 0.65908438, "learning_rate": 6.844284374090015e-07, "loss": 0.68039018, "num_input_tokens_seen": 264323975, "step": 12252, "time_per_iteration": 2.9795963764190674 }, { "auxiliary_loss_clip": 0.0106263, "auxiliary_loss_mlp": 0.01035896, "balance_loss_clip": 1.03544736, "balance_loss_mlp": 1.02261591, "epoch": 0.7366902149406283, "flos": 20923137210240.0, "grad_norm": 1.669933486125426, "language_loss": 0.79418141, "learning_rate": 6.841351178440884e-07, "loss": 0.81516671, "num_input_tokens_seen": 264343785, "step": 12253, "time_per_iteration": 2.762692451477051 }, { "auxiliary_loss_clip": 0.01107479, "auxiliary_loss_mlp": 0.00769571, "balance_loss_clip": 1.03836572, "balance_loss_mlp": 1.00025702, "epoch": 0.7367503381932963, "flos": 17348158339200.0, "grad_norm": 2.0410258772790604, "language_loss": 0.76204622, "learning_rate": 6.83841848176905e-07, "loss": 0.78081673, "num_input_tokens_seen": 264361130, "step": 12254, "time_per_iteration": 2.518159866333008 }, { "auxiliary_loss_clip": 0.01085242, "auxiliary_loss_mlp": 0.01042117, "balance_loss_clip": 1.03690898, "balance_loss_mlp": 1.02805638, "epoch": 0.7368104614459642, "flos": 17821317219840.0, "grad_norm": 4.287032087933439, "language_loss": 0.7025637, "learning_rate": 6.835486284185692e-07, "loss": 0.72383738, "num_input_tokens_seen": 264376965, "step": 12255, "time_per_iteration": 2.589442729949951 }, { "auxiliary_loss_clip": 0.0110157, "auxiliary_loss_mlp": 0.01029844, "balance_loss_clip": 1.03971469, "balance_loss_mlp": 1.01649857, "epoch": 0.7368705846986322, "flos": 24606099342720.0, "grad_norm": 1.8002690456311732, "language_loss": 0.75496477, "learning_rate": 6.832554585802012e-07, "loss": 0.77627891, "num_input_tokens_seen": 264396310, "step": 12256, "time_per_iteration": 2.6408097743988037 }, { "auxiliary_loss_clip": 0.0110194, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.03902447, "balance_loss_mlp": 1.01861525, "epoch": 0.7369307079513001, "flos": 34970169968640.0, "grad_norm": 1.8159152177837306, "language_loss": 0.73517919, "learning_rate": 6.829623386729182e-07, "loss": 0.75651693, "num_input_tokens_seen": 264418085, "step": 12257, "time_per_iteration": 2.6984493732452393 }, { "auxiliary_loss_clip": 0.01092873, "auxiliary_loss_mlp": 0.01038875, "balance_loss_clip": 1.03521228, "balance_loss_mlp": 1.02668011, "epoch": 0.7369908312039681, "flos": 21214588164480.0, "grad_norm": 1.793311215899037, "language_loss": 0.78370535, "learning_rate": 6.826692687078362e-07, "loss": 0.80502284, "num_input_tokens_seen": 264437595, "step": 12258, "time_per_iteration": 4.2666544914245605 }, { "auxiliary_loss_clip": 0.01103154, "auxiliary_loss_mlp": 0.0103457, "balance_loss_clip": 1.03888559, "balance_loss_mlp": 1.02195156, "epoch": 0.7370509544566362, "flos": 23623655477760.0, "grad_norm": 1.4256743681063133, "language_loss": 0.66447318, "learning_rate": 6.823762486960674e-07, "loss": 0.68585044, "num_input_tokens_seen": 264457385, "step": 12259, "time_per_iteration": 2.6215436458587646 }, { "auxiliary_loss_clip": 0.01101635, "auxiliary_loss_mlp": 0.01036273, "balance_loss_clip": 1.0403527, "balance_loss_mlp": 1.02288604, "epoch": 0.7371110777093041, "flos": 24827704300800.0, "grad_norm": 1.885600567170779, "language_loss": 0.73500818, "learning_rate": 6.820832786487225e-07, "loss": 0.75638729, "num_input_tokens_seen": 264477205, "step": 12260, "time_per_iteration": 5.883468866348267 }, { "auxiliary_loss_clip": 0.01096844, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.0374378, "balance_loss_mlp": 1.02105618, "epoch": 0.7371712009619721, "flos": 23149491016320.0, "grad_norm": 1.6200420783650578, "language_loss": 0.73566377, "learning_rate": 6.817903585769125e-07, "loss": 0.75697523, "num_input_tokens_seen": 264497195, "step": 12261, "time_per_iteration": 2.611388683319092 }, { "auxiliary_loss_clip": 0.01091123, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.03705454, "balance_loss_mlp": 1.02096152, "epoch": 0.73723132421464, "flos": 23112898035840.0, "grad_norm": 1.9187106646052445, "language_loss": 0.66943705, "learning_rate": 6.814974884917438e-07, "loss": 0.69069326, "num_input_tokens_seen": 264516950, "step": 12262, "time_per_iteration": 2.605332374572754 }, { "auxiliary_loss_clip": 0.01112628, "auxiliary_loss_mlp": 0.01032891, "balance_loss_clip": 1.03917944, "balance_loss_mlp": 1.01943254, "epoch": 0.737291447467308, "flos": 19273328605440.0, "grad_norm": 2.61578609371499, "language_loss": 0.88660431, "learning_rate": 6.81204668404322e-07, "loss": 0.90805948, "num_input_tokens_seen": 264532675, "step": 12263, "time_per_iteration": 4.228296279907227 }, { "auxiliary_loss_clip": 0.01107513, "auxiliary_loss_mlp": 0.01028636, "balance_loss_clip": 1.03926718, "balance_loss_mlp": 1.01731133, "epoch": 0.7373515707199759, "flos": 25118257415040.0, "grad_norm": 1.6036669439356246, "language_loss": 0.67279935, "learning_rate": 6.809118983257522e-07, "loss": 0.69416088, "num_input_tokens_seen": 264555635, "step": 12264, "time_per_iteration": 2.6264944076538086 }, { "auxiliary_loss_clip": 0.01107424, "auxiliary_loss_mlp": 0.01032446, "balance_loss_clip": 1.0380187, "balance_loss_mlp": 1.02020919, "epoch": 0.737411693972644, "flos": 32408481767040.0, "grad_norm": 5.628920745941572, "language_loss": 0.80262679, "learning_rate": 6.806191782671356e-07, "loss": 0.82402551, "num_input_tokens_seen": 264573140, "step": 12265, "time_per_iteration": 2.6175074577331543 }, { "auxiliary_loss_clip": 0.01104877, "auxiliary_loss_mlp": 0.01031656, "balance_loss_clip": 1.03860068, "balance_loss_mlp": 1.01912761, "epoch": 0.7374718172253119, "flos": 24315797623680.0, "grad_norm": 2.6431361651655094, "language_loss": 0.74271613, "learning_rate": 6.803265082395711e-07, "loss": 0.76408148, "num_input_tokens_seen": 264591610, "step": 12266, "time_per_iteration": 2.6342427730560303 }, { "auxiliary_loss_clip": 0.01102733, "auxiliary_loss_mlp": 0.01039895, "balance_loss_clip": 1.03989673, "balance_loss_mlp": 1.02624547, "epoch": 0.7375319404779799, "flos": 27156115624320.0, "grad_norm": 1.6143075154919249, "language_loss": 0.72911859, "learning_rate": 6.800338882541576e-07, "loss": 0.75054485, "num_input_tokens_seen": 264611170, "step": 12267, "time_per_iteration": 2.638545036315918 }, { "auxiliary_loss_clip": 0.01075616, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.03733301, "balance_loss_mlp": 1.02528942, "epoch": 0.7375920637306478, "flos": 18879999701760.0, "grad_norm": 2.114502804369275, "language_loss": 0.83173954, "learning_rate": 6.797413183219923e-07, "loss": 0.85286546, "num_input_tokens_seen": 264629365, "step": 12268, "time_per_iteration": 2.6624231338500977 }, { "auxiliary_loss_clip": 0.0111022, "auxiliary_loss_mlp": 0.01043154, "balance_loss_clip": 1.03934455, "balance_loss_mlp": 1.03039253, "epoch": 0.7376521869833158, "flos": 15669765486720.0, "grad_norm": 1.8306850804928718, "language_loss": 0.73056579, "learning_rate": 6.794487984541677e-07, "loss": 0.75209951, "num_input_tokens_seen": 264647915, "step": 12269, "time_per_iteration": 2.5542378425598145 }, { "auxiliary_loss_clip": 0.01086703, "auxiliary_loss_mlp": 0.01036517, "balance_loss_clip": 1.03575897, "balance_loss_mlp": 1.02278399, "epoch": 0.7377123102359837, "flos": 36971973901440.0, "grad_norm": 2.033998429253707, "language_loss": 0.70437771, "learning_rate": 6.791563286617776e-07, "loss": 0.72560984, "num_input_tokens_seen": 264669620, "step": 12270, "time_per_iteration": 2.738266706466675 }, { "auxiliary_loss_clip": 0.01096302, "auxiliary_loss_mlp": 0.01032958, "balance_loss_clip": 1.03592134, "balance_loss_mlp": 1.02121568, "epoch": 0.7377724334886517, "flos": 24496284487680.0, "grad_norm": 1.5966861797114758, "language_loss": 0.69652647, "learning_rate": 6.788639089559119e-07, "loss": 0.71781904, "num_input_tokens_seen": 264689345, "step": 12271, "time_per_iteration": 2.664652109146118 }, { "auxiliary_loss_clip": 0.01080906, "auxiliary_loss_mlp": 0.01034393, "balance_loss_clip": 1.03928661, "balance_loss_mlp": 1.02066565, "epoch": 0.7378325567413198, "flos": 24390025079040.0, "grad_norm": 2.652639550639501, "language_loss": 0.67802662, "learning_rate": 6.785715393476586e-07, "loss": 0.69917965, "num_input_tokens_seen": 264707625, "step": 12272, "time_per_iteration": 2.6848604679107666 }, { "auxiliary_loss_clip": 0.01086013, "auxiliary_loss_mlp": 0.01030527, "balance_loss_clip": 1.03750646, "balance_loss_mlp": 1.01848674, "epoch": 0.7378926799939877, "flos": 17416388223360.0, "grad_norm": 2.2309811346874655, "language_loss": 0.780334, "learning_rate": 6.782792198481049e-07, "loss": 0.80149937, "num_input_tokens_seen": 264725575, "step": 12273, "time_per_iteration": 2.635556936264038 }, { "auxiliary_loss_clip": 0.01109904, "auxiliary_loss_mlp": 0.01030975, "balance_loss_clip": 1.03768742, "balance_loss_mlp": 1.01857686, "epoch": 0.7379528032466557, "flos": 18474208778880.0, "grad_norm": 1.8331912360811773, "language_loss": 0.83564162, "learning_rate": 6.779869504683355e-07, "loss": 0.85705042, "num_input_tokens_seen": 264742855, "step": 12274, "time_per_iteration": 2.5715138912200928 }, { "auxiliary_loss_clip": 0.01091523, "auxiliary_loss_mlp": 0.00771783, "balance_loss_clip": 1.03963578, "balance_loss_mlp": 1.00021505, "epoch": 0.7380129264993236, "flos": 17821999578240.0, "grad_norm": 2.3015182106996237, "language_loss": 0.73600042, "learning_rate": 6.776947312194341e-07, "loss": 0.75463349, "num_input_tokens_seen": 264761155, "step": 12275, "time_per_iteration": 2.715363025665283 }, { "auxiliary_loss_clip": 0.01078211, "auxiliary_loss_mlp": 0.01054085, "balance_loss_clip": 1.03664327, "balance_loss_mlp": 1.03894567, "epoch": 0.7380730497519916, "flos": 22997372918400.0, "grad_norm": 1.6539392854769155, "language_loss": 0.73462373, "learning_rate": 6.774025621124813e-07, "loss": 0.75594664, "num_input_tokens_seen": 264780660, "step": 12276, "time_per_iteration": 2.7231481075286865 }, { "auxiliary_loss_clip": 0.01112925, "auxiliary_loss_mlp": 0.01031524, "balance_loss_clip": 1.03907084, "balance_loss_mlp": 1.01874495, "epoch": 0.7381331730046595, "flos": 20266259241600.0, "grad_norm": 1.9864441113033549, "language_loss": 0.7796191, "learning_rate": 6.771104431585551e-07, "loss": 0.80106354, "num_input_tokens_seen": 264798850, "step": 12277, "time_per_iteration": 2.5575850009918213 }, { "auxiliary_loss_clip": 0.01110863, "auxiliary_loss_mlp": 0.01038626, "balance_loss_clip": 1.03995776, "balance_loss_mlp": 1.0259068, "epoch": 0.7381932962573275, "flos": 19754532132480.0, "grad_norm": 2.416998693757566, "language_loss": 0.78511059, "learning_rate": 6.768183743687338e-07, "loss": 0.80660546, "num_input_tokens_seen": 264816795, "step": 12278, "time_per_iteration": 2.542168617248535 }, { "auxiliary_loss_clip": 0.01102779, "auxiliary_loss_mlp": 0.00771237, "balance_loss_clip": 1.03840756, "balance_loss_mlp": 1.00024569, "epoch": 0.7382534195099955, "flos": 17305316392320.0, "grad_norm": 2.0236332127409, "language_loss": 0.72539043, "learning_rate": 6.765263557540921e-07, "loss": 0.74413061, "num_input_tokens_seen": 264834105, "step": 12279, "time_per_iteration": 2.612534761428833 }, { "auxiliary_loss_clip": 0.01103104, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.03738606, "balance_loss_mlp": 1.02173257, "epoch": 0.7383135427626635, "flos": 18697358021760.0, "grad_norm": 2.394018024730235, "language_loss": 0.86069536, "learning_rate": 6.762343873257034e-07, "loss": 0.88208055, "num_input_tokens_seen": 264850895, "step": 12280, "time_per_iteration": 2.611475944519043 }, { "auxiliary_loss_clip": 0.01073789, "auxiliary_loss_mlp": 0.01032032, "balance_loss_clip": 1.03634775, "balance_loss_mlp": 1.01885295, "epoch": 0.7383736660153314, "flos": 20881300844160.0, "grad_norm": 1.8693617932134328, "language_loss": 0.72391272, "learning_rate": 6.759424690946408e-07, "loss": 0.74497092, "num_input_tokens_seen": 264869505, "step": 12281, "time_per_iteration": 2.718876361846924 }, { "auxiliary_loss_clip": 0.0106943, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.0354619, "balance_loss_mlp": 1.02190232, "epoch": 0.7384337892679994, "flos": 20663215418880.0, "grad_norm": 1.705222549149129, "language_loss": 0.60742152, "learning_rate": 6.756506010719711e-07, "loss": 0.62846637, "num_input_tokens_seen": 264886915, "step": 12282, "time_per_iteration": 2.70023775100708 }, { "auxiliary_loss_clip": 0.01077848, "auxiliary_loss_mlp": 0.01030119, "balance_loss_clip": 1.03686452, "balance_loss_mlp": 1.01697028, "epoch": 0.7384939125206673, "flos": 29169627390720.0, "grad_norm": 1.8611774916735326, "language_loss": 0.6824851, "learning_rate": 6.753587832687632e-07, "loss": 0.70356476, "num_input_tokens_seen": 264910350, "step": 12283, "time_per_iteration": 2.758152484893799 }, { "auxiliary_loss_clip": 0.01112935, "auxiliary_loss_mlp": 0.00771245, "balance_loss_clip": 1.040452, "balance_loss_mlp": 1.00015855, "epoch": 0.7385540357733353, "flos": 36312833376000.0, "grad_norm": 1.7271477850401677, "language_loss": 0.76260293, "learning_rate": 6.750670156960832e-07, "loss": 0.78144467, "num_input_tokens_seen": 264930705, "step": 12284, "time_per_iteration": 2.7076218128204346 }, { "auxiliary_loss_clip": 0.01094916, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.03557301, "balance_loss_mlp": 1.02121985, "epoch": 0.7386141590260034, "flos": 20302600826880.0, "grad_norm": 1.9358750531249929, "language_loss": 0.68962932, "learning_rate": 6.747752983649954e-07, "loss": 0.7109322, "num_input_tokens_seen": 264946975, "step": 12285, "time_per_iteration": 2.572366714477539 }, { "auxiliary_loss_clip": 0.01095815, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.03904641, "balance_loss_mlp": 1.02421951, "epoch": 0.7386742822786713, "flos": 25483792170240.0, "grad_norm": 1.9975794318154387, "language_loss": 0.79803824, "learning_rate": 6.744836312865602e-07, "loss": 0.81937724, "num_input_tokens_seen": 264967665, "step": 12286, "time_per_iteration": 2.6924288272857666 }, { "auxiliary_loss_clip": 0.01062201, "auxiliary_loss_mlp": 0.01027877, "balance_loss_clip": 1.03638017, "balance_loss_mlp": 1.01515102, "epoch": 0.7387344055313393, "flos": 13771958405760.0, "grad_norm": 2.075219582835579, "language_loss": 0.65311086, "learning_rate": 6.741920144718396e-07, "loss": 0.67401159, "num_input_tokens_seen": 264985480, "step": 12287, "time_per_iteration": 2.7654411792755127 }, { "auxiliary_loss_clip": 0.010848, "auxiliary_loss_mlp": 0.01026868, "balance_loss_clip": 1.03562939, "balance_loss_mlp": 1.01483417, "epoch": 0.7387945287840072, "flos": 27855189095040.0, "grad_norm": 2.1085520874155046, "language_loss": 0.76855958, "learning_rate": 6.739004479318903e-07, "loss": 0.78967619, "num_input_tokens_seen": 265004790, "step": 12288, "time_per_iteration": 2.6597485542297363 }, { "auxiliary_loss_clip": 0.01104274, "auxiliary_loss_mlp": 0.00771655, "balance_loss_clip": 1.04053795, "balance_loss_mlp": 1.00024295, "epoch": 0.7388546520366752, "flos": 44233039388160.0, "grad_norm": 1.5714095328418676, "language_loss": 0.58359075, "learning_rate": 6.736089316777684e-07, "loss": 0.60235, "num_input_tokens_seen": 265028790, "step": 12289, "time_per_iteration": 2.790731906890869 }, { "auxiliary_loss_clip": 0.01031232, "auxiliary_loss_mlp": 0.00751213, "balance_loss_clip": 1.00846362, "balance_loss_mlp": 0.99965459, "epoch": 0.7389147752893431, "flos": 70680890638080.0, "grad_norm": 0.6357735365195177, "language_loss": 0.49246126, "learning_rate": 6.733174657205287e-07, "loss": 0.51028574, "num_input_tokens_seen": 265096660, "step": 12290, "time_per_iteration": 3.243767261505127 }, { "auxiliary_loss_clip": 0.01096247, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.03841698, "balance_loss_mlp": 1.02171409, "epoch": 0.7389748985420111, "flos": 25994980575360.0, "grad_norm": 3.780514148170796, "language_loss": 0.67435575, "learning_rate": 6.730260500712237e-07, "loss": 0.69567037, "num_input_tokens_seen": 265116375, "step": 12291, "time_per_iteration": 2.605470895767212 }, { "auxiliary_loss_clip": 0.0099264, "auxiliary_loss_mlp": 0.01000802, "balance_loss_clip": 1.00994468, "balance_loss_mlp": 0.99969369, "epoch": 0.7390350217946791, "flos": 54403661318400.0, "grad_norm": 0.9871071197765896, "language_loss": 0.60852838, "learning_rate": 6.727346847409052e-07, "loss": 0.62846279, "num_input_tokens_seen": 265161230, "step": 12292, "time_per_iteration": 2.888421058654785 }, { "auxiliary_loss_clip": 0.0106381, "auxiliary_loss_mlp": 0.01034194, "balance_loss_clip": 1.03513324, "balance_loss_mlp": 1.0222311, "epoch": 0.7390951450473471, "flos": 32196968530560.0, "grad_norm": 2.192815626746647, "language_loss": 0.66975296, "learning_rate": 6.724433697406191e-07, "loss": 0.69073296, "num_input_tokens_seen": 265182515, "step": 12293, "time_per_iteration": 2.8275856971740723 }, { "auxiliary_loss_clip": 0.01100034, "auxiliary_loss_mlp": 0.0103067, "balance_loss_clip": 1.03730226, "balance_loss_mlp": 1.01779556, "epoch": 0.739155268300015, "flos": 16684241304960.0, "grad_norm": 1.9827271257615733, "language_loss": 0.83464789, "learning_rate": 6.721521050814134e-07, "loss": 0.85595489, "num_input_tokens_seen": 265198160, "step": 12294, "time_per_iteration": 2.597766160964966 }, { "auxiliary_loss_clip": 0.01077206, "auxiliary_loss_mlp": 0.01033056, "balance_loss_clip": 1.03740942, "balance_loss_mlp": 1.0197401, "epoch": 0.739215391552683, "flos": 31649761762560.0, "grad_norm": 1.5365825507794162, "language_loss": 0.72879148, "learning_rate": 6.718608907743337e-07, "loss": 0.74989408, "num_input_tokens_seen": 265218480, "step": 12295, "time_per_iteration": 2.7728140354156494 }, { "auxiliary_loss_clip": 0.0109979, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.03960156, "balance_loss_mlp": 1.02521241, "epoch": 0.7392755148053509, "flos": 29718522097920.0, "grad_norm": 2.087551297048025, "language_loss": 0.7901718, "learning_rate": 6.715697268304215e-07, "loss": 0.81154513, "num_input_tokens_seen": 265240165, "step": 12296, "time_per_iteration": 2.7069132328033447 }, { "auxiliary_loss_clip": 0.01112194, "auxiliary_loss_mlp": 0.01031879, "balance_loss_clip": 1.03957283, "balance_loss_mlp": 1.01797891, "epoch": 0.7393356380580189, "flos": 37050475075200.0, "grad_norm": 2.421267182668315, "language_loss": 0.66443473, "learning_rate": 6.712786132607182e-07, "loss": 0.68587548, "num_input_tokens_seen": 265263295, "step": 12297, "time_per_iteration": 4.15710186958313 }, { "auxiliary_loss_clip": 0.01086243, "auxiliary_loss_mlp": 0.01038586, "balance_loss_clip": 1.03743219, "balance_loss_mlp": 1.02521062, "epoch": 0.739395761310687, "flos": 19719627091200.0, "grad_norm": 2.031169028874948, "language_loss": 0.68639588, "learning_rate": 6.709875500762645e-07, "loss": 0.70764422, "num_input_tokens_seen": 265282740, "step": 12298, "time_per_iteration": 2.6803133487701416 }, { "auxiliary_loss_clip": 0.01083526, "auxiliary_loss_mlp": 0.0103472, "balance_loss_clip": 1.03630257, "balance_loss_mlp": 1.02177382, "epoch": 0.7394558845633549, "flos": 11801504067840.0, "grad_norm": 1.810073219689882, "language_loss": 0.7460804, "learning_rate": 6.706965372880946e-07, "loss": 0.76726282, "num_input_tokens_seen": 265300175, "step": 12299, "time_per_iteration": 4.1317057609558105 }, { "auxiliary_loss_clip": 0.01013835, "auxiliary_loss_mlp": 0.00999495, "balance_loss_clip": 1.0160886, "balance_loss_mlp": 0.99818373, "epoch": 0.7395160078160229, "flos": 66195827850240.0, "grad_norm": 0.7191377980528004, "language_loss": 0.60850734, "learning_rate": 6.704055749072455e-07, "loss": 0.62864065, "num_input_tokens_seen": 265363275, "step": 12300, "time_per_iteration": 4.986863136291504 }, { "auxiliary_loss_clip": 0.01084534, "auxiliary_loss_mlp": 0.01031775, "balance_loss_clip": 1.03953075, "balance_loss_mlp": 1.01876962, "epoch": 0.7395761310686908, "flos": 21249708687360.0, "grad_norm": 1.6608612377328966, "language_loss": 0.80444926, "learning_rate": 6.7011466294475e-07, "loss": 0.82561237, "num_input_tokens_seen": 265382935, "step": 12301, "time_per_iteration": 2.635004997253418 }, { "auxiliary_loss_clip": 0.01109746, "auxiliary_loss_mlp": 0.01029708, "balance_loss_clip": 1.03857565, "balance_loss_mlp": 1.01823974, "epoch": 0.7396362543213588, "flos": 25955299025280.0, "grad_norm": 1.5135415761232773, "language_loss": 0.73152131, "learning_rate": 6.698238014116406e-07, "loss": 0.75291586, "num_input_tokens_seen": 265403245, "step": 12302, "time_per_iteration": 2.612121105194092 }, { "auxiliary_loss_clip": 0.01113143, "auxiliary_loss_mlp": 0.01041216, "balance_loss_clip": 1.03972757, "balance_loss_mlp": 1.02819264, "epoch": 0.7396963775740267, "flos": 27377936064000.0, "grad_norm": 6.478228728649492, "language_loss": 0.73720932, "learning_rate": 6.695329903189451e-07, "loss": 0.75875294, "num_input_tokens_seen": 265423105, "step": 12303, "time_per_iteration": 4.152388334274292 }, { "auxiliary_loss_clip": 0.01109918, "auxiliary_loss_mlp": 0.01030651, "balance_loss_clip": 1.03906059, "balance_loss_mlp": 1.01861048, "epoch": 0.7397565008266948, "flos": 25520133755520.0, "grad_norm": 1.665147368260365, "language_loss": 0.53981858, "learning_rate": 6.692422296776927e-07, "loss": 0.56122428, "num_input_tokens_seen": 265443445, "step": 12304, "time_per_iteration": 2.6007986068725586 }, { "auxiliary_loss_clip": 0.01088478, "auxiliary_loss_mlp": 0.01038752, "balance_loss_clip": 1.03643012, "balance_loss_mlp": 1.02587104, "epoch": 0.7398166240793627, "flos": 23727760070400.0, "grad_norm": 4.218553993502621, "language_loss": 0.84787995, "learning_rate": 6.689515194989084e-07, "loss": 0.86915219, "num_input_tokens_seen": 265462085, "step": 12305, "time_per_iteration": 2.7033863067626953 }, { "auxiliary_loss_clip": 0.01007992, "auxiliary_loss_mlp": 0.01002097, "balance_loss_clip": 1.00802636, "balance_loss_mlp": 1.00075579, "epoch": 0.7398767473320307, "flos": 67267582882560.0, "grad_norm": 0.8984474927660691, "language_loss": 0.57649475, "learning_rate": 6.68660859793615e-07, "loss": 0.59659564, "num_input_tokens_seen": 265521190, "step": 12306, "time_per_iteration": 3.190584421157837 }, { "auxiliary_loss_clip": 0.01091647, "auxiliary_loss_mlp": 0.01034585, "balance_loss_clip": 1.03991795, "balance_loss_mlp": 1.02137649, "epoch": 0.7399368705846986, "flos": 22018699981440.0, "grad_norm": 1.9564303795331826, "language_loss": 0.81826288, "learning_rate": 6.683702505728355e-07, "loss": 0.83952522, "num_input_tokens_seen": 265539705, "step": 12307, "time_per_iteration": 2.760991096496582 }, { "auxiliary_loss_clip": 0.01094355, "auxiliary_loss_mlp": 0.01035489, "balance_loss_clip": 1.04020476, "balance_loss_mlp": 1.0237112, "epoch": 0.7399969938373666, "flos": 14173870659840.0, "grad_norm": 1.7875471048417528, "language_loss": 0.69662929, "learning_rate": 6.680796918475893e-07, "loss": 0.71792769, "num_input_tokens_seen": 265555855, "step": 12308, "time_per_iteration": 2.786059617996216 }, { "auxiliary_loss_clip": 0.01080019, "auxiliary_loss_mlp": 0.01030655, "balance_loss_clip": 1.03736496, "balance_loss_mlp": 1.01869845, "epoch": 0.7400571170900345, "flos": 25301473712640.0, "grad_norm": 1.9234846760439523, "language_loss": 0.81795132, "learning_rate": 6.67789183628896e-07, "loss": 0.83905804, "num_input_tokens_seen": 265575455, "step": 12309, "time_per_iteration": 2.6756904125213623 }, { "auxiliary_loss_clip": 0.01100831, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.03873348, "balance_loss_mlp": 1.02133226, "epoch": 0.7401172403427025, "flos": 22711344917760.0, "grad_norm": 3.264420183038049, "language_loss": 0.72705656, "learning_rate": 6.674987259277692e-07, "loss": 0.74841309, "num_input_tokens_seen": 265595250, "step": 12310, "time_per_iteration": 2.7013933658599854 }, { "auxiliary_loss_clip": 0.01075917, "auxiliary_loss_mlp": 0.01042964, "balance_loss_clip": 1.0368607, "balance_loss_mlp": 1.02921319, "epoch": 0.7401773635953706, "flos": 18067448188800.0, "grad_norm": 2.4013054691194915, "language_loss": 0.88485903, "learning_rate": 6.672083187552239e-07, "loss": 0.90604782, "num_input_tokens_seen": 265606945, "step": 12311, "time_per_iteration": 2.6424548625946045 }, { "auxiliary_loss_clip": 0.01046645, "auxiliary_loss_mlp": 0.01029353, "balance_loss_clip": 1.0324477, "balance_loss_mlp": 1.01692545, "epoch": 0.7402374868480385, "flos": 22712135016960.0, "grad_norm": 1.58737852842035, "language_loss": 0.80510384, "learning_rate": 6.669179621222738e-07, "loss": 0.82586384, "num_input_tokens_seen": 265626115, "step": 12312, "time_per_iteration": 2.820053815841675 }, { "auxiliary_loss_clip": 0.01060693, "auxiliary_loss_mlp": 0.01035735, "balance_loss_clip": 1.03197908, "balance_loss_mlp": 1.02264023, "epoch": 0.7402976101007065, "flos": 22856675345280.0, "grad_norm": 1.990612245665929, "language_loss": 0.78425479, "learning_rate": 6.666276560399273e-07, "loss": 0.80521905, "num_input_tokens_seen": 265646520, "step": 12313, "time_per_iteration": 2.756864547729492 }, { "auxiliary_loss_clip": 0.01059901, "auxiliary_loss_mlp": 0.01038311, "balance_loss_clip": 1.03464198, "balance_loss_mlp": 1.02487016, "epoch": 0.7403577333533744, "flos": 12345801834240.0, "grad_norm": 2.1312329589300947, "language_loss": 0.78784394, "learning_rate": 6.663374005191937e-07, "loss": 0.80882609, "num_input_tokens_seen": 265661875, "step": 12314, "time_per_iteration": 2.7299044132232666 }, { "auxiliary_loss_clip": 0.01020285, "auxiliary_loss_mlp": 0.01000472, "balance_loss_clip": 1.00777555, "balance_loss_mlp": 0.99948281, "epoch": 0.7404178566060424, "flos": 60327270869760.0, "grad_norm": 0.9319847439120421, "language_loss": 0.55094397, "learning_rate": 6.660471955710809e-07, "loss": 0.57115149, "num_input_tokens_seen": 265721255, "step": 12315, "time_per_iteration": 3.201897382736206 }, { "auxiliary_loss_clip": 0.01093771, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.03759921, "balance_loss_mlp": 1.02371287, "epoch": 0.7404779798587103, "flos": 32014650072960.0, "grad_norm": 1.5030342819067668, "language_loss": 0.79353088, "learning_rate": 6.65757041206591e-07, "loss": 0.81483227, "num_input_tokens_seen": 265743970, "step": 12316, "time_per_iteration": 2.705349922180176 }, { "auxiliary_loss_clip": 0.01098009, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.03624582, "balance_loss_mlp": 1.01957273, "epoch": 0.7405381031113784, "flos": 12889704551040.0, "grad_norm": 1.7371134770990158, "language_loss": 0.7492671, "learning_rate": 6.654669374367275e-07, "loss": 0.77056682, "num_input_tokens_seen": 265760890, "step": 12317, "time_per_iteration": 2.637202024459839 }, { "auxiliary_loss_clip": 0.01078909, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.03754401, "balance_loss_mlp": 1.02296102, "epoch": 0.7405982263640463, "flos": 20229127557120.0, "grad_norm": 1.520938817583414, "language_loss": 0.81343406, "learning_rate": 6.651768842724917e-07, "loss": 0.834575, "num_input_tokens_seen": 265779600, "step": 12318, "time_per_iteration": 2.7076103687286377 }, { "auxiliary_loss_clip": 0.01084776, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.03475654, "balance_loss_mlp": 1.0187031, "epoch": 0.7406583496167143, "flos": 17567213431680.0, "grad_norm": 1.9057934883865575, "language_loss": 0.76502925, "learning_rate": 6.648868817248827e-07, "loss": 0.7861923, "num_input_tokens_seen": 265797030, "step": 12319, "time_per_iteration": 2.6530611515045166 }, { "auxiliary_loss_clip": 0.01080701, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.0368222, "balance_loss_mlp": 1.0211314, "epoch": 0.7407184728693822, "flos": 18295733076480.0, "grad_norm": 2.7907820586254064, "language_loss": 0.64157581, "learning_rate": 6.64596929804897e-07, "loss": 0.66271174, "num_input_tokens_seen": 265815055, "step": 12320, "time_per_iteration": 2.7634599208831787 }, { "auxiliary_loss_clip": 0.0110264, "auxiliary_loss_mlp": 0.01041469, "balance_loss_clip": 1.03931427, "balance_loss_mlp": 1.02880883, "epoch": 0.7407785961220502, "flos": 16690562098560.0, "grad_norm": 2.6669296111663168, "language_loss": 0.8214829, "learning_rate": 6.643070285235288e-07, "loss": 0.842924, "num_input_tokens_seen": 265828480, "step": 12321, "time_per_iteration": 2.603889226913452 }, { "auxiliary_loss_clip": 0.01091833, "auxiliary_loss_mlp": 0.01048957, "balance_loss_clip": 1.03682292, "balance_loss_mlp": 1.03459191, "epoch": 0.7408387193747181, "flos": 22088330496000.0, "grad_norm": 2.755383259535151, "language_loss": 0.72079754, "learning_rate": 6.640171778917727e-07, "loss": 0.74220538, "num_input_tokens_seen": 265845825, "step": 12322, "time_per_iteration": 2.5962164402008057 }, { "auxiliary_loss_clip": 0.01100778, "auxiliary_loss_mlp": 0.00770917, "balance_loss_clip": 1.03753436, "balance_loss_mlp": 1.0002656, "epoch": 0.7408988426273861, "flos": 24236721832320.0, "grad_norm": 1.859375439746312, "language_loss": 0.64215767, "learning_rate": 6.637273779206183e-07, "loss": 0.66087461, "num_input_tokens_seen": 265866335, "step": 12323, "time_per_iteration": 2.650984525680542 }, { "auxiliary_loss_clip": 0.01074935, "auxiliary_loss_mlp": 0.01032883, "balance_loss_clip": 1.03454328, "balance_loss_mlp": 1.01984739, "epoch": 0.7409589658800542, "flos": 29023004073600.0, "grad_norm": 1.364972718978451, "language_loss": 0.75983679, "learning_rate": 6.634376286210559e-07, "loss": 0.78091496, "num_input_tokens_seen": 265888945, "step": 12324, "time_per_iteration": 2.758053779602051 }, { "auxiliary_loss_clip": 0.01079211, "auxiliary_loss_mlp": 0.01027292, "balance_loss_clip": 1.03694987, "balance_loss_mlp": 1.01489401, "epoch": 0.7410190891327221, "flos": 19351362902400.0, "grad_norm": 1.7409894929083622, "language_loss": 0.74638963, "learning_rate": 6.63147930004073e-07, "loss": 0.76745468, "num_input_tokens_seen": 265908030, "step": 12325, "time_per_iteration": 2.6512198448181152 }, { "auxiliary_loss_clip": 0.01070767, "auxiliary_loss_mlp": 0.01038532, "balance_loss_clip": 1.03589809, "balance_loss_mlp": 1.02524054, "epoch": 0.7410792123853901, "flos": 22747650589440.0, "grad_norm": 1.8899213582685095, "language_loss": 0.68341279, "learning_rate": 6.628582820806545e-07, "loss": 0.7045058, "num_input_tokens_seen": 265927030, "step": 12326, "time_per_iteration": 2.760312557220459 }, { "auxiliary_loss_clip": 0.01072406, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.03731251, "balance_loss_mlp": 1.01672876, "epoch": 0.741139335638058, "flos": 25372433030400.0, "grad_norm": 1.6031079526338634, "language_loss": 0.89560592, "learning_rate": 6.625686848617835e-07, "loss": 0.91661912, "num_input_tokens_seen": 265945490, "step": 12327, "time_per_iteration": 2.753051519393921 }, { "auxiliary_loss_clip": 0.01110031, "auxiliary_loss_mlp": 0.01032575, "balance_loss_clip": 1.03885567, "balance_loss_mlp": 1.0198555, "epoch": 0.741199458890726, "flos": 18585639745920.0, "grad_norm": 1.7237905370438114, "language_loss": 0.85383123, "learning_rate": 6.62279138358442e-07, "loss": 0.87525725, "num_input_tokens_seen": 265963265, "step": 12328, "time_per_iteration": 2.5977120399475098 }, { "auxiliary_loss_clip": 0.01098285, "auxiliary_loss_mlp": 0.01032958, "balance_loss_clip": 1.0383029, "balance_loss_mlp": 1.01909983, "epoch": 0.7412595821433939, "flos": 22127078292480.0, "grad_norm": 1.669888281499519, "language_loss": 0.66867191, "learning_rate": 6.619896425816103e-07, "loss": 0.68998432, "num_input_tokens_seen": 265982270, "step": 12329, "time_per_iteration": 2.63157057762146 }, { "auxiliary_loss_clip": 0.01078104, "auxiliary_loss_mlp": 0.01042687, "balance_loss_clip": 1.03691041, "balance_loss_mlp": 1.02878761, "epoch": 0.741319705396062, "flos": 29169699217920.0, "grad_norm": 1.6151090072025307, "language_loss": 0.66697407, "learning_rate": 6.617001975422647e-07, "loss": 0.688182, "num_input_tokens_seen": 266003835, "step": 12330, "time_per_iteration": 2.8134610652923584 }, { "auxiliary_loss_clip": 0.01078521, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.04112339, "balance_loss_mlp": 1.02134609, "epoch": 0.7413798286487299, "flos": 20667489137280.0, "grad_norm": 2.0428405490816837, "language_loss": 0.85805637, "learning_rate": 6.614108032513823e-07, "loss": 0.87920308, "num_input_tokens_seen": 266021595, "step": 12331, "time_per_iteration": 2.812793493270874 }, { "auxiliary_loss_clip": 0.01048375, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.0381304, "balance_loss_mlp": 1.02189922, "epoch": 0.7414399519013979, "flos": 16398895662720.0, "grad_norm": 1.9478476477957887, "language_loss": 0.6967262, "learning_rate": 6.611214597199364e-07, "loss": 0.71755838, "num_input_tokens_seen": 266039860, "step": 12332, "time_per_iteration": 3.0447654724121094 }, { "auxiliary_loss_clip": 0.01112852, "auxiliary_loss_mlp": 0.01040645, "balance_loss_clip": 1.03986526, "balance_loss_mlp": 1.02710271, "epoch": 0.7415000751540658, "flos": 25630235919360.0, "grad_norm": 1.894199070779257, "language_loss": 0.63652647, "learning_rate": 6.608321669588984e-07, "loss": 0.65806139, "num_input_tokens_seen": 266058050, "step": 12333, "time_per_iteration": 2.8000104427337646 }, { "auxiliary_loss_clip": 0.010897, "auxiliary_loss_mlp": 0.01035147, "balance_loss_clip": 1.04135418, "balance_loss_mlp": 1.02300525, "epoch": 0.7415601984067338, "flos": 24499732193280.0, "grad_norm": 1.6946502841165116, "language_loss": 0.71084702, "learning_rate": 6.605429249792387e-07, "loss": 0.73209548, "num_input_tokens_seen": 266078060, "step": 12334, "time_per_iteration": 2.7801129817962646 }, { "auxiliary_loss_clip": 0.01065371, "auxiliary_loss_mlp": 0.01027933, "balance_loss_clip": 1.0374248, "balance_loss_mlp": 1.01558292, "epoch": 0.7416203216594017, "flos": 20887154760960.0, "grad_norm": 1.6662744969405867, "language_loss": 0.82556254, "learning_rate": 6.602537337919257e-07, "loss": 0.84649551, "num_input_tokens_seen": 266097110, "step": 12335, "time_per_iteration": 2.7619669437408447 }, { "auxiliary_loss_clip": 0.01111608, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.03896701, "balance_loss_mlp": 1.01763475, "epoch": 0.7416804449120697, "flos": 15624265933440.0, "grad_norm": 2.6708776221620134, "language_loss": 0.74853325, "learning_rate": 6.599645934079259e-07, "loss": 0.76996362, "num_input_tokens_seen": 266110870, "step": 12336, "time_per_iteration": 4.294764518737793 }, { "auxiliary_loss_clip": 0.01068313, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.03603351, "balance_loss_mlp": 1.02284563, "epoch": 0.7417405681647377, "flos": 17120483982720.0, "grad_norm": 1.9180906175997412, "language_loss": 0.73796511, "learning_rate": 6.596755038382029e-07, "loss": 0.75901294, "num_input_tokens_seen": 266127845, "step": 12337, "time_per_iteration": 2.8595807552337646 }, { "auxiliary_loss_clip": 0.01083057, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.03681028, "balance_loss_mlp": 1.0262543, "epoch": 0.7418006914174057, "flos": 18880322924160.0, "grad_norm": 1.6574383205520367, "language_loss": 0.76809967, "learning_rate": 6.593864650937186e-07, "loss": 0.78932124, "num_input_tokens_seen": 266145400, "step": 12338, "time_per_iteration": 4.203794240951538 }, { "auxiliary_loss_clip": 0.01099752, "auxiliary_loss_mlp": 0.01031615, "balance_loss_clip": 1.03882122, "balance_loss_mlp": 1.02033818, "epoch": 0.7418608146700737, "flos": 21580733450880.0, "grad_norm": 1.7161166457507804, "language_loss": 0.73070621, "learning_rate": 6.590974771854345e-07, "loss": 0.75201988, "num_input_tokens_seen": 266164430, "step": 12339, "time_per_iteration": 4.210087776184082 }, { "auxiliary_loss_clip": 0.01092405, "auxiliary_loss_mlp": 0.01031981, "balance_loss_clip": 1.0387336, "balance_loss_mlp": 1.01890945, "epoch": 0.7419209379227416, "flos": 22340459036160.0, "grad_norm": 2.0219989421818276, "language_loss": 0.79605651, "learning_rate": 6.588085401243077e-07, "loss": 0.81730038, "num_input_tokens_seen": 266183855, "step": 12340, "time_per_iteration": 2.670774221420288 }, { "auxiliary_loss_clip": 0.01069023, "auxiliary_loss_mlp": 0.01036356, "balance_loss_clip": 1.03491449, "balance_loss_mlp": 1.02310038, "epoch": 0.7419810611754096, "flos": 16762275601920.0, "grad_norm": 2.432257860237773, "language_loss": 0.75854677, "learning_rate": 6.585196539212958e-07, "loss": 0.77960056, "num_input_tokens_seen": 266202085, "step": 12341, "time_per_iteration": 2.686434268951416 }, { "auxiliary_loss_clip": 0.0107769, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.03510964, "balance_loss_mlp": 1.02783489, "epoch": 0.7420411844280775, "flos": 26212958259840.0, "grad_norm": 1.4473494294427032, "language_loss": 0.8024286, "learning_rate": 6.582308185873535e-07, "loss": 0.8236174, "num_input_tokens_seen": 266223445, "step": 12342, "time_per_iteration": 4.343433380126953 }, { "auxiliary_loss_clip": 0.01075896, "auxiliary_loss_mlp": 0.01027447, "balance_loss_clip": 1.03609908, "balance_loss_mlp": 1.01511443, "epoch": 0.7421013076807456, "flos": 68529371840640.0, "grad_norm": 1.749760257309467, "language_loss": 0.77626014, "learning_rate": 6.57942034133433e-07, "loss": 0.79729354, "num_input_tokens_seen": 266246575, "step": 12343, "time_per_iteration": 3.107714891433716 }, { "auxiliary_loss_clip": 0.01082874, "auxiliary_loss_mlp": 0.01034526, "balance_loss_clip": 1.03323293, "balance_loss_mlp": 1.02221727, "epoch": 0.7421614309334135, "flos": 24425325169920.0, "grad_norm": 1.6706510034937676, "language_loss": 0.67636979, "learning_rate": 6.576533005704843e-07, "loss": 0.69754374, "num_input_tokens_seen": 266266055, "step": 12344, "time_per_iteration": 2.7599802017211914 }, { "auxiliary_loss_clip": 0.01065258, "auxiliary_loss_mlp": 0.01037206, "balance_loss_clip": 1.03660572, "balance_loss_mlp": 1.02291846, "epoch": 0.7422215541860815, "flos": 12311076360960.0, "grad_norm": 2.3156123925604692, "language_loss": 0.81109858, "learning_rate": 6.573646179094572e-07, "loss": 0.83212328, "num_input_tokens_seen": 266282240, "step": 12345, "time_per_iteration": 2.7414791584014893 }, { "auxiliary_loss_clip": 0.01072147, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.03549957, "balance_loss_mlp": 1.02523887, "epoch": 0.7422816774387494, "flos": 19645579203840.0, "grad_norm": 1.9382183588535902, "language_loss": 0.70441389, "learning_rate": 6.570759861612988e-07, "loss": 0.72552097, "num_input_tokens_seen": 266300980, "step": 12346, "time_per_iteration": 2.728034734725952 }, { "auxiliary_loss_clip": 0.01102385, "auxiliary_loss_mlp": 0.0103363, "balance_loss_clip": 1.03974307, "balance_loss_mlp": 1.02126789, "epoch": 0.7423418006914174, "flos": 32015978876160.0, "grad_norm": 2.081189833506492, "language_loss": 0.73518687, "learning_rate": 6.56787405336953e-07, "loss": 0.75654697, "num_input_tokens_seen": 266322215, "step": 12347, "time_per_iteration": 2.691364049911499 }, { "auxiliary_loss_clip": 0.01090637, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.03648269, "balance_loss_mlp": 1.02162337, "epoch": 0.7424019239440853, "flos": 18916951818240.0, "grad_norm": 1.681708315108595, "language_loss": 0.80881745, "learning_rate": 6.564988754473642e-07, "loss": 0.83006883, "num_input_tokens_seen": 266341600, "step": 12348, "time_per_iteration": 2.719554901123047 }, { "auxiliary_loss_clip": 0.01110126, "auxiliary_loss_mlp": 0.01033153, "balance_loss_clip": 1.03918421, "balance_loss_mlp": 1.02082634, "epoch": 0.7424620471967533, "flos": 35876518871040.0, "grad_norm": 1.8616740684019923, "language_loss": 0.73023462, "learning_rate": 6.562103965034724e-07, "loss": 0.7516675, "num_input_tokens_seen": 266362895, "step": 12349, "time_per_iteration": 2.762857437133789 }, { "auxiliary_loss_clip": 0.01091582, "auxiliary_loss_mlp": 0.01035038, "balance_loss_clip": 1.03577137, "balance_loss_mlp": 1.02081633, "epoch": 0.7425221704494213, "flos": 27016603200000.0, "grad_norm": 2.2070987228261427, "language_loss": 0.78727913, "learning_rate": 6.559219685162165e-07, "loss": 0.80854535, "num_input_tokens_seen": 266384015, "step": 12350, "time_per_iteration": 2.67797589302063 }, { "auxiliary_loss_clip": 0.01067839, "auxiliary_loss_mlp": 0.01035914, "balance_loss_clip": 1.03754306, "balance_loss_mlp": 1.0233134, "epoch": 0.7425822937020893, "flos": 34167135559680.0, "grad_norm": 1.5216618153297856, "language_loss": 0.74963629, "learning_rate": 6.556335914965343e-07, "loss": 0.77067381, "num_input_tokens_seen": 266405990, "step": 12351, "time_per_iteration": 2.8214800357818604 }, { "auxiliary_loss_clip": 0.01055755, "auxiliary_loss_mlp": 0.01030345, "balance_loss_clip": 1.0381254, "balance_loss_mlp": 1.01733303, "epoch": 0.7426424169547573, "flos": 21283572234240.0, "grad_norm": 2.67642082180286, "language_loss": 0.81345606, "learning_rate": 6.553452654553611e-07, "loss": 0.83431703, "num_input_tokens_seen": 266424260, "step": 12352, "time_per_iteration": 2.8043935298919678 }, { "auxiliary_loss_clip": 0.01103554, "auxiliary_loss_mlp": 0.01039938, "balance_loss_clip": 1.0413506, "balance_loss_mlp": 1.02751637, "epoch": 0.7427025402074252, "flos": 22448442297600.0, "grad_norm": 1.8427124307905225, "language_loss": 0.72003049, "learning_rate": 6.550569904036307e-07, "loss": 0.74146539, "num_input_tokens_seen": 266444580, "step": 12353, "time_per_iteration": 2.726813793182373 }, { "auxiliary_loss_clip": 0.0110208, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.04067636, "balance_loss_mlp": 1.01913714, "epoch": 0.7427626634600932, "flos": 22524609087360.0, "grad_norm": 2.0628021124051275, "language_loss": 0.72218555, "learning_rate": 6.547687663522739e-07, "loss": 0.74351114, "num_input_tokens_seen": 266465640, "step": 12354, "time_per_iteration": 2.6648378372192383 }, { "auxiliary_loss_clip": 0.01020848, "auxiliary_loss_mlp": 0.01006019, "balance_loss_clip": 1.00787544, "balance_loss_mlp": 1.00489271, "epoch": 0.7428227867127611, "flos": 67209477655680.0, "grad_norm": 0.694826107122343, "language_loss": 0.59537125, "learning_rate": 6.544805933122199e-07, "loss": 0.61563993, "num_input_tokens_seen": 266531950, "step": 12355, "time_per_iteration": 3.3000428676605225 }, { "auxiliary_loss_clip": 0.01111904, "auxiliary_loss_mlp": 0.01030428, "balance_loss_clip": 1.03898406, "balance_loss_mlp": 1.01765478, "epoch": 0.7428829099654292, "flos": 14721221082240.0, "grad_norm": 1.7387842003260185, "language_loss": 0.677315, "learning_rate": 6.541924712943971e-07, "loss": 0.69873834, "num_input_tokens_seen": 266550665, "step": 12356, "time_per_iteration": 2.577047824859619 }, { "auxiliary_loss_clip": 0.01100444, "auxiliary_loss_mlp": 0.00771382, "balance_loss_clip": 1.03524387, "balance_loss_mlp": 1.00019741, "epoch": 0.7429430332180971, "flos": 48646496413440.0, "grad_norm": 1.7685989280794623, "language_loss": 0.72208947, "learning_rate": 6.539044003097301e-07, "loss": 0.74080771, "num_input_tokens_seen": 266572455, "step": 12357, "time_per_iteration": 2.9096696376800537 }, { "auxiliary_loss_clip": 0.01088209, "auxiliary_loss_mlp": 0.01029654, "balance_loss_clip": 1.03906703, "balance_loss_mlp": 1.01782274, "epoch": 0.7430031564707651, "flos": 16764071281920.0, "grad_norm": 1.8287713858548653, "language_loss": 0.65631384, "learning_rate": 6.53616380369143e-07, "loss": 0.6774925, "num_input_tokens_seen": 266590895, "step": 12358, "time_per_iteration": 2.668260097503662 }, { "auxiliary_loss_clip": 0.01073582, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 1.0399549, "balance_loss_mlp": 1.02100861, "epoch": 0.743063279723433, "flos": 23870576545920.0, "grad_norm": 1.7940637938845212, "language_loss": 0.81230819, "learning_rate": 6.533284114835591e-07, "loss": 0.83339661, "num_input_tokens_seen": 266607660, "step": 12359, "time_per_iteration": 2.750425338745117 }, { "auxiliary_loss_clip": 0.01100028, "auxiliary_loss_mlp": 0.01032725, "balance_loss_clip": 1.03793418, "balance_loss_mlp": 1.01983833, "epoch": 0.743123402976101, "flos": 14391704689920.0, "grad_norm": 2.122041383037816, "language_loss": 0.67954987, "learning_rate": 6.530404936638956e-07, "loss": 0.70087737, "num_input_tokens_seen": 266624260, "step": 12360, "time_per_iteration": 2.638991355895996 }, { "auxiliary_loss_clip": 0.01099874, "auxiliary_loss_mlp": 0.00770722, "balance_loss_clip": 1.03788424, "balance_loss_mlp": 1.00024271, "epoch": 0.7431835262287689, "flos": 27454318335360.0, "grad_norm": 1.6135955801091852, "language_loss": 0.72960168, "learning_rate": 6.527526269210715e-07, "loss": 0.74830765, "num_input_tokens_seen": 266644210, "step": 12361, "time_per_iteration": 2.6851212978363037 }, { "auxiliary_loss_clip": 0.01061783, "auxiliary_loss_mlp": 0.01043643, "balance_loss_clip": 1.03427052, "balance_loss_mlp": 1.02964807, "epoch": 0.743243649481437, "flos": 20959514709120.0, "grad_norm": 1.8538295437323902, "language_loss": 0.55904317, "learning_rate": 6.524648112660027e-07, "loss": 0.58009744, "num_input_tokens_seen": 266664230, "step": 12362, "time_per_iteration": 2.6957335472106934 }, { "auxiliary_loss_clip": 0.01075259, "auxiliary_loss_mlp": 0.01030795, "balance_loss_clip": 1.03825688, "balance_loss_mlp": 1.01771164, "epoch": 0.7433037727341049, "flos": 22783166161920.0, "grad_norm": 1.5750012237947109, "language_loss": 0.77069867, "learning_rate": 6.521770467096039e-07, "loss": 0.79175913, "num_input_tokens_seen": 266683270, "step": 12363, "time_per_iteration": 2.7211437225341797 }, { "auxiliary_loss_clip": 0.01082709, "auxiliary_loss_mlp": 0.01036524, "balance_loss_clip": 1.03588808, "balance_loss_mlp": 1.02383995, "epoch": 0.7433638959867729, "flos": 22196708807040.0, "grad_norm": 1.6083671142844838, "language_loss": 0.78007239, "learning_rate": 6.518893332627862e-07, "loss": 0.8012647, "num_input_tokens_seen": 266701235, "step": 12364, "time_per_iteration": 2.6894009113311768 }, { "auxiliary_loss_clip": 0.01098885, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.03761303, "balance_loss_mlp": 1.025172, "epoch": 0.7434240192394409, "flos": 23296760778240.0, "grad_norm": 1.5760163793718025, "language_loss": 0.78754139, "learning_rate": 6.516016709364604e-07, "loss": 0.80890405, "num_input_tokens_seen": 266721495, "step": 12365, "time_per_iteration": 2.625281572341919 }, { "auxiliary_loss_clip": 0.01087609, "auxiliary_loss_mlp": 0.01033626, "balance_loss_clip": 1.03624249, "balance_loss_mlp": 1.02065635, "epoch": 0.7434841424921088, "flos": 54009575251200.0, "grad_norm": 1.5814760031444242, "language_loss": 0.76864719, "learning_rate": 6.513140597415346e-07, "loss": 0.78985953, "num_input_tokens_seen": 266747400, "step": 12366, "time_per_iteration": 2.9688045978546143 }, { "auxiliary_loss_clip": 0.01099866, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.04013896, "balance_loss_mlp": 1.01761758, "epoch": 0.7435442657447768, "flos": 21433966479360.0, "grad_norm": 1.3642058865548359, "language_loss": 0.71373397, "learning_rate": 6.510264996889141e-07, "loss": 0.73501575, "num_input_tokens_seen": 266767630, "step": 12367, "time_per_iteration": 2.661372184753418 }, { "auxiliary_loss_clip": 0.01084148, "auxiliary_loss_mlp": 0.01036563, "balance_loss_clip": 1.0383482, "balance_loss_mlp": 1.02371848, "epoch": 0.7436043889974447, "flos": 24499408970880.0, "grad_norm": 1.5961683932504214, "language_loss": 0.74215865, "learning_rate": 6.507389907895038e-07, "loss": 0.76336575, "num_input_tokens_seen": 266788015, "step": 12368, "time_per_iteration": 2.712043285369873 }, { "auxiliary_loss_clip": 0.01097444, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.03949308, "balance_loss_mlp": 1.02042866, "epoch": 0.7436645122501128, "flos": 40698388512000.0, "grad_norm": 2.9422959785728757, "language_loss": 0.69383776, "learning_rate": 6.50451533054207e-07, "loss": 0.71513051, "num_input_tokens_seen": 266809010, "step": 12369, "time_per_iteration": 2.7961301803588867 }, { "auxiliary_loss_clip": 0.01088683, "auxiliary_loss_mlp": 0.00770011, "balance_loss_clip": 1.03793979, "balance_loss_mlp": 1.00026274, "epoch": 0.7437246355027807, "flos": 18908835344640.0, "grad_norm": 1.8064840643083067, "language_loss": 0.75919938, "learning_rate": 6.501641264939233e-07, "loss": 0.77778637, "num_input_tokens_seen": 266825390, "step": 12370, "time_per_iteration": 2.7155323028564453 }, { "auxiliary_loss_clip": 0.01111903, "auxiliary_loss_mlp": 0.01036048, "balance_loss_clip": 1.04072666, "balance_loss_mlp": 1.02287519, "epoch": 0.7437847587554487, "flos": 21543817248000.0, "grad_norm": 2.0269448614883863, "language_loss": 0.78193456, "learning_rate": 6.498767711195503e-07, "loss": 0.80341411, "num_input_tokens_seen": 266844675, "step": 12371, "time_per_iteration": 2.6484358310699463 }, { "auxiliary_loss_clip": 0.01091423, "auxiliary_loss_mlp": 0.01029857, "balance_loss_clip": 1.03848553, "balance_loss_mlp": 1.01723862, "epoch": 0.7438448820081166, "flos": 27782470010880.0, "grad_norm": 1.6126638712287897, "language_loss": 0.69267446, "learning_rate": 6.495894669419857e-07, "loss": 0.71388721, "num_input_tokens_seen": 266865160, "step": 12372, "time_per_iteration": 2.7042236328125 }, { "auxiliary_loss_clip": 0.01079002, "auxiliary_loss_mlp": 0.01037244, "balance_loss_clip": 1.03700709, "balance_loss_mlp": 1.02461922, "epoch": 0.7439050052607846, "flos": 17967832796160.0, "grad_norm": 1.9384549362985082, "language_loss": 0.75196183, "learning_rate": 6.493022139721245e-07, "loss": 0.77312428, "num_input_tokens_seen": 266883285, "step": 12373, "time_per_iteration": 2.6364054679870605 }, { "auxiliary_loss_clip": 0.01057413, "auxiliary_loss_mlp": 0.01039492, "balance_loss_clip": 1.03332591, "balance_loss_mlp": 1.02528191, "epoch": 0.7439651285134525, "flos": 22958696949120.0, "grad_norm": 1.7332911073866848, "language_loss": 0.7709462, "learning_rate": 6.49015012220858e-07, "loss": 0.7919153, "num_input_tokens_seen": 266900960, "step": 12374, "time_per_iteration": 2.7238872051239014 }, { "auxiliary_loss_clip": 0.01048312, "auxiliary_loss_mlp": 0.01037947, "balance_loss_clip": 1.03520083, "balance_loss_mlp": 1.02472675, "epoch": 0.7440252517661206, "flos": 18806777827200.0, "grad_norm": 2.3876563861488496, "language_loss": 0.76403177, "learning_rate": 6.487278616990774e-07, "loss": 0.78489435, "num_input_tokens_seen": 266917710, "step": 12375, "time_per_iteration": 2.8014628887176514 }, { "auxiliary_loss_clip": 0.01098112, "auxiliary_loss_mlp": 0.01032005, "balance_loss_clip": 1.03817892, "balance_loss_mlp": 1.02062082, "epoch": 0.7440853750187885, "flos": 20266295155200.0, "grad_norm": 1.9311839942562836, "language_loss": 0.77011836, "learning_rate": 6.484407624176733e-07, "loss": 0.79141957, "num_input_tokens_seen": 266934220, "step": 12376, "time_per_iteration": 4.12352442741394 }, { "auxiliary_loss_clip": 0.01071601, "auxiliary_loss_mlp": 0.01038876, "balance_loss_clip": 1.03379536, "balance_loss_mlp": 1.02320004, "epoch": 0.7441454982714565, "flos": 25337276593920.0, "grad_norm": 1.692291173938847, "language_loss": 0.79398865, "learning_rate": 6.481537143875296e-07, "loss": 0.8150934, "num_input_tokens_seen": 266955210, "step": 12377, "time_per_iteration": 4.235915184020996 }, { "auxiliary_loss_clip": 0.010991, "auxiliary_loss_mlp": 0.01030466, "balance_loss_clip": 1.03905261, "balance_loss_mlp": 1.01754928, "epoch": 0.7442056215241245, "flos": 64480910866560.0, "grad_norm": 1.9747138110607607, "language_loss": 0.67284125, "learning_rate": 6.478667176195322e-07, "loss": 0.69413698, "num_input_tokens_seen": 266976555, "step": 12378, "time_per_iteration": 4.622121572494507 }, { "auxiliary_loss_clip": 0.010776, "auxiliary_loss_mlp": 0.01037137, "balance_loss_clip": 1.03861165, "balance_loss_mlp": 1.02326727, "epoch": 0.7442657447767924, "flos": 31285376242560.0, "grad_norm": 1.7913513654463005, "language_loss": 0.71687776, "learning_rate": 6.475797721245648e-07, "loss": 0.73802519, "num_input_tokens_seen": 266997640, "step": 12379, "time_per_iteration": 2.7747161388397217 }, { "auxiliary_loss_clip": 0.01072089, "auxiliary_loss_mlp": 0.00772364, "balance_loss_clip": 1.0351454, "balance_loss_mlp": 1.00025105, "epoch": 0.7443258680294604, "flos": 20807899401600.0, "grad_norm": 2.0210704518096523, "language_loss": 0.65216178, "learning_rate": 6.472928779135085e-07, "loss": 0.67060632, "num_input_tokens_seen": 267016165, "step": 12380, "time_per_iteration": 2.7074787616729736 }, { "auxiliary_loss_clip": 0.01101589, "auxiliary_loss_mlp": 0.01034892, "balance_loss_clip": 1.03957582, "balance_loss_mlp": 1.0219394, "epoch": 0.7443859912821283, "flos": 22199833290240.0, "grad_norm": 2.7838482793597388, "language_loss": 0.78674221, "learning_rate": 6.470060349972411e-07, "loss": 0.80810702, "num_input_tokens_seen": 267034075, "step": 12381, "time_per_iteration": 2.6567366123199463 }, { "auxiliary_loss_clip": 0.01072016, "auxiliary_loss_mlp": 0.01045243, "balance_loss_clip": 1.03785646, "balance_loss_mlp": 1.02981174, "epoch": 0.7444461145347964, "flos": 22017838055040.0, "grad_norm": 2.878241445403415, "language_loss": 0.72793961, "learning_rate": 6.467192433866411e-07, "loss": 0.74911219, "num_input_tokens_seen": 267053645, "step": 12382, "time_per_iteration": 4.307409763336182 }, { "auxiliary_loss_clip": 0.01005043, "auxiliary_loss_mlp": 0.01004958, "balance_loss_clip": 1.01257348, "balance_loss_mlp": 1.00380802, "epoch": 0.7445062377874643, "flos": 70559047704960.0, "grad_norm": 0.6531954820349142, "language_loss": 0.54669428, "learning_rate": 6.464325030925831e-07, "loss": 0.56679428, "num_input_tokens_seen": 267121830, "step": 12383, "time_per_iteration": 3.4219913482666016 }, { "auxiliary_loss_clip": 0.01085875, "auxiliary_loss_mlp": 0.01027749, "balance_loss_clip": 1.03667879, "balance_loss_mlp": 1.01498723, "epoch": 0.7445663610401323, "flos": 22164425458560.0, "grad_norm": 5.0246719589759365, "language_loss": 0.76023626, "learning_rate": 6.461458141259395e-07, "loss": 0.78137243, "num_input_tokens_seen": 267141145, "step": 12384, "time_per_iteration": 2.6512553691864014 }, { "auxiliary_loss_clip": 0.01098981, "auxiliary_loss_mlp": 0.01029554, "balance_loss_clip": 1.03816116, "balance_loss_mlp": 1.01680422, "epoch": 0.7446264842928002, "flos": 24170251714560.0, "grad_norm": 1.9156504433833408, "language_loss": 0.78836381, "learning_rate": 6.458591764975823e-07, "loss": 0.80964911, "num_input_tokens_seen": 267159280, "step": 12385, "time_per_iteration": 2.6723034381866455 }, { "auxiliary_loss_clip": 0.01078718, "auxiliary_loss_mlp": 0.01032784, "balance_loss_clip": 1.03725076, "balance_loss_mlp": 1.01855612, "epoch": 0.7446866075454682, "flos": 24134556574080.0, "grad_norm": 1.6864726540587271, "language_loss": 0.81386524, "learning_rate": 6.455725902183813e-07, "loss": 0.83498025, "num_input_tokens_seen": 267179390, "step": 12386, "time_per_iteration": 2.724527359008789 }, { "auxiliary_loss_clip": 0.01097105, "auxiliary_loss_mlp": 0.01034795, "balance_loss_clip": 1.03846228, "balance_loss_mlp": 1.02235591, "epoch": 0.7447467307981361, "flos": 23548063305600.0, "grad_norm": 1.6785981407963104, "language_loss": 0.71043932, "learning_rate": 6.452860552992037e-07, "loss": 0.73175836, "num_input_tokens_seen": 267198165, "step": 12387, "time_per_iteration": 2.7917346954345703 }, { "auxiliary_loss_clip": 0.0107995, "auxiliary_loss_mlp": 0.01031199, "balance_loss_clip": 1.03891492, "balance_loss_mlp": 1.01899815, "epoch": 0.7448068540508042, "flos": 19567832215680.0, "grad_norm": 2.0106336394947597, "language_loss": 0.70168763, "learning_rate": 6.449995717509138e-07, "loss": 0.72279912, "num_input_tokens_seen": 267214520, "step": 12388, "time_per_iteration": 2.831563949584961 }, { "auxiliary_loss_clip": 0.01099712, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.03740311, "balance_loss_mlp": 1.01846361, "epoch": 0.7448669773034721, "flos": 21839721488640.0, "grad_norm": 2.075908043210206, "language_loss": 0.84796858, "learning_rate": 6.447131395843761e-07, "loss": 0.86927676, "num_input_tokens_seen": 267236555, "step": 12389, "time_per_iteration": 2.6563961505889893 }, { "auxiliary_loss_clip": 0.01069109, "auxiliary_loss_mlp": 0.01035071, "balance_loss_clip": 1.03659904, "balance_loss_mlp": 1.02245224, "epoch": 0.7449271005561401, "flos": 25155389099520.0, "grad_norm": 2.0263392027511298, "language_loss": 0.79228258, "learning_rate": 6.444267588104526e-07, "loss": 0.81332433, "num_input_tokens_seen": 267254800, "step": 12390, "time_per_iteration": 2.756574869155884 }, { "auxiliary_loss_clip": 0.01089478, "auxiliary_loss_mlp": 0.0103054, "balance_loss_clip": 1.03688502, "balance_loss_mlp": 1.01727843, "epoch": 0.7449872238088081, "flos": 22273342473600.0, "grad_norm": 1.8599579606909906, "language_loss": 0.851529, "learning_rate": 6.441404294400014e-07, "loss": 0.87272918, "num_input_tokens_seen": 267274610, "step": 12391, "time_per_iteration": 2.6953816413879395 }, { "auxiliary_loss_clip": 0.01111566, "auxiliary_loss_mlp": 0.01028434, "balance_loss_clip": 1.03942573, "balance_loss_mlp": 1.0161674, "epoch": 0.745047347061476, "flos": 20594805966720.0, "grad_norm": 1.7091676728035188, "language_loss": 0.73478818, "learning_rate": 6.438541514838811e-07, "loss": 0.75618815, "num_input_tokens_seen": 267292600, "step": 12392, "time_per_iteration": 2.566464424133301 }, { "auxiliary_loss_clip": 0.010973, "auxiliary_loss_mlp": 0.01035758, "balance_loss_clip": 1.03854799, "balance_loss_mlp": 1.02366483, "epoch": 0.745107470314144, "flos": 22127545169280.0, "grad_norm": 3.0948074405421617, "language_loss": 0.76522237, "learning_rate": 6.435679249529487e-07, "loss": 0.78655297, "num_input_tokens_seen": 267311295, "step": 12393, "time_per_iteration": 2.614400625228882 }, { "auxiliary_loss_clip": 0.01100705, "auxiliary_loss_mlp": 0.0103966, "balance_loss_clip": 1.03918004, "balance_loss_mlp": 1.02523553, "epoch": 0.745167593566812, "flos": 22236498097920.0, "grad_norm": 1.8734262060070255, "language_loss": 0.72774941, "learning_rate": 6.432817498580552e-07, "loss": 0.74915308, "num_input_tokens_seen": 267328390, "step": 12394, "time_per_iteration": 2.6467761993408203 }, { "auxiliary_loss_clip": 0.01058489, "auxiliary_loss_mlp": 0.00770508, "balance_loss_clip": 1.04145324, "balance_loss_mlp": 1.0001545, "epoch": 0.74522771681948, "flos": 20666232161280.0, "grad_norm": 1.9226493220785308, "language_loss": 0.81523216, "learning_rate": 6.429956262100535e-07, "loss": 0.83352214, "num_input_tokens_seen": 267348185, "step": 12395, "time_per_iteration": 2.772284984588623 }, { "auxiliary_loss_clip": 0.0110524, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.03964758, "balance_loss_mlp": 1.0240705, "epoch": 0.7452878400721479, "flos": 21106999952640.0, "grad_norm": 1.9270177813162948, "language_loss": 0.7149328, "learning_rate": 6.427095540197937e-07, "loss": 0.73636222, "num_input_tokens_seen": 267367010, "step": 12396, "time_per_iteration": 2.6198830604553223 }, { "auxiliary_loss_clip": 0.0107235, "auxiliary_loss_mlp": 0.01033046, "balance_loss_clip": 1.03889275, "balance_loss_mlp": 1.02018356, "epoch": 0.7453479633248159, "flos": 26688056474880.0, "grad_norm": 1.7432262055203618, "language_loss": 0.68239546, "learning_rate": 6.424235332981245e-07, "loss": 0.70344937, "num_input_tokens_seen": 267386605, "step": 12397, "time_per_iteration": 2.8147408962249756 }, { "auxiliary_loss_clip": 0.01111263, "auxiliary_loss_mlp": 0.01038637, "balance_loss_clip": 1.03894281, "balance_loss_mlp": 1.02567935, "epoch": 0.7454080865774838, "flos": 17016056167680.0, "grad_norm": 1.7819734884556382, "language_loss": 0.77117336, "learning_rate": 6.421375640558908e-07, "loss": 0.79267234, "num_input_tokens_seen": 267404135, "step": 12398, "time_per_iteration": 2.561169385910034 }, { "auxiliary_loss_clip": 0.01100902, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.04031086, "balance_loss_mlp": 1.01657581, "epoch": 0.7454682098301518, "flos": 21323900229120.0, "grad_norm": 1.713165335415303, "language_loss": 0.779158, "learning_rate": 6.418516463039363e-07, "loss": 0.80046344, "num_input_tokens_seen": 267423120, "step": 12399, "time_per_iteration": 2.6413347721099854 }, { "auxiliary_loss_clip": 0.010824, "auxiliary_loss_mlp": 0.01035168, "balance_loss_clip": 1.03334904, "balance_loss_mlp": 1.02338409, "epoch": 0.7455283330828197, "flos": 17858341163520.0, "grad_norm": 2.1285775991405482, "language_loss": 0.73999, "learning_rate": 6.415657800531038e-07, "loss": 0.76116568, "num_input_tokens_seen": 267441250, "step": 12400, "time_per_iteration": 2.696606159210205 }, { "auxiliary_loss_clip": 0.01096917, "auxiliary_loss_mlp": 0.01030276, "balance_loss_clip": 1.03786886, "balance_loss_mlp": 1.01809835, "epoch": 0.7455884563354878, "flos": 30774259664640.0, "grad_norm": 2.4044760151763174, "language_loss": 0.82103872, "learning_rate": 6.412799653142327e-07, "loss": 0.84231067, "num_input_tokens_seen": 267462820, "step": 12401, "time_per_iteration": 2.700671434402466 }, { "auxiliary_loss_clip": 0.01078431, "auxiliary_loss_mlp": 0.01035329, "balance_loss_clip": 1.03934383, "balance_loss_mlp": 1.02312756, "epoch": 0.7456485795881557, "flos": 23185545292800.0, "grad_norm": 2.1019998768613326, "language_loss": 0.64676833, "learning_rate": 6.409942020981611e-07, "loss": 0.66790593, "num_input_tokens_seen": 267483065, "step": 12402, "time_per_iteration": 2.775984287261963 }, { "auxiliary_loss_clip": 0.01077021, "auxiliary_loss_mlp": 0.01033791, "balance_loss_clip": 1.03509498, "balance_loss_mlp": 1.02227569, "epoch": 0.7457087028408237, "flos": 38727144074880.0, "grad_norm": 1.560080300868097, "language_loss": 0.73373783, "learning_rate": 6.407084904157265e-07, "loss": 0.75484598, "num_input_tokens_seen": 267504825, "step": 12403, "time_per_iteration": 2.8398375511169434 }, { "auxiliary_loss_clip": 0.01002548, "auxiliary_loss_mlp": 0.01008627, "balance_loss_clip": 1.01085329, "balance_loss_mlp": 1.00753641, "epoch": 0.7457688260934917, "flos": 56043737337600.0, "grad_norm": 0.830633313503113, "language_loss": 0.58735222, "learning_rate": 6.404228302777621e-07, "loss": 0.60746402, "num_input_tokens_seen": 267559260, "step": 12404, "time_per_iteration": 3.018889904022217 }, { "auxiliary_loss_clip": 0.01110759, "auxiliary_loss_mlp": 0.01032429, "balance_loss_clip": 1.03871632, "balance_loss_mlp": 1.020383, "epoch": 0.7458289493461596, "flos": 20116152305280.0, "grad_norm": 1.8575983002348149, "language_loss": 0.77702922, "learning_rate": 6.401372216950995e-07, "loss": 0.79846108, "num_input_tokens_seen": 267578720, "step": 12405, "time_per_iteration": 2.607694625854492 }, { "auxiliary_loss_clip": 0.01083469, "auxiliary_loss_mlp": 0.01036873, "balance_loss_clip": 1.03548229, "balance_loss_mlp": 1.02420723, "epoch": 0.7458890725988276, "flos": 20193073280640.0, "grad_norm": 1.6476155913625474, "language_loss": 0.69351685, "learning_rate": 6.398516646785698e-07, "loss": 0.71472031, "num_input_tokens_seen": 267598250, "step": 12406, "time_per_iteration": 2.651949882507324 }, { "auxiliary_loss_clip": 0.01047021, "auxiliary_loss_mlp": 0.01036186, "balance_loss_clip": 1.03744388, "balance_loss_mlp": 1.02236354, "epoch": 0.7459491958514956, "flos": 17018749687680.0, "grad_norm": 2.2152803431091685, "language_loss": 0.65254861, "learning_rate": 6.39566159239002e-07, "loss": 0.67338073, "num_input_tokens_seen": 267615430, "step": 12407, "time_per_iteration": 2.761862277984619 }, { "auxiliary_loss_clip": 0.01070552, "auxiliary_loss_mlp": 0.01034751, "balance_loss_clip": 1.03763545, "balance_loss_mlp": 1.02068424, "epoch": 0.7460093191041636, "flos": 25078719519360.0, "grad_norm": 2.453425787686552, "language_loss": 0.72200561, "learning_rate": 6.392807053872212e-07, "loss": 0.74305862, "num_input_tokens_seen": 267635075, "step": 12408, "time_per_iteration": 2.7553751468658447 }, { "auxiliary_loss_clip": 0.01105957, "auxiliary_loss_mlp": 0.0103312, "balance_loss_clip": 1.03999674, "balance_loss_mlp": 1.01942849, "epoch": 0.7460694423568315, "flos": 21908525990400.0, "grad_norm": 2.19086035143854, "language_loss": 0.72995472, "learning_rate": 6.38995303134053e-07, "loss": 0.7513454, "num_input_tokens_seen": 267654105, "step": 12409, "time_per_iteration": 2.6748335361480713 }, { "auxiliary_loss_clip": 0.01097314, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.03749943, "balance_loss_mlp": 1.02024233, "epoch": 0.7461295656094995, "flos": 21215737399680.0, "grad_norm": 2.015553074030815, "language_loss": 0.65646017, "learning_rate": 6.38709952490319e-07, "loss": 0.67774916, "num_input_tokens_seen": 267673090, "step": 12410, "time_per_iteration": 2.599883794784546 }, { "auxiliary_loss_clip": 0.01094288, "auxiliary_loss_mlp": 0.00770134, "balance_loss_clip": 1.0380162, "balance_loss_mlp": 1.00011945, "epoch": 0.7461896888621674, "flos": 22346851656960.0, "grad_norm": 1.8387948527336508, "language_loss": 0.84203392, "learning_rate": 6.384246534668396e-07, "loss": 0.86067814, "num_input_tokens_seen": 267690605, "step": 12411, "time_per_iteration": 2.7593939304351807 }, { "auxiliary_loss_clip": 0.01076302, "auxiliary_loss_mlp": 0.01030521, "balance_loss_clip": 1.0369643, "balance_loss_mlp": 1.01747966, "epoch": 0.7462498121148354, "flos": 25482930243840.0, "grad_norm": 2.2444375236075578, "language_loss": 0.77899462, "learning_rate": 6.381394060744339e-07, "loss": 0.80006284, "num_input_tokens_seen": 267710540, "step": 12412, "time_per_iteration": 2.880466938018799 }, { "auxiliary_loss_clip": 0.01069141, "auxiliary_loss_mlp": 0.01041632, "balance_loss_clip": 1.03378701, "balance_loss_mlp": 1.02820313, "epoch": 0.7463099353675033, "flos": 33947936812800.0, "grad_norm": 2.442333824498856, "language_loss": 0.62740505, "learning_rate": 6.378542103239188e-07, "loss": 0.64851284, "num_input_tokens_seen": 267730780, "step": 12413, "time_per_iteration": 2.8031466007232666 }, { "auxiliary_loss_clip": 0.01023176, "auxiliary_loss_mlp": 0.00751261, "balance_loss_clip": 1.00943136, "balance_loss_mlp": 0.99959147, "epoch": 0.7463700586201714, "flos": 62767723691520.0, "grad_norm": 0.7172744197889728, "language_loss": 0.54801792, "learning_rate": 6.375690662261082e-07, "loss": 0.56576228, "num_input_tokens_seen": 267794240, "step": 12414, "time_per_iteration": 3.2076735496520996 }, { "auxiliary_loss_clip": 0.01081911, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.03365874, "balance_loss_mlp": 1.01806545, "epoch": 0.7464301818728393, "flos": 33432654257280.0, "grad_norm": 1.4875618615685628, "language_loss": 0.5517059, "learning_rate": 6.372839737918154e-07, "loss": 0.57284164, "num_input_tokens_seen": 267817190, "step": 12415, "time_per_iteration": 4.414318084716797 }, { "auxiliary_loss_clip": 0.0104777, "auxiliary_loss_mlp": 0.01036648, "balance_loss_clip": 1.03617668, "balance_loss_mlp": 1.02174664, "epoch": 0.7464903051255073, "flos": 26869872142080.0, "grad_norm": 1.6764979613333528, "language_loss": 0.75015157, "learning_rate": 6.369989330318506e-07, "loss": 0.77099568, "num_input_tokens_seen": 267836245, "step": 12416, "time_per_iteration": 2.831061840057373 }, { "auxiliary_loss_clip": 0.01060971, "auxiliary_loss_mlp": 0.01042536, "balance_loss_clip": 1.03266478, "balance_loss_mlp": 1.02845144, "epoch": 0.7465504283781753, "flos": 44086954775040.0, "grad_norm": 5.110704099754697, "language_loss": 0.69582009, "learning_rate": 6.367139439570233e-07, "loss": 0.71685511, "num_input_tokens_seen": 267858310, "step": 12417, "time_per_iteration": 6.061137676239014 }, { "auxiliary_loss_clip": 0.01087135, "auxiliary_loss_mlp": 0.010359, "balance_loss_clip": 1.04298329, "balance_loss_mlp": 1.02211332, "epoch": 0.7466105516308432, "flos": 19676102785920.0, "grad_norm": 1.7520602773189389, "language_loss": 0.73654354, "learning_rate": 6.364290065781392e-07, "loss": 0.75777388, "num_input_tokens_seen": 267876345, "step": 12418, "time_per_iteration": 2.719461441040039 }, { "auxiliary_loss_clip": 0.01101371, "auxiliary_loss_mlp": 0.01032166, "balance_loss_clip": 1.03970969, "balance_loss_mlp": 1.01958394, "epoch": 0.7466706748835112, "flos": 20520722165760.0, "grad_norm": 1.5723677716415394, "language_loss": 0.68733931, "learning_rate": 6.361441209060039e-07, "loss": 0.70867467, "num_input_tokens_seen": 267896740, "step": 12419, "time_per_iteration": 2.658419370651245 }, { "auxiliary_loss_clip": 0.01106886, "auxiliary_loss_mlp": 0.01034487, "balance_loss_clip": 1.03877735, "balance_loss_mlp": 1.0225246, "epoch": 0.7467307981361792, "flos": 21690260997120.0, "grad_norm": 2.325148718588452, "language_loss": 0.74999017, "learning_rate": 6.358592869514216e-07, "loss": 0.77140391, "num_input_tokens_seen": 267914765, "step": 12420, "time_per_iteration": 2.6232640743255615 }, { "auxiliary_loss_clip": 0.01105813, "auxiliary_loss_mlp": 0.0103159, "balance_loss_clip": 1.04157043, "balance_loss_mlp": 1.01868558, "epoch": 0.7467909213888472, "flos": 19573686132480.0, "grad_norm": 1.5853276507887042, "language_loss": 0.6715399, "learning_rate": 6.355745047251904e-07, "loss": 0.69291389, "num_input_tokens_seen": 267934085, "step": 12421, "time_per_iteration": 4.228281021118164 }, { "auxiliary_loss_clip": 0.01087742, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.03845739, "balance_loss_mlp": 1.02044845, "epoch": 0.7468510446415151, "flos": 23695225326720.0, "grad_norm": 1.7891201641771508, "language_loss": 0.72700393, "learning_rate": 6.352897742381107e-07, "loss": 0.74822545, "num_input_tokens_seen": 267955170, "step": 12422, "time_per_iteration": 2.678581953048706 }, { "auxiliary_loss_clip": 0.0107257, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.03739822, "balance_loss_mlp": 1.02140832, "epoch": 0.7469111678941831, "flos": 29315783831040.0, "grad_norm": 1.7729815610764, "language_loss": 0.7519784, "learning_rate": 6.350050955009796e-07, "loss": 0.77304733, "num_input_tokens_seen": 267974980, "step": 12423, "time_per_iteration": 2.884932518005371 }, { "auxiliary_loss_clip": 0.01097508, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 1.03815055, "balance_loss_mlp": 1.01491261, "epoch": 0.746971291146851, "flos": 21798639308160.0, "grad_norm": 1.3102627766091752, "language_loss": 0.67454731, "learning_rate": 6.347204685245929e-07, "loss": 0.69578731, "num_input_tokens_seen": 267994985, "step": 12424, "time_per_iteration": 2.665360927581787 }, { "auxiliary_loss_clip": 0.01106731, "auxiliary_loss_mlp": 0.0103674, "balance_loss_clip": 1.04188585, "balance_loss_mlp": 1.02385378, "epoch": 0.747031414399519, "flos": 36245070368640.0, "grad_norm": 1.8168677421099624, "language_loss": 0.7413224, "learning_rate": 6.344358933197418e-07, "loss": 0.76275706, "num_input_tokens_seen": 268014985, "step": 12425, "time_per_iteration": 2.684622049331665 }, { "auxiliary_loss_clip": 0.01071034, "auxiliary_loss_mlp": 0.01034399, "balance_loss_clip": 1.03520596, "balance_loss_mlp": 1.0205828, "epoch": 0.7470915376521869, "flos": 19974916028160.0, "grad_norm": 8.361913341794455, "language_loss": 0.69433403, "learning_rate": 6.341513698972194e-07, "loss": 0.71538836, "num_input_tokens_seen": 268034395, "step": 12426, "time_per_iteration": 2.686992645263672 }, { "auxiliary_loss_clip": 0.01070297, "auxiliary_loss_mlp": 0.01035278, "balance_loss_clip": 1.03655338, "balance_loss_mlp": 1.02329111, "epoch": 0.747151660904855, "flos": 20084299920000.0, "grad_norm": 1.4050021872275102, "language_loss": 0.65497875, "learning_rate": 6.338668982678139e-07, "loss": 0.67603451, "num_input_tokens_seen": 268054485, "step": 12427, "time_per_iteration": 2.8737995624542236 }, { "auxiliary_loss_clip": 0.0111177, "auxiliary_loss_mlp": 0.01030082, "balance_loss_clip": 1.03934562, "balance_loss_mlp": 1.01686215, "epoch": 0.7472117841575229, "flos": 16290373697280.0, "grad_norm": 1.6443370194470839, "language_loss": 0.74700832, "learning_rate": 6.335824784423118e-07, "loss": 0.7684269, "num_input_tokens_seen": 268072250, "step": 12428, "time_per_iteration": 2.5923843383789062 }, { "auxiliary_loss_clip": 0.01105561, "auxiliary_loss_mlp": 0.0103113, "balance_loss_clip": 1.03948128, "balance_loss_mlp": 1.01726604, "epoch": 0.7472719074101909, "flos": 21389939383680.0, "grad_norm": 1.8997644217403626, "language_loss": 0.5859766, "learning_rate": 6.33298110431499e-07, "loss": 0.60734349, "num_input_tokens_seen": 268089840, "step": 12429, "time_per_iteration": 2.673205614089966 }, { "auxiliary_loss_clip": 0.01100742, "auxiliary_loss_mlp": 0.01035285, "balance_loss_clip": 1.0397048, "balance_loss_mlp": 1.02210021, "epoch": 0.7473320306628589, "flos": 29643289061760.0, "grad_norm": 2.191924076091365, "language_loss": 0.60676718, "learning_rate": 6.330137942461595e-07, "loss": 0.62812746, "num_input_tokens_seen": 268109360, "step": 12430, "time_per_iteration": 2.695838212966919 }, { "auxiliary_loss_clip": 0.01089402, "auxiliary_loss_mlp": 0.01035646, "balance_loss_clip": 1.0370059, "balance_loss_mlp": 1.02266431, "epoch": 0.7473921539155268, "flos": 24136100858880.0, "grad_norm": 1.60839761436318, "language_loss": 0.75666201, "learning_rate": 6.327295298970734e-07, "loss": 0.7779125, "num_input_tokens_seen": 268131840, "step": 12431, "time_per_iteration": 2.7131593227386475 }, { "auxiliary_loss_clip": 0.01098694, "auxiliary_loss_mlp": 0.01031285, "balance_loss_clip": 1.03696167, "balance_loss_mlp": 1.01853514, "epoch": 0.7474522771681948, "flos": 17487958072320.0, "grad_norm": 1.8735643765316532, "language_loss": 0.75119841, "learning_rate": 6.32445317395021e-07, "loss": 0.77249819, "num_input_tokens_seen": 268148300, "step": 12432, "time_per_iteration": 2.596440315246582 }, { "auxiliary_loss_clip": 0.01088473, "auxiliary_loss_mlp": 0.01036339, "balance_loss_clip": 1.03782606, "balance_loss_mlp": 1.02223635, "epoch": 0.7475124004208628, "flos": 16727298733440.0, "grad_norm": 2.50552734802935, "language_loss": 0.69950736, "learning_rate": 6.321611567507787e-07, "loss": 0.72075546, "num_input_tokens_seen": 268166450, "step": 12433, "time_per_iteration": 2.606110095977783 }, { "auxiliary_loss_clip": 0.01063022, "auxiliary_loss_mlp": 0.01031886, "balance_loss_clip": 1.03389204, "balance_loss_mlp": 1.01835036, "epoch": 0.7475725236735308, "flos": 19720237622400.0, "grad_norm": 2.703159081845411, "language_loss": 0.67130244, "learning_rate": 6.318770479751232e-07, "loss": 0.6922515, "num_input_tokens_seen": 268186165, "step": 12434, "time_per_iteration": 2.751291513442993 }, { "auxiliary_loss_clip": 0.01105439, "auxiliary_loss_mlp": 0.01035376, "balance_loss_clip": 1.03803849, "balance_loss_mlp": 1.02368116, "epoch": 0.7476326469261987, "flos": 26286000566400.0, "grad_norm": 3.4930601864100224, "language_loss": 0.7979542, "learning_rate": 6.315929910788263e-07, "loss": 0.8193624, "num_input_tokens_seen": 268208145, "step": 12435, "time_per_iteration": 2.6472816467285156 }, { "auxiliary_loss_clip": 0.01083734, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.03813887, "balance_loss_mlp": 1.01502252, "epoch": 0.7476927701788667, "flos": 31831828824960.0, "grad_norm": 1.8861832027521432, "language_loss": 0.68124855, "learning_rate": 6.313089860726604e-07, "loss": 0.70236325, "num_input_tokens_seen": 268228345, "step": 12436, "time_per_iteration": 2.813854694366455 }, { "auxiliary_loss_clip": 0.0108534, "auxiliary_loss_mlp": 0.01034815, "balance_loss_clip": 1.0374372, "balance_loss_mlp": 1.02242923, "epoch": 0.7477528934315346, "flos": 31795487239680.0, "grad_norm": 1.9570276627413858, "language_loss": 0.70576406, "learning_rate": 6.31025032967396e-07, "loss": 0.72696555, "num_input_tokens_seen": 268250260, "step": 12437, "time_per_iteration": 2.7825896739959717 }, { "auxiliary_loss_clip": 0.01071415, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.03356171, "balance_loss_mlp": 1.01697707, "epoch": 0.7478130166842026, "flos": 20371979946240.0, "grad_norm": 2.395892152897482, "language_loss": 0.67251343, "learning_rate": 6.307411317737986e-07, "loss": 0.69351262, "num_input_tokens_seen": 268268440, "step": 12438, "time_per_iteration": 2.706458568572998 }, { "auxiliary_loss_clip": 0.01087999, "auxiliary_loss_mlp": 0.01035267, "balance_loss_clip": 1.03646779, "balance_loss_mlp": 1.0229404, "epoch": 0.7478731399368705, "flos": 18148930191360.0, "grad_norm": 1.593097914021623, "language_loss": 0.8085202, "learning_rate": 6.304572825026344e-07, "loss": 0.8297528, "num_input_tokens_seen": 268285765, "step": 12439, "time_per_iteration": 2.665294647216797 }, { "auxiliary_loss_clip": 0.01074236, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.03548503, "balance_loss_mlp": 1.02805412, "epoch": 0.7479332631895386, "flos": 15267889146240.0, "grad_norm": 2.6477676249196334, "language_loss": 0.70738852, "learning_rate": 6.301734851646674e-07, "loss": 0.72853136, "num_input_tokens_seen": 268304015, "step": 12440, "time_per_iteration": 2.7106735706329346 }, { "auxiliary_loss_clip": 0.01088049, "auxiliary_loss_mlp": 0.01026861, "balance_loss_clip": 1.04011965, "balance_loss_mlp": 1.01467144, "epoch": 0.7479933864422065, "flos": 21142515525120.0, "grad_norm": 1.6270418049825819, "language_loss": 0.74380887, "learning_rate": 6.298897397706597e-07, "loss": 0.7649579, "num_input_tokens_seen": 268323290, "step": 12441, "time_per_iteration": 2.7022409439086914 }, { "auxiliary_loss_clip": 0.01105099, "auxiliary_loss_mlp": 0.00770813, "balance_loss_clip": 1.04095459, "balance_loss_mlp": 1.00020576, "epoch": 0.7480535096948745, "flos": 14392027912320.0, "grad_norm": 2.187499472876037, "language_loss": 0.82711899, "learning_rate": 6.296060463313698e-07, "loss": 0.84587812, "num_input_tokens_seen": 268339490, "step": 12442, "time_per_iteration": 2.7588963508605957 }, { "auxiliary_loss_clip": 0.0105579, "auxiliary_loss_mlp": 0.01031459, "balance_loss_clip": 1.03666043, "balance_loss_mlp": 1.01823914, "epoch": 0.7481136329475425, "flos": 27344683048320.0, "grad_norm": 2.073136454951009, "language_loss": 0.63220263, "learning_rate": 6.293224048575565e-07, "loss": 0.65307516, "num_input_tokens_seen": 268359865, "step": 12443, "time_per_iteration": 2.874648094177246 }, { "auxiliary_loss_clip": 0.01067932, "auxiliary_loss_mlp": 0.0102658, "balance_loss_clip": 1.03455901, "balance_loss_mlp": 1.01451015, "epoch": 0.7481737562002104, "flos": 19531454716800.0, "grad_norm": 2.062388953360283, "language_loss": 0.7137714, "learning_rate": 6.29038815359975e-07, "loss": 0.73471653, "num_input_tokens_seen": 268377065, "step": 12444, "time_per_iteration": 2.703878402709961 }, { "auxiliary_loss_clip": 0.01059747, "auxiliary_loss_mlp": 0.01031938, "balance_loss_clip": 1.03627777, "balance_loss_mlp": 1.01890206, "epoch": 0.7482338794528784, "flos": 21760035166080.0, "grad_norm": 1.378277825499583, "language_loss": 0.69101679, "learning_rate": 6.287552778493786e-07, "loss": 0.71193373, "num_input_tokens_seen": 268396935, "step": 12445, "time_per_iteration": 2.757577657699585 }, { "auxiliary_loss_clip": 0.01098864, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.03871107, "balance_loss_mlp": 1.01329112, "epoch": 0.7482940027055464, "flos": 18697358021760.0, "grad_norm": 1.944144924482792, "language_loss": 0.74288422, "learning_rate": 6.28471792336519e-07, "loss": 0.76412767, "num_input_tokens_seen": 268414460, "step": 12446, "time_per_iteration": 2.69356107711792 }, { "auxiliary_loss_clip": 0.01094765, "auxiliary_loss_mlp": 0.00771514, "balance_loss_clip": 1.04004169, "balance_loss_mlp": 1.0002172, "epoch": 0.7483541259582144, "flos": 15998024903040.0, "grad_norm": 2.4465560126403245, "language_loss": 0.7326262, "learning_rate": 6.281883588321475e-07, "loss": 0.75128901, "num_input_tokens_seen": 268432225, "step": 12447, "time_per_iteration": 2.662238597869873 }, { "auxiliary_loss_clip": 0.01068097, "auxiliary_loss_mlp": 0.01031231, "balance_loss_clip": 1.03563976, "balance_loss_mlp": 1.0193516, "epoch": 0.7484142492108823, "flos": 25556295772800.0, "grad_norm": 2.4715537752348906, "language_loss": 0.7231704, "learning_rate": 6.279049773470109e-07, "loss": 0.74416363, "num_input_tokens_seen": 268449270, "step": 12448, "time_per_iteration": 2.7589666843414307 }, { "auxiliary_loss_clip": 0.01113987, "auxiliary_loss_mlp": 0.01037666, "balance_loss_clip": 1.04052019, "balance_loss_mlp": 1.02560151, "epoch": 0.7484743724635503, "flos": 22887737631360.0, "grad_norm": 1.8427048424278483, "language_loss": 0.73759341, "learning_rate": 6.276216478918543e-07, "loss": 0.75910997, "num_input_tokens_seen": 268467250, "step": 12449, "time_per_iteration": 2.6071417331695557 }, { "auxiliary_loss_clip": 0.01076255, "auxiliary_loss_mlp": 0.01037131, "balance_loss_clip": 1.03802109, "balance_loss_mlp": 1.02391624, "epoch": 0.7485344957162182, "flos": 25300288563840.0, "grad_norm": 2.0043420955718716, "language_loss": 0.6146363, "learning_rate": 6.273383704774225e-07, "loss": 0.6357702, "num_input_tokens_seen": 268487270, "step": 12450, "time_per_iteration": 2.7463302612304688 }, { "auxiliary_loss_clip": 0.01106441, "auxiliary_loss_mlp": 0.01026536, "balance_loss_clip": 1.03821647, "balance_loss_mlp": 1.01458502, "epoch": 0.7485946189688862, "flos": 27053016612480.0, "grad_norm": 1.9632558902155064, "language_loss": 0.70478344, "learning_rate": 6.270551451144577e-07, "loss": 0.7261132, "num_input_tokens_seen": 268508020, "step": 12451, "time_per_iteration": 2.632495641708374 }, { "auxiliary_loss_clip": 0.01103126, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.03716731, "balance_loss_mlp": 1.0168184, "epoch": 0.7486547422215541, "flos": 26906752431360.0, "grad_norm": 2.915106727246987, "language_loss": 0.80665791, "learning_rate": 6.267719718136988e-07, "loss": 0.82798392, "num_input_tokens_seen": 268527375, "step": 12452, "time_per_iteration": 2.6505486965179443 }, { "auxiliary_loss_clip": 0.01119519, "auxiliary_loss_mlp": 0.0103336, "balance_loss_clip": 1.04324985, "balance_loss_mlp": 1.02005577, "epoch": 0.7487148654742222, "flos": 22346277039360.0, "grad_norm": 2.8493444529110215, "language_loss": 0.71248496, "learning_rate": 6.264888505858843e-07, "loss": 0.73401374, "num_input_tokens_seen": 268544870, "step": 12453, "time_per_iteration": 2.6861732006073 }, { "auxiliary_loss_clip": 0.01091229, "auxiliary_loss_mlp": 0.01034252, "balance_loss_clip": 1.03970766, "balance_loss_mlp": 1.02196777, "epoch": 0.7487749887268901, "flos": 23038814234880.0, "grad_norm": 1.5893693791461498, "language_loss": 0.73979241, "learning_rate": 6.262057814417517e-07, "loss": 0.76104718, "num_input_tokens_seen": 268564580, "step": 12454, "time_per_iteration": 2.716642379760742 }, { "auxiliary_loss_clip": 0.0100113, "auxiliary_loss_mlp": 0.01001978, "balance_loss_clip": 1.00717449, "balance_loss_mlp": 1.00067317, "epoch": 0.7488351119795581, "flos": 71525294536320.0, "grad_norm": 0.7358432419267441, "language_loss": 0.59396183, "learning_rate": 6.259227643920322e-07, "loss": 0.61399293, "num_input_tokens_seen": 268629550, "step": 12455, "time_per_iteration": 4.886117935180664 }, { "auxiliary_loss_clip": 0.01072127, "auxiliary_loss_mlp": 0.01029798, "balance_loss_clip": 1.0343852, "balance_loss_mlp": 1.01737666, "epoch": 0.748895235232226, "flos": 17196255722880.0, "grad_norm": 2.489880729520255, "language_loss": 0.79817784, "learning_rate": 6.256397994474592e-07, "loss": 0.81919706, "num_input_tokens_seen": 268646645, "step": 12456, "time_per_iteration": 5.9515721797943115 }, { "auxiliary_loss_clip": 0.01020316, "auxiliary_loss_mlp": 0.01001663, "balance_loss_clip": 1.00686383, "balance_loss_mlp": 1.00054216, "epoch": 0.748955358484894, "flos": 58979256336000.0, "grad_norm": 0.849157440562182, "language_loss": 0.61421359, "learning_rate": 6.25356886618763e-07, "loss": 0.63443339, "num_input_tokens_seen": 268702275, "step": 12457, "time_per_iteration": 3.1303980350494385 }, { "auxiliary_loss_clip": 0.01098576, "auxiliary_loss_mlp": 0.01035418, "balance_loss_clip": 1.04226291, "balance_loss_mlp": 1.02326477, "epoch": 0.749015481737562, "flos": 11360413054080.0, "grad_norm": 1.9444047716710122, "language_loss": 0.6761775, "learning_rate": 6.250740259166711e-07, "loss": 0.6975174, "num_input_tokens_seen": 268716265, "step": 12458, "time_per_iteration": 2.665384292602539 }, { "auxiliary_loss_clip": 0.0105583, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.03316355, "balance_loss_mlp": 1.02080858, "epoch": 0.74907560499023, "flos": 21106497162240.0, "grad_norm": 2.619057127646577, "language_loss": 0.79952264, "learning_rate": 6.247912173519106e-07, "loss": 0.82040823, "num_input_tokens_seen": 268734330, "step": 12459, "time_per_iteration": 2.754957675933838 }, { "auxiliary_loss_clip": 0.01072944, "auxiliary_loss_mlp": 0.01036735, "balance_loss_clip": 1.0369221, "balance_loss_mlp": 1.02394927, "epoch": 0.749135728242898, "flos": 22268027260800.0, "grad_norm": 1.4984057584596764, "language_loss": 0.80603898, "learning_rate": 6.245084609352043e-07, "loss": 0.82713568, "num_input_tokens_seen": 268753500, "step": 12460, "time_per_iteration": 4.2594664096832275 }, { "auxiliary_loss_clip": 0.01082271, "auxiliary_loss_mlp": 0.01031927, "balance_loss_clip": 1.03578806, "balance_loss_mlp": 1.01876581, "epoch": 0.7491958514955659, "flos": 24057527857920.0, "grad_norm": 1.80824785320189, "language_loss": 0.85877681, "learning_rate": 6.242257566772755e-07, "loss": 0.87991881, "num_input_tokens_seen": 268772055, "step": 12461, "time_per_iteration": 2.6852405071258545 }, { "auxiliary_loss_clip": 0.01093212, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 1.03965092, "balance_loss_mlp": 1.02309084, "epoch": 0.7492559747482339, "flos": 24492118510080.0, "grad_norm": 1.8962735896690046, "language_loss": 0.69416398, "learning_rate": 6.239431045888435e-07, "loss": 0.71545118, "num_input_tokens_seen": 268792265, "step": 12462, "time_per_iteration": 2.768845319747925 }, { "auxiliary_loss_clip": 0.01110765, "auxiliary_loss_mlp": 0.01033779, "balance_loss_clip": 1.03923655, "balance_loss_mlp": 1.02101731, "epoch": 0.7493160980009018, "flos": 27745338326400.0, "grad_norm": 2.365885457635203, "language_loss": 0.7031799, "learning_rate": 6.236605046806267e-07, "loss": 0.72462535, "num_input_tokens_seen": 268812735, "step": 12463, "time_per_iteration": 2.6340458393096924 }, { "auxiliary_loss_clip": 0.01074204, "auxiliary_loss_mlp": 0.01032497, "balance_loss_clip": 1.03618455, "balance_loss_mlp": 1.02071965, "epoch": 0.7493762212535698, "flos": 30226190970240.0, "grad_norm": 2.141316726058728, "language_loss": 0.77804828, "learning_rate": 6.233779569633419e-07, "loss": 0.7991153, "num_input_tokens_seen": 268833090, "step": 12464, "time_per_iteration": 2.751758098602295 }, { "auxiliary_loss_clip": 0.0108502, "auxiliary_loss_mlp": 0.01026607, "balance_loss_clip": 1.03515768, "balance_loss_mlp": 1.01502621, "epoch": 0.7494363445062378, "flos": 21944472526080.0, "grad_norm": 1.8114572432161449, "language_loss": 0.78449178, "learning_rate": 6.230954614477034e-07, "loss": 0.80560803, "num_input_tokens_seen": 268851880, "step": 12465, "time_per_iteration": 2.6739721298217773 }, { "auxiliary_loss_clip": 0.0108024, "auxiliary_loss_mlp": 0.01039302, "balance_loss_clip": 1.03586817, "balance_loss_mlp": 1.02480614, "epoch": 0.7494964677589058, "flos": 12490342162560.0, "grad_norm": 2.3217514076277697, "language_loss": 0.74179816, "learning_rate": 6.22813018144422e-07, "loss": 0.76299357, "num_input_tokens_seen": 268867910, "step": 12466, "time_per_iteration": 2.63236665725708 }, { "auxiliary_loss_clip": 0.01098476, "auxiliary_loss_mlp": 0.01036506, "balance_loss_clip": 1.03608537, "balance_loss_mlp": 1.02381599, "epoch": 0.7495565910115737, "flos": 21653057485440.0, "grad_norm": 2.1977964760321362, "language_loss": 0.66625774, "learning_rate": 6.22530627064209e-07, "loss": 0.68760759, "num_input_tokens_seen": 268887260, "step": 12467, "time_per_iteration": 2.6381313800811768 }, { "auxiliary_loss_clip": 0.01062241, "auxiliary_loss_mlp": 0.00773108, "balance_loss_clip": 1.03538942, "balance_loss_mlp": 1.00025678, "epoch": 0.7496167142642417, "flos": 15268535591040.0, "grad_norm": 2.2660950859773425, "language_loss": 0.76690638, "learning_rate": 6.222482882177735e-07, "loss": 0.7852599, "num_input_tokens_seen": 268902520, "step": 12468, "time_per_iteration": 2.717893123626709 }, { "auxiliary_loss_clip": 0.01071579, "auxiliary_loss_mlp": 0.01029806, "balance_loss_clip": 1.03752029, "balance_loss_mlp": 1.0167706, "epoch": 0.7496768375169096, "flos": 22054933825920.0, "grad_norm": 2.258197229303168, "language_loss": 0.69274288, "learning_rate": 6.219660016158201e-07, "loss": 0.7137568, "num_input_tokens_seen": 268920970, "step": 12469, "time_per_iteration": 2.7141220569610596 }, { "auxiliary_loss_clip": 0.01089029, "auxiliary_loss_mlp": 0.01032283, "balance_loss_clip": 1.03684139, "balance_loss_mlp": 1.01970625, "epoch": 0.7497369607695776, "flos": 19057038860160.0, "grad_norm": 1.9749809581101754, "language_loss": 0.69305575, "learning_rate": 6.216837672690543e-07, "loss": 0.71426892, "num_input_tokens_seen": 268936600, "step": 12470, "time_per_iteration": 2.736288547515869 }, { "auxiliary_loss_clip": 0.01082647, "auxiliary_loss_mlp": 0.01033832, "balance_loss_clip": 1.03593028, "balance_loss_mlp": 1.01967597, "epoch": 0.7497970840222457, "flos": 21617434172160.0, "grad_norm": 1.8937148851838135, "language_loss": 0.75178516, "learning_rate": 6.214015851881793e-07, "loss": 0.77294993, "num_input_tokens_seen": 268956560, "step": 12471, "time_per_iteration": 2.664313554763794 }, { "auxiliary_loss_clip": 0.01084709, "auxiliary_loss_mlp": 0.01035791, "balance_loss_clip": 1.0353353, "balance_loss_mlp": 1.02159286, "epoch": 0.7498572072749136, "flos": 13735580906880.0, "grad_norm": 2.796464416827846, "language_loss": 0.77233744, "learning_rate": 6.211194553838929e-07, "loss": 0.79354239, "num_input_tokens_seen": 268973945, "step": 12472, "time_per_iteration": 2.657557487487793 }, { "auxiliary_loss_clip": 0.01094543, "auxiliary_loss_mlp": 0.00769819, "balance_loss_clip": 1.03535211, "balance_loss_mlp": 1.00018263, "epoch": 0.7499173305275816, "flos": 22966526113920.0, "grad_norm": 1.5448300611730317, "language_loss": 0.84419262, "learning_rate": 6.208373778668951e-07, "loss": 0.8628363, "num_input_tokens_seen": 268993245, "step": 12473, "time_per_iteration": 2.7043027877807617 }, { "auxiliary_loss_clip": 0.01079095, "auxiliary_loss_mlp": 0.01031863, "balance_loss_clip": 1.03500473, "balance_loss_mlp": 1.01823711, "epoch": 0.7499774537802495, "flos": 22740467869440.0, "grad_norm": 2.038219260751869, "language_loss": 0.7402907, "learning_rate": 6.205553526478829e-07, "loss": 0.76140028, "num_input_tokens_seen": 269012125, "step": 12474, "time_per_iteration": 2.74438214302063 }, { "auxiliary_loss_clip": 0.01088373, "auxiliary_loss_mlp": 0.01038948, "balance_loss_clip": 1.03736258, "balance_loss_mlp": 1.02587676, "epoch": 0.7500375770329175, "flos": 18296559089280.0, "grad_norm": 2.2001386818620263, "language_loss": 0.74208605, "learning_rate": 6.202733797375492e-07, "loss": 0.76335931, "num_input_tokens_seen": 269030545, "step": 12475, "time_per_iteration": 2.6366353034973145 }, { "auxiliary_loss_clip": 0.0110606, "auxiliary_loss_mlp": 0.01035347, "balance_loss_clip": 1.03846169, "balance_loss_mlp": 1.02150083, "epoch": 0.7500977002855854, "flos": 19169978198400.0, "grad_norm": 1.7274221168077015, "language_loss": 0.80403024, "learning_rate": 6.199914591465878e-07, "loss": 0.82544434, "num_input_tokens_seen": 269048180, "step": 12476, "time_per_iteration": 2.622103691101074 }, { "auxiliary_loss_clip": 0.01076959, "auxiliary_loss_mlp": 0.01034735, "balance_loss_clip": 1.0369035, "balance_loss_mlp": 1.02214074, "epoch": 0.7501578235382534, "flos": 22163886754560.0, "grad_norm": 1.9425018569967707, "language_loss": 0.77756828, "learning_rate": 6.19709590885688e-07, "loss": 0.79868519, "num_input_tokens_seen": 269068600, "step": 12477, "time_per_iteration": 2.6923439502716064 }, { "auxiliary_loss_clip": 0.01010213, "auxiliary_loss_mlp": 0.01001269, "balance_loss_clip": 1.00773573, "balance_loss_mlp": 1.00022018, "epoch": 0.7502179467909214, "flos": 64465040033280.0, "grad_norm": 0.8187484606770943, "language_loss": 0.54458755, "learning_rate": 6.194277749655394e-07, "loss": 0.56470239, "num_input_tokens_seen": 269119045, "step": 12478, "time_per_iteration": 3.204738140106201 }, { "auxiliary_loss_clip": 0.0108167, "auxiliary_loss_mlp": 0.01032285, "balance_loss_clip": 1.03592229, "balance_loss_mlp": 1.02035236, "epoch": 0.7502780700435894, "flos": 20478275268480.0, "grad_norm": 1.6024244799309337, "language_loss": 0.80039358, "learning_rate": 6.191460113968272e-07, "loss": 0.82153314, "num_input_tokens_seen": 269136755, "step": 12479, "time_per_iteration": 2.690080165863037 }, { "auxiliary_loss_clip": 0.01104663, "auxiliary_loss_mlp": 0.01038768, "balance_loss_clip": 1.03951621, "balance_loss_mlp": 1.02505875, "epoch": 0.7503381932962573, "flos": 20445273648000.0, "grad_norm": 2.9599564657820805, "language_loss": 0.62753713, "learning_rate": 6.188643001902369e-07, "loss": 0.64897144, "num_input_tokens_seen": 269156120, "step": 12480, "time_per_iteration": 2.6162097454071045 }, { "auxiliary_loss_clip": 0.0108428, "auxiliary_loss_mlp": 0.01034909, "balance_loss_clip": 1.03689194, "balance_loss_mlp": 1.02272034, "epoch": 0.7503983165489253, "flos": 22381936266240.0, "grad_norm": 2.3943314671981955, "language_loss": 0.78243744, "learning_rate": 6.185826413564512e-07, "loss": 0.80362934, "num_input_tokens_seen": 269175650, "step": 12481, "time_per_iteration": 2.669548988342285 }, { "auxiliary_loss_clip": 0.0106997, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.0354079, "balance_loss_mlp": 1.02479172, "epoch": 0.7504584398015932, "flos": 24899453717760.0, "grad_norm": 1.8872543755880817, "language_loss": 0.71297598, "learning_rate": 6.183010349061501e-07, "loss": 0.73405796, "num_input_tokens_seen": 269197080, "step": 12482, "time_per_iteration": 2.7567055225372314 }, { "auxiliary_loss_clip": 0.01111149, "auxiliary_loss_mlp": 0.01035657, "balance_loss_clip": 1.03868306, "balance_loss_mlp": 1.02335453, "epoch": 0.7505185630542612, "flos": 25885237547520.0, "grad_norm": 1.839701731381712, "language_loss": 0.6994698, "learning_rate": 6.180194808500118e-07, "loss": 0.72093785, "num_input_tokens_seen": 269218600, "step": 12483, "time_per_iteration": 2.606757402420044 }, { "auxiliary_loss_clip": 0.01110582, "auxiliary_loss_mlp": 0.01027036, "balance_loss_clip": 1.03916931, "balance_loss_mlp": 1.01574111, "epoch": 0.7505786863069293, "flos": 23143852581120.0, "grad_norm": 2.0560449071537943, "language_loss": 0.74602097, "learning_rate": 6.177379791987131e-07, "loss": 0.76739717, "num_input_tokens_seen": 269239245, "step": 12484, "time_per_iteration": 2.618504285812378 }, { "auxiliary_loss_clip": 0.01087809, "auxiliary_loss_mlp": 0.01029947, "balance_loss_clip": 1.03753555, "balance_loss_mlp": 1.01745975, "epoch": 0.7506388095595972, "flos": 16983377769600.0, "grad_norm": 1.9415131613647365, "language_loss": 0.84624791, "learning_rate": 6.174565299629295e-07, "loss": 0.86742544, "num_input_tokens_seen": 269258520, "step": 12485, "time_per_iteration": 2.697805404663086 }, { "auxiliary_loss_clip": 0.01072795, "auxiliary_loss_mlp": 0.0103091, "balance_loss_clip": 1.03648996, "balance_loss_mlp": 1.01851201, "epoch": 0.7506989328122652, "flos": 22344984149760.0, "grad_norm": 1.6745365119179365, "language_loss": 0.78448224, "learning_rate": 6.171751331533323e-07, "loss": 0.80551928, "num_input_tokens_seen": 269278320, "step": 12486, "time_per_iteration": 2.714510202407837 }, { "auxiliary_loss_clip": 0.01099772, "auxiliary_loss_mlp": 0.01033013, "balance_loss_clip": 1.03659987, "balance_loss_mlp": 1.01920259, "epoch": 0.7507590560649331, "flos": 25776069137280.0, "grad_norm": 2.4012807743392477, "language_loss": 0.72792411, "learning_rate": 6.168937887805932e-07, "loss": 0.74925202, "num_input_tokens_seen": 269298025, "step": 12487, "time_per_iteration": 2.7071502208709717 }, { "auxiliary_loss_clip": 0.01085256, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.0348568, "balance_loss_mlp": 1.01866841, "epoch": 0.7508191793176011, "flos": 24279420124800.0, "grad_norm": 4.846564325155201, "language_loss": 0.67752981, "learning_rate": 6.166124968553801e-07, "loss": 0.69869953, "num_input_tokens_seen": 269316770, "step": 12488, "time_per_iteration": 2.644109010696411 }, { "auxiliary_loss_clip": 0.01045289, "auxiliary_loss_mlp": 0.01033728, "balance_loss_clip": 1.0341351, "balance_loss_mlp": 1.02041197, "epoch": 0.750879302570269, "flos": 19899575251200.0, "grad_norm": 1.8778545582321347, "language_loss": 0.77185404, "learning_rate": 6.163312573883592e-07, "loss": 0.7926442, "num_input_tokens_seen": 269334755, "step": 12489, "time_per_iteration": 2.73962664604187 }, { "auxiliary_loss_clip": 0.01096988, "auxiliary_loss_mlp": 0.01031266, "balance_loss_clip": 1.03820062, "balance_loss_mlp": 1.01943493, "epoch": 0.750939425822937, "flos": 29205681667200.0, "grad_norm": 2.4146735284189393, "language_loss": 0.75405651, "learning_rate": 6.160500703901956e-07, "loss": 0.77533901, "num_input_tokens_seen": 269353810, "step": 12490, "time_per_iteration": 2.6824519634246826 }, { "auxiliary_loss_clip": 0.01109505, "auxiliary_loss_mlp": 0.0103058, "balance_loss_clip": 1.03855062, "balance_loss_mlp": 1.01803946, "epoch": 0.750999549075605, "flos": 21142300043520.0, "grad_norm": 1.5627078953093116, "language_loss": 0.78168178, "learning_rate": 6.157689358715527e-07, "loss": 0.80308264, "num_input_tokens_seen": 269372910, "step": 12491, "time_per_iteration": 2.6018178462982178 }, { "auxiliary_loss_clip": 0.01097672, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.03719735, "balance_loss_mlp": 1.02034187, "epoch": 0.751059672328273, "flos": 23547740083200.0, "grad_norm": 1.6628642916222176, "language_loss": 0.76332009, "learning_rate": 6.154878538430899e-07, "loss": 0.7846154, "num_input_tokens_seen": 269391545, "step": 12492, "time_per_iteration": 2.691298484802246 }, { "auxiliary_loss_clip": 0.01078534, "auxiliary_loss_mlp": 0.01032513, "balance_loss_clip": 1.03569448, "balance_loss_mlp": 1.02058053, "epoch": 0.7511197955809409, "flos": 18989742729600.0, "grad_norm": 1.9903305425404954, "language_loss": 0.71488953, "learning_rate": 6.152068243154671e-07, "loss": 0.736, "num_input_tokens_seen": 269408530, "step": 12493, "time_per_iteration": 2.718707323074341 }, { "auxiliary_loss_clip": 0.01099033, "auxiliary_loss_mlp": 0.00770094, "balance_loss_clip": 1.03731656, "balance_loss_mlp": 1.00024784, "epoch": 0.7511799188336089, "flos": 22046961006720.0, "grad_norm": 4.285406665827556, "language_loss": 0.80753833, "learning_rate": 6.149258472993395e-07, "loss": 0.82622963, "num_input_tokens_seen": 269425930, "step": 12494, "time_per_iteration": 4.076538562774658 }, { "auxiliary_loss_clip": 0.01111429, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.03875446, "balance_loss_mlp": 1.01716495, "epoch": 0.7512400420862768, "flos": 16467125546880.0, "grad_norm": 3.1005011583642084, "language_loss": 0.78857327, "learning_rate": 6.146449228053634e-07, "loss": 0.80998647, "num_input_tokens_seen": 269443945, "step": 12495, "time_per_iteration": 2.608964204788208 }, { "auxiliary_loss_clip": 0.01110172, "auxiliary_loss_mlp": 0.00769806, "balance_loss_clip": 1.03854084, "balance_loss_mlp": 1.0001905, "epoch": 0.7513001653389448, "flos": 20448326304000.0, "grad_norm": 2.1655519437431967, "language_loss": 0.7114259, "learning_rate": 6.143640508441898e-07, "loss": 0.73022562, "num_input_tokens_seen": 269463625, "step": 12496, "time_per_iteration": 5.996880769729614 }, { "auxiliary_loss_clip": 0.01065225, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.03378069, "balance_loss_mlp": 1.02353823, "epoch": 0.7513602885916129, "flos": 23476816679040.0, "grad_norm": 1.61396701477023, "language_loss": 0.78199899, "learning_rate": 6.140832314264705e-07, "loss": 0.80300748, "num_input_tokens_seen": 269483415, "step": 12497, "time_per_iteration": 2.9391214847564697 }, { "auxiliary_loss_clip": 0.01100389, "auxiliary_loss_mlp": 0.0103612, "balance_loss_clip": 1.03779829, "balance_loss_mlp": 1.02334642, "epoch": 0.7514204118442808, "flos": 26797224885120.0, "grad_norm": 1.6137991944491499, "language_loss": 0.76816785, "learning_rate": 6.13802464562855e-07, "loss": 0.7895329, "num_input_tokens_seen": 269504635, "step": 12498, "time_per_iteration": 2.6544244289398193 }, { "auxiliary_loss_clip": 0.0108807, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.03969288, "balance_loss_mlp": 1.02200651, "epoch": 0.7514805350969488, "flos": 19865639877120.0, "grad_norm": 1.7444376873678542, "language_loss": 0.74047679, "learning_rate": 6.135217502639878e-07, "loss": 0.7616908, "num_input_tokens_seen": 269523955, "step": 12499, "time_per_iteration": 4.209566831588745 }, { "auxiliary_loss_clip": 0.01096501, "auxiliary_loss_mlp": 0.01028936, "balance_loss_clip": 1.03525448, "balance_loss_mlp": 1.01752162, "epoch": 0.7515406583496167, "flos": 24571553437440.0, "grad_norm": 2.0366798192363698, "language_loss": 0.79610258, "learning_rate": 6.132410885405148e-07, "loss": 0.81735694, "num_input_tokens_seen": 269544410, "step": 12500, "time_per_iteration": 2.6563799381256104 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01037277, "balance_loss_clip": 1.03992486, "balance_loss_mlp": 1.02259684, "epoch": 0.7516007816022847, "flos": 20120246455680.0, "grad_norm": 3.0425120159741588, "language_loss": 0.73648608, "learning_rate": 6.129604794030794e-07, "loss": 0.75794089, "num_input_tokens_seen": 269563315, "step": 12501, "time_per_iteration": 2.744978666305542 }, { "auxiliary_loss_clip": 0.01086633, "auxiliary_loss_mlp": 0.01027091, "balance_loss_clip": 1.03513741, "balance_loss_mlp": 1.01484871, "epoch": 0.7516609048549526, "flos": 22784638619520.0, "grad_norm": 1.7898399637161078, "language_loss": 0.78497088, "learning_rate": 6.126799228623207e-07, "loss": 0.80610812, "num_input_tokens_seen": 269583950, "step": 12502, "time_per_iteration": 2.738304615020752 }, { "auxiliary_loss_clip": 0.01089762, "auxiliary_loss_mlp": 0.01036729, "balance_loss_clip": 1.03781581, "balance_loss_mlp": 1.02402735, "epoch": 0.7517210281076206, "flos": 10634012311680.0, "grad_norm": 2.4706577261656277, "language_loss": 0.70263046, "learning_rate": 6.123994189288786e-07, "loss": 0.72389537, "num_input_tokens_seen": 269600120, "step": 12503, "time_per_iteration": 2.647141695022583 }, { "auxiliary_loss_clip": 0.0102855, "auxiliary_loss_mlp": 0.00998893, "balance_loss_clip": 1.00588393, "balance_loss_mlp": 0.99776667, "epoch": 0.7517811513602886, "flos": 66052221275520.0, "grad_norm": 0.9994462005888404, "language_loss": 0.63930368, "learning_rate": 6.121189676133903e-07, "loss": 0.65957808, "num_input_tokens_seen": 269659815, "step": 12504, "time_per_iteration": 3.0780868530273438 }, { "auxiliary_loss_clip": 0.01067894, "auxiliary_loss_mlp": 0.01035688, "balance_loss_clip": 1.03288054, "balance_loss_mlp": 1.02317679, "epoch": 0.7518412746129566, "flos": 37268345018880.0, "grad_norm": 1.4012015647577118, "language_loss": 0.68983722, "learning_rate": 6.118385689264896e-07, "loss": 0.71087301, "num_input_tokens_seen": 269684565, "step": 12505, "time_per_iteration": 2.979429244995117 }, { "auxiliary_loss_clip": 0.01018848, "auxiliary_loss_mlp": 0.00750909, "balance_loss_clip": 1.00648499, "balance_loss_mlp": 0.9996025, "epoch": 0.7519013978656245, "flos": 60518567727360.0, "grad_norm": 1.3160178950136667, "language_loss": 0.55058348, "learning_rate": 6.11558222878809e-07, "loss": 0.56828105, "num_input_tokens_seen": 269752325, "step": 12506, "time_per_iteration": 3.3165297508239746 }, { "auxiliary_loss_clip": 0.01099755, "auxiliary_loss_mlp": 0.0103953, "balance_loss_clip": 1.03766441, "balance_loss_mlp": 1.02648234, "epoch": 0.7519615211182925, "flos": 18806885568000.0, "grad_norm": 2.1648531082865, "language_loss": 0.78275704, "learning_rate": 6.112779294809796e-07, "loss": 0.80414987, "num_input_tokens_seen": 269770630, "step": 12507, "time_per_iteration": 2.608264923095703 }, { "auxiliary_loss_clip": 0.01083834, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 1.03923869, "balance_loss_mlp": 1.02056146, "epoch": 0.7520216443709604, "flos": 14575244209920.0, "grad_norm": 1.7931867783569413, "language_loss": 0.71366513, "learning_rate": 6.10997688743631e-07, "loss": 0.73482585, "num_input_tokens_seen": 269787280, "step": 12508, "time_per_iteration": 2.695327043533325 }, { "auxiliary_loss_clip": 0.01095204, "auxiliary_loss_mlp": 0.01031492, "balance_loss_clip": 1.03605807, "balance_loss_mlp": 1.01884961, "epoch": 0.7520817676236284, "flos": 17056599644160.0, "grad_norm": 1.7777789026239683, "language_loss": 0.71897995, "learning_rate": 6.107175006773885e-07, "loss": 0.74024695, "num_input_tokens_seen": 269805205, "step": 12509, "time_per_iteration": 2.649292230606079 }, { "auxiliary_loss_clip": 0.01116565, "auxiliary_loss_mlp": 0.01036188, "balance_loss_clip": 1.04018068, "balance_loss_mlp": 1.02252054, "epoch": 0.7521418908762965, "flos": 25666397936640.0, "grad_norm": 1.6131110543422247, "language_loss": 0.62129647, "learning_rate": 6.104373652928785e-07, "loss": 0.64282399, "num_input_tokens_seen": 269824820, "step": 12510, "time_per_iteration": 2.5948257446289062 }, { "auxiliary_loss_clip": 0.01097666, "auxiliary_loss_mlp": 0.01031956, "balance_loss_clip": 1.03866065, "balance_loss_mlp": 1.01975513, "epoch": 0.7522020141289644, "flos": 20886759711360.0, "grad_norm": 3.9854305674745474, "language_loss": 0.81469762, "learning_rate": 6.10157282600722e-07, "loss": 0.83599389, "num_input_tokens_seen": 269842825, "step": 12511, "time_per_iteration": 2.6505610942840576 }, { "auxiliary_loss_clip": 0.01087038, "auxiliary_loss_mlp": 0.01038744, "balance_loss_clip": 1.03666866, "balance_loss_mlp": 1.02523184, "epoch": 0.7522621373816324, "flos": 12640305444480.0, "grad_norm": 1.8827619698116422, "language_loss": 0.75637031, "learning_rate": 6.098772526115412e-07, "loss": 0.77762812, "num_input_tokens_seen": 269859000, "step": 12512, "time_per_iteration": 2.647817850112915 }, { "auxiliary_loss_clip": 0.01093893, "auxiliary_loss_mlp": 0.01030376, "balance_loss_clip": 1.03682494, "balance_loss_mlp": 1.01915812, "epoch": 0.7523222606343003, "flos": 25626141768960.0, "grad_norm": 1.9631337780364373, "language_loss": 0.82056046, "learning_rate": 6.095972753359537e-07, "loss": 0.84180307, "num_input_tokens_seen": 269878895, "step": 12513, "time_per_iteration": 2.6645336151123047 }, { "auxiliary_loss_clip": 0.01097529, "auxiliary_loss_mlp": 0.01037828, "balance_loss_clip": 1.03800118, "balance_loss_mlp": 1.02469754, "epoch": 0.7523823838869683, "flos": 20448900921600.0, "grad_norm": 2.0970776368608846, "language_loss": 0.74827564, "learning_rate": 6.093173507845771e-07, "loss": 0.76962924, "num_input_tokens_seen": 269897280, "step": 12514, "time_per_iteration": 2.617037057876587 }, { "auxiliary_loss_clip": 0.01090674, "auxiliary_loss_mlp": 0.01031924, "balance_loss_clip": 1.03809762, "balance_loss_mlp": 1.02052188, "epoch": 0.7524425071396362, "flos": 14720610551040.0, "grad_norm": 3.0939358054724146, "language_loss": 0.68892944, "learning_rate": 6.090374789680271e-07, "loss": 0.71015543, "num_input_tokens_seen": 269914640, "step": 12515, "time_per_iteration": 2.59306001663208 }, { "auxiliary_loss_clip": 0.01100231, "auxiliary_loss_mlp": 0.0103411, "balance_loss_clip": 1.03811383, "balance_loss_mlp": 1.02207565, "epoch": 0.7525026303923043, "flos": 30592048947840.0, "grad_norm": 2.040446314697398, "language_loss": 0.69929761, "learning_rate": 6.087576598969137e-07, "loss": 0.72064102, "num_input_tokens_seen": 269934960, "step": 12516, "time_per_iteration": 2.6960794925689697 }, { "auxiliary_loss_clip": 0.0106292, "auxiliary_loss_mlp": 0.01032821, "balance_loss_clip": 1.03736663, "balance_loss_mlp": 1.020751, "epoch": 0.7525627536449722, "flos": 24791757765120.0, "grad_norm": 1.5503440947431564, "language_loss": 0.89659667, "learning_rate": 6.084778935818495e-07, "loss": 0.91755402, "num_input_tokens_seen": 269956655, "step": 12517, "time_per_iteration": 2.9062299728393555 }, { "auxiliary_loss_clip": 0.0108799, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.036955, "balance_loss_mlp": 1.02054834, "epoch": 0.7526228768976402, "flos": 20779782030720.0, "grad_norm": 1.60812028776888, "language_loss": 0.74420178, "learning_rate": 6.081981800334437e-07, "loss": 0.76541078, "num_input_tokens_seen": 269976835, "step": 12518, "time_per_iteration": 2.9830613136291504 }, { "auxiliary_loss_clip": 0.00997374, "auxiliary_loss_mlp": 0.01010959, "balance_loss_clip": 1.01711154, "balance_loss_mlp": 1.0097965, "epoch": 0.7526830001503081, "flos": 66559243703040.0, "grad_norm": 0.708684039109314, "language_loss": 0.55700099, "learning_rate": 6.079185192623017e-07, "loss": 0.5770843, "num_input_tokens_seen": 270040630, "step": 12519, "time_per_iteration": 3.3146941661834717 }, { "auxiliary_loss_clip": 0.01093149, "auxiliary_loss_mlp": 0.01034827, "balance_loss_clip": 1.0377202, "balance_loss_mlp": 1.0233829, "epoch": 0.7527431234029761, "flos": 23477894087040.0, "grad_norm": 1.471289032229335, "language_loss": 0.77771223, "learning_rate": 6.07638911279029e-07, "loss": 0.79899204, "num_input_tokens_seen": 270059695, "step": 12520, "time_per_iteration": 2.6884288787841797 }, { "auxiliary_loss_clip": 0.01092157, "auxiliary_loss_mlp": 0.01040045, "balance_loss_clip": 1.0348748, "balance_loss_mlp": 1.02787995, "epoch": 0.752803246655644, "flos": 22049546785920.0, "grad_norm": 1.9875940404305874, "language_loss": 0.73850435, "learning_rate": 6.07359356094229e-07, "loss": 0.75982636, "num_input_tokens_seen": 270078420, "step": 12521, "time_per_iteration": 2.6750216484069824 }, { "auxiliary_loss_clip": 0.01088057, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.03908432, "balance_loss_mlp": 1.02416718, "epoch": 0.752863369908312, "flos": 30153795108480.0, "grad_norm": 2.5051463409980634, "language_loss": 0.67080051, "learning_rate": 6.070798537185016e-07, "loss": 0.69205701, "num_input_tokens_seen": 270097040, "step": 12522, "time_per_iteration": 2.772545576095581 }, { "auxiliary_loss_clip": 0.01101858, "auxiliary_loss_mlp": 0.01042231, "balance_loss_clip": 1.0390172, "balance_loss_mlp": 1.02954745, "epoch": 0.7529234931609801, "flos": 24567638855040.0, "grad_norm": 1.9325900284520732, "language_loss": 0.78271604, "learning_rate": 6.068004041624453e-07, "loss": 0.8041569, "num_input_tokens_seen": 270116365, "step": 12523, "time_per_iteration": 2.5928404331207275 }, { "auxiliary_loss_clip": 0.01107861, "auxiliary_loss_mlp": 0.01033051, "balance_loss_clip": 1.03753757, "balance_loss_mlp": 1.02056384, "epoch": 0.752983616413648, "flos": 23112395245440.0, "grad_norm": 1.9643840273203326, "language_loss": 0.80548674, "learning_rate": 6.065210074366571e-07, "loss": 0.82689583, "num_input_tokens_seen": 270135395, "step": 12524, "time_per_iteration": 2.5654656887054443 }, { "auxiliary_loss_clip": 0.01100021, "auxiliary_loss_mlp": 0.00769024, "balance_loss_clip": 1.03862953, "balance_loss_mlp": 1.00022125, "epoch": 0.753043739666316, "flos": 24316946858880.0, "grad_norm": 1.7390985823733704, "language_loss": 0.74004686, "learning_rate": 6.062416635517326e-07, "loss": 0.75873733, "num_input_tokens_seen": 270156425, "step": 12525, "time_per_iteration": 2.629235029220581 }, { "auxiliary_loss_clip": 0.01076975, "auxiliary_loss_mlp": 0.01030405, "balance_loss_clip": 1.03678906, "balance_loss_mlp": 1.01844287, "epoch": 0.7531038629189839, "flos": 24243294021120.0, "grad_norm": 1.8793498338294334, "language_loss": 0.72428775, "learning_rate": 6.059623725182641e-07, "loss": 0.74536157, "num_input_tokens_seen": 270176905, "step": 12526, "time_per_iteration": 2.7281389236450195 }, { "auxiliary_loss_clip": 0.01088063, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.03752398, "balance_loss_mlp": 1.01674342, "epoch": 0.7531639861716519, "flos": 30188807890560.0, "grad_norm": 1.5986018665355572, "language_loss": 0.72446048, "learning_rate": 6.056831343468414e-07, "loss": 0.74562311, "num_input_tokens_seen": 270196640, "step": 12527, "time_per_iteration": 2.765742301940918 }, { "auxiliary_loss_clip": 0.01077327, "auxiliary_loss_mlp": 0.01027333, "balance_loss_clip": 1.03892338, "balance_loss_mlp": 1.01588261, "epoch": 0.7532241094243198, "flos": 18223193560320.0, "grad_norm": 1.7490164070315937, "language_loss": 0.81002724, "learning_rate": 6.054039490480539e-07, "loss": 0.83107388, "num_input_tokens_seen": 270213905, "step": 12528, "time_per_iteration": 2.8258893489837646 }, { "auxiliary_loss_clip": 0.01062731, "auxiliary_loss_mlp": 0.01037431, "balance_loss_clip": 1.04194808, "balance_loss_mlp": 1.02391267, "epoch": 0.7532842326769879, "flos": 20881049448960.0, "grad_norm": 2.0737257705998084, "language_loss": 0.84989285, "learning_rate": 6.051248166324892e-07, "loss": 0.87089443, "num_input_tokens_seen": 270231995, "step": 12529, "time_per_iteration": 2.8930623531341553 }, { "auxiliary_loss_clip": 0.01084647, "auxiliary_loss_mlp": 0.01037761, "balance_loss_clip": 1.04127479, "balance_loss_mlp": 1.02479124, "epoch": 0.7533443559296558, "flos": 18078689145600.0, "grad_norm": 1.9050070504159882, "language_loss": 0.73877907, "learning_rate": 6.048457371107303e-07, "loss": 0.76000321, "num_input_tokens_seen": 270251480, "step": 12530, "time_per_iteration": 2.765109062194824 }, { "auxiliary_loss_clip": 0.0098332, "auxiliary_loss_mlp": 0.01008335, "balance_loss_clip": 1.01471329, "balance_loss_mlp": 1.00720811, "epoch": 0.7534044791823238, "flos": 50254830766080.0, "grad_norm": 0.8264532859334547, "language_loss": 0.63601089, "learning_rate": 6.045667104933612e-07, "loss": 0.65592742, "num_input_tokens_seen": 270306480, "step": 12531, "time_per_iteration": 3.203054428100586 }, { "auxiliary_loss_clip": 0.01090436, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 1.03936112, "balance_loss_mlp": 1.01770902, "epoch": 0.7534646024349917, "flos": 20850274471680.0, "grad_norm": 2.3022787240399087, "language_loss": 0.69915926, "learning_rate": 6.042877367909633e-07, "loss": 0.72037345, "num_input_tokens_seen": 270324595, "step": 12532, "time_per_iteration": 2.8513519763946533 }, { "auxiliary_loss_clip": 0.01080734, "auxiliary_loss_mlp": 0.010295, "balance_loss_clip": 1.0378058, "balance_loss_mlp": 1.01846147, "epoch": 0.7535247256876597, "flos": 23071779941760.0, "grad_norm": 1.6087653356009437, "language_loss": 0.77676719, "learning_rate": 6.040088160141132e-07, "loss": 0.79786956, "num_input_tokens_seen": 270344375, "step": 12533, "time_per_iteration": 5.849594831466675 }, { "auxiliary_loss_clip": 0.01019649, "auxiliary_loss_mlp": 0.01000792, "balance_loss_clip": 1.00604045, "balance_loss_mlp": 0.99969578, "epoch": 0.7535848489403276, "flos": 58623418252800.0, "grad_norm": 0.7831538235922403, "language_loss": 0.57278597, "learning_rate": 6.037299481733886e-07, "loss": 0.5929904, "num_input_tokens_seen": 270405235, "step": 12534, "time_per_iteration": 4.745975494384766 }, { "auxiliary_loss_clip": 0.01087528, "auxiliary_loss_mlp": 0.01028431, "balance_loss_clip": 1.03641176, "balance_loss_mlp": 1.01590824, "epoch": 0.7536449721929956, "flos": 26577882483840.0, "grad_norm": 1.758420059943407, "language_loss": 0.71251357, "learning_rate": 6.03451133279365e-07, "loss": 0.73367316, "num_input_tokens_seen": 270425820, "step": 12535, "time_per_iteration": 4.465839862823486 }, { "auxiliary_loss_clip": 0.01084954, "auxiliary_loss_mlp": 0.01032093, "balance_loss_clip": 1.0328145, "balance_loss_mlp": 1.01900923, "epoch": 0.7537050954456637, "flos": 25735992537600.0, "grad_norm": 1.6235192895129946, "language_loss": 0.80976534, "learning_rate": 6.031723713426135e-07, "loss": 0.83093584, "num_input_tokens_seen": 270447120, "step": 12536, "time_per_iteration": 2.8644282817840576 }, { "auxiliary_loss_clip": 0.01075788, "auxiliary_loss_mlp": 0.01032433, "balance_loss_clip": 1.03380179, "balance_loss_mlp": 1.02025628, "epoch": 0.7537652186983316, "flos": 30224431203840.0, "grad_norm": 2.1219622248663095, "language_loss": 0.74480766, "learning_rate": 6.028936623737067e-07, "loss": 0.76588988, "num_input_tokens_seen": 270468680, "step": 12537, "time_per_iteration": 2.8825693130493164 }, { "auxiliary_loss_clip": 0.01110837, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.03765774, "balance_loss_mlp": 1.0218091, "epoch": 0.7538253419509996, "flos": 12641239198080.0, "grad_norm": 1.7916470224859762, "language_loss": 0.74127239, "learning_rate": 6.026150063832111e-07, "loss": 0.76272595, "num_input_tokens_seen": 270486310, "step": 12538, "time_per_iteration": 2.6497671604156494 }, { "auxiliary_loss_clip": 0.0107304, "auxiliary_loss_mlp": 0.01037868, "balance_loss_clip": 1.03775454, "balance_loss_mlp": 1.02487969, "epoch": 0.7538854652036675, "flos": 23185976256000.0, "grad_norm": 1.6497097895252697, "language_loss": 0.67839807, "learning_rate": 6.023364033816956e-07, "loss": 0.69950712, "num_input_tokens_seen": 270507210, "step": 12539, "time_per_iteration": 4.390820503234863 }, { "auxiliary_loss_clip": 0.01109728, "auxiliary_loss_mlp": 0.01030908, "balance_loss_clip": 1.03869367, "balance_loss_mlp": 1.01831353, "epoch": 0.7539455884563355, "flos": 23186227651200.0, "grad_norm": 1.7887923247322153, "language_loss": 0.74677789, "learning_rate": 6.020578533797229e-07, "loss": 0.76818419, "num_input_tokens_seen": 270525250, "step": 12540, "time_per_iteration": 2.6644110679626465 }, { "auxiliary_loss_clip": 0.01112068, "auxiliary_loss_mlp": 0.01031348, "balance_loss_clip": 1.03821325, "balance_loss_mlp": 1.01833093, "epoch": 0.7540057117090034, "flos": 13181155505280.0, "grad_norm": 2.2413467917064844, "language_loss": 0.72496325, "learning_rate": 6.017793563878566e-07, "loss": 0.74639738, "num_input_tokens_seen": 270539295, "step": 12541, "time_per_iteration": 2.6159961223602295 }, { "auxiliary_loss_clip": 0.0110964, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.03806591, "balance_loss_mlp": 1.0190115, "epoch": 0.7540658349616715, "flos": 45478134478080.0, "grad_norm": 1.701926826906392, "language_loss": 0.72403926, "learning_rate": 6.015009124166576e-07, "loss": 0.74545187, "num_input_tokens_seen": 270562815, "step": 12542, "time_per_iteration": 2.8387362957000732 }, { "auxiliary_loss_clip": 0.01085175, "auxiliary_loss_mlp": 0.0102804, "balance_loss_clip": 1.03588843, "balance_loss_mlp": 1.01508224, "epoch": 0.7541259582143394, "flos": 19930817105280.0, "grad_norm": 2.526006786337045, "language_loss": 0.8460182, "learning_rate": 6.012225214766844e-07, "loss": 0.86715031, "num_input_tokens_seen": 270579055, "step": 12543, "time_per_iteration": 2.6901259422302246 }, { "auxiliary_loss_clip": 0.01077755, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.04070735, "balance_loss_mlp": 1.02253056, "epoch": 0.7541860814670074, "flos": 27198239299200.0, "grad_norm": 2.1653550809548587, "language_loss": 0.73906153, "learning_rate": 6.009441835784927e-07, "loss": 0.7601881, "num_input_tokens_seen": 270599080, "step": 12544, "time_per_iteration": 2.729667901992798 }, { "auxiliary_loss_clip": 0.0109777, "auxiliary_loss_mlp": 0.01030811, "balance_loss_clip": 1.03749204, "balance_loss_mlp": 1.01909888, "epoch": 0.7542462047196753, "flos": 21324151624320.0, "grad_norm": 1.9325798259203488, "language_loss": 0.6805954, "learning_rate": 6.006658987326383e-07, "loss": 0.70188129, "num_input_tokens_seen": 270618715, "step": 12545, "time_per_iteration": 2.6119935512542725 }, { "auxiliary_loss_clip": 0.01085784, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.03426456, "balance_loss_mlp": 1.0204457, "epoch": 0.7543063279723433, "flos": 11940944664960.0, "grad_norm": 1.8867589100270292, "language_loss": 0.68448645, "learning_rate": 6.003876669496728e-07, "loss": 0.70567578, "num_input_tokens_seen": 270635695, "step": 12546, "time_per_iteration": 2.644932270050049 }, { "auxiliary_loss_clip": 0.01096622, "auxiliary_loss_mlp": 0.01036368, "balance_loss_clip": 1.03690362, "balance_loss_mlp": 1.02293336, "epoch": 0.7543664512250112, "flos": 22819974624000.0, "grad_norm": 2.226922026836887, "language_loss": 0.73148012, "learning_rate": 6.00109488240147e-07, "loss": 0.75281006, "num_input_tokens_seen": 270654325, "step": 12547, "time_per_iteration": 2.592843770980835 }, { "auxiliary_loss_clip": 0.0110976, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.037709, "balance_loss_mlp": 1.01641619, "epoch": 0.7544265744776792, "flos": 20923855482240.0, "grad_norm": 2.152338960632508, "language_loss": 0.67440069, "learning_rate": 5.998313626146099e-07, "loss": 0.69579387, "num_input_tokens_seen": 270674260, "step": 12548, "time_per_iteration": 2.646831750869751 }, { "auxiliary_loss_clip": 0.01090643, "auxiliary_loss_mlp": 0.01034531, "balance_loss_clip": 1.03753376, "balance_loss_mlp": 1.02168059, "epoch": 0.7544866977303473, "flos": 15195493284480.0, "grad_norm": 1.8439150079595696, "language_loss": 0.87032682, "learning_rate": 5.995532900836088e-07, "loss": 0.89157856, "num_input_tokens_seen": 270692200, "step": 12549, "time_per_iteration": 2.73703932762146 }, { "auxiliary_loss_clip": 0.01062401, "auxiliary_loss_mlp": 0.0103417, "balance_loss_clip": 1.03635311, "balance_loss_mlp": 1.02223134, "epoch": 0.7545468209830152, "flos": 27083683848960.0, "grad_norm": 1.964347561010599, "language_loss": 0.77038085, "learning_rate": 5.992752706576865e-07, "loss": 0.79134655, "num_input_tokens_seen": 270709675, "step": 12550, "time_per_iteration": 2.7760634422302246 }, { "auxiliary_loss_clip": 0.01110423, "auxiliary_loss_mlp": 0.01024865, "balance_loss_clip": 1.03772533, "balance_loss_mlp": 1.01295626, "epoch": 0.7546069442356832, "flos": 26871703735680.0, "grad_norm": 1.48969324659374, "language_loss": 0.69521177, "learning_rate": 5.98997304347386e-07, "loss": 0.71656471, "num_input_tokens_seen": 270733055, "step": 12551, "time_per_iteration": 2.612513303756714 }, { "auxiliary_loss_clip": 0.0108872, "auxiliary_loss_mlp": 0.01029008, "balance_loss_clip": 1.03803182, "balance_loss_mlp": 1.01590717, "epoch": 0.7546670674883511, "flos": 15743131015680.0, "grad_norm": 1.9528134557769512, "language_loss": 0.86114484, "learning_rate": 5.987193911632487e-07, "loss": 0.88232207, "num_input_tokens_seen": 270749275, "step": 12552, "time_per_iteration": 2.7746293544769287 }, { "auxiliary_loss_clip": 0.0110308, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.03883934, "balance_loss_mlp": 1.02059603, "epoch": 0.7547271907410191, "flos": 23477714519040.0, "grad_norm": 1.7307464295257877, "language_loss": 0.78382206, "learning_rate": 5.98441531115812e-07, "loss": 0.8051818, "num_input_tokens_seen": 270768230, "step": 12553, "time_per_iteration": 2.7080695629119873 }, { "auxiliary_loss_clip": 0.01099832, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.0393219, "balance_loss_mlp": 1.0227654, "epoch": 0.754787313993687, "flos": 31722804069120.0, "grad_norm": 2.043637353991968, "language_loss": 0.62419349, "learning_rate": 5.981637242156135e-07, "loss": 0.64554828, "num_input_tokens_seen": 270786285, "step": 12554, "time_per_iteration": 2.6828603744506836 }, { "auxiliary_loss_clip": 0.01087132, "auxiliary_loss_mlp": 0.01036641, "balance_loss_clip": 1.03482223, "balance_loss_mlp": 1.02456522, "epoch": 0.7548474372463551, "flos": 27563055782400.0, "grad_norm": 1.8726381797429124, "language_loss": 0.73138636, "learning_rate": 5.978859704731864e-07, "loss": 0.75262409, "num_input_tokens_seen": 270805505, "step": 12555, "time_per_iteration": 2.765606164932251 }, { "auxiliary_loss_clip": 0.01089159, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.04132962, "balance_loss_mlp": 1.0199343, "epoch": 0.754907560499023, "flos": 19318576763520.0, "grad_norm": 1.687430506523416, "language_loss": 0.78570682, "learning_rate": 5.976082698990645e-07, "loss": 0.80692875, "num_input_tokens_seen": 270824610, "step": 12556, "time_per_iteration": 2.7887120246887207 }, { "auxiliary_loss_clip": 0.0102254, "auxiliary_loss_mlp": 0.01000624, "balance_loss_clip": 1.00953579, "balance_loss_mlp": 0.99957508, "epoch": 0.754967683751691, "flos": 69744628684800.0, "grad_norm": 0.7056309097064257, "language_loss": 0.50379604, "learning_rate": 5.973306225037769e-07, "loss": 0.52402771, "num_input_tokens_seen": 270886155, "step": 12557, "time_per_iteration": 3.15433931350708 }, { "auxiliary_loss_clip": 0.01101663, "auxiliary_loss_mlp": 0.0103538, "balance_loss_clip": 1.0402422, "balance_loss_mlp": 1.02214742, "epoch": 0.7550278070043589, "flos": 24421913377920.0, "grad_norm": 1.864770698097121, "language_loss": 0.71454239, "learning_rate": 5.970530282978525e-07, "loss": 0.7359128, "num_input_tokens_seen": 270905325, "step": 12558, "time_per_iteration": 2.6398966312408447 }, { "auxiliary_loss_clip": 0.01086077, "auxiliary_loss_mlp": 0.01039687, "balance_loss_clip": 1.03605294, "balance_loss_mlp": 1.02564383, "epoch": 0.7550879302570269, "flos": 32634611838720.0, "grad_norm": 1.9214211385606932, "language_loss": 0.80440283, "learning_rate": 5.967754872918187e-07, "loss": 0.82566047, "num_input_tokens_seen": 270927535, "step": 12559, "time_per_iteration": 2.774087905883789 }, { "auxiliary_loss_clip": 0.01064062, "auxiliary_loss_mlp": 0.01030442, "balance_loss_clip": 1.03735518, "balance_loss_mlp": 1.01727533, "epoch": 0.7551480535096948, "flos": 21795550738560.0, "grad_norm": 1.681888372687875, "language_loss": 0.78732002, "learning_rate": 5.96497999496199e-07, "loss": 0.80826509, "num_input_tokens_seen": 270946920, "step": 12560, "time_per_iteration": 2.773224115371704 }, { "auxiliary_loss_clip": 0.01059602, "auxiliary_loss_mlp": 0.01042887, "balance_loss_clip": 1.03382421, "balance_loss_mlp": 1.0288794, "epoch": 0.7552081767623628, "flos": 18515111391360.0, "grad_norm": 2.5084045238772625, "language_loss": 0.70966113, "learning_rate": 5.96220564921515e-07, "loss": 0.73068601, "num_input_tokens_seen": 270965705, "step": 12561, "time_per_iteration": 2.7290430068969727 }, { "auxiliary_loss_clip": 0.01084123, "auxiliary_loss_mlp": 0.0077333, "balance_loss_clip": 1.03486896, "balance_loss_mlp": 1.00013804, "epoch": 0.7552683000150308, "flos": 27634805199360.0, "grad_norm": 1.645858172778927, "language_loss": 0.7574439, "learning_rate": 5.959431835782889e-07, "loss": 0.7760185, "num_input_tokens_seen": 270986550, "step": 12562, "time_per_iteration": 2.766808032989502 }, { "auxiliary_loss_clip": 0.01084916, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.03713727, "balance_loss_mlp": 1.01534379, "epoch": 0.7553284232676988, "flos": 20302924049280.0, "grad_norm": 1.8387284199108043, "language_loss": 0.76086068, "learning_rate": 5.956658554770371e-07, "loss": 0.78199327, "num_input_tokens_seen": 271006250, "step": 12563, "time_per_iteration": 2.6442339420318604 }, { "auxiliary_loss_clip": 0.01082317, "auxiliary_loss_mlp": 0.01032838, "balance_loss_clip": 1.03697193, "balance_loss_mlp": 1.01750755, "epoch": 0.7553885465203668, "flos": 33255471444480.0, "grad_norm": 2.643775015065329, "language_loss": 0.67393947, "learning_rate": 5.953885806282768e-07, "loss": 0.69509107, "num_input_tokens_seen": 271025575, "step": 12564, "time_per_iteration": 2.780668020248413 }, { "auxiliary_loss_clip": 0.01084523, "auxiliary_loss_mlp": 0.01036081, "balance_loss_clip": 1.03688002, "balance_loss_mlp": 1.02188349, "epoch": 0.7554486697730347, "flos": 21616249023360.0, "grad_norm": 2.407953823392175, "language_loss": 0.69013596, "learning_rate": 5.951113590425228e-07, "loss": 0.71134198, "num_input_tokens_seen": 271045805, "step": 12565, "time_per_iteration": 2.665789842605591 }, { "auxiliary_loss_clip": 0.01091959, "auxiliary_loss_mlp": 0.01032729, "balance_loss_clip": 1.03703809, "balance_loss_mlp": 1.01887071, "epoch": 0.7555087930257027, "flos": 27632973605760.0, "grad_norm": 1.874045640971064, "language_loss": 0.75261271, "learning_rate": 5.94834190730287e-07, "loss": 0.77385962, "num_input_tokens_seen": 271066065, "step": 12566, "time_per_iteration": 2.6897921562194824 }, { "auxiliary_loss_clip": 0.01105994, "auxiliary_loss_mlp": 0.01036283, "balance_loss_clip": 1.03961587, "balance_loss_mlp": 1.02240658, "epoch": 0.7555689162783706, "flos": 23621644316160.0, "grad_norm": 2.19029922676856, "language_loss": 0.73804015, "learning_rate": 5.945570757020789e-07, "loss": 0.75946295, "num_input_tokens_seen": 271085870, "step": 12567, "time_per_iteration": 2.681082248687744 }, { "auxiliary_loss_clip": 0.01112381, "auxiliary_loss_mlp": 0.01028784, "balance_loss_clip": 1.03940594, "balance_loss_mlp": 1.01680374, "epoch": 0.7556290395310387, "flos": 24863076218880.0, "grad_norm": 2.047451974712634, "language_loss": 0.62868547, "learning_rate": 5.942800139684073e-07, "loss": 0.65009713, "num_input_tokens_seen": 271104260, "step": 12568, "time_per_iteration": 2.663501739501953 }, { "auxiliary_loss_clip": 0.0102291, "auxiliary_loss_mlp": 0.01041785, "balance_loss_clip": 1.03343916, "balance_loss_mlp": 1.02825463, "epoch": 0.7556891627837066, "flos": 43543770330240.0, "grad_norm": 1.8712587826927434, "language_loss": 0.66730088, "learning_rate": 5.940030055397789e-07, "loss": 0.68794787, "num_input_tokens_seen": 271125745, "step": 12569, "time_per_iteration": 3.4009649753570557 }, { "auxiliary_loss_clip": 0.01104803, "auxiliary_loss_mlp": 0.01036972, "balance_loss_clip": 1.03995872, "balance_loss_mlp": 1.02256608, "epoch": 0.7557492860363746, "flos": 26650924790400.0, "grad_norm": 1.7459864458479233, "language_loss": 0.67298895, "learning_rate": 5.93726050426697e-07, "loss": 0.69440669, "num_input_tokens_seen": 271147145, "step": 12570, "time_per_iteration": 3.006865978240967 }, { "auxiliary_loss_clip": 0.01112467, "auxiliary_loss_mlp": 0.01035865, "balance_loss_clip": 1.03923225, "balance_loss_mlp": 1.02238834, "epoch": 0.7558094092890425, "flos": 55182885010560.0, "grad_norm": 1.8543133954373656, "language_loss": 0.71857494, "learning_rate": 5.934491486396647e-07, "loss": 0.74005824, "num_input_tokens_seen": 271170865, "step": 12571, "time_per_iteration": 2.9743287563323975 }, { "auxiliary_loss_clip": 0.01066938, "auxiliary_loss_mlp": 0.01037876, "balance_loss_clip": 1.03525424, "balance_loss_mlp": 1.02339244, "epoch": 0.7558695325417105, "flos": 23988292392960.0, "grad_norm": 1.8208269811999866, "language_loss": 0.73415917, "learning_rate": 5.931723001891811e-07, "loss": 0.7552073, "num_input_tokens_seen": 271191450, "step": 12572, "time_per_iteration": 2.819380044937134 }, { "auxiliary_loss_clip": 0.0109252, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.04051542, "balance_loss_mlp": 1.02049112, "epoch": 0.7559296557943784, "flos": 14611262572800.0, "grad_norm": 2.0177969949137577, "language_loss": 0.76612377, "learning_rate": 5.928955050857456e-07, "loss": 0.78737968, "num_input_tokens_seen": 271207335, "step": 12573, "time_per_iteration": 4.514675617218018 }, { "auxiliary_loss_clip": 0.01087889, "auxiliary_loss_mlp": 0.01036063, "balance_loss_clip": 1.04069138, "balance_loss_mlp": 1.02323067, "epoch": 0.7559897790470465, "flos": 18550483309440.0, "grad_norm": 1.5618514080613375, "language_loss": 0.69153476, "learning_rate": 5.926187633398527e-07, "loss": 0.71277434, "num_input_tokens_seen": 271226895, "step": 12574, "time_per_iteration": 4.325180530548096 }, { "auxiliary_loss_clip": 0.0107305, "auxiliary_loss_mlp": 0.01033848, "balance_loss_clip": 1.03176165, "balance_loss_mlp": 1.01988304, "epoch": 0.7560499022997144, "flos": 17967868709760.0, "grad_norm": 2.3174142994510065, "language_loss": 0.71567178, "learning_rate": 5.923420749619974e-07, "loss": 0.73674083, "num_input_tokens_seen": 271244375, "step": 12575, "time_per_iteration": 4.343465805053711 }, { "auxiliary_loss_clip": 0.0110949, "auxiliary_loss_mlp": 0.00770549, "balance_loss_clip": 1.03735065, "balance_loss_mlp": 1.00018251, "epoch": 0.7561100255523824, "flos": 15737815802880.0, "grad_norm": 2.1045282969153125, "language_loss": 0.71783686, "learning_rate": 5.92065439962673e-07, "loss": 0.73663723, "num_input_tokens_seen": 271259530, "step": 12576, "time_per_iteration": 2.6967074871063232 }, { "auxiliary_loss_clip": 0.01076617, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.03790188, "balance_loss_mlp": 1.01866078, "epoch": 0.7561701488050504, "flos": 15888102307200.0, "grad_norm": 2.0401468166974857, "language_loss": 0.67187804, "learning_rate": 5.917888583523669e-07, "loss": 0.69296062, "num_input_tokens_seen": 271276835, "step": 12577, "time_per_iteration": 2.6873996257781982 }, { "auxiliary_loss_clip": 0.01088122, "auxiliary_loss_mlp": 0.01037075, "balance_loss_clip": 1.03602171, "balance_loss_mlp": 1.02463531, "epoch": 0.7562302720577183, "flos": 20339157893760.0, "grad_norm": 1.8873015547804852, "language_loss": 0.78041267, "learning_rate": 5.915123301415685e-07, "loss": 0.80166459, "num_input_tokens_seen": 271296275, "step": 12578, "time_per_iteration": 4.377631664276123 }, { "auxiliary_loss_clip": 0.01100787, "auxiliary_loss_mlp": 0.01032965, "balance_loss_clip": 1.03763413, "balance_loss_mlp": 1.02016759, "epoch": 0.7562903953103863, "flos": 20812209033600.0, "grad_norm": 1.6803508279416246, "language_loss": 0.75802839, "learning_rate": 5.912358553407641e-07, "loss": 0.7793659, "num_input_tokens_seen": 271315685, "step": 12579, "time_per_iteration": 2.778723955154419 }, { "auxiliary_loss_clip": 0.01070667, "auxiliary_loss_mlp": 0.01036269, "balance_loss_clip": 1.03752732, "balance_loss_mlp": 1.02198792, "epoch": 0.7563505185630542, "flos": 37596999484800.0, "grad_norm": 2.5693429830085397, "language_loss": 0.627738, "learning_rate": 5.90959433960437e-07, "loss": 0.64880729, "num_input_tokens_seen": 271336790, "step": 12580, "time_per_iteration": 2.996838331222534 }, { "auxiliary_loss_clip": 0.01067496, "auxiliary_loss_mlp": 0.01033758, "balance_loss_clip": 1.03585196, "balance_loss_mlp": 1.02117586, "epoch": 0.7564106418157223, "flos": 20230995064320.0, "grad_norm": 1.6306554766999415, "language_loss": 0.74993187, "learning_rate": 5.906830660110691e-07, "loss": 0.77094436, "num_input_tokens_seen": 271355470, "step": 12581, "time_per_iteration": 2.8892579078674316 }, { "auxiliary_loss_clip": 0.01071961, "auxiliary_loss_mlp": 0.01033537, "balance_loss_clip": 1.03673053, "balance_loss_mlp": 1.02031684, "epoch": 0.7564707650683902, "flos": 24754877475840.0, "grad_norm": 1.6534906098525708, "language_loss": 0.62473053, "learning_rate": 5.904067515031412e-07, "loss": 0.64578557, "num_input_tokens_seen": 271375810, "step": 12582, "time_per_iteration": 2.78520131111145 }, { "auxiliary_loss_clip": 0.01031417, "auxiliary_loss_mlp": 0.0099978, "balance_loss_clip": 1.00870466, "balance_loss_mlp": 0.99880236, "epoch": 0.7565308883210582, "flos": 48530076433920.0, "grad_norm": 0.9397092341612294, "language_loss": 0.6060046, "learning_rate": 5.901304904471307e-07, "loss": 0.62631655, "num_input_tokens_seen": 271424775, "step": 12583, "time_per_iteration": 2.9951171875 }, { "auxiliary_loss_clip": 0.01084102, "auxiliary_loss_mlp": 0.01041483, "balance_loss_clip": 1.03840542, "balance_loss_mlp": 1.02859008, "epoch": 0.7565910115737261, "flos": 12495082757760.0, "grad_norm": 2.173716211625989, "language_loss": 0.7912221, "learning_rate": 5.898542828535125e-07, "loss": 0.81247795, "num_input_tokens_seen": 271440500, "step": 12584, "time_per_iteration": 2.724681854248047 }, { "auxiliary_loss_clip": 0.01079444, "auxiliary_loss_mlp": 0.01038295, "balance_loss_clip": 1.03406775, "balance_loss_mlp": 1.02354908, "epoch": 0.7566511348263941, "flos": 21173003193600.0, "grad_norm": 2.334504412939606, "language_loss": 0.77645278, "learning_rate": 5.895781287327612e-07, "loss": 0.79763019, "num_input_tokens_seen": 271458180, "step": 12585, "time_per_iteration": 2.7006850242614746 }, { "auxiliary_loss_clip": 0.01116119, "auxiliary_loss_mlp": 0.01038399, "balance_loss_clip": 1.04165065, "balance_loss_mlp": 1.0249517, "epoch": 0.756711258079062, "flos": 21754827694080.0, "grad_norm": 1.6643260913798816, "language_loss": 0.83146328, "learning_rate": 5.893020280953493e-07, "loss": 0.85300845, "num_input_tokens_seen": 271475730, "step": 12586, "time_per_iteration": 2.7549026012420654 }, { "auxiliary_loss_clip": 0.01115138, "auxiliary_loss_mlp": 0.01030771, "balance_loss_clip": 1.04039466, "balance_loss_mlp": 1.01833797, "epoch": 0.75677138133173, "flos": 22382905933440.0, "grad_norm": 2.0582325962784207, "language_loss": 0.83617753, "learning_rate": 5.890259809517459e-07, "loss": 0.85763657, "num_input_tokens_seen": 271495030, "step": 12587, "time_per_iteration": 2.6982500553131104 }, { "auxiliary_loss_clip": 0.01076996, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.03665411, "balance_loss_mlp": 1.01509786, "epoch": 0.756831504584398, "flos": 22708974620160.0, "grad_norm": 1.4789161114631317, "language_loss": 0.71016109, "learning_rate": 5.88749987312418e-07, "loss": 0.73120773, "num_input_tokens_seen": 271515355, "step": 12588, "time_per_iteration": 2.811058282852173 }, { "auxiliary_loss_clip": 0.01113651, "auxiliary_loss_mlp": 0.00770901, "balance_loss_clip": 1.03982472, "balance_loss_mlp": 1.00019073, "epoch": 0.756891627837066, "flos": 24098358643200.0, "grad_norm": 1.7170735982642948, "language_loss": 0.68827093, "learning_rate": 5.884740471878327e-07, "loss": 0.70711648, "num_input_tokens_seen": 271535090, "step": 12589, "time_per_iteration": 2.668159008026123 }, { "auxiliary_loss_clip": 0.01100202, "auxiliary_loss_mlp": 0.01029096, "balance_loss_clip": 1.03817892, "balance_loss_mlp": 1.01629877, "epoch": 0.756951751089734, "flos": 19749001438080.0, "grad_norm": 1.693382160425306, "language_loss": 0.92356181, "learning_rate": 5.881981605884522e-07, "loss": 0.9448548, "num_input_tokens_seen": 271551075, "step": 12590, "time_per_iteration": 2.737993001937866 }, { "auxiliary_loss_clip": 0.01081772, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.03454733, "balance_loss_mlp": 1.01852822, "epoch": 0.7570118743424019, "flos": 35079266551680.0, "grad_norm": 2.1087448505355364, "language_loss": 0.6530177, "learning_rate": 5.879223275247391e-07, "loss": 0.67414272, "num_input_tokens_seen": 271571035, "step": 12591, "time_per_iteration": 2.836533308029175 }, { "auxiliary_loss_clip": 0.01099676, "auxiliary_loss_mlp": 0.01029683, "balance_loss_clip": 1.03951907, "balance_loss_mlp": 1.01828074, "epoch": 0.7570719975950699, "flos": 25594540778880.0, "grad_norm": 10.362773010711903, "language_loss": 0.73889554, "learning_rate": 5.876465480071528e-07, "loss": 0.76018918, "num_input_tokens_seen": 271592950, "step": 12592, "time_per_iteration": 2.729740619659424 }, { "auxiliary_loss_clip": 0.01100337, "auxiliary_loss_mlp": 0.01036287, "balance_loss_clip": 1.03738928, "balance_loss_mlp": 1.02323985, "epoch": 0.7571321208477378, "flos": 10816223028480.0, "grad_norm": 2.217401018900874, "language_loss": 0.71442747, "learning_rate": 5.873708220461522e-07, "loss": 0.73579371, "num_input_tokens_seen": 271608835, "step": 12593, "time_per_iteration": 2.684826135635376 }, { "auxiliary_loss_clip": 0.01112155, "auxiliary_loss_mlp": 0.01031949, "balance_loss_clip": 1.03883767, "balance_loss_mlp": 1.01900887, "epoch": 0.7571922441004059, "flos": 18260109763200.0, "grad_norm": 2.277271762211562, "language_loss": 0.66408104, "learning_rate": 5.870951496521903e-07, "loss": 0.68552208, "num_input_tokens_seen": 271627730, "step": 12594, "time_per_iteration": 2.66044545173645 }, { "auxiliary_loss_clip": 0.01081064, "auxiliary_loss_mlp": 0.01034067, "balance_loss_clip": 1.03765464, "balance_loss_mlp": 1.02116287, "epoch": 0.7572523673530738, "flos": 22890502978560.0, "grad_norm": 1.5512103327237567, "language_loss": 0.80722225, "learning_rate": 5.86819530835722e-07, "loss": 0.82837361, "num_input_tokens_seen": 271646415, "step": 12595, "time_per_iteration": 2.75352144241333 }, { "auxiliary_loss_clip": 0.01078291, "auxiliary_loss_mlp": 0.01034396, "balance_loss_clip": 1.03972101, "balance_loss_mlp": 1.02266574, "epoch": 0.7573124906057418, "flos": 20996323171200.0, "grad_norm": 1.9894880322297872, "language_loss": 0.71883428, "learning_rate": 5.865439656071993e-07, "loss": 0.73996115, "num_input_tokens_seen": 271666240, "step": 12596, "time_per_iteration": 2.830939531326294 }, { "auxiliary_loss_clip": 0.01013568, "auxiliary_loss_mlp": 0.01033181, "balance_loss_clip": 1.03547406, "balance_loss_mlp": 1.02174306, "epoch": 0.7573726138584097, "flos": 20886292834560.0, "grad_norm": 1.6646538422679886, "language_loss": 0.80251002, "learning_rate": 5.862684539770706e-07, "loss": 0.82297754, "num_input_tokens_seen": 271686370, "step": 12597, "time_per_iteration": 3.2770867347717285 }, { "auxiliary_loss_clip": 0.01084183, "auxiliary_loss_mlp": 0.01030961, "balance_loss_clip": 1.04002273, "balance_loss_mlp": 1.01700711, "epoch": 0.7574327371110777, "flos": 24530507170560.0, "grad_norm": 2.8794945787689477, "language_loss": 0.83217478, "learning_rate": 5.859929959557835e-07, "loss": 0.8533262, "num_input_tokens_seen": 271705050, "step": 12598, "time_per_iteration": 3.5696053504943848 }, { "auxiliary_loss_clip": 0.01083032, "auxiliary_loss_mlp": 0.0102703, "balance_loss_clip": 1.03725743, "balance_loss_mlp": 1.01568758, "epoch": 0.7574928603637456, "flos": 23364523785600.0, "grad_norm": 1.9599324451937288, "language_loss": 0.62513769, "learning_rate": 5.857175915537845e-07, "loss": 0.64623827, "num_input_tokens_seen": 271724915, "step": 12599, "time_per_iteration": 2.9659054279327393 }, { "auxiliary_loss_clip": 0.01087639, "auxiliary_loss_mlp": 0.00772119, "balance_loss_clip": 1.03743839, "balance_loss_mlp": 1.00022399, "epoch": 0.7575529836164137, "flos": 13516274419200.0, "grad_norm": 2.6514435576773767, "language_loss": 0.63275111, "learning_rate": 5.854422407815161e-07, "loss": 0.65134871, "num_input_tokens_seen": 271742410, "step": 12600, "time_per_iteration": 2.761671304702759 }, { "auxiliary_loss_clip": 0.01081508, "auxiliary_loss_mlp": 0.010341, "balance_loss_clip": 1.03465056, "balance_loss_mlp": 1.01968765, "epoch": 0.7576131068690816, "flos": 19646584784640.0, "grad_norm": 1.9759732214873023, "language_loss": 0.66604817, "learning_rate": 5.851669436494191e-07, "loss": 0.68720412, "num_input_tokens_seen": 271761425, "step": 12601, "time_per_iteration": 2.8752126693725586 }, { "auxiliary_loss_clip": 0.01081767, "auxiliary_loss_mlp": 0.01030188, "balance_loss_clip": 1.03683853, "balance_loss_mlp": 1.01856518, "epoch": 0.7576732301217496, "flos": 20048245643520.0, "grad_norm": 1.862908723746181, "language_loss": 0.6777848, "learning_rate": 5.848917001679335e-07, "loss": 0.69890434, "num_input_tokens_seen": 271780875, "step": 12602, "time_per_iteration": 2.7810614109039307 }, { "auxiliary_loss_clip": 0.01102089, "auxiliary_loss_mlp": 0.01035949, "balance_loss_clip": 1.03889537, "balance_loss_mlp": 1.02206695, "epoch": 0.7577333533744176, "flos": 15377093470080.0, "grad_norm": 3.3133966859560138, "language_loss": 0.67229289, "learning_rate": 5.846165103474967e-07, "loss": 0.69367325, "num_input_tokens_seen": 271799490, "step": 12603, "time_per_iteration": 2.677644968032837 }, { "auxiliary_loss_clip": 0.01086121, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.03466463, "balance_loss_mlp": 1.02153969, "epoch": 0.7577934766270855, "flos": 17894862316800.0, "grad_norm": 2.091164920728678, "language_loss": 0.61460161, "learning_rate": 5.843413741985439e-07, "loss": 0.63579607, "num_input_tokens_seen": 271817040, "step": 12604, "time_per_iteration": 2.690556287765503 }, { "auxiliary_loss_clip": 0.01113132, "auxiliary_loss_mlp": 0.01037248, "balance_loss_clip": 1.04157591, "balance_loss_mlp": 1.0240519, "epoch": 0.7578535998797535, "flos": 21613770984960.0, "grad_norm": 1.860643993925951, "language_loss": 0.79847634, "learning_rate": 5.840662917315076e-07, "loss": 0.81998014, "num_input_tokens_seen": 271835480, "step": 12605, "time_per_iteration": 2.650987148284912 }, { "auxiliary_loss_clip": 0.01114865, "auxiliary_loss_mlp": 0.01030793, "balance_loss_clip": 1.03956521, "balance_loss_mlp": 1.01750159, "epoch": 0.7579137231324214, "flos": 18478374756480.0, "grad_norm": 2.6305225547150286, "language_loss": 0.79649675, "learning_rate": 5.837912629568198e-07, "loss": 0.81795335, "num_input_tokens_seen": 271849835, "step": 12606, "time_per_iteration": 2.6179733276367188 }, { "auxiliary_loss_clip": 0.01094397, "auxiliary_loss_mlp": 0.01030813, "balance_loss_clip": 1.03664708, "balance_loss_mlp": 1.01947641, "epoch": 0.7579738463850895, "flos": 23255032152960.0, "grad_norm": 1.422559911510894, "language_loss": 0.73040879, "learning_rate": 5.835162878849087e-07, "loss": 0.75166082, "num_input_tokens_seen": 271869560, "step": 12607, "time_per_iteration": 2.660883903503418 }, { "auxiliary_loss_clip": 0.01085893, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.03795099, "balance_loss_mlp": 1.01872361, "epoch": 0.7580339696377574, "flos": 14027031861120.0, "grad_norm": 1.8402667548029668, "language_loss": 0.75154114, "learning_rate": 5.83241366526202e-07, "loss": 0.7727201, "num_input_tokens_seen": 271887950, "step": 12608, "time_per_iteration": 2.7164134979248047 }, { "auxiliary_loss_clip": 0.01074571, "auxiliary_loss_mlp": 0.00770045, "balance_loss_clip": 1.0365268, "balance_loss_mlp": 1.00018573, "epoch": 0.7580940928904254, "flos": 25082777756160.0, "grad_norm": 1.7434049205366062, "language_loss": 0.71609342, "learning_rate": 5.829664988911245e-07, "loss": 0.73453957, "num_input_tokens_seen": 271907700, "step": 12609, "time_per_iteration": 2.788742780685425 }, { "auxiliary_loss_clip": 0.0111318, "auxiliary_loss_mlp": 0.01033401, "balance_loss_clip": 1.0384202, "balance_loss_mlp": 1.02005589, "epoch": 0.7581542161430933, "flos": 23836425690240.0, "grad_norm": 1.6307456106692844, "language_loss": 0.81648767, "learning_rate": 5.826916849901007e-07, "loss": 0.83795345, "num_input_tokens_seen": 271926840, "step": 12610, "time_per_iteration": 2.6684138774871826 }, { "auxiliary_loss_clip": 0.01096074, "auxiliary_loss_mlp": 0.01034645, "balance_loss_clip": 1.03990412, "balance_loss_mlp": 1.0215261, "epoch": 0.7582143393957613, "flos": 22237000888320.0, "grad_norm": 1.7108192328279062, "language_loss": 0.70459145, "learning_rate": 5.824169248335488e-07, "loss": 0.72589862, "num_input_tokens_seen": 271946465, "step": 12611, "time_per_iteration": 2.7615582942962646 }, { "auxiliary_loss_clip": 0.01111911, "auxiliary_loss_mlp": 0.01031837, "balance_loss_clip": 1.03971386, "balance_loss_mlp": 1.01939797, "epoch": 0.7582744626484292, "flos": 21106389421440.0, "grad_norm": 1.4842490716025172, "language_loss": 0.70994782, "learning_rate": 5.821422184318893e-07, "loss": 0.73138535, "num_input_tokens_seen": 271967295, "step": 12612, "time_per_iteration": 4.388495445251465 }, { "auxiliary_loss_clip": 0.01051139, "auxiliary_loss_mlp": 0.01043129, "balance_loss_clip": 1.03555894, "balance_loss_mlp": 1.03022408, "epoch": 0.7583345859010973, "flos": 24604770539520.0, "grad_norm": 1.3817563743236791, "language_loss": 0.59341693, "learning_rate": 5.818675657955397e-07, "loss": 0.61435962, "num_input_tokens_seen": 271987960, "step": 12613, "time_per_iteration": 4.679025411605835 }, { "auxiliary_loss_clip": 0.01085628, "auxiliary_loss_mlp": 0.01039806, "balance_loss_clip": 1.0359726, "balance_loss_mlp": 1.02548921, "epoch": 0.7583947091537652, "flos": 33546814657920.0, "grad_norm": 1.5041496360989353, "language_loss": 0.59715927, "learning_rate": 5.815929669349135e-07, "loss": 0.61841357, "num_input_tokens_seen": 272011780, "step": 12614, "time_per_iteration": 4.3984222412109375 }, { "auxiliary_loss_clip": 0.01075793, "auxiliary_loss_mlp": 0.01029168, "balance_loss_clip": 1.03387702, "balance_loss_mlp": 1.01572776, "epoch": 0.7584548324064332, "flos": 20121000641280.0, "grad_norm": 1.921615771870116, "language_loss": 0.73268729, "learning_rate": 5.813184218604246e-07, "loss": 0.75373691, "num_input_tokens_seen": 272030825, "step": 12615, "time_per_iteration": 2.8314290046691895 }, { "auxiliary_loss_clip": 0.01011548, "auxiliary_loss_mlp": 0.00999712, "balance_loss_clip": 1.01067567, "balance_loss_mlp": 0.99848443, "epoch": 0.7585149556591012, "flos": 70402584061440.0, "grad_norm": 0.8045882645133534, "language_loss": 0.67647672, "learning_rate": 5.810439305824828e-07, "loss": 0.69658935, "num_input_tokens_seen": 272095825, "step": 12616, "time_per_iteration": 3.260563850402832 }, { "auxiliary_loss_clip": 0.0108171, "auxiliary_loss_mlp": 0.01039897, "balance_loss_clip": 1.03870976, "balance_loss_mlp": 1.02642608, "epoch": 0.7585750789117691, "flos": 16143786293760.0, "grad_norm": 1.736965809635246, "language_loss": 0.84524256, "learning_rate": 5.807694931114979e-07, "loss": 0.86645865, "num_input_tokens_seen": 272113950, "step": 12617, "time_per_iteration": 2.8263378143310547 }, { "auxiliary_loss_clip": 0.01078721, "auxiliary_loss_mlp": 0.01039251, "balance_loss_clip": 1.0390234, "balance_loss_mlp": 1.02730036, "epoch": 0.7586352021644371, "flos": 17493165544320.0, "grad_norm": 2.328657460169902, "language_loss": 0.74700725, "learning_rate": 5.804951094578757e-07, "loss": 0.76818699, "num_input_tokens_seen": 272130315, "step": 12618, "time_per_iteration": 4.2552900314331055 }, { "auxiliary_loss_clip": 0.0109138, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.03749752, "balance_loss_mlp": 1.01850069, "epoch": 0.758695325417105, "flos": 17275187859840.0, "grad_norm": 1.9133972925292233, "language_loss": 0.77211189, "learning_rate": 5.802207796320209e-07, "loss": 0.79334843, "num_input_tokens_seen": 272149080, "step": 12619, "time_per_iteration": 2.7758803367614746 }, { "auxiliary_loss_clip": 0.0107017, "auxiliary_loss_mlp": 0.01037442, "balance_loss_clip": 1.03423786, "balance_loss_mlp": 1.02421534, "epoch": 0.7587554486697731, "flos": 29495660163840.0, "grad_norm": 1.9790425844010804, "language_loss": 0.82581639, "learning_rate": 5.79946503644337e-07, "loss": 0.84689248, "num_input_tokens_seen": 272168285, "step": 12620, "time_per_iteration": 2.860680341720581 }, { "auxiliary_loss_clip": 0.01086979, "auxiliary_loss_mlp": 0.01039531, "balance_loss_clip": 1.03506887, "balance_loss_mlp": 1.02535069, "epoch": 0.758815571922441, "flos": 16100800692480.0, "grad_norm": 2.7719237542052335, "language_loss": 0.82916582, "learning_rate": 5.796722815052242e-07, "loss": 0.85043091, "num_input_tokens_seen": 272184585, "step": 12621, "time_per_iteration": 2.6819448471069336 }, { "auxiliary_loss_clip": 0.01090396, "auxiliary_loss_mlp": 0.01032874, "balance_loss_clip": 1.03831124, "balance_loss_mlp": 1.02035689, "epoch": 0.758875695175109, "flos": 16143714466560.0, "grad_norm": 2.331369198169253, "language_loss": 0.73694873, "learning_rate": 5.7939811322508e-07, "loss": 0.75818145, "num_input_tokens_seen": 272200205, "step": 12622, "time_per_iteration": 2.867482900619507 }, { "auxiliary_loss_clip": 0.01020627, "auxiliary_loss_mlp": 0.00999479, "balance_loss_clip": 1.00808787, "balance_loss_mlp": 0.99837667, "epoch": 0.7589358184277769, "flos": 68462006860800.0, "grad_norm": 0.8939637430208361, "language_loss": 0.60890412, "learning_rate": 5.791239988143024e-07, "loss": 0.62910521, "num_input_tokens_seen": 272259670, "step": 12623, "time_per_iteration": 3.399125814437866 }, { "auxiliary_loss_clip": 0.01108354, "auxiliary_loss_mlp": 0.01032352, "balance_loss_clip": 1.0389595, "balance_loss_mlp": 1.0204668, "epoch": 0.7589959416804449, "flos": 20047311889920.0, "grad_norm": 2.1862107817163374, "language_loss": 0.67437398, "learning_rate": 5.788499382832847e-07, "loss": 0.69578105, "num_input_tokens_seen": 272277925, "step": 12624, "time_per_iteration": 2.7711076736450195 }, { "auxiliary_loss_clip": 0.01108684, "auxiliary_loss_mlp": 0.01029908, "balance_loss_clip": 1.03826535, "balance_loss_mlp": 1.01691461, "epoch": 0.7590560649331128, "flos": 18771800958720.0, "grad_norm": 11.908372705872578, "language_loss": 0.76173502, "learning_rate": 5.785759316424196e-07, "loss": 0.78312099, "num_input_tokens_seen": 272296010, "step": 12625, "time_per_iteration": 2.695136308670044 }, { "auxiliary_loss_clip": 0.0108337, "auxiliary_loss_mlp": 0.01043075, "balance_loss_clip": 1.03519034, "balance_loss_mlp": 1.02824545, "epoch": 0.7591161881857809, "flos": 29825284296960.0, "grad_norm": 1.865247644851499, "language_loss": 0.63104314, "learning_rate": 5.783019789020977e-07, "loss": 0.65230757, "num_input_tokens_seen": 272318330, "step": 12626, "time_per_iteration": 2.815093517303467 }, { "auxiliary_loss_clip": 0.01080043, "auxiliary_loss_mlp": 0.00771292, "balance_loss_clip": 1.04494154, "balance_loss_mlp": 1.00028062, "epoch": 0.7591763114384488, "flos": 20302708567680.0, "grad_norm": 2.0523273844402605, "language_loss": 0.74221742, "learning_rate": 5.780280800727084e-07, "loss": 0.76073074, "num_input_tokens_seen": 272335265, "step": 12627, "time_per_iteration": 3.018779993057251 }, { "auxiliary_loss_clip": 0.01100814, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.03871465, "balance_loss_mlp": 1.0191313, "epoch": 0.7592364346911168, "flos": 20813609664000.0, "grad_norm": 2.9039370071826145, "language_loss": 0.6930986, "learning_rate": 5.777542351646356e-07, "loss": 0.71442395, "num_input_tokens_seen": 272354795, "step": 12628, "time_per_iteration": 2.717823028564453 }, { "auxiliary_loss_clip": 0.01102671, "auxiliary_loss_mlp": 0.01034671, "balance_loss_clip": 1.04052353, "balance_loss_mlp": 1.02078366, "epoch": 0.7592965579437848, "flos": 21251504367360.0, "grad_norm": 1.7338759935871468, "language_loss": 0.63148701, "learning_rate": 5.774804441882648e-07, "loss": 0.6528604, "num_input_tokens_seen": 272372875, "step": 12629, "time_per_iteration": 2.6770267486572266 }, { "auxiliary_loss_clip": 0.01084801, "auxiliary_loss_mlp": 0.01032088, "balance_loss_clip": 1.03561509, "balance_loss_mlp": 1.02010107, "epoch": 0.7593566811964527, "flos": 26213604704640.0, "grad_norm": 1.4746577606675504, "language_loss": 0.77671874, "learning_rate": 5.772067071539786e-07, "loss": 0.79788756, "num_input_tokens_seen": 272394715, "step": 12630, "time_per_iteration": 2.9122629165649414 }, { "auxiliary_loss_clip": 0.01029746, "auxiliary_loss_mlp": 0.01002357, "balance_loss_clip": 1.00722373, "balance_loss_mlp": 1.00131977, "epoch": 0.7594168044491207, "flos": 71237255374080.0, "grad_norm": 0.8115073267704523, "language_loss": 0.61498612, "learning_rate": 5.769330240721562e-07, "loss": 0.63530719, "num_input_tokens_seen": 272458775, "step": 12631, "time_per_iteration": 3.267413377761841 }, { "auxiliary_loss_clip": 0.01084169, "auxiliary_loss_mlp": 0.00772349, "balance_loss_clip": 1.03867972, "balance_loss_mlp": 1.00034893, "epoch": 0.7594769277017887, "flos": 26613326229120.0, "grad_norm": 1.5722858256300303, "language_loss": 0.73812342, "learning_rate": 5.766593949531767e-07, "loss": 0.75668871, "num_input_tokens_seen": 272479355, "step": 12632, "time_per_iteration": 2.9674253463745117 }, { "auxiliary_loss_clip": 0.01089012, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.0375607, "balance_loss_mlp": 1.01855755, "epoch": 0.7595370509544567, "flos": 17595941333760.0, "grad_norm": 2.3123827403326005, "language_loss": 0.75472778, "learning_rate": 5.763858198074154e-07, "loss": 0.77592766, "num_input_tokens_seen": 272493555, "step": 12633, "time_per_iteration": 2.733344078063965 }, { "auxiliary_loss_clip": 0.01087192, "auxiliary_loss_mlp": 0.01028663, "balance_loss_clip": 1.03815973, "balance_loss_mlp": 1.017272, "epoch": 0.7595971742071246, "flos": 18002953319040.0, "grad_norm": 2.016293205622038, "language_loss": 0.73391056, "learning_rate": 5.76112298645246e-07, "loss": 0.75506908, "num_input_tokens_seen": 272508925, "step": 12634, "time_per_iteration": 2.7296500205993652 }, { "auxiliary_loss_clip": 0.01111487, "auxiliary_loss_mlp": 0.01034168, "balance_loss_clip": 1.03916657, "balance_loss_mlp": 1.02143645, "epoch": 0.7596572974597926, "flos": 28840326480000.0, "grad_norm": 2.834994861255327, "language_loss": 0.64522898, "learning_rate": 5.758388314770408e-07, "loss": 0.66668558, "num_input_tokens_seen": 272528805, "step": 12635, "time_per_iteration": 2.79398512840271 }, { "auxiliary_loss_clip": 0.01054416, "auxiliary_loss_mlp": 0.01048736, "balance_loss_clip": 1.03525424, "balance_loss_mlp": 1.03316736, "epoch": 0.7597174207124605, "flos": 14282823588480.0, "grad_norm": 1.8350185732096174, "language_loss": 0.69167364, "learning_rate": 5.7556541831317e-07, "loss": 0.71270514, "num_input_tokens_seen": 272546655, "step": 12636, "time_per_iteration": 2.827582836151123 }, { "auxiliary_loss_clip": 0.01094213, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.03955829, "balance_loss_mlp": 1.02246487, "epoch": 0.7597775439651285, "flos": 21688932193920.0, "grad_norm": 2.1812107272877665, "language_loss": 0.81070203, "learning_rate": 5.752920591640018e-07, "loss": 0.83199233, "num_input_tokens_seen": 272564010, "step": 12637, "time_per_iteration": 2.805816650390625 }, { "auxiliary_loss_clip": 0.01098118, "auxiliary_loss_mlp": 0.01032589, "balance_loss_clip": 1.03679478, "balance_loss_mlp": 1.02025676, "epoch": 0.7598376672177964, "flos": 36101248312320.0, "grad_norm": 1.701654856542883, "language_loss": 0.66547924, "learning_rate": 5.750187540399017e-07, "loss": 0.68678635, "num_input_tokens_seen": 272585840, "step": 12638, "time_per_iteration": 2.8566620349884033 }, { "auxiliary_loss_clip": 0.01114657, "auxiliary_loss_mlp": 0.01040373, "balance_loss_clip": 1.04063082, "balance_loss_mlp": 1.02584124, "epoch": 0.7598977904704645, "flos": 18332326056960.0, "grad_norm": 2.2747225954566193, "language_loss": 0.6550855, "learning_rate": 5.747455029512323e-07, "loss": 0.6766358, "num_input_tokens_seen": 272602300, "step": 12639, "time_per_iteration": 2.6449224948883057 }, { "auxiliary_loss_clip": 0.01097983, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.03591299, "balance_loss_mlp": 1.01951265, "epoch": 0.7599579137231324, "flos": 20192642317440.0, "grad_norm": 2.3376509636382057, "language_loss": 0.70271343, "learning_rate": 5.744723059083572e-07, "loss": 0.72401774, "num_input_tokens_seen": 272619595, "step": 12640, "time_per_iteration": 2.813169240951538 }, { "auxiliary_loss_clip": 0.01091857, "auxiliary_loss_mlp": 0.01033957, "balance_loss_clip": 1.03943181, "balance_loss_mlp": 1.0203433, "epoch": 0.7600180369758004, "flos": 24024849459840.0, "grad_norm": 2.141253081598676, "language_loss": 0.66953784, "learning_rate": 5.741991629216343e-07, "loss": 0.69079602, "num_input_tokens_seen": 272638825, "step": 12641, "time_per_iteration": 2.8210034370422363 }, { "auxiliary_loss_clip": 0.01098494, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 1.03655171, "balance_loss_mlp": 1.01808345, "epoch": 0.7600781602284684, "flos": 18989527248000.0, "grad_norm": 3.210818856983626, "language_loss": 0.66875279, "learning_rate": 5.73926074001422e-07, "loss": 0.6900543, "num_input_tokens_seen": 272657240, "step": 12642, "time_per_iteration": 2.6761434078216553 }, { "auxiliary_loss_clip": 0.01092897, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.04070377, "balance_loss_mlp": 1.01937461, "epoch": 0.7601382834811363, "flos": 26067520091520.0, "grad_norm": 1.951124783740963, "language_loss": 0.75605899, "learning_rate": 5.736530391580765e-07, "loss": 0.77730376, "num_input_tokens_seen": 272677520, "step": 12643, "time_per_iteration": 2.858407497406006 }, { "auxiliary_loss_clip": 0.01076624, "auxiliary_loss_mlp": 0.0103542, "balance_loss_clip": 1.03911448, "balance_loss_mlp": 1.02123976, "epoch": 0.7601984067338043, "flos": 18844232734080.0, "grad_norm": 1.8455815779990985, "language_loss": 0.78802508, "learning_rate": 5.733800584019508e-07, "loss": 0.80914557, "num_input_tokens_seen": 272696770, "step": 12644, "time_per_iteration": 2.820368766784668 }, { "auxiliary_loss_clip": 0.01084265, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.0353601, "balance_loss_mlp": 1.01994061, "epoch": 0.7602585299864723, "flos": 24646391424000.0, "grad_norm": 1.5239807064585273, "language_loss": 0.80124938, "learning_rate": 5.731071317433957e-07, "loss": 0.82241637, "num_input_tokens_seen": 272718340, "step": 12645, "time_per_iteration": 2.8698811531066895 }, { "auxiliary_loss_clip": 0.01087859, "auxiliary_loss_mlp": 0.01033405, "balance_loss_clip": 1.0394851, "balance_loss_mlp": 1.02041101, "epoch": 0.7603186532391403, "flos": 23842100039040.0, "grad_norm": 1.4661810849316874, "language_loss": 0.72646892, "learning_rate": 5.728342591927611e-07, "loss": 0.74768156, "num_input_tokens_seen": 272739575, "step": 12646, "time_per_iteration": 2.8227429389953613 }, { "auxiliary_loss_clip": 0.01098686, "auxiliary_loss_mlp": 0.01035491, "balance_loss_clip": 1.03704524, "balance_loss_mlp": 1.02336717, "epoch": 0.7603787764918082, "flos": 22199905117440.0, "grad_norm": 2.4220316312811025, "language_loss": 0.67611617, "learning_rate": 5.725614407603949e-07, "loss": 0.69745797, "num_input_tokens_seen": 272758710, "step": 12647, "time_per_iteration": 2.8083581924438477 }, { "auxiliary_loss_clip": 0.01019336, "auxiliary_loss_mlp": 0.01006453, "balance_loss_clip": 1.00592494, "balance_loss_mlp": 1.00521874, "epoch": 0.7604388997444762, "flos": 54086894254080.0, "grad_norm": 0.6752663503199356, "language_loss": 0.48949182, "learning_rate": 5.722886764566415e-07, "loss": 0.50974971, "num_input_tokens_seen": 272814855, "step": 12648, "time_per_iteration": 3.211672782897949 }, { "auxiliary_loss_clip": 0.0109722, "auxiliary_loss_mlp": 0.01036106, "balance_loss_clip": 1.03749037, "balance_loss_mlp": 1.02400017, "epoch": 0.7604990229971441, "flos": 19681920789120.0, "grad_norm": 2.4521174104078467, "language_loss": 0.76747489, "learning_rate": 5.720159662918451e-07, "loss": 0.78880811, "num_input_tokens_seen": 272834400, "step": 12649, "time_per_iteration": 2.72628116607666 }, { "auxiliary_loss_clip": 0.0106851, "auxiliary_loss_mlp": 0.01035355, "balance_loss_clip": 1.03517592, "balance_loss_mlp": 1.02242661, "epoch": 0.7605591462498121, "flos": 25228036356480.0, "grad_norm": 1.7702335478327527, "language_loss": 0.68660265, "learning_rate": 5.717433102763462e-07, "loss": 0.7076413, "num_input_tokens_seen": 272854760, "step": 12650, "time_per_iteration": 2.8096909523010254 }, { "auxiliary_loss_clip": 0.01020249, "auxiliary_loss_mlp": 0.01003738, "balance_loss_clip": 1.00758457, "balance_loss_mlp": 1.00275445, "epoch": 0.76061926950248, "flos": 66783757662720.0, "grad_norm": 0.8336646667507255, "language_loss": 0.62671125, "learning_rate": 5.714707084204838e-07, "loss": 0.64695108, "num_input_tokens_seen": 272919030, "step": 12651, "time_per_iteration": 4.8483662605285645 }, { "auxiliary_loss_clip": 0.01076594, "auxiliary_loss_mlp": 0.01036234, "balance_loss_clip": 1.0368011, "balance_loss_mlp": 1.02373505, "epoch": 0.7606793927551481, "flos": 25338354001920.0, "grad_norm": 1.4829724837753475, "language_loss": 0.71288872, "learning_rate": 5.711981607345951e-07, "loss": 0.73401701, "num_input_tokens_seen": 272938925, "step": 12652, "time_per_iteration": 2.85551118850708 }, { "auxiliary_loss_clip": 0.01059292, "auxiliary_loss_mlp": 0.01037324, "balance_loss_clip": 1.0363282, "balance_loss_mlp": 1.02425838, "epoch": 0.760739516007816, "flos": 18223624523520.0, "grad_norm": 2.085886887216812, "language_loss": 0.80261797, "learning_rate": 5.709256672290152e-07, "loss": 0.82358414, "num_input_tokens_seen": 272954945, "step": 12653, "time_per_iteration": 6.101754665374756 }, { "auxiliary_loss_clip": 0.01116949, "auxiliary_loss_mlp": 0.01032648, "balance_loss_clip": 1.04151583, "balance_loss_mlp": 1.01946926, "epoch": 0.760799639260484, "flos": 22559119079040.0, "grad_norm": 1.7273806090867658, "language_loss": 0.79977405, "learning_rate": 5.706532279140785e-07, "loss": 0.82127005, "num_input_tokens_seen": 272972855, "step": 12654, "time_per_iteration": 2.7514119148254395 }, { "auxiliary_loss_clip": 0.01074955, "auxiliary_loss_mlp": 0.0103594, "balance_loss_clip": 1.03562033, "balance_loss_mlp": 1.02221942, "epoch": 0.760859762513152, "flos": 22309324922880.0, "grad_norm": 2.0189360189402357, "language_loss": 0.79458809, "learning_rate": 5.703808428001136e-07, "loss": 0.81569707, "num_input_tokens_seen": 272989895, "step": 12655, "time_per_iteration": 2.78948712348938 }, { "auxiliary_loss_clip": 0.01094485, "auxiliary_loss_mlp": 0.01028246, "balance_loss_clip": 1.03769636, "balance_loss_mlp": 1.01743925, "epoch": 0.7609198857658199, "flos": 24863902231680.0, "grad_norm": 1.6124233768982144, "language_loss": 0.68051422, "learning_rate": 5.701085118974505e-07, "loss": 0.70174152, "num_input_tokens_seen": 273011695, "step": 12656, "time_per_iteration": 2.795375347137451 }, { "auxiliary_loss_clip": 0.01101665, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.03489399, "balance_loss_mlp": 1.01786578, "epoch": 0.760980009018488, "flos": 16836790366080.0, "grad_norm": 2.7645541741379347, "language_loss": 0.73798579, "learning_rate": 5.698362352164164e-07, "loss": 0.75931156, "num_input_tokens_seen": 273028815, "step": 12657, "time_per_iteration": 4.21469521522522 }, { "auxiliary_loss_clip": 0.01012936, "auxiliary_loss_mlp": 0.01000637, "balance_loss_clip": 1.00884259, "balance_loss_mlp": 0.99950486, "epoch": 0.7610401322711559, "flos": 61230603029760.0, "grad_norm": 0.85360954009419, "language_loss": 0.64932978, "learning_rate": 5.695640127673347e-07, "loss": 0.66946548, "num_input_tokens_seen": 273084080, "step": 12658, "time_per_iteration": 3.2157864570617676 }, { "auxiliary_loss_clip": 0.01092364, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.03773546, "balance_loss_mlp": 1.02238202, "epoch": 0.7611002555238239, "flos": 19640730867840.0, "grad_norm": 2.0304267981282353, "language_loss": 0.79544449, "learning_rate": 5.692918445605293e-07, "loss": 0.81671351, "num_input_tokens_seen": 273102295, "step": 12659, "time_per_iteration": 2.6572675704956055 }, { "auxiliary_loss_clip": 0.01097791, "auxiliary_loss_mlp": 0.01028001, "balance_loss_clip": 1.03714883, "balance_loss_mlp": 1.015746, "epoch": 0.7611603787764918, "flos": 26872206526080.0, "grad_norm": 1.589308819258603, "language_loss": 0.68846476, "learning_rate": 5.690197306063209e-07, "loss": 0.7097227, "num_input_tokens_seen": 273123400, "step": 12660, "time_per_iteration": 2.815166473388672 }, { "auxiliary_loss_clip": 0.01111243, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.03771544, "balance_loss_mlp": 1.02017736, "epoch": 0.7612205020291598, "flos": 27344252085120.0, "grad_norm": 2.023337576106856, "language_loss": 0.70192313, "learning_rate": 5.687476709150281e-07, "loss": 0.7233631, "num_input_tokens_seen": 273145150, "step": 12661, "time_per_iteration": 2.7765588760375977 }, { "auxiliary_loss_clip": 0.01099752, "auxiliary_loss_mlp": 0.01034729, "balance_loss_clip": 1.03683341, "balance_loss_mlp": 1.02217579, "epoch": 0.7612806252818277, "flos": 29314598682240.0, "grad_norm": 1.6042830797514005, "language_loss": 0.83190757, "learning_rate": 5.68475665496966e-07, "loss": 0.85325241, "num_input_tokens_seen": 273165180, "step": 12662, "time_per_iteration": 2.7277190685272217 }, { "auxiliary_loss_clip": 0.01088049, "auxiliary_loss_mlp": 0.0104358, "balance_loss_clip": 1.03722358, "balance_loss_mlp": 1.03130126, "epoch": 0.7613407485344957, "flos": 19026048401280.0, "grad_norm": 1.7304537436557308, "language_loss": 0.68582624, "learning_rate": 5.682037143624505e-07, "loss": 0.70714259, "num_input_tokens_seen": 273184005, "step": 12663, "time_per_iteration": 2.770902156829834 }, { "auxiliary_loss_clip": 0.01100065, "auxiliary_loss_mlp": 0.01026036, "balance_loss_clip": 1.03998029, "balance_loss_mlp": 1.0138464, "epoch": 0.7614008717871636, "flos": 23256037733760.0, "grad_norm": 2.1194736525192357, "language_loss": 0.70156157, "learning_rate": 5.67931817521794e-07, "loss": 0.72282255, "num_input_tokens_seen": 273203565, "step": 12664, "time_per_iteration": 2.7074570655822754 }, { "auxiliary_loss_clip": 0.01105735, "auxiliary_loss_mlp": 0.01039411, "balance_loss_clip": 1.04057264, "balance_loss_mlp": 1.02536225, "epoch": 0.7614609950398317, "flos": 21579907438080.0, "grad_norm": 1.8390360170720685, "language_loss": 0.79482293, "learning_rate": 5.676599749853066e-07, "loss": 0.8162744, "num_input_tokens_seen": 273221645, "step": 12665, "time_per_iteration": 2.7299532890319824 }, { "auxiliary_loss_clip": 0.0111148, "auxiliary_loss_mlp": 0.00769447, "balance_loss_clip": 1.04143631, "balance_loss_mlp": 1.00019884, "epoch": 0.7615211182924996, "flos": 29277897960960.0, "grad_norm": 1.9892685132164005, "language_loss": 0.87823522, "learning_rate": 5.673881867632959e-07, "loss": 0.89704448, "num_input_tokens_seen": 273242040, "step": 12666, "time_per_iteration": 2.7689883708953857 }, { "auxiliary_loss_clip": 0.01055194, "auxiliary_loss_mlp": 0.01033948, "balance_loss_clip": 1.03460038, "balance_loss_mlp": 1.02016783, "epoch": 0.7615812415451676, "flos": 13261129136640.0, "grad_norm": 2.3749707608693513, "language_loss": 0.8353771, "learning_rate": 5.671164528660693e-07, "loss": 0.85626853, "num_input_tokens_seen": 273257365, "step": 12667, "time_per_iteration": 2.920854091644287 }, { "auxiliary_loss_clip": 0.01089109, "auxiliary_loss_mlp": 0.01037332, "balance_loss_clip": 1.03897429, "balance_loss_mlp": 1.02510726, "epoch": 0.7616413647978356, "flos": 18584741905920.0, "grad_norm": 1.7012297272780605, "language_loss": 0.78357065, "learning_rate": 5.668447733039296e-07, "loss": 0.80483508, "num_input_tokens_seen": 273274710, "step": 12668, "time_per_iteration": 2.694464683532715 }, { "auxiliary_loss_clip": 0.01075984, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.03536892, "balance_loss_mlp": 1.02142668, "epoch": 0.7617014880505035, "flos": 18516188799360.0, "grad_norm": 1.900278924462426, "language_loss": 0.64169192, "learning_rate": 5.6657314808718e-07, "loss": 0.66279244, "num_input_tokens_seen": 273292870, "step": 12669, "time_per_iteration": 2.793607234954834 }, { "auxiliary_loss_clip": 0.01086136, "auxiliary_loss_mlp": 0.01037174, "balance_loss_clip": 1.03618228, "balance_loss_mlp": 1.02251148, "epoch": 0.7617616113031715, "flos": 24973178382720.0, "grad_norm": 2.3594416048527886, "language_loss": 0.66683328, "learning_rate": 5.663015772261202e-07, "loss": 0.68806642, "num_input_tokens_seen": 273312375, "step": 12670, "time_per_iteration": 2.784454584121704 }, { "auxiliary_loss_clip": 0.01101863, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.03805709, "balance_loss_mlp": 1.02371264, "epoch": 0.7618217345558395, "flos": 23295036925440.0, "grad_norm": 1.6675852646548754, "language_loss": 0.73051012, "learning_rate": 5.660300607310493e-07, "loss": 0.75189054, "num_input_tokens_seen": 273332590, "step": 12671, "time_per_iteration": 2.7376444339752197 }, { "auxiliary_loss_clip": 0.01072018, "auxiliary_loss_mlp": 0.01036198, "balance_loss_clip": 1.03299487, "balance_loss_mlp": 1.02336478, "epoch": 0.7618818578085075, "flos": 25482894330240.0, "grad_norm": 1.6810616532176517, "language_loss": 0.73379242, "learning_rate": 5.657585986122613e-07, "loss": 0.75487459, "num_input_tokens_seen": 273352885, "step": 12672, "time_per_iteration": 2.839824914932251 }, { "auxiliary_loss_clip": 0.01001779, "auxiliary_loss_mlp": 0.01000945, "balance_loss_clip": 1.00902843, "balance_loss_mlp": 0.99994415, "epoch": 0.7619419810611754, "flos": 61151994115200.0, "grad_norm": 0.7605802796728681, "language_loss": 0.56720763, "learning_rate": 5.654871908800506e-07, "loss": 0.58723491, "num_input_tokens_seen": 273411730, "step": 12673, "time_per_iteration": 3.201223850250244 }, { "auxiliary_loss_clip": 0.01100506, "auxiliary_loss_mlp": 0.010336, "balance_loss_clip": 1.03872013, "balance_loss_mlp": 1.02017713, "epoch": 0.7620021043138434, "flos": 23258659426560.0, "grad_norm": 1.740985764323004, "language_loss": 0.74985719, "learning_rate": 5.652158375447102e-07, "loss": 0.77119827, "num_input_tokens_seen": 273430020, "step": 12674, "time_per_iteration": 2.7523674964904785 }, { "auxiliary_loss_clip": 0.01078335, "auxiliary_loss_mlp": 0.01034219, "balance_loss_clip": 1.03280282, "balance_loss_mlp": 1.02115917, "epoch": 0.7620622275665113, "flos": 25082490447360.0, "grad_norm": 2.016968785159948, "language_loss": 0.7202276, "learning_rate": 5.649445386165286e-07, "loss": 0.74135315, "num_input_tokens_seen": 273448690, "step": 12675, "time_per_iteration": 2.796057939529419 }, { "auxiliary_loss_clip": 0.01095004, "auxiliary_loss_mlp": 0.01030807, "balance_loss_clip": 1.03599072, "balance_loss_mlp": 1.01886785, "epoch": 0.7621223508191793, "flos": 20155007842560.0, "grad_norm": 2.355672138276969, "language_loss": 0.73052359, "learning_rate": 5.646732941057936e-07, "loss": 0.7517817, "num_input_tokens_seen": 273465190, "step": 12676, "time_per_iteration": 2.734591484069824 }, { "auxiliary_loss_clip": 0.01081109, "auxiliary_loss_mlp": 0.00771918, "balance_loss_clip": 1.03906035, "balance_loss_mlp": 1.00022256, "epoch": 0.7621824740718472, "flos": 18000187971840.0, "grad_norm": 2.93709923203383, "language_loss": 0.54046768, "learning_rate": 5.644021040227927e-07, "loss": 0.55899793, "num_input_tokens_seen": 273478620, "step": 12677, "time_per_iteration": 2.8109676837921143 }, { "auxiliary_loss_clip": 0.01052826, "auxiliary_loss_mlp": 0.01035963, "balance_loss_clip": 1.0335747, "balance_loss_mlp": 1.02283812, "epoch": 0.7622425973245153, "flos": 21725668828800.0, "grad_norm": 2.0762274911054184, "language_loss": 0.78760284, "learning_rate": 5.641309683778064e-07, "loss": 0.80849069, "num_input_tokens_seen": 273497635, "step": 12678, "time_per_iteration": 2.860340118408203 }, { "auxiliary_loss_clip": 0.01073918, "auxiliary_loss_mlp": 0.01036673, "balance_loss_clip": 1.0344305, "balance_loss_mlp": 1.02257085, "epoch": 0.7623027205771832, "flos": 19718549683200.0, "grad_norm": 3.9645067236030114, "language_loss": 0.77204514, "learning_rate": 5.638598871811175e-07, "loss": 0.79315102, "num_input_tokens_seen": 273513955, "step": 12679, "time_per_iteration": 2.772916793823242 }, { "auxiliary_loss_clip": 0.01100617, "auxiliary_loss_mlp": 0.01027269, "balance_loss_clip": 1.03917551, "balance_loss_mlp": 1.01456678, "epoch": 0.7623628438298512, "flos": 23988831096960.0, "grad_norm": 1.434526846534088, "language_loss": 0.80099726, "learning_rate": 5.635888604430059e-07, "loss": 0.82227612, "num_input_tokens_seen": 273533970, "step": 12680, "time_per_iteration": 2.7801437377929688 }, { "auxiliary_loss_clip": 0.01089966, "auxiliary_loss_mlp": 0.01032719, "balance_loss_clip": 1.03768969, "balance_loss_mlp": 1.01880169, "epoch": 0.7624229670825191, "flos": 22345702421760.0, "grad_norm": 1.9046696360663191, "language_loss": 0.62818468, "learning_rate": 5.633178881737493e-07, "loss": 0.64941156, "num_input_tokens_seen": 273553090, "step": 12681, "time_per_iteration": 2.8114664554595947 }, { "auxiliary_loss_clip": 0.01076613, "auxiliary_loss_mlp": 0.01031955, "balance_loss_clip": 1.03848743, "balance_loss_mlp": 1.01964092, "epoch": 0.7624830903351871, "flos": 22711775880960.0, "grad_norm": 2.2465025277457755, "language_loss": 0.76199776, "learning_rate": 5.63046970383622e-07, "loss": 0.78308344, "num_input_tokens_seen": 273572460, "step": 12682, "time_per_iteration": 2.8621296882629395 }, { "auxiliary_loss_clip": 0.01085809, "auxiliary_loss_mlp": 0.01032162, "balance_loss_clip": 1.03555107, "balance_loss_mlp": 1.02058053, "epoch": 0.7625432135878552, "flos": 25593714766080.0, "grad_norm": 1.5266925893040741, "language_loss": 0.68380392, "learning_rate": 5.627761070828974e-07, "loss": 0.70498371, "num_input_tokens_seen": 273592815, "step": 12683, "time_per_iteration": 2.804927349090576 }, { "auxiliary_loss_clip": 0.01067779, "auxiliary_loss_mlp": 0.00772982, "balance_loss_clip": 1.03292143, "balance_loss_mlp": 1.00020671, "epoch": 0.7626033368405231, "flos": 23987645948160.0, "grad_norm": 2.022263962104647, "language_loss": 0.83156735, "learning_rate": 5.625052982818472e-07, "loss": 0.84997493, "num_input_tokens_seen": 273611790, "step": 12684, "time_per_iteration": 2.7787985801696777 }, { "auxiliary_loss_clip": 0.0108949, "auxiliary_loss_mlp": 0.01041206, "balance_loss_clip": 1.03807712, "balance_loss_mlp": 1.02769983, "epoch": 0.7626634600931911, "flos": 12599115523200.0, "grad_norm": 2.242764424782362, "language_loss": 0.82618159, "learning_rate": 5.622345439907396e-07, "loss": 0.84748858, "num_input_tokens_seen": 273628340, "step": 12685, "time_per_iteration": 2.735823631286621 }, { "auxiliary_loss_clip": 0.0107975, "auxiliary_loss_mlp": 0.00770301, "balance_loss_clip": 1.03726244, "balance_loss_mlp": 1.00022709, "epoch": 0.762723583345859, "flos": 26322593546880.0, "grad_norm": 2.461504636054881, "language_loss": 0.77635926, "learning_rate": 5.619638442198422e-07, "loss": 0.79485977, "num_input_tokens_seen": 273646585, "step": 12686, "time_per_iteration": 2.906090021133423 }, { "auxiliary_loss_clip": 0.01052651, "auxiliary_loss_mlp": 0.01057311, "balance_loss_clip": 1.03302336, "balance_loss_mlp": 1.0405736, "epoch": 0.762783706598527, "flos": 21907053532800.0, "grad_norm": 1.7909742891247455, "language_loss": 0.72059739, "learning_rate": 5.616931989794198e-07, "loss": 0.74169701, "num_input_tokens_seen": 273665410, "step": 12687, "time_per_iteration": 2.736345052719116 }, { "auxiliary_loss_clip": 0.01084081, "auxiliary_loss_mlp": 0.01042723, "balance_loss_clip": 1.03387547, "balance_loss_mlp": 1.02746391, "epoch": 0.7628438298511949, "flos": 15339782217600.0, "grad_norm": 1.8904556177994511, "language_loss": 0.65018427, "learning_rate": 5.614226082797369e-07, "loss": 0.67145234, "num_input_tokens_seen": 273683035, "step": 12688, "time_per_iteration": 2.7697956562042236 }, { "auxiliary_loss_clip": 0.01101479, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.03997755, "balance_loss_mlp": 1.01574564, "epoch": 0.7629039531038629, "flos": 13006307076480.0, "grad_norm": 3.084065426087135, "language_loss": 0.70538044, "learning_rate": 5.611520721310515e-07, "loss": 0.72667265, "num_input_tokens_seen": 273700130, "step": 12689, "time_per_iteration": 2.9508743286132812 }, { "auxiliary_loss_clip": 0.01081126, "auxiliary_loss_mlp": 0.01040898, "balance_loss_clip": 1.03614342, "balance_loss_mlp": 1.0274868, "epoch": 0.7629640763565309, "flos": 26171660597760.0, "grad_norm": 1.827453823319608, "language_loss": 0.69980061, "learning_rate": 5.608815905436238e-07, "loss": 0.72102082, "num_input_tokens_seen": 273720310, "step": 12690, "time_per_iteration": 2.8916642665863037 }, { "auxiliary_loss_clip": 0.01084164, "auxiliary_loss_mlp": 0.01040929, "balance_loss_clip": 1.03480482, "balance_loss_mlp": 1.02747643, "epoch": 0.7630241996091989, "flos": 36793713680640.0, "grad_norm": 1.455347798519734, "language_loss": 0.69115114, "learning_rate": 5.606111635277109e-07, "loss": 0.71240205, "num_input_tokens_seen": 273744475, "step": 12691, "time_per_iteration": 4.387454032897949 }, { "auxiliary_loss_clip": 0.01093867, "auxiliary_loss_mlp": 0.01037257, "balance_loss_clip": 1.03709197, "balance_loss_mlp": 1.02576542, "epoch": 0.7630843228618668, "flos": 21835160461440.0, "grad_norm": 1.950930402576883, "language_loss": 0.81791067, "learning_rate": 5.603407910935662e-07, "loss": 0.83922184, "num_input_tokens_seen": 273764635, "step": 12692, "time_per_iteration": 5.863187551498413 }, { "auxiliary_loss_clip": 0.01078564, "auxiliary_loss_mlp": 0.010271, "balance_loss_clip": 1.04068136, "balance_loss_mlp": 1.01536989, "epoch": 0.7631444461145348, "flos": 12640520926080.0, "grad_norm": 2.677454083590648, "language_loss": 0.77390575, "learning_rate": 5.600704732514438e-07, "loss": 0.79496241, "num_input_tokens_seen": 273780115, "step": 12693, "time_per_iteration": 2.8327314853668213 }, { "auxiliary_loss_clip": 0.0107301, "auxiliary_loss_mlp": 0.01034355, "balance_loss_clip": 1.03885221, "balance_loss_mlp": 1.02155745, "epoch": 0.7632045693672027, "flos": 16836610798080.0, "grad_norm": 3.1941491202097523, "language_loss": 0.72766727, "learning_rate": 5.598002100115933e-07, "loss": 0.74874091, "num_input_tokens_seen": 273796605, "step": 12694, "time_per_iteration": 2.771289587020874 }, { "auxiliary_loss_clip": 0.01096742, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.03683043, "balance_loss_mlp": 1.01703393, "epoch": 0.7632646926198707, "flos": 22017335264640.0, "grad_norm": 1.9917055644917767, "language_loss": 0.70419681, "learning_rate": 5.595300013842625e-07, "loss": 0.72545701, "num_input_tokens_seen": 273816515, "step": 12695, "time_per_iteration": 2.616629123687744 }, { "auxiliary_loss_clip": 0.01109838, "auxiliary_loss_mlp": 0.01031794, "balance_loss_clip": 1.03797019, "balance_loss_mlp": 1.0198853, "epoch": 0.7633248158725388, "flos": 23114011357440.0, "grad_norm": 1.5503240571511046, "language_loss": 0.72249472, "learning_rate": 5.592598473796985e-07, "loss": 0.74391103, "num_input_tokens_seen": 273837060, "step": 12696, "time_per_iteration": 2.7050669193267822 }, { "auxiliary_loss_clip": 0.01051627, "auxiliary_loss_mlp": 0.01040707, "balance_loss_clip": 1.03538561, "balance_loss_mlp": 1.02642596, "epoch": 0.7633849391252067, "flos": 10889839952640.0, "grad_norm": 2.077421826663572, "language_loss": 0.71310508, "learning_rate": 5.589897480081453e-07, "loss": 0.73402846, "num_input_tokens_seen": 273853365, "step": 12697, "time_per_iteration": 4.246352672576904 }, { "auxiliary_loss_clip": 0.01077141, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.0388602, "balance_loss_mlp": 1.02219009, "epoch": 0.7634450623778747, "flos": 20994168355200.0, "grad_norm": 3.071082049112887, "language_loss": 0.66922784, "learning_rate": 5.587197032798461e-07, "loss": 0.69034344, "num_input_tokens_seen": 273870750, "step": 12698, "time_per_iteration": 2.7623679637908936 }, { "auxiliary_loss_clip": 0.01097288, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.03538871, "balance_loss_mlp": 1.01636612, "epoch": 0.7635051856305426, "flos": 18882046776960.0, "grad_norm": 1.6894035015942557, "language_loss": 0.72252488, "learning_rate": 5.5844971320504e-07, "loss": 0.74378926, "num_input_tokens_seen": 273890890, "step": 12699, "time_per_iteration": 2.681185483932495 }, { "auxiliary_loss_clip": 0.01088089, "auxiliary_loss_mlp": 0.01032373, "balance_loss_clip": 1.03612185, "balance_loss_mlp": 1.02065527, "epoch": 0.7635653088832106, "flos": 34786989584640.0, "grad_norm": 1.7379546952285325, "language_loss": 0.73000193, "learning_rate": 5.581797777939648e-07, "loss": 0.75120658, "num_input_tokens_seen": 273914015, "step": 12700, "time_per_iteration": 2.788801908493042 }, { "auxiliary_loss_clip": 0.01109919, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.03708696, "balance_loss_mlp": 1.01822746, "epoch": 0.7636254321358785, "flos": 23178434400000.0, "grad_norm": 2.5171117546055717, "language_loss": 0.69465768, "learning_rate": 5.579098970568574e-07, "loss": 0.71606004, "num_input_tokens_seen": 273927415, "step": 12701, "time_per_iteration": 2.6201059818267822 }, { "auxiliary_loss_clip": 0.01083521, "auxiliary_loss_mlp": 0.01031087, "balance_loss_clip": 1.03899217, "balance_loss_mlp": 1.01891518, "epoch": 0.7636855553885465, "flos": 21325229032320.0, "grad_norm": 2.215440552723354, "language_loss": 0.64664185, "learning_rate": 5.576400710039508e-07, "loss": 0.66778791, "num_input_tokens_seen": 273946690, "step": 12702, "time_per_iteration": 2.7970054149627686 }, { "auxiliary_loss_clip": 0.01079185, "auxiliary_loss_mlp": 0.01033415, "balance_loss_clip": 1.03836131, "balance_loss_mlp": 1.02095747, "epoch": 0.7637456786412145, "flos": 28658079849600.0, "grad_norm": 1.9784000831539899, "language_loss": 0.66083431, "learning_rate": 5.57370299645477e-07, "loss": 0.68196028, "num_input_tokens_seen": 273966870, "step": 12703, "time_per_iteration": 2.822849750518799 }, { "auxiliary_loss_clip": 0.01087834, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.03842688, "balance_loss_mlp": 1.01438999, "epoch": 0.7638058018938825, "flos": 21907269014400.0, "grad_norm": 2.027090239685764, "language_loss": 0.83859146, "learning_rate": 5.571005829916668e-07, "loss": 0.85973918, "num_input_tokens_seen": 273986360, "step": 12704, "time_per_iteration": 2.728527784347534 }, { "auxiliary_loss_clip": 0.01088663, "auxiliary_loss_mlp": 0.01032796, "balance_loss_clip": 1.03736877, "balance_loss_mlp": 1.02039814, "epoch": 0.7638659251465504, "flos": 29643899592960.0, "grad_norm": 1.895547997363001, "language_loss": 0.67812586, "learning_rate": 5.568309210527469e-07, "loss": 0.69934046, "num_input_tokens_seen": 274009745, "step": 12705, "time_per_iteration": 2.818378448486328 }, { "auxiliary_loss_clip": 0.01083042, "auxiliary_loss_mlp": 0.01032131, "balance_loss_clip": 1.03550816, "balance_loss_mlp": 1.01972699, "epoch": 0.7639260483992184, "flos": 26141172929280.0, "grad_norm": 1.7310921121136604, "language_loss": 0.73945439, "learning_rate": 5.565613138389427e-07, "loss": 0.76060611, "num_input_tokens_seen": 274028775, "step": 12706, "time_per_iteration": 2.7738003730773926 }, { "auxiliary_loss_clip": 0.0109458, "auxiliary_loss_mlp": 0.01037611, "balance_loss_clip": 1.03670621, "balance_loss_mlp": 1.02431297, "epoch": 0.7639861716518863, "flos": 20156695781760.0, "grad_norm": 2.5805411754522396, "language_loss": 0.78420258, "learning_rate": 5.562917613604781e-07, "loss": 0.80552453, "num_input_tokens_seen": 274047520, "step": 12707, "time_per_iteration": 2.785919666290283 }, { "auxiliary_loss_clip": 0.01083532, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.03674436, "balance_loss_mlp": 1.01594281, "epoch": 0.7640462949045543, "flos": 18583125793920.0, "grad_norm": 1.8763992467573365, "language_loss": 0.79923272, "learning_rate": 5.560222636275751e-07, "loss": 0.82035094, "num_input_tokens_seen": 274065350, "step": 12708, "time_per_iteration": 2.7112326622009277 }, { "auxiliary_loss_clip": 0.0102089, "auxiliary_loss_mlp": 0.00999756, "balance_loss_clip": 1.0106082, "balance_loss_mlp": 0.99848616, "epoch": 0.7641064181572224, "flos": 68321991646080.0, "grad_norm": 0.8077298698173723, "language_loss": 0.56427336, "learning_rate": 5.557528206504521e-07, "loss": 0.58447981, "num_input_tokens_seen": 274122315, "step": 12709, "time_per_iteration": 3.2111401557922363 }, { "auxiliary_loss_clip": 0.01098648, "auxiliary_loss_mlp": 0.01040257, "balance_loss_clip": 1.03582978, "balance_loss_mlp": 1.02636278, "epoch": 0.7641665414098903, "flos": 17968982031360.0, "grad_norm": 1.9630774322237245, "language_loss": 0.63484347, "learning_rate": 5.554834324393271e-07, "loss": 0.65623254, "num_input_tokens_seen": 274140555, "step": 12710, "time_per_iteration": 2.685795545578003 }, { "auxiliary_loss_clip": 0.01062185, "auxiliary_loss_mlp": 0.00771699, "balance_loss_clip": 1.03377032, "balance_loss_mlp": 1.00016749, "epoch": 0.7642266646625583, "flos": 21252078984960.0, "grad_norm": 2.5143918151768867, "language_loss": 0.64498585, "learning_rate": 5.552140990044154e-07, "loss": 0.66332471, "num_input_tokens_seen": 274161125, "step": 12711, "time_per_iteration": 2.845017671585083 }, { "auxiliary_loss_clip": 0.01088311, "auxiliary_loss_mlp": 0.01037404, "balance_loss_clip": 1.03707993, "balance_loss_mlp": 1.02514362, "epoch": 0.7642867879152262, "flos": 22747794243840.0, "grad_norm": 1.7149688745487186, "language_loss": 0.72759664, "learning_rate": 5.549448203559293e-07, "loss": 0.7488538, "num_input_tokens_seen": 274180835, "step": 12712, "time_per_iteration": 2.7211430072784424 }, { "auxiliary_loss_clip": 0.01077131, "auxiliary_loss_mlp": 0.01032428, "balance_loss_clip": 1.03835392, "balance_loss_mlp": 1.02084625, "epoch": 0.7643469111678942, "flos": 23332132696320.0, "grad_norm": 2.218446959632987, "language_loss": 0.80380988, "learning_rate": 5.546755965040804e-07, "loss": 0.82490551, "num_input_tokens_seen": 274201190, "step": 12713, "time_per_iteration": 2.822138786315918 }, { "auxiliary_loss_clip": 0.01102023, "auxiliary_loss_mlp": 0.00771212, "balance_loss_clip": 1.03739047, "balance_loss_mlp": 1.00028956, "epoch": 0.7644070344205621, "flos": 19857092440320.0, "grad_norm": 2.084525894573783, "language_loss": 0.83132589, "learning_rate": 5.544064274590776e-07, "loss": 0.85005832, "num_input_tokens_seen": 274217595, "step": 12714, "time_per_iteration": 2.67500638961792 }, { "auxiliary_loss_clip": 0.01104132, "auxiliary_loss_mlp": 0.01037809, "balance_loss_clip": 1.0384692, "balance_loss_mlp": 1.02498782, "epoch": 0.7644671576732301, "flos": 22090628966400.0, "grad_norm": 1.7447994690858495, "language_loss": 0.73020244, "learning_rate": 5.541373132311287e-07, "loss": 0.75162184, "num_input_tokens_seen": 274237885, "step": 12715, "time_per_iteration": 2.705496072769165 }, { "auxiliary_loss_clip": 0.0106908, "auxiliary_loss_mlp": 0.01029403, "balance_loss_clip": 1.03376102, "balance_loss_mlp": 1.01651025, "epoch": 0.7645272809258981, "flos": 25481421872640.0, "grad_norm": 1.9750549289299242, "language_loss": 0.63063681, "learning_rate": 5.538682538304376e-07, "loss": 0.65162164, "num_input_tokens_seen": 274258820, "step": 12716, "time_per_iteration": 2.7983617782592773 }, { "auxiliary_loss_clip": 0.01115577, "auxiliary_loss_mlp": 0.01037115, "balance_loss_clip": 1.03981853, "balance_loss_mlp": 1.02357841, "epoch": 0.7645874041785661, "flos": 21541877913600.0, "grad_norm": 1.536427490036212, "language_loss": 0.79740059, "learning_rate": 5.535992492672068e-07, "loss": 0.81892753, "num_input_tokens_seen": 274278835, "step": 12717, "time_per_iteration": 2.595195770263672 }, { "auxiliary_loss_clip": 0.01110878, "auxiliary_loss_mlp": 0.01037171, "balance_loss_clip": 1.03890347, "balance_loss_mlp": 1.02481461, "epoch": 0.764647527431234, "flos": 20630896156800.0, "grad_norm": 2.30472579589713, "language_loss": 0.66033196, "learning_rate": 5.53330299551638e-07, "loss": 0.68181252, "num_input_tokens_seen": 274297110, "step": 12718, "time_per_iteration": 2.673990488052368 }, { "auxiliary_loss_clip": 0.01063441, "auxiliary_loss_mlp": 0.01036496, "balance_loss_clip": 1.03585815, "balance_loss_mlp": 1.02499259, "epoch": 0.764707650683902, "flos": 21434074220160.0, "grad_norm": 2.1613863310626287, "language_loss": 0.77098262, "learning_rate": 5.530614046939286e-07, "loss": 0.791982, "num_input_tokens_seen": 274315610, "step": 12719, "time_per_iteration": 2.6510918140411377 }, { "auxiliary_loss_clip": 0.01112525, "auxiliary_loss_mlp": 0.01029144, "balance_loss_clip": 1.03881288, "balance_loss_mlp": 1.01615012, "epoch": 0.7647677739365699, "flos": 22711201263360.0, "grad_norm": 2.267731943336326, "language_loss": 0.7029593, "learning_rate": 5.527925647042754e-07, "loss": 0.72437602, "num_input_tokens_seen": 274333975, "step": 12720, "time_per_iteration": 2.5991692543029785 }, { "auxiliary_loss_clip": 0.01079824, "auxiliary_loss_mlp": 0.01040879, "balance_loss_clip": 1.03855467, "balance_loss_mlp": 1.02823687, "epoch": 0.7648278971892379, "flos": 21324115710720.0, "grad_norm": 1.5967062450845435, "language_loss": 0.73703921, "learning_rate": 5.52523779592875e-07, "loss": 0.7582463, "num_input_tokens_seen": 274353695, "step": 12721, "time_per_iteration": 2.764606237411499 }, { "auxiliary_loss_clip": 0.01070414, "auxiliary_loss_mlp": 0.01030705, "balance_loss_clip": 1.03494334, "balance_loss_mlp": 1.01805067, "epoch": 0.764888020441906, "flos": 20667345482880.0, "grad_norm": 1.6944622449827433, "language_loss": 0.73529649, "learning_rate": 5.522550493699163e-07, "loss": 0.75630772, "num_input_tokens_seen": 274371120, "step": 12722, "time_per_iteration": 2.7863218784332275 }, { "auxiliary_loss_clip": 0.01099467, "auxiliary_loss_mlp": 0.01038218, "balance_loss_clip": 1.03691196, "balance_loss_mlp": 1.02573085, "epoch": 0.7649481436945739, "flos": 25082526360960.0, "grad_norm": 1.8664873966014532, "language_loss": 0.74043649, "learning_rate": 5.519863740455912e-07, "loss": 0.76181328, "num_input_tokens_seen": 274389665, "step": 12723, "time_per_iteration": 2.6984498500823975 }, { "auxiliary_loss_clip": 0.01111926, "auxiliary_loss_mlp": 0.0103197, "balance_loss_clip": 1.03712893, "balance_loss_mlp": 1.01897049, "epoch": 0.7650082669472419, "flos": 24900890261760.0, "grad_norm": 1.9718177009092177, "language_loss": 0.73098785, "learning_rate": 5.517177536300881e-07, "loss": 0.75242674, "num_input_tokens_seen": 274408750, "step": 12724, "time_per_iteration": 2.723292112350464 }, { "auxiliary_loss_clip": 0.0109622, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.03798413, "balance_loss_mlp": 1.01521456, "epoch": 0.7650683901999098, "flos": 14647388676480.0, "grad_norm": 1.8049167073820385, "language_loss": 0.83982503, "learning_rate": 5.514491881335935e-07, "loss": 0.86105955, "num_input_tokens_seen": 274424600, "step": 12725, "time_per_iteration": 2.6900579929351807 }, { "auxiliary_loss_clip": 0.01071599, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.03815186, "balance_loss_mlp": 1.01962614, "epoch": 0.7651285134525778, "flos": 26352434770560.0, "grad_norm": 1.764771346840138, "language_loss": 0.77535796, "learning_rate": 5.511806775662901e-07, "loss": 0.79640734, "num_input_tokens_seen": 274443075, "step": 12726, "time_per_iteration": 2.7554757595062256 }, { "auxiliary_loss_clip": 0.01098675, "auxiliary_loss_mlp": 0.0103653, "balance_loss_clip": 1.03659284, "balance_loss_mlp": 1.0239116, "epoch": 0.7651886367052457, "flos": 26646866553600.0, "grad_norm": 1.727505900288767, "language_loss": 0.70817876, "learning_rate": 5.509122219383615e-07, "loss": 0.72953087, "num_input_tokens_seen": 274463240, "step": 12727, "time_per_iteration": 2.679713249206543 }, { "auxiliary_loss_clip": 0.0110535, "auxiliary_loss_mlp": 0.01031096, "balance_loss_clip": 1.03530371, "balance_loss_mlp": 1.01887083, "epoch": 0.7652487599579137, "flos": 25702847262720.0, "grad_norm": 1.645567589950576, "language_loss": 0.79781538, "learning_rate": 5.506438212599864e-07, "loss": 0.81917983, "num_input_tokens_seen": 274482750, "step": 12728, "time_per_iteration": 2.6556482315063477 }, { "auxiliary_loss_clip": 0.01112141, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.03871763, "balance_loss_mlp": 1.01615465, "epoch": 0.7653088832105817, "flos": 28585576247040.0, "grad_norm": 2.018168225354916, "language_loss": 0.55207121, "learning_rate": 5.503754755413424e-07, "loss": 0.57349181, "num_input_tokens_seen": 274503545, "step": 12729, "time_per_iteration": 2.656604290008545 }, { "auxiliary_loss_clip": 0.01087792, "auxiliary_loss_mlp": 0.00770692, "balance_loss_clip": 1.03700304, "balance_loss_mlp": 1.00016689, "epoch": 0.7653690064632497, "flos": 23366750428800.0, "grad_norm": 2.0285553204704914, "language_loss": 0.78009534, "learning_rate": 5.501071847926055e-07, "loss": 0.79868019, "num_input_tokens_seen": 274523825, "step": 12730, "time_per_iteration": 4.308157920837402 }, { "auxiliary_loss_clip": 0.01104921, "auxiliary_loss_mlp": 0.01038983, "balance_loss_clip": 1.04124045, "balance_loss_mlp": 1.02547646, "epoch": 0.7654291297159176, "flos": 15773905992960.0, "grad_norm": 1.8100841028281673, "language_loss": 0.69162709, "learning_rate": 5.498389490239495e-07, "loss": 0.7130661, "num_input_tokens_seen": 274541625, "step": 12731, "time_per_iteration": 5.375198841094971 }, { "auxiliary_loss_clip": 0.0111224, "auxiliary_loss_mlp": 0.01032378, "balance_loss_clip": 1.03824425, "balance_loss_mlp": 1.0195576, "epoch": 0.7654892529685856, "flos": 18033800123520.0, "grad_norm": 2.185341705177071, "language_loss": 0.70105004, "learning_rate": 5.495707682455471e-07, "loss": 0.72249627, "num_input_tokens_seen": 274557580, "step": 12732, "time_per_iteration": 4.1254401206970215 }, { "auxiliary_loss_clip": 0.01092112, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.0373385, "balance_loss_mlp": 1.01429737, "epoch": 0.7655493762212535, "flos": 27236017428480.0, "grad_norm": 1.4842742362274353, "language_loss": 0.78410125, "learning_rate": 5.493026424675653e-07, "loss": 0.8052972, "num_input_tokens_seen": 274578135, "step": 12733, "time_per_iteration": 2.7428579330444336 }, { "auxiliary_loss_clip": 0.0109795, "auxiliary_loss_mlp": 0.01031014, "balance_loss_clip": 1.03692389, "balance_loss_mlp": 1.0184319, "epoch": 0.7656094994739215, "flos": 20773964027520.0, "grad_norm": 1.7566510390792163, "language_loss": 0.7753557, "learning_rate": 5.490345717001726e-07, "loss": 0.79664528, "num_input_tokens_seen": 274595655, "step": 12734, "time_per_iteration": 2.7525999546051025 }, { "auxiliary_loss_clip": 0.01085843, "auxiliary_loss_mlp": 0.01034242, "balance_loss_clip": 1.03541505, "balance_loss_mlp": 1.01981783, "epoch": 0.7656696227265896, "flos": 23039245198080.0, "grad_norm": 1.5677045475604683, "language_loss": 0.73221684, "learning_rate": 5.48766555953535e-07, "loss": 0.75341773, "num_input_tokens_seen": 274616305, "step": 12735, "time_per_iteration": 2.7425713539123535 }, { "auxiliary_loss_clip": 0.01081818, "auxiliary_loss_mlp": 0.01035075, "balance_loss_clip": 1.03768682, "balance_loss_mlp": 1.02273107, "epoch": 0.7657297459792575, "flos": 27525636789120.0, "grad_norm": 1.7042118812882554, "language_loss": 0.72533989, "learning_rate": 5.484985952378145e-07, "loss": 0.74650872, "num_input_tokens_seen": 274638110, "step": 12736, "time_per_iteration": 4.268921852111816 }, { "auxiliary_loss_clip": 0.01100818, "auxiliary_loss_mlp": 0.00771184, "balance_loss_clip": 1.0399543, "balance_loss_mlp": 1.00027192, "epoch": 0.7657898692319255, "flos": 17128456801920.0, "grad_norm": 2.232664044830526, "language_loss": 0.77698004, "learning_rate": 5.482306895631728e-07, "loss": 0.79570007, "num_input_tokens_seen": 274656565, "step": 12737, "time_per_iteration": 2.751887321472168 }, { "auxiliary_loss_clip": 0.0108412, "auxiliary_loss_mlp": 0.01034529, "balance_loss_clip": 1.03502047, "balance_loss_mlp": 1.02128458, "epoch": 0.7658499924845934, "flos": 21465747037440.0, "grad_norm": 1.8163284528378292, "language_loss": 0.76455462, "learning_rate": 5.479628389397699e-07, "loss": 0.78574109, "num_input_tokens_seen": 274674215, "step": 12738, "time_per_iteration": 2.7251851558685303 }, { "auxiliary_loss_clip": 0.01092339, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.03848684, "balance_loss_mlp": 1.01825941, "epoch": 0.7659101157372614, "flos": 29496665744640.0, "grad_norm": 1.9441100679422159, "language_loss": 0.62250507, "learning_rate": 5.476950433777603e-07, "loss": 0.64373976, "num_input_tokens_seen": 274693445, "step": 12739, "time_per_iteration": 2.858171224594116 }, { "auxiliary_loss_clip": 0.01112469, "auxiliary_loss_mlp": 0.01035738, "balance_loss_clip": 1.03928363, "balance_loss_mlp": 1.02203465, "epoch": 0.7659702389899293, "flos": 18551812112640.0, "grad_norm": 2.47113097275276, "language_loss": 0.79031968, "learning_rate": 5.474273028873004e-07, "loss": 0.81180167, "num_input_tokens_seen": 274712815, "step": 12740, "time_per_iteration": 2.624732732772827 }, { "auxiliary_loss_clip": 0.01100888, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.03686976, "balance_loss_mlp": 1.01987791, "epoch": 0.7660303622425974, "flos": 23549176627200.0, "grad_norm": 1.653199646827083, "language_loss": 0.65173864, "learning_rate": 5.471596174785429e-07, "loss": 0.67307615, "num_input_tokens_seen": 274732690, "step": 12741, "time_per_iteration": 2.716336488723755 }, { "auxiliary_loss_clip": 0.01083513, "auxiliary_loss_mlp": 0.01030711, "balance_loss_clip": 1.03482628, "balance_loss_mlp": 1.0174545, "epoch": 0.7660904854952653, "flos": 18916736336640.0, "grad_norm": 1.544754015503659, "language_loss": 0.75767601, "learning_rate": 5.468919871616386e-07, "loss": 0.77881825, "num_input_tokens_seen": 274752460, "step": 12742, "time_per_iteration": 2.7747738361358643 }, { "auxiliary_loss_clip": 0.01086511, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.03885317, "balance_loss_mlp": 1.01983905, "epoch": 0.7661506087479333, "flos": 23147515768320.0, "grad_norm": 1.4566796365103731, "language_loss": 0.76655585, "learning_rate": 5.46624411946736e-07, "loss": 0.78773808, "num_input_tokens_seen": 274773070, "step": 12743, "time_per_iteration": 2.780097484588623 }, { "auxiliary_loss_clip": 0.01085441, "auxiliary_loss_mlp": 0.01034742, "balance_loss_clip": 1.03478014, "balance_loss_mlp": 1.02236819, "epoch": 0.7662107320006012, "flos": 17565776887680.0, "grad_norm": 1.917782267543357, "language_loss": 0.74838865, "learning_rate": 5.463568918439805e-07, "loss": 0.76959044, "num_input_tokens_seen": 274790220, "step": 12744, "time_per_iteration": 2.8596222400665283 }, { "auxiliary_loss_clip": 0.01099606, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.03648257, "balance_loss_mlp": 1.02051127, "epoch": 0.7662708552532692, "flos": 22303075956480.0, "grad_norm": 2.0417086586666424, "language_loss": 0.71049422, "learning_rate": 5.460894268635181e-07, "loss": 0.73182726, "num_input_tokens_seen": 274805095, "step": 12745, "time_per_iteration": 2.7712717056274414 }, { "auxiliary_loss_clip": 0.01095184, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.03534567, "balance_loss_mlp": 1.0241797, "epoch": 0.7663309785059371, "flos": 15742053607680.0, "grad_norm": 2.301646519557661, "language_loss": 0.77083957, "learning_rate": 5.458220170154896e-07, "loss": 0.79217947, "num_input_tokens_seen": 274821800, "step": 12746, "time_per_iteration": 2.6804726123809814 }, { "auxiliary_loss_clip": 0.0100528, "auxiliary_loss_mlp": 0.01001059, "balance_loss_clip": 1.01132298, "balance_loss_mlp": 0.99997419, "epoch": 0.7663911017586051, "flos": 62163312514560.0, "grad_norm": 0.6620577659541201, "language_loss": 0.56773937, "learning_rate": 5.455546623100362e-07, "loss": 0.58780277, "num_input_tokens_seen": 274886970, "step": 12747, "time_per_iteration": 3.3290786743164062 }, { "auxiliary_loss_clip": 0.01108005, "auxiliary_loss_mlp": 0.01035791, "balance_loss_clip": 1.03717351, "balance_loss_mlp": 1.02456689, "epoch": 0.7664512250112732, "flos": 26506025326080.0, "grad_norm": 1.9151583390314333, "language_loss": 0.72503966, "learning_rate": 5.452873627572956e-07, "loss": 0.7464776, "num_input_tokens_seen": 274907240, "step": 12748, "time_per_iteration": 2.730177640914917 }, { "auxiliary_loss_clip": 0.01074476, "auxiliary_loss_mlp": 0.01028824, "balance_loss_clip": 1.03368735, "balance_loss_mlp": 1.01592588, "epoch": 0.7665113482639411, "flos": 16249542912000.0, "grad_norm": 1.8433426874848031, "language_loss": 0.69247651, "learning_rate": 5.450201183674052e-07, "loss": 0.7135095, "num_input_tokens_seen": 274924650, "step": 12749, "time_per_iteration": 2.755204439163208 }, { "auxiliary_loss_clip": 0.01101353, "auxiliary_loss_mlp": 0.01030402, "balance_loss_clip": 1.03804362, "balance_loss_mlp": 1.01727748, "epoch": 0.7665714715166091, "flos": 27197880163200.0, "grad_norm": 1.535641047844791, "language_loss": 0.73516762, "learning_rate": 5.447529291504967e-07, "loss": 0.75648522, "num_input_tokens_seen": 274944550, "step": 12750, "time_per_iteration": 2.7742607593536377 }, { "auxiliary_loss_clip": 0.01097021, "auxiliary_loss_mlp": 0.01031967, "balance_loss_clip": 1.0379684, "balance_loss_mlp": 1.02008176, "epoch": 0.766631594769277, "flos": 21067785279360.0, "grad_norm": 2.3156427112447147, "language_loss": 0.76064527, "learning_rate": 5.444857951167026e-07, "loss": 0.78193521, "num_input_tokens_seen": 274961330, "step": 12751, "time_per_iteration": 2.730836868286133 }, { "auxiliary_loss_clip": 0.01077429, "auxiliary_loss_mlp": 0.01037666, "balance_loss_clip": 1.03694248, "balance_loss_mlp": 1.02451706, "epoch": 0.766691718021945, "flos": 24097963593600.0, "grad_norm": 1.9738925392982969, "language_loss": 0.6149745, "learning_rate": 5.442187162761537e-07, "loss": 0.63612545, "num_input_tokens_seen": 274981655, "step": 12752, "time_per_iteration": 2.869851589202881 }, { "auxiliary_loss_clip": 0.01102451, "auxiliary_loss_mlp": 0.01036291, "balance_loss_clip": 1.03904963, "balance_loss_mlp": 1.02302337, "epoch": 0.7667518412746129, "flos": 23440654661760.0, "grad_norm": 1.931365168470797, "language_loss": 0.69503748, "learning_rate": 5.439516926389767e-07, "loss": 0.71642488, "num_input_tokens_seen": 274999970, "step": 12753, "time_per_iteration": 2.7476491928100586 }, { "auxiliary_loss_clip": 0.01101717, "auxiliary_loss_mlp": 0.01036587, "balance_loss_clip": 1.03879189, "balance_loss_mlp": 1.02405787, "epoch": 0.766811964527281, "flos": 18148786536960.0, "grad_norm": 2.611222297039761, "language_loss": 0.62583512, "learning_rate": 5.436847242152971e-07, "loss": 0.64721823, "num_input_tokens_seen": 275015805, "step": 12754, "time_per_iteration": 2.7371304035186768 }, { "auxiliary_loss_clip": 0.01110914, "auxiliary_loss_mlp": 0.01030173, "balance_loss_clip": 1.03996325, "balance_loss_mlp": 1.01831782, "epoch": 0.7668720877799489, "flos": 19536051657600.0, "grad_norm": 2.549051131454572, "language_loss": 0.80213803, "learning_rate": 5.434178110152401e-07, "loss": 0.82354891, "num_input_tokens_seen": 275031810, "step": 12755, "time_per_iteration": 2.643878936767578 }, { "auxiliary_loss_clip": 0.01110814, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.03913355, "balance_loss_mlp": 1.01825666, "epoch": 0.7669322110326169, "flos": 22674320974080.0, "grad_norm": 2.28671666205893, "language_loss": 0.70240182, "learning_rate": 5.431509530489242e-07, "loss": 0.72381282, "num_input_tokens_seen": 275049325, "step": 12756, "time_per_iteration": 2.666398763656616 }, { "auxiliary_loss_clip": 0.01101033, "auxiliary_loss_mlp": 0.0103684, "balance_loss_clip": 1.03897476, "balance_loss_mlp": 1.02491951, "epoch": 0.7669923342852848, "flos": 26469396432000.0, "grad_norm": 1.5126125205867516, "language_loss": 0.70042777, "learning_rate": 5.428841503264706e-07, "loss": 0.72180653, "num_input_tokens_seen": 275070865, "step": 12757, "time_per_iteration": 2.9036061763763428 }, { "auxiliary_loss_clip": 0.01090769, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.03925812, "balance_loss_mlp": 1.02609968, "epoch": 0.7670524575379528, "flos": 22856136641280.0, "grad_norm": 1.9623271762553347, "language_loss": 0.76281571, "learning_rate": 5.426174028579955e-07, "loss": 0.7841171, "num_input_tokens_seen": 275088015, "step": 12758, "time_per_iteration": 2.7477500438690186 }, { "auxiliary_loss_clip": 0.0109864, "auxiliary_loss_mlp": 0.01041128, "balance_loss_clip": 1.03716195, "balance_loss_mlp": 1.0282712, "epoch": 0.7671125807906207, "flos": 22452141398400.0, "grad_norm": 1.933344061408033, "language_loss": 0.76319116, "learning_rate": 5.423507106536156e-07, "loss": 0.78458881, "num_input_tokens_seen": 275106975, "step": 12759, "time_per_iteration": 2.714374303817749 }, { "auxiliary_loss_clip": 0.01087695, "auxiliary_loss_mlp": 0.01028999, "balance_loss_clip": 1.03469515, "balance_loss_mlp": 1.0170604, "epoch": 0.7671727040432887, "flos": 35371543518720.0, "grad_norm": 4.630848621895134, "language_loss": 0.67929637, "learning_rate": 5.420840737234425e-07, "loss": 0.70046335, "num_input_tokens_seen": 275129560, "step": 12760, "time_per_iteration": 2.7753570079803467 }, { "auxiliary_loss_clip": 0.01089951, "auxiliary_loss_mlp": 0.01034392, "balance_loss_clip": 1.03797793, "balance_loss_mlp": 1.02147603, "epoch": 0.7672328272959568, "flos": 22494947431680.0, "grad_norm": 1.455109874708046, "language_loss": 0.79299426, "learning_rate": 5.418174920775871e-07, "loss": 0.81423771, "num_input_tokens_seen": 275151180, "step": 12761, "time_per_iteration": 2.7769343852996826 }, { "auxiliary_loss_clip": 0.01085141, "auxiliary_loss_mlp": 0.01035009, "balance_loss_clip": 1.03607702, "balance_loss_mlp": 1.022295, "epoch": 0.7672929505486247, "flos": 22815557251200.0, "grad_norm": 18.920071863703896, "language_loss": 0.66145515, "learning_rate": 5.415509657261589e-07, "loss": 0.68265665, "num_input_tokens_seen": 275170605, "step": 12762, "time_per_iteration": 2.8406293392181396 }, { "auxiliary_loss_clip": 0.01101121, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.03835821, "balance_loss_mlp": 1.02105296, "epoch": 0.7673530738012927, "flos": 20338834671360.0, "grad_norm": 1.6976408594267334, "language_loss": 0.74313831, "learning_rate": 5.412844946792639e-07, "loss": 0.76449203, "num_input_tokens_seen": 275188750, "step": 12763, "time_per_iteration": 2.6841235160827637 }, { "auxiliary_loss_clip": 0.01088871, "auxiliary_loss_mlp": 0.01033223, "balance_loss_clip": 1.03973687, "balance_loss_mlp": 1.02024698, "epoch": 0.7674131970539606, "flos": 34933576988160.0, "grad_norm": 1.693482308646493, "language_loss": 0.70655918, "learning_rate": 5.410180789470067e-07, "loss": 0.7277801, "num_input_tokens_seen": 275211365, "step": 12764, "time_per_iteration": 2.821410894393921 }, { "auxiliary_loss_clip": 0.01101312, "auxiliary_loss_mlp": 0.01031147, "balance_loss_clip": 1.03925323, "balance_loss_mlp": 1.01875496, "epoch": 0.7674733203066286, "flos": 28328850766080.0, "grad_norm": 1.8643050168393442, "language_loss": 0.69511282, "learning_rate": 5.40751718539491e-07, "loss": 0.7164374, "num_input_tokens_seen": 275231670, "step": 12765, "time_per_iteration": 2.7457258701324463 }, { "auxiliary_loss_clip": 0.01081052, "auxiliary_loss_mlp": 0.01029756, "balance_loss_clip": 1.03556418, "balance_loss_mlp": 1.01865792, "epoch": 0.7675334435592965, "flos": 16289727252480.0, "grad_norm": 3.667092334043392, "language_loss": 0.60817224, "learning_rate": 5.404854134668162e-07, "loss": 0.62928033, "num_input_tokens_seen": 275249425, "step": 12766, "time_per_iteration": 2.6500067710876465 }, { "auxiliary_loss_clip": 0.01001024, "auxiliary_loss_mlp": 0.01013385, "balance_loss_clip": 1.01323843, "balance_loss_mlp": 1.01216352, "epoch": 0.7675935668119646, "flos": 64826232220800.0, "grad_norm": 0.7347382071618644, "language_loss": 0.60767788, "learning_rate": 5.402191637390803e-07, "loss": 0.62782198, "num_input_tokens_seen": 275312485, "step": 12767, "time_per_iteration": 3.39412260055542 }, { "auxiliary_loss_clip": 0.01089304, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.04006386, "balance_loss_mlp": 1.01521647, "epoch": 0.7676536900646325, "flos": 22675398382080.0, "grad_norm": 1.6451651301272818, "language_loss": 0.69793016, "learning_rate": 5.399529693663801e-07, "loss": 0.71909499, "num_input_tokens_seen": 275331680, "step": 12768, "time_per_iteration": 2.730433464050293 }, { "auxiliary_loss_clip": 0.01106486, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.0407027, "balance_loss_mlp": 1.0239104, "epoch": 0.7677138133173005, "flos": 26939682224640.0, "grad_norm": 1.8343046170579347, "language_loss": 0.71094149, "learning_rate": 5.3968683035881e-07, "loss": 0.73237407, "num_input_tokens_seen": 275351615, "step": 12769, "time_per_iteration": 4.170667409896851 }, { "auxiliary_loss_clip": 0.01103072, "auxiliary_loss_mlp": 0.01029544, "balance_loss_clip": 1.04003, "balance_loss_mlp": 1.01668179, "epoch": 0.7677739365699684, "flos": 23799545400960.0, "grad_norm": 1.983209153557694, "language_loss": 0.80168104, "learning_rate": 5.394207467264611e-07, "loss": 0.82300717, "num_input_tokens_seen": 275368815, "step": 12770, "time_per_iteration": 5.3567235469818115 }, { "auxiliary_loss_clip": 0.01073219, "auxiliary_loss_mlp": 0.01038788, "balance_loss_clip": 1.03567314, "balance_loss_mlp": 1.02632451, "epoch": 0.7678340598226364, "flos": 34455497944320.0, "grad_norm": 1.6213929898270116, "language_loss": 0.78927696, "learning_rate": 5.391547184794245e-07, "loss": 0.81039715, "num_input_tokens_seen": 275389345, "step": 12771, "time_per_iteration": 4.329530954360962 }, { "auxiliary_loss_clip": 0.01110874, "auxiliary_loss_mlp": 0.01033957, "balance_loss_clip": 1.03865027, "balance_loss_mlp": 1.02205408, "epoch": 0.7678941830753043, "flos": 23841740903040.0, "grad_norm": 1.3882460901064075, "language_loss": 0.68299866, "learning_rate": 5.388887456277876e-07, "loss": 0.70444703, "num_input_tokens_seen": 275411240, "step": 12772, "time_per_iteration": 2.6789863109588623 }, { "auxiliary_loss_clip": 0.01095405, "auxiliary_loss_mlp": 0.01027019, "balance_loss_clip": 1.03676343, "balance_loss_mlp": 1.01512742, "epoch": 0.7679543063279723, "flos": 25410929431680.0, "grad_norm": 1.5084750243321292, "language_loss": 0.73452669, "learning_rate": 5.386228281816349e-07, "loss": 0.75575089, "num_input_tokens_seen": 275432010, "step": 12773, "time_per_iteration": 2.6992523670196533 }, { "auxiliary_loss_clip": 0.01069552, "auxiliary_loss_mlp": 0.01031097, "balance_loss_clip": 1.03272963, "balance_loss_mlp": 1.0193727, "epoch": 0.7680144295806404, "flos": 27962382257280.0, "grad_norm": 1.681002895076516, "language_loss": 0.81144333, "learning_rate": 5.383569661510512e-07, "loss": 0.83244979, "num_input_tokens_seen": 275453710, "step": 12774, "time_per_iteration": 2.8317103385925293 }, { "auxiliary_loss_clip": 0.01102442, "auxiliary_loss_mlp": 0.00769635, "balance_loss_clip": 1.04086018, "balance_loss_mlp": 1.00017095, "epoch": 0.7680745528333083, "flos": 20412810731520.0, "grad_norm": 1.7406217670940616, "language_loss": 0.69881612, "learning_rate": 5.380911595461177e-07, "loss": 0.71753687, "num_input_tokens_seen": 275472915, "step": 12775, "time_per_iteration": 2.6908600330352783 }, { "auxiliary_loss_clip": 0.00994458, "auxiliary_loss_mlp": 0.01000081, "balance_loss_clip": 1.01208818, "balance_loss_mlp": 0.99908555, "epoch": 0.7681346760859763, "flos": 68401103351040.0, "grad_norm": 0.7006055087346096, "language_loss": 0.5683471, "learning_rate": 5.378254083769147e-07, "loss": 0.58829248, "num_input_tokens_seen": 275534785, "step": 12776, "time_per_iteration": 4.903045415878296 }, { "auxiliary_loss_clip": 0.01097484, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.03686929, "balance_loss_mlp": 1.02621067, "epoch": 0.7681947993386442, "flos": 21251468453760.0, "grad_norm": 1.9522911810284198, "language_loss": 0.73814118, "learning_rate": 5.375597126535188e-07, "loss": 0.75950396, "num_input_tokens_seen": 275553205, "step": 12777, "time_per_iteration": 2.6122212409973145 }, { "auxiliary_loss_clip": 0.01086003, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.04298782, "balance_loss_mlp": 1.02055573, "epoch": 0.7682549225913122, "flos": 21397696721280.0, "grad_norm": 2.745693545308853, "language_loss": 0.70324051, "learning_rate": 5.372940723860043e-07, "loss": 0.72442418, "num_input_tokens_seen": 275571490, "step": 12778, "time_per_iteration": 2.67712664604187 }, { "auxiliary_loss_clip": 0.01097946, "auxiliary_loss_mlp": 0.01036667, "balance_loss_clip": 1.0395422, "balance_loss_mlp": 1.02473378, "epoch": 0.7683150458439801, "flos": 23038921975680.0, "grad_norm": 1.741525859100896, "language_loss": 0.70140779, "learning_rate": 5.37028487584446e-07, "loss": 0.72275388, "num_input_tokens_seen": 275589665, "step": 12779, "time_per_iteration": 2.699604034423828 }, { "auxiliary_loss_clip": 0.01086473, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.03794789, "balance_loss_mlp": 1.01829696, "epoch": 0.7683751690966482, "flos": 67332397996800.0, "grad_norm": 9.13576096667177, "language_loss": 0.58861399, "learning_rate": 5.367629582589133e-07, "loss": 0.60978961, "num_input_tokens_seen": 275615605, "step": 12780, "time_per_iteration": 3.0669844150543213 }, { "auxiliary_loss_clip": 0.01104147, "auxiliary_loss_mlp": 0.01037402, "balance_loss_clip": 1.03906894, "balance_loss_mlp": 1.02291799, "epoch": 0.7684352923493161, "flos": 21798890703360.0, "grad_norm": 1.8034337516792285, "language_loss": 0.67968678, "learning_rate": 5.364974844194759e-07, "loss": 0.70110226, "num_input_tokens_seen": 275634965, "step": 12781, "time_per_iteration": 2.651834726333618 }, { "auxiliary_loss_clip": 0.01060523, "auxiliary_loss_mlp": 0.01036749, "balance_loss_clip": 1.03551328, "balance_loss_mlp": 1.02461362, "epoch": 0.7684954156019841, "flos": 25847603072640.0, "grad_norm": 1.4376609198163834, "language_loss": 0.79309833, "learning_rate": 5.362320660762016e-07, "loss": 0.81407106, "num_input_tokens_seen": 275655785, "step": 12782, "time_per_iteration": 2.847486972808838 }, { "auxiliary_loss_clip": 0.01082383, "auxiliary_loss_mlp": 0.0103233, "balance_loss_clip": 1.03683078, "balance_loss_mlp": 1.01938355, "epoch": 0.768555538854652, "flos": 25447378757760.0, "grad_norm": 1.7564402643439623, "language_loss": 0.67005706, "learning_rate": 5.35966703239153e-07, "loss": 0.69120419, "num_input_tokens_seen": 275676160, "step": 12783, "time_per_iteration": 2.703382730484009 }, { "auxiliary_loss_clip": 0.01090024, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.03791714, "balance_loss_mlp": 1.01942182, "epoch": 0.76861566210732, "flos": 19646369303040.0, "grad_norm": 1.6469852773745217, "language_loss": 0.69382596, "learning_rate": 5.357013959183938e-07, "loss": 0.71505415, "num_input_tokens_seen": 275695660, "step": 12784, "time_per_iteration": 2.704110860824585 }, { "auxiliary_loss_clip": 0.01069442, "auxiliary_loss_mlp": 0.01027242, "balance_loss_clip": 1.03885603, "balance_loss_mlp": 1.01570261, "epoch": 0.7686757853599879, "flos": 22419032037120.0, "grad_norm": 1.8804976619771494, "language_loss": 0.80312717, "learning_rate": 5.354361441239843e-07, "loss": 0.824094, "num_input_tokens_seen": 275714025, "step": 12785, "time_per_iteration": 2.7998046875 }, { "auxiliary_loss_clip": 0.0109676, "auxiliary_loss_mlp": 0.01038542, "balance_loss_clip": 1.03655457, "balance_loss_mlp": 1.02337885, "epoch": 0.768735908612656, "flos": 47774262453120.0, "grad_norm": 1.5387616772885826, "language_loss": 0.77432472, "learning_rate": 5.351709478659836e-07, "loss": 0.79567772, "num_input_tokens_seen": 275737300, "step": 12786, "time_per_iteration": 2.8903398513793945 }, { "auxiliary_loss_clip": 0.01110354, "auxiliary_loss_mlp": 0.01035321, "balance_loss_clip": 1.03830373, "balance_loss_mlp": 1.02295876, "epoch": 0.7687960318653239, "flos": 30263179000320.0, "grad_norm": 1.918052748759356, "language_loss": 0.58398765, "learning_rate": 5.349058071544468e-07, "loss": 0.60544437, "num_input_tokens_seen": 275757895, "step": 12787, "time_per_iteration": 2.699540376663208 }, { "auxiliary_loss_clip": 0.01082553, "auxiliary_loss_mlp": 0.01032316, "balance_loss_clip": 1.03361166, "balance_loss_mlp": 1.01962042, "epoch": 0.7688561551179919, "flos": 19573434737280.0, "grad_norm": 1.5809067798231773, "language_loss": 0.76156747, "learning_rate": 5.346407219994292e-07, "loss": 0.78271621, "num_input_tokens_seen": 275776745, "step": 12788, "time_per_iteration": 2.81557559967041 }, { "auxiliary_loss_clip": 0.01071579, "auxiliary_loss_mlp": 0.00770364, "balance_loss_clip": 1.03880525, "balance_loss_mlp": 1.00020683, "epoch": 0.7689162783706599, "flos": 22783776693120.0, "grad_norm": 1.957956891358716, "language_loss": 0.66906554, "learning_rate": 5.343756924109821e-07, "loss": 0.68748498, "num_input_tokens_seen": 275797205, "step": 12789, "time_per_iteration": 2.8146092891693115 }, { "auxiliary_loss_clip": 0.01090409, "auxiliary_loss_mlp": 0.0103625, "balance_loss_clip": 1.03680754, "balance_loss_mlp": 1.02214777, "epoch": 0.7689764016233278, "flos": 34204195416960.0, "grad_norm": 1.6643565512884475, "language_loss": 0.68623877, "learning_rate": 5.341107183991553e-07, "loss": 0.70750535, "num_input_tokens_seen": 275817935, "step": 12790, "time_per_iteration": 2.812708854675293 }, { "auxiliary_loss_clip": 0.0108634, "auxiliary_loss_mlp": 0.01032838, "balance_loss_clip": 1.03740978, "balance_loss_mlp": 1.01972485, "epoch": 0.7690365248759958, "flos": 17274469587840.0, "grad_norm": 1.474038307182623, "language_loss": 0.68689752, "learning_rate": 5.338457999739969e-07, "loss": 0.70808923, "num_input_tokens_seen": 275837145, "step": 12791, "time_per_iteration": 2.7558822631835938 }, { "auxiliary_loss_clip": 0.01097751, "auxiliary_loss_mlp": 0.01036178, "balance_loss_clip": 1.038535, "balance_loss_mlp": 1.0244422, "epoch": 0.7690966481286637, "flos": 18223157646720.0, "grad_norm": 2.037350378754986, "language_loss": 0.79861724, "learning_rate": 5.335809371455526e-07, "loss": 0.81995654, "num_input_tokens_seen": 275855705, "step": 12792, "time_per_iteration": 2.6373798847198486 }, { "auxiliary_loss_clip": 0.01086002, "auxiliary_loss_mlp": 0.00771512, "balance_loss_clip": 1.04310513, "balance_loss_mlp": 1.0003171, "epoch": 0.7691567713813318, "flos": 21537568281600.0, "grad_norm": 1.8617627243354054, "language_loss": 0.72776759, "learning_rate": 5.333161299238673e-07, "loss": 0.74634272, "num_input_tokens_seen": 275873930, "step": 12793, "time_per_iteration": 2.8017160892486572 }, { "auxiliary_loss_clip": 0.01074333, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.03909159, "balance_loss_mlp": 1.02368283, "epoch": 0.7692168946339997, "flos": 39379999720320.0, "grad_norm": 1.9633300130492255, "language_loss": 0.63842422, "learning_rate": 5.330513783189803e-07, "loss": 0.65953475, "num_input_tokens_seen": 275895895, "step": 12794, "time_per_iteration": 2.8763763904571533 }, { "auxiliary_loss_clip": 0.01088067, "auxiliary_loss_mlp": 0.01038769, "balance_loss_clip": 1.03724957, "balance_loss_mlp": 1.02609682, "epoch": 0.7692770178866677, "flos": 25009950931200.0, "grad_norm": 1.537212991597864, "language_loss": 0.76528752, "learning_rate": 5.327866823409319e-07, "loss": 0.78655589, "num_input_tokens_seen": 275917825, "step": 12795, "time_per_iteration": 2.7116506099700928 }, { "auxiliary_loss_clip": 0.01075556, "auxiliary_loss_mlp": 0.01025881, "balance_loss_clip": 1.03575516, "balance_loss_mlp": 1.01325679, "epoch": 0.7693371411393356, "flos": 24716273333760.0, "grad_norm": 1.8098665948309556, "language_loss": 0.71871811, "learning_rate": 5.325220419997601e-07, "loss": 0.7397325, "num_input_tokens_seen": 275937890, "step": 12796, "time_per_iteration": 2.770573139190674 }, { "auxiliary_loss_clip": 0.01110769, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 1.03795838, "balance_loss_mlp": 1.01753139, "epoch": 0.7693972643920036, "flos": 15924803028480.0, "grad_norm": 1.8945883944315456, "language_loss": 0.64692825, "learning_rate": 5.32257457305499e-07, "loss": 0.66833782, "num_input_tokens_seen": 275954495, "step": 12797, "time_per_iteration": 2.597770929336548 }, { "auxiliary_loss_clip": 0.01074194, "auxiliary_loss_mlp": 0.01036918, "balance_loss_clip": 1.03441215, "balance_loss_mlp": 1.02261305, "epoch": 0.7694573876446715, "flos": 25405901527680.0, "grad_norm": 2.104503388065538, "language_loss": 0.91503501, "learning_rate": 5.319929282681823e-07, "loss": 0.93614614, "num_input_tokens_seen": 275972395, "step": 12798, "time_per_iteration": 2.7857353687286377 }, { "auxiliary_loss_clip": 0.01061452, "auxiliary_loss_mlp": 0.01027349, "balance_loss_clip": 1.03667367, "balance_loss_mlp": 1.01509404, "epoch": 0.7695175108973396, "flos": 16654220513280.0, "grad_norm": 1.8305644604969793, "language_loss": 0.82303166, "learning_rate": 5.317284548978418e-07, "loss": 0.84391975, "num_input_tokens_seen": 275989020, "step": 12799, "time_per_iteration": 2.7627201080322266 }, { "auxiliary_loss_clip": 0.01057867, "auxiliary_loss_mlp": 0.0102915, "balance_loss_clip": 1.03739285, "balance_loss_mlp": 1.01601338, "epoch": 0.7695776341500075, "flos": 13626520237440.0, "grad_norm": 1.9375837310730932, "language_loss": 0.7841835, "learning_rate": 5.314640372045045e-07, "loss": 0.80505365, "num_input_tokens_seen": 276006525, "step": 12800, "time_per_iteration": 2.860802173614502 }, { "auxiliary_loss_clip": 0.01094192, "auxiliary_loss_mlp": 0.01029605, "balance_loss_clip": 1.03736687, "balance_loss_mlp": 1.01572347, "epoch": 0.7696377574026755, "flos": 24276690691200.0, "grad_norm": 1.6551183463192032, "language_loss": 0.83884531, "learning_rate": 5.31199675198198e-07, "loss": 0.86008328, "num_input_tokens_seen": 276027130, "step": 12801, "time_per_iteration": 2.8100953102111816 }, { "auxiliary_loss_clip": 0.0108893, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.03665733, "balance_loss_mlp": 1.01778448, "epoch": 0.7696978806553435, "flos": 20923137210240.0, "grad_norm": 2.4183621963241357, "language_loss": 0.72267437, "learning_rate": 5.30935368888947e-07, "loss": 0.74387032, "num_input_tokens_seen": 276045715, "step": 12802, "time_per_iteration": 2.716482639312744 }, { "auxiliary_loss_clip": 0.0108354, "auxiliary_loss_mlp": 0.01034672, "balance_loss_clip": 1.0340662, "balance_loss_mlp": 1.022048, "epoch": 0.7697580039080114, "flos": 22929609911040.0, "grad_norm": 1.7224396030439215, "language_loss": 0.75905406, "learning_rate": 5.306711182867747e-07, "loss": 0.78023618, "num_input_tokens_seen": 276065375, "step": 12803, "time_per_iteration": 2.7502260208129883 }, { "auxiliary_loss_clip": 0.01018092, "auxiliary_loss_mlp": 0.01000358, "balance_loss_clip": 1.01451325, "balance_loss_mlp": 0.99920207, "epoch": 0.7698181271606794, "flos": 68717654933760.0, "grad_norm": 0.7330583208910887, "language_loss": 0.55806667, "learning_rate": 5.304069234017001e-07, "loss": 0.57825118, "num_input_tokens_seen": 276131405, "step": 12804, "time_per_iteration": 3.3005380630493164 }, { "auxiliary_loss_clip": 0.0101265, "auxiliary_loss_mlp": 0.01002009, "balance_loss_clip": 1.00900471, "balance_loss_mlp": 1.00096023, "epoch": 0.7698782504133473, "flos": 67409716999680.0, "grad_norm": 0.7614116720269231, "language_loss": 0.54004955, "learning_rate": 5.301427842437429e-07, "loss": 0.56019616, "num_input_tokens_seen": 276200755, "step": 12805, "time_per_iteration": 3.3900198936462402 }, { "auxiliary_loss_clip": 0.0108001, "auxiliary_loss_mlp": 0.01033208, "balance_loss_clip": 1.03882051, "balance_loss_mlp": 1.02053022, "epoch": 0.7699383736660154, "flos": 22488842119680.0, "grad_norm": 1.986233467865104, "language_loss": 0.73035413, "learning_rate": 5.298787008229187e-07, "loss": 0.7514863, "num_input_tokens_seen": 276217880, "step": 12806, "time_per_iteration": 2.7341980934143066 }, { "auxiliary_loss_clip": 0.01086866, "auxiliary_loss_mlp": 0.01035537, "balance_loss_clip": 1.03594339, "balance_loss_mlp": 1.02238786, "epoch": 0.7699984969186833, "flos": 21539723097600.0, "grad_norm": 2.048367090429927, "language_loss": 0.75222588, "learning_rate": 5.296146731492408e-07, "loss": 0.7734499, "num_input_tokens_seen": 276234810, "step": 12807, "time_per_iteration": 2.724539041519165 }, { "auxiliary_loss_clip": 0.01106456, "auxiliary_loss_mlp": 0.01031388, "balance_loss_clip": 1.04034483, "balance_loss_mlp": 1.01792347, "epoch": 0.7700586201713513, "flos": 21719096640000.0, "grad_norm": 2.054947719033548, "language_loss": 0.80061448, "learning_rate": 5.293507012327218e-07, "loss": 0.82199287, "num_input_tokens_seen": 276252850, "step": 12808, "time_per_iteration": 4.215209722518921 }, { "auxiliary_loss_clip": 0.01105023, "auxiliary_loss_mlp": 0.01039739, "balance_loss_clip": 1.03983986, "balance_loss_mlp": 1.02620244, "epoch": 0.7701187434240192, "flos": 27856015107840.0, "grad_norm": 2.2828692902230743, "language_loss": 0.79191184, "learning_rate": 5.290867850833718e-07, "loss": 0.8133595, "num_input_tokens_seen": 276272525, "step": 12809, "time_per_iteration": 4.67883825302124 }, { "auxiliary_loss_clip": 0.01075128, "auxiliary_loss_mlp": 0.01026317, "balance_loss_clip": 1.03558159, "balance_loss_mlp": 1.014974, "epoch": 0.7701788666766872, "flos": 28621307301120.0, "grad_norm": 1.7126957543660224, "language_loss": 0.70423043, "learning_rate": 5.288229247111993e-07, "loss": 0.72524494, "num_input_tokens_seen": 276294210, "step": 12810, "time_per_iteration": 4.299976110458374 }, { "auxiliary_loss_clip": 0.0108663, "auxiliary_loss_mlp": 0.01043871, "balance_loss_clip": 1.03548312, "balance_loss_mlp": 1.02746737, "epoch": 0.7702389899293551, "flos": 14246446089600.0, "grad_norm": 2.84512278280032, "language_loss": 0.77875537, "learning_rate": 5.285591201262079e-07, "loss": 0.80006033, "num_input_tokens_seen": 276310290, "step": 12811, "time_per_iteration": 2.792184352874756 }, { "auxiliary_loss_clip": 0.01001395, "auxiliary_loss_mlp": 0.01001171, "balance_loss_clip": 1.00706363, "balance_loss_mlp": 0.99988317, "epoch": 0.7702991131820232, "flos": 70574128439040.0, "grad_norm": 0.8151907995721069, "language_loss": 0.56650817, "learning_rate": 5.28295371338402e-07, "loss": 0.5865339, "num_input_tokens_seen": 276371715, "step": 12812, "time_per_iteration": 3.301762819290161 }, { "auxiliary_loss_clip": 0.01073584, "auxiliary_loss_mlp": 0.01035613, "balance_loss_clip": 1.03664494, "balance_loss_mlp": 1.02299511, "epoch": 0.7703592364346911, "flos": 25480021242240.0, "grad_norm": 3.4768581734180453, "language_loss": 0.72098076, "learning_rate": 5.280316783577836e-07, "loss": 0.74207264, "num_input_tokens_seen": 276389895, "step": 12813, "time_per_iteration": 2.8251900672912598 }, { "auxiliary_loss_clip": 0.0110181, "auxiliary_loss_mlp": 0.01030481, "balance_loss_clip": 1.03734303, "balance_loss_mlp": 1.01664054, "epoch": 0.7704193596873591, "flos": 19280906375040.0, "grad_norm": 2.0063403023078297, "language_loss": 0.66324687, "learning_rate": 5.27768041194351e-07, "loss": 0.68456984, "num_input_tokens_seen": 276408990, "step": 12814, "time_per_iteration": 2.7897889614105225 }, { "auxiliary_loss_clip": 0.01089036, "auxiliary_loss_mlp": 0.01038036, "balance_loss_clip": 1.03707969, "balance_loss_mlp": 1.02553058, "epoch": 0.7704794829400271, "flos": 23658452778240.0, "grad_norm": 1.8618896845056536, "language_loss": 0.65574408, "learning_rate": 5.275044598581018e-07, "loss": 0.67701477, "num_input_tokens_seen": 276428190, "step": 12815, "time_per_iteration": 2.745948314666748 }, { "auxiliary_loss_clip": 0.0109967, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.03795624, "balance_loss_mlp": 1.02119207, "epoch": 0.770539606192695, "flos": 18989311766400.0, "grad_norm": 3.9090080450756703, "language_loss": 0.65051812, "learning_rate": 5.272409343590322e-07, "loss": 0.67185891, "num_input_tokens_seen": 276446855, "step": 12816, "time_per_iteration": 4.193779230117798 }, { "auxiliary_loss_clip": 0.01102885, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.03968191, "balance_loss_mlp": 1.02194536, "epoch": 0.770599729445363, "flos": 11830160142720.0, "grad_norm": 2.3027657135701496, "language_loss": 0.71589029, "learning_rate": 5.26977464707133e-07, "loss": 0.73726916, "num_input_tokens_seen": 276462000, "step": 12817, "time_per_iteration": 2.701976776123047 }, { "auxiliary_loss_clip": 0.01067462, "auxiliary_loss_mlp": 0.01031755, "balance_loss_clip": 1.03671288, "balance_loss_mlp": 1.01967907, "epoch": 0.770659852698031, "flos": 17822610109440.0, "grad_norm": 2.117205920773346, "language_loss": 0.61316186, "learning_rate": 5.267140509123957e-07, "loss": 0.63415402, "num_input_tokens_seen": 276481190, "step": 12818, "time_per_iteration": 2.894584894180298 }, { "auxiliary_loss_clip": 0.01098817, "auxiliary_loss_mlp": 0.01029481, "balance_loss_clip": 1.03884339, "balance_loss_mlp": 1.01770937, "epoch": 0.770719975950699, "flos": 21871968923520.0, "grad_norm": 1.8092629622591248, "language_loss": 0.67272353, "learning_rate": 5.264506929848093e-07, "loss": 0.69400644, "num_input_tokens_seen": 276499520, "step": 12819, "time_per_iteration": 2.6729207038879395 }, { "auxiliary_loss_clip": 0.01114198, "auxiliary_loss_mlp": 0.01031273, "balance_loss_clip": 1.04036117, "balance_loss_mlp": 1.0183568, "epoch": 0.7707800992033669, "flos": 21325049464320.0, "grad_norm": 3.8495844407525786, "language_loss": 0.57512546, "learning_rate": 5.261873909343608e-07, "loss": 0.59658015, "num_input_tokens_seen": 276519110, "step": 12820, "time_per_iteration": 2.6065587997436523 }, { "auxiliary_loss_clip": 0.01082909, "auxiliary_loss_mlp": 0.01030006, "balance_loss_clip": 1.037233, "balance_loss_mlp": 1.01698244, "epoch": 0.7708402224560349, "flos": 28179426188160.0, "grad_norm": 2.6946227990391742, "language_loss": 0.80718732, "learning_rate": 5.259241447710343e-07, "loss": 0.82831645, "num_input_tokens_seen": 276538805, "step": 12821, "time_per_iteration": 2.7545745372772217 }, { "auxiliary_loss_clip": 0.01113447, "auxiliary_loss_mlp": 0.01036131, "balance_loss_clip": 1.04009652, "balance_loss_mlp": 1.02311945, "epoch": 0.7709003457087028, "flos": 15377057556480.0, "grad_norm": 3.179311365273749, "language_loss": 0.68571889, "learning_rate": 5.256609545048114e-07, "loss": 0.70721459, "num_input_tokens_seen": 276554770, "step": 12822, "time_per_iteration": 2.6314475536346436 }, { "auxiliary_loss_clip": 0.0108847, "auxiliary_loss_mlp": 0.01036733, "balance_loss_clip": 1.03697228, "balance_loss_mlp": 1.02384686, "epoch": 0.7709604689613708, "flos": 30621854257920.0, "grad_norm": 1.8530631240007662, "language_loss": 0.72300768, "learning_rate": 5.253978201456733e-07, "loss": 0.74425972, "num_input_tokens_seen": 276574535, "step": 12823, "time_per_iteration": 2.7124979496002197 }, { "auxiliary_loss_clip": 0.01107629, "auxiliary_loss_mlp": 0.01039791, "balance_loss_clip": 1.04024911, "balance_loss_mlp": 1.02459168, "epoch": 0.7710205922140387, "flos": 20301272023680.0, "grad_norm": 1.7759548619058283, "language_loss": 0.76394266, "learning_rate": 5.251347417035969e-07, "loss": 0.78541684, "num_input_tokens_seen": 276592925, "step": 12824, "time_per_iteration": 2.7012369632720947 }, { "auxiliary_loss_clip": 0.0108641, "auxiliary_loss_mlp": 0.01031747, "balance_loss_clip": 1.0379014, "balance_loss_mlp": 1.01897967, "epoch": 0.7710807154667068, "flos": 19644214487040.0, "grad_norm": 2.5594814083345856, "language_loss": 0.72377741, "learning_rate": 5.248717191885592e-07, "loss": 0.744959, "num_input_tokens_seen": 276610540, "step": 12825, "time_per_iteration": 2.711148977279663 }, { "auxiliary_loss_clip": 0.0110825, "auxiliary_loss_mlp": 0.01037397, "balance_loss_clip": 1.03889346, "balance_loss_mlp": 1.02650094, "epoch": 0.7711408387193747, "flos": 20006337450240.0, "grad_norm": 1.6443549229743277, "language_loss": 0.73782164, "learning_rate": 5.246087526105343e-07, "loss": 0.75927812, "num_input_tokens_seen": 276629200, "step": 12826, "time_per_iteration": 2.6268928050994873 }, { "auxiliary_loss_clip": 0.01112855, "auxiliary_loss_mlp": 0.01035754, "balance_loss_clip": 1.03778219, "balance_loss_mlp": 1.02234912, "epoch": 0.7712009619720427, "flos": 24971131307520.0, "grad_norm": 1.6817186914054845, "language_loss": 0.81052697, "learning_rate": 5.243458419794933e-07, "loss": 0.83201313, "num_input_tokens_seen": 276648655, "step": 12827, "time_per_iteration": 2.6236133575439453 }, { "auxiliary_loss_clip": 0.01030504, "auxiliary_loss_mlp": 0.01001401, "balance_loss_clip": 1.0079608, "balance_loss_mlp": 1.0003643, "epoch": 0.7712610852247107, "flos": 63249681404160.0, "grad_norm": 0.8667997379462846, "language_loss": 0.55184829, "learning_rate": 5.240829873054051e-07, "loss": 0.57216728, "num_input_tokens_seen": 276716500, "step": 12828, "time_per_iteration": 3.314025640487671 }, { "auxiliary_loss_clip": 0.01062789, "auxiliary_loss_mlp": 0.01034135, "balance_loss_clip": 1.03295088, "balance_loss_mlp": 1.02165389, "epoch": 0.7713212084773786, "flos": 18697860812160.0, "grad_norm": 1.7251497465168657, "language_loss": 0.6980052, "learning_rate": 5.23820188598238e-07, "loss": 0.71897441, "num_input_tokens_seen": 276733535, "step": 12829, "time_per_iteration": 2.7099921703338623 }, { "auxiliary_loss_clip": 0.01085241, "auxiliary_loss_mlp": 0.01036187, "balance_loss_clip": 1.04121757, "balance_loss_mlp": 1.02271688, "epoch": 0.7713813317300466, "flos": 14173367869440.0, "grad_norm": 2.8210703511982, "language_loss": 0.79999912, "learning_rate": 5.235574458679579e-07, "loss": 0.82121342, "num_input_tokens_seen": 276749575, "step": 12830, "time_per_iteration": 2.754983901977539 }, { "auxiliary_loss_clip": 0.01104042, "auxiliary_loss_mlp": 0.01037065, "balance_loss_clip": 1.03856182, "balance_loss_mlp": 1.02329099, "epoch": 0.7714414549827145, "flos": 25703960584320.0, "grad_norm": 1.661801317561211, "language_loss": 0.77825183, "learning_rate": 5.232947591245269e-07, "loss": 0.79966295, "num_input_tokens_seen": 276769460, "step": 12831, "time_per_iteration": 2.7142996788024902 }, { "auxiliary_loss_clip": 0.01078302, "auxiliary_loss_mlp": 0.01036061, "balance_loss_clip": 1.0332458, "balance_loss_mlp": 1.02210712, "epoch": 0.7715015782353826, "flos": 30555312312960.0, "grad_norm": 1.5151652331679557, "language_loss": 0.6105473, "learning_rate": 5.230321283779071e-07, "loss": 0.63169092, "num_input_tokens_seen": 276790820, "step": 12832, "time_per_iteration": 2.717639684677124 }, { "auxiliary_loss_clip": 0.01085655, "auxiliary_loss_mlp": 0.01039371, "balance_loss_clip": 1.03684115, "balance_loss_mlp": 1.02620983, "epoch": 0.7715617014880505, "flos": 20229343038720.0, "grad_norm": 1.801135841177815, "language_loss": 0.79230422, "learning_rate": 5.227695536380572e-07, "loss": 0.81355441, "num_input_tokens_seen": 276811345, "step": 12833, "time_per_iteration": 2.7320380210876465 }, { "auxiliary_loss_clip": 0.00988976, "auxiliary_loss_mlp": 0.01003321, "balance_loss_clip": 1.00962079, "balance_loss_mlp": 1.00185442, "epoch": 0.7716218247407185, "flos": 63664770971520.0, "grad_norm": 0.8509203865481852, "language_loss": 0.55384171, "learning_rate": 5.22507034914933e-07, "loss": 0.57376468, "num_input_tokens_seen": 276870950, "step": 12834, "time_per_iteration": 3.2906105518341064 }, { "auxiliary_loss_clip": 0.01065317, "auxiliary_loss_mlp": 0.01033528, "balance_loss_clip": 1.03433681, "balance_loss_mlp": 1.019449, "epoch": 0.7716819479933864, "flos": 19791807471360.0, "grad_norm": 2.0007244746658905, "language_loss": 0.72596645, "learning_rate": 5.222445722184903e-07, "loss": 0.74695486, "num_input_tokens_seen": 276890760, "step": 12835, "time_per_iteration": 2.789001941680908 }, { "auxiliary_loss_clip": 0.01078061, "auxiliary_loss_mlp": 0.00771412, "balance_loss_clip": 1.03562582, "balance_loss_mlp": 1.00025511, "epoch": 0.7717420712460544, "flos": 18442176825600.0, "grad_norm": 1.8060607168740586, "language_loss": 0.70171171, "learning_rate": 5.219821655586814e-07, "loss": 0.72020638, "num_input_tokens_seen": 276909625, "step": 12836, "time_per_iteration": 2.728555917739868 }, { "auxiliary_loss_clip": 0.01087588, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.03710699, "balance_loss_mlp": 1.01896143, "epoch": 0.7718021944987223, "flos": 35189476456320.0, "grad_norm": 1.7672669175991982, "language_loss": 0.59498906, "learning_rate": 5.217198149454575e-07, "loss": 0.61617988, "num_input_tokens_seen": 276930760, "step": 12837, "time_per_iteration": 2.771662712097168 }, { "auxiliary_loss_clip": 0.01019463, "auxiliary_loss_mlp": 0.01007255, "balance_loss_clip": 1.0126214, "balance_loss_mlp": 1.00599122, "epoch": 0.7718623177513904, "flos": 67923167961600.0, "grad_norm": 0.860607802199013, "language_loss": 0.55781054, "learning_rate": 5.214575203887666e-07, "loss": 0.57807767, "num_input_tokens_seen": 276989580, "step": 12838, "time_per_iteration": 3.17033052444458 }, { "auxiliary_loss_clip": 0.0110038, "auxiliary_loss_mlp": 0.01028077, "balance_loss_clip": 1.03804731, "balance_loss_mlp": 1.01625776, "epoch": 0.7719224410040583, "flos": 18581401941120.0, "grad_norm": 2.316418806228274, "language_loss": 0.69647658, "learning_rate": 5.211952818985538e-07, "loss": 0.71776116, "num_input_tokens_seen": 277005450, "step": 12839, "time_per_iteration": 2.645826578140259 }, { "auxiliary_loss_clip": 0.01099944, "auxiliary_loss_mlp": 0.01027637, "balance_loss_clip": 1.03894663, "balance_loss_mlp": 1.01572192, "epoch": 0.7719825642567263, "flos": 23075802264960.0, "grad_norm": 1.8115476435749553, "language_loss": 0.79911268, "learning_rate": 5.209330994847647e-07, "loss": 0.8203885, "num_input_tokens_seen": 277023055, "step": 12840, "time_per_iteration": 2.706791400909424 }, { "auxiliary_loss_clip": 0.0110078, "auxiliary_loss_mlp": 0.00770822, "balance_loss_clip": 1.03851485, "balance_loss_mlp": 1.00014949, "epoch": 0.7720426875093943, "flos": 20339086066560.0, "grad_norm": 2.545853908868313, "language_loss": 0.80008757, "learning_rate": 5.206709731573402e-07, "loss": 0.81880367, "num_input_tokens_seen": 277041150, "step": 12841, "time_per_iteration": 2.7192368507385254 }, { "auxiliary_loss_clip": 0.01075766, "auxiliary_loss_mlp": 0.01028708, "balance_loss_clip": 1.03847384, "balance_loss_mlp": 1.01574421, "epoch": 0.7721028107620622, "flos": 23880704181120.0, "grad_norm": 1.5305578365970447, "language_loss": 0.76161742, "learning_rate": 5.204089029262208e-07, "loss": 0.78266215, "num_input_tokens_seen": 277063895, "step": 12842, "time_per_iteration": 2.7325236797332764 }, { "auxiliary_loss_clip": 0.01059079, "auxiliary_loss_mlp": 0.00771703, "balance_loss_clip": 1.03726017, "balance_loss_mlp": 1.0002687, "epoch": 0.7721629340147302, "flos": 26651571235200.0, "grad_norm": 4.828495725379175, "language_loss": 0.68726575, "learning_rate": 5.201468888013445e-07, "loss": 0.70557356, "num_input_tokens_seen": 277084045, "step": 12843, "time_per_iteration": 2.81326961517334 }, { "auxiliary_loss_clip": 0.01088182, "auxiliary_loss_mlp": 0.01032978, "balance_loss_clip": 1.03403521, "balance_loss_mlp": 1.02059186, "epoch": 0.7722230572673981, "flos": 21178857110400.0, "grad_norm": 3.7944489397426286, "language_loss": 0.73675692, "learning_rate": 5.198849307926465e-07, "loss": 0.75796854, "num_input_tokens_seen": 277102625, "step": 12844, "time_per_iteration": 2.660747766494751 }, { "auxiliary_loss_clip": 0.0109532, "auxiliary_loss_mlp": 0.01041057, "balance_loss_clip": 1.03639054, "balance_loss_mlp": 1.02721667, "epoch": 0.7722831805200662, "flos": 27964644814080.0, "grad_norm": 1.567829696052933, "language_loss": 0.71341336, "learning_rate": 5.196230289100596e-07, "loss": 0.73477709, "num_input_tokens_seen": 277123210, "step": 12845, "time_per_iteration": 2.720493793487549 }, { "auxiliary_loss_clip": 0.01109647, "auxiliary_loss_mlp": 0.01032633, "balance_loss_clip": 1.03851032, "balance_loss_mlp": 1.02038407, "epoch": 0.7723433037727341, "flos": 33875576864640.0, "grad_norm": 1.7586648256902582, "language_loss": 0.64064783, "learning_rate": 5.193611831635159e-07, "loss": 0.66207063, "num_input_tokens_seen": 277144895, "step": 12846, "time_per_iteration": 2.7434511184692383 }, { "auxiliary_loss_clip": 0.0102204, "auxiliary_loss_mlp": 0.00751187, "balance_loss_clip": 1.0084672, "balance_loss_mlp": 0.99961835, "epoch": 0.7724034270254021, "flos": 62848271940480.0, "grad_norm": 0.7939383469798397, "language_loss": 0.61696756, "learning_rate": 5.19099393562945e-07, "loss": 0.63469982, "num_input_tokens_seen": 277205160, "step": 12847, "time_per_iteration": 3.1408584117889404 }, { "auxiliary_loss_clip": 0.01109979, "auxiliary_loss_mlp": 0.01027701, "balance_loss_clip": 1.0360781, "balance_loss_mlp": 1.01481414, "epoch": 0.77246355027807, "flos": 23295467888640.0, "grad_norm": 2.7620733076627255, "language_loss": 0.7912066, "learning_rate": 5.188376601182732e-07, "loss": 0.81258333, "num_input_tokens_seen": 277223005, "step": 12848, "time_per_iteration": 5.833041191101074 }, { "auxiliary_loss_clip": 0.01073036, "auxiliary_loss_mlp": 0.01041471, "balance_loss_clip": 1.03511548, "balance_loss_mlp": 1.02746367, "epoch": 0.772523673530738, "flos": 20121287950080.0, "grad_norm": 1.5824412187396433, "language_loss": 0.72673213, "learning_rate": 5.185759828394261e-07, "loss": 0.74787724, "num_input_tokens_seen": 277241785, "step": 12849, "time_per_iteration": 2.7188072204589844 }, { "auxiliary_loss_clip": 0.01110027, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 1.03745866, "balance_loss_mlp": 1.01899564, "epoch": 0.7725837967834059, "flos": 17820096157440.0, "grad_norm": 2.4780134177178166, "language_loss": 0.78607786, "learning_rate": 5.183143617363261e-07, "loss": 0.80749798, "num_input_tokens_seen": 277259050, "step": 12850, "time_per_iteration": 4.190839529037476 }, { "auxiliary_loss_clip": 0.01054122, "auxiliary_loss_mlp": 0.00771579, "balance_loss_clip": 1.03170514, "balance_loss_mlp": 1.00020933, "epoch": 0.772643920036074, "flos": 27198921657600.0, "grad_norm": 1.5628406207285341, "language_loss": 0.80081898, "learning_rate": 5.180527968188935e-07, "loss": 0.819076, "num_input_tokens_seen": 277278235, "step": 12851, "time_per_iteration": 2.8007707595825195 }, { "auxiliary_loss_clip": 0.01097831, "auxiliary_loss_mlp": 0.01027911, "balance_loss_clip": 1.03627992, "balance_loss_mlp": 1.01439285, "epoch": 0.7727040432887419, "flos": 21579512388480.0, "grad_norm": 1.50165866044674, "language_loss": 0.73771137, "learning_rate": 5.177912880970474e-07, "loss": 0.75896883, "num_input_tokens_seen": 277298355, "step": 12852, "time_per_iteration": 2.640066146850586 }, { "auxiliary_loss_clip": 0.01108862, "auxiliary_loss_mlp": 0.01036354, "balance_loss_clip": 1.0370307, "balance_loss_mlp": 1.02388501, "epoch": 0.7727641665414099, "flos": 22236641752320.0, "grad_norm": 1.9047864889104873, "language_loss": 0.82604998, "learning_rate": 5.17529835580704e-07, "loss": 0.84750211, "num_input_tokens_seen": 277316095, "step": 12853, "time_per_iteration": 2.6782071590423584 }, { "auxiliary_loss_clip": 0.01028971, "auxiliary_loss_mlp": 0.01000563, "balance_loss_clip": 1.00643969, "balance_loss_mlp": 0.99953192, "epoch": 0.7728242897940779, "flos": 54832221463680.0, "grad_norm": 0.7951405489665233, "language_loss": 0.54508865, "learning_rate": 5.172684392797786e-07, "loss": 0.56538397, "num_input_tokens_seen": 277380130, "step": 12854, "time_per_iteration": 3.2313177585601807 }, { "auxiliary_loss_clip": 0.01102068, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.03806114, "balance_loss_mlp": 1.01808441, "epoch": 0.7728844130467458, "flos": 34461962392320.0, "grad_norm": 1.5042786507697257, "language_loss": 0.71595842, "learning_rate": 5.170070992041826e-07, "loss": 0.73730195, "num_input_tokens_seen": 277404015, "step": 12855, "time_per_iteration": 4.29422926902771 }, { "auxiliary_loss_clip": 0.01111402, "auxiliary_loss_mlp": 0.01031298, "balance_loss_clip": 1.03859937, "balance_loss_mlp": 1.01755357, "epoch": 0.7729445362994138, "flos": 18916341287040.0, "grad_norm": 1.8322894078322527, "language_loss": 0.68102384, "learning_rate": 5.167458153638254e-07, "loss": 0.70245087, "num_input_tokens_seen": 277421375, "step": 12856, "time_per_iteration": 2.6372880935668945 }, { "auxiliary_loss_clip": 0.010814, "auxiliary_loss_mlp": 0.01035586, "balance_loss_clip": 1.03660607, "balance_loss_mlp": 1.02275896, "epoch": 0.7730046595520818, "flos": 22200048771840.0, "grad_norm": 1.6258522353598035, "language_loss": 0.79057026, "learning_rate": 5.164845877686162e-07, "loss": 0.81174016, "num_input_tokens_seen": 277440170, "step": 12857, "time_per_iteration": 2.796715021133423 }, { "auxiliary_loss_clip": 0.01063249, "auxiliary_loss_mlp": 0.00770001, "balance_loss_clip": 1.04108429, "balance_loss_mlp": 1.00020409, "epoch": 0.7730647828047498, "flos": 13552328695680.0, "grad_norm": 1.8401408925492355, "language_loss": 0.78711581, "learning_rate": 5.162234164284591e-07, "loss": 0.80544829, "num_input_tokens_seen": 277456880, "step": 12858, "time_per_iteration": 2.8125572204589844 }, { "auxiliary_loss_clip": 0.01112062, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.03837538, "balance_loss_mlp": 1.0190742, "epoch": 0.7731249060574177, "flos": 21976037602560.0, "grad_norm": 1.938091007163787, "language_loss": 0.77033961, "learning_rate": 5.159623013532591e-07, "loss": 0.7917791, "num_input_tokens_seen": 277475365, "step": 12859, "time_per_iteration": 2.659550428390503 }, { "auxiliary_loss_clip": 0.0109902, "auxiliary_loss_mlp": 0.01030466, "balance_loss_clip": 1.04030442, "balance_loss_mlp": 1.01920676, "epoch": 0.7731850293100857, "flos": 22601817371520.0, "grad_norm": 1.3916188047238045, "language_loss": 0.67878425, "learning_rate": 5.157012425529186e-07, "loss": 0.7000792, "num_input_tokens_seen": 277494975, "step": 12860, "time_per_iteration": 2.8458962440490723 }, { "auxiliary_loss_clip": 0.01114237, "auxiliary_loss_mlp": 0.01038751, "balance_loss_clip": 1.03815317, "balance_loss_mlp": 1.02510166, "epoch": 0.7732451525627536, "flos": 14098422142080.0, "grad_norm": 2.3344978091609656, "language_loss": 0.74838078, "learning_rate": 5.154402400373343e-07, "loss": 0.76991069, "num_input_tokens_seen": 277510520, "step": 12861, "time_per_iteration": 2.5893940925598145 }, { "auxiliary_loss_clip": 0.01105983, "auxiliary_loss_mlp": 0.01031912, "balance_loss_clip": 1.04054725, "balance_loss_mlp": 1.01798797, "epoch": 0.7733052758154216, "flos": 21470020755840.0, "grad_norm": 2.1487952861807558, "language_loss": 0.74759662, "learning_rate": 5.15179293816405e-07, "loss": 0.7689755, "num_input_tokens_seen": 277530505, "step": 12862, "time_per_iteration": 2.7624194622039795 }, { "auxiliary_loss_clip": 0.01064299, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.03402948, "balance_loss_mlp": 1.02048767, "epoch": 0.7733653990680895, "flos": 21394284929280.0, "grad_norm": 1.5250392948978249, "language_loss": 0.83059877, "learning_rate": 5.149184039000256e-07, "loss": 0.85156441, "num_input_tokens_seen": 277550810, "step": 12863, "time_per_iteration": 2.771484851837158 }, { "auxiliary_loss_clip": 0.01110135, "auxiliary_loss_mlp": 0.01033251, "balance_loss_clip": 1.03735471, "balance_loss_mlp": 1.02050209, "epoch": 0.7734255223207576, "flos": 17676058619520.0, "grad_norm": 1.7056890510124847, "language_loss": 0.73495519, "learning_rate": 5.146575702980898e-07, "loss": 0.75638908, "num_input_tokens_seen": 277567680, "step": 12864, "time_per_iteration": 2.6594743728637695 }, { "auxiliary_loss_clip": 0.01089331, "auxiliary_loss_mlp": 0.01031855, "balance_loss_clip": 1.03545022, "balance_loss_mlp": 1.0199455, "epoch": 0.7734856455734255, "flos": 25230837617280.0, "grad_norm": 1.592544393546876, "language_loss": 0.8264727, "learning_rate": 5.143967930204871e-07, "loss": 0.84768456, "num_input_tokens_seen": 277588970, "step": 12865, "time_per_iteration": 2.7463982105255127 }, { "auxiliary_loss_clip": 0.01116112, "auxiliary_loss_mlp": 0.01033209, "balance_loss_clip": 1.04054976, "balance_loss_mlp": 1.01934528, "epoch": 0.7735457688260935, "flos": 23433112805760.0, "grad_norm": 2.1106851031269365, "language_loss": 0.72128093, "learning_rate": 5.141360720771077e-07, "loss": 0.74277413, "num_input_tokens_seen": 277605450, "step": 12866, "time_per_iteration": 2.574566125869751 }, { "auxiliary_loss_clip": 0.01069034, "auxiliary_loss_mlp": 0.00770892, "balance_loss_clip": 1.03813267, "balance_loss_mlp": 1.00030208, "epoch": 0.7736058920787615, "flos": 18729246320640.0, "grad_norm": 3.2060196397051444, "language_loss": 0.64442635, "learning_rate": 5.138754074778371e-07, "loss": 0.66282552, "num_input_tokens_seen": 277622530, "step": 12867, "time_per_iteration": 2.701490879058838 }, { "auxiliary_loss_clip": 0.01098529, "auxiliary_loss_mlp": 0.01037441, "balance_loss_clip": 1.03714955, "balance_loss_mlp": 1.02506101, "epoch": 0.7736660153314294, "flos": 22893304239360.0, "grad_norm": 1.5193783690331675, "language_loss": 0.71179724, "learning_rate": 5.136147992325595e-07, "loss": 0.73315698, "num_input_tokens_seen": 277642700, "step": 12868, "time_per_iteration": 2.6771240234375 }, { "auxiliary_loss_clip": 0.01105128, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.04049754, "balance_loss_mlp": 1.01892424, "epoch": 0.7737261385840974, "flos": 13800901789440.0, "grad_norm": 2.0821303548121284, "language_loss": 0.77995592, "learning_rate": 5.133542473511578e-07, "loss": 0.80132443, "num_input_tokens_seen": 277660005, "step": 12869, "time_per_iteration": 2.6456408500671387 }, { "auxiliary_loss_clip": 0.01097602, "auxiliary_loss_mlp": 0.01027939, "balance_loss_clip": 1.03767705, "balance_loss_mlp": 1.01517785, "epoch": 0.7737862618367654, "flos": 28730727106560.0, "grad_norm": 1.7351593875890767, "language_loss": 0.73740292, "learning_rate": 5.130937518435124e-07, "loss": 0.75865841, "num_input_tokens_seen": 277682890, "step": 12870, "time_per_iteration": 2.670896530151367 }, { "auxiliary_loss_clip": 0.01102985, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.03815126, "balance_loss_mlp": 1.01947141, "epoch": 0.7738463850894334, "flos": 17018570119680.0, "grad_norm": 1.9332947968793013, "language_loss": 0.76220596, "learning_rate": 5.12833312719501e-07, "loss": 0.78355992, "num_input_tokens_seen": 277699330, "step": 12871, "time_per_iteration": 2.5998897552490234 }, { "auxiliary_loss_clip": 0.0108707, "auxiliary_loss_mlp": 0.01035167, "balance_loss_clip": 1.03574061, "balance_loss_mlp": 1.02281117, "epoch": 0.7739065083421013, "flos": 20704010290560.0, "grad_norm": 2.0007285261409407, "language_loss": 0.69219184, "learning_rate": 5.12572929988999e-07, "loss": 0.71341425, "num_input_tokens_seen": 277718750, "step": 12872, "time_per_iteration": 2.673105478286743 }, { "auxiliary_loss_clip": 0.01111983, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.03831863, "balance_loss_mlp": 1.01781273, "epoch": 0.7739666315947693, "flos": 20697222620160.0, "grad_norm": 2.4536948806528502, "language_loss": 0.85142237, "learning_rate": 5.123126036618804e-07, "loss": 0.8728596, "num_input_tokens_seen": 277734645, "step": 12873, "time_per_iteration": 2.590299606323242 }, { "auxiliary_loss_clip": 0.01115241, "auxiliary_loss_mlp": 0.01037294, "balance_loss_clip": 1.04048181, "balance_loss_mlp": 1.02497935, "epoch": 0.7740267548474372, "flos": 29570677718400.0, "grad_norm": 2.480997222503817, "language_loss": 0.65266359, "learning_rate": 5.120523337480174e-07, "loss": 0.67418897, "num_input_tokens_seen": 277755535, "step": 12874, "time_per_iteration": 2.6324357986450195 }, { "auxiliary_loss_clip": 0.01072577, "auxiliary_loss_mlp": 0.01031243, "balance_loss_clip": 1.0420754, "balance_loss_mlp": 1.01826084, "epoch": 0.7740868781001052, "flos": 23659099223040.0, "grad_norm": 1.5630841905142332, "language_loss": 0.62254053, "learning_rate": 5.117921202572785e-07, "loss": 0.64357871, "num_input_tokens_seen": 277775585, "step": 12875, "time_per_iteration": 2.7664403915405273 }, { "auxiliary_loss_clip": 0.0110217, "auxiliary_loss_mlp": 0.0103118, "balance_loss_clip": 1.0375613, "balance_loss_mlp": 1.01843607, "epoch": 0.7741470013527731, "flos": 24717314828160.0, "grad_norm": 2.655709255641646, "language_loss": 0.65554249, "learning_rate": 5.115319631995318e-07, "loss": 0.67687607, "num_input_tokens_seen": 277794795, "step": 12876, "time_per_iteration": 2.696556806564331 }, { "auxiliary_loss_clip": 0.01082571, "auxiliary_loss_mlp": 0.01036668, "balance_loss_clip": 1.03536308, "balance_loss_mlp": 1.02387714, "epoch": 0.7742071246054412, "flos": 21871645701120.0, "grad_norm": 1.869396409074905, "language_loss": 0.71216834, "learning_rate": 5.112718625846433e-07, "loss": 0.73336065, "num_input_tokens_seen": 277813235, "step": 12877, "time_per_iteration": 2.692688465118408 }, { "auxiliary_loss_clip": 0.01073259, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.03579319, "balance_loss_mlp": 1.02468836, "epoch": 0.7742672478581091, "flos": 22674249146880.0, "grad_norm": 1.8081756921528234, "language_loss": 0.82974255, "learning_rate": 5.110118184224736e-07, "loss": 0.85087615, "num_input_tokens_seen": 277832560, "step": 12878, "time_per_iteration": 2.7693746089935303 }, { "auxiliary_loss_clip": 0.01091515, "auxiliary_loss_mlp": 0.01034091, "balance_loss_clip": 1.03874159, "balance_loss_mlp": 1.0199523, "epoch": 0.7743273711107771, "flos": 18840892769280.0, "grad_norm": 1.713941118960012, "language_loss": 0.73144233, "learning_rate": 5.10751830722885e-07, "loss": 0.75269836, "num_input_tokens_seen": 277850120, "step": 12879, "time_per_iteration": 2.6757094860076904 }, { "auxiliary_loss_clip": 0.0108601, "auxiliary_loss_mlp": 0.01027949, "balance_loss_clip": 1.03621507, "balance_loss_mlp": 1.01507425, "epoch": 0.7743874943634451, "flos": 28729326476160.0, "grad_norm": 1.9120090925944704, "language_loss": 0.79831159, "learning_rate": 5.104918994957364e-07, "loss": 0.81945121, "num_input_tokens_seen": 277871020, "step": 12880, "time_per_iteration": 2.8304030895233154 }, { "auxiliary_loss_clip": 0.01087192, "auxiliary_loss_mlp": 0.01037799, "balance_loss_clip": 1.03749204, "balance_loss_mlp": 1.02506709, "epoch": 0.774447617616113, "flos": 21909639312000.0, "grad_norm": 1.5834670208699275, "language_loss": 0.70202577, "learning_rate": 5.102320247508847e-07, "loss": 0.72327566, "num_input_tokens_seen": 277891525, "step": 12881, "time_per_iteration": 2.7064766883850098 }, { "auxiliary_loss_clip": 0.01091686, "auxiliary_loss_mlp": 0.01043391, "balance_loss_clip": 1.03600717, "balance_loss_mlp": 1.02921081, "epoch": 0.774507740868781, "flos": 19500643825920.0, "grad_norm": 1.9376715027667266, "language_loss": 0.84492528, "learning_rate": 5.099722064981832e-07, "loss": 0.86627603, "num_input_tokens_seen": 277910425, "step": 12882, "time_per_iteration": 2.704357862472534 }, { "auxiliary_loss_clip": 0.01002891, "auxiliary_loss_mlp": 0.01007527, "balance_loss_clip": 1.01538849, "balance_loss_mlp": 1.00624514, "epoch": 0.774567864121449, "flos": 59426560402560.0, "grad_norm": 0.7677682887225041, "language_loss": 0.60380936, "learning_rate": 5.097124447474858e-07, "loss": 0.62391353, "num_input_tokens_seen": 277972795, "step": 12883, "time_per_iteration": 3.2393903732299805 }, { "auxiliary_loss_clip": 0.01064866, "auxiliary_loss_mlp": 0.0103875, "balance_loss_clip": 1.03618407, "balance_loss_mlp": 1.023646, "epoch": 0.774627987374117, "flos": 13225326255360.0, "grad_norm": 6.057542406739813, "language_loss": 0.72638834, "learning_rate": 5.094527395086416e-07, "loss": 0.7474246, "num_input_tokens_seen": 277990675, "step": 12884, "time_per_iteration": 2.798553705215454 }, { "auxiliary_loss_clip": 0.01100426, "auxiliary_loss_mlp": 0.01035391, "balance_loss_clip": 1.03860021, "balance_loss_mlp": 1.0236789, "epoch": 0.7746881106267849, "flos": 21394033534080.0, "grad_norm": 1.4931379605931039, "language_loss": 0.8105005, "learning_rate": 5.091930907914986e-07, "loss": 0.83185869, "num_input_tokens_seen": 278010050, "step": 12885, "time_per_iteration": 2.638674736022949 }, { "auxiliary_loss_clip": 0.01108511, "auxiliary_loss_mlp": 0.01036987, "balance_loss_clip": 1.03706241, "balance_loss_mlp": 1.0250479, "epoch": 0.7747482338794529, "flos": 25629338079360.0, "grad_norm": 1.712628084719396, "language_loss": 0.63937521, "learning_rate": 5.089334986059029e-07, "loss": 0.6608302, "num_input_tokens_seen": 278030660, "step": 12886, "time_per_iteration": 2.65639328956604 }, { "auxiliary_loss_clip": 0.01072173, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.03371465, "balance_loss_mlp": 1.01826668, "epoch": 0.7748083571321208, "flos": 11546933402880.0, "grad_norm": 1.8883319339128437, "language_loss": 0.69462442, "learning_rate": 5.086739629616987e-07, "loss": 0.71564978, "num_input_tokens_seen": 278047645, "step": 12887, "time_per_iteration": 4.30758261680603 }, { "auxiliary_loss_clip": 0.01100015, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.03749061, "balance_loss_mlp": 1.02090144, "epoch": 0.7748684803847888, "flos": 19062425900160.0, "grad_norm": 1.702042840830708, "language_loss": 0.70615542, "learning_rate": 5.084144838687275e-07, "loss": 0.72748482, "num_input_tokens_seen": 278066170, "step": 12888, "time_per_iteration": 2.681607246398926 }, { "auxiliary_loss_clip": 0.01101783, "auxiliary_loss_mlp": 0.01034032, "balance_loss_clip": 1.03678536, "balance_loss_mlp": 1.02094269, "epoch": 0.7749286036374567, "flos": 22273162905600.0, "grad_norm": 1.6866747197007421, "language_loss": 0.8189441, "learning_rate": 5.081550613368279e-07, "loss": 0.84030223, "num_input_tokens_seen": 278085545, "step": 12889, "time_per_iteration": 4.1007890701293945 }, { "auxiliary_loss_clip": 0.0107657, "auxiliary_loss_mlp": 0.01028595, "balance_loss_clip": 1.03708053, "balance_loss_mlp": 1.01628113, "epoch": 0.7749887268901248, "flos": 20192462749440.0, "grad_norm": 2.1944312112845057, "language_loss": 0.79288089, "learning_rate": 5.07895695375838e-07, "loss": 0.81393254, "num_input_tokens_seen": 278102995, "step": 12890, "time_per_iteration": 2.8066084384918213 }, { "auxiliary_loss_clip": 0.01084496, "auxiliary_loss_mlp": 0.01034255, "balance_loss_clip": 1.03861511, "balance_loss_mlp": 1.02098715, "epoch": 0.7750488501427927, "flos": 20337541781760.0, "grad_norm": 1.9334241832657861, "language_loss": 0.66675818, "learning_rate": 5.076363859955932e-07, "loss": 0.68794572, "num_input_tokens_seen": 278121460, "step": 12891, "time_per_iteration": 2.7070491313934326 }, { "auxiliary_loss_clip": 0.01100079, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.03662086, "balance_loss_mlp": 1.02079916, "epoch": 0.7751089733954607, "flos": 28364043116160.0, "grad_norm": 1.6084014662033723, "language_loss": 0.78700238, "learning_rate": 5.073771332059257e-07, "loss": 0.80833542, "num_input_tokens_seen": 278143905, "step": 12892, "time_per_iteration": 2.6891307830810547 }, { "auxiliary_loss_clip": 0.01105106, "auxiliary_loss_mlp": 0.01029124, "balance_loss_clip": 1.04138756, "balance_loss_mlp": 1.01607716, "epoch": 0.7751690966481286, "flos": 16943803960320.0, "grad_norm": 2.167077484645157, "language_loss": 0.67164677, "learning_rate": 5.071179370166669e-07, "loss": 0.69298911, "num_input_tokens_seen": 278160850, "step": 12893, "time_per_iteration": 2.6599507331848145 }, { "auxiliary_loss_clip": 0.01022351, "auxiliary_loss_mlp": 0.01001788, "balance_loss_clip": 1.00947237, "balance_loss_mlp": 1.00071514, "epoch": 0.7752292199007966, "flos": 65668050339840.0, "grad_norm": 0.8059900442079823, "language_loss": 0.58441579, "learning_rate": 5.068587974376468e-07, "loss": 0.60465717, "num_input_tokens_seen": 278219950, "step": 12894, "time_per_iteration": 4.591580629348755 }, { "auxiliary_loss_clip": 0.01093145, "auxiliary_loss_mlp": 0.01033525, "balance_loss_clip": 1.03960991, "balance_loss_mlp": 1.02001882, "epoch": 0.7752893431534646, "flos": 20594662312320.0, "grad_norm": 2.026677697607631, "language_loss": 0.77940953, "learning_rate": 5.065997144786895e-07, "loss": 0.80067623, "num_input_tokens_seen": 278237805, "step": 12895, "time_per_iteration": 2.550419807434082 }, { "auxiliary_loss_clip": 0.01070115, "auxiliary_loss_mlp": 0.01035434, "balance_loss_clip": 1.03553057, "balance_loss_mlp": 1.02099133, "epoch": 0.7753494664061326, "flos": 20485350247680.0, "grad_norm": 1.9545538067157624, "language_loss": 0.67606688, "learning_rate": 5.063406881496209e-07, "loss": 0.69712234, "num_input_tokens_seen": 278257660, "step": 12896, "time_per_iteration": 2.573294162750244 }, { "auxiliary_loss_clip": 0.01086749, "auxiliary_loss_mlp": 0.01040132, "balance_loss_clip": 1.03621519, "balance_loss_mlp": 1.02843189, "epoch": 0.7754095896588006, "flos": 20265900105600.0, "grad_norm": 1.6654676924809417, "language_loss": 0.6842171, "learning_rate": 5.060817184602629e-07, "loss": 0.70548594, "num_input_tokens_seen": 278275110, "step": 12897, "time_per_iteration": 2.646030902862549 }, { "auxiliary_loss_clip": 0.0111523, "auxiliary_loss_mlp": 0.01041854, "balance_loss_clip": 1.04096043, "balance_loss_mlp": 1.02774525, "epoch": 0.7754697129114685, "flos": 23331091201920.0, "grad_norm": 1.6795213586563635, "language_loss": 0.75452977, "learning_rate": 5.058228054204364e-07, "loss": 0.77610064, "num_input_tokens_seen": 278293035, "step": 12898, "time_per_iteration": 2.589974880218506 }, { "auxiliary_loss_clip": 0.01101527, "auxiliary_loss_mlp": 0.00771705, "balance_loss_clip": 1.0381062, "balance_loss_mlp": 1.00029922, "epoch": 0.7755298361641365, "flos": 17347619635200.0, "grad_norm": 1.834394412240628, "language_loss": 0.70020843, "learning_rate": 5.055639490399588e-07, "loss": 0.71894073, "num_input_tokens_seen": 278311010, "step": 12899, "time_per_iteration": 2.569342851638794 }, { "auxiliary_loss_clip": 0.01076575, "auxiliary_loss_mlp": 0.01037603, "balance_loss_clip": 1.03510606, "balance_loss_mlp": 1.02406061, "epoch": 0.7755899594168044, "flos": 19645866512640.0, "grad_norm": 2.136951327661946, "language_loss": 0.7508406, "learning_rate": 5.053051493286453e-07, "loss": 0.77198243, "num_input_tokens_seen": 278329900, "step": 12900, "time_per_iteration": 2.6928303241729736 }, { "auxiliary_loss_clip": 0.01093277, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.03764784, "balance_loss_mlp": 1.02486384, "epoch": 0.7756500826694724, "flos": 27414457217280.0, "grad_norm": 2.3252412495258867, "language_loss": 0.77514052, "learning_rate": 5.050464062963113e-07, "loss": 0.79643828, "num_input_tokens_seen": 278349980, "step": 12901, "time_per_iteration": 2.7284209728240967 }, { "auxiliary_loss_clip": 0.01102085, "auxiliary_loss_mlp": 0.01032875, "balance_loss_clip": 1.04122436, "balance_loss_mlp": 1.01966059, "epoch": 0.7757102059221404, "flos": 28730511624960.0, "grad_norm": 1.6090174147117244, "language_loss": 0.7720294, "learning_rate": 5.047877199527666e-07, "loss": 0.79337895, "num_input_tokens_seen": 278372485, "step": 12902, "time_per_iteration": 2.7194478511810303 }, { "auxiliary_loss_clip": 0.01100702, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.03726745, "balance_loss_mlp": 1.01915073, "epoch": 0.7757703291748084, "flos": 22486795044480.0, "grad_norm": 1.743455027715563, "language_loss": 0.73384994, "learning_rate": 5.045290903078215e-07, "loss": 0.75517106, "num_input_tokens_seen": 278391660, "step": 12903, "time_per_iteration": 2.705784797668457 }, { "auxiliary_loss_clip": 0.01089793, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 1.03994238, "balance_loss_mlp": 1.01834655, "epoch": 0.7758304524274763, "flos": 21430159637760.0, "grad_norm": 18.791554059780267, "language_loss": 0.76102394, "learning_rate": 5.042705173712835e-07, "loss": 0.78222954, "num_input_tokens_seen": 278409125, "step": 12904, "time_per_iteration": 2.6935760974884033 }, { "auxiliary_loss_clip": 0.01109136, "auxiliary_loss_mlp": 0.01027029, "balance_loss_clip": 1.03901672, "balance_loss_mlp": 1.01484025, "epoch": 0.7758905756801443, "flos": 23659242877440.0, "grad_norm": 2.2307497011290462, "language_loss": 0.68197864, "learning_rate": 5.040120011529576e-07, "loss": 0.70334029, "num_input_tokens_seen": 278429450, "step": 12905, "time_per_iteration": 2.6777610778808594 }, { "auxiliary_loss_clip": 0.01097117, "auxiliary_loss_mlp": 0.00770393, "balance_loss_clip": 1.03989148, "balance_loss_mlp": 1.00023961, "epoch": 0.7759506989328122, "flos": 28365479660160.0, "grad_norm": 1.6211065141580052, "language_loss": 0.67231417, "learning_rate": 5.037535416626459e-07, "loss": 0.69098926, "num_input_tokens_seen": 278449925, "step": 12906, "time_per_iteration": 2.7337546348571777 }, { "auxiliary_loss_clip": 0.01072574, "auxiliary_loss_mlp": 0.01034309, "balance_loss_clip": 1.03331351, "balance_loss_mlp": 1.02119029, "epoch": 0.7760108221854802, "flos": 14902785354240.0, "grad_norm": 1.9856089108583717, "language_loss": 0.81587309, "learning_rate": 5.034951389101498e-07, "loss": 0.83694196, "num_input_tokens_seen": 278467255, "step": 12907, "time_per_iteration": 2.687721014022827 }, { "auxiliary_loss_clip": 0.01096211, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.03709292, "balance_loss_mlp": 1.02327871, "epoch": 0.7760709454381483, "flos": 14792503622400.0, "grad_norm": 2.1316068769770213, "language_loss": 0.6746856, "learning_rate": 5.032367929052685e-07, "loss": 0.69600445, "num_input_tokens_seen": 278484250, "step": 12908, "time_per_iteration": 2.6765284538269043 }, { "auxiliary_loss_clip": 0.01079432, "auxiliary_loss_mlp": 0.01041924, "balance_loss_clip": 1.03593588, "balance_loss_mlp": 1.02890027, "epoch": 0.7761310686908162, "flos": 17379831156480.0, "grad_norm": 1.487534860967946, "language_loss": 0.70260543, "learning_rate": 5.029785036577976e-07, "loss": 0.72381896, "num_input_tokens_seen": 278502740, "step": 12909, "time_per_iteration": 2.711395502090454 }, { "auxiliary_loss_clip": 0.01100377, "auxiliary_loss_mlp": 0.01035688, "balance_loss_clip": 1.03995848, "balance_loss_mlp": 1.02347469, "epoch": 0.7761911919434842, "flos": 25556547168000.0, "grad_norm": 1.6219590580384207, "language_loss": 0.6782195, "learning_rate": 5.027202711775324e-07, "loss": 0.69958019, "num_input_tokens_seen": 278523890, "step": 12910, "time_per_iteration": 2.703979969024658 }, { "auxiliary_loss_clip": 0.01064156, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.03646898, "balance_loss_mlp": 1.02076757, "epoch": 0.7762513151961521, "flos": 23179763203200.0, "grad_norm": 1.5806807809655474, "language_loss": 0.71997929, "learning_rate": 5.024620954742646e-07, "loss": 0.74095035, "num_input_tokens_seen": 278543185, "step": 12911, "time_per_iteration": 2.8058223724365234 }, { "auxiliary_loss_clip": 0.01114991, "auxiliary_loss_mlp": 0.00771737, "balance_loss_clip": 1.04081869, "balance_loss_mlp": 1.00030136, "epoch": 0.7763114384488201, "flos": 21689614552320.0, "grad_norm": 3.3864854327362592, "language_loss": 0.63468528, "learning_rate": 5.022039765577836e-07, "loss": 0.65355253, "num_input_tokens_seen": 278559220, "step": 12912, "time_per_iteration": 2.641256809234619 }, { "auxiliary_loss_clip": 0.01001929, "auxiliary_loss_mlp": 0.01001295, "balance_loss_clip": 1.00920105, "balance_loss_mlp": 1.00030553, "epoch": 0.776371561701488, "flos": 69025554316800.0, "grad_norm": 0.7664213178657265, "language_loss": 0.53195411, "learning_rate": 5.019459144378779e-07, "loss": 0.55198634, "num_input_tokens_seen": 278618185, "step": 12913, "time_per_iteration": 3.3077611923217773 }, { "auxiliary_loss_clip": 0.01093414, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.04078877, "balance_loss_mlp": 1.02395415, "epoch": 0.776431684954156, "flos": 22893914770560.0, "grad_norm": 1.9204798335439963, "language_loss": 0.62302238, "learning_rate": 5.016879091243338e-07, "loss": 0.644328, "num_input_tokens_seen": 278636210, "step": 12914, "time_per_iteration": 2.7050273418426514 }, { "auxiliary_loss_clip": 0.0108926, "auxiliary_loss_mlp": 0.01032265, "balance_loss_clip": 1.03807616, "balance_loss_mlp": 1.01977742, "epoch": 0.776491808206824, "flos": 20261554560000.0, "grad_norm": 1.7420543212332402, "language_loss": 0.82108057, "learning_rate": 5.014299606269339e-07, "loss": 0.84229577, "num_input_tokens_seen": 278653305, "step": 12915, "time_per_iteration": 2.7126035690307617 }, { "auxiliary_loss_clip": 0.01099353, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.03824329, "balance_loss_mlp": 1.02410579, "epoch": 0.776551931459492, "flos": 26759051706240.0, "grad_norm": 1.7876763975048962, "language_loss": 0.74624789, "learning_rate": 5.011720689554603e-07, "loss": 0.76762396, "num_input_tokens_seen": 278671850, "step": 12916, "time_per_iteration": 2.6998839378356934 }, { "auxiliary_loss_clip": 0.01056671, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 1.03597093, "balance_loss_mlp": 1.02252531, "epoch": 0.7766120547121599, "flos": 52665080250240.0, "grad_norm": 1.5017921162458647, "language_loss": 0.65888739, "learning_rate": 5.009142341196919e-07, "loss": 0.67982292, "num_input_tokens_seen": 278697860, "step": 12917, "time_per_iteration": 3.097477674484253 }, { "auxiliary_loss_clip": 0.01099882, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.03583741, "balance_loss_mlp": 1.02095342, "epoch": 0.7766721779648279, "flos": 25156215112320.0, "grad_norm": 1.458879337938595, "language_loss": 0.64478171, "learning_rate": 5.006564561294065e-07, "loss": 0.66611689, "num_input_tokens_seen": 278720655, "step": 12918, "time_per_iteration": 2.7446439266204834 }, { "auxiliary_loss_clip": 0.01111393, "auxiliary_loss_mlp": 0.01037511, "balance_loss_clip": 1.0397799, "balance_loss_mlp": 1.02533412, "epoch": 0.7767323012174958, "flos": 23760761690880.0, "grad_norm": 2.338899246619233, "language_loss": 0.72807854, "learning_rate": 5.003987349943777e-07, "loss": 0.74956757, "num_input_tokens_seen": 278737375, "step": 12919, "time_per_iteration": 2.631877899169922 }, { "auxiliary_loss_clip": 0.01069782, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.03774428, "balance_loss_mlp": 1.01674342, "epoch": 0.7767924244701638, "flos": 22086642556800.0, "grad_norm": 2.3274821948551265, "language_loss": 0.78924805, "learning_rate": 5.001410707243792e-07, "loss": 0.8102448, "num_input_tokens_seen": 278756510, "step": 12920, "time_per_iteration": 2.8133649826049805 }, { "auxiliary_loss_clip": 0.01102553, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.03963828, "balance_loss_mlp": 1.0209614, "epoch": 0.7768525477228319, "flos": 21981640124160.0, "grad_norm": 11.784624421403892, "language_loss": 0.70922899, "learning_rate": 4.998834633291829e-07, "loss": 0.73059404, "num_input_tokens_seen": 278775410, "step": 12921, "time_per_iteration": 2.6603341102600098 }, { "auxiliary_loss_clip": 0.01105803, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 1.04023492, "balance_loss_mlp": 1.02050102, "epoch": 0.7769126709754998, "flos": 21794581071360.0, "grad_norm": 3.3431959549038885, "language_loss": 0.76222974, "learning_rate": 4.996259128185547e-07, "loss": 0.7836318, "num_input_tokens_seen": 278794260, "step": 12922, "time_per_iteration": 2.7015247344970703 }, { "auxiliary_loss_clip": 0.01063506, "auxiliary_loss_mlp": 0.0103979, "balance_loss_clip": 1.03708482, "balance_loss_mlp": 1.0270822, "epoch": 0.7769727942281678, "flos": 20047994248320.0, "grad_norm": 1.6454971966787777, "language_loss": 0.80262136, "learning_rate": 4.993684192022625e-07, "loss": 0.82365435, "num_input_tokens_seen": 278813290, "step": 12923, "time_per_iteration": 2.7818875312805176 }, { "auxiliary_loss_clip": 0.01076451, "auxiliary_loss_mlp": 0.01040924, "balance_loss_clip": 1.04072833, "balance_loss_mlp": 1.02828157, "epoch": 0.7770329174808357, "flos": 21686777377920.0, "grad_norm": 2.0408616917549067, "language_loss": 0.92191219, "learning_rate": 4.991109824900699e-07, "loss": 0.94308597, "num_input_tokens_seen": 278830610, "step": 12924, "time_per_iteration": 2.8274574279785156 }, { "auxiliary_loss_clip": 0.01099709, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.03679144, "balance_loss_mlp": 1.02001929, "epoch": 0.7770930407335037, "flos": 25849255098240.0, "grad_norm": 1.8094451984441313, "language_loss": 0.66132891, "learning_rate": 4.988536026917401e-07, "loss": 0.68265527, "num_input_tokens_seen": 278849530, "step": 12925, "time_per_iteration": 2.69667649269104 }, { "auxiliary_loss_clip": 0.01078276, "auxiliary_loss_mlp": 0.01032472, "balance_loss_clip": 1.03612852, "balance_loss_mlp": 1.01974022, "epoch": 0.7771531639861716, "flos": 24347865490560.0, "grad_norm": 2.0756313412895815, "language_loss": 0.7192542, "learning_rate": 4.985962798170314e-07, "loss": 0.74036169, "num_input_tokens_seen": 278869005, "step": 12926, "time_per_iteration": 4.349314451217651 }, { "auxiliary_loss_clip": 0.01103533, "auxiliary_loss_mlp": 0.01029731, "balance_loss_clip": 1.0389967, "balance_loss_mlp": 1.01636767, "epoch": 0.7772132872388396, "flos": 25629948610560.0, "grad_norm": 1.6573712780681307, "language_loss": 0.65608656, "learning_rate": 4.983390138757027e-07, "loss": 0.67741919, "num_input_tokens_seen": 278888790, "step": 12927, "time_per_iteration": 4.16760778427124 }, { "auxiliary_loss_clip": 0.01089675, "auxiliary_loss_mlp": 0.01039623, "balance_loss_clip": 1.03830886, "balance_loss_mlp": 1.02623534, "epoch": 0.7772734104915076, "flos": 26067412350720.0, "grad_norm": 1.7538632415038142, "language_loss": 0.72743905, "learning_rate": 4.980818048775093e-07, "loss": 0.74873203, "num_input_tokens_seen": 278908150, "step": 12928, "time_per_iteration": 2.755859851837158 }, { "auxiliary_loss_clip": 0.01071134, "auxiliary_loss_mlp": 0.01033028, "balance_loss_clip": 1.03876746, "balance_loss_mlp": 1.02003419, "epoch": 0.7773335337441756, "flos": 22925048883840.0, "grad_norm": 1.8588967363228528, "language_loss": 0.74152476, "learning_rate": 4.978246528322036e-07, "loss": 0.76256645, "num_input_tokens_seen": 278927425, "step": 12929, "time_per_iteration": 4.2707133293151855 }, { "auxiliary_loss_clip": 0.01074549, "auxiliary_loss_mlp": 0.01031484, "balance_loss_clip": 1.03665006, "balance_loss_mlp": 1.01832283, "epoch": 0.7773936569968435, "flos": 20776765288320.0, "grad_norm": 1.9039476143036729, "language_loss": 0.7758745, "learning_rate": 4.975675577495377e-07, "loss": 0.79693484, "num_input_tokens_seen": 278946475, "step": 12930, "time_per_iteration": 2.7537360191345215 }, { "auxiliary_loss_clip": 0.01113583, "auxiliary_loss_mlp": 0.01034326, "balance_loss_clip": 1.04102445, "balance_loss_mlp": 1.02152324, "epoch": 0.7774537802495115, "flos": 20372267255040.0, "grad_norm": 1.8280345361242294, "language_loss": 0.79341066, "learning_rate": 4.973105196392613e-07, "loss": 0.81488979, "num_input_tokens_seen": 278964345, "step": 12931, "time_per_iteration": 2.608551502227783 }, { "auxiliary_loss_clip": 0.01003397, "auxiliary_loss_mlp": 0.01004694, "balance_loss_clip": 1.02223182, "balance_loss_mlp": 1.00322199, "epoch": 0.7775139035021794, "flos": 53912081738880.0, "grad_norm": 0.8525982586440103, "language_loss": 0.59734511, "learning_rate": 4.970535385111199e-07, "loss": 0.61742604, "num_input_tokens_seen": 279022380, "step": 12932, "time_per_iteration": 3.19950270652771 }, { "auxiliary_loss_clip": 0.01102586, "auxiliary_loss_mlp": 0.01034922, "balance_loss_clip": 1.03881812, "balance_loss_mlp": 1.02250659, "epoch": 0.7775740267548474, "flos": 28842481296000.0, "grad_norm": 1.5001192410807755, "language_loss": 0.76264286, "learning_rate": 4.967966143748595e-07, "loss": 0.78401792, "num_input_tokens_seen": 279044275, "step": 12933, "time_per_iteration": 2.838245391845703 }, { "auxiliary_loss_clip": 0.01086722, "auxiliary_loss_mlp": 0.01039418, "balance_loss_clip": 1.03855717, "balance_loss_mlp": 1.02625704, "epoch": 0.7776341500075155, "flos": 21872471713920.0, "grad_norm": 1.9749580896078973, "language_loss": 0.73223925, "learning_rate": 4.965397472402215e-07, "loss": 0.75350064, "num_input_tokens_seen": 279063375, "step": 12934, "time_per_iteration": 4.214959621429443 }, { "auxiliary_loss_clip": 0.01069437, "auxiliary_loss_mlp": 0.01028937, "balance_loss_clip": 1.03676343, "balance_loss_mlp": 1.01571107, "epoch": 0.7776942732601834, "flos": 20229845829120.0, "grad_norm": 1.8916304823351247, "language_loss": 0.70821279, "learning_rate": 4.962829371169475e-07, "loss": 0.72919655, "num_input_tokens_seen": 279082680, "step": 12935, "time_per_iteration": 2.8492965698242188 }, { "auxiliary_loss_clip": 0.0108792, "auxiliary_loss_mlp": 0.00771991, "balance_loss_clip": 1.03933454, "balance_loss_mlp": 1.0001905, "epoch": 0.7777543965128514, "flos": 22231829329920.0, "grad_norm": 1.8453474181089096, "language_loss": 0.83784235, "learning_rate": 4.960261840147746e-07, "loss": 0.85644144, "num_input_tokens_seen": 279099805, "step": 12936, "time_per_iteration": 2.6989262104034424 }, { "auxiliary_loss_clip": 0.01105595, "auxiliary_loss_mlp": 0.01032215, "balance_loss_clip": 1.03868532, "balance_loss_mlp": 1.01979923, "epoch": 0.7778145197655193, "flos": 14501950508160.0, "grad_norm": 2.021883178321684, "language_loss": 0.6742574, "learning_rate": 4.957694879434397e-07, "loss": 0.69563556, "num_input_tokens_seen": 279117975, "step": 12937, "time_per_iteration": 2.6387362480163574 }, { "auxiliary_loss_clip": 0.01113841, "auxiliary_loss_mlp": 0.01033827, "balance_loss_clip": 1.03934264, "balance_loss_mlp": 1.021245, "epoch": 0.7778746430181873, "flos": 21140288881920.0, "grad_norm": 1.5206574462967066, "language_loss": 0.87595057, "learning_rate": 4.955128489126777e-07, "loss": 0.89742726, "num_input_tokens_seen": 279137255, "step": 12938, "time_per_iteration": 2.699613332748413 }, { "auxiliary_loss_clip": 0.01101775, "auxiliary_loss_mlp": 0.01034, "balance_loss_clip": 1.03820324, "balance_loss_mlp": 1.02050602, "epoch": 0.7779347662708552, "flos": 20266366982400.0, "grad_norm": 2.05617872988158, "language_loss": 0.8537035, "learning_rate": 4.95256266932218e-07, "loss": 0.87506127, "num_input_tokens_seen": 279154500, "step": 12939, "time_per_iteration": 2.648550510406494 }, { "auxiliary_loss_clip": 0.01108461, "auxiliary_loss_mlp": 0.00770264, "balance_loss_clip": 1.03820562, "balance_loss_mlp": 1.00022864, "epoch": 0.7779948895235232, "flos": 19209013303680.0, "grad_norm": 1.778278891076628, "language_loss": 0.69293523, "learning_rate": 4.949997420117915e-07, "loss": 0.71172249, "num_input_tokens_seen": 279173635, "step": 12940, "time_per_iteration": 2.5725789070129395 }, { "auxiliary_loss_clip": 0.01077299, "auxiliary_loss_mlp": 0.01026745, "balance_loss_clip": 1.03700173, "balance_loss_mlp": 1.01481247, "epoch": 0.7780550127761912, "flos": 23914711382400.0, "grad_norm": 2.1657166687563887, "language_loss": 0.77734792, "learning_rate": 4.947432741611255e-07, "loss": 0.7983883, "num_input_tokens_seen": 279194430, "step": 12941, "time_per_iteration": 2.74072265625 }, { "auxiliary_loss_clip": 0.01105122, "auxiliary_loss_mlp": 0.01039107, "balance_loss_clip": 1.03774464, "balance_loss_mlp": 1.02505839, "epoch": 0.7781151360288592, "flos": 32415951795840.0, "grad_norm": 2.6599455867272157, "language_loss": 0.73127586, "learning_rate": 4.944868633899462e-07, "loss": 0.75271809, "num_input_tokens_seen": 279212920, "step": 12942, "time_per_iteration": 2.717205047607422 }, { "auxiliary_loss_clip": 0.0105644, "auxiliary_loss_mlp": 0.01043958, "balance_loss_clip": 1.03546214, "balance_loss_mlp": 1.03034472, "epoch": 0.7781752592815271, "flos": 22346384780160.0, "grad_norm": 2.908887240584156, "language_loss": 0.67917764, "learning_rate": 4.942305097079751e-07, "loss": 0.7001816, "num_input_tokens_seen": 279232310, "step": 12943, "time_per_iteration": 2.7333195209503174 }, { "auxiliary_loss_clip": 0.01002881, "auxiliary_loss_mlp": 0.01004649, "balance_loss_clip": 1.00792861, "balance_loss_mlp": 1.00336123, "epoch": 0.7782353825341951, "flos": 70460183520000.0, "grad_norm": 0.7871784265530566, "language_loss": 0.5845629, "learning_rate": 4.939742131249347e-07, "loss": 0.60463822, "num_input_tokens_seen": 279295375, "step": 12944, "time_per_iteration": 3.390233039855957 }, { "auxiliary_loss_clip": 0.01113922, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.03909469, "balance_loss_mlp": 1.02220058, "epoch": 0.778295505786863, "flos": 19062569554560.0, "grad_norm": 1.8086848755411578, "language_loss": 0.67537427, "learning_rate": 4.937179736505428e-07, "loss": 0.69687426, "num_input_tokens_seen": 279313660, "step": 12945, "time_per_iteration": 2.6378118991851807 }, { "auxiliary_loss_clip": 0.01098229, "auxiliary_loss_mlp": 0.0103623, "balance_loss_clip": 1.03687143, "balance_loss_mlp": 1.02295065, "epoch": 0.778355629039531, "flos": 20999734963200.0, "grad_norm": 2.112440511554347, "language_loss": 0.69157761, "learning_rate": 4.93461791294516e-07, "loss": 0.71292222, "num_input_tokens_seen": 279334495, "step": 12946, "time_per_iteration": 2.7236101627349854 }, { "auxiliary_loss_clip": 0.0111324, "auxiliary_loss_mlp": 0.0102872, "balance_loss_clip": 1.03970623, "balance_loss_mlp": 1.01546407, "epoch": 0.7784157522921991, "flos": 21398091770880.0, "grad_norm": 2.366818430498899, "language_loss": 0.65404934, "learning_rate": 4.932056660665689e-07, "loss": 0.67546898, "num_input_tokens_seen": 279352985, "step": 12947, "time_per_iteration": 2.6700103282928467 }, { "auxiliary_loss_clip": 0.01049825, "auxiliary_loss_mlp": 0.01043003, "balance_loss_clip": 1.03298378, "balance_loss_mlp": 1.02796459, "epoch": 0.778475875544867, "flos": 20813861059200.0, "grad_norm": 1.8657083989144876, "language_loss": 0.64925945, "learning_rate": 4.929495979764147e-07, "loss": 0.67018777, "num_input_tokens_seen": 279371360, "step": 12948, "time_per_iteration": 2.8412203788757324 }, { "auxiliary_loss_clip": 0.01112305, "auxiliary_loss_mlp": 0.01035608, "balance_loss_clip": 1.03932905, "balance_loss_mlp": 1.02261424, "epoch": 0.778535998797535, "flos": 14355363104640.0, "grad_norm": 1.8274723515678126, "language_loss": 0.75157881, "learning_rate": 4.926935870337625e-07, "loss": 0.77305794, "num_input_tokens_seen": 279389400, "step": 12949, "time_per_iteration": 2.641893148422241 }, { "auxiliary_loss_clip": 0.01116388, "auxiliary_loss_mlp": 0.01033844, "balance_loss_clip": 1.04068756, "balance_loss_mlp": 1.02045703, "epoch": 0.7785961220502029, "flos": 19209552007680.0, "grad_norm": 2.2581725959312156, "language_loss": 0.68925655, "learning_rate": 4.924376332483202e-07, "loss": 0.71075886, "num_input_tokens_seen": 279409715, "step": 12950, "time_per_iteration": 2.7213573455810547 }, { "auxiliary_loss_clip": 0.01096074, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.03823721, "balance_loss_mlp": 1.02142787, "epoch": 0.7786562453028709, "flos": 25738757884800.0, "grad_norm": 1.7372750277816864, "language_loss": 0.71980989, "learning_rate": 4.921817366297938e-07, "loss": 0.74111497, "num_input_tokens_seen": 279427705, "step": 12951, "time_per_iteration": 2.741422414779663 }, { "auxiliary_loss_clip": 0.01087111, "auxiliary_loss_mlp": 0.01035076, "balance_loss_clip": 1.03640008, "balance_loss_mlp": 1.02192152, "epoch": 0.7787163685555388, "flos": 25739440243200.0, "grad_norm": 1.8924083949863153, "language_loss": 0.65915614, "learning_rate": 4.919258971878877e-07, "loss": 0.68037808, "num_input_tokens_seen": 279448215, "step": 12952, "time_per_iteration": 2.770171880722046 }, { "auxiliary_loss_clip": 0.0108209, "auxiliary_loss_mlp": 0.01031543, "balance_loss_clip": 1.03549063, "balance_loss_mlp": 1.01928258, "epoch": 0.7787764918082068, "flos": 22747722416640.0, "grad_norm": 1.528475201753157, "language_loss": 0.81114817, "learning_rate": 4.916701149323022e-07, "loss": 0.83228457, "num_input_tokens_seen": 279466260, "step": 12953, "time_per_iteration": 2.708888530731201 }, { "auxiliary_loss_clip": 0.01118162, "auxiliary_loss_mlp": 0.01035624, "balance_loss_clip": 1.04281271, "balance_loss_mlp": 1.02266002, "epoch": 0.7788366150608748, "flos": 15190860430080.0, "grad_norm": 2.122341354514922, "language_loss": 0.76798481, "learning_rate": 4.91414389872737e-07, "loss": 0.78952265, "num_input_tokens_seen": 279484520, "step": 12954, "time_per_iteration": 2.5349183082580566 }, { "auxiliary_loss_clip": 0.01100423, "auxiliary_loss_mlp": 0.01030129, "balance_loss_clip": 1.0369153, "balance_loss_mlp": 1.01788616, "epoch": 0.7788967383135428, "flos": 21210242618880.0, "grad_norm": 1.5352459629047766, "language_loss": 0.72880197, "learning_rate": 4.911587220188905e-07, "loss": 0.75010741, "num_input_tokens_seen": 279503130, "step": 12955, "time_per_iteration": 2.7405974864959717 }, { "auxiliary_loss_clip": 0.01079595, "auxiliary_loss_mlp": 0.0104146, "balance_loss_clip": 1.03563166, "balance_loss_mlp": 1.02835917, "epoch": 0.7789568615662107, "flos": 21682970536320.0, "grad_norm": 1.6339057488297875, "language_loss": 0.68833733, "learning_rate": 4.909031113804551e-07, "loss": 0.70954788, "num_input_tokens_seen": 279521930, "step": 12956, "time_per_iteration": 2.6949398517608643 }, { "auxiliary_loss_clip": 0.01076197, "auxiliary_loss_mlp": 0.01034972, "balance_loss_clip": 1.0365479, "balance_loss_mlp": 1.0227586, "epoch": 0.7790169848188787, "flos": 26360371676160.0, "grad_norm": 1.6442430086846629, "language_loss": 0.76081383, "learning_rate": 4.906475579671252e-07, "loss": 0.78192556, "num_input_tokens_seen": 279542375, "step": 12957, "time_per_iteration": 2.7577781677246094 }, { "auxiliary_loss_clip": 0.01041804, "auxiliary_loss_mlp": 0.01027809, "balance_loss_clip": 1.03647232, "balance_loss_mlp": 1.01506531, "epoch": 0.7790771080715466, "flos": 25516183259520.0, "grad_norm": 1.5510435056277987, "language_loss": 0.77168477, "learning_rate": 4.903920617885917e-07, "loss": 0.79238093, "num_input_tokens_seen": 279561885, "step": 12958, "time_per_iteration": 2.902573585510254 }, { "auxiliary_loss_clip": 0.01099333, "auxiliary_loss_mlp": 0.01042234, "balance_loss_clip": 1.03847003, "balance_loss_mlp": 1.02687943, "epoch": 0.7791372313242146, "flos": 16034186920320.0, "grad_norm": 2.00916706928726, "language_loss": 0.71559989, "learning_rate": 4.901366228545418e-07, "loss": 0.73701555, "num_input_tokens_seen": 279579965, "step": 12959, "time_per_iteration": 2.6020190715789795 }, { "auxiliary_loss_clip": 0.01094821, "auxiliary_loss_mlp": 0.00771197, "balance_loss_clip": 1.03832543, "balance_loss_mlp": 1.00027037, "epoch": 0.7791973545768827, "flos": 23842207779840.0, "grad_norm": 1.6491836005518046, "language_loss": 0.78150439, "learning_rate": 4.898812411746632e-07, "loss": 0.80016458, "num_input_tokens_seen": 279599030, "step": 12960, "time_per_iteration": 2.6712677478790283 }, { "auxiliary_loss_clip": 0.01104299, "auxiliary_loss_mlp": 0.01040928, "balance_loss_clip": 1.03950214, "balance_loss_mlp": 1.02792239, "epoch": 0.7792574778295506, "flos": 24168384207360.0, "grad_norm": 2.171108267887673, "language_loss": 0.75204015, "learning_rate": 4.896259167586385e-07, "loss": 0.77349246, "num_input_tokens_seen": 279614400, "step": 12961, "time_per_iteration": 2.6742923259735107 }, { "auxiliary_loss_clip": 0.01087433, "auxiliary_loss_mlp": 0.01038038, "balance_loss_clip": 1.03869224, "balance_loss_mlp": 1.02624202, "epoch": 0.7793176010822186, "flos": 21464921024640.0, "grad_norm": 1.7879944844879476, "language_loss": 0.73984349, "learning_rate": 4.893706496161511e-07, "loss": 0.76109815, "num_input_tokens_seen": 279633745, "step": 12962, "time_per_iteration": 2.6879115104675293 }, { "auxiliary_loss_clip": 0.01101036, "auxiliary_loss_mlp": 0.01030588, "balance_loss_clip": 1.03875148, "balance_loss_mlp": 1.01782036, "epoch": 0.7793777243348865, "flos": 20666699038080.0, "grad_norm": 1.7723287922493858, "language_loss": 0.69576943, "learning_rate": 4.891154397568795e-07, "loss": 0.71708572, "num_input_tokens_seen": 279651165, "step": 12963, "time_per_iteration": 2.6385724544525146 }, { "auxiliary_loss_clip": 0.01101416, "auxiliary_loss_mlp": 0.00770165, "balance_loss_clip": 1.04028928, "balance_loss_mlp": 1.00022078, "epoch": 0.7794378475875545, "flos": 27125771610240.0, "grad_norm": 1.6031620431196494, "language_loss": 0.63797098, "learning_rate": 4.888602871905019e-07, "loss": 0.65668678, "num_input_tokens_seen": 279671175, "step": 12964, "time_per_iteration": 2.6388909816741943 }, { "auxiliary_loss_clip": 0.01092497, "auxiliary_loss_mlp": 0.010352, "balance_loss_clip": 1.03853321, "balance_loss_mlp": 1.02259946, "epoch": 0.7794979708402224, "flos": 28074136446720.0, "grad_norm": 1.8780726000065868, "language_loss": 0.76702619, "learning_rate": 4.88605191926694e-07, "loss": 0.7883032, "num_input_tokens_seen": 279688675, "step": 12965, "time_per_iteration": 4.301928758621216 }, { "auxiliary_loss_clip": 0.01089139, "auxiliary_loss_mlp": 0.01039643, "balance_loss_clip": 1.03389204, "balance_loss_mlp": 1.02626801, "epoch": 0.7795580940928905, "flos": 26869548919680.0, "grad_norm": 1.824856626010527, "language_loss": 0.73063076, "learning_rate": 4.883501539751289e-07, "loss": 0.75191855, "num_input_tokens_seen": 279710245, "step": 12966, "time_per_iteration": 4.184988498687744 }, { "auxiliary_loss_clip": 0.01088561, "auxiliary_loss_mlp": 0.00769043, "balance_loss_clip": 1.04008389, "balance_loss_mlp": 1.00027704, "epoch": 0.7796182173455584, "flos": 23835384195840.0, "grad_norm": 1.6189038671897886, "language_loss": 0.7464664, "learning_rate": 4.880951733454768e-07, "loss": 0.76504242, "num_input_tokens_seen": 279729045, "step": 12967, "time_per_iteration": 2.7788522243499756 }, { "auxiliary_loss_clip": 0.0111227, "auxiliary_loss_mlp": 0.01032605, "balance_loss_clip": 1.03953099, "balance_loss_mlp": 1.01915836, "epoch": 0.7796783405982264, "flos": 19792238434560.0, "grad_norm": 3.3219826288937253, "language_loss": 0.72220939, "learning_rate": 4.878402500474073e-07, "loss": 0.74365819, "num_input_tokens_seen": 279748350, "step": 12968, "time_per_iteration": 4.058116436004639 }, { "auxiliary_loss_clip": 0.01085681, "auxiliary_loss_mlp": 0.01039035, "balance_loss_clip": 1.03827214, "balance_loss_mlp": 1.02664959, "epoch": 0.7797384638508943, "flos": 15450207603840.0, "grad_norm": 1.9018596701865034, "language_loss": 0.61007255, "learning_rate": 4.875853840905874e-07, "loss": 0.6313197, "num_input_tokens_seen": 279765620, "step": 12969, "time_per_iteration": 2.6471657752990723 }, { "auxiliary_loss_clip": 0.01090989, "auxiliary_loss_mlp": 0.01034313, "balance_loss_clip": 1.03693545, "balance_loss_mlp": 1.0227617, "epoch": 0.7797985871035623, "flos": 20922742160640.0, "grad_norm": 1.800586767958732, "language_loss": 0.70180488, "learning_rate": 4.873305754846811e-07, "loss": 0.72305787, "num_input_tokens_seen": 279782485, "step": 12970, "time_per_iteration": 2.6519546508789062 }, { "auxiliary_loss_clip": 0.01075649, "auxiliary_loss_mlp": 0.00770753, "balance_loss_clip": 1.04074073, "balance_loss_mlp": 1.00021172, "epoch": 0.7798587103562302, "flos": 36937212514560.0, "grad_norm": 1.645308207198137, "language_loss": 0.72213817, "learning_rate": 4.870758242393507e-07, "loss": 0.7406022, "num_input_tokens_seen": 279804170, "step": 12971, "time_per_iteration": 2.818019390106201 }, { "auxiliary_loss_clip": 0.0106953, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.03421068, "balance_loss_mlp": 1.02360034, "epoch": 0.7799188336088982, "flos": 22419283432320.0, "grad_norm": 3.5320834107901486, "language_loss": 0.74761558, "learning_rate": 4.868211303642578e-07, "loss": 0.76868188, "num_input_tokens_seen": 279823730, "step": 12972, "time_per_iteration": 2.7724294662475586 }, { "auxiliary_loss_clip": 0.01111753, "auxiliary_loss_mlp": 0.01025205, "balance_loss_clip": 1.03887677, "balance_loss_mlp": 1.01239038, "epoch": 0.7799789568615663, "flos": 18880466578560.0, "grad_norm": 1.8982422603948057, "language_loss": 0.71497881, "learning_rate": 4.865664938690584e-07, "loss": 0.73634839, "num_input_tokens_seen": 279843035, "step": 12973, "time_per_iteration": 4.207505226135254 }, { "auxiliary_loss_clip": 0.01099331, "auxiliary_loss_mlp": 0.01032267, "balance_loss_clip": 1.03967643, "balance_loss_mlp": 1.0208292, "epoch": 0.7800390801142342, "flos": 20262272832000.0, "grad_norm": 1.9924582249119662, "language_loss": 0.77612895, "learning_rate": 4.863119147634089e-07, "loss": 0.79744494, "num_input_tokens_seen": 279861450, "step": 12974, "time_per_iteration": 2.6812784671783447 }, { "auxiliary_loss_clip": 0.01077043, "auxiliary_loss_mlp": 0.01031844, "balance_loss_clip": 1.0368197, "balance_loss_mlp": 1.01885045, "epoch": 0.7800992033669022, "flos": 16690310703360.0, "grad_norm": 1.5902544107071221, "language_loss": 0.69343281, "learning_rate": 4.86057393056964e-07, "loss": 0.71452165, "num_input_tokens_seen": 279878660, "step": 12975, "time_per_iteration": 2.668877124786377 }, { "auxiliary_loss_clip": 0.01074216, "auxiliary_loss_mlp": 0.01029987, "balance_loss_clip": 1.03641438, "balance_loss_mlp": 1.0174228, "epoch": 0.7801593266195701, "flos": 18585208782720.0, "grad_norm": 1.9719657409906464, "language_loss": 0.82066941, "learning_rate": 4.858029287593739e-07, "loss": 0.84171152, "num_input_tokens_seen": 279895685, "step": 12976, "time_per_iteration": 2.760437488555908 }, { "auxiliary_loss_clip": 0.01090901, "auxiliary_loss_mlp": 0.00770609, "balance_loss_clip": 1.03640187, "balance_loss_mlp": 1.00019145, "epoch": 0.7802194498722381, "flos": 25484941405440.0, "grad_norm": 1.5169608974947786, "language_loss": 0.66052604, "learning_rate": 4.85548521880289e-07, "loss": 0.6791411, "num_input_tokens_seen": 279917240, "step": 12977, "time_per_iteration": 2.7686586380004883 }, { "auxiliary_loss_clip": 0.01087933, "auxiliary_loss_mlp": 0.01028467, "balance_loss_clip": 1.03792357, "balance_loss_mlp": 1.01699352, "epoch": 0.780279573124906, "flos": 31176315573120.0, "grad_norm": 1.5120129099478161, "language_loss": 0.74935395, "learning_rate": 4.852941724293554e-07, "loss": 0.77051795, "num_input_tokens_seen": 279938665, "step": 12978, "time_per_iteration": 2.775379180908203 }, { "auxiliary_loss_clip": 0.01087009, "auxiliary_loss_mlp": 0.01044028, "balance_loss_clip": 1.03668034, "balance_loss_mlp": 1.02886498, "epoch": 0.780339696377574, "flos": 26944027770240.0, "grad_norm": 2.430538160229645, "language_loss": 0.62134832, "learning_rate": 4.85039880416219e-07, "loss": 0.64265871, "num_input_tokens_seen": 279957965, "step": 12979, "time_per_iteration": 2.715329170227051 }, { "auxiliary_loss_clip": 0.01111779, "auxiliary_loss_mlp": 0.01030782, "balance_loss_clip": 1.03983402, "balance_loss_mlp": 1.01825941, "epoch": 0.780399819630242, "flos": 27957426180480.0, "grad_norm": 1.9760483655422685, "language_loss": 0.77112854, "learning_rate": 4.847856458505217e-07, "loss": 0.79255414, "num_input_tokens_seen": 279977490, "step": 12980, "time_per_iteration": 2.6605019569396973 }, { "auxiliary_loss_clip": 0.0111412, "auxiliary_loss_mlp": 0.01033295, "balance_loss_clip": 1.03999209, "balance_loss_mlp": 1.02089083, "epoch": 0.78045994288291, "flos": 22486795044480.0, "grad_norm": 1.9592981721345673, "language_loss": 0.77939653, "learning_rate": 4.845314687419046e-07, "loss": 0.80087066, "num_input_tokens_seen": 279994220, "step": 12981, "time_per_iteration": 2.658205032348633 }, { "auxiliary_loss_clip": 0.01069277, "auxiliary_loss_mlp": 0.01036103, "balance_loss_clip": 1.0379262, "balance_loss_mlp": 1.02320492, "epoch": 0.7805200661355779, "flos": 20850849089280.0, "grad_norm": 2.387436806844423, "language_loss": 0.72364557, "learning_rate": 4.842773491000067e-07, "loss": 0.74469936, "num_input_tokens_seen": 280012590, "step": 12982, "time_per_iteration": 2.6541051864624023 }, { "auxiliary_loss_clip": 0.01084276, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.03608441, "balance_loss_mlp": 1.01907182, "epoch": 0.7805801893882459, "flos": 25665966973440.0, "grad_norm": 1.5019346121142914, "language_loss": 0.73412144, "learning_rate": 4.840232869344636e-07, "loss": 0.75527191, "num_input_tokens_seen": 280033700, "step": 12983, "time_per_iteration": 2.6957807540893555 }, { "auxiliary_loss_clip": 0.01083908, "auxiliary_loss_mlp": 0.01031264, "balance_loss_clip": 1.03741837, "balance_loss_mlp": 1.0185684, "epoch": 0.7806403126409138, "flos": 11327806483200.0, "grad_norm": 1.8415039374254183, "language_loss": 0.74685752, "learning_rate": 4.837692822549086e-07, "loss": 0.76800919, "num_input_tokens_seen": 280052215, "step": 12984, "time_per_iteration": 2.6339285373687744 }, { "auxiliary_loss_clip": 0.0108127, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 1.03251958, "balance_loss_mlp": 1.02092516, "epoch": 0.7807004358935818, "flos": 19573362910080.0, "grad_norm": 2.0272215357184646, "language_loss": 0.81049699, "learning_rate": 4.835153350709746e-07, "loss": 0.83163786, "num_input_tokens_seen": 280070525, "step": 12985, "time_per_iteration": 2.6104180812835693 }, { "auxiliary_loss_clip": 0.01088889, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.03684783, "balance_loss_mlp": 1.02074158, "epoch": 0.7807605591462499, "flos": 19135827342720.0, "grad_norm": 2.394678033024852, "language_loss": 0.76863611, "learning_rate": 4.832614453922915e-07, "loss": 0.78986299, "num_input_tokens_seen": 280089855, "step": 12986, "time_per_iteration": 2.6664822101593018 }, { "auxiliary_loss_clip": 0.01100426, "auxiliary_loss_mlp": 0.01035599, "balance_loss_clip": 1.037323, "balance_loss_mlp": 1.02314782, "epoch": 0.7808206823989178, "flos": 32374654133760.0, "grad_norm": 2.6632985579320128, "language_loss": 0.73982435, "learning_rate": 4.830076132284859e-07, "loss": 0.76118457, "num_input_tokens_seen": 280109960, "step": 12987, "time_per_iteration": 2.6844065189361572 }, { "auxiliary_loss_clip": 0.01022794, "auxiliary_loss_mlp": 0.00999717, "balance_loss_clip": 1.01035285, "balance_loss_mlp": 0.99873984, "epoch": 0.7808808056515858, "flos": 55050235061760.0, "grad_norm": 0.7788144710273639, "language_loss": 0.55080384, "learning_rate": 4.82753838589184e-07, "loss": 0.57102895, "num_input_tokens_seen": 280169805, "step": 12988, "time_per_iteration": 3.1616508960723877 }, { "auxiliary_loss_clip": 0.0107797, "auxiliary_loss_mlp": 0.01031828, "balance_loss_clip": 1.03549254, "balance_loss_mlp": 1.01985967, "epoch": 0.7809409289042537, "flos": 12859468277760.0, "grad_norm": 3.638882308233123, "language_loss": 0.81044191, "learning_rate": 4.82500121484009e-07, "loss": 0.83153987, "num_input_tokens_seen": 280184630, "step": 12989, "time_per_iteration": 2.660554885864258 }, { "auxiliary_loss_clip": 0.01077669, "auxiliary_loss_mlp": 0.01031778, "balance_loss_clip": 1.03635395, "balance_loss_mlp": 1.01853991, "epoch": 0.7810010521569217, "flos": 21687244254720.0, "grad_norm": 1.7099876560000518, "language_loss": 0.70650768, "learning_rate": 4.822464619225806e-07, "loss": 0.72760212, "num_input_tokens_seen": 280203880, "step": 12990, "time_per_iteration": 2.7570815086364746 }, { "auxiliary_loss_clip": 0.01087538, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.03705347, "balance_loss_mlp": 1.02005148, "epoch": 0.7810611754095896, "flos": 16757068129920.0, "grad_norm": 1.9898673429166094, "language_loss": 0.77492607, "learning_rate": 4.819928599145184e-07, "loss": 0.79614317, "num_input_tokens_seen": 280220460, "step": 12991, "time_per_iteration": 2.655853748321533 }, { "auxiliary_loss_clip": 0.01071528, "auxiliary_loss_mlp": 0.0103742, "balance_loss_clip": 1.03491211, "balance_loss_mlp": 1.02443242, "epoch": 0.7811212986622577, "flos": 43507464658560.0, "grad_norm": 1.7999740041710885, "language_loss": 0.6594398, "learning_rate": 4.817393154694398e-07, "loss": 0.68052924, "num_input_tokens_seen": 280242680, "step": 12992, "time_per_iteration": 2.885798931121826 }, { "auxiliary_loss_clip": 0.01114039, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.04082417, "balance_loss_mlp": 1.01861548, "epoch": 0.7811814219149256, "flos": 21757700782080.0, "grad_norm": 1.7673036757148999, "language_loss": 0.61809367, "learning_rate": 4.814858285969578e-07, "loss": 0.63954139, "num_input_tokens_seen": 280260655, "step": 12993, "time_per_iteration": 2.5982654094696045 }, { "auxiliary_loss_clip": 0.01085768, "auxiliary_loss_mlp": 0.01032115, "balance_loss_clip": 1.03539443, "balance_loss_mlp": 1.01902032, "epoch": 0.7812415451675936, "flos": 24061514267520.0, "grad_norm": 1.4534277443177828, "language_loss": 0.68547273, "learning_rate": 4.812323993066862e-07, "loss": 0.70665157, "num_input_tokens_seen": 280281185, "step": 12994, "time_per_iteration": 2.7115039825439453 }, { "auxiliary_loss_clip": 0.01109576, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.03841043, "balance_loss_mlp": 1.01556516, "epoch": 0.7813016684202615, "flos": 18989706816000.0, "grad_norm": 1.8869179456101774, "language_loss": 0.68850774, "learning_rate": 4.809790276082335e-07, "loss": 0.70988011, "num_input_tokens_seen": 280298255, "step": 12995, "time_per_iteration": 2.6276211738586426 }, { "auxiliary_loss_clip": 0.01066626, "auxiliary_loss_mlp": 0.01029556, "balance_loss_clip": 1.03578615, "balance_loss_mlp": 1.01815367, "epoch": 0.7813617916729295, "flos": 25260786581760.0, "grad_norm": 1.6581465647867601, "language_loss": 0.74758989, "learning_rate": 4.807257135112088e-07, "loss": 0.76855165, "num_input_tokens_seen": 280319000, "step": 12996, "time_per_iteration": 2.7556345462799072 }, { "auxiliary_loss_clip": 0.01115417, "auxiliary_loss_mlp": 0.01034278, "balance_loss_clip": 1.04004622, "balance_loss_mlp": 1.02160597, "epoch": 0.7814219149255974, "flos": 17966037116160.0, "grad_norm": 2.7982414236779385, "language_loss": 0.68035823, "learning_rate": 4.804724570252167e-07, "loss": 0.70185518, "num_input_tokens_seen": 280336375, "step": 12997, "time_per_iteration": 2.633403778076172 }, { "auxiliary_loss_clip": 0.01115354, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.03941298, "balance_loss_mlp": 1.02176905, "epoch": 0.7814820381782654, "flos": 25776176878080.0, "grad_norm": 1.750414047475771, "language_loss": 0.81803972, "learning_rate": 4.802192581598614e-07, "loss": 0.83954954, "num_input_tokens_seen": 280358760, "step": 12998, "time_per_iteration": 2.6855201721191406 }, { "auxiliary_loss_clip": 0.01083435, "auxiliary_loss_mlp": 0.01038171, "balance_loss_clip": 1.03414857, "balance_loss_mlp": 1.02490866, "epoch": 0.7815421614309335, "flos": 20519572930560.0, "grad_norm": 2.2116291760065523, "language_loss": 0.74893302, "learning_rate": 4.799661169247453e-07, "loss": 0.77014905, "num_input_tokens_seen": 280377085, "step": 12999, "time_per_iteration": 2.657938241958618 }, { "auxiliary_loss_clip": 0.01098221, "auxiliary_loss_mlp": 0.01042598, "balance_loss_clip": 1.0372951, "balance_loss_mlp": 1.02817392, "epoch": 0.7816022846836014, "flos": 21287666384640.0, "grad_norm": 3.4549180565502957, "language_loss": 0.84463656, "learning_rate": 4.797130333294652e-07, "loss": 0.8660447, "num_input_tokens_seen": 280395465, "step": 13000, "time_per_iteration": 2.652675151824951 }, { "auxiliary_loss_clip": 0.01102345, "auxiliary_loss_mlp": 0.01033163, "balance_loss_clip": 1.03934288, "balance_loss_mlp": 1.02033567, "epoch": 0.7816624079362694, "flos": 19208402772480.0, "grad_norm": 1.8050152528671168, "language_loss": 0.66003239, "learning_rate": 4.794600073836192e-07, "loss": 0.68138748, "num_input_tokens_seen": 280412775, "step": 13001, "time_per_iteration": 2.650995969772339 }, { "auxiliary_loss_clip": 0.01073705, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.03569674, "balance_loss_mlp": 1.02527714, "epoch": 0.7817225311889373, "flos": 26104687689600.0, "grad_norm": 1.5795311212034024, "language_loss": 0.67058933, "learning_rate": 4.792070390968027e-07, "loss": 0.69170189, "num_input_tokens_seen": 280432905, "step": 13002, "time_per_iteration": 2.811582326889038 }, { "auxiliary_loss_clip": 0.01105543, "auxiliary_loss_mlp": 0.01035107, "balance_loss_clip": 1.04086781, "balance_loss_mlp": 1.02175558, "epoch": 0.7817826544416053, "flos": 21250929749760.0, "grad_norm": 2.254654590765684, "language_loss": 0.73237813, "learning_rate": 4.78954128478607e-07, "loss": 0.75378466, "num_input_tokens_seen": 280450785, "step": 13003, "time_per_iteration": 2.6425418853759766 }, { "auxiliary_loss_clip": 0.01101875, "auxiliary_loss_mlp": 0.01035905, "balance_loss_clip": 1.03900814, "balance_loss_mlp": 1.02342987, "epoch": 0.7818427776942732, "flos": 19932181822080.0, "grad_norm": 2.055818402329747, "language_loss": 0.61984468, "learning_rate": 4.787012755386233e-07, "loss": 0.64122242, "num_input_tokens_seen": 280468400, "step": 13004, "time_per_iteration": 2.6202731132507324 }, { "auxiliary_loss_clip": 0.0110586, "auxiliary_loss_mlp": 0.01032863, "balance_loss_clip": 1.03744853, "balance_loss_mlp": 1.02140069, "epoch": 0.7819029009469413, "flos": 11363753018880.0, "grad_norm": 1.8629152700369227, "language_loss": 0.82870841, "learning_rate": 4.784484802864403e-07, "loss": 0.85009563, "num_input_tokens_seen": 280483930, "step": 13005, "time_per_iteration": 4.243497371673584 }, { "auxiliary_loss_clip": 0.01070901, "auxiliary_loss_mlp": 0.00771151, "balance_loss_clip": 1.03450751, "balance_loss_mlp": 1.00017846, "epoch": 0.7819630241996092, "flos": 24279276470400.0, "grad_norm": 1.8867656052793076, "language_loss": 0.72342831, "learning_rate": 4.781957427316432e-07, "loss": 0.74184883, "num_input_tokens_seen": 280503465, "step": 13006, "time_per_iteration": 4.330881357192993 }, { "auxiliary_loss_clip": 0.01101615, "auxiliary_loss_mlp": 0.00771026, "balance_loss_clip": 1.03858209, "balance_loss_mlp": 1.00022697, "epoch": 0.7820231474522772, "flos": 22708902792960.0, "grad_norm": 1.62503166301797, "language_loss": 0.72343135, "learning_rate": 4.779430628838157e-07, "loss": 0.74215776, "num_input_tokens_seen": 280523375, "step": 13007, "time_per_iteration": 4.214543581008911 }, { "auxiliary_loss_clip": 0.01111637, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.03696549, "balance_loss_mlp": 1.01505101, "epoch": 0.7820832707049451, "flos": 20047419630720.0, "grad_norm": 1.901361826456498, "language_loss": 0.68807894, "learning_rate": 4.776904407525397e-07, "loss": 0.70947969, "num_input_tokens_seen": 280542920, "step": 13008, "time_per_iteration": 2.6050710678100586 }, { "auxiliary_loss_clip": 0.01082934, "auxiliary_loss_mlp": 0.01028936, "balance_loss_clip": 1.03609729, "balance_loss_mlp": 1.01611555, "epoch": 0.7821433939576131, "flos": 27162795553920.0, "grad_norm": 2.2501417791775036, "language_loss": 0.69864273, "learning_rate": 4.774378763473954e-07, "loss": 0.71976143, "num_input_tokens_seen": 280561700, "step": 13009, "time_per_iteration": 2.7489216327667236 }, { "auxiliary_loss_clip": 0.01069744, "auxiliary_loss_mlp": 0.01029641, "balance_loss_clip": 1.03394186, "balance_loss_mlp": 1.01677287, "epoch": 0.782203517210281, "flos": 22602068766720.0, "grad_norm": 2.6181121023195386, "language_loss": 0.81756222, "learning_rate": 4.771853696779586e-07, "loss": 0.83855605, "num_input_tokens_seen": 280580605, "step": 13010, "time_per_iteration": 2.754182815551758 }, { "auxiliary_loss_clip": 0.01097326, "auxiliary_loss_mlp": 0.01033843, "balance_loss_clip": 1.03652465, "balance_loss_mlp": 1.02199399, "epoch": 0.782263640462949, "flos": 29059812535680.0, "grad_norm": 1.5058057514043965, "language_loss": 0.61957836, "learning_rate": 4.76932920753806e-07, "loss": 0.64089006, "num_input_tokens_seen": 280601495, "step": 13011, "time_per_iteration": 2.676269292831421 }, { "auxiliary_loss_clip": 0.01098762, "auxiliary_loss_mlp": 0.01028506, "balance_loss_clip": 1.03798711, "balance_loss_mlp": 1.01740146, "epoch": 0.782323763715617, "flos": 25299498464640.0, "grad_norm": 1.7834780447298506, "language_loss": 0.703578, "learning_rate": 4.7668052958450913e-07, "loss": 0.72485065, "num_input_tokens_seen": 280622760, "step": 13012, "time_per_iteration": 4.222137451171875 }, { "auxiliary_loss_clip": 0.01030861, "auxiliary_loss_mlp": 0.00999997, "balance_loss_clip": 1.00834417, "balance_loss_mlp": 0.99901354, "epoch": 0.782383886968285, "flos": 65194388668800.0, "grad_norm": 0.7024977302558347, "language_loss": 0.55065835, "learning_rate": 4.764281961796395e-07, "loss": 0.57096696, "num_input_tokens_seen": 280687115, "step": 13013, "time_per_iteration": 3.2604727745056152 }, { "auxiliary_loss_clip": 0.01088673, "auxiliary_loss_mlp": 0.01038861, "balance_loss_clip": 1.0394305, "balance_loss_mlp": 1.02612925, "epoch": 0.782444010220953, "flos": 18405440190720.0, "grad_norm": 1.739605099015053, "language_loss": 0.65488654, "learning_rate": 4.76175920548765e-07, "loss": 0.67616189, "num_input_tokens_seen": 280705000, "step": 13014, "time_per_iteration": 2.702570915222168 }, { "auxiliary_loss_clip": 0.01005007, "auxiliary_loss_mlp": 0.01000676, "balance_loss_clip": 1.0074594, "balance_loss_mlp": 0.99947244, "epoch": 0.7825041334736209, "flos": 63955003841280.0, "grad_norm": 1.5199496836725135, "language_loss": 0.58456129, "learning_rate": 4.759237027014524e-07, "loss": 0.60461813, "num_input_tokens_seen": 280773525, "step": 13015, "time_per_iteration": 3.2708168029785156 }, { "auxiliary_loss_clip": 0.01082509, "auxiliary_loss_mlp": 0.0103455, "balance_loss_clip": 1.03745651, "balance_loss_mlp": 1.02287316, "epoch": 0.7825642567262889, "flos": 20339373375360.0, "grad_norm": 1.6009097708406814, "language_loss": 0.74550229, "learning_rate": 4.756715426472666e-07, "loss": 0.76667285, "num_input_tokens_seen": 280791915, "step": 13016, "time_per_iteration": 2.775660514831543 }, { "auxiliary_loss_clip": 0.01111525, "auxiliary_loss_mlp": 0.01032595, "balance_loss_clip": 1.03842187, "balance_loss_mlp": 1.01902854, "epoch": 0.7826243799789568, "flos": 20262955190400.0, "grad_norm": 1.7770751531413016, "language_loss": 0.75118351, "learning_rate": 4.7541944039576766e-07, "loss": 0.77262467, "num_input_tokens_seen": 280811460, "step": 13017, "time_per_iteration": 2.6645398139953613 }, { "auxiliary_loss_clip": 0.01085213, "auxiliary_loss_mlp": 0.01034128, "balance_loss_clip": 1.03540921, "balance_loss_mlp": 1.0211221, "epoch": 0.7826845032316249, "flos": 21132926593920.0, "grad_norm": 1.9823505334349008, "language_loss": 0.75479347, "learning_rate": 4.7516739595651636e-07, "loss": 0.77598691, "num_input_tokens_seen": 280825415, "step": 13018, "time_per_iteration": 2.6840744018554688 }, { "auxiliary_loss_clip": 0.01108451, "auxiliary_loss_mlp": 0.01029158, "balance_loss_clip": 1.03651655, "balance_loss_mlp": 1.0168916, "epoch": 0.7827446264842928, "flos": 22492253911680.0, "grad_norm": 1.4306758819867016, "language_loss": 0.77329135, "learning_rate": 4.749154093390708e-07, "loss": 0.79466748, "num_input_tokens_seen": 280845335, "step": 13019, "time_per_iteration": 2.6806087493896484 }, { "auxiliary_loss_clip": 0.01065952, "auxiliary_loss_mlp": 0.01028104, "balance_loss_clip": 1.03685999, "balance_loss_mlp": 1.01612306, "epoch": 0.7828047497369608, "flos": 28840649702400.0, "grad_norm": 1.4806512863046632, "language_loss": 0.67511666, "learning_rate": 4.746634805529852e-07, "loss": 0.6960572, "num_input_tokens_seen": 280867145, "step": 13020, "time_per_iteration": 2.9394872188568115 }, { "auxiliary_loss_clip": 0.01099304, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.03999841, "balance_loss_mlp": 1.01744223, "epoch": 0.7828648729896287, "flos": 23257689759360.0, "grad_norm": 3.1494596140171787, "language_loss": 0.62587798, "learning_rate": 4.7441160960781325e-07, "loss": 0.64717221, "num_input_tokens_seen": 280886185, "step": 13021, "time_per_iteration": 2.6747660636901855 }, { "auxiliary_loss_clip": 0.01107745, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.03731537, "balance_loss_mlp": 1.02393007, "epoch": 0.7829249962422967, "flos": 25265670831360.0, "grad_norm": 1.6912709426048431, "language_loss": 0.69153851, "learning_rate": 4.7415979651310636e-07, "loss": 0.71297264, "num_input_tokens_seen": 280907665, "step": 13022, "time_per_iteration": 2.698918342590332 }, { "auxiliary_loss_clip": 0.00980906, "auxiliary_loss_mlp": 0.0100189, "balance_loss_clip": 1.01163054, "balance_loss_mlp": 1.00038803, "epoch": 0.7829851194949646, "flos": 70722044645760.0, "grad_norm": 0.6377469168354571, "language_loss": 0.56205934, "learning_rate": 4.739080412784131e-07, "loss": 0.58188736, "num_input_tokens_seen": 280971405, "step": 13023, "time_per_iteration": 3.4054768085479736 }, { "auxiliary_loss_clip": 0.0107826, "auxiliary_loss_mlp": 0.01032398, "balance_loss_clip": 1.03205729, "balance_loss_mlp": 1.01958895, "epoch": 0.7830452427476327, "flos": 25660795415040.0, "grad_norm": 1.7775757007122028, "language_loss": 0.67073244, "learning_rate": 4.736563439132792e-07, "loss": 0.69183898, "num_input_tokens_seen": 280989615, "step": 13024, "time_per_iteration": 2.646439790725708 }, { "auxiliary_loss_clip": 0.01112317, "auxiliary_loss_mlp": 0.01028491, "balance_loss_clip": 1.03878796, "balance_loss_mlp": 1.01559806, "epoch": 0.7831053660003006, "flos": 22784315397120.0, "grad_norm": 1.5682779156650977, "language_loss": 0.77674961, "learning_rate": 4.734047044272498e-07, "loss": 0.79815769, "num_input_tokens_seen": 281009450, "step": 13025, "time_per_iteration": 2.632951021194458 }, { "auxiliary_loss_clip": 0.01084338, "auxiliary_loss_mlp": 0.01036499, "balance_loss_clip": 1.03556383, "balance_loss_mlp": 1.02404797, "epoch": 0.7831654892529686, "flos": 25812267068160.0, "grad_norm": 2.0934383648650194, "language_loss": 0.78239512, "learning_rate": 4.731531228298673e-07, "loss": 0.80360353, "num_input_tokens_seen": 281028120, "step": 13026, "time_per_iteration": 2.7387208938598633 }, { "auxiliary_loss_clip": 0.01097191, "auxiliary_loss_mlp": 0.01028382, "balance_loss_clip": 1.03798652, "balance_loss_mlp": 1.01656842, "epoch": 0.7832256125056366, "flos": 20771557816320.0, "grad_norm": 2.1298369773301, "language_loss": 0.75428832, "learning_rate": 4.729015991306715e-07, "loss": 0.77554405, "num_input_tokens_seen": 281042130, "step": 13027, "time_per_iteration": 2.62705135345459 }, { "auxiliary_loss_clip": 0.01102237, "auxiliary_loss_mlp": 0.01031953, "balance_loss_clip": 1.04018044, "balance_loss_mlp": 1.01980579, "epoch": 0.7832857357583045, "flos": 21506541909120.0, "grad_norm": 1.7296473073785772, "language_loss": 0.70366251, "learning_rate": 4.726501333391997e-07, "loss": 0.72500432, "num_input_tokens_seen": 281060945, "step": 13028, "time_per_iteration": 2.651749849319458 }, { "auxiliary_loss_clip": 0.01063459, "auxiliary_loss_mlp": 0.01038176, "balance_loss_clip": 1.03720903, "balance_loss_mlp": 1.02482486, "epoch": 0.7833458590109725, "flos": 18077791305600.0, "grad_norm": 2.132518402701666, "language_loss": 0.68704486, "learning_rate": 4.7239872546498774e-07, "loss": 0.70806122, "num_input_tokens_seen": 281079270, "step": 13029, "time_per_iteration": 2.733846664428711 }, { "auxiliary_loss_clip": 0.01085127, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.03914356, "balance_loss_mlp": 1.01970291, "epoch": 0.7834059822636404, "flos": 28288738252800.0, "grad_norm": 1.9111735577193074, "language_loss": 0.81041169, "learning_rate": 4.721473755175698e-07, "loss": 0.83159471, "num_input_tokens_seen": 281099500, "step": 13030, "time_per_iteration": 2.7992770671844482 }, { "auxiliary_loss_clip": 0.01104778, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.03917181, "balance_loss_mlp": 1.0206331, "epoch": 0.7834661055163085, "flos": 31686211088640.0, "grad_norm": 2.9451675534531847, "language_loss": 0.70219892, "learning_rate": 4.71896083506476e-07, "loss": 0.72357768, "num_input_tokens_seen": 281121250, "step": 13031, "time_per_iteration": 2.9042952060699463 }, { "auxiliary_loss_clip": 0.01072572, "auxiliary_loss_mlp": 0.01034559, "balance_loss_clip": 1.03533792, "balance_loss_mlp": 1.02237034, "epoch": 0.7835262287689764, "flos": 12933192942720.0, "grad_norm": 2.323625290086717, "language_loss": 0.790016, "learning_rate": 4.7164484944123574e-07, "loss": 0.81108725, "num_input_tokens_seen": 281138760, "step": 13032, "time_per_iteration": 2.750812292098999 }, { "auxiliary_loss_clip": 0.0110433, "auxiliary_loss_mlp": 0.01040225, "balance_loss_clip": 1.03909242, "balance_loss_mlp": 1.02739763, "epoch": 0.7835863520216444, "flos": 16143211676160.0, "grad_norm": 2.0684316430418463, "language_loss": 0.62812865, "learning_rate": 4.7139367333137726e-07, "loss": 0.64957428, "num_input_tokens_seen": 281157420, "step": 13033, "time_per_iteration": 2.6421468257904053 }, { "auxiliary_loss_clip": 0.01098998, "auxiliary_loss_mlp": 0.01034317, "balance_loss_clip": 1.03790116, "balance_loss_mlp": 1.02132297, "epoch": 0.7836464752743123, "flos": 11509909459200.0, "grad_norm": 1.58357633529001, "language_loss": 0.71756327, "learning_rate": 4.7114255518642255e-07, "loss": 0.73889643, "num_input_tokens_seen": 281174620, "step": 13034, "time_per_iteration": 2.7166271209716797 }, { "auxiliary_loss_clip": 0.01113235, "auxiliary_loss_mlp": 0.00771091, "balance_loss_clip": 1.03961957, "balance_loss_mlp": 1.00013566, "epoch": 0.7837065985269803, "flos": 18223696350720.0, "grad_norm": 1.6566949403371967, "language_loss": 0.72002685, "learning_rate": 4.7089149501589555e-07, "loss": 0.73887014, "num_input_tokens_seen": 281193865, "step": 13035, "time_per_iteration": 2.5778520107269287 }, { "auxiliary_loss_clip": 0.01112728, "auxiliary_loss_mlp": 0.01035529, "balance_loss_clip": 1.03951585, "balance_loss_mlp": 1.02208817, "epoch": 0.7837667217796482, "flos": 24754410599040.0, "grad_norm": 2.9879625935132372, "language_loss": 0.66463214, "learning_rate": 4.7064049282931664e-07, "loss": 0.68611467, "num_input_tokens_seen": 281212250, "step": 13036, "time_per_iteration": 2.6302857398986816 }, { "auxiliary_loss_clip": 0.01104467, "auxiliary_loss_mlp": 0.01039494, "balance_loss_clip": 1.03855228, "balance_loss_mlp": 1.02618408, "epoch": 0.7838268450323163, "flos": 22383121415040.0, "grad_norm": 2.0949975987912848, "language_loss": 0.73010111, "learning_rate": 4.703895486362031e-07, "loss": 0.75154078, "num_input_tokens_seen": 281230850, "step": 13037, "time_per_iteration": 2.6575746536254883 }, { "auxiliary_loss_clip": 0.01070616, "auxiliary_loss_mlp": 0.01035879, "balance_loss_clip": 1.03389454, "balance_loss_mlp": 1.02229476, "epoch": 0.7838869682849842, "flos": 19500284689920.0, "grad_norm": 5.499006833043144, "language_loss": 0.59598082, "learning_rate": 4.701386624460717e-07, "loss": 0.61704576, "num_input_tokens_seen": 281249810, "step": 13038, "time_per_iteration": 2.6556448936462402 }, { "auxiliary_loss_clip": 0.01089544, "auxiliary_loss_mlp": 0.01028906, "balance_loss_clip": 1.0388062, "balance_loss_mlp": 1.0172174, "epoch": 0.7839470915376522, "flos": 32892845690880.0, "grad_norm": 1.750335160170137, "language_loss": 0.68257546, "learning_rate": 4.698878342684349e-07, "loss": 0.70375991, "num_input_tokens_seen": 281273730, "step": 13039, "time_per_iteration": 2.760946273803711 }, { "auxiliary_loss_clip": 0.01072076, "auxiliary_loss_mlp": 0.01024882, "balance_loss_clip": 1.03432715, "balance_loss_mlp": 1.01383781, "epoch": 0.7840072147903202, "flos": 29676003373440.0, "grad_norm": 1.826043040750904, "language_loss": 0.69417781, "learning_rate": 4.6963706411280537e-07, "loss": 0.71514744, "num_input_tokens_seen": 281293670, "step": 13040, "time_per_iteration": 2.7545461654663086 }, { "auxiliary_loss_clip": 0.0106802, "auxiliary_loss_mlp": 0.01035678, "balance_loss_clip": 1.03712749, "balance_loss_mlp": 1.02223086, "epoch": 0.7840673380429881, "flos": 18186744234240.0, "grad_norm": 1.5142145779529246, "language_loss": 0.67758179, "learning_rate": 4.6938635198869116e-07, "loss": 0.69861877, "num_input_tokens_seen": 281313070, "step": 13041, "time_per_iteration": 2.7630157470703125 }, { "auxiliary_loss_clip": 0.01022608, "auxiliary_loss_mlp": 0.00751599, "balance_loss_clip": 1.0097084, "balance_loss_mlp": 0.99966377, "epoch": 0.7841274612956561, "flos": 66346006613760.0, "grad_norm": 0.6656190181226946, "language_loss": 0.57380033, "learning_rate": 4.691356979055998e-07, "loss": 0.59154236, "num_input_tokens_seen": 281374880, "step": 13042, "time_per_iteration": 3.1374757289886475 }, { "auxiliary_loss_clip": 0.01087388, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.03713918, "balance_loss_mlp": 1.02007461, "epoch": 0.784187584548324, "flos": 26648482665600.0, "grad_norm": 2.1244828686221267, "language_loss": 0.83795989, "learning_rate": 4.688851018730369e-07, "loss": 0.85916388, "num_input_tokens_seen": 281392620, "step": 13043, "time_per_iteration": 2.793748378753662 }, { "auxiliary_loss_clip": 0.01095712, "auxiliary_loss_mlp": 0.01027767, "balance_loss_clip": 1.03719783, "balance_loss_mlp": 1.0161922, "epoch": 0.7842477078009921, "flos": 25740158515200.0, "grad_norm": 1.3834924746992494, "language_loss": 0.88441205, "learning_rate": 4.6863456390050425e-07, "loss": 0.90564686, "num_input_tokens_seen": 281413140, "step": 13044, "time_per_iteration": 4.261160135269165 }, { "auxiliary_loss_clip": 0.01093506, "auxiliary_loss_mlp": 0.01034687, "balance_loss_clip": 1.03787422, "balance_loss_mlp": 1.02180016, "epoch": 0.78430783105366, "flos": 21980957765760.0, "grad_norm": 1.825374480580212, "language_loss": 0.78958154, "learning_rate": 4.6838408399750195e-07, "loss": 0.81086344, "num_input_tokens_seen": 281430860, "step": 13045, "time_per_iteration": 2.7708632946014404 }, { "auxiliary_loss_clip": 0.01084228, "auxiliary_loss_mlp": 0.01031577, "balance_loss_clip": 1.03655803, "balance_loss_mlp": 1.0191431, "epoch": 0.784367954306328, "flos": 23842279607040.0, "grad_norm": 1.484345043483713, "language_loss": 0.72495216, "learning_rate": 4.6813366217352925e-07, "loss": 0.7461102, "num_input_tokens_seen": 281451385, "step": 13046, "time_per_iteration": 4.295615196228027 }, { "auxiliary_loss_clip": 0.01070358, "auxiliary_loss_mlp": 0.01035911, "balance_loss_clip": 1.04044282, "balance_loss_mlp": 1.02267289, "epoch": 0.7844280775589959, "flos": 24826662806400.0, "grad_norm": 1.5168340119310013, "language_loss": 0.62780952, "learning_rate": 4.678832984380809e-07, "loss": 0.6488722, "num_input_tokens_seen": 281472255, "step": 13047, "time_per_iteration": 4.33956503868103 }, { "auxiliary_loss_clip": 0.01100709, "auxiliary_loss_mlp": 0.01027981, "balance_loss_clip": 1.03916669, "balance_loss_mlp": 1.01601255, "epoch": 0.7844882008116639, "flos": 22455660931200.0, "grad_norm": 1.6255681359432697, "language_loss": 0.73295152, "learning_rate": 4.676329928006515e-07, "loss": 0.75423837, "num_input_tokens_seen": 281492860, "step": 13048, "time_per_iteration": 2.764153003692627 }, { "auxiliary_loss_clip": 0.01087112, "auxiliary_loss_mlp": 0.0103201, "balance_loss_clip": 1.03815794, "balance_loss_mlp": 1.01965356, "epoch": 0.7845483240643318, "flos": 26104041244800.0, "grad_norm": 3.259574846755966, "language_loss": 0.74822855, "learning_rate": 4.6738274527073243e-07, "loss": 0.76941979, "num_input_tokens_seen": 281511815, "step": 13049, "time_per_iteration": 2.702545642852783 }, { "auxiliary_loss_clip": 0.01113727, "auxiliary_loss_mlp": 0.01032718, "balance_loss_clip": 1.03731608, "balance_loss_mlp": 1.01894963, "epoch": 0.7846084473169999, "flos": 19354307817600.0, "grad_norm": 1.71411914117224, "language_loss": 0.72622865, "learning_rate": 4.6713255585781454e-07, "loss": 0.74769306, "num_input_tokens_seen": 281530090, "step": 13050, "time_per_iteration": 2.6567511558532715 }, { "auxiliary_loss_clip": 0.01098536, "auxiliary_loss_mlp": 0.01034296, "balance_loss_clip": 1.03764224, "balance_loss_mlp": 1.02170706, "epoch": 0.7846685705696678, "flos": 23325811902720.0, "grad_norm": 1.9970425884506249, "language_loss": 0.73258287, "learning_rate": 4.668824245713825e-07, "loss": 0.75391114, "num_input_tokens_seen": 281547075, "step": 13051, "time_per_iteration": 4.220673322677612 }, { "auxiliary_loss_clip": 0.01112899, "auxiliary_loss_mlp": 0.01034321, "balance_loss_clip": 1.03919625, "balance_loss_mlp": 1.02135718, "epoch": 0.7847286938223358, "flos": 35809545962880.0, "grad_norm": 2.6887410249812578, "language_loss": 0.72721338, "learning_rate": 4.666323514209227e-07, "loss": 0.7486856, "num_input_tokens_seen": 281568080, "step": 13052, "time_per_iteration": 2.7622361183166504 }, { "auxiliary_loss_clip": 0.0108619, "auxiliary_loss_mlp": 0.01035577, "balance_loss_clip": 1.03937328, "balance_loss_mlp": 1.02357841, "epoch": 0.7847888170750038, "flos": 18478159274880.0, "grad_norm": 1.82904296097524, "language_loss": 0.69018829, "learning_rate": 4.663823364159183e-07, "loss": 0.71140599, "num_input_tokens_seen": 281586925, "step": 13053, "time_per_iteration": 2.7101058959960938 }, { "auxiliary_loss_clip": 0.0109323, "auxiliary_loss_mlp": 0.01031564, "balance_loss_clip": 1.03785491, "balance_loss_mlp": 1.01989341, "epoch": 0.7848489403276717, "flos": 25119155255040.0, "grad_norm": 2.155883968401707, "language_loss": 0.69833845, "learning_rate": 4.6613237956584893e-07, "loss": 0.71958637, "num_input_tokens_seen": 281603915, "step": 13054, "time_per_iteration": 2.6558749675750732 }, { "auxiliary_loss_clip": 0.01102359, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.03816795, "balance_loss_mlp": 1.02254295, "epoch": 0.7849090635803397, "flos": 26502433966080.0, "grad_norm": 1.6743772106587247, "language_loss": 0.76095474, "learning_rate": 4.658824808801938e-07, "loss": 0.78233331, "num_input_tokens_seen": 281624220, "step": 13055, "time_per_iteration": 2.729825019836426 }, { "auxiliary_loss_clip": 0.01115191, "auxiliary_loss_mlp": 0.01034507, "balance_loss_clip": 1.03995335, "balance_loss_mlp": 1.02139974, "epoch": 0.7849691868330076, "flos": 20959658363520.0, "grad_norm": 1.870278317520838, "language_loss": 0.7499572, "learning_rate": 4.656326403684283e-07, "loss": 0.77145422, "num_input_tokens_seen": 281642325, "step": 13056, "time_per_iteration": 2.6321020126342773 }, { "auxiliary_loss_clip": 0.01048067, "auxiliary_loss_mlp": 0.0103263, "balance_loss_clip": 1.03739357, "balance_loss_mlp": 1.01989865, "epoch": 0.7850293100856757, "flos": 26067484177920.0, "grad_norm": 1.7420143195586486, "language_loss": 0.70014071, "learning_rate": 4.6538285804002744e-07, "loss": 0.72094762, "num_input_tokens_seen": 281663065, "step": 13057, "time_per_iteration": 2.8007147312164307 }, { "auxiliary_loss_clip": 0.01064676, "auxiliary_loss_mlp": 0.01033287, "balance_loss_clip": 1.03794479, "balance_loss_mlp": 1.02130675, "epoch": 0.7850894333383436, "flos": 22491894775680.0, "grad_norm": 1.791422043134008, "language_loss": 0.76534569, "learning_rate": 4.6513313390446175e-07, "loss": 0.78632534, "num_input_tokens_seen": 281681005, "step": 13058, "time_per_iteration": 2.7110915184020996 }, { "auxiliary_loss_clip": 0.01101284, "auxiliary_loss_mlp": 0.01036049, "balance_loss_clip": 1.03946376, "balance_loss_mlp": 1.0238781, "epoch": 0.7851495565910116, "flos": 20558643949440.0, "grad_norm": 1.5851127868658192, "language_loss": 0.70834202, "learning_rate": 4.6488346797120146e-07, "loss": 0.72971535, "num_input_tokens_seen": 281697965, "step": 13059, "time_per_iteration": 2.7031941413879395 }, { "auxiliary_loss_clip": 0.01081291, "auxiliary_loss_mlp": 0.01038886, "balance_loss_clip": 1.03579831, "balance_loss_mlp": 1.02460492, "epoch": 0.7852096798436795, "flos": 15924838942080.0, "grad_norm": 2.081733102958074, "language_loss": 0.76698899, "learning_rate": 4.646338602497144e-07, "loss": 0.78819072, "num_input_tokens_seen": 281716035, "step": 13060, "time_per_iteration": 2.7939200401306152 }, { "auxiliary_loss_clip": 0.01083148, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.03790545, "balance_loss_mlp": 1.01883268, "epoch": 0.7852698030963475, "flos": 19062282245760.0, "grad_norm": 2.323604844819863, "language_loss": 0.77162534, "learning_rate": 4.643843107494654e-07, "loss": 0.79277396, "num_input_tokens_seen": 281732815, "step": 13061, "time_per_iteration": 2.697397232055664 }, { "auxiliary_loss_clip": 0.01074028, "auxiliary_loss_mlp": 0.01034387, "balance_loss_clip": 1.03479552, "balance_loss_mlp": 1.02089262, "epoch": 0.7853299263490154, "flos": 24644380262400.0, "grad_norm": 1.8894100648574905, "language_loss": 0.74005646, "learning_rate": 4.641348194799164e-07, "loss": 0.76114058, "num_input_tokens_seen": 281751980, "step": 13062, "time_per_iteration": 2.9962854385375977 }, { "auxiliary_loss_clip": 0.01097852, "auxiliary_loss_mlp": 0.01032338, "balance_loss_clip": 1.03713512, "balance_loss_mlp": 1.02026176, "epoch": 0.7853900496016835, "flos": 22017981709440.0, "grad_norm": 1.6526268980906231, "language_loss": 0.68907607, "learning_rate": 4.638853864505297e-07, "loss": 0.71037793, "num_input_tokens_seen": 281772670, "step": 13063, "time_per_iteration": 2.7347474098205566 }, { "auxiliary_loss_clip": 0.01099713, "auxiliary_loss_mlp": 0.01036078, "balance_loss_clip": 1.04048038, "balance_loss_mlp": 1.02360916, "epoch": 0.7854501728543514, "flos": 30227412032640.0, "grad_norm": 4.546851509745459, "language_loss": 0.72635663, "learning_rate": 4.636360116707625e-07, "loss": 0.74771458, "num_input_tokens_seen": 281792930, "step": 13064, "time_per_iteration": 2.7636148929595947 }, { "auxiliary_loss_clip": 0.01082833, "auxiliary_loss_mlp": 0.01033112, "balance_loss_clip": 1.03790045, "balance_loss_mlp": 1.02079129, "epoch": 0.7855102961070194, "flos": 18843694030080.0, "grad_norm": 14.965350757481792, "language_loss": 0.67957228, "learning_rate": 4.633866951500718e-07, "loss": 0.70073175, "num_input_tokens_seen": 281811805, "step": 13065, "time_per_iteration": 2.7619290351867676 }, { "auxiliary_loss_clip": 0.01097669, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.04063308, "balance_loss_mlp": 1.02562392, "epoch": 0.7855704193596874, "flos": 22309971367680.0, "grad_norm": 1.6867324299047715, "language_loss": 0.75999427, "learning_rate": 4.6313743689791196e-07, "loss": 0.78134984, "num_input_tokens_seen": 281831885, "step": 13066, "time_per_iteration": 2.647052764892578 }, { "auxiliary_loss_clip": 0.0103061, "auxiliary_loss_mlp": 0.01006066, "balance_loss_clip": 1.00811362, "balance_loss_mlp": 1.00509405, "epoch": 0.7856305426123553, "flos": 60004434407040.0, "grad_norm": 0.7063334807152991, "language_loss": 0.5335499, "learning_rate": 4.628882369237346e-07, "loss": 0.55391669, "num_input_tokens_seen": 281900310, "step": 13067, "time_per_iteration": 3.2783384323120117 }, { "auxiliary_loss_clip": 0.01065395, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.03609753, "balance_loss_mlp": 1.01884413, "epoch": 0.7856906658650233, "flos": 21868593045120.0, "grad_norm": 1.5153182614776801, "language_loss": 0.67582923, "learning_rate": 4.62639095236989e-07, "loss": 0.69680464, "num_input_tokens_seen": 281918870, "step": 13068, "time_per_iteration": 2.818237543106079 }, { "auxiliary_loss_clip": 0.01076742, "auxiliary_loss_mlp": 0.01030057, "balance_loss_clip": 1.03852606, "balance_loss_mlp": 1.01839852, "epoch": 0.7857507891176913, "flos": 23622937205760.0, "grad_norm": 2.4110222950654325, "language_loss": 0.68040943, "learning_rate": 4.6239001184712267e-07, "loss": 0.70147741, "num_input_tokens_seen": 281936905, "step": 13069, "time_per_iteration": 2.7654619216918945 }, { "auxiliary_loss_clip": 0.01103004, "auxiliary_loss_mlp": 0.01035827, "balance_loss_clip": 1.04032803, "balance_loss_mlp": 1.02331567, "epoch": 0.7858109123703593, "flos": 25520061928320.0, "grad_norm": 1.6503036246986864, "language_loss": 0.76820791, "learning_rate": 4.6214098676358195e-07, "loss": 0.7895962, "num_input_tokens_seen": 281955625, "step": 13070, "time_per_iteration": 2.7123591899871826 }, { "auxiliary_loss_clip": 0.0105121, "auxiliary_loss_mlp": 0.0105136, "balance_loss_clip": 1.030967, "balance_loss_mlp": 1.03746009, "epoch": 0.7858710356230272, "flos": 17457398576640.0, "grad_norm": 1.7605883689591728, "language_loss": 0.65229589, "learning_rate": 4.618920199958083e-07, "loss": 0.6733216, "num_input_tokens_seen": 281973285, "step": 13071, "time_per_iteration": 2.727679491043091 }, { "auxiliary_loss_clip": 0.01063123, "auxiliary_loss_mlp": 0.0103513, "balance_loss_clip": 1.03286123, "balance_loss_mlp": 1.02270818, "epoch": 0.7859311588756952, "flos": 24679680353280.0, "grad_norm": 1.7243596413538878, "language_loss": 0.73917699, "learning_rate": 4.616431115532442e-07, "loss": 0.76015961, "num_input_tokens_seen": 281991410, "step": 13072, "time_per_iteration": 2.8985819816589355 }, { "auxiliary_loss_clip": 0.01097172, "auxiliary_loss_mlp": 0.01032829, "balance_loss_clip": 1.04014218, "balance_loss_mlp": 1.0194838, "epoch": 0.7859912821283631, "flos": 21799142098560.0, "grad_norm": 4.385793601952052, "language_loss": 0.71439523, "learning_rate": 4.613942614453268e-07, "loss": 0.73569524, "num_input_tokens_seen": 282010845, "step": 13073, "time_per_iteration": 2.670741558074951 }, { "auxiliary_loss_clip": 0.01085075, "auxiliary_loss_mlp": 0.01035125, "balance_loss_clip": 1.0389545, "balance_loss_mlp": 1.0218693, "epoch": 0.7860514053810311, "flos": 20847293642880.0, "grad_norm": 1.6142243935129328, "language_loss": 0.76601768, "learning_rate": 4.611454696814938e-07, "loss": 0.78721976, "num_input_tokens_seen": 282029635, "step": 13074, "time_per_iteration": 2.715064287185669 }, { "auxiliary_loss_clip": 0.01067309, "auxiliary_loss_mlp": 0.01034423, "balance_loss_clip": 1.03506911, "balance_loss_mlp": 1.0224185, "epoch": 0.786111528633699, "flos": 24315689882880.0, "grad_norm": 1.7966754252742998, "language_loss": 0.75166345, "learning_rate": 4.608967362711782e-07, "loss": 0.77268076, "num_input_tokens_seen": 282050285, "step": 13075, "time_per_iteration": 2.8381521701812744 }, { "auxiliary_loss_clip": 0.01083185, "auxiliary_loss_mlp": 0.01026692, "balance_loss_clip": 1.04080176, "balance_loss_mlp": 1.01497984, "epoch": 0.7861716518863671, "flos": 24353180703360.0, "grad_norm": 1.743827758665396, "language_loss": 0.69089484, "learning_rate": 4.6064806122381283e-07, "loss": 0.71199363, "num_input_tokens_seen": 282071040, "step": 13076, "time_per_iteration": 2.812002658843994 }, { "auxiliary_loss_clip": 0.01095604, "auxiliary_loss_mlp": 0.01028537, "balance_loss_clip": 1.03609765, "balance_loss_mlp": 1.01606214, "epoch": 0.786231775139035, "flos": 14022399006720.0, "grad_norm": 2.296864016069315, "language_loss": 0.80343485, "learning_rate": 4.603994445488282e-07, "loss": 0.82467622, "num_input_tokens_seen": 282086610, "step": 13077, "time_per_iteration": 2.690382480621338 }, { "auxiliary_loss_clip": 0.0110006, "auxiliary_loss_mlp": 0.0103229, "balance_loss_clip": 1.039482, "balance_loss_mlp": 1.01980269, "epoch": 0.786291898391703, "flos": 33724248865920.0, "grad_norm": 1.6714014639715435, "language_loss": 0.70845038, "learning_rate": 4.6015088625564956e-07, "loss": 0.72977388, "num_input_tokens_seen": 282107440, "step": 13078, "time_per_iteration": 3.024754524230957 }, { "auxiliary_loss_clip": 0.01096328, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.03739369, "balance_loss_mlp": 1.02363431, "epoch": 0.786352021644371, "flos": 25811476968960.0, "grad_norm": 1.523123466356383, "language_loss": 0.81217003, "learning_rate": 4.599023863537039e-07, "loss": 0.83348954, "num_input_tokens_seen": 282127290, "step": 13079, "time_per_iteration": 2.6527066230773926 }, { "auxiliary_loss_clip": 0.01078236, "auxiliary_loss_mlp": 0.01032627, "balance_loss_clip": 1.03953731, "balance_loss_mlp": 1.0202589, "epoch": 0.7864121448970389, "flos": 28910818920960.0, "grad_norm": 1.8147971205749318, "language_loss": 0.68534672, "learning_rate": 4.596539448524146e-07, "loss": 0.70645535, "num_input_tokens_seen": 282147505, "step": 13080, "time_per_iteration": 2.7910823822021484 }, { "auxiliary_loss_clip": 0.01099002, "auxiliary_loss_mlp": 0.01034583, "balance_loss_clip": 1.03815937, "balance_loss_mlp": 1.02227473, "epoch": 0.7864722681497069, "flos": 19208833735680.0, "grad_norm": 1.6728405689924877, "language_loss": 0.69698668, "learning_rate": 4.594055617612016e-07, "loss": 0.71832252, "num_input_tokens_seen": 282166450, "step": 13081, "time_per_iteration": 2.676067590713501 }, { "auxiliary_loss_clip": 0.01086253, "auxiliary_loss_mlp": 0.01035065, "balance_loss_clip": 1.03589058, "balance_loss_mlp": 1.0229888, "epoch": 0.7865323914023749, "flos": 21871573873920.0, "grad_norm": 1.8288911392242622, "language_loss": 0.68142998, "learning_rate": 4.591572370894838e-07, "loss": 0.70264316, "num_input_tokens_seen": 282186465, "step": 13082, "time_per_iteration": 2.671044111251831 }, { "auxiliary_loss_clip": 0.01081636, "auxiliary_loss_mlp": 0.01036406, "balance_loss_clip": 1.03695893, "balance_loss_mlp": 1.02418661, "epoch": 0.7865925146550429, "flos": 25520313323520.0, "grad_norm": 1.7965603666617915, "language_loss": 0.66121304, "learning_rate": 4.589089708466789e-07, "loss": 0.68239349, "num_input_tokens_seen": 282207180, "step": 13083, "time_per_iteration": 2.77585506439209 }, { "auxiliary_loss_clip": 0.01089696, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.03773546, "balance_loss_mlp": 1.01740384, "epoch": 0.7866526379077108, "flos": 19097366855040.0, "grad_norm": 2.0746759351122614, "language_loss": 0.74140465, "learning_rate": 4.5866076304220015e-07, "loss": 0.76261097, "num_input_tokens_seen": 282225865, "step": 13084, "time_per_iteration": 5.905508518218994 }, { "auxiliary_loss_clip": 0.01083182, "auxiliary_loss_mlp": 0.01037679, "balance_loss_clip": 1.03682792, "balance_loss_mlp": 1.02519202, "epoch": 0.7867127611603788, "flos": 16173771171840.0, "grad_norm": 3.4926036147980635, "language_loss": 0.70331782, "learning_rate": 4.584126136854591e-07, "loss": 0.72452641, "num_input_tokens_seen": 282242895, "step": 13085, "time_per_iteration": 2.689375162124634 }, { "auxiliary_loss_clip": 0.01086151, "auxiliary_loss_mlp": 0.01030206, "balance_loss_clip": 1.03600478, "balance_loss_mlp": 1.01758742, "epoch": 0.7867728844130467, "flos": 20773640805120.0, "grad_norm": 2.163841211360238, "language_loss": 0.7244603, "learning_rate": 4.5816452278586617e-07, "loss": 0.74562383, "num_input_tokens_seen": 282260425, "step": 13086, "time_per_iteration": 4.172788381576538 }, { "auxiliary_loss_clip": 0.01108157, "auxiliary_loss_mlp": 0.01027186, "balance_loss_clip": 1.03651989, "balance_loss_mlp": 1.01503301, "epoch": 0.7868330076657147, "flos": 21760106993280.0, "grad_norm": 1.9419848626775902, "language_loss": 0.74776971, "learning_rate": 4.5791649035282965e-07, "loss": 0.7691232, "num_input_tokens_seen": 282279335, "step": 13087, "time_per_iteration": 2.695462465286255 }, { "auxiliary_loss_clip": 0.01085975, "auxiliary_loss_mlp": 0.01031976, "balance_loss_clip": 1.03558397, "balance_loss_mlp": 1.02015603, "epoch": 0.7868931309183826, "flos": 25700692446720.0, "grad_norm": 1.58603589617711, "language_loss": 0.71365935, "learning_rate": 4.5766851639575456e-07, "loss": 0.73483884, "num_input_tokens_seen": 282299905, "step": 13088, "time_per_iteration": 2.781475782394409 }, { "auxiliary_loss_clip": 0.01029395, "auxiliary_loss_mlp": 0.01003015, "balance_loss_clip": 1.0068965, "balance_loss_mlp": 1.0020256, "epoch": 0.7869532541710507, "flos": 64644883430400.0, "grad_norm": 1.2260501594651212, "language_loss": 0.55467439, "learning_rate": 4.574206009240431e-07, "loss": 0.5749985, "num_input_tokens_seen": 282367620, "step": 13089, "time_per_iteration": 3.24120831489563 }, { "auxiliary_loss_clip": 0.01017655, "auxiliary_loss_mlp": 0.01001728, "balance_loss_clip": 1.00651848, "balance_loss_mlp": 1.0007323, "epoch": 0.7870133774237186, "flos": 67453600440960.0, "grad_norm": 0.7641579994840295, "language_loss": 0.49973857, "learning_rate": 4.571727439470976e-07, "loss": 0.51993239, "num_input_tokens_seen": 282435695, "step": 13090, "time_per_iteration": 4.754423379898071 }, { "auxiliary_loss_clip": 0.01099139, "auxiliary_loss_mlp": 0.01031069, "balance_loss_clip": 1.0383184, "balance_loss_mlp": 1.01955974, "epoch": 0.7870735006763866, "flos": 26068310190720.0, "grad_norm": 1.460212524446196, "language_loss": 0.8408305, "learning_rate": 4.5692494547431583e-07, "loss": 0.86213255, "num_input_tokens_seen": 282456025, "step": 13091, "time_per_iteration": 2.6467459201812744 }, { "auxiliary_loss_clip": 0.01019902, "auxiliary_loss_mlp": 0.01003454, "balance_loss_clip": 1.00713682, "balance_loss_mlp": 1.00247598, "epoch": 0.7871336239290546, "flos": 70289572896000.0, "grad_norm": 0.7147506558128559, "language_loss": 0.64014363, "learning_rate": 4.566772055150947e-07, "loss": 0.6603772, "num_input_tokens_seen": 282520995, "step": 13092, "time_per_iteration": 3.2051150798797607 }, { "auxiliary_loss_clip": 0.01088327, "auxiliary_loss_mlp": 0.01035505, "balance_loss_clip": 1.03942823, "balance_loss_mlp": 1.0227139, "epoch": 0.7871937471817225, "flos": 15778574760960.0, "grad_norm": 2.3884379503568076, "language_loss": 0.79189074, "learning_rate": 4.564295240788285e-07, "loss": 0.81312907, "num_input_tokens_seen": 282539355, "step": 13093, "time_per_iteration": 2.7134079933166504 }, { "auxiliary_loss_clip": 0.01080576, "auxiliary_loss_mlp": 0.01028467, "balance_loss_clip": 1.03772855, "balance_loss_mlp": 1.01671863, "epoch": 0.7872538704343905, "flos": 20485242506880.0, "grad_norm": 1.8523965571373735, "language_loss": 0.75549555, "learning_rate": 4.561819011749106e-07, "loss": 0.77658594, "num_input_tokens_seen": 282555735, "step": 13094, "time_per_iteration": 2.7055883407592773 }, { "auxiliary_loss_clip": 0.01061535, "auxiliary_loss_mlp": 0.01044464, "balance_loss_clip": 1.03247035, "balance_loss_mlp": 1.030725, "epoch": 0.7873139936870585, "flos": 25082670015360.0, "grad_norm": 1.6047845480222185, "language_loss": 0.79805398, "learning_rate": 4.5593433681272884e-07, "loss": 0.81911397, "num_input_tokens_seen": 282574550, "step": 13095, "time_per_iteration": 2.819106340408325 }, { "auxiliary_loss_clip": 0.01098697, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.03697014, "balance_loss_mlp": 1.02055073, "epoch": 0.7873741169397265, "flos": 30883176679680.0, "grad_norm": 1.6143252232165546, "language_loss": 0.67820108, "learning_rate": 4.556868310016715e-07, "loss": 0.69951594, "num_input_tokens_seen": 282596520, "step": 13096, "time_per_iteration": 2.6944971084594727 }, { "auxiliary_loss_clip": 0.01082196, "auxiliary_loss_mlp": 0.01027058, "balance_loss_clip": 1.0342679, "balance_loss_mlp": 1.01628733, "epoch": 0.7874342401923944, "flos": 46791962242560.0, "grad_norm": 1.8327451146164324, "language_loss": 0.7056793, "learning_rate": 4.55439383751125e-07, "loss": 0.72677183, "num_input_tokens_seen": 282620560, "step": 13097, "time_per_iteration": 2.969263792037964 }, { "auxiliary_loss_clip": 0.01092033, "auxiliary_loss_mlp": 0.01034929, "balance_loss_clip": 1.04004323, "balance_loss_mlp": 1.0221442, "epoch": 0.7874943634450624, "flos": 23584548545280.0, "grad_norm": 1.6158173512871257, "language_loss": 0.80720508, "learning_rate": 4.5519199507047126e-07, "loss": 0.82847476, "num_input_tokens_seen": 282639830, "step": 13098, "time_per_iteration": 2.7234272956848145 }, { "auxiliary_loss_clip": 0.01069091, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.03451467, "balance_loss_mlp": 1.02053809, "epoch": 0.7875544866977303, "flos": 20191169859840.0, "grad_norm": 2.07716673704352, "language_loss": 0.73976696, "learning_rate": 4.5494466496909177e-07, "loss": 0.76077634, "num_input_tokens_seen": 282660130, "step": 13099, "time_per_iteration": 2.7741127014160156 }, { "auxiliary_loss_clip": 0.01087499, "auxiliary_loss_mlp": 0.01024045, "balance_loss_clip": 1.03627956, "balance_loss_mlp": 1.01170659, "epoch": 0.7876146099503983, "flos": 22602571557120.0, "grad_norm": 1.5896108161315186, "language_loss": 0.78226274, "learning_rate": 4.5469739345636603e-07, "loss": 0.80337822, "num_input_tokens_seen": 282681125, "step": 13100, "time_per_iteration": 2.7259294986724854 }, { "auxiliary_loss_clip": 0.01101593, "auxiliary_loss_mlp": 0.00771735, "balance_loss_clip": 1.03714919, "balance_loss_mlp": 1.00031686, "epoch": 0.7876747332030662, "flos": 10705833555840.0, "grad_norm": 3.3947108231001772, "language_loss": 0.66015649, "learning_rate": 4.5445018054167007e-07, "loss": 0.67888987, "num_input_tokens_seen": 282696690, "step": 13101, "time_per_iteration": 2.6262006759643555 }, { "auxiliary_loss_clip": 0.01086168, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.03619587, "balance_loss_mlp": 1.01895058, "epoch": 0.7877348564557343, "flos": 38399315621760.0, "grad_norm": 1.4292814509281728, "language_loss": 0.77840889, "learning_rate": 4.5420302623437745e-07, "loss": 0.79958034, "num_input_tokens_seen": 282721210, "step": 13102, "time_per_iteration": 3.016707420349121 }, { "auxiliary_loss_clip": 0.01096566, "auxiliary_loss_mlp": 0.01040471, "balance_loss_clip": 1.0358392, "balance_loss_mlp": 1.02863932, "epoch": 0.7877949797084022, "flos": 18329524796160.0, "grad_norm": 1.7485518464366943, "language_loss": 0.82362533, "learning_rate": 4.5395593054386093e-07, "loss": 0.84499568, "num_input_tokens_seen": 282738505, "step": 13103, "time_per_iteration": 2.6577935218811035 }, { "auxiliary_loss_clip": 0.01101133, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.03859389, "balance_loss_mlp": 1.02039886, "epoch": 0.7878551029610702, "flos": 25806736373760.0, "grad_norm": 3.304808366824196, "language_loss": 0.8070327, "learning_rate": 4.537088934794913e-07, "loss": 0.8283819, "num_input_tokens_seen": 282756895, "step": 13104, "time_per_iteration": 2.680666923522949 }, { "auxiliary_loss_clip": 0.01111584, "auxiliary_loss_mlp": 0.01034583, "balance_loss_clip": 1.03829467, "balance_loss_mlp": 1.02250695, "epoch": 0.7879152262137382, "flos": 22342685679360.0, "grad_norm": 1.6276257181376157, "language_loss": 0.74308252, "learning_rate": 4.5346191505063515e-07, "loss": 0.76454425, "num_input_tokens_seen": 282774955, "step": 13105, "time_per_iteration": 2.5943186283111572 }, { "auxiliary_loss_clip": 0.0105328, "auxiliary_loss_mlp": 0.0104138, "balance_loss_clip": 1.03382134, "balance_loss_mlp": 1.02832067, "epoch": 0.7879753494664061, "flos": 24785329230720.0, "grad_norm": 1.561193248297936, "language_loss": 0.75636542, "learning_rate": 4.5321499526665776e-07, "loss": 0.77731198, "num_input_tokens_seen": 282793165, "step": 13106, "time_per_iteration": 2.8052754402160645 }, { "auxiliary_loss_clip": 0.01060642, "auxiliary_loss_mlp": 0.01033724, "balance_loss_clip": 1.0368247, "balance_loss_mlp": 1.02129078, "epoch": 0.7880354727190741, "flos": 16909078487040.0, "grad_norm": 2.2640209986182116, "language_loss": 0.73844689, "learning_rate": 4.5296813413692337e-07, "loss": 0.75939053, "num_input_tokens_seen": 282809820, "step": 13107, "time_per_iteration": 2.7168357372283936 }, { "auxiliary_loss_clip": 0.01109075, "auxiliary_loss_mlp": 0.01034958, "balance_loss_clip": 1.03867579, "balance_loss_mlp": 1.02291143, "epoch": 0.7880955959717421, "flos": 22230500526720.0, "grad_norm": 1.5353613262891537, "language_loss": 0.73295653, "learning_rate": 4.5272133167079165e-07, "loss": 0.7543968, "num_input_tokens_seen": 282828600, "step": 13108, "time_per_iteration": 2.6911845207214355 }, { "auxiliary_loss_clip": 0.01029486, "auxiliary_loss_mlp": 0.00999387, "balance_loss_clip": 1.00682902, "balance_loss_mlp": 0.99848729, "epoch": 0.7881557192244101, "flos": 69183200131200.0, "grad_norm": 0.890062717819184, "language_loss": 0.60359526, "learning_rate": 4.5247458787762216e-07, "loss": 0.62388396, "num_input_tokens_seen": 282882775, "step": 13109, "time_per_iteration": 3.113757610321045 }, { "auxiliary_loss_clip": 0.01067084, "auxiliary_loss_mlp": 0.010294, "balance_loss_clip": 1.03637147, "balance_loss_mlp": 1.01732993, "epoch": 0.788215842477078, "flos": 24935436167040.0, "grad_norm": 1.6561185443626747, "language_loss": 0.72235435, "learning_rate": 4.5222790276677126e-07, "loss": 0.74331915, "num_input_tokens_seen": 282902680, "step": 13110, "time_per_iteration": 2.7759180068969727 }, { "auxiliary_loss_clip": 0.01056492, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.03376198, "balance_loss_mlp": 1.01843548, "epoch": 0.788275965729746, "flos": 26106483369600.0, "grad_norm": 1.3819740231055346, "language_loss": 0.75173604, "learning_rate": 4.5198127634759455e-07, "loss": 0.77260238, "num_input_tokens_seen": 282923625, "step": 13111, "time_per_iteration": 2.840644121170044 }, { "auxiliary_loss_clip": 0.01094246, "auxiliary_loss_mlp": 0.01035923, "balance_loss_clip": 1.03667474, "balance_loss_mlp": 1.02317989, "epoch": 0.7883360889824139, "flos": 21214803646080.0, "grad_norm": 2.288432261799451, "language_loss": 0.61037534, "learning_rate": 4.5173470862944206e-07, "loss": 0.63167697, "num_input_tokens_seen": 282941955, "step": 13112, "time_per_iteration": 2.673748016357422 }, { "auxiliary_loss_clip": 0.01089796, "auxiliary_loss_mlp": 0.0103157, "balance_loss_clip": 1.03910899, "balance_loss_mlp": 1.01814699, "epoch": 0.7883962122350819, "flos": 21142551438720.0, "grad_norm": 1.825503520806994, "language_loss": 0.67753619, "learning_rate": 4.514881996216644e-07, "loss": 0.69874984, "num_input_tokens_seen": 282961280, "step": 13113, "time_per_iteration": 2.6813149452209473 }, { "auxiliary_loss_clip": 0.01069296, "auxiliary_loss_mlp": 0.01035093, "balance_loss_clip": 1.0344131, "balance_loss_mlp": 1.02270675, "epoch": 0.7884563354877498, "flos": 15302901928320.0, "grad_norm": 3.4397675156813867, "language_loss": 0.5793888, "learning_rate": 4.5124174933361e-07, "loss": 0.60043263, "num_input_tokens_seen": 282978210, "step": 13114, "time_per_iteration": 2.7150933742523193 }, { "auxiliary_loss_clip": 0.01062606, "auxiliary_loss_mlp": 0.01031989, "balance_loss_clip": 1.03754115, "balance_loss_mlp": 1.01891208, "epoch": 0.7885164587404179, "flos": 24388301226240.0, "grad_norm": 1.5845799743186602, "language_loss": 0.67243695, "learning_rate": 4.5099535777462306e-07, "loss": 0.69338286, "num_input_tokens_seen": 282998845, "step": 13115, "time_per_iteration": 2.80094575881958 }, { "auxiliary_loss_clip": 0.01083933, "auxiliary_loss_mlp": 0.01040208, "balance_loss_clip": 1.03556573, "balance_loss_mlp": 1.02654052, "epoch": 0.7885765819930858, "flos": 14385886686720.0, "grad_norm": 2.573676201829806, "language_loss": 0.88785017, "learning_rate": 4.50749024954048e-07, "loss": 0.90909165, "num_input_tokens_seen": 283015200, "step": 13116, "time_per_iteration": 2.8118736743927 }, { "auxiliary_loss_clip": 0.01093449, "auxiliary_loss_mlp": 0.01033859, "balance_loss_clip": 1.03728342, "balance_loss_mlp": 1.02034712, "epoch": 0.7886367052457538, "flos": 18259930195200.0, "grad_norm": 2.1380250897449384, "language_loss": 0.72576118, "learning_rate": 4.505027508812245e-07, "loss": 0.74703431, "num_input_tokens_seen": 283033680, "step": 13117, "time_per_iteration": 2.782005786895752 }, { "auxiliary_loss_clip": 0.01096232, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.03812051, "balance_loss_mlp": 1.01586211, "epoch": 0.7886968284984217, "flos": 15305092657920.0, "grad_norm": 1.6421996108060435, "language_loss": 0.79999858, "learning_rate": 4.502565355654926e-07, "loss": 0.82123202, "num_input_tokens_seen": 283050620, "step": 13118, "time_per_iteration": 2.678349256515503 }, { "auxiliary_loss_clip": 0.01097412, "auxiliary_loss_mlp": 0.01028112, "balance_loss_clip": 1.03808641, "balance_loss_mlp": 1.01605964, "epoch": 0.7887569517510897, "flos": 21215450090880.0, "grad_norm": 1.6890691063161838, "language_loss": 0.72958535, "learning_rate": 4.500103790161878e-07, "loss": 0.75084054, "num_input_tokens_seen": 283070215, "step": 13119, "time_per_iteration": 2.7472004890441895 }, { "auxiliary_loss_clip": 0.01095693, "auxiliary_loss_mlp": 0.01028074, "balance_loss_clip": 1.03482223, "balance_loss_mlp": 1.01517558, "epoch": 0.7888170750037578, "flos": 22711237176960.0, "grad_norm": 3.3903571989834584, "language_loss": 0.71983945, "learning_rate": 4.4976428124264454e-07, "loss": 0.74107713, "num_input_tokens_seen": 283091485, "step": 13120, "time_per_iteration": 2.82316517829895 }, { "auxiliary_loss_clip": 0.01081982, "auxiliary_loss_mlp": 0.007726, "balance_loss_clip": 1.03474998, "balance_loss_mlp": 1.00026715, "epoch": 0.7888771982564257, "flos": 36429148592640.0, "grad_norm": 1.5160777676600576, "language_loss": 0.79098976, "learning_rate": 4.4951824225419564e-07, "loss": 0.80953562, "num_input_tokens_seen": 283115040, "step": 13121, "time_per_iteration": 2.8498284816741943 }, { "auxiliary_loss_clip": 0.01095183, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.0355587, "balance_loss_mlp": 1.01765466, "epoch": 0.7889373215090937, "flos": 27309993488640.0, "grad_norm": 1.3811288834626105, "language_loss": 0.80475199, "learning_rate": 4.4927226206017057e-07, "loss": 0.82600486, "num_input_tokens_seen": 283136925, "step": 13122, "time_per_iteration": 2.667525053024292 }, { "auxiliary_loss_clip": 0.01081111, "auxiliary_loss_mlp": 0.01026345, "balance_loss_clip": 1.03613377, "balance_loss_mlp": 1.01491308, "epoch": 0.7889974447617616, "flos": 19829010983040.0, "grad_norm": 1.947347999480454, "language_loss": 0.78504455, "learning_rate": 4.4902634066989597e-07, "loss": 0.8061192, "num_input_tokens_seen": 283155725, "step": 13123, "time_per_iteration": 5.875938653945923 }, { "auxiliary_loss_clip": 0.0109205, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.04389477, "balance_loss_mlp": 1.02196002, "epoch": 0.7890575680144296, "flos": 17271201450240.0, "grad_norm": 1.9573332964647796, "language_loss": 0.67213017, "learning_rate": 4.487804780926985e-07, "loss": 0.69339627, "num_input_tokens_seen": 283173845, "step": 13124, "time_per_iteration": 4.206716775894165 }, { "auxiliary_loss_clip": 0.01087652, "auxiliary_loss_mlp": 0.01025366, "balance_loss_clip": 1.03578448, "balance_loss_mlp": 1.01191306, "epoch": 0.7891176912670975, "flos": 27600151553280.0, "grad_norm": 2.308329967659437, "language_loss": 0.72559512, "learning_rate": 4.4853467433790036e-07, "loss": 0.74672532, "num_input_tokens_seen": 283191985, "step": 13125, "time_per_iteration": 2.7699477672576904 }, { "auxiliary_loss_clip": 0.01092333, "auxiliary_loss_mlp": 0.01028843, "balance_loss_clip": 1.03605413, "balance_loss_mlp": 1.01586151, "epoch": 0.7891778145197655, "flos": 22711668140160.0, "grad_norm": 1.8181427406883512, "language_loss": 0.72330505, "learning_rate": 4.4828892941482267e-07, "loss": 0.74451685, "num_input_tokens_seen": 283210855, "step": 13126, "time_per_iteration": 2.799743413925171 }, { "auxiliary_loss_clip": 0.01091919, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.03676748, "balance_loss_mlp": 1.01820195, "epoch": 0.7892379377724335, "flos": 17310775259520.0, "grad_norm": 1.9171689494151543, "language_loss": 0.76746297, "learning_rate": 4.480432433327845e-07, "loss": 0.78869414, "num_input_tokens_seen": 283229665, "step": 13127, "time_per_iteration": 2.6769402027130127 }, { "auxiliary_loss_clip": 0.0109264, "auxiliary_loss_mlp": 0.01040923, "balance_loss_clip": 1.03622723, "balance_loss_mlp": 1.02709436, "epoch": 0.7892980610251015, "flos": 25775674087680.0, "grad_norm": 1.6866494650381205, "language_loss": 0.85712594, "learning_rate": 4.47797616101103e-07, "loss": 0.87846154, "num_input_tokens_seen": 283248615, "step": 13128, "time_per_iteration": 2.6580183506011963 }, { "auxiliary_loss_clip": 0.0109824, "auxiliary_loss_mlp": 0.01037637, "balance_loss_clip": 1.03702545, "balance_loss_mlp": 1.02604949, "epoch": 0.7893581842777694, "flos": 21579943351680.0, "grad_norm": 2.375306290130731, "language_loss": 0.69267899, "learning_rate": 4.475520477290904e-07, "loss": 0.71403778, "num_input_tokens_seen": 283267135, "step": 13129, "time_per_iteration": 2.736177682876587 }, { "auxiliary_loss_clip": 0.01020095, "auxiliary_loss_mlp": 0.01001956, "balance_loss_clip": 1.00642443, "balance_loss_mlp": 1.00062704, "epoch": 0.7894183075304374, "flos": 69016468176000.0, "grad_norm": 0.7134870194246187, "language_loss": 0.61555952, "learning_rate": 4.473065382260597e-07, "loss": 0.63578004, "num_input_tokens_seen": 283328940, "step": 13130, "time_per_iteration": 4.797807216644287 }, { "auxiliary_loss_clip": 0.0110005, "auxiliary_loss_mlp": 0.01028381, "balance_loss_clip": 1.03902447, "balance_loss_mlp": 1.01690102, "epoch": 0.7894784307831053, "flos": 24243258107520.0, "grad_norm": 1.9168458838285078, "language_loss": 0.73797166, "learning_rate": 4.4706108760132124e-07, "loss": 0.75925595, "num_input_tokens_seen": 283350000, "step": 13131, "time_per_iteration": 2.7573840618133545 }, { "auxiliary_loss_clip": 0.01088103, "auxiliary_loss_mlp": 0.01026242, "balance_loss_clip": 1.0371995, "balance_loss_mlp": 1.01223469, "epoch": 0.7895385540357733, "flos": 20266546550400.0, "grad_norm": 2.4133377950586676, "language_loss": 0.68751633, "learning_rate": 4.4681569586418153e-07, "loss": 0.70865989, "num_input_tokens_seen": 283368020, "step": 13132, "time_per_iteration": 2.719820499420166 }, { "auxiliary_loss_clip": 0.01101541, "auxiliary_loss_mlp": 0.01040122, "balance_loss_clip": 1.03842628, "balance_loss_mlp": 1.02676463, "epoch": 0.7895986772884414, "flos": 20996574566400.0, "grad_norm": 2.9264754072085104, "language_loss": 0.62335461, "learning_rate": 4.465703630239468e-07, "loss": 0.64477122, "num_input_tokens_seen": 283387030, "step": 13133, "time_per_iteration": 2.6314589977264404 }, { "auxiliary_loss_clip": 0.01079478, "auxiliary_loss_mlp": 0.01037851, "balance_loss_clip": 1.03612971, "balance_loss_mlp": 1.02386165, "epoch": 0.7896588005411093, "flos": 18657999694080.0, "grad_norm": 3.367198830526819, "language_loss": 0.7950719, "learning_rate": 4.463250890899195e-07, "loss": 0.8162452, "num_input_tokens_seen": 283402090, "step": 13134, "time_per_iteration": 2.7504961490631104 }, { "auxiliary_loss_clip": 0.0109746, "auxiliary_loss_mlp": 0.01032763, "balance_loss_clip": 1.03501463, "balance_loss_mlp": 1.02011466, "epoch": 0.7897189237937773, "flos": 18405907067520.0, "grad_norm": 1.8328144041845063, "language_loss": 0.80414212, "learning_rate": 4.460798740713998e-07, "loss": 0.82544434, "num_input_tokens_seen": 283421035, "step": 13135, "time_per_iteration": 2.666182518005371 }, { "auxiliary_loss_clip": 0.01097147, "auxiliary_loss_mlp": 0.0103152, "balance_loss_clip": 1.0373044, "balance_loss_mlp": 1.01890731, "epoch": 0.7897790470464452, "flos": 23731602825600.0, "grad_norm": 1.9348385982052458, "language_loss": 0.72716129, "learning_rate": 4.4583471797768733e-07, "loss": 0.7484479, "num_input_tokens_seen": 283441830, "step": 13136, "time_per_iteration": 2.643087387084961 }, { "auxiliary_loss_clip": 0.01115705, "auxiliary_loss_mlp": 0.01034132, "balance_loss_clip": 1.03773975, "balance_loss_mlp": 1.02081013, "epoch": 0.7898391702991132, "flos": 15918949111680.0, "grad_norm": 5.084496642242111, "language_loss": 0.70505196, "learning_rate": 4.455896208180778e-07, "loss": 0.72655034, "num_input_tokens_seen": 283459540, "step": 13137, "time_per_iteration": 2.584527015686035 }, { "auxiliary_loss_clip": 0.01108112, "auxiliary_loss_mlp": 0.01035486, "balance_loss_clip": 1.03718948, "balance_loss_mlp": 1.02206349, "epoch": 0.7898992935517811, "flos": 19829046896640.0, "grad_norm": 1.7127744511556113, "language_loss": 0.73933578, "learning_rate": 4.4534458260186645e-07, "loss": 0.76077175, "num_input_tokens_seen": 283478790, "step": 13138, "time_per_iteration": 2.7276523113250732 }, { "auxiliary_loss_clip": 0.01070823, "auxiliary_loss_mlp": 0.01031914, "balance_loss_clip": 1.03749275, "balance_loss_mlp": 1.01971924, "epoch": 0.7899594168044491, "flos": 16216253982720.0, "grad_norm": 1.9590714056368506, "language_loss": 0.68501168, "learning_rate": 4.4509960333834426e-07, "loss": 0.70603907, "num_input_tokens_seen": 283495720, "step": 13139, "time_per_iteration": 2.7639269828796387 }, { "auxiliary_loss_clip": 0.01021477, "auxiliary_loss_mlp": 0.01001215, "balance_loss_clip": 1.00810361, "balance_loss_mlp": 1.00014842, "epoch": 0.790019540057117, "flos": 68331005959680.0, "grad_norm": 0.8505295432368817, "language_loss": 0.60203749, "learning_rate": 4.448546830368003e-07, "loss": 0.62226439, "num_input_tokens_seen": 283558795, "step": 13140, "time_per_iteration": 3.293804168701172 }, { "auxiliary_loss_clip": 0.01111705, "auxiliary_loss_mlp": 0.01036907, "balance_loss_clip": 1.03908968, "balance_loss_mlp": 1.02385402, "epoch": 0.7900796633097851, "flos": 30332773601280.0, "grad_norm": 1.6223884699668718, "language_loss": 0.76106548, "learning_rate": 4.4460982170652304e-07, "loss": 0.78255159, "num_input_tokens_seen": 283579305, "step": 13141, "time_per_iteration": 2.753269672393799 }, { "auxiliary_loss_clip": 0.01101932, "auxiliary_loss_mlp": 0.01036808, "balance_loss_clip": 1.03863978, "balance_loss_mlp": 1.02401733, "epoch": 0.790139786562453, "flos": 22126790983680.0, "grad_norm": 2.0698981191981978, "language_loss": 0.68995577, "learning_rate": 4.4436501935679694e-07, "loss": 0.71134317, "num_input_tokens_seen": 283597840, "step": 13142, "time_per_iteration": 2.682314872741699 }, { "auxiliary_loss_clip": 0.00984677, "auxiliary_loss_mlp": 0.01013212, "balance_loss_clip": 1.01147008, "balance_loss_mlp": 1.01161504, "epoch": 0.790199909815121, "flos": 58207284213120.0, "grad_norm": 0.8339263340003221, "language_loss": 0.59981745, "learning_rate": 4.441202759969049e-07, "loss": 0.61979634, "num_input_tokens_seen": 283647950, "step": 13143, "time_per_iteration": 3.278980255126953 }, { "auxiliary_loss_clip": 0.01082841, "auxiliary_loss_mlp": 0.01035062, "balance_loss_clip": 1.03883827, "balance_loss_mlp": 1.02172852, "epoch": 0.7902600330677889, "flos": 34533316759680.0, "grad_norm": 1.6349862086854898, "language_loss": 0.74675769, "learning_rate": 4.4387559163612875e-07, "loss": 0.76793671, "num_input_tokens_seen": 283670645, "step": 13144, "time_per_iteration": 3.294663429260254 }, { "auxiliary_loss_clip": 0.01103742, "auxiliary_loss_mlp": 0.01036273, "balance_loss_clip": 1.03867149, "balance_loss_mlp": 1.02252793, "epoch": 0.7903201563204569, "flos": 22346384780160.0, "grad_norm": 2.139554645223281, "language_loss": 0.82848895, "learning_rate": 4.4363096628374605e-07, "loss": 0.84988904, "num_input_tokens_seen": 283688830, "step": 13145, "time_per_iteration": 2.7851741313934326 }, { "auxiliary_loss_clip": 0.01095507, "auxiliary_loss_mlp": 0.01030248, "balance_loss_clip": 1.0367043, "balance_loss_mlp": 1.01874435, "epoch": 0.790380279573125, "flos": 22053533195520.0, "grad_norm": 1.5468904439953068, "language_loss": 0.73388755, "learning_rate": 4.4338639994903235e-07, "loss": 0.75514507, "num_input_tokens_seen": 283708625, "step": 13146, "time_per_iteration": 2.65710186958313 }, { "auxiliary_loss_clip": 0.01111662, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.03781211, "balance_loss_mlp": 1.01685667, "epoch": 0.7904404028257929, "flos": 20302600826880.0, "grad_norm": 1.8467569642796249, "language_loss": 0.75617737, "learning_rate": 4.4314189264126246e-07, "loss": 0.77758318, "num_input_tokens_seen": 283725710, "step": 13147, "time_per_iteration": 2.7460520267486572 }, { "auxiliary_loss_clip": 0.01091922, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.03564286, "balance_loss_mlp": 1.02576411, "epoch": 0.7905005260784609, "flos": 20008923229440.0, "grad_norm": 1.7581550780117867, "language_loss": 0.72203916, "learning_rate": 4.428974443697087e-07, "loss": 0.7433517, "num_input_tokens_seen": 283744150, "step": 13148, "time_per_iteration": 2.6912500858306885 }, { "auxiliary_loss_clip": 0.01095913, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 1.03445816, "balance_loss_mlp": 1.01777613, "epoch": 0.7905606493311288, "flos": 26905926418560.0, "grad_norm": 1.814389925772028, "language_loss": 0.71692038, "learning_rate": 4.4265305514363913e-07, "loss": 0.73818725, "num_input_tokens_seen": 283764170, "step": 13149, "time_per_iteration": 2.800591230392456 }, { "auxiliary_loss_clip": 0.01074802, "auxiliary_loss_mlp": 0.01034207, "balance_loss_clip": 1.03384662, "balance_loss_mlp": 1.02023542, "epoch": 0.7906207725837968, "flos": 23696230907520.0, "grad_norm": 2.263262344883557, "language_loss": 0.65186799, "learning_rate": 4.424087249723225e-07, "loss": 0.67295814, "num_input_tokens_seen": 283784305, "step": 13150, "time_per_iteration": 2.774513006210327 }, { "auxiliary_loss_clip": 0.01108732, "auxiliary_loss_mlp": 0.01033548, "balance_loss_clip": 1.03688979, "balance_loss_mlp": 1.02138877, "epoch": 0.7906808958364647, "flos": 20848837927680.0, "grad_norm": 2.4892944447292065, "language_loss": 0.70353788, "learning_rate": 4.421644538650231e-07, "loss": 0.72496063, "num_input_tokens_seen": 283804040, "step": 13151, "time_per_iteration": 2.624737024307251 }, { "auxiliary_loss_clip": 0.01091472, "auxiliary_loss_mlp": 0.0103796, "balance_loss_clip": 1.03773379, "balance_loss_mlp": 1.02501988, "epoch": 0.7907410190891327, "flos": 40735196974080.0, "grad_norm": 1.643411919564688, "language_loss": 0.70038378, "learning_rate": 4.4192024183100306e-07, "loss": 0.72167814, "num_input_tokens_seen": 283827120, "step": 13152, "time_per_iteration": 2.820726156234741 }, { "auxiliary_loss_clip": 0.01076957, "auxiliary_loss_mlp": 0.00770237, "balance_loss_clip": 1.03583848, "balance_loss_mlp": 1.00032854, "epoch": 0.7908011423418007, "flos": 13261165050240.0, "grad_norm": 2.5235845787272972, "language_loss": 0.72838122, "learning_rate": 4.4167608887952367e-07, "loss": 0.74685311, "num_input_tokens_seen": 283844820, "step": 13153, "time_per_iteration": 2.782799005508423 }, { "auxiliary_loss_clip": 0.01109362, "auxiliary_loss_mlp": 0.01027556, "balance_loss_clip": 1.0372107, "balance_loss_mlp": 1.01542032, "epoch": 0.7908612655944687, "flos": 19754747614080.0, "grad_norm": 1.5411567451067254, "language_loss": 0.78878421, "learning_rate": 4.4143199501984306e-07, "loss": 0.81015342, "num_input_tokens_seen": 283862870, "step": 13154, "time_per_iteration": 2.617465019226074 }, { "auxiliary_loss_clip": 0.01106383, "auxiliary_loss_mlp": 0.01030654, "balance_loss_clip": 1.03864491, "balance_loss_mlp": 1.0168618, "epoch": 0.7909213888471366, "flos": 21287738211840.0, "grad_norm": 2.826426218857978, "language_loss": 0.7024678, "learning_rate": 4.411879602612185e-07, "loss": 0.72383815, "num_input_tokens_seen": 283882405, "step": 13155, "time_per_iteration": 2.60141658782959 }, { "auxiliary_loss_clip": 0.01110954, "auxiliary_loss_mlp": 0.01030043, "balance_loss_clip": 1.03789937, "balance_loss_mlp": 1.01748431, "epoch": 0.7909815120998046, "flos": 22528882805760.0, "grad_norm": 1.6493957316701613, "language_loss": 0.76920623, "learning_rate": 4.4094398461290174e-07, "loss": 0.79061615, "num_input_tokens_seen": 283902070, "step": 13156, "time_per_iteration": 2.616990327835083 }, { "auxiliary_loss_clip": 0.01077807, "auxiliary_loss_mlp": 0.01032296, "balance_loss_clip": 1.03416896, "balance_loss_mlp": 1.02008295, "epoch": 0.7910416353524725, "flos": 26727702111360.0, "grad_norm": 1.6152194898453356, "language_loss": 0.65486753, "learning_rate": 4.4070006808414526e-07, "loss": 0.67596853, "num_input_tokens_seen": 283924100, "step": 13157, "time_per_iteration": 2.7800040245056152 }, { "auxiliary_loss_clip": 0.01098205, "auxiliary_loss_mlp": 0.01038516, "balance_loss_clip": 1.03559875, "balance_loss_mlp": 1.02468824, "epoch": 0.7911017586051405, "flos": 24644847139200.0, "grad_norm": 1.6816257658835039, "language_loss": 0.74068034, "learning_rate": 4.4045621068419894e-07, "loss": 0.76204759, "num_input_tokens_seen": 283944955, "step": 13158, "time_per_iteration": 2.6075475215911865 }, { "auxiliary_loss_clip": 0.01095673, "auxiliary_loss_mlp": 0.01033389, "balance_loss_clip": 1.0357399, "balance_loss_mlp": 1.02176023, "epoch": 0.7911618818578086, "flos": 17565489578880.0, "grad_norm": 2.030035460018427, "language_loss": 0.67612302, "learning_rate": 4.40212412422309e-07, "loss": 0.69741368, "num_input_tokens_seen": 283963125, "step": 13159, "time_per_iteration": 2.6242077350616455 }, { "auxiliary_loss_clip": 0.01098583, "auxiliary_loss_mlp": 0.01035004, "balance_loss_clip": 1.03775477, "balance_loss_mlp": 1.02250552, "epoch": 0.7912220051104765, "flos": 16721660298240.0, "grad_norm": 3.313195141465383, "language_loss": 0.67271805, "learning_rate": 4.399686733077206e-07, "loss": 0.69405401, "num_input_tokens_seen": 283982850, "step": 13160, "time_per_iteration": 2.75685715675354 }, { "auxiliary_loss_clip": 0.0108344, "auxiliary_loss_mlp": 0.01027351, "balance_loss_clip": 1.03476191, "balance_loss_mlp": 1.01664615, "epoch": 0.7912821283631445, "flos": 13698736531200.0, "grad_norm": 2.063884011157957, "language_loss": 0.72593331, "learning_rate": 4.3972499334967694e-07, "loss": 0.74704123, "num_input_tokens_seen": 283998275, "step": 13161, "time_per_iteration": 2.6084799766540527 }, { "auxiliary_loss_clip": 0.01080502, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.03568757, "balance_loss_mlp": 1.02046573, "epoch": 0.7913422516158124, "flos": 23769021818880.0, "grad_norm": 1.6120052582411066, "language_loss": 0.73379862, "learning_rate": 4.39481372557418e-07, "loss": 0.75493419, "num_input_tokens_seen": 284018750, "step": 13162, "time_per_iteration": 6.126726865768433 }, { "auxiliary_loss_clip": 0.01089834, "auxiliary_loss_mlp": 0.01032531, "balance_loss_clip": 1.03729248, "balance_loss_mlp": 1.01965666, "epoch": 0.7914023748684804, "flos": 19938251220480.0, "grad_norm": 1.9889723389835698, "language_loss": 0.71760178, "learning_rate": 4.392378109401811e-07, "loss": 0.73882544, "num_input_tokens_seen": 284037850, "step": 13163, "time_per_iteration": 4.413632869720459 }, { "auxiliary_loss_clip": 0.01075124, "auxiliary_loss_mlp": 0.01031765, "balance_loss_clip": 1.03465581, "balance_loss_mlp": 1.01800179, "epoch": 0.7914624981211483, "flos": 20594805966720.0, "grad_norm": 1.8803473960616024, "language_loss": 0.70246696, "learning_rate": 4.3899430850720296e-07, "loss": 0.72353578, "num_input_tokens_seen": 284056380, "step": 13164, "time_per_iteration": 2.698758840560913 }, { "auxiliary_loss_clip": 0.01070741, "auxiliary_loss_mlp": 0.01037319, "balance_loss_clip": 1.0364182, "balance_loss_mlp": 1.02521276, "epoch": 0.7915226213738163, "flos": 21799465320960.0, "grad_norm": 1.885675562841956, "language_loss": 0.67027831, "learning_rate": 4.387508652677177e-07, "loss": 0.69135886, "num_input_tokens_seen": 284074945, "step": 13165, "time_per_iteration": 2.74423885345459 }, { "auxiliary_loss_clip": 0.01062193, "auxiliary_loss_mlp": 0.01027491, "balance_loss_clip": 1.0360235, "balance_loss_mlp": 1.0160346, "epoch": 0.7915827446264843, "flos": 16288362535680.0, "grad_norm": 2.5652044967821563, "language_loss": 0.72134489, "learning_rate": 4.385074812309557e-07, "loss": 0.74224174, "num_input_tokens_seen": 284092070, "step": 13166, "time_per_iteration": 2.74450421333313 }, { "auxiliary_loss_clip": 0.01107168, "auxiliary_loss_mlp": 0.01033777, "balance_loss_clip": 1.03592849, "balance_loss_mlp": 1.02065766, "epoch": 0.7916428678791523, "flos": 25702595867520.0, "grad_norm": 1.6924622649146908, "language_loss": 0.77245665, "learning_rate": 4.382641564061462e-07, "loss": 0.79386616, "num_input_tokens_seen": 284112255, "step": 13167, "time_per_iteration": 2.6304922103881836 }, { "auxiliary_loss_clip": 0.01074373, "auxiliary_loss_mlp": 0.01032393, "balance_loss_clip": 1.03654242, "balance_loss_mlp": 1.02080607, "epoch": 0.7917029911318202, "flos": 23878513451520.0, "grad_norm": 1.5572430197509217, "language_loss": 0.8423599, "learning_rate": 4.3802089080251713e-07, "loss": 0.86342752, "num_input_tokens_seen": 284132330, "step": 13168, "time_per_iteration": 2.7429237365722656 }, { "auxiliary_loss_clip": 0.011112, "auxiliary_loss_mlp": 0.01031479, "balance_loss_clip": 1.03944874, "balance_loss_mlp": 1.01902127, "epoch": 0.7917631143844882, "flos": 21646593037440.0, "grad_norm": 1.5464810747568485, "language_loss": 0.72668618, "learning_rate": 4.3777768442929155e-07, "loss": 0.74811298, "num_input_tokens_seen": 284150640, "step": 13169, "time_per_iteration": 2.6592273712158203 }, { "auxiliary_loss_clip": 0.01112278, "auxiliary_loss_mlp": 0.01034078, "balance_loss_clip": 1.03776097, "balance_loss_mlp": 1.02096534, "epoch": 0.7918232376371561, "flos": 38874198355200.0, "grad_norm": 3.0164907954915856, "language_loss": 0.67173648, "learning_rate": 4.3753453729569287e-07, "loss": 0.69320005, "num_input_tokens_seen": 284171910, "step": 13170, "time_per_iteration": 4.270065546035767 }, { "auxiliary_loss_clip": 0.01098461, "auxiliary_loss_mlp": 0.01026319, "balance_loss_clip": 1.03575373, "balance_loss_mlp": 1.01436245, "epoch": 0.7918833608898241, "flos": 20775544225920.0, "grad_norm": 1.6549225426524543, "language_loss": 0.70591486, "learning_rate": 4.372914494109412e-07, "loss": 0.72716266, "num_input_tokens_seen": 284191340, "step": 13171, "time_per_iteration": 2.6470091342926025 }, { "auxiliary_loss_clip": 0.01097608, "auxiliary_loss_mlp": 0.01030463, "balance_loss_clip": 1.03621912, "balance_loss_mlp": 1.01798749, "epoch": 0.7919434841424922, "flos": 33910122769920.0, "grad_norm": 3.018313579930399, "language_loss": 0.67142022, "learning_rate": 4.370484207842553e-07, "loss": 0.69270092, "num_input_tokens_seen": 284212495, "step": 13172, "time_per_iteration": 2.7242603302001953 }, { "auxiliary_loss_clip": 0.01083539, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.03492141, "balance_loss_mlp": 1.02068591, "epoch": 0.7920036073951601, "flos": 21064660796160.0, "grad_norm": 2.177677653156343, "language_loss": 0.79725873, "learning_rate": 4.3680545142484893e-07, "loss": 0.81843126, "num_input_tokens_seen": 284230825, "step": 13173, "time_per_iteration": 2.6997551918029785 }, { "auxiliary_loss_clip": 0.01071714, "auxiliary_loss_mlp": 0.01038162, "balance_loss_clip": 1.03270435, "balance_loss_mlp": 1.02527571, "epoch": 0.7920637306478281, "flos": 23655974739840.0, "grad_norm": 2.0997194022490038, "language_loss": 0.76738131, "learning_rate": 4.365625413419365e-07, "loss": 0.78848016, "num_input_tokens_seen": 284250365, "step": 13174, "time_per_iteration": 2.8328940868377686 }, { "auxiliary_loss_clip": 0.01083806, "auxiliary_loss_mlp": 0.01034483, "balance_loss_clip": 1.03376579, "balance_loss_mlp": 1.02280629, "epoch": 0.792123853900496, "flos": 27195438038400.0, "grad_norm": 2.433097475426471, "language_loss": 0.71779603, "learning_rate": 4.363196905447297e-07, "loss": 0.73897892, "num_input_tokens_seen": 284269635, "step": 13175, "time_per_iteration": 2.7348971366882324 }, { "auxiliary_loss_clip": 0.01098061, "auxiliary_loss_mlp": 0.010319, "balance_loss_clip": 1.03613544, "balance_loss_mlp": 1.01925838, "epoch": 0.792183977153164, "flos": 19098659744640.0, "grad_norm": 1.8855424428426124, "language_loss": 0.60150284, "learning_rate": 4.360768990424364e-07, "loss": 0.62280244, "num_input_tokens_seen": 284288380, "step": 13176, "time_per_iteration": 2.645940065383911 }, { "auxiliary_loss_clip": 0.01112239, "auxiliary_loss_mlp": 0.01033063, "balance_loss_clip": 1.04115438, "balance_loss_mlp": 1.02052176, "epoch": 0.7922441004058319, "flos": 17128851851520.0, "grad_norm": 1.8607925161268413, "language_loss": 0.73708278, "learning_rate": 4.3583416684426376e-07, "loss": 0.75853586, "num_input_tokens_seen": 284306920, "step": 13177, "time_per_iteration": 2.624305009841919 }, { "auxiliary_loss_clip": 0.01092978, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.03717804, "balance_loss_mlp": 1.02310514, "epoch": 0.7923042236585, "flos": 17821640442240.0, "grad_norm": 1.880784618902091, "language_loss": 0.64198965, "learning_rate": 4.355914939594174e-07, "loss": 0.66327411, "num_input_tokens_seen": 284324700, "step": 13178, "time_per_iteration": 2.6623740196228027 }, { "auxiliary_loss_clip": 0.01086006, "auxiliary_loss_mlp": 0.01028552, "balance_loss_clip": 1.03637266, "balance_loss_mlp": 1.01807904, "epoch": 0.7923643469111679, "flos": 29935206892800.0, "grad_norm": 1.4540811343422748, "language_loss": 0.68699908, "learning_rate": 4.3534888039709726e-07, "loss": 0.70814466, "num_input_tokens_seen": 284345985, "step": 13179, "time_per_iteration": 2.832632541656494 }, { "auxiliary_loss_clip": 0.01106835, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.036268, "balance_loss_mlp": 1.01872063, "epoch": 0.7924244701638359, "flos": 22674716023680.0, "grad_norm": 2.2092827624793117, "language_loss": 0.74018443, "learning_rate": 4.3510632616650444e-07, "loss": 0.76156163, "num_input_tokens_seen": 284364475, "step": 13180, "time_per_iteration": 2.6299288272857666 }, { "auxiliary_loss_clip": 0.01099012, "auxiliary_loss_mlp": 0.01036443, "balance_loss_clip": 1.03927088, "balance_loss_mlp": 1.02306199, "epoch": 0.7924845934165038, "flos": 17968156018560.0, "grad_norm": 2.065397931254967, "language_loss": 0.8179431, "learning_rate": 4.3486383127683646e-07, "loss": 0.83929765, "num_input_tokens_seen": 284382125, "step": 13181, "time_per_iteration": 2.6588377952575684 }, { "auxiliary_loss_clip": 0.01079854, "auxiliary_loss_mlp": 0.01038549, "balance_loss_clip": 1.03439593, "balance_loss_mlp": 1.02538192, "epoch": 0.7925447166691718, "flos": 23476960333440.0, "grad_norm": 1.7700147531802202, "language_loss": 0.77401638, "learning_rate": 4.346213957372895e-07, "loss": 0.79520041, "num_input_tokens_seen": 284401585, "step": 13182, "time_per_iteration": 2.702794313430786 }, { "auxiliary_loss_clip": 0.01097492, "auxiliary_loss_mlp": 0.01041087, "balance_loss_clip": 1.0389626, "balance_loss_mlp": 1.02766991, "epoch": 0.7926048399218397, "flos": 20447572118400.0, "grad_norm": 1.8061510819801756, "language_loss": 0.74171931, "learning_rate": 4.34379019557056e-07, "loss": 0.76310509, "num_input_tokens_seen": 284419125, "step": 13183, "time_per_iteration": 2.615912675857544 }, { "auxiliary_loss_clip": 0.01078552, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.03608036, "balance_loss_mlp": 1.0189023, "epoch": 0.7926649631745077, "flos": 37160038535040.0, "grad_norm": 1.5412113664578542, "language_loss": 0.68428183, "learning_rate": 4.341367027453264e-07, "loss": 0.70539147, "num_input_tokens_seen": 284440445, "step": 13184, "time_per_iteration": 2.7763001918792725 }, { "auxiliary_loss_clip": 0.01073218, "auxiliary_loss_mlp": 0.01034358, "balance_loss_clip": 1.03828871, "balance_loss_mlp": 1.02169812, "epoch": 0.7927250864271758, "flos": 17018606033280.0, "grad_norm": 1.8246032292732381, "language_loss": 0.70783365, "learning_rate": 4.338944453112907e-07, "loss": 0.72890937, "num_input_tokens_seen": 284459370, "step": 13185, "time_per_iteration": 2.7633087635040283 }, { "auxiliary_loss_clip": 0.01096127, "auxiliary_loss_mlp": 0.01032722, "balance_loss_clip": 1.03772926, "balance_loss_mlp": 1.02017522, "epoch": 0.7927852096798437, "flos": 17749208666880.0, "grad_norm": 2.140666716995379, "language_loss": 0.65258479, "learning_rate": 4.3365224726413375e-07, "loss": 0.67387331, "num_input_tokens_seen": 284477525, "step": 13186, "time_per_iteration": 2.762816905975342 }, { "auxiliary_loss_clip": 0.01094364, "auxiliary_loss_mlp": 0.01037697, "balance_loss_clip": 1.03739357, "balance_loss_mlp": 1.02557981, "epoch": 0.7928453329325117, "flos": 23838436851840.0, "grad_norm": 1.4957281455318547, "language_loss": 0.76961684, "learning_rate": 4.334101086130408e-07, "loss": 0.79093742, "num_input_tokens_seen": 284496590, "step": 13187, "time_per_iteration": 2.7023680210113525 }, { "auxiliary_loss_clip": 0.01088541, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.03613758, "balance_loss_mlp": 1.0191083, "epoch": 0.7929054561851796, "flos": 17454920538240.0, "grad_norm": 2.090727269336269, "language_loss": 0.7242974, "learning_rate": 4.3316802936719334e-07, "loss": 0.7454946, "num_input_tokens_seen": 284511470, "step": 13188, "time_per_iteration": 2.6116061210632324 }, { "auxiliary_loss_clip": 0.01110097, "auxiliary_loss_mlp": 0.00771207, "balance_loss_clip": 1.03619778, "balance_loss_mlp": 1.0002346, "epoch": 0.7929655794378476, "flos": 21981280988160.0, "grad_norm": 3.5192145755873043, "language_loss": 0.63126463, "learning_rate": 4.329260095357725e-07, "loss": 0.65007764, "num_input_tokens_seen": 284531125, "step": 13189, "time_per_iteration": 2.5492398738861084 }, { "auxiliary_loss_clip": 0.01063574, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.03684545, "balance_loss_mlp": 1.02014804, "epoch": 0.7930257026905155, "flos": 17273930883840.0, "grad_norm": 1.84728181644231, "language_loss": 0.73074591, "learning_rate": 4.3268404912795307e-07, "loss": 0.75169981, "num_input_tokens_seen": 284549340, "step": 13190, "time_per_iteration": 2.7327284812927246 }, { "auxiliary_loss_clip": 0.01094105, "auxiliary_loss_mlp": 0.01030162, "balance_loss_clip": 1.03697276, "balance_loss_mlp": 1.01938009, "epoch": 0.7930858259431836, "flos": 27300584125440.0, "grad_norm": 1.7378717321453667, "language_loss": 0.73166823, "learning_rate": 4.3244214815291166e-07, "loss": 0.75291085, "num_input_tokens_seen": 284567060, "step": 13191, "time_per_iteration": 2.761871337890625 }, { "auxiliary_loss_clip": 0.01097055, "auxiliary_loss_mlp": 0.01039867, "balance_loss_clip": 1.03603792, "balance_loss_mlp": 1.02686751, "epoch": 0.7931459491958515, "flos": 19863736456320.0, "grad_norm": 1.7612896167092924, "language_loss": 0.69279987, "learning_rate": 4.322003066198219e-07, "loss": 0.71416903, "num_input_tokens_seen": 284586600, "step": 13192, "time_per_iteration": 2.6835954189300537 }, { "auxiliary_loss_clip": 0.01074955, "auxiliary_loss_mlp": 0.01035455, "balance_loss_clip": 1.03394866, "balance_loss_mlp": 1.0229497, "epoch": 0.7932060724485195, "flos": 23147120718720.0, "grad_norm": 1.8840827690458661, "language_loss": 0.75363815, "learning_rate": 4.3195852453785274e-07, "loss": 0.77474225, "num_input_tokens_seen": 284605715, "step": 13193, "time_per_iteration": 2.723729372024536 }, { "auxiliary_loss_clip": 0.01097101, "auxiliary_loss_mlp": 0.01033041, "balance_loss_clip": 1.03796721, "balance_loss_mlp": 1.01971292, "epoch": 0.7932661957011874, "flos": 29934847756800.0, "grad_norm": 2.301032967508139, "language_loss": 0.71940517, "learning_rate": 4.317168019161741e-07, "loss": 0.74070656, "num_input_tokens_seen": 284628540, "step": 13194, "time_per_iteration": 2.758888006210327 }, { "auxiliary_loss_clip": 0.01113373, "auxiliary_loss_mlp": 0.01036092, "balance_loss_clip": 1.03853393, "balance_loss_mlp": 1.02333045, "epoch": 0.7933263189538554, "flos": 22559119079040.0, "grad_norm": 1.9174397116927768, "language_loss": 0.70116889, "learning_rate": 4.314751387639517e-07, "loss": 0.72266352, "num_input_tokens_seen": 284646040, "step": 13195, "time_per_iteration": 2.558119058609009 }, { "auxiliary_loss_clip": 0.01060029, "auxiliary_loss_mlp": 0.0102797, "balance_loss_clip": 1.03700423, "balance_loss_mlp": 1.0154351, "epoch": 0.7933864422065233, "flos": 25479051575040.0, "grad_norm": 3.5361878755115286, "language_loss": 0.77569836, "learning_rate": 4.3123353509034844e-07, "loss": 0.79657841, "num_input_tokens_seen": 284665110, "step": 13196, "time_per_iteration": 2.7758255004882812 }, { "auxiliary_loss_clip": 0.01079414, "auxiliary_loss_mlp": 0.01037257, "balance_loss_clip": 1.03883171, "balance_loss_mlp": 1.02485287, "epoch": 0.7934465654591913, "flos": 33583156243200.0, "grad_norm": 1.7631963808402482, "language_loss": 0.68811917, "learning_rate": 4.309919909045268e-07, "loss": 0.70928586, "num_input_tokens_seen": 284686515, "step": 13197, "time_per_iteration": 2.788442850112915 }, { "auxiliary_loss_clip": 0.01097503, "auxiliary_loss_mlp": 0.01029061, "balance_loss_clip": 1.03770566, "balance_loss_mlp": 1.01680613, "epoch": 0.7935066887118594, "flos": 31432538263680.0, "grad_norm": 2.573420648877448, "language_loss": 0.65293157, "learning_rate": 4.30750506215646e-07, "loss": 0.6741972, "num_input_tokens_seen": 284707300, "step": 13198, "time_per_iteration": 2.785005807876587 }, { "auxiliary_loss_clip": 0.010622, "auxiliary_loss_mlp": 0.01040394, "balance_loss_clip": 1.03600621, "balance_loss_mlp": 1.02515936, "epoch": 0.7935668119645273, "flos": 14682616940160.0, "grad_norm": 2.6924527077689113, "language_loss": 0.72298622, "learning_rate": 4.30509081032864e-07, "loss": 0.74401212, "num_input_tokens_seen": 284723545, "step": 13199, "time_per_iteration": 2.828518867492676 }, { "auxiliary_loss_clip": 0.01083399, "auxiliary_loss_mlp": 0.01032791, "balance_loss_clip": 1.03479409, "balance_loss_mlp": 1.02038765, "epoch": 0.7936269352171953, "flos": 18004246208640.0, "grad_norm": 1.7805702438055635, "language_loss": 0.80542034, "learning_rate": 4.302677153653349e-07, "loss": 0.82658225, "num_input_tokens_seen": 284742650, "step": 13200, "time_per_iteration": 2.719022035598755 }, { "auxiliary_loss_clip": 0.01096575, "auxiliary_loss_mlp": 0.01034603, "balance_loss_clip": 1.0383296, "balance_loss_mlp": 1.02258706, "epoch": 0.7936870584698632, "flos": 18880215183360.0, "grad_norm": 1.7717483221141246, "language_loss": 0.77400053, "learning_rate": 4.3002640922221077e-07, "loss": 0.79531235, "num_input_tokens_seen": 284760955, "step": 13201, "time_per_iteration": 4.26847243309021 }, { "auxiliary_loss_clip": 0.01108331, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.03744578, "balance_loss_mlp": 1.02092719, "epoch": 0.7937471817225312, "flos": 23367001824000.0, "grad_norm": 1.5551997587526456, "language_loss": 0.67323661, "learning_rate": 4.2978516261264296e-07, "loss": 0.69464856, "num_input_tokens_seen": 284780745, "step": 13202, "time_per_iteration": 4.283862352371216 }, { "auxiliary_loss_clip": 0.01099327, "auxiliary_loss_mlp": 0.01034975, "balance_loss_clip": 1.03811014, "balance_loss_mlp": 1.02211785, "epoch": 0.7938073049751991, "flos": 22674428714880.0, "grad_norm": 2.1258656424203464, "language_loss": 0.75316, "learning_rate": 4.2954397554577884e-07, "loss": 0.77450299, "num_input_tokens_seen": 284799000, "step": 13203, "time_per_iteration": 4.218053817749023 }, { "auxiliary_loss_clip": 0.01057545, "auxiliary_loss_mlp": 0.01032901, "balance_loss_clip": 1.03676009, "balance_loss_mlp": 1.02075946, "epoch": 0.7938674282278672, "flos": 22851431959680.0, "grad_norm": 1.8069073081221512, "language_loss": 0.66618353, "learning_rate": 4.293028480307643e-07, "loss": 0.68708801, "num_input_tokens_seen": 284817450, "step": 13204, "time_per_iteration": 2.819964647293091 }, { "auxiliary_loss_clip": 0.01049205, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.03277397, "balance_loss_mlp": 1.01646256, "epoch": 0.7939275514805351, "flos": 27012509049600.0, "grad_norm": 1.5710457021949253, "language_loss": 0.7940079, "learning_rate": 4.290617800767438e-07, "loss": 0.8147893, "num_input_tokens_seen": 284838865, "step": 13205, "time_per_iteration": 2.832738161087036 }, { "auxiliary_loss_clip": 0.0107234, "auxiliary_loss_mlp": 0.01030961, "balance_loss_clip": 1.0324893, "balance_loss_mlp": 1.01827097, "epoch": 0.7939876747332031, "flos": 21142838747520.0, "grad_norm": 7.819538292121243, "language_loss": 0.7771039, "learning_rate": 4.28820771692858e-07, "loss": 0.79813695, "num_input_tokens_seen": 284857975, "step": 13206, "time_per_iteration": 2.7768259048461914 }, { "auxiliary_loss_clip": 0.01086044, "auxiliary_loss_mlp": 0.01035236, "balance_loss_clip": 1.03653049, "balance_loss_mlp": 1.02031064, "epoch": 0.794047797985871, "flos": 23289075267840.0, "grad_norm": 2.0761554247876526, "language_loss": 0.78858304, "learning_rate": 4.285798228882456e-07, "loss": 0.8097958, "num_input_tokens_seen": 284877145, "step": 13207, "time_per_iteration": 2.78918719291687 }, { "auxiliary_loss_clip": 0.01071641, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.03531897, "balance_loss_mlp": 1.02225077, "epoch": 0.794107921238539, "flos": 24608074590720.0, "grad_norm": 1.921000285111042, "language_loss": 0.83848017, "learning_rate": 4.2833893367204375e-07, "loss": 0.85954154, "num_input_tokens_seen": 284895560, "step": 13208, "time_per_iteration": 2.799513578414917 }, { "auxiliary_loss_clip": 0.00994574, "auxiliary_loss_mlp": 0.0101022, "balance_loss_clip": 1.00948644, "balance_loss_mlp": 1.0090878, "epoch": 0.7941680444912069, "flos": 64093690252800.0, "grad_norm": 0.7333327804859686, "language_loss": 0.58320063, "learning_rate": 4.280981040533875e-07, "loss": 0.60324866, "num_input_tokens_seen": 284963135, "step": 13209, "time_per_iteration": 4.956205368041992 }, { "auxiliary_loss_clip": 0.01076765, "auxiliary_loss_mlp": 0.01034685, "balance_loss_clip": 1.03624475, "balance_loss_mlp": 1.02142262, "epoch": 0.794228167743875, "flos": 24388839930240.0, "grad_norm": 2.256316924700655, "language_loss": 0.62863505, "learning_rate": 4.2785733404140825e-07, "loss": 0.64974952, "num_input_tokens_seen": 284981755, "step": 13210, "time_per_iteration": 2.7703917026519775 }, { "auxiliary_loss_clip": 0.010938, "auxiliary_loss_mlp": 0.01036077, "balance_loss_clip": 1.03719687, "balance_loss_mlp": 1.024073, "epoch": 0.794288290996543, "flos": 28512498026880.0, "grad_norm": 1.9531340028994628, "language_loss": 0.6936754, "learning_rate": 4.2761662364523676e-07, "loss": 0.71497422, "num_input_tokens_seen": 285003060, "step": 13211, "time_per_iteration": 2.74078106880188 }, { "auxiliary_loss_clip": 0.01102649, "auxiliary_loss_mlp": 0.01039332, "balance_loss_clip": 1.03825963, "balance_loss_mlp": 1.02562237, "epoch": 0.7943484142492109, "flos": 25922117836800.0, "grad_norm": 1.640321960898119, "language_loss": 0.72502631, "learning_rate": 4.2737597287400074e-07, "loss": 0.74644607, "num_input_tokens_seen": 285021640, "step": 13212, "time_per_iteration": 2.745793104171753 }, { "auxiliary_loss_clip": 0.01095421, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.03563583, "balance_loss_mlp": 1.01776266, "epoch": 0.7944085375018789, "flos": 23915286000000.0, "grad_norm": 1.7707252579484445, "language_loss": 0.80655056, "learning_rate": 4.271353817368246e-07, "loss": 0.82780391, "num_input_tokens_seen": 285040490, "step": 13213, "time_per_iteration": 2.7571616172790527 }, { "auxiliary_loss_clip": 0.01102684, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.03846729, "balance_loss_mlp": 1.01816225, "epoch": 0.7944686607545468, "flos": 20229953569920.0, "grad_norm": 2.0723417946435196, "language_loss": 0.67524314, "learning_rate": 4.268948502428327e-07, "loss": 0.69657904, "num_input_tokens_seen": 285059270, "step": 13214, "time_per_iteration": 2.7216098308563232 }, { "auxiliary_loss_clip": 0.01107626, "auxiliary_loss_mlp": 0.01031004, "balance_loss_clip": 1.03777719, "balance_loss_mlp": 1.01888001, "epoch": 0.7945287840072148, "flos": 21980993679360.0, "grad_norm": 2.140296316096213, "language_loss": 0.72678429, "learning_rate": 4.2665437840114535e-07, "loss": 0.74817061, "num_input_tokens_seen": 285075390, "step": 13215, "time_per_iteration": 2.687727212905884 }, { "auxiliary_loss_clip": 0.01058497, "auxiliary_loss_mlp": 0.01037215, "balance_loss_clip": 1.03636539, "balance_loss_mlp": 1.02328491, "epoch": 0.7945889072598827, "flos": 26397718842240.0, "grad_norm": 1.5145901921228262, "language_loss": 0.79136622, "learning_rate": 4.2641396622088253e-07, "loss": 0.81232333, "num_input_tokens_seen": 285096290, "step": 13216, "time_per_iteration": 2.7064990997314453 }, { "auxiliary_loss_clip": 0.01096019, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.03587198, "balance_loss_mlp": 1.02159381, "epoch": 0.7946490305125508, "flos": 25810255906560.0, "grad_norm": 1.5522674129771217, "language_loss": 0.73874998, "learning_rate": 4.261736137111598e-07, "loss": 0.7600522, "num_input_tokens_seen": 285116020, "step": 13217, "time_per_iteration": 2.6791646480560303 }, { "auxiliary_loss_clip": 0.01082895, "auxiliary_loss_mlp": 0.01034325, "balance_loss_clip": 1.03578281, "balance_loss_mlp": 1.02138495, "epoch": 0.7947091537652187, "flos": 15960965045760.0, "grad_norm": 1.8630939701915927, "language_loss": 0.73956853, "learning_rate": 4.259333208810907e-07, "loss": 0.76074076, "num_input_tokens_seen": 285133510, "step": 13218, "time_per_iteration": 2.681337594985962 }, { "auxiliary_loss_clip": 0.01099657, "auxiliary_loss_mlp": 0.01037837, "balance_loss_clip": 1.0363996, "balance_loss_mlp": 1.02428901, "epoch": 0.7947692770178867, "flos": 18587866389120.0, "grad_norm": 1.8649212651108453, "language_loss": 0.83193207, "learning_rate": 4.2569308773978817e-07, "loss": 0.85330701, "num_input_tokens_seen": 285151690, "step": 13219, "time_per_iteration": 2.6580770015716553 }, { "auxiliary_loss_clip": 0.01100239, "auxiliary_loss_mlp": 0.01043205, "balance_loss_clip": 1.03854525, "balance_loss_mlp": 1.02832818, "epoch": 0.7948294002705546, "flos": 20442220992000.0, "grad_norm": 2.3946957736467915, "language_loss": 0.75677502, "learning_rate": 4.2545291429636123e-07, "loss": 0.77820945, "num_input_tokens_seen": 285170485, "step": 13220, "time_per_iteration": 2.644994020462036 }, { "auxiliary_loss_clip": 0.01084385, "auxiliary_loss_mlp": 0.01035732, "balance_loss_clip": 1.0356847, "balance_loss_mlp": 1.0225656, "epoch": 0.7948895235232226, "flos": 38181194282880.0, "grad_norm": 1.8822123036698593, "language_loss": 0.72409242, "learning_rate": 4.252128005599176e-07, "loss": 0.74529362, "num_input_tokens_seen": 285191050, "step": 13221, "time_per_iteration": 2.765852689743042 }, { "auxiliary_loss_clip": 0.01099762, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.03885102, "balance_loss_mlp": 1.01822662, "epoch": 0.7949496467758905, "flos": 15559806977280.0, "grad_norm": 2.0084967919839527, "language_loss": 0.74979097, "learning_rate": 4.249727465395634e-07, "loss": 0.77108967, "num_input_tokens_seen": 285208750, "step": 13222, "time_per_iteration": 2.6160507202148438 }, { "auxiliary_loss_clip": 0.01012175, "auxiliary_loss_mlp": 0.01002836, "balance_loss_clip": 1.00953972, "balance_loss_mlp": 1.00179863, "epoch": 0.7950097700285585, "flos": 70897036728960.0, "grad_norm": 0.7723294250131235, "language_loss": 0.6706062, "learning_rate": 4.247327522443993e-07, "loss": 0.69075632, "num_input_tokens_seen": 285264605, "step": 13223, "time_per_iteration": 3.087876319885254 }, { "auxiliary_loss_clip": 0.010973, "auxiliary_loss_mlp": 0.01033159, "balance_loss_clip": 1.0365479, "balance_loss_mlp": 1.01981974, "epoch": 0.7950698932812266, "flos": 23951627585280.0, "grad_norm": 2.366420887555622, "language_loss": 0.71044689, "learning_rate": 4.2449281768352717e-07, "loss": 0.73175144, "num_input_tokens_seen": 285283940, "step": 13224, "time_per_iteration": 2.640591621398926 }, { "auxiliary_loss_clip": 0.0103006, "auxiliary_loss_mlp": 0.01002258, "balance_loss_clip": 1.00757217, "balance_loss_mlp": 1.00124514, "epoch": 0.7951300165338945, "flos": 60282561415680.0, "grad_norm": 0.6682926442494496, "language_loss": 0.54986, "learning_rate": 4.2425294286604527e-07, "loss": 0.57018316, "num_input_tokens_seen": 285349525, "step": 13225, "time_per_iteration": 3.1831283569335938 }, { "auxiliary_loss_clip": 0.01083968, "auxiliary_loss_mlp": 0.01025518, "balance_loss_clip": 1.03336477, "balance_loss_mlp": 1.01375794, "epoch": 0.7951901397865625, "flos": 22819004956800.0, "grad_norm": 2.1821061924274106, "language_loss": 0.64788163, "learning_rate": 4.2401312780105034e-07, "loss": 0.66897643, "num_input_tokens_seen": 285367355, "step": 13226, "time_per_iteration": 2.7201919555664062 }, { "auxiliary_loss_clip": 0.01065742, "auxiliary_loss_mlp": 0.01037995, "balance_loss_clip": 1.03712797, "balance_loss_mlp": 1.02581131, "epoch": 0.7952502630392304, "flos": 35695672871040.0, "grad_norm": 2.811230996366362, "language_loss": 0.69988328, "learning_rate": 4.237733724976349e-07, "loss": 0.72092068, "num_input_tokens_seen": 285386190, "step": 13227, "time_per_iteration": 2.88179874420166 }, { "auxiliary_loss_clip": 0.01065232, "auxiliary_loss_mlp": 0.01029863, "balance_loss_clip": 1.03389943, "balance_loss_mlp": 1.01914048, "epoch": 0.7953103862918984, "flos": 25629840869760.0, "grad_norm": 1.5862839127208228, "language_loss": 0.69230592, "learning_rate": 4.2353367696489184e-07, "loss": 0.71325696, "num_input_tokens_seen": 285406150, "step": 13228, "time_per_iteration": 2.9039552211761475 }, { "auxiliary_loss_clip": 0.01062042, "auxiliary_loss_mlp": 0.01046101, "balance_loss_clip": 1.03289461, "balance_loss_mlp": 1.03178382, "epoch": 0.7953705095445663, "flos": 40551980676480.0, "grad_norm": 1.5155440892228063, "language_loss": 0.70645332, "learning_rate": 4.232940412119095e-07, "loss": 0.72753471, "num_input_tokens_seen": 285429900, "step": 13229, "time_per_iteration": 2.9804372787475586 }, { "auxiliary_loss_clip": 0.01103757, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.03948689, "balance_loss_mlp": 1.02086771, "epoch": 0.7954306327972344, "flos": 27636672706560.0, "grad_norm": 2.202823116168555, "language_loss": 0.71696305, "learning_rate": 4.2305446524777457e-07, "loss": 0.73833489, "num_input_tokens_seen": 285452555, "step": 13230, "time_per_iteration": 2.8171424865722656 }, { "auxiliary_loss_clip": 0.0101259, "auxiliary_loss_mlp": 0.01002419, "balance_loss_clip": 1.00992072, "balance_loss_mlp": 1.00133443, "epoch": 0.7954907560499023, "flos": 59504055995520.0, "grad_norm": 0.8970251417868289, "language_loss": 0.63560265, "learning_rate": 4.2281494908157247e-07, "loss": 0.65575272, "num_input_tokens_seen": 285515700, "step": 13231, "time_per_iteration": 3.281342029571533 }, { "auxiliary_loss_clip": 0.01086059, "auxiliary_loss_mlp": 0.01028708, "balance_loss_clip": 1.03541231, "balance_loss_mlp": 1.01657856, "epoch": 0.7955508793025703, "flos": 20120533764480.0, "grad_norm": 4.745461781703955, "language_loss": 0.69967991, "learning_rate": 4.2257549272238566e-07, "loss": 0.72082758, "num_input_tokens_seen": 285533910, "step": 13232, "time_per_iteration": 2.6862258911132812 }, { "auxiliary_loss_clip": 0.01098188, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.03567708, "balance_loss_mlp": 1.01753175, "epoch": 0.7956110025552382, "flos": 26505378881280.0, "grad_norm": 3.7128388079610075, "language_loss": 0.77988273, "learning_rate": 4.223360961792952e-07, "loss": 0.80116582, "num_input_tokens_seen": 285554080, "step": 13233, "time_per_iteration": 2.755737066268921 }, { "auxiliary_loss_clip": 0.01099521, "auxiliary_loss_mlp": 0.01032696, "balance_loss_clip": 1.03679132, "balance_loss_mlp": 1.02042377, "epoch": 0.7956711258079062, "flos": 22565475786240.0, "grad_norm": 1.9443930320320317, "language_loss": 0.79183459, "learning_rate": 4.220967594613769e-07, "loss": 0.81315672, "num_input_tokens_seen": 285572325, "step": 13234, "time_per_iteration": 2.7519893646240234 }, { "auxiliary_loss_clip": 0.01089518, "auxiliary_loss_mlp": 0.00769637, "balance_loss_clip": 1.03883278, "balance_loss_mlp": 1.00016356, "epoch": 0.7957312490605741, "flos": 17379005143680.0, "grad_norm": 1.963343394843674, "language_loss": 0.69879019, "learning_rate": 4.218574825777077e-07, "loss": 0.71738172, "num_input_tokens_seen": 285589770, "step": 13235, "time_per_iteration": 2.6992905139923096 }, { "auxiliary_loss_clip": 0.01072089, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.03438449, "balance_loss_mlp": 1.0185138, "epoch": 0.7957913723132422, "flos": 22491427898880.0, "grad_norm": 1.4985866886242822, "language_loss": 0.6796065, "learning_rate": 4.2161826553736145e-07, "loss": 0.70064157, "num_input_tokens_seen": 285610065, "step": 13236, "time_per_iteration": 2.7930455207824707 }, { "auxiliary_loss_clip": 0.01062113, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.03622985, "balance_loss_mlp": 1.01748657, "epoch": 0.7958514955659101, "flos": 22638087129600.0, "grad_norm": 1.6336623511601824, "language_loss": 0.75105399, "learning_rate": 4.2137910834940826e-07, "loss": 0.7719779, "num_input_tokens_seen": 285628480, "step": 13237, "time_per_iteration": 2.8149497509002686 }, { "auxiliary_loss_clip": 0.01100352, "auxiliary_loss_mlp": 0.0103538, "balance_loss_clip": 1.03876448, "balance_loss_mlp": 1.02211833, "epoch": 0.7959116188185781, "flos": 20704225772160.0, "grad_norm": 2.4969872572253067, "language_loss": 0.71244603, "learning_rate": 4.211400110229175e-07, "loss": 0.73380333, "num_input_tokens_seen": 285647805, "step": 13238, "time_per_iteration": 2.650225877761841 }, { "auxiliary_loss_clip": 0.01093003, "auxiliary_loss_mlp": 0.01028042, "balance_loss_clip": 1.03492129, "balance_loss_mlp": 1.01565003, "epoch": 0.7959717420712461, "flos": 19024683684480.0, "grad_norm": 2.0234207796888666, "language_loss": 0.74033141, "learning_rate": 4.2090097356695684e-07, "loss": 0.7615419, "num_input_tokens_seen": 285665505, "step": 13239, "time_per_iteration": 2.68799090385437 }, { "auxiliary_loss_clip": 0.01113057, "auxiliary_loss_mlp": 0.01034221, "balance_loss_clip": 1.03780365, "balance_loss_mlp": 1.02156138, "epoch": 0.796031865323914, "flos": 26356636661760.0, "grad_norm": 2.1969833935070953, "language_loss": 0.69364315, "learning_rate": 4.2066199599058814e-07, "loss": 0.7151159, "num_input_tokens_seen": 285685855, "step": 13240, "time_per_iteration": 4.224658250808716 }, { "auxiliary_loss_clip": 0.01024595, "auxiliary_loss_mlp": 0.00998488, "balance_loss_clip": 1.01116359, "balance_loss_mlp": 0.99737942, "epoch": 0.796091988576582, "flos": 62069440320000.0, "grad_norm": 0.887431308267293, "language_loss": 0.58674192, "learning_rate": 4.2042307830287526e-07, "loss": 0.60697281, "num_input_tokens_seen": 285735710, "step": 13241, "time_per_iteration": 4.535626649856567 }, { "auxiliary_loss_clip": 0.01078843, "auxiliary_loss_mlp": 0.01030817, "balance_loss_clip": 1.03829169, "balance_loss_mlp": 1.01925421, "epoch": 0.7961521118292499, "flos": 39020103400320.0, "grad_norm": 1.593674462626725, "language_loss": 0.64147931, "learning_rate": 4.201842205128772e-07, "loss": 0.66257584, "num_input_tokens_seen": 285757045, "step": 13242, "time_per_iteration": 4.472386598587036 }, { "auxiliary_loss_clip": 0.01110267, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.03763533, "balance_loss_mlp": 1.02198589, "epoch": 0.796212235081918, "flos": 21762836426880.0, "grad_norm": 1.8778627225113254, "language_loss": 0.75913978, "learning_rate": 4.199454226296526e-07, "loss": 0.78059125, "num_input_tokens_seen": 285776050, "step": 13243, "time_per_iteration": 2.590519666671753 }, { "auxiliary_loss_clip": 0.01085583, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.04038298, "balance_loss_mlp": 1.01900232, "epoch": 0.7962723583345859, "flos": 21178857110400.0, "grad_norm": 1.6501275630789378, "language_loss": 0.79442871, "learning_rate": 4.1970668466225565e-07, "loss": 0.81560457, "num_input_tokens_seen": 285796830, "step": 13244, "time_per_iteration": 2.752902030944824 }, { "auxiliary_loss_clip": 0.01102665, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.03597069, "balance_loss_mlp": 1.01640284, "epoch": 0.7963324815872539, "flos": 17128636369920.0, "grad_norm": 2.172628764552698, "language_loss": 0.68508917, "learning_rate": 4.1946800661973934e-07, "loss": 0.70641208, "num_input_tokens_seen": 285814755, "step": 13245, "time_per_iteration": 2.5828065872192383 }, { "auxiliary_loss_clip": 0.01090189, "auxiliary_loss_mlp": 0.01034231, "balance_loss_clip": 1.03773546, "balance_loss_mlp": 1.02139258, "epoch": 0.7963926048399218, "flos": 21397481239680.0, "grad_norm": 1.806241454752508, "language_loss": 0.79336578, "learning_rate": 4.192293885111549e-07, "loss": 0.81461, "num_input_tokens_seen": 285834255, "step": 13246, "time_per_iteration": 2.6900124549865723 }, { "auxiliary_loss_clip": 0.01090986, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.03666353, "balance_loss_mlp": 1.01834822, "epoch": 0.7964527280925898, "flos": 25184188828800.0, "grad_norm": 2.003867832970485, "language_loss": 0.66143036, "learning_rate": 4.1899083034555007e-07, "loss": 0.6826514, "num_input_tokens_seen": 285853540, "step": 13247, "time_per_iteration": 2.6503524780273438 }, { "auxiliary_loss_clip": 0.01085363, "auxiliary_loss_mlp": 0.01029081, "balance_loss_clip": 1.0366044, "balance_loss_mlp": 1.01764846, "epoch": 0.7965128513452577, "flos": 27015884928000.0, "grad_norm": 2.040458459238989, "language_loss": 0.71853489, "learning_rate": 4.1875233213197123e-07, "loss": 0.73967934, "num_input_tokens_seen": 285872705, "step": 13248, "time_per_iteration": 4.260182857513428 }, { "auxiliary_loss_clip": 0.0109327, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.03817999, "balance_loss_mlp": 1.01565921, "epoch": 0.7965729745979258, "flos": 24419578993920.0, "grad_norm": 2.2291806355034507, "language_loss": 0.76553303, "learning_rate": 4.1851389387946255e-07, "loss": 0.78675187, "num_input_tokens_seen": 285890290, "step": 13249, "time_per_iteration": 2.6802589893341064 }, { "auxiliary_loss_clip": 0.01082795, "auxiliary_loss_mlp": 0.01029897, "balance_loss_clip": 1.03743911, "balance_loss_mlp": 1.01721263, "epoch": 0.7966330978505937, "flos": 18840389978880.0, "grad_norm": 2.0056248298720623, "language_loss": 0.61770105, "learning_rate": 4.1827551559706674e-07, "loss": 0.63882804, "num_input_tokens_seen": 285909190, "step": 13250, "time_per_iteration": 2.7855334281921387 }, { "auxiliary_loss_clip": 0.01088491, "auxiliary_loss_mlp": 0.01027346, "balance_loss_clip": 1.03615296, "balance_loss_mlp": 1.01445389, "epoch": 0.7966932211032617, "flos": 13152319862400.0, "grad_norm": 2.1520508995588523, "language_loss": 0.72124857, "learning_rate": 4.180371972938206e-07, "loss": 0.74240696, "num_input_tokens_seen": 285927570, "step": 13251, "time_per_iteration": 2.7121150493621826 }, { "auxiliary_loss_clip": 0.01116, "auxiliary_loss_mlp": 0.01031729, "balance_loss_clip": 1.0401994, "balance_loss_mlp": 1.01820469, "epoch": 0.7967533443559297, "flos": 23949760078080.0, "grad_norm": 2.6256177602060395, "language_loss": 0.72742116, "learning_rate": 4.177989389787624e-07, "loss": 0.74889851, "num_input_tokens_seen": 285945810, "step": 13252, "time_per_iteration": 2.582284927368164 }, { "auxiliary_loss_clip": 0.01109038, "auxiliary_loss_mlp": 0.01028059, "balance_loss_clip": 1.03879833, "balance_loss_mlp": 1.01554191, "epoch": 0.7968134676085976, "flos": 30368791964160.0, "grad_norm": 1.5712453668855284, "language_loss": 0.66325545, "learning_rate": 4.175607406609278e-07, "loss": 0.68462646, "num_input_tokens_seen": 285964235, "step": 13253, "time_per_iteration": 2.6929616928100586 }, { "auxiliary_loss_clip": 0.0108594, "auxiliary_loss_mlp": 0.01036955, "balance_loss_clip": 1.0418272, "balance_loss_mlp": 1.0236156, "epoch": 0.7968735908612656, "flos": 23075048079360.0, "grad_norm": 1.5829772200812473, "language_loss": 0.67843878, "learning_rate": 4.1732260234934767e-07, "loss": 0.69966775, "num_input_tokens_seen": 285983710, "step": 13254, "time_per_iteration": 2.7649550437927246 }, { "auxiliary_loss_clip": 0.01098933, "auxiliary_loss_mlp": 0.01034065, "balance_loss_clip": 1.03641415, "balance_loss_mlp": 1.02192962, "epoch": 0.7969337141139335, "flos": 23582250074880.0, "grad_norm": 1.8731034083925706, "language_loss": 0.70037842, "learning_rate": 4.1708452405305314e-07, "loss": 0.72170842, "num_input_tokens_seen": 286003425, "step": 13255, "time_per_iteration": 2.6560351848602295 }, { "auxiliary_loss_clip": 0.01108119, "auxiliary_loss_mlp": 0.01031031, "balance_loss_clip": 1.03694665, "balance_loss_mlp": 1.01906836, "epoch": 0.7969938373666016, "flos": 19755860935680.0, "grad_norm": 2.1084612697613268, "language_loss": 0.79501426, "learning_rate": 4.168465057810733e-07, "loss": 0.81640577, "num_input_tokens_seen": 286020130, "step": 13256, "time_per_iteration": 2.6129326820373535 }, { "auxiliary_loss_clip": 0.01098682, "auxiliary_loss_mlp": 0.01025793, "balance_loss_clip": 1.03868675, "balance_loss_mlp": 1.01325274, "epoch": 0.7970539606192695, "flos": 24134089697280.0, "grad_norm": 1.6974660367757792, "language_loss": 0.66300124, "learning_rate": 4.166085475424315e-07, "loss": 0.68424594, "num_input_tokens_seen": 286040230, "step": 13257, "time_per_iteration": 2.6830172538757324 }, { "auxiliary_loss_clip": 0.0109134, "auxiliary_loss_mlp": 0.01034065, "balance_loss_clip": 1.0377934, "balance_loss_mlp": 1.02150643, "epoch": 0.7971140838719375, "flos": 17968622895360.0, "grad_norm": 1.9146410977072226, "language_loss": 0.72192776, "learning_rate": 4.163706493461523e-07, "loss": 0.74318182, "num_input_tokens_seen": 286059475, "step": 13258, "time_per_iteration": 2.661726236343384 }, { "auxiliary_loss_clip": 0.01100938, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.03692877, "balance_loss_mlp": 1.0205518, "epoch": 0.7971742071246054, "flos": 19169547235200.0, "grad_norm": 1.8087181306355609, "language_loss": 0.68977499, "learning_rate": 4.1613281120125655e-07, "loss": 0.71112633, "num_input_tokens_seen": 286077820, "step": 13259, "time_per_iteration": 2.611186981201172 }, { "auxiliary_loss_clip": 0.01096475, "auxiliary_loss_mlp": 0.01030487, "balance_loss_clip": 1.03723931, "balance_loss_mlp": 1.01854253, "epoch": 0.7972343303772734, "flos": 27125951178240.0, "grad_norm": 2.1313633169820547, "language_loss": 0.73609447, "learning_rate": 4.158950331167641e-07, "loss": 0.75736415, "num_input_tokens_seen": 286097285, "step": 13260, "time_per_iteration": 2.699951648712158 }, { "auxiliary_loss_clip": 0.010819, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.03271997, "balance_loss_mlp": 1.02031517, "epoch": 0.7972944536299413, "flos": 20996646393600.0, "grad_norm": 1.836032369081443, "language_loss": 0.78399926, "learning_rate": 4.1565731510169065e-07, "loss": 0.80514264, "num_input_tokens_seen": 286116000, "step": 13261, "time_per_iteration": 2.6140952110290527 }, { "auxiliary_loss_clip": 0.01095642, "auxiliary_loss_mlp": 0.01030129, "balance_loss_clip": 1.03774393, "balance_loss_mlp": 1.0191493, "epoch": 0.7973545768826094, "flos": 21580015178880.0, "grad_norm": 1.439588217770827, "language_loss": 0.76199102, "learning_rate": 4.154196571650501e-07, "loss": 0.78324872, "num_input_tokens_seen": 286135110, "step": 13262, "time_per_iteration": 2.7024636268615723 }, { "auxiliary_loss_clip": 0.01082139, "auxiliary_loss_mlp": 0.01033411, "balance_loss_clip": 1.03903556, "balance_loss_mlp": 1.0191412, "epoch": 0.7974147001352773, "flos": 20558536208640.0, "grad_norm": 2.631651732945755, "language_loss": 0.70419514, "learning_rate": 4.1518205931585524e-07, "loss": 0.72535068, "num_input_tokens_seen": 286152835, "step": 13263, "time_per_iteration": 2.72177791595459 }, { "auxiliary_loss_clip": 0.01103923, "auxiliary_loss_mlp": 0.01038655, "balance_loss_clip": 1.03778529, "balance_loss_mlp": 1.02499938, "epoch": 0.7974748233879453, "flos": 20996790048000.0, "grad_norm": 2.0043756449172547, "language_loss": 0.70802379, "learning_rate": 4.149445215631153e-07, "loss": 0.72944963, "num_input_tokens_seen": 286171785, "step": 13264, "time_per_iteration": 2.706388473510742 }, { "auxiliary_loss_clip": 0.01107469, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.03775704, "balance_loss_mlp": 1.02225232, "epoch": 0.7975349466406133, "flos": 22565188477440.0, "grad_norm": 1.891498852375028, "language_loss": 0.76922268, "learning_rate": 4.1470704391583776e-07, "loss": 0.79063523, "num_input_tokens_seen": 286190420, "step": 13265, "time_per_iteration": 2.6580817699432373 }, { "auxiliary_loss_clip": 0.01080723, "auxiliary_loss_mlp": 0.01027162, "balance_loss_clip": 1.0393877, "balance_loss_mlp": 1.0149374, "epoch": 0.7975950698932812, "flos": 21689542725120.0, "grad_norm": 2.280360071674855, "language_loss": 0.75571597, "learning_rate": 4.144696263830285e-07, "loss": 0.77679479, "num_input_tokens_seen": 286210105, "step": 13266, "time_per_iteration": 2.707306146621704 }, { "auxiliary_loss_clip": 0.01083885, "auxiliary_loss_mlp": 0.01026943, "balance_loss_clip": 1.03626752, "balance_loss_mlp": 1.01505208, "epoch": 0.7976551931459492, "flos": 19604568850560.0, "grad_norm": 7.354197908727964, "language_loss": 0.84225118, "learning_rate": 4.1423226897369015e-07, "loss": 0.86335951, "num_input_tokens_seen": 286228180, "step": 13267, "time_per_iteration": 2.6513888835906982 }, { "auxiliary_loss_clip": 0.01095515, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.03541887, "balance_loss_mlp": 1.01869643, "epoch": 0.7977153163986171, "flos": 21687603390720.0, "grad_norm": 1.5920140883630767, "language_loss": 0.76201731, "learning_rate": 4.139949716968223e-07, "loss": 0.7832889, "num_input_tokens_seen": 286247305, "step": 13268, "time_per_iteration": 2.7020766735076904 }, { "auxiliary_loss_clip": 0.01109132, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.03789496, "balance_loss_mlp": 1.0182898, "epoch": 0.7977754396512852, "flos": 23476780765440.0, "grad_norm": 1.5724932567080838, "language_loss": 0.77637428, "learning_rate": 4.1375773456142403e-07, "loss": 0.79777002, "num_input_tokens_seen": 286268145, "step": 13269, "time_per_iteration": 2.6634888648986816 }, { "auxiliary_loss_clip": 0.01090369, "auxiliary_loss_mlp": 0.01042032, "balance_loss_clip": 1.03390729, "balance_loss_mlp": 1.02950919, "epoch": 0.7978355629039531, "flos": 22382223575040.0, "grad_norm": 1.6845375324844267, "language_loss": 0.82535768, "learning_rate": 4.135205575764922e-07, "loss": 0.84668171, "num_input_tokens_seen": 286286775, "step": 13270, "time_per_iteration": 2.684476613998413 }, { "auxiliary_loss_clip": 0.01068469, "auxiliary_loss_mlp": 0.01040513, "balance_loss_clip": 1.03474867, "balance_loss_mlp": 1.02632725, "epoch": 0.7978956861566211, "flos": 20266331068800.0, "grad_norm": 1.5659305382034026, "language_loss": 0.59210402, "learning_rate": 4.1328344075101905e-07, "loss": 0.61319387, "num_input_tokens_seen": 286305590, "step": 13271, "time_per_iteration": 2.860095262527466 }, { "auxiliary_loss_clip": 0.01090884, "auxiliary_loss_mlp": 0.01031592, "balance_loss_clip": 1.03714991, "balance_loss_mlp": 1.01914704, "epoch": 0.797955809409289, "flos": 28112417366400.0, "grad_norm": 1.4518492514226418, "language_loss": 0.73159599, "learning_rate": 4.130463840939975e-07, "loss": 0.75282073, "num_input_tokens_seen": 286328050, "step": 13272, "time_per_iteration": 2.770979881286621 }, { "auxiliary_loss_clip": 0.01046384, "auxiliary_loss_mlp": 0.01036557, "balance_loss_clip": 1.03212595, "balance_loss_mlp": 1.023736, "epoch": 0.798015932661957, "flos": 15559591495680.0, "grad_norm": 2.073152053590518, "language_loss": 0.71566808, "learning_rate": 4.128093876144161e-07, "loss": 0.73649746, "num_input_tokens_seen": 286345265, "step": 13273, "time_per_iteration": 2.7531182765960693 }, { "auxiliary_loss_clip": 0.0108926, "auxiliary_loss_mlp": 0.01034875, "balance_loss_clip": 1.03732777, "balance_loss_mlp": 1.02203608, "epoch": 0.7980760559146249, "flos": 23951196622080.0, "grad_norm": 1.7484854884585128, "language_loss": 0.75765157, "learning_rate": 4.1257245132126117e-07, "loss": 0.77889293, "num_input_tokens_seen": 286364465, "step": 13274, "time_per_iteration": 2.788862705230713 }, { "auxiliary_loss_clip": 0.0105609, "auxiliary_loss_mlp": 0.01027938, "balance_loss_clip": 1.03353024, "balance_loss_mlp": 1.01679242, "epoch": 0.798136179167293, "flos": 28038082170240.0, "grad_norm": 1.3747811855715935, "language_loss": 0.77784944, "learning_rate": 4.12335575223518e-07, "loss": 0.79868966, "num_input_tokens_seen": 286385565, "step": 13275, "time_per_iteration": 2.823310375213623 }, { "auxiliary_loss_clip": 0.01100598, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.03790784, "balance_loss_mlp": 1.02598548, "epoch": 0.7981963024199609, "flos": 35984538046080.0, "grad_norm": 1.8295595288590525, "language_loss": 0.63964415, "learning_rate": 4.1209875933016877e-07, "loss": 0.66104394, "num_input_tokens_seen": 286403950, "step": 13276, "time_per_iteration": 2.6914138793945312 }, { "auxiliary_loss_clip": 0.01067297, "auxiliary_loss_mlp": 0.01030718, "balance_loss_clip": 1.03446054, "balance_loss_mlp": 1.01858199, "epoch": 0.7982564256726289, "flos": 25884914325120.0, "grad_norm": 1.804313446176304, "language_loss": 0.61235017, "learning_rate": 4.118620036501945e-07, "loss": 0.63333035, "num_input_tokens_seen": 286426160, "step": 13277, "time_per_iteration": 2.7913875579833984 }, { "auxiliary_loss_clip": 0.0108732, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.03842235, "balance_loss_mlp": 1.0209415, "epoch": 0.7983165489252969, "flos": 25739152934400.0, "grad_norm": 1.9796578843322905, "language_loss": 0.79335415, "learning_rate": 4.1162530819257227e-07, "loss": 0.81456405, "num_input_tokens_seen": 286446610, "step": 13278, "time_per_iteration": 2.69783353805542 }, { "auxiliary_loss_clip": 0.01089196, "auxiliary_loss_mlp": 0.01040766, "balance_loss_clip": 1.03595579, "balance_loss_mlp": 1.0271821, "epoch": 0.7983766721779648, "flos": 21908202768000.0, "grad_norm": 1.9939125008903142, "language_loss": 0.62796175, "learning_rate": 4.113886729662768e-07, "loss": 0.64926136, "num_input_tokens_seen": 286465460, "step": 13279, "time_per_iteration": 2.6455893516540527 }, { "auxiliary_loss_clip": 0.01093985, "auxiliary_loss_mlp": 0.01029527, "balance_loss_clip": 1.03608727, "balance_loss_mlp": 1.01845241, "epoch": 0.7984367954306328, "flos": 29347420734720.0, "grad_norm": 2.0437348521937633, "language_loss": 0.71019673, "learning_rate": 4.111520979802825e-07, "loss": 0.73143184, "num_input_tokens_seen": 286485720, "step": 13280, "time_per_iteration": 5.853861093521118 }, { "auxiliary_loss_clip": 0.01071418, "auxiliary_loss_mlp": 0.01042718, "balance_loss_clip": 1.03524828, "balance_loss_mlp": 1.02807951, "epoch": 0.7984969186833007, "flos": 31357772104320.0, "grad_norm": 1.7977133455003094, "language_loss": 0.62786448, "learning_rate": 4.1091558324355955e-07, "loss": 0.64900589, "num_input_tokens_seen": 286507465, "step": 13281, "time_per_iteration": 2.8363935947418213 }, { "auxiliary_loss_clip": 0.01098858, "auxiliary_loss_mlp": 0.01032968, "balance_loss_clip": 1.03470564, "balance_loss_mlp": 1.0203495, "epoch": 0.7985570419359688, "flos": 24312924535680.0, "grad_norm": 2.135522103798107, "language_loss": 0.80706322, "learning_rate": 4.1067912876507683e-07, "loss": 0.82838148, "num_input_tokens_seen": 286526345, "step": 13282, "time_per_iteration": 4.3146162033081055 }, { "auxiliary_loss_clip": 0.01075396, "auxiliary_loss_mlp": 0.00770211, "balance_loss_clip": 1.03265977, "balance_loss_mlp": 1.00023508, "epoch": 0.7986171651886367, "flos": 15742233175680.0, "grad_norm": 1.7496465983827643, "language_loss": 0.71291137, "learning_rate": 4.10442734553802e-07, "loss": 0.73136741, "num_input_tokens_seen": 286544095, "step": 13283, "time_per_iteration": 2.7113521099090576 }, { "auxiliary_loss_clip": 0.01094572, "auxiliary_loss_mlp": 0.01026527, "balance_loss_clip": 1.03506041, "balance_loss_mlp": 1.01502371, "epoch": 0.7986772884413047, "flos": 11619401091840.0, "grad_norm": 1.8142883767804951, "language_loss": 0.73932701, "learning_rate": 4.102064006186967e-07, "loss": 0.76053798, "num_input_tokens_seen": 286560960, "step": 13284, "time_per_iteration": 2.690788984298706 }, { "auxiliary_loss_clip": 0.01081168, "auxiliary_loss_mlp": 0.01036951, "balance_loss_clip": 1.03430653, "balance_loss_mlp": 1.02556038, "epoch": 0.7987374116939726, "flos": 22091059929600.0, "grad_norm": 2.8983502428316252, "language_loss": 0.70378709, "learning_rate": 4.0997012696872415e-07, "loss": 0.72496831, "num_input_tokens_seen": 286579865, "step": 13285, "time_per_iteration": 2.6703269481658936 }, { "auxiliary_loss_clip": 0.01080639, "auxiliary_loss_mlp": 0.01033039, "balance_loss_clip": 1.03381705, "balance_loss_mlp": 1.02097476, "epoch": 0.7987975349466406, "flos": 17890696339200.0, "grad_norm": 1.6695326991809423, "language_loss": 0.7404871, "learning_rate": 4.097339136128437e-07, "loss": 0.76162386, "num_input_tokens_seen": 286597295, "step": 13286, "time_per_iteration": 2.663839817047119 }, { "auxiliary_loss_clip": 0.01087446, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.03605843, "balance_loss_mlp": 1.01811767, "epoch": 0.7988576581993085, "flos": 19719232041600.0, "grad_norm": 1.9331179632037672, "language_loss": 0.75270319, "learning_rate": 4.0949776056001296e-07, "loss": 0.77388239, "num_input_tokens_seen": 286616270, "step": 13287, "time_per_iteration": 2.6603620052337646 }, { "auxiliary_loss_clip": 0.01086627, "auxiliary_loss_mlp": 0.01029452, "balance_loss_clip": 1.03799939, "balance_loss_mlp": 1.01714361, "epoch": 0.7989177814519766, "flos": 28036358317440.0, "grad_norm": 1.5251443213363312, "language_loss": 0.61793303, "learning_rate": 4.092616678191863e-07, "loss": 0.63909382, "num_input_tokens_seen": 286638315, "step": 13288, "time_per_iteration": 4.3285603523254395 }, { "auxiliary_loss_clip": 0.01098321, "auxiliary_loss_mlp": 0.01032216, "balance_loss_clip": 1.03874183, "balance_loss_mlp": 1.02039015, "epoch": 0.7989779047046445, "flos": 28871029630080.0, "grad_norm": 2.003655756829568, "language_loss": 0.70842254, "learning_rate": 4.090256353993169e-07, "loss": 0.72972792, "num_input_tokens_seen": 286658630, "step": 13289, "time_per_iteration": 2.754244089126587 }, { "auxiliary_loss_clip": 0.01077989, "auxiliary_loss_mlp": 0.01036331, "balance_loss_clip": 1.0401969, "balance_loss_mlp": 1.02322364, "epoch": 0.7990380279573125, "flos": 18186887888640.0, "grad_norm": 2.067060536008121, "language_loss": 0.62479776, "learning_rate": 4.0878966330935506e-07, "loss": 0.64594096, "num_input_tokens_seen": 286676870, "step": 13290, "time_per_iteration": 2.7182984352111816 }, { "auxiliary_loss_clip": 0.01102224, "auxiliary_loss_mlp": 0.0103293, "balance_loss_clip": 1.03841472, "balance_loss_mlp": 1.01973963, "epoch": 0.7990981512099805, "flos": 20879936127360.0, "grad_norm": 2.07432932467733, "language_loss": 0.71562916, "learning_rate": 4.08553751558248e-07, "loss": 0.73698068, "num_input_tokens_seen": 286694300, "step": 13291, "time_per_iteration": 2.679877281188965 }, { "auxiliary_loss_clip": 0.01071725, "auxiliary_loss_mlp": 0.01028448, "balance_loss_clip": 1.03726125, "balance_loss_mlp": 1.01692605, "epoch": 0.7991582744626484, "flos": 26099911180800.0, "grad_norm": 1.4271226582537684, "language_loss": 0.63687944, "learning_rate": 4.083179001549422e-07, "loss": 0.65788114, "num_input_tokens_seen": 286714545, "step": 13292, "time_per_iteration": 2.7268645763397217 }, { "auxiliary_loss_clip": 0.01097914, "auxiliary_loss_mlp": 0.0103158, "balance_loss_clip": 1.03674936, "balance_loss_mlp": 1.0198257, "epoch": 0.7992183977153164, "flos": 35295843605760.0, "grad_norm": 1.6084532273776246, "language_loss": 0.56303227, "learning_rate": 4.0808210910838105e-07, "loss": 0.58432722, "num_input_tokens_seen": 286734525, "step": 13293, "time_per_iteration": 2.7652106285095215 }, { "auxiliary_loss_clip": 0.0108332, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.03898919, "balance_loss_mlp": 1.02236032, "epoch": 0.7992785209679844, "flos": 51853426577280.0, "grad_norm": 6.931153518532829, "language_loss": 0.71501821, "learning_rate": 4.0784637842750704e-07, "loss": 0.73620194, "num_input_tokens_seen": 286753430, "step": 13294, "time_per_iteration": 2.9734227657318115 }, { "auxiliary_loss_clip": 0.01071635, "auxiliary_loss_mlp": 0.01033492, "balance_loss_clip": 1.03379464, "balance_loss_mlp": 1.02083826, "epoch": 0.7993386442206524, "flos": 22565116650240.0, "grad_norm": 1.9346589994202708, "language_loss": 0.72097647, "learning_rate": 4.0761070812125675e-07, "loss": 0.74202782, "num_input_tokens_seen": 286771915, "step": 13295, "time_per_iteration": 2.8697874546051025 }, { "auxiliary_loss_clip": 0.0107528, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.03569388, "balance_loss_mlp": 1.02194977, "epoch": 0.7993987674733203, "flos": 18800277465600.0, "grad_norm": 1.7062921905810151, "language_loss": 0.75847328, "learning_rate": 4.0737509819856797e-07, "loss": 0.77955961, "num_input_tokens_seen": 286789835, "step": 13296, "time_per_iteration": 2.851438522338867 }, { "auxiliary_loss_clip": 0.00998558, "auxiliary_loss_mlp": 0.01004815, "balance_loss_clip": 1.00716496, "balance_loss_mlp": 1.00364101, "epoch": 0.7994588907259883, "flos": 69421720394880.0, "grad_norm": 0.714455868109846, "language_loss": 0.60823548, "learning_rate": 4.0713954866837573e-07, "loss": 0.6282692, "num_input_tokens_seen": 286855580, "step": 13297, "time_per_iteration": 3.307276725769043 }, { "auxiliary_loss_clip": 0.01086945, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.03667986, "balance_loss_mlp": 1.02218103, "epoch": 0.7995190139786562, "flos": 13480327883520.0, "grad_norm": 2.2332538895333482, "language_loss": 0.70562863, "learning_rate": 4.0690405953961073e-07, "loss": 0.72683859, "num_input_tokens_seen": 286874360, "step": 13298, "time_per_iteration": 2.764620542526245 }, { "auxiliary_loss_clip": 0.01073541, "auxiliary_loss_mlp": 0.01036123, "balance_loss_clip": 1.03546071, "balance_loss_mlp": 1.0225215, "epoch": 0.7995791372313242, "flos": 21652842003840.0, "grad_norm": 2.144443690498565, "language_loss": 0.75778526, "learning_rate": 4.066686308212037e-07, "loss": 0.77888191, "num_input_tokens_seen": 286891950, "step": 13299, "time_per_iteration": 2.7200376987457275 }, { "auxiliary_loss_clip": 0.0108171, "auxiliary_loss_mlp": 0.01035616, "balance_loss_clip": 1.03365636, "balance_loss_mlp": 1.02388, "epoch": 0.7996392604839921, "flos": 26068130622720.0, "grad_norm": 1.914646005951808, "language_loss": 0.77740645, "learning_rate": 4.064332625220828e-07, "loss": 0.79857981, "num_input_tokens_seen": 286911725, "step": 13300, "time_per_iteration": 3.0327885150909424 }, { "auxiliary_loss_clip": 0.01066534, "auxiliary_loss_mlp": 0.01041633, "balance_loss_clip": 1.03083372, "balance_loss_mlp": 1.02648187, "epoch": 0.7996993837366602, "flos": 24606889441920.0, "grad_norm": 1.7486826819933081, "language_loss": 0.6372295, "learning_rate": 4.0619795465117115e-07, "loss": 0.65831113, "num_input_tokens_seen": 286931400, "step": 13301, "time_per_iteration": 2.797971725463867 }, { "auxiliary_loss_clip": 0.01096682, "auxiliary_loss_mlp": 0.01034454, "balance_loss_clip": 1.03674114, "balance_loss_mlp": 1.02209187, "epoch": 0.7997595069893281, "flos": 20992049452800.0, "grad_norm": 2.21348387588423, "language_loss": 0.71967971, "learning_rate": 4.059627072173928e-07, "loss": 0.74099112, "num_input_tokens_seen": 286949795, "step": 13302, "time_per_iteration": 2.874833822250366 }, { "auxiliary_loss_clip": 0.01111886, "auxiliary_loss_mlp": 0.00770697, "balance_loss_clip": 1.03792214, "balance_loss_mlp": 1.0001955, "epoch": 0.7998196302419961, "flos": 24426510318720.0, "grad_norm": 2.0232516764799953, "language_loss": 0.83735251, "learning_rate": 4.057275202296684e-07, "loss": 0.8561784, "num_input_tokens_seen": 286968805, "step": 13303, "time_per_iteration": 2.73748779296875 }, { "auxiliary_loss_clip": 0.01106654, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.03686202, "balance_loss_mlp": 1.02197862, "epoch": 0.7998797534946641, "flos": 30264651457920.0, "grad_norm": 1.7050821885070455, "language_loss": 0.58436215, "learning_rate": 4.054923936969166e-07, "loss": 0.60575771, "num_input_tokens_seen": 286990235, "step": 13304, "time_per_iteration": 2.6886215209960938 }, { "auxiliary_loss_clip": 0.01111166, "auxiliary_loss_mlp": 0.01031185, "balance_loss_clip": 1.03615296, "balance_loss_mlp": 1.01842976, "epoch": 0.799939876747332, "flos": 23513984277120.0, "grad_norm": 1.709353821854052, "language_loss": 0.6893419, "learning_rate": 4.0525732762805265e-07, "loss": 0.71076536, "num_input_tokens_seen": 287011060, "step": 13305, "time_per_iteration": 2.6649460792541504 }, { "auxiliary_loss_clip": 0.01072914, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.0366323, "balance_loss_mlp": 1.01584458, "epoch": 0.8, "flos": 19318109886720.0, "grad_norm": 1.5790890199142242, "language_loss": 0.69499552, "learning_rate": 4.0502232203199107e-07, "loss": 0.71599531, "num_input_tokens_seen": 287029215, "step": 13306, "time_per_iteration": 2.7563791275024414 }, { "auxiliary_loss_clip": 0.01101067, "auxiliary_loss_mlp": 0.0103442, "balance_loss_clip": 1.03880584, "balance_loss_mlp": 1.02221918, "epoch": 0.800060123252668, "flos": 32412432263040.0, "grad_norm": 1.5764355485932124, "language_loss": 0.69476044, "learning_rate": 4.0478737691764286e-07, "loss": 0.71611536, "num_input_tokens_seen": 287050855, "step": 13307, "time_per_iteration": 2.732285737991333 }, { "auxiliary_loss_clip": 0.01085939, "auxiliary_loss_mlp": 0.01036111, "balance_loss_clip": 1.0350318, "balance_loss_mlp": 1.02393389, "epoch": 0.800120246505336, "flos": 20010611168640.0, "grad_norm": 1.8640379762112131, "language_loss": 0.76623571, "learning_rate": 4.0455249229391677e-07, "loss": 0.78745627, "num_input_tokens_seen": 287069915, "step": 13308, "time_per_iteration": 2.642228603363037 }, { "auxiliary_loss_clip": 0.01063897, "auxiliary_loss_mlp": 0.01031546, "balance_loss_clip": 1.03632379, "balance_loss_mlp": 1.01817083, "epoch": 0.8001803697580039, "flos": 31868278151040.0, "grad_norm": 1.4469096851593135, "language_loss": 0.78943181, "learning_rate": 4.0431766816972e-07, "loss": 0.8103863, "num_input_tokens_seen": 287091450, "step": 13309, "time_per_iteration": 2.864769697189331 }, { "auxiliary_loss_clip": 0.01030417, "auxiliary_loss_mlp": 0.01001696, "balance_loss_clip": 1.00792837, "balance_loss_mlp": 1.00063515, "epoch": 0.8002404930106719, "flos": 63392066916480.0, "grad_norm": 0.9323209922385806, "language_loss": 0.64716959, "learning_rate": 4.040829045539571e-07, "loss": 0.66749072, "num_input_tokens_seen": 287148365, "step": 13310, "time_per_iteration": 3.1092755794525146 }, { "auxiliary_loss_clip": 0.01098583, "auxiliary_loss_mlp": 0.01033194, "balance_loss_clip": 1.03659534, "balance_loss_mlp": 1.02035546, "epoch": 0.8003006162633398, "flos": 27855476403840.0, "grad_norm": 2.1579786023849445, "language_loss": 0.82891053, "learning_rate": 4.0384820145553156e-07, "loss": 0.85022825, "num_input_tokens_seen": 287168280, "step": 13311, "time_per_iteration": 2.7086493968963623 }, { "auxiliary_loss_clip": 0.0109936, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.03775251, "balance_loss_mlp": 1.0216639, "epoch": 0.8003607395160078, "flos": 18223337214720.0, "grad_norm": 1.9933201272328842, "language_loss": 0.66162074, "learning_rate": 4.0361355888334116e-07, "loss": 0.68295336, "num_input_tokens_seen": 287185980, "step": 13312, "time_per_iteration": 2.680204153060913 }, { "auxiliary_loss_clip": 0.01114636, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.04067063, "balance_loss_mlp": 1.01846743, "epoch": 0.8004208627686757, "flos": 20886975192960.0, "grad_norm": 2.3011865249501264, "language_loss": 0.75151718, "learning_rate": 4.033789768462843e-07, "loss": 0.77298295, "num_input_tokens_seen": 287203875, "step": 13313, "time_per_iteration": 2.606222629547119 }, { "auxiliary_loss_clip": 0.0109515, "auxiliary_loss_mlp": 0.01030962, "balance_loss_clip": 1.03412461, "balance_loss_mlp": 1.01851058, "epoch": 0.8004809860213438, "flos": 26436143416320.0, "grad_norm": 1.3567607017939294, "language_loss": 0.75564599, "learning_rate": 4.031444553532575e-07, "loss": 0.77690709, "num_input_tokens_seen": 287226445, "step": 13314, "time_per_iteration": 2.6715898513793945 }, { "auxiliary_loss_clip": 0.00988299, "auxiliary_loss_mlp": 0.01000387, "balance_loss_clip": 1.00804853, "balance_loss_mlp": 0.99932635, "epoch": 0.8005411092740117, "flos": 63648612829440.0, "grad_norm": 0.8122679233845669, "language_loss": 0.53769958, "learning_rate": 4.029099944131522e-07, "loss": 0.55758643, "num_input_tokens_seen": 287286240, "step": 13315, "time_per_iteration": 3.1782495975494385 }, { "auxiliary_loss_clip": 0.01086886, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 1.03729582, "balance_loss_mlp": 1.0172112, "epoch": 0.8006012325266797, "flos": 36138056774400.0, "grad_norm": 1.6928178696023135, "language_loss": 0.71341288, "learning_rate": 4.026755940348603e-07, "loss": 0.7345739, "num_input_tokens_seen": 287310265, "step": 13316, "time_per_iteration": 2.7924816608428955 }, { "auxiliary_loss_clip": 0.01091573, "auxiliary_loss_mlp": 0.01030799, "balance_loss_clip": 1.03969979, "balance_loss_mlp": 1.0183655, "epoch": 0.8006613557793477, "flos": 33838947970560.0, "grad_norm": 1.868325107289893, "language_loss": 0.64874738, "learning_rate": 4.024412542272706e-07, "loss": 0.66997111, "num_input_tokens_seen": 287331610, "step": 13317, "time_per_iteration": 2.7774088382720947 }, { "auxiliary_loss_clip": 0.01029734, "auxiliary_loss_mlp": 0.01001074, "balance_loss_clip": 1.00732291, "balance_loss_mlp": 1.00008476, "epoch": 0.8007214790320156, "flos": 67348310699520.0, "grad_norm": 0.7791846864300481, "language_loss": 0.59069222, "learning_rate": 4.0220697499926783e-07, "loss": 0.61100036, "num_input_tokens_seen": 287394795, "step": 13318, "time_per_iteration": 4.755454778671265 }, { "auxiliary_loss_clip": 0.01074086, "auxiliary_loss_mlp": 0.01027358, "balance_loss_clip": 1.03581715, "balance_loss_mlp": 1.01549029, "epoch": 0.8007816022846836, "flos": 23185653033600.0, "grad_norm": 1.8075855078848244, "language_loss": 0.66764301, "learning_rate": 4.019727563597366e-07, "loss": 0.68865746, "num_input_tokens_seen": 287414595, "step": 13319, "time_per_iteration": 4.444296360015869 }, { "auxiliary_loss_clip": 0.0111121, "auxiliary_loss_mlp": 0.00771312, "balance_loss_clip": 1.03728712, "balance_loss_mlp": 1.00022757, "epoch": 0.8008417255373516, "flos": 21981388728960.0, "grad_norm": 1.8607859210030597, "language_loss": 0.74157208, "learning_rate": 4.0173859831755873e-07, "loss": 0.76039732, "num_input_tokens_seen": 287434395, "step": 13320, "time_per_iteration": 2.628570079803467 }, { "auxiliary_loss_clip": 0.01097073, "auxiliary_loss_mlp": 0.01026936, "balance_loss_clip": 1.0365932, "balance_loss_mlp": 1.01422882, "epoch": 0.8009018487900196, "flos": 16727334647040.0, "grad_norm": 1.9175300817586667, "language_loss": 0.80223489, "learning_rate": 4.015045008816138e-07, "loss": 0.823475, "num_input_tokens_seen": 287450590, "step": 13321, "time_per_iteration": 4.052290201187134 }, { "auxiliary_loss_clip": 0.01033155, "auxiliary_loss_mlp": 0.01036586, "balance_loss_clip": 1.02668345, "balance_loss_mlp": 1.02364588, "epoch": 0.8009619720426875, "flos": 20813609664000.0, "grad_norm": 1.8862095260452836, "language_loss": 0.66014248, "learning_rate": 4.0127046406077825e-07, "loss": 0.6808399, "num_input_tokens_seen": 287468455, "step": 13322, "time_per_iteration": 2.7416417598724365 }, { "auxiliary_loss_clip": 0.01099704, "auxiliary_loss_mlp": 0.01028354, "balance_loss_clip": 1.03734875, "balance_loss_mlp": 1.01642156, "epoch": 0.8010220952953555, "flos": 17931096161280.0, "grad_norm": 1.9049008418549798, "language_loss": 0.77709258, "learning_rate": 4.010364878639265e-07, "loss": 0.79837316, "num_input_tokens_seen": 287486485, "step": 13323, "time_per_iteration": 2.6071035861968994 }, { "auxiliary_loss_clip": 0.01110946, "auxiliary_loss_mlp": 0.01029696, "balance_loss_clip": 1.03769231, "balance_loss_mlp": 1.01716661, "epoch": 0.8010822185480234, "flos": 24572235795840.0, "grad_norm": 2.445337752212116, "language_loss": 0.71122754, "learning_rate": 4.00802572299932e-07, "loss": 0.73263395, "num_input_tokens_seen": 287503940, "step": 13324, "time_per_iteration": 2.6217870712280273 }, { "auxiliary_loss_clip": 0.01068071, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.03280735, "balance_loss_mlp": 1.02047682, "epoch": 0.8011423418006914, "flos": 21829988903040.0, "grad_norm": 1.814435796416432, "language_loss": 0.76471907, "learning_rate": 4.005687173776635e-07, "loss": 0.78573507, "num_input_tokens_seen": 287521660, "step": 13325, "time_per_iteration": 2.6970367431640625 }, { "auxiliary_loss_clip": 0.01084618, "auxiliary_loss_mlp": 0.01027508, "balance_loss_clip": 1.03447258, "balance_loss_mlp": 1.01634359, "epoch": 0.8012024650533593, "flos": 23915178259200.0, "grad_norm": 1.5321170582331973, "language_loss": 0.7980848, "learning_rate": 4.003349231059898e-07, "loss": 0.81920606, "num_input_tokens_seen": 287541505, "step": 13326, "time_per_iteration": 2.6341090202331543 }, { "auxiliary_loss_clip": 0.0109705, "auxiliary_loss_mlp": 0.01032819, "balance_loss_clip": 1.03666115, "balance_loss_mlp": 1.02096391, "epoch": 0.8012625883060274, "flos": 23587062497280.0, "grad_norm": 1.9170928763125719, "language_loss": 0.65865368, "learning_rate": 4.001011894937765e-07, "loss": 0.67995238, "num_input_tokens_seen": 287560015, "step": 13327, "time_per_iteration": 4.200170278549194 }, { "auxiliary_loss_clip": 0.01094832, "auxiliary_loss_mlp": 0.01031408, "balance_loss_clip": 1.03746152, "balance_loss_mlp": 1.02033961, "epoch": 0.8013227115586953, "flos": 20813932886400.0, "grad_norm": 1.5945061628863433, "language_loss": 0.73482913, "learning_rate": 3.9986751654988636e-07, "loss": 0.75609159, "num_input_tokens_seen": 287579150, "step": 13328, "time_per_iteration": 2.598289966583252 }, { "auxiliary_loss_clip": 0.01050876, "auxiliary_loss_mlp": 0.01034952, "balance_loss_clip": 1.03355122, "balance_loss_mlp": 1.02166045, "epoch": 0.8013828348113633, "flos": 15888317788800.0, "grad_norm": 1.9762883167731011, "language_loss": 0.73578757, "learning_rate": 3.996339042831798e-07, "loss": 0.7566458, "num_input_tokens_seen": 287597420, "step": 13329, "time_per_iteration": 2.738548994064331 }, { "auxiliary_loss_clip": 0.0102058, "auxiliary_loss_mlp": 0.00999735, "balance_loss_clip": 1.0074687, "balance_loss_mlp": 0.99866766, "epoch": 0.8014429580640313, "flos": 71062981562880.0, "grad_norm": 0.6934041027763224, "language_loss": 0.52926564, "learning_rate": 3.9940035270251605e-07, "loss": 0.54946882, "num_input_tokens_seen": 287667280, "step": 13330, "time_per_iteration": 3.3172037601470947 }, { "auxiliary_loss_clip": 0.01083958, "auxiliary_loss_mlp": 0.01037459, "balance_loss_clip": 1.03489339, "balance_loss_mlp": 1.02364862, "epoch": 0.8015030813166992, "flos": 23076340968960.0, "grad_norm": 1.7329849942476805, "language_loss": 0.7308808, "learning_rate": 3.991668618167519e-07, "loss": 0.75209498, "num_input_tokens_seen": 287687375, "step": 13331, "time_per_iteration": 2.7093939781188965 }, { "auxiliary_loss_clip": 0.01091699, "auxiliary_loss_mlp": 0.01029361, "balance_loss_clip": 1.03614366, "balance_loss_mlp": 1.01829839, "epoch": 0.8015632045693672, "flos": 21872328059520.0, "grad_norm": 1.8780665842935151, "language_loss": 0.77335048, "learning_rate": 3.989334316347401e-07, "loss": 0.79456115, "num_input_tokens_seen": 287707895, "step": 13332, "time_per_iteration": 2.708766460418701 }, { "auxiliary_loss_clip": 0.01110082, "auxiliary_loss_mlp": 0.01032572, "balance_loss_clip": 1.03853345, "balance_loss_mlp": 1.02041256, "epoch": 0.8016233278220352, "flos": 23656728925440.0, "grad_norm": 1.9285581629240347, "language_loss": 0.83625793, "learning_rate": 3.987000621653338e-07, "loss": 0.85768449, "num_input_tokens_seen": 287723990, "step": 13333, "time_per_iteration": 2.6203196048736572 }, { "auxiliary_loss_clip": 0.01088802, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.03681588, "balance_loss_mlp": 1.02005112, "epoch": 0.8016834510747032, "flos": 16253170185600.0, "grad_norm": 2.0639926273292115, "language_loss": 0.73560673, "learning_rate": 3.9846675341738133e-07, "loss": 0.75681788, "num_input_tokens_seen": 287742380, "step": 13334, "time_per_iteration": 2.674370765686035 }, { "auxiliary_loss_clip": 0.01068855, "auxiliary_loss_mlp": 0.01038369, "balance_loss_clip": 1.03341401, "balance_loss_mlp": 1.02465343, "epoch": 0.8017435743273711, "flos": 12276027665280.0, "grad_norm": 3.712822925491278, "language_loss": 0.7483573, "learning_rate": 3.9823350539972967e-07, "loss": 0.7694295, "num_input_tokens_seen": 287760130, "step": 13335, "time_per_iteration": 2.661638021469116 }, { "auxiliary_loss_clip": 0.01067475, "auxiliary_loss_mlp": 0.01033285, "balance_loss_clip": 1.03284895, "balance_loss_mlp": 1.02039289, "epoch": 0.8018036975800391, "flos": 17196112068480.0, "grad_norm": 1.8858612114976723, "language_loss": 0.75267804, "learning_rate": 3.9800031812122416e-07, "loss": 0.77368569, "num_input_tokens_seen": 287777565, "step": 13336, "time_per_iteration": 2.716108560562134 }, { "auxiliary_loss_clip": 0.01077828, "auxiliary_loss_mlp": 0.01037534, "balance_loss_clip": 1.03872991, "balance_loss_mlp": 1.02433777, "epoch": 0.801863820832707, "flos": 20631865824000.0, "grad_norm": 2.2915329222004153, "language_loss": 0.75145626, "learning_rate": 3.977671915907068e-07, "loss": 0.77260983, "num_input_tokens_seen": 287796310, "step": 13337, "time_per_iteration": 2.714571237564087 }, { "auxiliary_loss_clip": 0.0105226, "auxiliary_loss_mlp": 0.00771062, "balance_loss_clip": 1.03701448, "balance_loss_mlp": 1.00021958, "epoch": 0.801923944085375, "flos": 30445569285120.0, "grad_norm": 1.6114282506426694, "language_loss": 0.80135483, "learning_rate": 3.9753412581701883e-07, "loss": 0.81958807, "num_input_tokens_seen": 287817330, "step": 13338, "time_per_iteration": 2.8196728229522705 }, { "auxiliary_loss_clip": 0.01073348, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.03255105, "balance_loss_mlp": 1.01937521, "epoch": 0.801984067338043, "flos": 20010575255040.0, "grad_norm": 1.8585955829202727, "language_loss": 0.74602437, "learning_rate": 3.9730112080899733e-07, "loss": 0.76709145, "num_input_tokens_seen": 287835095, "step": 13339, "time_per_iteration": 2.6212968826293945 }, { "auxiliary_loss_clip": 0.01096453, "auxiliary_loss_mlp": 0.01029361, "balance_loss_clip": 1.03771079, "balance_loss_mlp": 1.01769042, "epoch": 0.802044190590711, "flos": 22784028088320.0, "grad_norm": 1.7386931657461442, "language_loss": 0.79321545, "learning_rate": 3.970681765754775e-07, "loss": 0.81447363, "num_input_tokens_seen": 287854595, "step": 13340, "time_per_iteration": 2.6530919075012207 }, { "auxiliary_loss_clip": 0.01083163, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.04112291, "balance_loss_mlp": 1.02116799, "epoch": 0.8021043138433789, "flos": 27600115639680.0, "grad_norm": 1.956756887496364, "language_loss": 0.68165088, "learning_rate": 3.968352931252936e-07, "loss": 0.70280981, "num_input_tokens_seen": 287876960, "step": 13341, "time_per_iteration": 2.75055193901062 }, { "auxiliary_loss_clip": 0.01012323, "auxiliary_loss_mlp": 0.01007998, "balance_loss_clip": 1.00806713, "balance_loss_mlp": 1.00693703, "epoch": 0.8021644370960469, "flos": 62063730057600.0, "grad_norm": 0.8136387701822201, "language_loss": 0.61581981, "learning_rate": 3.9660247046727547e-07, "loss": 0.63602304, "num_input_tokens_seen": 287936530, "step": 13342, "time_per_iteration": 3.1247668266296387 }, { "auxiliary_loss_clip": 0.01092566, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.03939772, "balance_loss_mlp": 1.02370882, "epoch": 0.8022245603487148, "flos": 23361794352000.0, "grad_norm": 1.856395424623049, "language_loss": 0.63709104, "learning_rate": 3.963697086102522e-07, "loss": 0.65838718, "num_input_tokens_seen": 287954285, "step": 13343, "time_per_iteration": 2.7734808921813965 }, { "auxiliary_loss_clip": 0.01081526, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.03520012, "balance_loss_mlp": 1.01859128, "epoch": 0.8022846836013828, "flos": 10853354712960.0, "grad_norm": 2.8925242692111124, "language_loss": 0.68967628, "learning_rate": 3.96137007563051e-07, "loss": 0.71079069, "num_input_tokens_seen": 287971595, "step": 13344, "time_per_iteration": 2.7123825550079346 }, { "auxiliary_loss_clip": 0.01099765, "auxiliary_loss_mlp": 0.0102957, "balance_loss_clip": 1.03843033, "balance_loss_mlp": 1.01712489, "epoch": 0.8023448068540509, "flos": 29240443054080.0, "grad_norm": 1.7802127623764623, "language_loss": 0.7023524, "learning_rate": 3.9590436733449506e-07, "loss": 0.72364575, "num_input_tokens_seen": 287992540, "step": 13345, "time_per_iteration": 2.7695276737213135 }, { "auxiliary_loss_clip": 0.01013378, "auxiliary_loss_mlp": 0.01005433, "balance_loss_clip": 1.00990939, "balance_loss_mlp": 1.00426471, "epoch": 0.8024049301067188, "flos": 64153588181760.0, "grad_norm": 0.8891261669037472, "language_loss": 0.62973511, "learning_rate": 3.956717879334059e-07, "loss": 0.64992326, "num_input_tokens_seen": 288052810, "step": 13346, "time_per_iteration": 3.28011417388916 }, { "auxiliary_loss_clip": 0.01084414, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.03860998, "balance_loss_mlp": 1.01868272, "epoch": 0.8024650533593868, "flos": 28585360765440.0, "grad_norm": 2.315219650527018, "language_loss": 0.72604311, "learning_rate": 3.9543926936860327e-07, "loss": 0.74719733, "num_input_tokens_seen": 288073045, "step": 13347, "time_per_iteration": 2.7291135787963867 }, { "auxiliary_loss_clip": 0.01098598, "auxiliary_loss_mlp": 0.01032216, "balance_loss_clip": 1.0363127, "balance_loss_mlp": 1.01959181, "epoch": 0.8025251766120547, "flos": 16982264448000.0, "grad_norm": 1.7959998434769961, "language_loss": 0.72794473, "learning_rate": 3.9520681164890493e-07, "loss": 0.74925292, "num_input_tokens_seen": 288091165, "step": 13348, "time_per_iteration": 2.623680353164673 }, { "auxiliary_loss_clip": 0.01083208, "auxiliary_loss_mlp": 0.01030443, "balance_loss_clip": 1.03903532, "balance_loss_mlp": 1.0179677, "epoch": 0.8025852998647227, "flos": 22163671272960.0, "grad_norm": 3.547597089549619, "language_loss": 0.75893748, "learning_rate": 3.9497441478312444e-07, "loss": 0.780074, "num_input_tokens_seen": 288110595, "step": 13349, "time_per_iteration": 2.658114433288574 }, { "auxiliary_loss_clip": 0.01110466, "auxiliary_loss_mlp": 0.01034774, "balance_loss_clip": 1.03946209, "balance_loss_mlp": 1.02321064, "epoch": 0.8026454231173906, "flos": 22017012042240.0, "grad_norm": 2.407592259971092, "language_loss": 0.83429128, "learning_rate": 3.947420787800755e-07, "loss": 0.85574365, "num_input_tokens_seen": 288128995, "step": 13350, "time_per_iteration": 2.6693131923675537 }, { "auxiliary_loss_clip": 0.01100877, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.03946972, "balance_loss_mlp": 1.02371919, "epoch": 0.8027055463700586, "flos": 22491320158080.0, "grad_norm": 2.406570051160922, "language_loss": 0.71667969, "learning_rate": 3.945098036485679e-07, "loss": 0.7380501, "num_input_tokens_seen": 288149265, "step": 13351, "time_per_iteration": 2.6675031185150146 }, { "auxiliary_loss_clip": 0.01069791, "auxiliary_loss_mlp": 0.01034939, "balance_loss_clip": 1.03470254, "balance_loss_mlp": 1.02237439, "epoch": 0.8027656696227266, "flos": 28912901909760.0, "grad_norm": 1.586066811433664, "language_loss": 0.61656845, "learning_rate": 3.9427758939740885e-07, "loss": 0.63761568, "num_input_tokens_seen": 288170745, "step": 13352, "time_per_iteration": 2.8598105907440186 }, { "auxiliary_loss_clip": 0.01096816, "auxiliary_loss_mlp": 0.0103672, "balance_loss_clip": 1.03765738, "balance_loss_mlp": 1.02495408, "epoch": 0.8028257928753946, "flos": 18589374760320.0, "grad_norm": 2.4706643643447346, "language_loss": 0.76973253, "learning_rate": 3.940454360354046e-07, "loss": 0.79106784, "num_input_tokens_seen": 288189415, "step": 13353, "time_per_iteration": 2.6434624195098877 }, { "auxiliary_loss_clip": 0.01052438, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.03585696, "balance_loss_mlp": 1.01597464, "epoch": 0.8028859161280625, "flos": 19130009339520.0, "grad_norm": 2.077119473640625, "language_loss": 0.73317617, "learning_rate": 3.938133435713582e-07, "loss": 0.75399947, "num_input_tokens_seen": 288206900, "step": 13354, "time_per_iteration": 2.980314254760742 }, { "auxiliary_loss_clip": 0.01069099, "auxiliary_loss_mlp": 0.01040669, "balance_loss_clip": 1.03414679, "balance_loss_mlp": 1.02725756, "epoch": 0.8029460393807305, "flos": 20229881742720.0, "grad_norm": 2.1378474316121463, "language_loss": 0.65846258, "learning_rate": 3.935813120140714e-07, "loss": 0.6795603, "num_input_tokens_seen": 288224800, "step": 13355, "time_per_iteration": 2.7468628883361816 }, { "auxiliary_loss_clip": 0.01074013, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.03249073, "balance_loss_mlp": 1.0199666, "epoch": 0.8030061626333984, "flos": 49783320933120.0, "grad_norm": 2.375725357962007, "language_loss": 0.68678093, "learning_rate": 3.9334934137234235e-07, "loss": 0.70785862, "num_input_tokens_seen": 288249400, "step": 13356, "time_per_iteration": 2.9967265129089355 }, { "auxiliary_loss_clip": 0.01069606, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.04181337, "balance_loss_mlp": 1.01715517, "epoch": 0.8030662858860664, "flos": 21615243442560.0, "grad_norm": 1.6420809304717021, "language_loss": 0.77664089, "learning_rate": 3.931174316549666e-07, "loss": 0.79763246, "num_input_tokens_seen": 288268780, "step": 13357, "time_per_iteration": 4.406202077865601 }, { "auxiliary_loss_clip": 0.01074511, "auxiliary_loss_mlp": 0.01032493, "balance_loss_clip": 1.03333926, "balance_loss_mlp": 1.01853395, "epoch": 0.8031264091387345, "flos": 25630056351360.0, "grad_norm": 1.4188016653576663, "language_loss": 0.77055764, "learning_rate": 3.9288558287073937e-07, "loss": 0.79162776, "num_input_tokens_seen": 288290830, "step": 13358, "time_per_iteration": 4.418318033218384 }, { "auxiliary_loss_clip": 0.01097306, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.03661919, "balance_loss_mlp": 1.01740575, "epoch": 0.8031865323914024, "flos": 19646225648640.0, "grad_norm": 1.5140849812100452, "language_loss": 0.84604448, "learning_rate": 3.9265379502845143e-07, "loss": 0.86730748, "num_input_tokens_seen": 288308865, "step": 13359, "time_per_iteration": 2.6452579498291016 }, { "auxiliary_loss_clip": 0.01081667, "auxiliary_loss_mlp": 0.01025679, "balance_loss_clip": 1.03710377, "balance_loss_mlp": 1.01406813, "epoch": 0.8032466556440704, "flos": 26169110732160.0, "grad_norm": 1.8313021321254959, "language_loss": 0.73854876, "learning_rate": 3.924220681368928e-07, "loss": 0.75962222, "num_input_tokens_seen": 288327325, "step": 13360, "time_per_iteration": 2.7636659145355225 }, { "auxiliary_loss_clip": 0.01110485, "auxiliary_loss_mlp": 0.01027583, "balance_loss_clip": 1.03801131, "balance_loss_mlp": 1.01598358, "epoch": 0.8033067788967383, "flos": 25520026014720.0, "grad_norm": 2.137959732287125, "language_loss": 0.69831038, "learning_rate": 3.921904022048512e-07, "loss": 0.71969098, "num_input_tokens_seen": 288347285, "step": 13361, "time_per_iteration": 4.267240524291992 }, { "auxiliary_loss_clip": 0.01112515, "auxiliary_loss_mlp": 0.01035596, "balance_loss_clip": 1.03754067, "balance_loss_mlp": 1.02316272, "epoch": 0.8033669021494063, "flos": 24024274842240.0, "grad_norm": 1.8009112987643567, "language_loss": 0.70254129, "learning_rate": 3.919587972411098e-07, "loss": 0.72402239, "num_input_tokens_seen": 288367785, "step": 13362, "time_per_iteration": 2.6688599586486816 }, { "auxiliary_loss_clip": 0.01116592, "auxiliary_loss_mlp": 0.0103705, "balance_loss_clip": 1.03921294, "balance_loss_mlp": 1.02289987, "epoch": 0.8034270254020742, "flos": 13588059749760.0, "grad_norm": 2.3399431246123132, "language_loss": 0.78629005, "learning_rate": 3.91727253254452e-07, "loss": 0.80782652, "num_input_tokens_seen": 288384135, "step": 13363, "time_per_iteration": 2.597430944442749 }, { "auxiliary_loss_clip": 0.01097254, "auxiliary_loss_mlp": 0.01028939, "balance_loss_clip": 1.03588295, "balance_loss_mlp": 1.01675546, "epoch": 0.8034871486547422, "flos": 27412661537280.0, "grad_norm": 6.139321315724576, "language_loss": 0.74428964, "learning_rate": 3.9149577025365787e-07, "loss": 0.76555157, "num_input_tokens_seen": 288403805, "step": 13364, "time_per_iteration": 2.688309669494629 }, { "auxiliary_loss_clip": 0.01096585, "auxiliary_loss_mlp": 0.01030723, "balance_loss_clip": 1.03990126, "balance_loss_mlp": 1.0187124, "epoch": 0.8035472719074102, "flos": 32598593475840.0, "grad_norm": 2.077806609776792, "language_loss": 0.61057466, "learning_rate": 3.9126434824750596e-07, "loss": 0.63184774, "num_input_tokens_seen": 288424895, "step": 13365, "time_per_iteration": 2.9324018955230713 }, { "auxiliary_loss_clip": 0.01089765, "auxiliary_loss_mlp": 0.01034058, "balance_loss_clip": 1.03685248, "balance_loss_mlp": 1.02096307, "epoch": 0.8036073951600782, "flos": 21287989607040.0, "grad_norm": 1.971281433653639, "language_loss": 0.66274738, "learning_rate": 3.910329872447706e-07, "loss": 0.68398559, "num_input_tokens_seen": 288443865, "step": 13366, "time_per_iteration": 4.221456050872803 }, { "auxiliary_loss_clip": 0.01106605, "auxiliary_loss_mlp": 0.01031218, "balance_loss_clip": 1.03672922, "balance_loss_mlp": 1.01938665, "epoch": 0.8036675184127461, "flos": 18113845582080.0, "grad_norm": 1.997291212558988, "language_loss": 0.74654198, "learning_rate": 3.908016872542259e-07, "loss": 0.7679202, "num_input_tokens_seen": 288461065, "step": 13367, "time_per_iteration": 2.6199634075164795 }, { "auxiliary_loss_clip": 0.01108205, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.03705049, "balance_loss_mlp": 1.01538706, "epoch": 0.8037276416654141, "flos": 26030280666240.0, "grad_norm": 1.603723731254455, "language_loss": 0.73827767, "learning_rate": 3.905704482846428e-07, "loss": 0.75963438, "num_input_tokens_seen": 288481865, "step": 13368, "time_per_iteration": 2.6368210315704346 }, { "auxiliary_loss_clip": 0.01110551, "auxiliary_loss_mlp": 0.01031277, "balance_loss_clip": 1.03718567, "balance_loss_mlp": 1.01879573, "epoch": 0.803787764918082, "flos": 18802180886400.0, "grad_norm": 1.9805811504758806, "language_loss": 0.70231676, "learning_rate": 3.90339270344789e-07, "loss": 0.72373503, "num_input_tokens_seen": 288499345, "step": 13369, "time_per_iteration": 2.5763745307922363 }, { "auxiliary_loss_clip": 0.01088545, "auxiliary_loss_mlp": 0.01033097, "balance_loss_clip": 1.03672147, "balance_loss_mlp": 1.02170682, "epoch": 0.80384788817075, "flos": 20225787592320.0, "grad_norm": 2.6910439854166466, "language_loss": 0.73273438, "learning_rate": 3.901081534434312e-07, "loss": 0.75395083, "num_input_tokens_seen": 288517660, "step": 13370, "time_per_iteration": 2.748764753341675 }, { "auxiliary_loss_clip": 0.01087131, "auxiliary_loss_mlp": 0.01032078, "balance_loss_clip": 1.03560901, "balance_loss_mlp": 1.01849425, "epoch": 0.8039080114234181, "flos": 18515290959360.0, "grad_norm": 2.9168856587883987, "language_loss": 0.86785686, "learning_rate": 3.898770975893342e-07, "loss": 0.88904893, "num_input_tokens_seen": 288534180, "step": 13371, "time_per_iteration": 2.7640862464904785 }, { "auxiliary_loss_clip": 0.01100956, "auxiliary_loss_mlp": 0.01032312, "balance_loss_clip": 1.03582239, "balance_loss_mlp": 1.0192045, "epoch": 0.803968134676086, "flos": 22382510883840.0, "grad_norm": 1.8421468200068354, "language_loss": 0.75026673, "learning_rate": 3.89646102791259e-07, "loss": 0.77159941, "num_input_tokens_seen": 288553350, "step": 13372, "time_per_iteration": 2.724491596221924 }, { "auxiliary_loss_clip": 0.01068816, "auxiliary_loss_mlp": 0.01031309, "balance_loss_clip": 1.03654325, "balance_loss_mlp": 1.01796961, "epoch": 0.804028257928754, "flos": 23842566915840.0, "grad_norm": 2.31065628339188, "language_loss": 0.79036891, "learning_rate": 3.894151690579646e-07, "loss": 0.81137019, "num_input_tokens_seen": 288571325, "step": 13373, "time_per_iteration": 2.910059928894043 }, { "auxiliary_loss_clip": 0.01081798, "auxiliary_loss_mlp": 0.01035447, "balance_loss_clip": 1.03376925, "balance_loss_mlp": 1.02387166, "epoch": 0.8040883811814219, "flos": 23550720912000.0, "grad_norm": 1.7053581559181326, "language_loss": 0.74311471, "learning_rate": 3.8918429639820815e-07, "loss": 0.76428711, "num_input_tokens_seen": 288592100, "step": 13374, "time_per_iteration": 2.698894500732422 }, { "auxiliary_loss_clip": 0.01059369, "auxiliary_loss_mlp": 0.01037575, "balance_loss_clip": 1.03106141, "balance_loss_mlp": 1.02297187, "epoch": 0.8041485044340899, "flos": 19026263882880.0, "grad_norm": 1.889259029929228, "language_loss": 0.6848501, "learning_rate": 3.889534848207452e-07, "loss": 0.70581961, "num_input_tokens_seen": 288612305, "step": 13375, "time_per_iteration": 2.781163215637207 }, { "auxiliary_loss_clip": 0.01008954, "auxiliary_loss_mlp": 0.01001942, "balance_loss_clip": 1.01513779, "balance_loss_mlp": 1.00076795, "epoch": 0.8042086276867578, "flos": 70005663797760.0, "grad_norm": 0.7220073692688251, "language_loss": 0.55658954, "learning_rate": 3.887227343343271e-07, "loss": 0.57669854, "num_input_tokens_seen": 288676015, "step": 13376, "time_per_iteration": 3.373588800430298 }, { "auxiliary_loss_clip": 0.01056178, "auxiliary_loss_mlp": 0.01035137, "balance_loss_clip": 1.03220654, "balance_loss_mlp": 1.02096248, "epoch": 0.8042687509394258, "flos": 21872435800320.0, "grad_norm": 1.964983939907185, "language_loss": 0.72909731, "learning_rate": 3.8849204494770425e-07, "loss": 0.75001043, "num_input_tokens_seen": 288696455, "step": 13377, "time_per_iteration": 2.8420963287353516 }, { "auxiliary_loss_clip": 0.01095422, "auxiliary_loss_mlp": 0.01029914, "balance_loss_clip": 1.03510308, "balance_loss_mlp": 1.01737309, "epoch": 0.8043288741920938, "flos": 26614870513920.0, "grad_norm": 1.843970634280457, "language_loss": 0.70282233, "learning_rate": 3.8826141666962567e-07, "loss": 0.72407568, "num_input_tokens_seen": 288715560, "step": 13378, "time_per_iteration": 2.656498670578003 }, { "auxiliary_loss_clip": 0.0110247, "auxiliary_loss_mlp": 0.01027065, "balance_loss_clip": 1.03830576, "balance_loss_mlp": 1.01435089, "epoch": 0.8043889974447618, "flos": 33403387651200.0, "grad_norm": 1.3626557970625712, "language_loss": 0.69352663, "learning_rate": 3.880308495088347e-07, "loss": 0.71482199, "num_input_tokens_seen": 288739485, "step": 13379, "time_per_iteration": 2.725536584854126 }, { "auxiliary_loss_clip": 0.01115659, "auxiliary_loss_mlp": 0.01034653, "balance_loss_clip": 1.04020107, "balance_loss_mlp": 1.02027059, "epoch": 0.8044491206974297, "flos": 20375966355840.0, "grad_norm": 1.7895941643177822, "language_loss": 0.76386261, "learning_rate": 3.8780034347407533e-07, "loss": 0.7853657, "num_input_tokens_seen": 288757420, "step": 13380, "time_per_iteration": 2.560413360595703 }, { "auxiliary_loss_clip": 0.01062218, "auxiliary_loss_mlp": 0.01029167, "balance_loss_clip": 1.03264856, "balance_loss_mlp": 1.01679909, "epoch": 0.8045092439500977, "flos": 23403810286080.0, "grad_norm": 2.4010662161686813, "language_loss": 0.69055688, "learning_rate": 3.875698985740887e-07, "loss": 0.71147072, "num_input_tokens_seen": 288775535, "step": 13381, "time_per_iteration": 2.7233426570892334 }, { "auxiliary_loss_clip": 0.01102054, "auxiliary_loss_mlp": 0.01033657, "balance_loss_clip": 1.03834701, "balance_loss_mlp": 1.02112257, "epoch": 0.8045693672027656, "flos": 24097245321600.0, "grad_norm": 1.7871560626135898, "language_loss": 0.63795519, "learning_rate": 3.873395148176135e-07, "loss": 0.65931231, "num_input_tokens_seen": 288795035, "step": 13382, "time_per_iteration": 2.62091326713562 }, { "auxiliary_loss_clip": 0.01086707, "auxiliary_loss_mlp": 0.01036462, "balance_loss_clip": 1.03742146, "balance_loss_mlp": 1.02481508, "epoch": 0.8046294904554336, "flos": 27707165147520.0, "grad_norm": 4.567282213112271, "language_loss": 0.7625041, "learning_rate": 3.8710919221338487e-07, "loss": 0.78373575, "num_input_tokens_seen": 288816270, "step": 13383, "time_per_iteration": 2.7304000854492188 }, { "auxiliary_loss_clip": 0.01093751, "auxiliary_loss_mlp": 0.01041645, "balance_loss_clip": 1.03544414, "balance_loss_mlp": 1.02812028, "epoch": 0.8046896137081017, "flos": 24972998814720.0, "grad_norm": 1.8100283052553972, "language_loss": 0.69704837, "learning_rate": 3.868789307701381e-07, "loss": 0.71840227, "num_input_tokens_seen": 288836050, "step": 13384, "time_per_iteration": 2.623194932937622 }, { "auxiliary_loss_clip": 0.0109844, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.03508019, "balance_loss_mlp": 1.02301192, "epoch": 0.8047497369607696, "flos": 17675484001920.0, "grad_norm": 8.534865412307397, "language_loss": 0.79628527, "learning_rate": 3.8664873049660375e-07, "loss": 0.81763506, "num_input_tokens_seen": 288852900, "step": 13385, "time_per_iteration": 2.640493869781494 }, { "auxiliary_loss_clip": 0.01109031, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.03663421, "balance_loss_mlp": 1.01859391, "epoch": 0.8048098602134376, "flos": 22382079920640.0, "grad_norm": 1.7222276785014166, "language_loss": 0.72210598, "learning_rate": 3.864185914015108e-07, "loss": 0.74351293, "num_input_tokens_seen": 288872625, "step": 13386, "time_per_iteration": 2.620424747467041 }, { "auxiliary_loss_clip": 0.01000165, "auxiliary_loss_mlp": 0.01002697, "balance_loss_clip": 1.0073638, "balance_loss_mlp": 1.00164747, "epoch": 0.8048699834661055, "flos": 71200949702400.0, "grad_norm": 0.6627374322558968, "language_loss": 0.51254958, "learning_rate": 3.861885134935865e-07, "loss": 0.53257823, "num_input_tokens_seen": 288939180, "step": 13387, "time_per_iteration": 3.249873399734497 }, { "auxiliary_loss_clip": 0.01108941, "auxiliary_loss_mlp": 0.01033944, "balance_loss_clip": 1.03665853, "balance_loss_mlp": 1.02005613, "epoch": 0.8049301067187735, "flos": 23660320285440.0, "grad_norm": 1.7754749617398262, "language_loss": 0.73770982, "learning_rate": 3.859584967815559e-07, "loss": 0.75913864, "num_input_tokens_seen": 288958925, "step": 13388, "time_per_iteration": 2.63905930519104 }, { "auxiliary_loss_clip": 0.0108125, "auxiliary_loss_mlp": 0.01028741, "balance_loss_clip": 1.04047871, "balance_loss_mlp": 1.01668882, "epoch": 0.8049902299714414, "flos": 24426330750720.0, "grad_norm": 1.3693007974519653, "language_loss": 0.71537852, "learning_rate": 3.857285412741411e-07, "loss": 0.73647845, "num_input_tokens_seen": 288980935, "step": 13389, "time_per_iteration": 2.8490209579467773 }, { "auxiliary_loss_clip": 0.01085356, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.03994167, "balance_loss_mlp": 1.02047765, "epoch": 0.8050503532241094, "flos": 17492626840320.0, "grad_norm": 2.1746579852789565, "language_loss": 0.82934594, "learning_rate": 3.8549864698006097e-07, "loss": 0.8505286, "num_input_tokens_seen": 288996780, "step": 13390, "time_per_iteration": 2.695349931716919 }, { "auxiliary_loss_clip": 0.01021163, "auxiliary_loss_mlp": 0.01001808, "balance_loss_clip": 1.00786567, "balance_loss_mlp": 1.00077081, "epoch": 0.8051104764767774, "flos": 57658030369920.0, "grad_norm": 0.7760583028840095, "language_loss": 0.55514753, "learning_rate": 3.8526881390803424e-07, "loss": 0.57537723, "num_input_tokens_seen": 289057590, "step": 13391, "time_per_iteration": 3.1499392986297607 }, { "auxiliary_loss_clip": 0.01096246, "auxiliary_loss_mlp": 0.01032181, "balance_loss_clip": 1.0376209, "balance_loss_mlp": 1.02046287, "epoch": 0.8051705997294454, "flos": 18003456109440.0, "grad_norm": 1.5156025498114776, "language_loss": 0.84548998, "learning_rate": 3.850390420667762e-07, "loss": 0.86677432, "num_input_tokens_seen": 289076285, "step": 13392, "time_per_iteration": 2.7121686935424805 }, { "auxiliary_loss_clip": 0.01075704, "auxiliary_loss_mlp": 0.01031392, "balance_loss_clip": 1.03425109, "balance_loss_mlp": 1.01953077, "epoch": 0.8052307229821133, "flos": 26397754755840.0, "grad_norm": 1.5752957975366317, "language_loss": 0.70452738, "learning_rate": 3.8480933146499914e-07, "loss": 0.72559834, "num_input_tokens_seen": 289097585, "step": 13393, "time_per_iteration": 2.857966899871826 }, { "auxiliary_loss_clip": 0.01100081, "auxiliary_loss_mlp": 0.01033968, "balance_loss_clip": 1.03709984, "balance_loss_mlp": 1.02045584, "epoch": 0.8052908462347813, "flos": 21757018423680.0, "grad_norm": 2.1123482954588733, "language_loss": 0.76134676, "learning_rate": 3.84579682111414e-07, "loss": 0.78268725, "num_input_tokens_seen": 289116890, "step": 13394, "time_per_iteration": 2.6536917686462402 }, { "auxiliary_loss_clip": 0.0111249, "auxiliary_loss_mlp": 0.01030653, "balance_loss_clip": 1.03986442, "balance_loss_mlp": 1.0186125, "epoch": 0.8053509694874492, "flos": 25442279026560.0, "grad_norm": 1.6448341122906027, "language_loss": 0.64934421, "learning_rate": 3.843500940147304e-07, "loss": 0.67077565, "num_input_tokens_seen": 289136670, "step": 13395, "time_per_iteration": 2.6280672550201416 }, { "auxiliary_loss_clip": 0.01019533, "auxiliary_loss_mlp": 0.00999955, "balance_loss_clip": 1.00609279, "balance_loss_mlp": 0.99902552, "epoch": 0.8054110927401172, "flos": 57668122091520.0, "grad_norm": 0.7500398234084821, "language_loss": 0.57342923, "learning_rate": 3.8412056718365206e-07, "loss": 0.59362411, "num_input_tokens_seen": 289200150, "step": 13396, "time_per_iteration": 3.278367519378662 }, { "auxiliary_loss_clip": 0.01099939, "auxiliary_loss_mlp": 0.01035451, "balance_loss_clip": 1.0372299, "balance_loss_mlp": 1.02181315, "epoch": 0.8054712159927853, "flos": 19276201693440.0, "grad_norm": 1.6362380088306854, "language_loss": 0.77317524, "learning_rate": 3.8389110162688353e-07, "loss": 0.79452914, "num_input_tokens_seen": 289218125, "step": 13397, "time_per_iteration": 5.758723258972168 }, { "auxiliary_loss_clip": 0.01095341, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.04027462, "balance_loss_mlp": 1.01784718, "epoch": 0.8055313392454532, "flos": 17967617314560.0, "grad_norm": 1.6834134519930992, "language_loss": 0.70419687, "learning_rate": 3.836616973531266e-07, "loss": 0.72544581, "num_input_tokens_seen": 289237115, "step": 13398, "time_per_iteration": 2.6618268489837646 }, { "auxiliary_loss_clip": 0.01086822, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.03505898, "balance_loss_mlp": 1.01981521, "epoch": 0.8055914624981212, "flos": 13478352635520.0, "grad_norm": 4.3745696767144056, "language_loss": 0.69005787, "learning_rate": 3.834323543710805e-07, "loss": 0.71124166, "num_input_tokens_seen": 289253635, "step": 13399, "time_per_iteration": 2.682286024093628 }, { "auxiliary_loss_clip": 0.01109953, "auxiliary_loss_mlp": 0.0103448, "balance_loss_clip": 1.03786373, "balance_loss_mlp": 1.02234411, "epoch": 0.8056515857507891, "flos": 13224787551360.0, "grad_norm": 2.3581489443950043, "language_loss": 0.71867836, "learning_rate": 3.8320307268944153e-07, "loss": 0.74012268, "num_input_tokens_seen": 289270085, "step": 13400, "time_per_iteration": 4.049706935882568 }, { "auxiliary_loss_clip": 0.0109504, "auxiliary_loss_mlp": 0.01032952, "balance_loss_clip": 1.03316319, "balance_loss_mlp": 1.0205605, "epoch": 0.8057117090034571, "flos": 23878190229120.0, "grad_norm": 1.8105260022834564, "language_loss": 0.64344472, "learning_rate": 3.829738523169037e-07, "loss": 0.66472465, "num_input_tokens_seen": 289289645, "step": 13401, "time_per_iteration": 2.6664888858795166 }, { "auxiliary_loss_clip": 0.01097912, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.03556728, "balance_loss_mlp": 1.0208919, "epoch": 0.805771832256125, "flos": 21214300855680.0, "grad_norm": 2.280323005246413, "language_loss": 0.83644533, "learning_rate": 3.8274469326215985e-07, "loss": 0.8577559, "num_input_tokens_seen": 289306630, "step": 13402, "time_per_iteration": 2.6944761276245117 }, { "auxiliary_loss_clip": 0.01058036, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.03603578, "balance_loss_mlp": 1.01981318, "epoch": 0.805831955508793, "flos": 17566818382080.0, "grad_norm": 6.24262023613013, "language_loss": 0.68056262, "learning_rate": 3.8251559553389876e-07, "loss": 0.70146668, "num_input_tokens_seen": 289324960, "step": 13403, "time_per_iteration": 2.763301372528076 }, { "auxiliary_loss_clip": 0.01069641, "auxiliary_loss_mlp": 0.00769597, "balance_loss_clip": 1.0345124, "balance_loss_mlp": 1.00027502, "epoch": 0.805892078761461, "flos": 26907542530560.0, "grad_norm": 3.472008939762663, "language_loss": 0.84777313, "learning_rate": 3.822865591408084e-07, "loss": 0.86616552, "num_input_tokens_seen": 289344980, "step": 13404, "time_per_iteration": 2.7284321784973145 }, { "auxiliary_loss_clip": 0.01066717, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.03557312, "balance_loss_mlp": 1.02060783, "epoch": 0.805952202014129, "flos": 31506442496640.0, "grad_norm": 1.5427853582818836, "language_loss": 0.70597529, "learning_rate": 3.820575840915743e-07, "loss": 0.72696286, "num_input_tokens_seen": 289367500, "step": 13405, "time_per_iteration": 4.542062520980835 }, { "auxiliary_loss_clip": 0.01098712, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.03720641, "balance_loss_mlp": 1.01441038, "epoch": 0.8060123252667969, "flos": 24389953251840.0, "grad_norm": 2.8341192767465957, "language_loss": 0.7541554, "learning_rate": 3.818286703948788e-07, "loss": 0.77540243, "num_input_tokens_seen": 289385930, "step": 13406, "time_per_iteration": 2.68805193901062 }, { "auxiliary_loss_clip": 0.01100072, "auxiliary_loss_mlp": 0.01035029, "balance_loss_clip": 1.03811562, "balance_loss_mlp": 1.02201676, "epoch": 0.8060724485194649, "flos": 23479941162240.0, "grad_norm": 1.5246012967976152, "language_loss": 0.76201332, "learning_rate": 3.815998180594018e-07, "loss": 0.7833643, "num_input_tokens_seen": 289408025, "step": 13407, "time_per_iteration": 2.666938066482544 }, { "auxiliary_loss_clip": 0.01080345, "auxiliary_loss_mlp": 0.00770991, "balance_loss_clip": 1.03358412, "balance_loss_mlp": 1.00019884, "epoch": 0.8061325717721328, "flos": 18624495283200.0, "grad_norm": 1.5796081620527238, "language_loss": 0.73616993, "learning_rate": 3.81371027093822e-07, "loss": 0.75468338, "num_input_tokens_seen": 289426575, "step": 13408, "time_per_iteration": 2.662716865539551 }, { "auxiliary_loss_clip": 0.01079538, "auxiliary_loss_mlp": 0.01039164, "balance_loss_clip": 1.03391623, "balance_loss_mlp": 1.02488232, "epoch": 0.8061926950248008, "flos": 23582752865280.0, "grad_norm": 1.8848269452946171, "language_loss": 0.7084735, "learning_rate": 3.8114229750681523e-07, "loss": 0.72966051, "num_input_tokens_seen": 289447760, "step": 13409, "time_per_iteration": 2.6887590885162354 }, { "auxiliary_loss_clip": 0.01108, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.03585696, "balance_loss_mlp": 1.0173347, "epoch": 0.8062528182774689, "flos": 11143333209600.0, "grad_norm": 2.076248478414053, "language_loss": 0.76634085, "learning_rate": 3.809136293070545e-07, "loss": 0.78771693, "num_input_tokens_seen": 289463920, "step": 13410, "time_per_iteration": 2.5652787685394287 }, { "auxiliary_loss_clip": 0.01099064, "auxiliary_loss_mlp": 0.01037232, "balance_loss_clip": 1.03812885, "balance_loss_mlp": 1.02414274, "epoch": 0.8063129415301368, "flos": 22346815743360.0, "grad_norm": 2.5070949588041653, "language_loss": 0.68454826, "learning_rate": 3.806850225032117e-07, "loss": 0.70591122, "num_input_tokens_seen": 289482635, "step": 13411, "time_per_iteration": 2.627668857574463 }, { "auxiliary_loss_clip": 0.01076065, "auxiliary_loss_mlp": 0.01032676, "balance_loss_clip": 1.03557301, "balance_loss_mlp": 1.02042735, "epoch": 0.8063730647828048, "flos": 23988400133760.0, "grad_norm": 1.6635819299590309, "language_loss": 0.68043619, "learning_rate": 3.804564771039551e-07, "loss": 0.70152354, "num_input_tokens_seen": 289502040, "step": 13412, "time_per_iteration": 2.8055179119110107 }, { "auxiliary_loss_clip": 0.01099792, "auxiliary_loss_mlp": 0.01036335, "balance_loss_clip": 1.03846812, "balance_loss_mlp": 1.02239335, "epoch": 0.8064331880354727, "flos": 21321494017920.0, "grad_norm": 1.701960301408402, "language_loss": 0.81657159, "learning_rate": 3.8022799311795064e-07, "loss": 0.83793283, "num_input_tokens_seen": 289520740, "step": 13413, "time_per_iteration": 2.700803279876709 }, { "auxiliary_loss_clip": 0.01092458, "auxiliary_loss_mlp": 0.01042312, "balance_loss_clip": 1.03472614, "balance_loss_mlp": 1.02902031, "epoch": 0.8064933112881407, "flos": 19682890456320.0, "grad_norm": 1.8513481069997626, "language_loss": 0.85172534, "learning_rate": 3.7999957055386303e-07, "loss": 0.8730731, "num_input_tokens_seen": 289535840, "step": 13414, "time_per_iteration": 2.563521385192871 }, { "auxiliary_loss_clip": 0.01083885, "auxiliary_loss_mlp": 0.01033091, "balance_loss_clip": 1.03454745, "balance_loss_mlp": 1.02088439, "epoch": 0.8065534345408086, "flos": 19279721226240.0, "grad_norm": 1.9940068342487725, "language_loss": 0.67127073, "learning_rate": 3.7977120942035467e-07, "loss": 0.69244045, "num_input_tokens_seen": 289555205, "step": 13415, "time_per_iteration": 2.816197633743286 }, { "auxiliary_loss_clip": 0.01072851, "auxiliary_loss_mlp": 0.01025923, "balance_loss_clip": 1.0345068, "balance_loss_mlp": 1.01406205, "epoch": 0.8066135577934767, "flos": 19677718897920.0, "grad_norm": 1.6185283067641691, "language_loss": 0.76407629, "learning_rate": 3.7954290972608383e-07, "loss": 0.78506404, "num_input_tokens_seen": 289573000, "step": 13416, "time_per_iteration": 2.7045845985412598 }, { "auxiliary_loss_clip": 0.01095005, "auxiliary_loss_mlp": 0.01034611, "balance_loss_clip": 1.03473926, "balance_loss_mlp": 1.02240372, "epoch": 0.8066736810461446, "flos": 21143592933120.0, "grad_norm": 1.4143763344150053, "language_loss": 0.65079415, "learning_rate": 3.793146714797086e-07, "loss": 0.67209029, "num_input_tokens_seen": 289592625, "step": 13417, "time_per_iteration": 2.6034398078918457 }, { "auxiliary_loss_clip": 0.01075095, "auxiliary_loss_mlp": 0.01055795, "balance_loss_clip": 1.0338254, "balance_loss_mlp": 1.0419488, "epoch": 0.8067338042988126, "flos": 22598261925120.0, "grad_norm": 1.7315768879693472, "language_loss": 0.8098107, "learning_rate": 3.7908649468988306e-07, "loss": 0.8311196, "num_input_tokens_seen": 289610780, "step": 13418, "time_per_iteration": 2.721973180770874 }, { "auxiliary_loss_clip": 0.01090483, "auxiliary_loss_mlp": 0.01032366, "balance_loss_clip": 1.03820384, "balance_loss_mlp": 1.01939058, "epoch": 0.8067939275514805, "flos": 16508423208960.0, "grad_norm": 1.9096821915848545, "language_loss": 0.84676445, "learning_rate": 3.7885837936526066e-07, "loss": 0.86799294, "num_input_tokens_seen": 289628890, "step": 13419, "time_per_iteration": 2.6393797397613525 }, { "auxiliary_loss_clip": 0.01071529, "auxiliary_loss_mlp": 0.00770579, "balance_loss_clip": 1.03478575, "balance_loss_mlp": 1.00021386, "epoch": 0.8068540508041485, "flos": 28541836460160.0, "grad_norm": 1.770068441657345, "language_loss": 0.76010084, "learning_rate": 3.7863032551449047e-07, "loss": 0.7785219, "num_input_tokens_seen": 289647220, "step": 13420, "time_per_iteration": 2.8084943294525146 }, { "auxiliary_loss_clip": 0.01090718, "auxiliary_loss_mlp": 0.00769899, "balance_loss_clip": 1.03447235, "balance_loss_mlp": 1.00020134, "epoch": 0.8069141740568164, "flos": 21652482867840.0, "grad_norm": 1.8346765895775454, "language_loss": 0.78423268, "learning_rate": 3.784023331462207e-07, "loss": 0.8028388, "num_input_tokens_seen": 289665800, "step": 13421, "time_per_iteration": 2.6397383213043213 }, { "auxiliary_loss_clip": 0.01078405, "auxiliary_loss_mlp": 0.01025501, "balance_loss_clip": 1.0375917, "balance_loss_mlp": 1.01340711, "epoch": 0.8069742973094844, "flos": 17529327561600.0, "grad_norm": 1.6792370158266972, "language_loss": 0.80156964, "learning_rate": 3.78174402269098e-07, "loss": 0.82260871, "num_input_tokens_seen": 289682705, "step": 13422, "time_per_iteration": 2.7309072017669678 }, { "auxiliary_loss_clip": 0.01108091, "auxiliary_loss_mlp": 0.01032058, "balance_loss_clip": 1.03667307, "balance_loss_mlp": 1.02025604, "epoch": 0.8070344205621525, "flos": 23367037737600.0, "grad_norm": 1.6418430759860865, "language_loss": 0.67767537, "learning_rate": 3.7794653289176347e-07, "loss": 0.69907683, "num_input_tokens_seen": 289702920, "step": 13423, "time_per_iteration": 2.6276538372039795 }, { "auxiliary_loss_clip": 0.01087102, "auxiliary_loss_mlp": 0.01036916, "balance_loss_clip": 1.03931355, "balance_loss_mlp": 1.02424431, "epoch": 0.8070945438148204, "flos": 22930184528640.0, "grad_norm": 1.8059898203268332, "language_loss": 0.80249333, "learning_rate": 3.7771872502285904e-07, "loss": 0.82373351, "num_input_tokens_seen": 289723280, "step": 13424, "time_per_iteration": 2.7442784309387207 }, { "auxiliary_loss_clip": 0.01098964, "auxiliary_loss_mlp": 0.01028478, "balance_loss_clip": 1.03548968, "balance_loss_mlp": 1.01652098, "epoch": 0.8071546670674884, "flos": 25300683613440.0, "grad_norm": 1.410290973233428, "language_loss": 0.78814334, "learning_rate": 3.774909786710232e-07, "loss": 0.80941772, "num_input_tokens_seen": 289743475, "step": 13425, "time_per_iteration": 2.666613817214966 }, { "auxiliary_loss_clip": 0.0107896, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.03484488, "balance_loss_mlp": 1.0198555, "epoch": 0.8072147903201563, "flos": 18113701927680.0, "grad_norm": 2.5661654398107814, "language_loss": 0.75609297, "learning_rate": 3.772632938448923e-07, "loss": 0.77720505, "num_input_tokens_seen": 289761400, "step": 13426, "time_per_iteration": 2.6524770259857178 }, { "auxiliary_loss_clip": 0.01098302, "auxiliary_loss_mlp": 0.0102617, "balance_loss_clip": 1.03656507, "balance_loss_mlp": 1.01461828, "epoch": 0.8072749135728243, "flos": 26688164215680.0, "grad_norm": 1.8886628255914524, "language_loss": 0.72703242, "learning_rate": 3.770356705530997e-07, "loss": 0.74827707, "num_input_tokens_seen": 289781025, "step": 13427, "time_per_iteration": 2.6662089824676514 }, { "auxiliary_loss_clip": 0.01060667, "auxiliary_loss_mlp": 0.01038927, "balance_loss_clip": 1.03814864, "balance_loss_mlp": 1.02555811, "epoch": 0.8073350368254922, "flos": 19240291071360.0, "grad_norm": 1.7110395570969614, "language_loss": 0.70348513, "learning_rate": 3.768081088042774e-07, "loss": 0.72448105, "num_input_tokens_seen": 289798380, "step": 13428, "time_per_iteration": 2.7715890407562256 }, { "auxiliary_loss_clip": 0.01089538, "auxiliary_loss_mlp": 0.0102989, "balance_loss_clip": 1.03667974, "balance_loss_mlp": 1.0185827, "epoch": 0.8073951600781603, "flos": 13334530579200.0, "grad_norm": 2.360374881733329, "language_loss": 0.74510443, "learning_rate": 3.765806086070544e-07, "loss": 0.76629871, "num_input_tokens_seen": 289814515, "step": 13429, "time_per_iteration": 2.6052982807159424 }, { "auxiliary_loss_clip": 0.01096224, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 1.03724742, "balance_loss_mlp": 1.02020407, "epoch": 0.8074552833308282, "flos": 22853191726080.0, "grad_norm": 2.1805515525099466, "language_loss": 0.66939056, "learning_rate": 3.763531699700568e-07, "loss": 0.6906752, "num_input_tokens_seen": 289834315, "step": 13430, "time_per_iteration": 2.6713409423828125 }, { "auxiliary_loss_clip": 0.01068167, "auxiliary_loss_mlp": 0.01029352, "balance_loss_clip": 1.03282046, "balance_loss_mlp": 1.0171392, "epoch": 0.8075154065834962, "flos": 20339409288960.0, "grad_norm": 1.7027899268363083, "language_loss": 0.80057859, "learning_rate": 3.7612579290190994e-07, "loss": 0.82155377, "num_input_tokens_seen": 289853770, "step": 13431, "time_per_iteration": 2.648855447769165 }, { "auxiliary_loss_clip": 0.01084241, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.03625894, "balance_loss_mlp": 1.01611698, "epoch": 0.8075755298361641, "flos": 21908059113600.0, "grad_norm": 1.7686498749229664, "language_loss": 0.80383635, "learning_rate": 3.7589847741123593e-07, "loss": 0.82496452, "num_input_tokens_seen": 289870480, "step": 13432, "time_per_iteration": 2.644226551055908 }, { "auxiliary_loss_clip": 0.01083614, "auxiliary_loss_mlp": 0.01032011, "balance_loss_clip": 1.03852034, "balance_loss_mlp": 1.01924944, "epoch": 0.8076356530888321, "flos": 15669298609920.0, "grad_norm": 7.437398375727633, "language_loss": 0.70418423, "learning_rate": 3.7567122350665415e-07, "loss": 0.72534049, "num_input_tokens_seen": 289888275, "step": 13433, "time_per_iteration": 2.657998561859131 }, { "auxiliary_loss_clip": 0.01083097, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.03641129, "balance_loss_mlp": 1.01629746, "epoch": 0.8076957763415, "flos": 37777414521600.0, "grad_norm": 1.6613480416430744, "language_loss": 0.7224468, "learning_rate": 3.754440311967828e-07, "loss": 0.7435596, "num_input_tokens_seen": 289911495, "step": 13434, "time_per_iteration": 2.787569046020508 }, { "auxiliary_loss_clip": 0.01071783, "auxiliary_loss_mlp": 0.01027721, "balance_loss_clip": 1.03727186, "balance_loss_mlp": 1.01534724, "epoch": 0.807755899594168, "flos": 19610781903360.0, "grad_norm": 1.8325775965674183, "language_loss": 0.67859507, "learning_rate": 3.752169004902361e-07, "loss": 0.69959009, "num_input_tokens_seen": 289930045, "step": 13435, "time_per_iteration": 2.719987154006958 }, { "auxiliary_loss_clip": 0.01065411, "auxiliary_loss_mlp": 0.01033721, "balance_loss_clip": 1.03534269, "balance_loss_mlp": 1.01921952, "epoch": 0.8078160228468361, "flos": 23294893271040.0, "grad_norm": 1.9641244359702266, "language_loss": 0.75152278, "learning_rate": 3.749898313956279e-07, "loss": 0.7725141, "num_input_tokens_seen": 289950815, "step": 13436, "time_per_iteration": 4.378523826599121 }, { "auxiliary_loss_clip": 0.01104889, "auxiliary_loss_mlp": 0.01033319, "balance_loss_clip": 1.03509259, "balance_loss_mlp": 1.02078414, "epoch": 0.807876146099504, "flos": 27162651899520.0, "grad_norm": 2.08988998980751, "language_loss": 0.70339876, "learning_rate": 3.747628239215674e-07, "loss": 0.7247808, "num_input_tokens_seen": 289971730, "step": 13437, "time_per_iteration": 2.7287251949310303 }, { "auxiliary_loss_clip": 0.01081874, "auxiliary_loss_mlp": 0.01034019, "balance_loss_clip": 1.03837299, "balance_loss_mlp": 1.02234817, "epoch": 0.807936269352172, "flos": 27160030206720.0, "grad_norm": 1.7193467545995484, "language_loss": 0.73327583, "learning_rate": 3.745358780766636e-07, "loss": 0.75443482, "num_input_tokens_seen": 289992995, "step": 13438, "time_per_iteration": 2.73563289642334 }, { "auxiliary_loss_clip": 0.01084164, "auxiliary_loss_mlp": 0.01031364, "balance_loss_clip": 1.0364821, "balance_loss_mlp": 1.01958609, "epoch": 0.8079963926048399, "flos": 20740423703040.0, "grad_norm": 1.9218850358156638, "language_loss": 0.77182925, "learning_rate": 3.7430899386952344e-07, "loss": 0.79298449, "num_input_tokens_seen": 290009405, "step": 13439, "time_per_iteration": 4.257704257965088 }, { "auxiliary_loss_clip": 0.01108447, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.0371114, "balance_loss_mlp": 1.01817346, "epoch": 0.8080565158575079, "flos": 25009663622400.0, "grad_norm": 1.5573047662808601, "language_loss": 0.78926432, "learning_rate": 3.7408217130874786e-07, "loss": 0.81065357, "num_input_tokens_seen": 290031085, "step": 13440, "time_per_iteration": 2.61833119392395 }, { "auxiliary_loss_clip": 0.01088716, "auxiliary_loss_mlp": 0.00770828, "balance_loss_clip": 1.03697038, "balance_loss_mlp": 1.00019264, "epoch": 0.8081166391101758, "flos": 18698076293760.0, "grad_norm": 1.6195395418805565, "language_loss": 0.59136355, "learning_rate": 3.7385541040293946e-07, "loss": 0.60995901, "num_input_tokens_seen": 290048670, "step": 13441, "time_per_iteration": 2.6545674800872803 }, { "auxiliary_loss_clip": 0.01097558, "auxiliary_loss_mlp": 0.01033695, "balance_loss_clip": 1.03679454, "balance_loss_mlp": 1.02092791, "epoch": 0.8081767623628439, "flos": 19828651847040.0, "grad_norm": 2.045109695288156, "language_loss": 0.76209891, "learning_rate": 3.7362871116069684e-07, "loss": 0.78341144, "num_input_tokens_seen": 290064085, "step": 13442, "time_per_iteration": 2.579463005065918 }, { "auxiliary_loss_clip": 0.0108635, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.03652704, "balance_loss_mlp": 1.01932859, "epoch": 0.8082368856155118, "flos": 35772952982400.0, "grad_norm": 1.9468682551853589, "language_loss": 0.70523083, "learning_rate": 3.734020735906169e-07, "loss": 0.72640479, "num_input_tokens_seen": 290086255, "step": 13443, "time_per_iteration": 2.768657922744751 }, { "auxiliary_loss_clip": 0.010672, "auxiliary_loss_mlp": 0.01040475, "balance_loss_clip": 1.03662682, "balance_loss_mlp": 1.02816081, "epoch": 0.8082970088681798, "flos": 17198015489280.0, "grad_norm": 2.2011960209089128, "language_loss": 0.82247496, "learning_rate": 3.7317549770129286e-07, "loss": 0.8435517, "num_input_tokens_seen": 290103995, "step": 13444, "time_per_iteration": 4.2101311683654785 }, { "auxiliary_loss_clip": 0.00996531, "auxiliary_loss_mlp": 0.00751439, "balance_loss_clip": 1.01225722, "balance_loss_mlp": 0.99960417, "epoch": 0.8083571321208477, "flos": 63555207511680.0, "grad_norm": 0.8321689368239322, "language_loss": 0.53573275, "learning_rate": 3.7294898350131754e-07, "loss": 0.5532124, "num_input_tokens_seen": 290157245, "step": 13445, "time_per_iteration": 3.0625863075256348 }, { "auxiliary_loss_clip": 0.01071369, "auxiliary_loss_mlp": 0.01031423, "balance_loss_clip": 1.03452253, "balance_loss_mlp": 1.01799452, "epoch": 0.8084172553735157, "flos": 17930701111680.0, "grad_norm": 2.525792385078195, "language_loss": 0.72017092, "learning_rate": 3.7272253099927964e-07, "loss": 0.7411989, "num_input_tokens_seen": 290174970, "step": 13446, "time_per_iteration": 2.7008473873138428 }, { "auxiliary_loss_clip": 0.01084211, "auxiliary_loss_mlp": 0.01031802, "balance_loss_clip": 1.03550613, "balance_loss_mlp": 1.01871324, "epoch": 0.8084773786261836, "flos": 24097999507200.0, "grad_norm": 1.7496181509927413, "language_loss": 0.71567613, "learning_rate": 3.7249614020376606e-07, "loss": 0.73683619, "num_input_tokens_seen": 290194395, "step": 13447, "time_per_iteration": 2.6628973484039307 }, { "auxiliary_loss_clip": 0.01047169, "auxiliary_loss_mlp": 0.01036303, "balance_loss_clip": 1.03517139, "balance_loss_mlp": 1.02175951, "epoch": 0.8085375018788516, "flos": 15588211656960.0, "grad_norm": 3.8730614264516787, "language_loss": 0.74754572, "learning_rate": 3.7226981112336197e-07, "loss": 0.7683804, "num_input_tokens_seen": 290209200, "step": 13448, "time_per_iteration": 2.8440589904785156 }, { "auxiliary_loss_clip": 0.01028882, "auxiliary_loss_mlp": 0.01000792, "balance_loss_clip": 1.00652528, "balance_loss_mlp": 0.99984467, "epoch": 0.8085976251315197, "flos": 67561296393600.0, "grad_norm": 0.7365379441466359, "language_loss": 0.63871992, "learning_rate": 3.7204354376665024e-07, "loss": 0.65901667, "num_input_tokens_seen": 290274565, "step": 13449, "time_per_iteration": 3.194010019302368 }, { "auxiliary_loss_clip": 0.0110053, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.03765333, "balance_loss_mlp": 1.01495767, "epoch": 0.8086577483841876, "flos": 22561453463040.0, "grad_norm": 1.8200797231923929, "language_loss": 0.73743933, "learning_rate": 3.718173381422105e-07, "loss": 0.75872129, "num_input_tokens_seen": 290293630, "step": 13450, "time_per_iteration": 2.6638128757476807 }, { "auxiliary_loss_clip": 0.0108587, "auxiliary_loss_mlp": 0.00770156, "balance_loss_clip": 1.03534913, "balance_loss_mlp": 1.00021505, "epoch": 0.8087178716368556, "flos": 17968084191360.0, "grad_norm": 1.7745684945736697, "language_loss": 0.74215508, "learning_rate": 3.7159119425861986e-07, "loss": 0.76071537, "num_input_tokens_seen": 290311450, "step": 13451, "time_per_iteration": 2.6632473468780518 }, { "auxiliary_loss_clip": 0.0108524, "auxiliary_loss_mlp": 0.0103415, "balance_loss_clip": 1.03462768, "balance_loss_mlp": 1.02030349, "epoch": 0.8087779948895235, "flos": 21719527603200.0, "grad_norm": 1.7259402772912478, "language_loss": 0.80131054, "learning_rate": 3.713651121244543e-07, "loss": 0.82250446, "num_input_tokens_seen": 290330165, "step": 13452, "time_per_iteration": 2.7077267169952393 }, { "auxiliary_loss_clip": 0.01100267, "auxiliary_loss_mlp": 0.01037977, "balance_loss_clip": 1.03795743, "balance_loss_mlp": 1.02595496, "epoch": 0.8088381181421915, "flos": 29092885983360.0, "grad_norm": 1.7921268226395743, "language_loss": 0.78446937, "learning_rate": 3.711390917482875e-07, "loss": 0.80585182, "num_input_tokens_seen": 290350815, "step": 13453, "time_per_iteration": 2.655306339263916 }, { "auxiliary_loss_clip": 0.01056304, "auxiliary_loss_mlp": 0.01031896, "balance_loss_clip": 1.0308342, "balance_loss_mlp": 1.01884866, "epoch": 0.8088982413948594, "flos": 22198432659840.0, "grad_norm": 2.2494726223817882, "language_loss": 0.77063608, "learning_rate": 3.709131331386892e-07, "loss": 0.79151809, "num_input_tokens_seen": 290367380, "step": 13454, "time_per_iteration": 2.711794376373291 }, { "auxiliary_loss_clip": 0.01074594, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.0350008, "balance_loss_mlp": 1.0173552, "epoch": 0.8089583646475275, "flos": 28036717453440.0, "grad_norm": 1.8232411108878894, "language_loss": 0.76701343, "learning_rate": 3.7068723630422795e-07, "loss": 0.78805518, "num_input_tokens_seen": 290387965, "step": 13455, "time_per_iteration": 2.7459137439727783 }, { "auxiliary_loss_clip": 0.01082819, "auxiliary_loss_mlp": 0.01035059, "balance_loss_clip": 1.03439069, "balance_loss_mlp": 1.02137375, "epoch": 0.8090184879001954, "flos": 16617735273600.0, "grad_norm": 1.8300657936441902, "language_loss": 0.79052675, "learning_rate": 3.70461401253471e-07, "loss": 0.81170559, "num_input_tokens_seen": 290404150, "step": 13456, "time_per_iteration": 2.629514455795288 }, { "auxiliary_loss_clip": 0.01108824, "auxiliary_loss_mlp": 0.0103641, "balance_loss_clip": 1.03869963, "balance_loss_mlp": 1.02431631, "epoch": 0.8090786111528634, "flos": 27340804379520.0, "grad_norm": 1.9435022674849403, "language_loss": 0.71875274, "learning_rate": 3.702356279949801e-07, "loss": 0.74020511, "num_input_tokens_seen": 290422370, "step": 13457, "time_per_iteration": 2.6066160202026367 }, { "auxiliary_loss_clip": 0.01088316, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 1.03695107, "balance_loss_mlp": 1.01670051, "epoch": 0.8091387344055313, "flos": 21105742976640.0, "grad_norm": 2.4771443577175196, "language_loss": 0.72800726, "learning_rate": 3.700099165373176e-07, "loss": 0.74916887, "num_input_tokens_seen": 290442645, "step": 13458, "time_per_iteration": 2.670316696166992 }, { "auxiliary_loss_clip": 0.01097692, "auxiliary_loss_mlp": 0.01035982, "balance_loss_clip": 1.03728068, "balance_loss_mlp": 1.02393568, "epoch": 0.8091988576581993, "flos": 11655060318720.0, "grad_norm": 3.9528236134114736, "language_loss": 0.78632605, "learning_rate": 3.6978426688904275e-07, "loss": 0.80766273, "num_input_tokens_seen": 290458520, "step": 13459, "time_per_iteration": 2.6871142387390137 }, { "auxiliary_loss_clip": 0.0108428, "auxiliary_loss_mlp": 0.01028178, "balance_loss_clip": 1.03804088, "balance_loss_mlp": 1.01529706, "epoch": 0.8092589809108672, "flos": 22963329803520.0, "grad_norm": 2.0963523926810073, "language_loss": 0.79731387, "learning_rate": 3.695586790587113e-07, "loss": 0.81843841, "num_input_tokens_seen": 290474465, "step": 13460, "time_per_iteration": 2.7210707664489746 }, { "auxiliary_loss_clip": 0.01085117, "auxiliary_loss_mlp": 0.01033465, "balance_loss_clip": 1.033885, "balance_loss_mlp": 1.02023244, "epoch": 0.8093191041635353, "flos": 13260985482240.0, "grad_norm": 1.8549028601882585, "language_loss": 0.84519565, "learning_rate": 3.693331530548789e-07, "loss": 0.86638141, "num_input_tokens_seen": 290492060, "step": 13461, "time_per_iteration": 2.7760846614837646 }, { "auxiliary_loss_clip": 0.01100089, "auxiliary_loss_mlp": 0.01039455, "balance_loss_clip": 1.03782284, "balance_loss_mlp": 1.02653313, "epoch": 0.8093792274162032, "flos": 25516003691520.0, "grad_norm": 1.8598987285745991, "language_loss": 0.76461577, "learning_rate": 3.69107688886096e-07, "loss": 0.78601122, "num_input_tokens_seen": 290511510, "step": 13462, "time_per_iteration": 2.845400094985962 }, { "auxiliary_loss_clip": 0.01088384, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.03809071, "balance_loss_mlp": 1.02182388, "epoch": 0.8094393506688712, "flos": 23546483107200.0, "grad_norm": 4.497352318555959, "language_loss": 0.83011431, "learning_rate": 3.6888228656091357e-07, "loss": 0.85134763, "num_input_tokens_seen": 290530035, "step": 13463, "time_per_iteration": 2.801821708679199 }, { "auxiliary_loss_clip": 0.01107291, "auxiliary_loss_mlp": 0.01031928, "balance_loss_clip": 1.03720284, "balance_loss_mlp": 1.02069807, "epoch": 0.8094994739215392, "flos": 17055917285760.0, "grad_norm": 2.01920176880003, "language_loss": 0.62397665, "learning_rate": 3.686569460878779e-07, "loss": 0.64536881, "num_input_tokens_seen": 290548245, "step": 13464, "time_per_iteration": 2.7305564880371094 }, { "auxiliary_loss_clip": 0.01106405, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.03645182, "balance_loss_mlp": 1.01739168, "epoch": 0.8095595971742071, "flos": 23551223702400.0, "grad_norm": 1.547589805180524, "language_loss": 0.61729312, "learning_rate": 3.684316674755341e-07, "loss": 0.6386444, "num_input_tokens_seen": 290568625, "step": 13465, "time_per_iteration": 2.61460018157959 }, { "auxiliary_loss_clip": 0.01098999, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.03847957, "balance_loss_mlp": 1.02339816, "epoch": 0.8096197204268751, "flos": 20373201008640.0, "grad_norm": 2.0112821173289035, "language_loss": 0.82087392, "learning_rate": 3.682064507324256e-07, "loss": 0.84222126, "num_input_tokens_seen": 290586575, "step": 13466, "time_per_iteration": 2.65686297416687 }, { "auxiliary_loss_clip": 0.01094893, "auxiliary_loss_mlp": 0.0077026, "balance_loss_clip": 1.03960299, "balance_loss_mlp": 1.00025487, "epoch": 0.809679843679543, "flos": 27818775682560.0, "grad_norm": 1.8295052098367424, "language_loss": 0.75791496, "learning_rate": 3.6798129586709204e-07, "loss": 0.77656651, "num_input_tokens_seen": 290606790, "step": 13467, "time_per_iteration": 2.7522189617156982 }, { "auxiliary_loss_clip": 0.01073167, "auxiliary_loss_mlp": 0.01031375, "balance_loss_clip": 1.03119135, "balance_loss_mlp": 1.0187211, "epoch": 0.8097399669322111, "flos": 22014103040640.0, "grad_norm": 2.4131060616703484, "language_loss": 0.78938639, "learning_rate": 3.6775620288807073e-07, "loss": 0.81043178, "num_input_tokens_seen": 290625525, "step": 13468, "time_per_iteration": 2.7481191158294678 }, { "auxiliary_loss_clip": 0.01095827, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.03550076, "balance_loss_mlp": 1.0147531, "epoch": 0.809800090184879, "flos": 18988988544000.0, "grad_norm": 1.863036262504337, "language_loss": 0.67737466, "learning_rate": 3.675311718038978e-07, "loss": 0.69859189, "num_input_tokens_seen": 290644935, "step": 13469, "time_per_iteration": 2.6411309242248535 }, { "auxiliary_loss_clip": 0.01000462, "auxiliary_loss_mlp": 0.01006561, "balance_loss_clip": 1.00773001, "balance_loss_mlp": 1.00516653, "epoch": 0.809860213437547, "flos": 66099516508800.0, "grad_norm": 0.6947384431465805, "language_loss": 0.54638267, "learning_rate": 3.6730620262310683e-07, "loss": 0.56645286, "num_input_tokens_seen": 290710735, "step": 13470, "time_per_iteration": 3.368800401687622 }, { "auxiliary_loss_clip": 0.01106442, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.03582871, "balance_loss_mlp": 1.01704264, "epoch": 0.8099203366902149, "flos": 20882485992960.0, "grad_norm": 2.380568963099226, "language_loss": 0.69673979, "learning_rate": 3.670812953542279e-07, "loss": 0.7180903, "num_input_tokens_seen": 290729565, "step": 13471, "time_per_iteration": 2.6116278171539307 }, { "auxiliary_loss_clip": 0.01099408, "auxiliary_loss_mlp": 0.01028634, "balance_loss_clip": 1.03811002, "balance_loss_mlp": 1.01664793, "epoch": 0.8099804599428829, "flos": 26030927111040.0, "grad_norm": 1.7876254964812721, "language_loss": 0.79963589, "learning_rate": 3.6685645000579003e-07, "loss": 0.82091635, "num_input_tokens_seen": 290749360, "step": 13472, "time_per_iteration": 2.656299114227295 }, { "auxiliary_loss_clip": 0.01020676, "auxiliary_loss_mlp": 0.01001704, "balance_loss_clip": 1.00737977, "balance_loss_mlp": 1.00073814, "epoch": 0.8100405831955508, "flos": 69303573584640.0, "grad_norm": 0.7639440927647733, "language_loss": 0.57809198, "learning_rate": 3.666316665863201e-07, "loss": 0.59831583, "num_input_tokens_seen": 290812145, "step": 13473, "time_per_iteration": 3.1810851097106934 }, { "auxiliary_loss_clip": 0.01058837, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.03626239, "balance_loss_mlp": 1.01749909, "epoch": 0.8101007064482189, "flos": 15012492468480.0, "grad_norm": 1.6585074003521199, "language_loss": 0.73932016, "learning_rate": 3.664069451043399e-07, "loss": 0.76021028, "num_input_tokens_seen": 290829845, "step": 13474, "time_per_iteration": 2.7276382446289062 }, { "auxiliary_loss_clip": 0.01096806, "auxiliary_loss_mlp": 0.01037708, "balance_loss_clip": 1.0382781, "balance_loss_mlp": 1.02569187, "epoch": 0.8101608297008868, "flos": 21067210661760.0, "grad_norm": 1.8626875728515482, "language_loss": 0.78847283, "learning_rate": 3.661822855683723e-07, "loss": 0.80981803, "num_input_tokens_seen": 290848815, "step": 13475, "time_per_iteration": 4.543492078781128 }, { "auxiliary_loss_clip": 0.01096436, "auxiliary_loss_mlp": 0.01035936, "balance_loss_clip": 1.03686523, "balance_loss_mlp": 1.02425337, "epoch": 0.8102209529535548, "flos": 23731279603200.0, "grad_norm": 1.6073691325832198, "language_loss": 0.75316137, "learning_rate": 3.659576879869364e-07, "loss": 0.77448511, "num_input_tokens_seen": 290868580, "step": 13476, "time_per_iteration": 4.139512062072754 }, { "auxiliary_loss_clip": 0.01089782, "auxiliary_loss_mlp": 0.01036786, "balance_loss_clip": 1.03533959, "balance_loss_mlp": 1.02327955, "epoch": 0.8102810762062228, "flos": 10955879107200.0, "grad_norm": 2.306148776958402, "language_loss": 0.73640966, "learning_rate": 3.657331523685485e-07, "loss": 0.75767529, "num_input_tokens_seen": 290883540, "step": 13477, "time_per_iteration": 2.632864236831665 }, { "auxiliary_loss_clip": 0.01083746, "auxiliary_loss_mlp": 0.01035491, "balance_loss_clip": 1.03832769, "balance_loss_mlp": 1.02404094, "epoch": 0.8103411994588907, "flos": 14648825220480.0, "grad_norm": 2.218286037812516, "language_loss": 0.69816357, "learning_rate": 3.6550867872172365e-07, "loss": 0.71935594, "num_input_tokens_seen": 290901560, "step": 13478, "time_per_iteration": 4.151829242706299 }, { "auxiliary_loss_clip": 0.01028235, "auxiliary_loss_mlp": 0.01001319, "balance_loss_clip": 1.00567842, "balance_loss_mlp": 1.00037754, "epoch": 0.8104013227115587, "flos": 59153314665600.0, "grad_norm": 0.6814078364409648, "language_loss": 0.52150369, "learning_rate": 3.6528426705497293e-07, "loss": 0.54179931, "num_input_tokens_seen": 290959185, "step": 13479, "time_per_iteration": 3.0655500888824463 }, { "auxiliary_loss_clip": 0.01055198, "auxiliary_loss_mlp": 0.01032847, "balance_loss_clip": 1.03287351, "balance_loss_mlp": 1.02027011, "epoch": 0.8104614459642266, "flos": 19828687760640.0, "grad_norm": 1.696157853255504, "language_loss": 0.7152276, "learning_rate": 3.650599173768072e-07, "loss": 0.73610806, "num_input_tokens_seen": 290979585, "step": 13480, "time_per_iteration": 2.7024738788604736 }, { "auxiliary_loss_clip": 0.01108515, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.03706646, "balance_loss_mlp": 1.01809144, "epoch": 0.8105215692168947, "flos": 25374264624000.0, "grad_norm": 1.7323825430226805, "language_loss": 0.7977457, "learning_rate": 3.648356296957327e-07, "loss": 0.81913185, "num_input_tokens_seen": 291000865, "step": 13481, "time_per_iteration": 2.5942976474761963 }, { "auxiliary_loss_clip": 0.01085323, "auxiliary_loss_mlp": 0.01030766, "balance_loss_clip": 1.03626788, "balance_loss_mlp": 1.01913691, "epoch": 0.8105816924695626, "flos": 20481722974080.0, "grad_norm": 2.2690231246252686, "language_loss": 0.72909606, "learning_rate": 3.646114040202548e-07, "loss": 0.75025702, "num_input_tokens_seen": 291018285, "step": 13482, "time_per_iteration": 2.6350491046905518 }, { "auxiliary_loss_clip": 0.01044563, "auxiliary_loss_mlp": 0.01026969, "balance_loss_clip": 1.03307259, "balance_loss_mlp": 1.01405859, "epoch": 0.8106418157222306, "flos": 14538687143040.0, "grad_norm": 2.648947443807841, "language_loss": 0.65993869, "learning_rate": 3.6438724035887705e-07, "loss": 0.68065393, "num_input_tokens_seen": 291035745, "step": 13483, "time_per_iteration": 4.212749719619751 }, { "auxiliary_loss_clip": 0.01080725, "auxiliary_loss_mlp": 0.01028092, "balance_loss_clip": 1.03347242, "balance_loss_mlp": 1.01528955, "epoch": 0.8107019389748985, "flos": 22564470205440.0, "grad_norm": 1.846296028434146, "language_loss": 0.76504505, "learning_rate": 3.641631387200992e-07, "loss": 0.78613329, "num_input_tokens_seen": 291053280, "step": 13484, "time_per_iteration": 2.6466033458709717 }, { "auxiliary_loss_clip": 0.01091182, "auxiliary_loss_mlp": 0.01033252, "balance_loss_clip": 1.03665829, "balance_loss_mlp": 1.01950169, "epoch": 0.8107620622275665, "flos": 19609560840960.0, "grad_norm": 1.4911204923404016, "language_loss": 0.72589421, "learning_rate": 3.639390991124183e-07, "loss": 0.74713862, "num_input_tokens_seen": 291072855, "step": 13485, "time_per_iteration": 2.7968504428863525 }, { "auxiliary_loss_clip": 0.01060159, "auxiliary_loss_mlp": 0.01037186, "balance_loss_clip": 1.02962208, "balance_loss_mlp": 1.02368569, "epoch": 0.8108221854802344, "flos": 16143498984960.0, "grad_norm": 1.7176036308983489, "language_loss": 0.75729263, "learning_rate": 3.637151215443308e-07, "loss": 0.77826607, "num_input_tokens_seen": 291090285, "step": 13486, "time_per_iteration": 2.652395725250244 }, { "auxiliary_loss_clip": 0.01089867, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.03695536, "balance_loss_mlp": 1.01949561, "epoch": 0.8108823087329025, "flos": 21106209853440.0, "grad_norm": 1.9697831053353734, "language_loss": 0.72577608, "learning_rate": 3.6349120602433045e-07, "loss": 0.74698949, "num_input_tokens_seen": 291107675, "step": 13487, "time_per_iteration": 2.5947606563568115 }, { "auxiliary_loss_clip": 0.01046594, "auxiliary_loss_mlp": 0.0103374, "balance_loss_clip": 1.03592014, "balance_loss_mlp": 1.02142572, "epoch": 0.8109424319855704, "flos": 29199648182400.0, "grad_norm": 2.098798308260877, "language_loss": 0.84576857, "learning_rate": 3.6326735256090715e-07, "loss": 0.8665719, "num_input_tokens_seen": 291126900, "step": 13488, "time_per_iteration": 2.793503761291504 }, { "auxiliary_loss_clip": 0.01111048, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.03847623, "balance_loss_mlp": 1.02028883, "epoch": 0.8110025552382384, "flos": 23111856541440.0, "grad_norm": 1.9295150325791675, "language_loss": 0.73623288, "learning_rate": 3.630435611625502e-07, "loss": 0.75767034, "num_input_tokens_seen": 291145285, "step": 13489, "time_per_iteration": 2.599238395690918 }, { "auxiliary_loss_clip": 0.01065923, "auxiliary_loss_mlp": 0.00769841, "balance_loss_clip": 1.03703368, "balance_loss_mlp": 1.00027084, "epoch": 0.8110626784909064, "flos": 22379961018240.0, "grad_norm": 1.5399542352120712, "language_loss": 0.71757072, "learning_rate": 3.628198318377453e-07, "loss": 0.73592842, "num_input_tokens_seen": 291163485, "step": 13490, "time_per_iteration": 2.830582857131958 }, { "auxiliary_loss_clip": 0.01077295, "auxiliary_loss_mlp": 0.01050998, "balance_loss_clip": 1.03662229, "balance_loss_mlp": 1.03582263, "epoch": 0.8111228017435743, "flos": 23368043318400.0, "grad_norm": 2.1733695936937103, "language_loss": 0.71880186, "learning_rate": 3.625961645949762e-07, "loss": 0.74008483, "num_input_tokens_seen": 291182215, "step": 13491, "time_per_iteration": 2.788850784301758 }, { "auxiliary_loss_clip": 0.01107942, "auxiliary_loss_mlp": 0.01029919, "balance_loss_clip": 1.03627849, "balance_loss_mlp": 1.01822448, "epoch": 0.8111829249962423, "flos": 21286553063040.0, "grad_norm": 1.3514463639541505, "language_loss": 0.67817056, "learning_rate": 3.623725594427245e-07, "loss": 0.6995492, "num_input_tokens_seen": 291203145, "step": 13492, "time_per_iteration": 2.6831281185150146 }, { "auxiliary_loss_clip": 0.01064465, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.03531671, "balance_loss_mlp": 1.01716399, "epoch": 0.8112430482489102, "flos": 22345558767360.0, "grad_norm": 1.6581742417712237, "language_loss": 0.71983981, "learning_rate": 3.6214901638947006e-07, "loss": 0.74077779, "num_input_tokens_seen": 291220600, "step": 13493, "time_per_iteration": 2.7153713703155518 }, { "auxiliary_loss_clip": 0.01091343, "auxiliary_loss_mlp": 0.01038969, "balance_loss_clip": 1.03388119, "balance_loss_mlp": 1.02628565, "epoch": 0.8113031715015783, "flos": 31138321962240.0, "grad_norm": 1.76439624492188, "language_loss": 0.70763975, "learning_rate": 3.619255354436885e-07, "loss": 0.72894287, "num_input_tokens_seen": 291241195, "step": 13494, "time_per_iteration": 2.6391232013702393 }, { "auxiliary_loss_clip": 0.01100106, "auxiliary_loss_mlp": 0.01033523, "balance_loss_clip": 1.03767419, "balance_loss_mlp": 1.02014816, "epoch": 0.8113632947542462, "flos": 25335445000320.0, "grad_norm": 1.998515349302589, "language_loss": 0.76569247, "learning_rate": 3.6170211661385543e-07, "loss": 0.78702873, "num_input_tokens_seen": 291258715, "step": 13495, "time_per_iteration": 2.588968515396118 }, { "auxiliary_loss_clip": 0.01089895, "auxiliary_loss_mlp": 0.01036707, "balance_loss_clip": 1.03693032, "balance_loss_mlp": 1.02406478, "epoch": 0.8114234180069142, "flos": 28439168411520.0, "grad_norm": 1.859703318738602, "language_loss": 0.80103755, "learning_rate": 3.614787599084417e-07, "loss": 0.82230359, "num_input_tokens_seen": 291278030, "step": 13496, "time_per_iteration": 2.612717390060425 }, { "auxiliary_loss_clip": 0.01098421, "auxiliary_loss_mlp": 0.01031509, "balance_loss_clip": 1.0357132, "balance_loss_mlp": 1.01813412, "epoch": 0.8114835412595821, "flos": 20338870584960.0, "grad_norm": 1.7561157530405371, "language_loss": 0.71104527, "learning_rate": 3.6125546533591787e-07, "loss": 0.73234457, "num_input_tokens_seen": 291296740, "step": 13497, "time_per_iteration": 2.505080461502075 }, { "auxiliary_loss_clip": 0.01073865, "auxiliary_loss_mlp": 0.01031093, "balance_loss_clip": 1.03375506, "balance_loss_mlp": 1.01949358, "epoch": 0.8115436645122501, "flos": 22490889194880.0, "grad_norm": 1.5327513158552182, "language_loss": 0.76614642, "learning_rate": 3.610322329047508e-07, "loss": 0.78719592, "num_input_tokens_seen": 291318730, "step": 13498, "time_per_iteration": 2.6442582607269287 }, { "auxiliary_loss_clip": 0.01109819, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.03731918, "balance_loss_mlp": 1.02193928, "epoch": 0.811603787764918, "flos": 13845288021120.0, "grad_norm": 3.6462574824728313, "language_loss": 0.84119499, "learning_rate": 3.608090626234055e-07, "loss": 0.86263865, "num_input_tokens_seen": 291336755, "step": 13499, "time_per_iteration": 2.483522653579712 }, { "auxiliary_loss_clip": 0.01075443, "auxiliary_loss_mlp": 0.01031753, "balance_loss_clip": 1.03560185, "balance_loss_mlp": 1.01798427, "epoch": 0.8116639110175861, "flos": 21614632911360.0, "grad_norm": 1.449342518398089, "language_loss": 0.76081306, "learning_rate": 3.6058595450034603e-07, "loss": 0.78188503, "num_input_tokens_seen": 291356795, "step": 13500, "time_per_iteration": 2.605076313018799 }, { "auxiliary_loss_clip": 0.01008001, "auxiliary_loss_mlp": 0.00999684, "balance_loss_clip": 1.00579894, "balance_loss_mlp": 0.9987666, "epoch": 0.811724034270254, "flos": 64459799625600.0, "grad_norm": 0.8052955879746776, "language_loss": 0.59879011, "learning_rate": 3.603629085440303e-07, "loss": 0.61886698, "num_input_tokens_seen": 291416005, "step": 13501, "time_per_iteration": 3.1991348266601562 }, { "auxiliary_loss_clip": 0.01094365, "auxiliary_loss_mlp": 0.01025632, "balance_loss_clip": 1.03644705, "balance_loss_mlp": 1.01366997, "epoch": 0.811784157522922, "flos": 24754123290240.0, "grad_norm": 1.57620399349307, "language_loss": 0.79127729, "learning_rate": 3.6013992476291753e-07, "loss": 0.81247723, "num_input_tokens_seen": 291434870, "step": 13502, "time_per_iteration": 2.612614154815674 }, { "auxiliary_loss_clip": 0.01081743, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.03356886, "balance_loss_mlp": 1.02457452, "epoch": 0.81184428077559, "flos": 12167146563840.0, "grad_norm": 1.8233893425242464, "language_loss": 0.71166331, "learning_rate": 3.599170031654635e-07, "loss": 0.73285973, "num_input_tokens_seen": 291452230, "step": 13503, "time_per_iteration": 2.61946964263916 }, { "auxiliary_loss_clip": 0.01079859, "auxiliary_loss_mlp": 0.01030554, "balance_loss_clip": 1.03437757, "balance_loss_mlp": 1.01704192, "epoch": 0.8119044040282579, "flos": 44422037775360.0, "grad_norm": 1.451264981868303, "language_loss": 0.67748487, "learning_rate": 3.5969414376012065e-07, "loss": 0.69858897, "num_input_tokens_seen": 291477425, "step": 13504, "time_per_iteration": 2.850944995880127 }, { "auxiliary_loss_clip": 0.01081144, "auxiliary_loss_mlp": 0.01032065, "balance_loss_clip": 1.03502822, "balance_loss_mlp": 1.01892805, "epoch": 0.8119645272809259, "flos": 52155507957120.0, "grad_norm": 6.247382834720127, "language_loss": 0.74465597, "learning_rate": 3.594713465553403e-07, "loss": 0.76578808, "num_input_tokens_seen": 291501070, "step": 13505, "time_per_iteration": 2.9765865802764893 }, { "auxiliary_loss_clip": 0.01085863, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.03640378, "balance_loss_mlp": 1.01850462, "epoch": 0.8120246505335939, "flos": 30232978640640.0, "grad_norm": 1.9161202158648818, "language_loss": 0.73033476, "learning_rate": 3.5924861155957123e-07, "loss": 0.75150955, "num_input_tokens_seen": 291524945, "step": 13506, "time_per_iteration": 2.7573962211608887 }, { "auxiliary_loss_clip": 0.01114046, "auxiliary_loss_mlp": 0.01031523, "balance_loss_clip": 1.03798163, "balance_loss_mlp": 1.01900649, "epoch": 0.8120847737862619, "flos": 22127652910080.0, "grad_norm": 2.1835357751588664, "language_loss": 0.75858426, "learning_rate": 3.590259387812593e-07, "loss": 0.78003991, "num_input_tokens_seen": 291544605, "step": 13507, "time_per_iteration": 2.5932223796844482 }, { "auxiliary_loss_clip": 0.01110179, "auxiliary_loss_mlp": 0.01028613, "balance_loss_clip": 1.03553689, "balance_loss_mlp": 1.01663268, "epoch": 0.8121448970389298, "flos": 23295180579840.0, "grad_norm": 1.789952904121179, "language_loss": 0.70542687, "learning_rate": 3.5880332822884783e-07, "loss": 0.72681475, "num_input_tokens_seen": 291563850, "step": 13508, "time_per_iteration": 2.6186447143554688 }, { "auxiliary_loss_clip": 0.01096661, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.03715634, "balance_loss_mlp": 1.02163088, "epoch": 0.8122050202915978, "flos": 22164138149760.0, "grad_norm": 2.344890772054223, "language_loss": 0.75989026, "learning_rate": 3.585807799107785e-07, "loss": 0.78119051, "num_input_tokens_seen": 291581730, "step": 13509, "time_per_iteration": 2.589594841003418 }, { "auxiliary_loss_clip": 0.01110373, "auxiliary_loss_mlp": 0.01030617, "balance_loss_clip": 1.03736436, "balance_loss_mlp": 1.01834416, "epoch": 0.8122651435442657, "flos": 23258946735360.0, "grad_norm": 1.705010281678355, "language_loss": 0.76900029, "learning_rate": 3.58358293835491e-07, "loss": 0.79041028, "num_input_tokens_seen": 291601225, "step": 13510, "time_per_iteration": 2.6146199703216553 }, { "auxiliary_loss_clip": 0.01099011, "auxiliary_loss_mlp": 0.01035544, "balance_loss_clip": 1.0352056, "balance_loss_mlp": 1.02240086, "epoch": 0.8123252667969337, "flos": 16140015365760.0, "grad_norm": 2.0243900954468446, "language_loss": 0.69868124, "learning_rate": 3.581358700114212e-07, "loss": 0.72002673, "num_input_tokens_seen": 291616995, "step": 13511, "time_per_iteration": 2.5841333866119385 }, { "auxiliary_loss_clip": 0.0108991, "auxiliary_loss_mlp": 0.01035297, "balance_loss_clip": 1.03681922, "balance_loss_mlp": 1.0227139, "epoch": 0.8123853900496016, "flos": 21245399055360.0, "grad_norm": 1.8851833910284228, "language_loss": 0.79274458, "learning_rate": 3.57913508447004e-07, "loss": 0.81399667, "num_input_tokens_seen": 291636145, "step": 13512, "time_per_iteration": 2.6589250564575195 }, { "auxiliary_loss_clip": 0.01096941, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 1.0360527, "balance_loss_mlp": 1.01692605, "epoch": 0.8124455133022697, "flos": 64377596373120.0, "grad_norm": 1.5979406334713135, "language_loss": 0.63230824, "learning_rate": 3.5769120915067076e-07, "loss": 0.65356594, "num_input_tokens_seen": 291662440, "step": 13513, "time_per_iteration": 2.990612030029297 }, { "auxiliary_loss_clip": 0.01058491, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.03237724, "balance_loss_mlp": 1.02057886, "epoch": 0.8125056365549376, "flos": 23842207779840.0, "grad_norm": 1.8613542328195332, "language_loss": 0.71270061, "learning_rate": 3.5746897213085194e-07, "loss": 0.73361969, "num_input_tokens_seen": 291680950, "step": 13514, "time_per_iteration": 4.445888519287109 }, { "auxiliary_loss_clip": 0.01073863, "auxiliary_loss_mlp": 0.01030266, "balance_loss_clip": 1.03600311, "balance_loss_mlp": 1.01780891, "epoch": 0.8125657598076056, "flos": 23550325862400.0, "grad_norm": 1.5528453792894483, "language_loss": 0.62748504, "learning_rate": 3.5724679739597364e-07, "loss": 0.64852631, "num_input_tokens_seen": 291702395, "step": 13515, "time_per_iteration": 4.218576192855835 }, { "auxiliary_loss_clip": 0.01102975, "auxiliary_loss_mlp": 0.00769685, "balance_loss_clip": 1.03534853, "balance_loss_mlp": 1.00016952, "epoch": 0.8126258830602736, "flos": 20704225772160.0, "grad_norm": 5.114192306914355, "language_loss": 0.75868255, "learning_rate": 3.570246849544616e-07, "loss": 0.7774092, "num_input_tokens_seen": 291721135, "step": 13516, "time_per_iteration": 2.6113100051879883 }, { "auxiliary_loss_clip": 0.0105995, "auxiliary_loss_mlp": 0.01028758, "balance_loss_clip": 1.03563333, "balance_loss_mlp": 1.01692092, "epoch": 0.8126860063129415, "flos": 23618160696960.0, "grad_norm": 1.4792626024719475, "language_loss": 0.91386318, "learning_rate": 3.5680263481473907e-07, "loss": 0.9347502, "num_input_tokens_seen": 291741235, "step": 13517, "time_per_iteration": 2.730743408203125 }, { "auxiliary_loss_clip": 0.01101276, "auxiliary_loss_mlp": 0.00770067, "balance_loss_clip": 1.03974628, "balance_loss_mlp": 1.00018048, "epoch": 0.8127461295656095, "flos": 25007149670400.0, "grad_norm": 1.60312479075504, "language_loss": 0.78797936, "learning_rate": 3.565806469852244e-07, "loss": 0.80669272, "num_input_tokens_seen": 291761430, "step": 13518, "time_per_iteration": 4.1632232666015625 }, { "auxiliary_loss_clip": 0.01096668, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.03643668, "balance_loss_mlp": 1.01815319, "epoch": 0.8128062528182775, "flos": 27342169096320.0, "grad_norm": 1.5642213115065133, "language_loss": 0.78870726, "learning_rate": 3.56358721474336e-07, "loss": 0.80996728, "num_input_tokens_seen": 291781755, "step": 13519, "time_per_iteration": 2.681206226348877 }, { "auxiliary_loss_clip": 0.01109503, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.03672457, "balance_loss_mlp": 1.02139187, "epoch": 0.8128663760709455, "flos": 26506312634880.0, "grad_norm": 1.5860289304426558, "language_loss": 0.70635796, "learning_rate": 3.561368582904905e-07, "loss": 0.72778636, "num_input_tokens_seen": 291804410, "step": 13520, "time_per_iteration": 2.6522674560546875 }, { "auxiliary_loss_clip": 0.01093185, "auxiliary_loss_mlp": 0.01033091, "balance_loss_clip": 1.03861439, "balance_loss_mlp": 1.02049041, "epoch": 0.8129264993236134, "flos": 17931239815680.0, "grad_norm": 1.3829440346213167, "language_loss": 0.7262553, "learning_rate": 3.5591505744209925e-07, "loss": 0.74751806, "num_input_tokens_seen": 291823285, "step": 13521, "time_per_iteration": 2.7141287326812744 }, { "auxiliary_loss_clip": 0.01101075, "auxiliary_loss_mlp": 0.01030446, "balance_loss_clip": 1.03675306, "balance_loss_mlp": 1.01783347, "epoch": 0.8129866225762814, "flos": 26177694082560.0, "grad_norm": 1.8750216603820857, "language_loss": 0.70207542, "learning_rate": 3.5569331893757394e-07, "loss": 0.7233907, "num_input_tokens_seen": 291845305, "step": 13522, "time_per_iteration": 2.7125802040100098 }, { "auxiliary_loss_clip": 0.01093707, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.03634274, "balance_loss_mlp": 1.02078998, "epoch": 0.8130467458289493, "flos": 21032197879680.0, "grad_norm": 1.6227036980267577, "language_loss": 0.7064119, "learning_rate": 3.554716427853233e-07, "loss": 0.72767031, "num_input_tokens_seen": 291863715, "step": 13523, "time_per_iteration": 4.0815582275390625 }, { "auxiliary_loss_clip": 0.01096974, "auxiliary_loss_mlp": 0.01031155, "balance_loss_clip": 1.03545976, "balance_loss_mlp": 1.01855457, "epoch": 0.8131068690816173, "flos": 15487051979520.0, "grad_norm": 2.0371214657099435, "language_loss": 0.70833939, "learning_rate": 3.5525002899375256e-07, "loss": 0.7296207, "num_input_tokens_seen": 291880735, "step": 13524, "time_per_iteration": 2.6003952026367188 }, { "auxiliary_loss_clip": 0.01095723, "auxiliary_loss_mlp": 0.01030696, "balance_loss_clip": 1.03494859, "balance_loss_mlp": 1.01905477, "epoch": 0.8131669923342852, "flos": 29351227576320.0, "grad_norm": 1.8442373927503088, "language_loss": 0.62811154, "learning_rate": 3.550284775712653e-07, "loss": 0.64937574, "num_input_tokens_seen": 291900535, "step": 13525, "time_per_iteration": 2.657466411590576 }, { "auxiliary_loss_clip": 0.01079403, "auxiliary_loss_mlp": 0.01036654, "balance_loss_clip": 1.03601646, "balance_loss_mlp": 1.02482271, "epoch": 0.8132271155869533, "flos": 35256162055680.0, "grad_norm": 1.777429638442415, "language_loss": 0.65313601, "learning_rate": 3.548069885262628e-07, "loss": 0.67429662, "num_input_tokens_seen": 291919760, "step": 13526, "time_per_iteration": 2.7304532527923584 }, { "auxiliary_loss_clip": 0.01083448, "auxiliary_loss_mlp": 0.0102643, "balance_loss_clip": 1.03576994, "balance_loss_mlp": 1.01569486, "epoch": 0.8132872388396212, "flos": 27781895393280.0, "grad_norm": 1.6432032258331546, "language_loss": 0.75642002, "learning_rate": 3.5458556186714473e-07, "loss": 0.77751887, "num_input_tokens_seen": 291938915, "step": 13527, "time_per_iteration": 2.667377471923828 }, { "auxiliary_loss_clip": 0.01107517, "auxiliary_loss_mlp": 0.01026255, "balance_loss_clip": 1.03658116, "balance_loss_mlp": 1.01454246, "epoch": 0.8133473620922892, "flos": 27819601695360.0, "grad_norm": 2.105634119207497, "language_loss": 0.70704925, "learning_rate": 3.5436419760230706e-07, "loss": 0.728387, "num_input_tokens_seen": 291958145, "step": 13528, "time_per_iteration": 2.6163675785064697 }, { "auxiliary_loss_clip": 0.01108822, "auxiliary_loss_mlp": 0.01030423, "balance_loss_clip": 1.03638566, "balance_loss_mlp": 1.01875806, "epoch": 0.8134074853449572, "flos": 18989527248000.0, "grad_norm": 1.9964746744268802, "language_loss": 0.69046456, "learning_rate": 3.5414289574014357e-07, "loss": 0.71185702, "num_input_tokens_seen": 291976860, "step": 13529, "time_per_iteration": 2.5404155254364014 }, { "auxiliary_loss_clip": 0.01089372, "auxiliary_loss_mlp": 0.01031807, "balance_loss_clip": 1.03602207, "balance_loss_mlp": 1.02028584, "epoch": 0.8134676085976251, "flos": 24242863057920.0, "grad_norm": 2.1717910963600615, "language_loss": 0.77427143, "learning_rate": 3.5392165628904635e-07, "loss": 0.79548317, "num_input_tokens_seen": 291998085, "step": 13530, "time_per_iteration": 2.617090940475464 }, { "auxiliary_loss_clip": 0.01097307, "auxiliary_loss_mlp": 0.01035822, "balance_loss_clip": 1.03674924, "balance_loss_mlp": 1.02292991, "epoch": 0.8135277318502931, "flos": 19062389986560.0, "grad_norm": 4.052594441167417, "language_loss": 0.81679058, "learning_rate": 3.537004792574052e-07, "loss": 0.83812189, "num_input_tokens_seen": 292016585, "step": 13531, "time_per_iteration": 2.6205062866210938 }, { "auxiliary_loss_clip": 0.01084413, "auxiliary_loss_mlp": 0.01034781, "balance_loss_clip": 1.03383708, "balance_loss_mlp": 1.02089322, "epoch": 0.813587855102961, "flos": 17269728992640.0, "grad_norm": 1.9466339664768382, "language_loss": 0.72048044, "learning_rate": 3.534793646536065e-07, "loss": 0.7416724, "num_input_tokens_seen": 292033255, "step": 13532, "time_per_iteration": 2.6235249042510986 }, { "auxiliary_loss_clip": 0.01076826, "auxiliary_loss_mlp": 0.01028928, "balance_loss_clip": 1.03568232, "balance_loss_mlp": 1.01717389, "epoch": 0.8136479783556291, "flos": 20157593621760.0, "grad_norm": 1.944089941040769, "language_loss": 0.7643801, "learning_rate": 3.5325831248603533e-07, "loss": 0.78543758, "num_input_tokens_seen": 292051800, "step": 13533, "time_per_iteration": 2.686540126800537 }, { "auxiliary_loss_clip": 0.01112795, "auxiliary_loss_mlp": 0.0077037, "balance_loss_clip": 1.03745687, "balance_loss_mlp": 1.00021124, "epoch": 0.813708101608297, "flos": 22052348046720.0, "grad_norm": 1.645533384240896, "language_loss": 0.76579952, "learning_rate": 3.5303732276307495e-07, "loss": 0.78463125, "num_input_tokens_seen": 292072215, "step": 13534, "time_per_iteration": 2.6405410766601562 }, { "auxiliary_loss_clip": 0.01090662, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.03678405, "balance_loss_mlp": 1.01722336, "epoch": 0.813768224860965, "flos": 16173412035840.0, "grad_norm": 2.1008954563080153, "language_loss": 0.93045878, "learning_rate": 3.5281639549310336e-07, "loss": 0.95164573, "num_input_tokens_seen": 292088830, "step": 13535, "time_per_iteration": 2.64209246635437 }, { "auxiliary_loss_clip": 0.01071147, "auxiliary_loss_mlp": 0.01027139, "balance_loss_clip": 1.0390265, "balance_loss_mlp": 1.01590967, "epoch": 0.8138283481136329, "flos": 24352318776960.0, "grad_norm": 1.5385602481593355, "language_loss": 0.70752996, "learning_rate": 3.52595530684499e-07, "loss": 0.72851282, "num_input_tokens_seen": 292109225, "step": 13536, "time_per_iteration": 2.80938720703125 }, { "auxiliary_loss_clip": 0.01072251, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.034621, "balance_loss_mlp": 1.01691151, "epoch": 0.8138884713663009, "flos": 25516362827520.0, "grad_norm": 1.7915852283109845, "language_loss": 0.75374007, "learning_rate": 3.5237472834563775e-07, "loss": 0.77475834, "num_input_tokens_seen": 292129660, "step": 13537, "time_per_iteration": 2.709963798522949 }, { "auxiliary_loss_clip": 0.01083975, "auxiliary_loss_mlp": 0.01039156, "balance_loss_clip": 1.03624582, "balance_loss_mlp": 1.02596569, "epoch": 0.8139485946189688, "flos": 22454368041600.0, "grad_norm": 1.531988851802544, "language_loss": 0.76327688, "learning_rate": 3.5215398848489163e-07, "loss": 0.78450817, "num_input_tokens_seen": 292149090, "step": 13538, "time_per_iteration": 2.659142255783081 }, { "auxiliary_loss_clip": 0.01090459, "auxiliary_loss_mlp": 0.01029562, "balance_loss_clip": 1.03432798, "balance_loss_mlp": 1.01791525, "epoch": 0.8140087178716369, "flos": 21250391045760.0, "grad_norm": 1.5412733274323733, "language_loss": 0.78075993, "learning_rate": 3.5193331111063176e-07, "loss": 0.80196011, "num_input_tokens_seen": 292169260, "step": 13539, "time_per_iteration": 2.637423515319824 }, { "auxiliary_loss_clip": 0.01073968, "auxiliary_loss_mlp": 0.01029706, "balance_loss_clip": 1.04544592, "balance_loss_mlp": 1.01841712, "epoch": 0.8140688411243048, "flos": 39415730774400.0, "grad_norm": 2.1381487111045145, "language_loss": 0.66290975, "learning_rate": 3.5171269623122533e-07, "loss": 0.68394649, "num_input_tokens_seen": 292188145, "step": 13540, "time_per_iteration": 2.8771181106567383 }, { "auxiliary_loss_clip": 0.01101069, "auxiliary_loss_mlp": 0.01033274, "balance_loss_clip": 1.03914928, "balance_loss_mlp": 1.02268267, "epoch": 0.8141289643769728, "flos": 25415885508480.0, "grad_norm": 1.6197380880165504, "language_loss": 0.67438757, "learning_rate": 3.5149214385503913e-07, "loss": 0.69573104, "num_input_tokens_seen": 292212135, "step": 13541, "time_per_iteration": 2.769536018371582 }, { "auxiliary_loss_clip": 0.01106222, "auxiliary_loss_mlp": 0.01034432, "balance_loss_clip": 1.03566313, "balance_loss_mlp": 1.02187276, "epoch": 0.8141890876296408, "flos": 12568053237120.0, "grad_norm": 1.8846151947016416, "language_loss": 0.69230938, "learning_rate": 3.512716539904355e-07, "loss": 0.71371591, "num_input_tokens_seen": 292230645, "step": 13542, "time_per_iteration": 2.6285057067871094 }, { "auxiliary_loss_clip": 0.0111203, "auxiliary_loss_mlp": 0.01033744, "balance_loss_clip": 1.03642273, "balance_loss_mlp": 1.02083373, "epoch": 0.8142492108823087, "flos": 14967172483200.0, "grad_norm": 2.925687794386818, "language_loss": 0.79454219, "learning_rate": 3.5105122664577613e-07, "loss": 0.81599998, "num_input_tokens_seen": 292243540, "step": 13543, "time_per_iteration": 2.5403339862823486 }, { "auxiliary_loss_clip": 0.01081798, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.03946197, "balance_loss_mlp": 1.02264941, "epoch": 0.8143093341349767, "flos": 12422004537600.0, "grad_norm": 2.0517480511317774, "language_loss": 0.78091002, "learning_rate": 3.5083086182942003e-07, "loss": 0.80208206, "num_input_tokens_seen": 292261715, "step": 13544, "time_per_iteration": 2.782600164413452 }, { "auxiliary_loss_clip": 0.01116058, "auxiliary_loss_mlp": 0.01031813, "balance_loss_clip": 1.03913999, "balance_loss_mlp": 1.01734674, "epoch": 0.8143694573876447, "flos": 11910564737280.0, "grad_norm": 3.0853831344468707, "language_loss": 0.7382375, "learning_rate": 3.5061055954972264e-07, "loss": 0.75971621, "num_input_tokens_seen": 292275080, "step": 13545, "time_per_iteration": 2.631141185760498 }, { "auxiliary_loss_clip": 0.01096875, "auxiliary_loss_mlp": 0.01029281, "balance_loss_clip": 1.03640938, "balance_loss_mlp": 1.0174439, "epoch": 0.8144295806403127, "flos": 21212900225280.0, "grad_norm": 1.6342395606373197, "language_loss": 0.76933265, "learning_rate": 3.5039031981503776e-07, "loss": 0.79059422, "num_input_tokens_seen": 292294635, "step": 13546, "time_per_iteration": 2.6105756759643555 }, { "auxiliary_loss_clip": 0.0110063, "auxiliary_loss_mlp": 0.01030809, "balance_loss_clip": 1.0386976, "balance_loss_mlp": 1.01948416, "epoch": 0.8144897038929806, "flos": 19865280741120.0, "grad_norm": 2.057693835457072, "language_loss": 0.70437783, "learning_rate": 3.501701426337178e-07, "loss": 0.72569221, "num_input_tokens_seen": 292312695, "step": 13547, "time_per_iteration": 2.6459848880767822 }, { "auxiliary_loss_clip": 0.01112435, "auxiliary_loss_mlp": 0.01036703, "balance_loss_clip": 1.03837729, "balance_loss_mlp": 1.02320886, "epoch": 0.8145498271456486, "flos": 24571733005440.0, "grad_norm": 1.7911803251126166, "language_loss": 0.70297545, "learning_rate": 3.49950028014111e-07, "loss": 0.7244668, "num_input_tokens_seen": 292332005, "step": 13548, "time_per_iteration": 2.7214651107788086 }, { "auxiliary_loss_clip": 0.01099863, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.03860509, "balance_loss_mlp": 1.01963055, "epoch": 0.8146099503983165, "flos": 20193037367040.0, "grad_norm": 4.113506280616557, "language_loss": 0.77017093, "learning_rate": 3.4972997596456444e-07, "loss": 0.79150021, "num_input_tokens_seen": 292348365, "step": 13549, "time_per_iteration": 2.6624276638031006 }, { "auxiliary_loss_clip": 0.011122, "auxiliary_loss_mlp": 0.01030353, "balance_loss_clip": 1.03999424, "balance_loss_mlp": 1.01782978, "epoch": 0.8146700736509845, "flos": 19536949497600.0, "grad_norm": 1.9226967396977621, "language_loss": 0.71076775, "learning_rate": 3.4950998649342233e-07, "loss": 0.73219323, "num_input_tokens_seen": 292368050, "step": 13550, "time_per_iteration": 2.7254621982574463 }, { "auxiliary_loss_clip": 0.01094556, "auxiliary_loss_mlp": 0.01025932, "balance_loss_clip": 1.0368104, "balance_loss_mlp": 1.01444018, "epoch": 0.8147301969036524, "flos": 18041341979520.0, "grad_norm": 1.9888509797715757, "language_loss": 0.71529424, "learning_rate": 3.4929005960902826e-07, "loss": 0.73649913, "num_input_tokens_seen": 292385315, "step": 13551, "time_per_iteration": 2.704594850540161 }, { "auxiliary_loss_clip": 0.01072466, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.03925037, "balance_loss_mlp": 1.02343869, "epoch": 0.8147903201563205, "flos": 18004713085440.0, "grad_norm": 1.9897080161612837, "language_loss": 0.68656695, "learning_rate": 3.4907019531971926e-07, "loss": 0.70766115, "num_input_tokens_seen": 292403375, "step": 13552, "time_per_iteration": 2.7425405979156494 }, { "auxiliary_loss_clip": 0.01107317, "auxiliary_loss_mlp": 0.01043397, "balance_loss_clip": 1.03570342, "balance_loss_mlp": 1.03133857, "epoch": 0.8148504434089884, "flos": 20259327916800.0, "grad_norm": 1.7008594179120202, "language_loss": 0.82082725, "learning_rate": 3.4885039363383407e-07, "loss": 0.84233445, "num_input_tokens_seen": 292419260, "step": 13553, "time_per_iteration": 2.5453405380249023 }, { "auxiliary_loss_clip": 0.01097272, "auxiliary_loss_mlp": 0.01030289, "balance_loss_clip": 1.035079, "balance_loss_mlp": 1.01831412, "epoch": 0.8149105666616564, "flos": 12494723621760.0, "grad_norm": 1.6418052636171558, "language_loss": 0.67904902, "learning_rate": 3.4863065455970795e-07, "loss": 0.70032459, "num_input_tokens_seen": 292436095, "step": 13554, "time_per_iteration": 4.209248781204224 }, { "auxiliary_loss_clip": 0.01082623, "auxiliary_loss_mlp": 0.01041493, "balance_loss_clip": 1.035748, "balance_loss_mlp": 1.02727127, "epoch": 0.8149706899143244, "flos": 32523683662080.0, "grad_norm": 1.9729231386540171, "language_loss": 0.66057062, "learning_rate": 3.484109781056723e-07, "loss": 0.68181175, "num_input_tokens_seen": 292457190, "step": 13555, "time_per_iteration": 4.274117708206177 }, { "auxiliary_loss_clip": 0.01102138, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.03688693, "balance_loss_mlp": 1.02167439, "epoch": 0.8150308131669923, "flos": 19386088375680.0, "grad_norm": 2.108444825647498, "language_loss": 0.7319755, "learning_rate": 3.4819136428005844e-07, "loss": 0.75334281, "num_input_tokens_seen": 292474300, "step": 13556, "time_per_iteration": 2.5886549949645996 }, { "auxiliary_loss_clip": 0.01099496, "auxiliary_loss_mlp": 0.01027956, "balance_loss_clip": 1.03907287, "balance_loss_mlp": 1.01664877, "epoch": 0.8150909364196604, "flos": 17421380213760.0, "grad_norm": 1.654846698931865, "language_loss": 0.80619091, "learning_rate": 3.4797181309119307e-07, "loss": 0.82746542, "num_input_tokens_seen": 292492420, "step": 13557, "time_per_iteration": 4.058533430099487 }, { "auxiliary_loss_clip": 0.01089108, "auxiliary_loss_mlp": 0.01034069, "balance_loss_clip": 1.03591609, "balance_loss_mlp": 1.02201128, "epoch": 0.8151510596723283, "flos": 27162795553920.0, "grad_norm": 1.7508168660237897, "language_loss": 0.6597842, "learning_rate": 3.4775232454740255e-07, "loss": 0.68101597, "num_input_tokens_seen": 292512895, "step": 13558, "time_per_iteration": 2.7690083980560303 }, { "auxiliary_loss_clip": 0.01029498, "auxiliary_loss_mlp": 0.01004693, "balance_loss_clip": 1.00695944, "balance_loss_mlp": 1.00384712, "epoch": 0.8152111829249963, "flos": 64219052718720.0, "grad_norm": 0.8394726411943846, "language_loss": 0.56896985, "learning_rate": 3.4753289865700896e-07, "loss": 0.58931184, "num_input_tokens_seen": 292566580, "step": 13559, "time_per_iteration": 3.114321231842041 }, { "auxiliary_loss_clip": 0.01012079, "auxiliary_loss_mlp": 0.01011711, "balance_loss_clip": 1.00770724, "balance_loss_mlp": 1.0104531, "epoch": 0.8152713061776642, "flos": 67072012306560.0, "grad_norm": 0.6789957550904517, "language_loss": 0.55196381, "learning_rate": 3.473135354283334e-07, "loss": 0.57220173, "num_input_tokens_seen": 292621490, "step": 13560, "time_per_iteration": 3.059293746948242 }, { "auxiliary_loss_clip": 0.0108755, "auxiliary_loss_mlp": 0.01029779, "balance_loss_clip": 1.03620529, "balance_loss_mlp": 1.01832318, "epoch": 0.8153314294303322, "flos": 14391130072320.0, "grad_norm": 1.7604364526960343, "language_loss": 0.67580026, "learning_rate": 3.470942348696948e-07, "loss": 0.6969735, "num_input_tokens_seen": 292638660, "step": 13561, "time_per_iteration": 2.659605026245117 }, { "auxiliary_loss_clip": 0.01103139, "auxiliary_loss_mlp": 0.01035516, "balance_loss_clip": 1.03822076, "balance_loss_mlp": 1.02304101, "epoch": 0.8153915526830001, "flos": 25623520076160.0, "grad_norm": 1.5670796727664797, "language_loss": 0.81579733, "learning_rate": 3.468749969894085e-07, "loss": 0.83718389, "num_input_tokens_seen": 292658545, "step": 13562, "time_per_iteration": 4.182463884353638 }, { "auxiliary_loss_clip": 0.01085183, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.03773975, "balance_loss_mlp": 1.02135468, "epoch": 0.8154516759356681, "flos": 23369156640000.0, "grad_norm": 1.459474907859823, "language_loss": 0.71938479, "learning_rate": 3.4665582179578734e-07, "loss": 0.74056768, "num_input_tokens_seen": 292678460, "step": 13563, "time_per_iteration": 2.695099353790283 }, { "auxiliary_loss_clip": 0.01025068, "auxiliary_loss_mlp": 0.01029122, "balance_loss_clip": 1.03488255, "balance_loss_mlp": 1.01562715, "epoch": 0.815511799188336, "flos": 28149189914880.0, "grad_norm": 1.6109076046410835, "language_loss": 0.702739, "learning_rate": 3.4643670929714387e-07, "loss": 0.72328091, "num_input_tokens_seen": 292699815, "step": 13564, "time_per_iteration": 3.0163979530334473 }, { "auxiliary_loss_clip": 0.0108271, "auxiliary_loss_mlp": 0.0102893, "balance_loss_clip": 1.03672302, "balance_loss_mlp": 1.01679492, "epoch": 0.8155719224410041, "flos": 16983413683200.0, "grad_norm": 2.0873578348376745, "language_loss": 0.70476174, "learning_rate": 3.462176595017854e-07, "loss": 0.72587812, "num_input_tokens_seen": 292717370, "step": 13565, "time_per_iteration": 2.8652422428131104 }, { "auxiliary_loss_clip": 0.01097982, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.03627336, "balance_loss_mlp": 1.01994491, "epoch": 0.815632045693672, "flos": 24681727428480.0, "grad_norm": 1.7436798411842787, "language_loss": 0.78950644, "learning_rate": 3.459986724180188e-07, "loss": 0.8108055, "num_input_tokens_seen": 292737110, "step": 13566, "time_per_iteration": 2.6846365928649902 }, { "auxiliary_loss_clip": 0.01087086, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.03798318, "balance_loss_mlp": 1.01873779, "epoch": 0.81569216894634, "flos": 19938323047680.0, "grad_norm": 1.5991898000196176, "language_loss": 0.82388943, "learning_rate": 3.457797480541491e-07, "loss": 0.84505683, "num_input_tokens_seen": 292756510, "step": 13567, "time_per_iteration": 2.6953818798065186 }, { "auxiliary_loss_clip": 0.01105808, "auxiliary_loss_mlp": 0.01028496, "balance_loss_clip": 1.03625703, "balance_loss_mlp": 1.01798785, "epoch": 0.8157522921990079, "flos": 21799393493760.0, "grad_norm": 2.2084802673905592, "language_loss": 0.79599839, "learning_rate": 3.455608864184771e-07, "loss": 0.81734145, "num_input_tokens_seen": 292776710, "step": 13568, "time_per_iteration": 2.6095540523529053 }, { "auxiliary_loss_clip": 0.01088313, "auxiliary_loss_mlp": 0.01030015, "balance_loss_clip": 1.03864861, "balance_loss_mlp": 1.01857734, "epoch": 0.8158124154516759, "flos": 18508323720960.0, "grad_norm": 1.8748620768134565, "language_loss": 0.77194703, "learning_rate": 3.453420875193016e-07, "loss": 0.79313028, "num_input_tokens_seen": 292794350, "step": 13569, "time_per_iteration": 2.7158358097076416 }, { "auxiliary_loss_clip": 0.01107012, "auxiliary_loss_mlp": 0.01039138, "balance_loss_clip": 1.03705049, "balance_loss_mlp": 1.02786636, "epoch": 0.815872538704344, "flos": 26830801123200.0, "grad_norm": 2.207936196273456, "language_loss": 0.59039974, "learning_rate": 3.451233513649199e-07, "loss": 0.61186123, "num_input_tokens_seen": 292814005, "step": 13570, "time_per_iteration": 2.6274027824401855 }, { "auxiliary_loss_clip": 0.01099743, "auxiliary_loss_mlp": 0.01037351, "balance_loss_clip": 1.03609109, "balance_loss_mlp": 1.02433372, "epoch": 0.8159326619570119, "flos": 21725704742400.0, "grad_norm": 1.7456808566314872, "language_loss": 0.8209976, "learning_rate": 3.4490467796362687e-07, "loss": 0.84236854, "num_input_tokens_seen": 292833485, "step": 13571, "time_per_iteration": 2.607311725616455 }, { "auxiliary_loss_clip": 0.01082011, "auxiliary_loss_mlp": 0.01040519, "balance_loss_clip": 1.03530788, "balance_loss_mlp": 1.02747178, "epoch": 0.8159927852096799, "flos": 13840726993920.0, "grad_norm": 2.3386046142984966, "language_loss": 0.7775113, "learning_rate": 3.446860673237142e-07, "loss": 0.79873657, "num_input_tokens_seen": 292848045, "step": 13572, "time_per_iteration": 2.615434169769287 }, { "auxiliary_loss_clip": 0.01110553, "auxiliary_loss_mlp": 0.01033153, "balance_loss_clip": 1.0374527, "balance_loss_mlp": 1.0209341, "epoch": 0.8160529084623478, "flos": 24499516711680.0, "grad_norm": 1.477093078405139, "language_loss": 0.65240854, "learning_rate": 3.4446751945347186e-07, "loss": 0.67384559, "num_input_tokens_seen": 292869965, "step": 13573, "time_per_iteration": 2.6414575576782227 }, { "auxiliary_loss_clip": 0.01075717, "auxiliary_loss_mlp": 0.01028632, "balance_loss_clip": 1.03710008, "balance_loss_mlp": 1.0173732, "epoch": 0.8161130317150158, "flos": 24826339584000.0, "grad_norm": 1.642166809234801, "language_loss": 0.75473046, "learning_rate": 3.442490343611868e-07, "loss": 0.77577394, "num_input_tokens_seen": 292889680, "step": 13574, "time_per_iteration": 2.85577392578125 }, { "auxiliary_loss_clip": 0.01101144, "auxiliary_loss_mlp": 0.0103536, "balance_loss_clip": 1.03803658, "balance_loss_mlp": 1.02264595, "epoch": 0.8161731549676837, "flos": 30956542208640.0, "grad_norm": 2.5612570404533157, "language_loss": 0.60302323, "learning_rate": 3.4403061205514485e-07, "loss": 0.62438828, "num_input_tokens_seen": 292912360, "step": 13575, "time_per_iteration": 2.725813627243042 }, { "auxiliary_loss_clip": 0.01030079, "auxiliary_loss_mlp": 0.01039791, "balance_loss_clip": 1.03109765, "balance_loss_mlp": 1.02550936, "epoch": 0.8162332782203517, "flos": 18551991680640.0, "grad_norm": 7.314687575537146, "language_loss": 0.74446952, "learning_rate": 3.4381225254362736e-07, "loss": 0.76516831, "num_input_tokens_seen": 292928325, "step": 13576, "time_per_iteration": 2.8337759971618652 }, { "auxiliary_loss_clip": 0.01010195, "auxiliary_loss_mlp": 0.01001162, "balance_loss_clip": 1.00829458, "balance_loss_mlp": 1.00028598, "epoch": 0.8162934014730197, "flos": 70386853904640.0, "grad_norm": 0.8299990748373413, "language_loss": 0.58698022, "learning_rate": 3.435939558349155e-07, "loss": 0.60709381, "num_input_tokens_seen": 292992795, "step": 13577, "time_per_iteration": 3.217165470123291 }, { "auxiliary_loss_clip": 0.01050236, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.03245091, "balance_loss_mlp": 1.01977444, "epoch": 0.8163535247256877, "flos": 21214839559680.0, "grad_norm": 1.6040267571253908, "language_loss": 0.70921433, "learning_rate": 3.4337572193728747e-07, "loss": 0.73003376, "num_input_tokens_seen": 293011950, "step": 13578, "time_per_iteration": 2.840709686279297 }, { "auxiliary_loss_clip": 0.01068471, "auxiliary_loss_mlp": 0.01030739, "balance_loss_clip": 1.03506184, "balance_loss_mlp": 1.0190444, "epoch": 0.8164136479783556, "flos": 21098847565440.0, "grad_norm": 2.7752862595751977, "language_loss": 0.73731124, "learning_rate": 3.431575508590172e-07, "loss": 0.7583034, "num_input_tokens_seen": 293030175, "step": 13579, "time_per_iteration": 2.812387704849243 }, { "auxiliary_loss_clip": 0.01110978, "auxiliary_loss_mlp": 0.01028039, "balance_loss_clip": 1.03761864, "balance_loss_mlp": 1.01615429, "epoch": 0.8164737712310236, "flos": 21720640924800.0, "grad_norm": 5.498991527014378, "language_loss": 0.79516351, "learning_rate": 3.4293944260837873e-07, "loss": 0.81655371, "num_input_tokens_seen": 293047980, "step": 13580, "time_per_iteration": 2.8092665672302246 }, { "auxiliary_loss_clip": 0.01071948, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 1.03299272, "balance_loss_mlp": 1.02182913, "epoch": 0.8165338944836915, "flos": 19536805843200.0, "grad_norm": 1.723429137426299, "language_loss": 0.69085348, "learning_rate": 3.4272139719364314e-07, "loss": 0.71192145, "num_input_tokens_seen": 293067030, "step": 13581, "time_per_iteration": 2.7907984256744385 }, { "auxiliary_loss_clip": 0.01107871, "auxiliary_loss_mlp": 0.01032555, "balance_loss_clip": 1.03613353, "balance_loss_mlp": 1.02049136, "epoch": 0.8165940177363595, "flos": 22928568416640.0, "grad_norm": 1.8992496957974652, "language_loss": 0.59582806, "learning_rate": 3.4250341462307786e-07, "loss": 0.61723232, "num_input_tokens_seen": 293085575, "step": 13582, "time_per_iteration": 2.72542405128479 }, { "auxiliary_loss_clip": 0.0107424, "auxiliary_loss_mlp": 0.00769809, "balance_loss_clip": 1.03585207, "balance_loss_mlp": 1.00015545, "epoch": 0.8166541409890276, "flos": 23370377702400.0, "grad_norm": 1.954054796899383, "language_loss": 0.82329261, "learning_rate": 3.4228549490494897e-07, "loss": 0.84173316, "num_input_tokens_seen": 293108200, "step": 13583, "time_per_iteration": 2.749908685684204 }, { "auxiliary_loss_clip": 0.01088673, "auxiliary_loss_mlp": 0.01025238, "balance_loss_clip": 1.03623259, "balance_loss_mlp": 1.01392472, "epoch": 0.8167142642416955, "flos": 18441997257600.0, "grad_norm": 1.802555874623744, "language_loss": 0.74573183, "learning_rate": 3.4206763804752093e-07, "loss": 0.76687098, "num_input_tokens_seen": 293126020, "step": 13584, "time_per_iteration": 2.8091073036193848 }, { "auxiliary_loss_clip": 0.01098996, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.03830242, "balance_loss_mlp": 1.01618278, "epoch": 0.8167743874943635, "flos": 21214983214080.0, "grad_norm": 1.689121999421373, "language_loss": 0.74577987, "learning_rate": 3.4184984405905405e-07, "loss": 0.76705563, "num_input_tokens_seen": 293144620, "step": 13585, "time_per_iteration": 2.6251516342163086 }, { "auxiliary_loss_clip": 0.01083034, "auxiliary_loss_mlp": 0.01035775, "balance_loss_clip": 1.03814149, "balance_loss_mlp": 1.02334797, "epoch": 0.8168345107470314, "flos": 18697681244160.0, "grad_norm": 2.110704900607274, "language_loss": 0.6954788, "learning_rate": 3.416321129478068e-07, "loss": 0.71666694, "num_input_tokens_seen": 293162850, "step": 13586, "time_per_iteration": 2.6488070487976074 }, { "auxiliary_loss_clip": 0.01049954, "auxiliary_loss_mlp": 0.01038255, "balance_loss_clip": 1.03342056, "balance_loss_mlp": 1.02592838, "epoch": 0.8168946339996994, "flos": 16253098358400.0, "grad_norm": 1.5273465759672988, "language_loss": 0.60744089, "learning_rate": 3.4141444472203594e-07, "loss": 0.62832302, "num_input_tokens_seen": 293181620, "step": 13587, "time_per_iteration": 2.7878332138061523 }, { "auxiliary_loss_clip": 0.01100484, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.03639674, "balance_loss_mlp": 1.02172291, "epoch": 0.8169547572523673, "flos": 26941585645440.0, "grad_norm": 2.223800946247814, "language_loss": 0.6970458, "learning_rate": 3.4119683938999624e-07, "loss": 0.71839273, "num_input_tokens_seen": 293200270, "step": 13588, "time_per_iteration": 2.692920207977295 }, { "auxiliary_loss_clip": 0.01085855, "auxiliary_loss_mlp": 0.01043402, "balance_loss_clip": 1.03655553, "balance_loss_mlp": 1.02848303, "epoch": 0.8170148805050353, "flos": 18952323736320.0, "grad_norm": 1.5303676433154123, "language_loss": 0.73124111, "learning_rate": 3.4097929695993854e-07, "loss": 0.75253367, "num_input_tokens_seen": 293218960, "step": 13589, "time_per_iteration": 2.679173469543457 }, { "auxiliary_loss_clip": 0.01094872, "auxiliary_loss_mlp": 0.01032909, "balance_loss_clip": 1.03692865, "balance_loss_mlp": 1.02016521, "epoch": 0.8170750037577033, "flos": 21834909066240.0, "grad_norm": 2.2699236258793456, "language_loss": 0.73170865, "learning_rate": 3.4076181744011166e-07, "loss": 0.75298643, "num_input_tokens_seen": 293236450, "step": 13590, "time_per_iteration": 2.661827802658081 }, { "auxiliary_loss_clip": 0.01112691, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.03789759, "balance_loss_mlp": 1.01964402, "epoch": 0.8171351270103713, "flos": 33507169021440.0, "grad_norm": 2.228487135956597, "language_loss": 0.65462661, "learning_rate": 3.4054440083876345e-07, "loss": 0.67608947, "num_input_tokens_seen": 293256480, "step": 13591, "time_per_iteration": 2.713564872741699 }, { "auxiliary_loss_clip": 0.01110837, "auxiliary_loss_mlp": 0.01036337, "balance_loss_clip": 1.03630888, "balance_loss_mlp": 1.02364123, "epoch": 0.8171952502630392, "flos": 22708184520960.0, "grad_norm": 2.2790144502571366, "language_loss": 0.68108523, "learning_rate": 3.403270471641373e-07, "loss": 0.70255697, "num_input_tokens_seen": 293274960, "step": 13592, "time_per_iteration": 2.673107862472534 }, { "auxiliary_loss_clip": 0.01086566, "auxiliary_loss_mlp": 0.01029843, "balance_loss_clip": 1.03531361, "balance_loss_mlp": 1.01699781, "epoch": 0.8172553735157072, "flos": 26723715701760.0, "grad_norm": 1.5485466533329424, "language_loss": 0.6656639, "learning_rate": 3.401097564244759e-07, "loss": 0.68682802, "num_input_tokens_seen": 293295945, "step": 13593, "time_per_iteration": 2.738813877105713 }, { "auxiliary_loss_clip": 0.01098161, "auxiliary_loss_mlp": 0.01032061, "balance_loss_clip": 1.03540421, "balance_loss_mlp": 1.02022982, "epoch": 0.8173154967683751, "flos": 15961072786560.0, "grad_norm": 1.90048610301986, "language_loss": 0.69598675, "learning_rate": 3.398925286280188e-07, "loss": 0.71728897, "num_input_tokens_seen": 293313300, "step": 13594, "time_per_iteration": 5.800758361816406 }, { "auxiliary_loss_clip": 0.01110285, "auxiliary_loss_mlp": 0.01033044, "balance_loss_clip": 1.0364691, "balance_loss_mlp": 1.02115333, "epoch": 0.8173756200210431, "flos": 25986720447360.0, "grad_norm": 1.8053968351175154, "language_loss": 0.65974349, "learning_rate": 3.3967536378300456e-07, "loss": 0.68117678, "num_input_tokens_seen": 293333085, "step": 13595, "time_per_iteration": 2.6032371520996094 }, { "auxiliary_loss_clip": 0.01068247, "auxiliary_loss_mlp": 0.01028339, "balance_loss_clip": 1.03591299, "balance_loss_mlp": 1.01576889, "epoch": 0.8174357432737112, "flos": 25664422688640.0, "grad_norm": 1.659795934344192, "language_loss": 0.78425729, "learning_rate": 3.394582618976658e-07, "loss": 0.80522317, "num_input_tokens_seen": 293351895, "step": 13596, "time_per_iteration": 4.231920003890991 }, { "auxiliary_loss_clip": 0.01081938, "auxiliary_loss_mlp": 0.01028873, "balance_loss_clip": 1.03306651, "balance_loss_mlp": 1.01600397, "epoch": 0.8174958665263791, "flos": 21835088634240.0, "grad_norm": 2.5636613927912775, "language_loss": 0.58887529, "learning_rate": 3.392412229802362e-07, "loss": 0.60998344, "num_input_tokens_seen": 293371165, "step": 13597, "time_per_iteration": 2.699782133102417 }, { "auxiliary_loss_clip": 0.0107094, "auxiliary_loss_mlp": 0.01033853, "balance_loss_clip": 1.03980625, "balance_loss_mlp": 1.02193189, "epoch": 0.8175559897790471, "flos": 22455517276800.0, "grad_norm": 1.534270538423627, "language_loss": 0.82330656, "learning_rate": 3.390242470389462e-07, "loss": 0.84435457, "num_input_tokens_seen": 293391150, "step": 13598, "time_per_iteration": 2.7620291709899902 }, { "auxiliary_loss_clip": 0.01052171, "auxiliary_loss_mlp": 0.01031716, "balance_loss_clip": 1.03996241, "balance_loss_mlp": 1.01993775, "epoch": 0.817616113031715, "flos": 23615790399360.0, "grad_norm": 1.8636627263308922, "language_loss": 0.82549691, "learning_rate": 3.3880733408202277e-07, "loss": 0.84633583, "num_input_tokens_seen": 293409440, "step": 13599, "time_per_iteration": 2.8193368911743164 }, { "auxiliary_loss_clip": 0.01057864, "auxiliary_loss_mlp": 0.0104518, "balance_loss_clip": 1.03178751, "balance_loss_mlp": 1.03132749, "epoch": 0.817676236284383, "flos": 27672260106240.0, "grad_norm": 2.111179301114437, "language_loss": 0.83922112, "learning_rate": 3.3859048411769186e-07, "loss": 0.86025155, "num_input_tokens_seen": 293428995, "step": 13600, "time_per_iteration": 2.7920475006103516 }, { "auxiliary_loss_clip": 0.01074994, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.03580821, "balance_loss_mlp": 1.01914954, "epoch": 0.8177363595370509, "flos": 24681009156480.0, "grad_norm": 1.862299702432468, "language_loss": 0.74226046, "learning_rate": 3.383736971541766e-07, "loss": 0.76332384, "num_input_tokens_seen": 293449155, "step": 13601, "time_per_iteration": 4.308535575866699 }, { "auxiliary_loss_clip": 0.01078366, "auxiliary_loss_mlp": 0.01036015, "balance_loss_clip": 1.03641343, "balance_loss_mlp": 1.02314591, "epoch": 0.817796482789719, "flos": 17346326745600.0, "grad_norm": 2.078289918028392, "language_loss": 0.68360138, "learning_rate": 3.3815697319969737e-07, "loss": 0.70474523, "num_input_tokens_seen": 293466125, "step": 13602, "time_per_iteration": 2.8116466999053955 }, { "auxiliary_loss_clip": 0.01068639, "auxiliary_loss_mlp": 0.01038644, "balance_loss_clip": 1.03409863, "balance_loss_mlp": 1.02547121, "epoch": 0.8178566060423869, "flos": 17778475272960.0, "grad_norm": 2.118367882744336, "language_loss": 0.83765864, "learning_rate": 3.379403122624718e-07, "loss": 0.85873151, "num_input_tokens_seen": 293481345, "step": 13603, "time_per_iteration": 2.7411158084869385 }, { "auxiliary_loss_clip": 0.0106116, "auxiliary_loss_mlp": 0.01028949, "balance_loss_clip": 1.03759289, "balance_loss_mlp": 1.0176841, "epoch": 0.8179167292950549, "flos": 24973250209920.0, "grad_norm": 1.7705965391975051, "language_loss": 0.69410896, "learning_rate": 3.377237143507159e-07, "loss": 0.71501005, "num_input_tokens_seen": 293502330, "step": 13604, "time_per_iteration": 2.7547354698181152 }, { "auxiliary_loss_clip": 0.01081221, "auxiliary_loss_mlp": 0.01034665, "balance_loss_clip": 1.03777099, "balance_loss_mlp": 1.02226162, "epoch": 0.8179768525477228, "flos": 22856783086080.0, "grad_norm": 1.8951606880095677, "language_loss": 0.74119198, "learning_rate": 3.3750717947264406e-07, "loss": 0.76235086, "num_input_tokens_seen": 293521415, "step": 13605, "time_per_iteration": 2.7130730152130127 }, { "auxiliary_loss_clip": 0.01071497, "auxiliary_loss_mlp": 0.01038946, "balance_loss_clip": 1.03906167, "balance_loss_mlp": 1.02588034, "epoch": 0.8180369758003908, "flos": 18515147304960.0, "grad_norm": 1.7730120877057978, "language_loss": 0.73990393, "learning_rate": 3.372907076364666e-07, "loss": 0.76100838, "num_input_tokens_seen": 293539245, "step": 13606, "time_per_iteration": 2.705872058868408 }, { "auxiliary_loss_clip": 0.01108658, "auxiliary_loss_mlp": 0.01032239, "balance_loss_clip": 1.03782868, "balance_loss_mlp": 1.02010965, "epoch": 0.8180970990530587, "flos": 33182105915520.0, "grad_norm": 1.7215775601325016, "language_loss": 0.65496033, "learning_rate": 3.370742988503916e-07, "loss": 0.67636931, "num_input_tokens_seen": 293560640, "step": 13607, "time_per_iteration": 2.695094347000122 }, { "auxiliary_loss_clip": 0.01087636, "auxiliary_loss_mlp": 0.01030931, "balance_loss_clip": 1.0383904, "balance_loss_mlp": 1.0186528, "epoch": 0.8181572223057267, "flos": 25010022758400.0, "grad_norm": 1.9180357233704657, "language_loss": 0.70527983, "learning_rate": 3.3685795312262634e-07, "loss": 0.72646552, "num_input_tokens_seen": 293579465, "step": 13608, "time_per_iteration": 2.7109787464141846 }, { "auxiliary_loss_clip": 0.01094237, "auxiliary_loss_mlp": 0.01033072, "balance_loss_clip": 1.03419423, "balance_loss_mlp": 1.02114487, "epoch": 0.8182173455583948, "flos": 28548731871360.0, "grad_norm": 1.94764090555504, "language_loss": 0.79518479, "learning_rate": 3.366416704613735e-07, "loss": 0.81645787, "num_input_tokens_seen": 293600540, "step": 13609, "time_per_iteration": 2.678457736968994 }, { "auxiliary_loss_clip": 0.01006167, "auxiliary_loss_mlp": 0.01001094, "balance_loss_clip": 1.01206219, "balance_loss_mlp": 0.99999696, "epoch": 0.8182774688110627, "flos": 72028043245440.0, "grad_norm": 0.745693768883286, "language_loss": 0.55858743, "learning_rate": 3.3642545087483544e-07, "loss": 0.57866001, "num_input_tokens_seen": 293665160, "step": 13610, "time_per_iteration": 3.287687063217163 }, { "auxiliary_loss_clip": 0.01043521, "auxiliary_loss_mlp": 0.00770311, "balance_loss_clip": 1.02925563, "balance_loss_mlp": 1.00016284, "epoch": 0.8183375920637307, "flos": 19755358145280.0, "grad_norm": 1.923535272295543, "language_loss": 0.77933627, "learning_rate": 3.362092943712107e-07, "loss": 0.79747456, "num_input_tokens_seen": 293683995, "step": 13611, "time_per_iteration": 2.757842540740967 }, { "auxiliary_loss_clip": 0.01074897, "auxiliary_loss_mlp": 0.01033531, "balance_loss_clip": 1.03499138, "balance_loss_mlp": 1.01989329, "epoch": 0.8183977153163986, "flos": 22341895580160.0, "grad_norm": 1.792092336455415, "language_loss": 0.77061421, "learning_rate": 3.3599320095869745e-07, "loss": 0.79169852, "num_input_tokens_seen": 293704115, "step": 13612, "time_per_iteration": 2.7527639865875244 }, { "auxiliary_loss_clip": 0.01070156, "auxiliary_loss_mlp": 0.01026228, "balance_loss_clip": 1.03287673, "balance_loss_mlp": 1.01489091, "epoch": 0.8184578385690666, "flos": 17712472032000.0, "grad_norm": 2.2843501898761205, "language_loss": 0.86122215, "learning_rate": 3.3577717064548793e-07, "loss": 0.88218594, "num_input_tokens_seen": 293722225, "step": 13613, "time_per_iteration": 2.7401769161224365 }, { "auxiliary_loss_clip": 0.01098117, "auxiliary_loss_mlp": 0.01045961, "balance_loss_clip": 1.03796077, "balance_loss_mlp": 1.03408742, "epoch": 0.8185179618217345, "flos": 25701159323520.0, "grad_norm": 2.6943480518584906, "language_loss": 0.72842276, "learning_rate": 3.355612034397746e-07, "loss": 0.74986356, "num_input_tokens_seen": 293743995, "step": 13614, "time_per_iteration": 2.680565118789673 }, { "auxiliary_loss_clip": 0.01085324, "auxiliary_loss_mlp": 0.01040844, "balance_loss_clip": 1.03373837, "balance_loss_mlp": 1.02824354, "epoch": 0.8185780850744026, "flos": 25960326929280.0, "grad_norm": 1.7330678379647075, "language_loss": 0.81346858, "learning_rate": 3.353452993497479e-07, "loss": 0.83473027, "num_input_tokens_seen": 293764935, "step": 13615, "time_per_iteration": 2.715773105621338 }, { "auxiliary_loss_clip": 0.01093975, "auxiliary_loss_mlp": 0.01032579, "balance_loss_clip": 1.03279996, "balance_loss_mlp": 1.01989484, "epoch": 0.8186382083270705, "flos": 25228431406080.0, "grad_norm": 3.391470400733545, "language_loss": 0.75472414, "learning_rate": 3.3512945838359375e-07, "loss": 0.77598965, "num_input_tokens_seen": 293784035, "step": 13616, "time_per_iteration": 2.6478960514068604 }, { "auxiliary_loss_clip": 0.0106733, "auxiliary_loss_mlp": 0.01043672, "balance_loss_clip": 1.03091192, "balance_loss_mlp": 1.02980757, "epoch": 0.8186983315797385, "flos": 22415009713920.0, "grad_norm": 1.7062309242094946, "language_loss": 0.75500989, "learning_rate": 3.349136805494979e-07, "loss": 0.77611995, "num_input_tokens_seen": 293803360, "step": 13617, "time_per_iteration": 2.7314293384552 }, { "auxiliary_loss_clip": 0.01080104, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.03370297, "balance_loss_mlp": 1.02053142, "epoch": 0.8187584548324064, "flos": 22018017623040.0, "grad_norm": 1.9943005109582315, "language_loss": 0.68466866, "learning_rate": 3.346979658556415e-07, "loss": 0.70579082, "num_input_tokens_seen": 293821325, "step": 13618, "time_per_iteration": 2.7118663787841797 }, { "auxiliary_loss_clip": 0.01086635, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.03645062, "balance_loss_mlp": 1.01954257, "epoch": 0.8188185780850744, "flos": 29241664116480.0, "grad_norm": 1.958275526623864, "language_loss": 0.69876873, "learning_rate": 3.344823143102058e-07, "loss": 0.71996242, "num_input_tokens_seen": 293840315, "step": 13619, "time_per_iteration": 2.7601280212402344 }, { "auxiliary_loss_clip": 0.01051452, "auxiliary_loss_mlp": 0.01029042, "balance_loss_clip": 1.03892016, "balance_loss_mlp": 1.01647735, "epoch": 0.8188787013377423, "flos": 20696504348160.0, "grad_norm": 1.8298313202907792, "language_loss": 0.73982012, "learning_rate": 3.3426672592136694e-07, "loss": 0.760625, "num_input_tokens_seen": 293855685, "step": 13620, "time_per_iteration": 2.782697916030884 }, { "auxiliary_loss_clip": 0.01079658, "auxiliary_loss_mlp": 0.00772167, "balance_loss_clip": 1.03250647, "balance_loss_mlp": 1.00025058, "epoch": 0.8189388245904103, "flos": 23732967542400.0, "grad_norm": 1.515288767485074, "language_loss": 0.76337874, "learning_rate": 3.340512006973011e-07, "loss": 0.78189701, "num_input_tokens_seen": 293875540, "step": 13621, "time_per_iteration": 2.681579828262329 }, { "auxiliary_loss_clip": 0.01082197, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.03172946, "balance_loss_mlp": 1.01746082, "epoch": 0.8189989478430784, "flos": 28255090187520.0, "grad_norm": 2.436105215072431, "language_loss": 0.66058964, "learning_rate": 3.3383573864618076e-07, "loss": 0.68171418, "num_input_tokens_seen": 293896570, "step": 13622, "time_per_iteration": 2.753495216369629 }, { "auxiliary_loss_clip": 0.01111281, "auxiliary_loss_mlp": 0.01030003, "balance_loss_clip": 1.03886437, "balance_loss_mlp": 1.01628244, "epoch": 0.8190590710957463, "flos": 21397696721280.0, "grad_norm": 1.992471820034199, "language_loss": 0.74813384, "learning_rate": 3.3362033977617653e-07, "loss": 0.76954669, "num_input_tokens_seen": 293914680, "step": 13623, "time_per_iteration": 2.6488537788391113 }, { "auxiliary_loss_clip": 0.01085531, "auxiliary_loss_mlp": 0.01039034, "balance_loss_clip": 1.03339553, "balance_loss_mlp": 1.02606368, "epoch": 0.8191191943484143, "flos": 38796451367040.0, "grad_norm": 1.888675270274182, "language_loss": 0.63241279, "learning_rate": 3.3340500409545527e-07, "loss": 0.65365839, "num_input_tokens_seen": 293936480, "step": 13624, "time_per_iteration": 2.9440207481384277 }, { "auxiliary_loss_clip": 0.01106162, "auxiliary_loss_mlp": 0.01034735, "balance_loss_clip": 1.03641939, "balance_loss_mlp": 1.02273679, "epoch": 0.8191793176010822, "flos": 25446516831360.0, "grad_norm": 1.6219303590574095, "language_loss": 0.78032911, "learning_rate": 3.3318973161218386e-07, "loss": 0.80173808, "num_input_tokens_seen": 293957815, "step": 13625, "time_per_iteration": 2.685042381286621 }, { "auxiliary_loss_clip": 0.01101604, "auxiliary_loss_mlp": 0.00771173, "balance_loss_clip": 1.0347513, "balance_loss_mlp": 1.0001961, "epoch": 0.8192394408537502, "flos": 25083029151360.0, "grad_norm": 2.016240511414733, "language_loss": 0.75687516, "learning_rate": 3.329745223345244e-07, "loss": 0.77560294, "num_input_tokens_seen": 293975440, "step": 13626, "time_per_iteration": 2.637768507003784 }, { "auxiliary_loss_clip": 0.01098049, "auxiliary_loss_mlp": 0.0103675, "balance_loss_clip": 1.0376209, "balance_loss_mlp": 1.02519846, "epoch": 0.8192995641064181, "flos": 27673732563840.0, "grad_norm": 1.5972949724439707, "language_loss": 0.73228663, "learning_rate": 3.3275937627063823e-07, "loss": 0.75363463, "num_input_tokens_seen": 293997540, "step": 13627, "time_per_iteration": 2.7295448780059814 }, { "auxiliary_loss_clip": 0.01109571, "auxiliary_loss_mlp": 0.01033076, "balance_loss_clip": 1.03797257, "balance_loss_mlp": 1.02066636, "epoch": 0.8193596873590862, "flos": 21288492397440.0, "grad_norm": 1.651474068364024, "language_loss": 0.69222027, "learning_rate": 3.3254429342868353e-07, "loss": 0.71364677, "num_input_tokens_seen": 294017030, "step": 13628, "time_per_iteration": 2.6305129528045654 }, { "auxiliary_loss_clip": 0.01087095, "auxiliary_loss_mlp": 0.01045505, "balance_loss_clip": 1.0360409, "balance_loss_mlp": 1.03141403, "epoch": 0.8194198106117541, "flos": 17492626840320.0, "grad_norm": 2.3448084033624115, "language_loss": 0.85264301, "learning_rate": 3.323292738168171e-07, "loss": 0.87396896, "num_input_tokens_seen": 294035700, "step": 13629, "time_per_iteration": 2.6781747341156006 }, { "auxiliary_loss_clip": 0.01106506, "auxiliary_loss_mlp": 0.01026288, "balance_loss_clip": 1.03619409, "balance_loss_mlp": 1.01409864, "epoch": 0.8194799338644221, "flos": 15267925059840.0, "grad_norm": 2.0184519411378345, "language_loss": 0.73626029, "learning_rate": 3.3211431744319084e-07, "loss": 0.75758827, "num_input_tokens_seen": 294049730, "step": 13630, "time_per_iteration": 2.6452038288116455 }, { "auxiliary_loss_clip": 0.01096556, "auxiliary_loss_mlp": 0.01039124, "balance_loss_clip": 1.03655708, "balance_loss_mlp": 1.02556396, "epoch": 0.81954005711709, "flos": 14718814871040.0, "grad_norm": 1.8847266375290428, "language_loss": 0.72261512, "learning_rate": 3.31899424315957e-07, "loss": 0.74397194, "num_input_tokens_seen": 294066545, "step": 13631, "time_per_iteration": 2.595489025115967 }, { "auxiliary_loss_clip": 0.01108623, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.03625333, "balance_loss_mlp": 1.018224, "epoch": 0.819600180369758, "flos": 23074042498560.0, "grad_norm": 1.78491625477661, "language_loss": 0.76710784, "learning_rate": 3.3168459444326447e-07, "loss": 0.78849781, "num_input_tokens_seen": 294087455, "step": 13632, "time_per_iteration": 2.639312267303467 }, { "auxiliary_loss_clip": 0.01081621, "auxiliary_loss_mlp": 0.01031706, "balance_loss_clip": 1.03269899, "balance_loss_mlp": 1.01979756, "epoch": 0.8196603036224259, "flos": 27599792417280.0, "grad_norm": 3.6495669730083455, "language_loss": 0.65916097, "learning_rate": 3.314698278332588e-07, "loss": 0.68029428, "num_input_tokens_seen": 294107480, "step": 13633, "time_per_iteration": 4.429157733917236 }, { "auxiliary_loss_clip": 0.01090266, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.03390145, "balance_loss_mlp": 1.02634966, "epoch": 0.8197204268750939, "flos": 28582020800640.0, "grad_norm": 1.4436935437112157, "language_loss": 0.75417399, "learning_rate": 3.3125512449408513e-07, "loss": 0.77545297, "num_input_tokens_seen": 294130115, "step": 13634, "time_per_iteration": 4.236420392990112 }, { "auxiliary_loss_clip": 0.01049415, "auxiliary_loss_mlp": 0.00769002, "balance_loss_clip": 1.03555465, "balance_loss_mlp": 1.00017309, "epoch": 0.819780550127762, "flos": 23258300290560.0, "grad_norm": 1.863716594786732, "language_loss": 0.82285905, "learning_rate": 3.310404844338841e-07, "loss": 0.84104323, "num_input_tokens_seen": 294148495, "step": 13635, "time_per_iteration": 4.350587606430054 }, { "auxiliary_loss_clip": 0.01094136, "auxiliary_loss_mlp": 0.01031626, "balance_loss_clip": 1.03306413, "balance_loss_mlp": 1.01876307, "epoch": 0.8198406733804299, "flos": 26685255214080.0, "grad_norm": 1.580556826967959, "language_loss": 0.7557019, "learning_rate": 3.308259076607949e-07, "loss": 0.77695948, "num_input_tokens_seen": 294169595, "step": 13636, "time_per_iteration": 2.694965362548828 }, { "auxiliary_loss_clip": 0.01085829, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.04320598, "balance_loss_mlp": 1.02125335, "epoch": 0.8199007966330979, "flos": 20084084438400.0, "grad_norm": 2.291328351334751, "language_loss": 0.81272769, "learning_rate": 3.3061139418295445e-07, "loss": 0.83392888, "num_input_tokens_seen": 294183885, "step": 13637, "time_per_iteration": 2.730604410171509 }, { "auxiliary_loss_clip": 0.01097936, "auxiliary_loss_mlp": 0.01031089, "balance_loss_clip": 1.03770888, "balance_loss_mlp": 1.01860201, "epoch": 0.8199609198857658, "flos": 31902788142720.0, "grad_norm": 2.2206002932791566, "language_loss": 0.710298, "learning_rate": 3.3039694400849725e-07, "loss": 0.73158824, "num_input_tokens_seen": 294200150, "step": 13638, "time_per_iteration": 2.683467149734497 }, { "auxiliary_loss_clip": 0.01061969, "auxiliary_loss_mlp": 0.0103106, "balance_loss_clip": 1.0327965, "balance_loss_mlp": 1.01680839, "epoch": 0.8200210431384338, "flos": 26470150617600.0, "grad_norm": 1.942665681540599, "language_loss": 0.79615062, "learning_rate": 3.3018255714555564e-07, "loss": 0.81708086, "num_input_tokens_seen": 294220385, "step": 13639, "time_per_iteration": 2.7710959911346436 }, { "auxiliary_loss_clip": 0.01062834, "auxiliary_loss_mlp": 0.01033155, "balance_loss_clip": 1.03322732, "balance_loss_mlp": 1.02089465, "epoch": 0.8200811663911017, "flos": 22091454979200.0, "grad_norm": 1.6392982589425356, "language_loss": 0.79226673, "learning_rate": 3.299682336022589e-07, "loss": 0.81322664, "num_input_tokens_seen": 294239355, "step": 13640, "time_per_iteration": 4.275204658508301 }, { "auxiliary_loss_clip": 0.01076176, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.03405476, "balance_loss_mlp": 1.0229578, "epoch": 0.8201412896437698, "flos": 37593659520000.0, "grad_norm": 1.7308217218168405, "language_loss": 0.63248229, "learning_rate": 3.297539733867336e-07, "loss": 0.65359992, "num_input_tokens_seen": 294259395, "step": 13641, "time_per_iteration": 2.795254945755005 }, { "auxiliary_loss_clip": 0.01056206, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.03538704, "balance_loss_mlp": 1.01539707, "epoch": 0.8202014128964377, "flos": 19646333389440.0, "grad_norm": 1.8557472198282705, "language_loss": 0.73365706, "learning_rate": 3.295397765071055e-07, "loss": 0.75450063, "num_input_tokens_seen": 294277365, "step": 13642, "time_per_iteration": 2.7157320976257324 }, { "auxiliary_loss_clip": 0.01086181, "auxiliary_loss_mlp": 0.01031537, "balance_loss_clip": 1.03858817, "balance_loss_mlp": 1.01963401, "epoch": 0.8202615361491057, "flos": 31467335564160.0, "grad_norm": 2.095752785900936, "language_loss": 0.70286655, "learning_rate": 3.2932564297149615e-07, "loss": 0.72404379, "num_input_tokens_seen": 294297555, "step": 13643, "time_per_iteration": 2.7395925521850586 }, { "auxiliary_loss_clip": 0.01097598, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.03775418, "balance_loss_mlp": 1.01995778, "epoch": 0.8203216594017736, "flos": 24715555061760.0, "grad_norm": 1.69681118758784, "language_loss": 0.65516806, "learning_rate": 3.291115727880256e-07, "loss": 0.67646027, "num_input_tokens_seen": 294317600, "step": 13644, "time_per_iteration": 2.6443233489990234 }, { "auxiliary_loss_clip": 0.01069905, "auxiliary_loss_mlp": 0.01034884, "balance_loss_clip": 1.0356884, "balance_loss_mlp": 1.02291584, "epoch": 0.8203817826544416, "flos": 26031824951040.0, "grad_norm": 1.4101189561247485, "language_loss": 0.70740688, "learning_rate": 3.2889756596481234e-07, "loss": 0.72845483, "num_input_tokens_seen": 294340215, "step": 13645, "time_per_iteration": 2.7722573280334473 }, { "auxiliary_loss_clip": 0.01083381, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.03680301, "balance_loss_mlp": 1.01596987, "epoch": 0.8204419059071095, "flos": 25954544839680.0, "grad_norm": 2.371583298507033, "language_loss": 0.7132858, "learning_rate": 3.286836225099707e-07, "loss": 0.73439622, "num_input_tokens_seen": 294358590, "step": 13646, "time_per_iteration": 2.713864803314209 }, { "auxiliary_loss_clip": 0.01089571, "auxiliary_loss_mlp": 0.01030919, "balance_loss_clip": 1.036955, "balance_loss_mlp": 1.01831234, "epoch": 0.8205020291597775, "flos": 23580059345280.0, "grad_norm": 2.245036233958922, "language_loss": 0.78633201, "learning_rate": 3.284697424316132e-07, "loss": 0.80753696, "num_input_tokens_seen": 294375825, "step": 13647, "time_per_iteration": 2.659745693206787 }, { "auxiliary_loss_clip": 0.01105517, "auxiliary_loss_mlp": 0.01033284, "balance_loss_clip": 1.03771901, "balance_loss_mlp": 1.02169704, "epoch": 0.8205621524124456, "flos": 26799164219520.0, "grad_norm": 1.7369474065732662, "language_loss": 0.67728269, "learning_rate": 3.2825592573785034e-07, "loss": 0.69867074, "num_input_tokens_seen": 294398500, "step": 13648, "time_per_iteration": 2.642002582550049 }, { "auxiliary_loss_clip": 0.01080292, "auxiliary_loss_mlp": 0.01028555, "balance_loss_clip": 1.03181791, "balance_loss_mlp": 1.0157932, "epoch": 0.8206222756651135, "flos": 27527863432320.0, "grad_norm": 1.7471547354733792, "language_loss": 0.80010235, "learning_rate": 3.28042172436791e-07, "loss": 0.82119077, "num_input_tokens_seen": 294418840, "step": 13649, "time_per_iteration": 2.704329252243042 }, { "auxiliary_loss_clip": 0.01092884, "auxiliary_loss_mlp": 0.01034827, "balance_loss_clip": 1.03850818, "balance_loss_mlp": 1.0212965, "epoch": 0.8206823989177815, "flos": 21178605715200.0, "grad_norm": 1.9987063648882384, "language_loss": 0.69307315, "learning_rate": 3.278284825365396e-07, "loss": 0.71435022, "num_input_tokens_seen": 294438215, "step": 13650, "time_per_iteration": 2.59381365776062 }, { "auxiliary_loss_clip": 0.01090201, "auxiliary_loss_mlp": 0.01031521, "balance_loss_clip": 1.03758073, "balance_loss_mlp": 1.01843143, "epoch": 0.8207425221704494, "flos": 11509622150400.0, "grad_norm": 1.942606809988791, "language_loss": 0.60333896, "learning_rate": 3.276148560452001e-07, "loss": 0.62455606, "num_input_tokens_seen": 294455260, "step": 13651, "time_per_iteration": 2.620542287826538 }, { "auxiliary_loss_clip": 0.01069774, "auxiliary_loss_mlp": 0.00773358, "balance_loss_clip": 1.03502905, "balance_loss_mlp": 1.00031233, "epoch": 0.8208026454231174, "flos": 19791987039360.0, "grad_norm": 3.123048822731667, "language_loss": 0.72240758, "learning_rate": 3.2740129297087293e-07, "loss": 0.74083889, "num_input_tokens_seen": 294473205, "step": 13652, "time_per_iteration": 2.7204532623291016 }, { "auxiliary_loss_clip": 0.01081839, "auxiliary_loss_mlp": 0.01030063, "balance_loss_clip": 1.03512836, "balance_loss_mlp": 1.01936436, "epoch": 0.8208627686757853, "flos": 15667538843520.0, "grad_norm": 1.909630535987182, "language_loss": 0.73210537, "learning_rate": 3.271877933216558e-07, "loss": 0.75322437, "num_input_tokens_seen": 294490645, "step": 13653, "time_per_iteration": 2.6469080448150635 }, { "auxiliary_loss_clip": 0.0107235, "auxiliary_loss_mlp": 0.01036685, "balance_loss_clip": 1.03659797, "balance_loss_mlp": 1.02340472, "epoch": 0.8209228919284534, "flos": 37482659516160.0, "grad_norm": 1.930498918584404, "language_loss": 0.63319474, "learning_rate": 3.269743571056451e-07, "loss": 0.65428507, "num_input_tokens_seen": 294513500, "step": 13654, "time_per_iteration": 2.9437685012817383 }, { "auxiliary_loss_clip": 0.0108459, "auxiliary_loss_mlp": 0.01029817, "balance_loss_clip": 1.03793693, "balance_loss_mlp": 1.01780069, "epoch": 0.8209830151811213, "flos": 23112969863040.0, "grad_norm": 1.5668397368199467, "language_loss": 0.70084441, "learning_rate": 3.2676098433093447e-07, "loss": 0.72198856, "num_input_tokens_seen": 294535710, "step": 13655, "time_per_iteration": 2.7804574966430664 }, { "auxiliary_loss_clip": 0.01084392, "auxiliary_loss_mlp": 0.01036883, "balance_loss_clip": 1.03608942, "balance_loss_mlp": 1.0246346, "epoch": 0.8210431384337893, "flos": 21288169175040.0, "grad_norm": 2.0172748125132283, "language_loss": 0.82037187, "learning_rate": 3.265476750056162e-07, "loss": 0.84158462, "num_input_tokens_seen": 294554055, "step": 13656, "time_per_iteration": 2.721017599105835 }, { "auxiliary_loss_clip": 0.01080199, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 1.03631461, "balance_loss_mlp": 1.01812029, "epoch": 0.8211032616864572, "flos": 11502403516800.0, "grad_norm": 2.1429350332327335, "language_loss": 0.74038959, "learning_rate": 3.2633442913777654e-07, "loss": 0.76149338, "num_input_tokens_seen": 294570390, "step": 13657, "time_per_iteration": 2.6449975967407227 }, { "auxiliary_loss_clip": 0.01076624, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 1.03495431, "balance_loss_mlp": 1.02119923, "epoch": 0.8211633849391252, "flos": 29821477455360.0, "grad_norm": 1.677076204685542, "language_loss": 0.55757195, "learning_rate": 3.2612124673550325e-07, "loss": 0.57866967, "num_input_tokens_seen": 294593050, "step": 13658, "time_per_iteration": 2.7866504192352295 }, { "auxiliary_loss_clip": 0.01046948, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.03354919, "balance_loss_mlp": 1.01984835, "epoch": 0.8212235081917931, "flos": 13115439573120.0, "grad_norm": 2.054958093178623, "language_loss": 0.78911436, "learning_rate": 3.259081278068805e-07, "loss": 0.80990505, "num_input_tokens_seen": 294608550, "step": 13659, "time_per_iteration": 2.7733964920043945 }, { "auxiliary_loss_clip": 0.01090521, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.03315973, "balance_loss_mlp": 1.01845503, "epoch": 0.8212836314444611, "flos": 40515351782400.0, "grad_norm": 1.7003866148099478, "language_loss": 0.59908175, "learning_rate": 3.256950723599887e-07, "loss": 0.62027293, "num_input_tokens_seen": 294630380, "step": 13660, "time_per_iteration": 2.7818117141723633 }, { "auxiliary_loss_clip": 0.01096127, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.03519523, "balance_loss_mlp": 1.0208652, "epoch": 0.8213437546971292, "flos": 18770543982720.0, "grad_norm": 2.120880379867683, "language_loss": 0.73009235, "learning_rate": 3.254820804029075e-07, "loss": 0.75140172, "num_input_tokens_seen": 294648655, "step": 13661, "time_per_iteration": 2.5873122215270996 }, { "auxiliary_loss_clip": 0.01093175, "auxiliary_loss_mlp": 0.01030555, "balance_loss_clip": 1.03569698, "balance_loss_mlp": 1.01827097, "epoch": 0.8214038779497971, "flos": 19682279925120.0, "grad_norm": 2.1908603009914707, "language_loss": 0.74912691, "learning_rate": 3.252691519437143e-07, "loss": 0.77036428, "num_input_tokens_seen": 294666915, "step": 13662, "time_per_iteration": 2.70076322555542 }, { "auxiliary_loss_clip": 0.01029455, "auxiliary_loss_mlp": 0.01001299, "balance_loss_clip": 1.00707769, "balance_loss_mlp": 1.00035727, "epoch": 0.8214640012024651, "flos": 71602969697280.0, "grad_norm": 0.7436789430956001, "language_loss": 0.54036576, "learning_rate": 3.250562869904825e-07, "loss": 0.5606733, "num_input_tokens_seen": 294731545, "step": 13663, "time_per_iteration": 3.2524144649505615 }, { "auxiliary_loss_clip": 0.0106094, "auxiliary_loss_mlp": 0.01032266, "balance_loss_clip": 1.03105712, "balance_loss_mlp": 1.02002287, "epoch": 0.821524124455133, "flos": 14757203531520.0, "grad_norm": 2.109135364690857, "language_loss": 0.65783775, "learning_rate": 3.248434855512838e-07, "loss": 0.67876983, "num_input_tokens_seen": 294748745, "step": 13664, "time_per_iteration": 2.7579057216644287 }, { "auxiliary_loss_clip": 0.01081895, "auxiliary_loss_mlp": 0.0103047, "balance_loss_clip": 1.03475428, "balance_loss_mlp": 1.01932395, "epoch": 0.821584247707801, "flos": 25082274965760.0, "grad_norm": 1.5036493569794076, "language_loss": 0.75327474, "learning_rate": 3.246307476341881e-07, "loss": 0.77439839, "num_input_tokens_seen": 294768955, "step": 13665, "time_per_iteration": 2.7124111652374268 }, { "auxiliary_loss_clip": 0.01093989, "auxiliary_loss_mlp": 0.00769563, "balance_loss_clip": 1.03792393, "balance_loss_mlp": 1.00023198, "epoch": 0.8216443709604689, "flos": 36830701710720.0, "grad_norm": 2.32376999717277, "language_loss": 0.65432054, "learning_rate": 3.2441807324726256e-07, "loss": 0.67295599, "num_input_tokens_seen": 294789250, "step": 13666, "time_per_iteration": 2.7520713806152344 }, { "auxiliary_loss_clip": 0.01059201, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.03574967, "balance_loss_mlp": 1.01929307, "epoch": 0.821704494213137, "flos": 25081808088960.0, "grad_norm": 1.6586859973993004, "language_loss": 0.76773095, "learning_rate": 3.2420546239857174e-07, "loss": 0.78862977, "num_input_tokens_seen": 294809760, "step": 13667, "time_per_iteration": 2.8164875507354736 }, { "auxiliary_loss_clip": 0.01077218, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.03665185, "balance_loss_mlp": 1.02043223, "epoch": 0.8217646174658049, "flos": 14356117290240.0, "grad_norm": 1.9214564024977732, "language_loss": 0.77153236, "learning_rate": 3.239929150961773e-07, "loss": 0.79263186, "num_input_tokens_seen": 294826495, "step": 13668, "time_per_iteration": 2.795309066772461 }, { "auxiliary_loss_clip": 0.0106108, "auxiliary_loss_mlp": 0.01032359, "balance_loss_clip": 1.03410029, "balance_loss_mlp": 1.02047384, "epoch": 0.8218247407184729, "flos": 22090557139200.0, "grad_norm": 2.232101782459693, "language_loss": 0.7333163, "learning_rate": 3.2378043134813984e-07, "loss": 0.75425071, "num_input_tokens_seen": 294845370, "step": 13669, "time_per_iteration": 2.733705520629883 }, { "auxiliary_loss_clip": 0.01096991, "auxiliary_loss_mlp": 0.01026791, "balance_loss_clip": 1.03674257, "balance_loss_mlp": 1.01509678, "epoch": 0.8218848639711408, "flos": 16764035368320.0, "grad_norm": 1.5914876728736391, "language_loss": 0.78921843, "learning_rate": 3.235680111625161e-07, "loss": 0.81045628, "num_input_tokens_seen": 294863740, "step": 13670, "time_per_iteration": 2.632380723953247 }, { "auxiliary_loss_clip": 0.01101033, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.03839719, "balance_loss_mlp": 1.02437234, "epoch": 0.8219449872238088, "flos": 25994801007360.0, "grad_norm": 1.7358038060221426, "language_loss": 0.74638772, "learning_rate": 3.2335565454736123e-07, "loss": 0.76776969, "num_input_tokens_seen": 294882815, "step": 13671, "time_per_iteration": 2.6536366939544678 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.0103, "balance_loss_clip": 1.03765309, "balance_loss_mlp": 1.0173583, "epoch": 0.8220051104764767, "flos": 20778094091520.0, "grad_norm": 1.8480200060327416, "language_loss": 0.76200128, "learning_rate": 3.23143361510728e-07, "loss": 0.78332233, "num_input_tokens_seen": 294901985, "step": 13672, "time_per_iteration": 2.6287293434143066 }, { "auxiliary_loss_clip": 0.0105776, "auxiliary_loss_mlp": 0.01037446, "balance_loss_clip": 1.03279448, "balance_loss_mlp": 1.02387452, "epoch": 0.8220652337291448, "flos": 14574849160320.0, "grad_norm": 2.155588749623656, "language_loss": 0.74635303, "learning_rate": 3.2293113206066733e-07, "loss": 0.76730502, "num_input_tokens_seen": 294919705, "step": 13673, "time_per_iteration": 5.964927911758423 }, { "auxiliary_loss_clip": 0.01091542, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.03949618, "balance_loss_mlp": 1.02133989, "epoch": 0.8221253569818128, "flos": 23805866194560.0, "grad_norm": 1.8576069667953699, "language_loss": 0.79360175, "learning_rate": 3.227189662052254e-07, "loss": 0.8148582, "num_input_tokens_seen": 294939900, "step": 13674, "time_per_iteration": 2.711923599243164 }, { "auxiliary_loss_clip": 0.01082091, "auxiliary_loss_mlp": 0.01037891, "balance_loss_clip": 1.03274429, "balance_loss_mlp": 1.0257858, "epoch": 0.8221854802344807, "flos": 21288241002240.0, "grad_norm": 2.0257881823597508, "language_loss": 0.69993466, "learning_rate": 3.225068639524484e-07, "loss": 0.72113442, "num_input_tokens_seen": 294959110, "step": 13675, "time_per_iteration": 4.205335378646851 }, { "auxiliary_loss_clip": 0.01089922, "auxiliary_loss_mlp": 0.01037385, "balance_loss_clip": 1.03466141, "balance_loss_mlp": 1.02468348, "epoch": 0.8222456034871487, "flos": 20956785275520.0, "grad_norm": 1.6271888022504428, "language_loss": 0.74166471, "learning_rate": 3.2229482531037965e-07, "loss": 0.76293778, "num_input_tokens_seen": 294978660, "step": 13676, "time_per_iteration": 2.633631944656372 }, { "auxiliary_loss_clip": 0.01081581, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.03602481, "balance_loss_mlp": 1.02066302, "epoch": 0.8223057267398166, "flos": 21397517153280.0, "grad_norm": 1.8848040435519355, "language_loss": 0.80344379, "learning_rate": 3.2208285028705893e-07, "loss": 0.82458377, "num_input_tokens_seen": 294998075, "step": 13677, "time_per_iteration": 2.715427875518799 }, { "auxiliary_loss_clip": 0.01093784, "auxiliary_loss_mlp": 0.01037139, "balance_loss_clip": 1.03556919, "balance_loss_mlp": 1.02450824, "epoch": 0.8223658499924846, "flos": 15268212368640.0, "grad_norm": 2.296503126138382, "language_loss": 0.70510441, "learning_rate": 3.218709388905245e-07, "loss": 0.72641361, "num_input_tokens_seen": 295015950, "step": 13678, "time_per_iteration": 2.662177085876465 }, { "auxiliary_loss_clip": 0.01107791, "auxiliary_loss_mlp": 0.01034832, "balance_loss_clip": 1.03623056, "balance_loss_mlp": 1.02258909, "epoch": 0.8224259732451525, "flos": 31249537447680.0, "grad_norm": 1.4830532789333675, "language_loss": 0.71389025, "learning_rate": 3.216590911288133e-07, "loss": 0.73531646, "num_input_tokens_seen": 295036800, "step": 13679, "time_per_iteration": 4.202351808547974 }, { "auxiliary_loss_clip": 0.01079212, "auxiliary_loss_mlp": 0.01033441, "balance_loss_clip": 1.03329039, "balance_loss_mlp": 1.02008915, "epoch": 0.8224860964978206, "flos": 21574628138880.0, "grad_norm": 1.9740769073564464, "language_loss": 0.70159578, "learning_rate": 3.214473070099564e-07, "loss": 0.72272229, "num_input_tokens_seen": 295055300, "step": 13680, "time_per_iteration": 2.644590139389038 }, { "auxiliary_loss_clip": 0.01075147, "auxiliary_loss_mlp": 0.01029985, "balance_loss_clip": 1.03547573, "balance_loss_mlp": 1.01875556, "epoch": 0.8225462197504885, "flos": 25483217552640.0, "grad_norm": 3.3850190908064164, "language_loss": 0.59734452, "learning_rate": 3.21235586541986e-07, "loss": 0.61839581, "num_input_tokens_seen": 295076420, "step": 13681, "time_per_iteration": 2.693240165710449 }, { "auxiliary_loss_clip": 0.01084056, "auxiliary_loss_mlp": 0.01038117, "balance_loss_clip": 1.0347333, "balance_loss_mlp": 1.02480125, "epoch": 0.8226063430031565, "flos": 39385458587520.0, "grad_norm": 2.647199220979941, "language_loss": 0.68972695, "learning_rate": 3.2102392973293047e-07, "loss": 0.71094871, "num_input_tokens_seen": 295100540, "step": 13682, "time_per_iteration": 2.793362855911255 }, { "auxiliary_loss_clip": 0.01109468, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.0367074, "balance_loss_mlp": 1.01775503, "epoch": 0.8226664662558244, "flos": 22815269942400.0, "grad_norm": 1.8560800713238335, "language_loss": 0.79419553, "learning_rate": 3.20812336590816e-07, "loss": 0.81559926, "num_input_tokens_seen": 295120180, "step": 13683, "time_per_iteration": 2.663804292678833 }, { "auxiliary_loss_clip": 0.01104253, "auxiliary_loss_mlp": 0.01029993, "balance_loss_clip": 1.03593254, "balance_loss_mlp": 1.01891863, "epoch": 0.8227265895084924, "flos": 25665607837440.0, "grad_norm": 1.9656579535514493, "language_loss": 0.86604738, "learning_rate": 3.206008071236661e-07, "loss": 0.88738984, "num_input_tokens_seen": 295138530, "step": 13684, "time_per_iteration": 2.6015169620513916 }, { "auxiliary_loss_clip": 0.01104335, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.03555918, "balance_loss_mlp": 1.01763213, "epoch": 0.8227867127611603, "flos": 26179274280960.0, "grad_norm": 1.536812487486819, "language_loss": 0.79920459, "learning_rate": 3.2038934133950157e-07, "loss": 0.82054043, "num_input_tokens_seen": 295160260, "step": 13685, "time_per_iteration": 2.7008142471313477 }, { "auxiliary_loss_clip": 0.0107249, "auxiliary_loss_mlp": 0.01030486, "balance_loss_clip": 1.03493214, "balance_loss_mlp": 1.01813579, "epoch": 0.8228468360138284, "flos": 22018053536640.0, "grad_norm": 1.6748443436475502, "language_loss": 0.68744385, "learning_rate": 3.2017793924634194e-07, "loss": 0.70847368, "num_input_tokens_seen": 295177055, "step": 13686, "time_per_iteration": 2.7271742820739746 }, { "auxiliary_loss_clip": 0.01076871, "auxiliary_loss_mlp": 0.01032996, "balance_loss_clip": 1.03525162, "balance_loss_mlp": 1.02082491, "epoch": 0.8229069592664963, "flos": 14903359971840.0, "grad_norm": 1.8146731165016403, "language_loss": 0.77963513, "learning_rate": 3.1996660085220263e-07, "loss": 0.80073375, "num_input_tokens_seen": 295193870, "step": 13687, "time_per_iteration": 2.6741888523101807 }, { "auxiliary_loss_clip": 0.01097929, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.03655159, "balance_loss_mlp": 1.01794028, "epoch": 0.8229670825191643, "flos": 15669478177920.0, "grad_norm": 1.7147177179883277, "language_loss": 0.72279108, "learning_rate": 3.1975532616509825e-07, "loss": 0.74407512, "num_input_tokens_seen": 295211040, "step": 13688, "time_per_iteration": 2.583867311477661 }, { "auxiliary_loss_clip": 0.01108409, "auxiliary_loss_mlp": 0.00769781, "balance_loss_clip": 1.03682184, "balance_loss_mlp": 1.0001483, "epoch": 0.8230272057718323, "flos": 23183498217600.0, "grad_norm": 2.234271170897282, "language_loss": 0.73181629, "learning_rate": 3.1954411519304025e-07, "loss": 0.75059819, "num_input_tokens_seen": 295231300, "step": 13689, "time_per_iteration": 2.718895673751831 }, { "auxiliary_loss_clip": 0.01098539, "auxiliary_loss_mlp": 0.01031163, "balance_loss_clip": 1.0351994, "balance_loss_mlp": 1.0188967, "epoch": 0.8230873290245002, "flos": 21032413361280.0, "grad_norm": 3.545814626026256, "language_loss": 0.69253677, "learning_rate": 3.1933296794403887e-07, "loss": 0.71383381, "num_input_tokens_seen": 295251045, "step": 13690, "time_per_iteration": 2.6642231941223145 }, { "auxiliary_loss_clip": 0.01062899, "auxiliary_loss_mlp": 0.01041098, "balance_loss_clip": 1.03263807, "balance_loss_mlp": 1.02722192, "epoch": 0.8231474522771682, "flos": 21250139650560.0, "grad_norm": 1.8299845733517255, "language_loss": 0.85268778, "learning_rate": 3.191218844260988e-07, "loss": 0.87372774, "num_input_tokens_seen": 295270225, "step": 13691, "time_per_iteration": 2.7507143020629883 }, { "auxiliary_loss_clip": 0.01101229, "auxiliary_loss_mlp": 0.01034591, "balance_loss_clip": 1.03781307, "balance_loss_mlp": 1.02287257, "epoch": 0.8232075755298361, "flos": 23842028211840.0, "grad_norm": 1.8079890688492317, "language_loss": 0.77103651, "learning_rate": 3.189108646472252e-07, "loss": 0.79239464, "num_input_tokens_seen": 295288950, "step": 13692, "time_per_iteration": 2.67478084564209 }, { "auxiliary_loss_clip": 0.01096284, "auxiliary_loss_mlp": 0.01027159, "balance_loss_clip": 1.03692162, "balance_loss_mlp": 1.0151006, "epoch": 0.8232676987825042, "flos": 21653955325440.0, "grad_norm": 1.722595052749625, "language_loss": 0.71423566, "learning_rate": 3.186999086154205e-07, "loss": 0.73547006, "num_input_tokens_seen": 295309405, "step": 13693, "time_per_iteration": 2.718867301940918 }, { "auxiliary_loss_clip": 0.01070842, "auxiliary_loss_mlp": 0.0102981, "balance_loss_clip": 1.03349066, "balance_loss_mlp": 1.01865232, "epoch": 0.8233278220351721, "flos": 26322701287680.0, "grad_norm": 1.3395802259030574, "language_loss": 0.83745861, "learning_rate": 3.1848901633868355e-07, "loss": 0.85846514, "num_input_tokens_seen": 295331115, "step": 13694, "time_per_iteration": 2.7664167881011963 }, { "auxiliary_loss_clip": 0.0104721, "auxiliary_loss_mlp": 0.01032456, "balance_loss_clip": 1.03542459, "balance_loss_mlp": 1.0194205, "epoch": 0.8233879452878401, "flos": 21725812483200.0, "grad_norm": 1.774536934152641, "language_loss": 0.76836276, "learning_rate": 3.182781878250118e-07, "loss": 0.78915936, "num_input_tokens_seen": 295350495, "step": 13695, "time_per_iteration": 2.750267744064331 }, { "auxiliary_loss_clip": 0.01087721, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.03655171, "balance_loss_mlp": 1.02215171, "epoch": 0.823448068540508, "flos": 20557746109440.0, "grad_norm": 1.7975071163239338, "language_loss": 0.80965418, "learning_rate": 3.1806742308239985e-07, "loss": 0.83087343, "num_input_tokens_seen": 295368225, "step": 13696, "time_per_iteration": 2.6955337524414062 }, { "auxiliary_loss_clip": 0.01020282, "auxiliary_loss_mlp": 0.0100384, "balance_loss_clip": 1.00769222, "balance_loss_mlp": 1.00285649, "epoch": 0.823508191793176, "flos": 67273688194560.0, "grad_norm": 0.7350797292935349, "language_loss": 0.63867533, "learning_rate": 3.178567221188393e-07, "loss": 0.65891653, "num_input_tokens_seen": 295430035, "step": 13697, "time_per_iteration": 3.2243242263793945 }, { "auxiliary_loss_clip": 0.01070899, "auxiliary_loss_mlp": 0.01025732, "balance_loss_clip": 1.03476644, "balance_loss_mlp": 1.01477075, "epoch": 0.8235683150458439, "flos": 17928402641280.0, "grad_norm": 1.6913547769566408, "language_loss": 0.72991723, "learning_rate": 3.1764608494232037e-07, "loss": 0.75088358, "num_input_tokens_seen": 295447765, "step": 13698, "time_per_iteration": 2.670644998550415 }, { "auxiliary_loss_clip": 0.01063119, "auxiliary_loss_mlp": 0.01047662, "balance_loss_clip": 1.03002477, "balance_loss_mlp": 1.03214049, "epoch": 0.823628438298512, "flos": 18916089891840.0, "grad_norm": 1.861601543372515, "language_loss": 0.71800578, "learning_rate": 3.174355115608305e-07, "loss": 0.73911357, "num_input_tokens_seen": 295464810, "step": 13699, "time_per_iteration": 2.7969279289245605 }, { "auxiliary_loss_clip": 0.01086761, "auxiliary_loss_mlp": 0.01028632, "balance_loss_clip": 1.03632307, "balance_loss_mlp": 1.01650214, "epoch": 0.8236885615511799, "flos": 18696460181760.0, "grad_norm": 1.9855299133733353, "language_loss": 0.8196975, "learning_rate": 3.1722500198235526e-07, "loss": 0.84085149, "num_input_tokens_seen": 295482605, "step": 13700, "time_per_iteration": 2.6503469944000244 }, { "auxiliary_loss_clip": 0.01086133, "auxiliary_loss_mlp": 0.01035546, "balance_loss_clip": 1.03498542, "balance_loss_mlp": 1.02366662, "epoch": 0.8237486848038479, "flos": 23695009845120.0, "grad_norm": 1.6635741154144412, "language_loss": 0.73422629, "learning_rate": 3.170145562148763e-07, "loss": 0.7554431, "num_input_tokens_seen": 295503780, "step": 13701, "time_per_iteration": 2.6358823776245117 }, { "auxiliary_loss_clip": 0.01097849, "auxiliary_loss_mlp": 0.01036965, "balance_loss_clip": 1.03509569, "balance_loss_mlp": 1.02432895, "epoch": 0.8238088080565159, "flos": 23441301106560.0, "grad_norm": 1.9462768217086985, "language_loss": 0.69265807, "learning_rate": 3.1680417426637384e-07, "loss": 0.71400625, "num_input_tokens_seen": 295522035, "step": 13702, "time_per_iteration": 2.60188627243042 }, { "auxiliary_loss_clip": 0.01063324, "auxiliary_loss_mlp": 0.01034265, "balance_loss_clip": 1.03598332, "balance_loss_mlp": 1.02128875, "epoch": 0.8238689313091838, "flos": 22746537267840.0, "grad_norm": 1.9923897807991633, "language_loss": 0.75280106, "learning_rate": 3.1659385614482603e-07, "loss": 0.77377695, "num_input_tokens_seen": 295541190, "step": 13703, "time_per_iteration": 2.7468554973602295 }, { "auxiliary_loss_clip": 0.01113854, "auxiliary_loss_mlp": 0.01037893, "balance_loss_clip": 1.03847456, "balance_loss_mlp": 1.02467299, "epoch": 0.8239290545618518, "flos": 25630092264960.0, "grad_norm": 1.7182748421567742, "language_loss": 0.69657588, "learning_rate": 3.1638360185820755e-07, "loss": 0.71809334, "num_input_tokens_seen": 295558860, "step": 13704, "time_per_iteration": 2.5931785106658936 }, { "auxiliary_loss_clip": 0.01105612, "auxiliary_loss_mlp": 0.01030565, "balance_loss_clip": 1.03566051, "balance_loss_mlp": 1.01859665, "epoch": 0.8239891778145197, "flos": 26026473824640.0, "grad_norm": 1.8447020844215793, "language_loss": 0.64444757, "learning_rate": 3.161734114144916e-07, "loss": 0.66580933, "num_input_tokens_seen": 295578155, "step": 13705, "time_per_iteration": 2.5968048572540283 }, { "auxiliary_loss_clip": 0.01110492, "auxiliary_loss_mlp": 0.01031144, "balance_loss_clip": 1.03668666, "balance_loss_mlp": 1.01796532, "epoch": 0.8240493010671878, "flos": 21833257040640.0, "grad_norm": 1.541851656815521, "language_loss": 0.69572484, "learning_rate": 3.1596328482164915e-07, "loss": 0.71714121, "num_input_tokens_seen": 295599170, "step": 13706, "time_per_iteration": 2.5887668132781982 }, { "auxiliary_loss_clip": 0.0108328, "auxiliary_loss_mlp": 0.01039333, "balance_loss_clip": 1.03719616, "balance_loss_mlp": 1.02601147, "epoch": 0.8241094243198557, "flos": 18551919853440.0, "grad_norm": 1.661218457463816, "language_loss": 0.69479191, "learning_rate": 3.157532220876475e-07, "loss": 0.71601802, "num_input_tokens_seen": 295617465, "step": 13707, "time_per_iteration": 2.6411385536193848 }, { "auxiliary_loss_clip": 0.01072958, "auxiliary_loss_mlp": 0.01038335, "balance_loss_clip": 1.03431034, "balance_loss_mlp": 1.0244596, "epoch": 0.8241695475725237, "flos": 25447163276160.0, "grad_norm": 1.8467879085994943, "language_loss": 0.79235733, "learning_rate": 3.1554322322045226e-07, "loss": 0.81347024, "num_input_tokens_seen": 295634960, "step": 13708, "time_per_iteration": 2.700183153152466 }, { "auxiliary_loss_clip": 0.01092221, "auxiliary_loss_mlp": 0.0103148, "balance_loss_clip": 1.0341289, "balance_loss_mlp": 1.01864731, "epoch": 0.8242296708251916, "flos": 18989670902400.0, "grad_norm": 3.0430641807268954, "language_loss": 0.68361056, "learning_rate": 3.1533328822802664e-07, "loss": 0.70484757, "num_input_tokens_seen": 295652725, "step": 13709, "time_per_iteration": 2.5937395095825195 }, { "auxiliary_loss_clip": 0.01065101, "auxiliary_loss_mlp": 0.01032868, "balance_loss_clip": 1.03181398, "balance_loss_mlp": 1.02109027, "epoch": 0.8242897940778596, "flos": 22600883617920.0, "grad_norm": 1.766284405655816, "language_loss": 0.82331645, "learning_rate": 3.151234171183319e-07, "loss": 0.84429616, "num_input_tokens_seen": 295671195, "step": 13710, "time_per_iteration": 2.749650239944458 }, { "auxiliary_loss_clip": 0.01096973, "auxiliary_loss_mlp": 0.01034028, "balance_loss_clip": 1.03629923, "balance_loss_mlp": 1.02127314, "epoch": 0.8243499173305275, "flos": 21468153248640.0, "grad_norm": 13.701839105359984, "language_loss": 0.78112018, "learning_rate": 3.149136098993257e-07, "loss": 0.80243027, "num_input_tokens_seen": 295689130, "step": 13711, "time_per_iteration": 2.7447783946990967 }, { "auxiliary_loss_clip": 0.0107344, "auxiliary_loss_mlp": 0.01029912, "balance_loss_clip": 1.03311896, "balance_loss_mlp": 1.01736498, "epoch": 0.8244100405831956, "flos": 20010359773440.0, "grad_norm": 3.3468765947444568, "language_loss": 0.65435582, "learning_rate": 3.1470386657896473e-07, "loss": 0.67538929, "num_input_tokens_seen": 295706385, "step": 13712, "time_per_iteration": 4.317276477813721 }, { "auxiliary_loss_clip": 0.01091569, "auxiliary_loss_mlp": 0.01029045, "balance_loss_clip": 1.03673708, "balance_loss_mlp": 1.0174818, "epoch": 0.8244701638358635, "flos": 26430684549120.0, "grad_norm": 1.8364742562696106, "language_loss": 0.74371034, "learning_rate": 3.14494187165202e-07, "loss": 0.76491648, "num_input_tokens_seen": 295727925, "step": 13713, "time_per_iteration": 4.166277647018433 }, { "auxiliary_loss_clip": 0.01096875, "auxiliary_loss_mlp": 0.01027842, "balance_loss_clip": 1.03534007, "balance_loss_mlp": 1.01558685, "epoch": 0.8245302870885315, "flos": 17640004343040.0, "grad_norm": 6.838551643078677, "language_loss": 0.80911207, "learning_rate": 3.1428457166598833e-07, "loss": 0.83035922, "num_input_tokens_seen": 295744420, "step": 13714, "time_per_iteration": 2.624154806137085 }, { "auxiliary_loss_clip": 0.01099074, "auxiliary_loss_mlp": 0.01034976, "balance_loss_clip": 1.03917253, "balance_loss_mlp": 1.02173758, "epoch": 0.8245904103411995, "flos": 26209510554240.0, "grad_norm": 1.9766045334359852, "language_loss": 0.66371924, "learning_rate": 3.1407502008927235e-07, "loss": 0.68505979, "num_input_tokens_seen": 295765105, "step": 13715, "time_per_iteration": 4.212578296661377 }, { "auxiliary_loss_clip": 0.01081096, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.03784251, "balance_loss_mlp": 1.01657939, "epoch": 0.8246505335938674, "flos": 24205084928640.0, "grad_norm": 2.0424767412149567, "language_loss": 0.74730164, "learning_rate": 3.1386553244300086e-07, "loss": 0.76840568, "num_input_tokens_seen": 295784200, "step": 13716, "time_per_iteration": 2.7325594425201416 }, { "auxiliary_loss_clip": 0.00991112, "auxiliary_loss_mlp": 0.0100064, "balance_loss_clip": 1.00916934, "balance_loss_mlp": 0.99952489, "epoch": 0.8247106568465354, "flos": 67092195749760.0, "grad_norm": 0.7138774267720784, "language_loss": 0.58973479, "learning_rate": 3.136561087351175e-07, "loss": 0.60965228, "num_input_tokens_seen": 295846555, "step": 13717, "time_per_iteration": 4.931637763977051 }, { "auxiliary_loss_clip": 0.01094759, "auxiliary_loss_mlp": 0.00770088, "balance_loss_clip": 1.03633809, "balance_loss_mlp": 1.00021517, "epoch": 0.8247707800992033, "flos": 12568232805120.0, "grad_norm": 1.8911400591103953, "language_loss": 0.79565227, "learning_rate": 3.1344674897356373e-07, "loss": 0.81430078, "num_input_tokens_seen": 295863425, "step": 13718, "time_per_iteration": 2.6436800956726074 }, { "auxiliary_loss_clip": 0.01088621, "auxiliary_loss_mlp": 0.01033203, "balance_loss_clip": 1.03615391, "balance_loss_mlp": 1.02111554, "epoch": 0.8248309033518714, "flos": 15923617879680.0, "grad_norm": 1.5520316842938593, "language_loss": 0.68703258, "learning_rate": 3.132374531662778e-07, "loss": 0.70825082, "num_input_tokens_seen": 295880925, "step": 13719, "time_per_iteration": 2.716325044631958 }, { "auxiliary_loss_clip": 0.01079067, "auxiliary_loss_mlp": 0.01034824, "balance_loss_clip": 1.03340077, "balance_loss_mlp": 1.0202682, "epoch": 0.8248910266045393, "flos": 17564735393280.0, "grad_norm": 2.6589956640079038, "language_loss": 0.70158517, "learning_rate": 3.13028221321197e-07, "loss": 0.72272408, "num_input_tokens_seen": 295898205, "step": 13720, "time_per_iteration": 2.5896477699279785 }, { "auxiliary_loss_clip": 0.01033476, "auxiliary_loss_mlp": 0.01024097, "balance_loss_clip": 1.03508949, "balance_loss_mlp": 1.01189578, "epoch": 0.8249511498572073, "flos": 28619655275520.0, "grad_norm": 1.5927327922033778, "language_loss": 0.75676763, "learning_rate": 3.1281905344625467e-07, "loss": 0.77734333, "num_input_tokens_seen": 295918130, "step": 13721, "time_per_iteration": 3.003366470336914 }, { "auxiliary_loss_clip": 0.01064431, "auxiliary_loss_mlp": 0.0102768, "balance_loss_clip": 1.04172993, "balance_loss_mlp": 1.01569343, "epoch": 0.8250112731098752, "flos": 25556583081600.0, "grad_norm": 1.9277434065767896, "language_loss": 0.7792846, "learning_rate": 3.1260994954938305e-07, "loss": 0.80020571, "num_input_tokens_seen": 295937760, "step": 13722, "time_per_iteration": 2.993467092514038 }, { "auxiliary_loss_clip": 0.01107933, "auxiliary_loss_mlp": 0.01030614, "balance_loss_clip": 1.03832984, "balance_loss_mlp": 1.01868153, "epoch": 0.8250713963625432, "flos": 27746164339200.0, "grad_norm": 1.9336689467836483, "language_loss": 0.63077027, "learning_rate": 3.1240090963851205e-07, "loss": 0.6521557, "num_input_tokens_seen": 295957585, "step": 13723, "time_per_iteration": 2.65627384185791 }, { "auxiliary_loss_clip": 0.01109221, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.03650689, "balance_loss_mlp": 1.0223273, "epoch": 0.8251315196152111, "flos": 21610610588160.0, "grad_norm": 1.425967776015011, "language_loss": 0.74256718, "learning_rate": 3.121919337215666e-07, "loss": 0.76400876, "num_input_tokens_seen": 295977135, "step": 13724, "time_per_iteration": 2.6450181007385254 }, { "auxiliary_loss_clip": 0.01076005, "auxiliary_loss_mlp": 0.01035593, "balance_loss_clip": 1.03590727, "balance_loss_mlp": 1.02253342, "epoch": 0.8251916428678792, "flos": 28579363194240.0, "grad_norm": 1.8109135586659708, "language_loss": 0.6419245, "learning_rate": 3.1198302180647253e-07, "loss": 0.66304046, "num_input_tokens_seen": 295996265, "step": 13725, "time_per_iteration": 2.747354507446289 }, { "auxiliary_loss_clip": 0.01081699, "auxiliary_loss_mlp": 0.01029734, "balance_loss_clip": 1.03467178, "balance_loss_mlp": 1.01717496, "epoch": 0.8252517661205471, "flos": 23075191733760.0, "grad_norm": 1.5423551170824084, "language_loss": 0.81953287, "learning_rate": 3.1177417390115125e-07, "loss": 0.84064722, "num_input_tokens_seen": 296014745, "step": 13726, "time_per_iteration": 2.677957057952881 }, { "auxiliary_loss_clip": 0.01090181, "auxiliary_loss_mlp": 0.01033897, "balance_loss_clip": 1.03259659, "balance_loss_mlp": 1.02245855, "epoch": 0.8253118893732151, "flos": 31759576617600.0, "grad_norm": 1.6832694134847563, "language_loss": 0.70317417, "learning_rate": 3.1156539001352286e-07, "loss": 0.72441494, "num_input_tokens_seen": 296036960, "step": 13727, "time_per_iteration": 2.6928937435150146 }, { "auxiliary_loss_clip": 0.01102136, "auxiliary_loss_mlp": 0.01028405, "balance_loss_clip": 1.03817558, "balance_loss_mlp": 1.01547694, "epoch": 0.8253720126258831, "flos": 18296415434880.0, "grad_norm": 1.667834208410725, "language_loss": 0.62520349, "learning_rate": 3.113566701515036e-07, "loss": 0.64650893, "num_input_tokens_seen": 296056540, "step": 13728, "time_per_iteration": 2.6370222568511963 }, { "auxiliary_loss_clip": 0.01092032, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.03751087, "balance_loss_mlp": 1.0174228, "epoch": 0.825432135878551, "flos": 26797332625920.0, "grad_norm": 1.9482709923382855, "language_loss": 0.71667683, "learning_rate": 3.111480143230092e-07, "loss": 0.73789644, "num_input_tokens_seen": 296077950, "step": 13729, "time_per_iteration": 2.6492090225219727 }, { "auxiliary_loss_clip": 0.01014426, "auxiliary_loss_mlp": 0.0100436, "balance_loss_clip": 1.01090586, "balance_loss_mlp": 1.00330532, "epoch": 0.825492259131219, "flos": 54219116217600.0, "grad_norm": 0.8488116722729574, "language_loss": 0.6264025, "learning_rate": 3.109394225359514e-07, "loss": 0.64659035, "num_input_tokens_seen": 296127060, "step": 13730, "time_per_iteration": 3.0054545402526855 }, { "auxiliary_loss_clip": 0.01058894, "auxiliary_loss_mlp": 0.01034161, "balance_loss_clip": 1.03521633, "balance_loss_mlp": 1.02156639, "epoch": 0.825552382383887, "flos": 43756145493120.0, "grad_norm": 7.4365130225127505, "language_loss": 0.6353327, "learning_rate": 3.1073089479823945e-07, "loss": 0.65626323, "num_input_tokens_seen": 296147775, "step": 13731, "time_per_iteration": 2.9331674575805664 }, { "auxiliary_loss_clip": 0.0107139, "auxiliary_loss_mlp": 0.00773278, "balance_loss_clip": 1.03046966, "balance_loss_mlp": 1.0002327, "epoch": 0.825612505636555, "flos": 12602814624000.0, "grad_norm": 2.180240651143821, "language_loss": 0.70295024, "learning_rate": 3.105224311177812e-07, "loss": 0.72139692, "num_input_tokens_seen": 296163560, "step": 13732, "time_per_iteration": 2.765413761138916 }, { "auxiliary_loss_clip": 0.01100354, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.03632462, "balance_loss_mlp": 1.02532113, "epoch": 0.8256726288892229, "flos": 17595618111360.0, "grad_norm": 2.287080193464761, "language_loss": 0.71307957, "learning_rate": 3.103140315024817e-07, "loss": 0.7344656, "num_input_tokens_seen": 296178730, "step": 13733, "time_per_iteration": 2.663184642791748 }, { "auxiliary_loss_clip": 0.01106421, "auxiliary_loss_mlp": 0.01033457, "balance_loss_clip": 1.03536689, "balance_loss_mlp": 1.02092218, "epoch": 0.8257327521418909, "flos": 23805794367360.0, "grad_norm": 1.5370953364737692, "language_loss": 0.82361829, "learning_rate": 3.1010569596024437e-07, "loss": 0.84501708, "num_input_tokens_seen": 296200175, "step": 13734, "time_per_iteration": 2.5860283374786377 }, { "auxiliary_loss_clip": 0.01078022, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.03394449, "balance_loss_mlp": 1.02108788, "epoch": 0.8257928753945588, "flos": 19281121856640.0, "grad_norm": 1.767732379268741, "language_loss": 0.8304292, "learning_rate": 3.098974244989676e-07, "loss": 0.85154831, "num_input_tokens_seen": 296219305, "step": 13735, "time_per_iteration": 2.6341776847839355 }, { "auxiliary_loss_clip": 0.01103224, "auxiliary_loss_mlp": 0.01029169, "balance_loss_clip": 1.03989172, "balance_loss_mlp": 1.01795721, "epoch": 0.8258529986472268, "flos": 18478841633280.0, "grad_norm": 1.736444707629355, "language_loss": 0.70653635, "learning_rate": 3.096892171265497e-07, "loss": 0.72786027, "num_input_tokens_seen": 296236945, "step": 13736, "time_per_iteration": 2.5950427055358887 }, { "auxiliary_loss_clip": 0.01021603, "auxiliary_loss_mlp": 0.01002911, "balance_loss_clip": 1.00879157, "balance_loss_mlp": 1.00194514, "epoch": 0.8259131218998947, "flos": 62137957512960.0, "grad_norm": 0.8987273381116809, "language_loss": 0.6798467, "learning_rate": 3.0948107385088665e-07, "loss": 0.70009184, "num_input_tokens_seen": 296294685, "step": 13737, "time_per_iteration": 3.1607825756073 }, { "auxiliary_loss_clip": 0.01084099, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.0343399, "balance_loss_mlp": 1.02032113, "epoch": 0.8259732451525628, "flos": 22159038418560.0, "grad_norm": 1.7543830364671171, "language_loss": 0.69818115, "learning_rate": 3.0927299467987e-07, "loss": 0.71934807, "num_input_tokens_seen": 296314790, "step": 13738, "time_per_iteration": 2.715946912765503 }, { "auxiliary_loss_clip": 0.01092604, "auxiliary_loss_mlp": 0.01029077, "balance_loss_clip": 1.03914821, "balance_loss_mlp": 1.01492107, "epoch": 0.8260333684052307, "flos": 38361645233280.0, "grad_norm": 1.9809949104241253, "language_loss": 0.63092321, "learning_rate": 3.090649796213911e-07, "loss": 0.65214008, "num_input_tokens_seen": 296335355, "step": 13739, "time_per_iteration": 2.8820793628692627 }, { "auxiliary_loss_clip": 0.01011074, "auxiliary_loss_mlp": 0.01000594, "balance_loss_clip": 1.00743914, "balance_loss_mlp": 0.99961609, "epoch": 0.8260934916578987, "flos": 62185611882240.0, "grad_norm": 0.8059815006501098, "language_loss": 0.59246588, "learning_rate": 3.0885702868333853e-07, "loss": 0.61258256, "num_input_tokens_seen": 296399885, "step": 13740, "time_per_iteration": 3.2520594596862793 }, { "auxiliary_loss_clip": 0.01114893, "auxiliary_loss_mlp": 0.01034041, "balance_loss_clip": 1.03906655, "balance_loss_mlp": 1.02052891, "epoch": 0.8261536149105667, "flos": 22565475786240.0, "grad_norm": 2.0240971997235317, "language_loss": 0.75221682, "learning_rate": 3.086491418735959e-07, "loss": 0.7737062, "num_input_tokens_seen": 296417660, "step": 13741, "time_per_iteration": 2.543391704559326 }, { "auxiliary_loss_clip": 0.01096486, "auxiliary_loss_mlp": 0.01034655, "balance_loss_clip": 1.03584099, "balance_loss_mlp": 1.02222109, "epoch": 0.8262137381632346, "flos": 32525479342080.0, "grad_norm": 1.8715316592875402, "language_loss": 0.62344342, "learning_rate": 3.0844131920004726e-07, "loss": 0.64475489, "num_input_tokens_seen": 296438255, "step": 13742, "time_per_iteration": 2.7066636085510254 }, { "auxiliary_loss_clip": 0.01066607, "auxiliary_loss_mlp": 0.01036357, "balance_loss_clip": 1.03625488, "balance_loss_mlp": 1.02224827, "epoch": 0.8262738614159026, "flos": 14136451666560.0, "grad_norm": 2.739309614101712, "language_loss": 0.65881348, "learning_rate": 3.0823356067057327e-07, "loss": 0.67984313, "num_input_tokens_seen": 296454485, "step": 13743, "time_per_iteration": 2.722188949584961 }, { "auxiliary_loss_clip": 0.01089117, "auxiliary_loss_mlp": 0.01035575, "balance_loss_clip": 1.0356648, "balance_loss_mlp": 1.02275968, "epoch": 0.8263339846685706, "flos": 19825347795840.0, "grad_norm": 1.7892755960798923, "language_loss": 0.66778719, "learning_rate": 3.0802586629305283e-07, "loss": 0.6890341, "num_input_tokens_seen": 296473740, "step": 13744, "time_per_iteration": 2.632858991622925 }, { "auxiliary_loss_clip": 0.01077178, "auxiliary_loss_mlp": 0.01032278, "balance_loss_clip": 1.03721189, "balance_loss_mlp": 1.02044034, "epoch": 0.8263941079212386, "flos": 22745962650240.0, "grad_norm": 1.8023826175749642, "language_loss": 0.75316632, "learning_rate": 3.078182360753612e-07, "loss": 0.77426088, "num_input_tokens_seen": 296493355, "step": 13745, "time_per_iteration": 2.7315781116485596 }, { "auxiliary_loss_clip": 0.01077899, "auxiliary_loss_mlp": 0.0077187, "balance_loss_clip": 1.03393078, "balance_loss_mlp": 1.00011253, "epoch": 0.8264542311739065, "flos": 20120641505280.0, "grad_norm": 1.8014211048676299, "language_loss": 0.79279208, "learning_rate": 3.076106700253709e-07, "loss": 0.81128979, "num_input_tokens_seen": 296510520, "step": 13746, "time_per_iteration": 2.6316795349121094 }, { "auxiliary_loss_clip": 0.01103647, "auxiliary_loss_mlp": 0.01036543, "balance_loss_clip": 1.03922772, "balance_loss_mlp": 1.02318525, "epoch": 0.8265143544265745, "flos": 16837149502080.0, "grad_norm": 1.8721646210593863, "language_loss": 0.68316424, "learning_rate": 3.0740316815095415e-07, "loss": 0.70456612, "num_input_tokens_seen": 296528265, "step": 13747, "time_per_iteration": 2.586827039718628 }, { "auxiliary_loss_clip": 0.0109475, "auxiliary_loss_mlp": 0.01037445, "balance_loss_clip": 1.03468108, "balance_loss_mlp": 1.02315235, "epoch": 0.8265744776792424, "flos": 22018592240640.0, "grad_norm": 1.994737585930927, "language_loss": 0.75182354, "learning_rate": 3.0719573045997835e-07, "loss": 0.77314556, "num_input_tokens_seen": 296547810, "step": 13748, "time_per_iteration": 2.650148868560791 }, { "auxiliary_loss_clip": 0.01071464, "auxiliary_loss_mlp": 0.01033138, "balance_loss_clip": 1.03686166, "balance_loss_mlp": 1.02170539, "epoch": 0.8266346009319104, "flos": 19244852098560.0, "grad_norm": 1.689203762569125, "language_loss": 0.64030862, "learning_rate": 3.069883569603102e-07, "loss": 0.6613546, "num_input_tokens_seen": 296565940, "step": 13749, "time_per_iteration": 2.757077217102051 }, { "auxiliary_loss_clip": 0.01082519, "auxiliary_loss_mlp": 0.01028885, "balance_loss_clip": 1.03196669, "balance_loss_mlp": 1.01680279, "epoch": 0.8266947241845783, "flos": 24166768095360.0, "grad_norm": 1.5570975728465015, "language_loss": 0.73744154, "learning_rate": 3.067810476598132e-07, "loss": 0.75855553, "num_input_tokens_seen": 296585090, "step": 13750, "time_per_iteration": 2.714416742324829 }, { "auxiliary_loss_clip": 0.01099886, "auxiliary_loss_mlp": 0.01035041, "balance_loss_clip": 1.03848624, "balance_loss_mlp": 1.02245283, "epoch": 0.8267548474372464, "flos": 21105814803840.0, "grad_norm": 1.7884167706589897, "language_loss": 0.65513742, "learning_rate": 3.065738025663496e-07, "loss": 0.67648673, "num_input_tokens_seen": 296604950, "step": 13751, "time_per_iteration": 5.785562753677368 }, { "auxiliary_loss_clip": 0.01081731, "auxiliary_loss_mlp": 0.01027835, "balance_loss_clip": 1.03284156, "balance_loss_mlp": 1.01637304, "epoch": 0.8268149706899143, "flos": 39968288668800.0, "grad_norm": 1.5963517669581677, "language_loss": 0.60753131, "learning_rate": 3.0636662168777607e-07, "loss": 0.628627, "num_input_tokens_seen": 296627780, "step": 13752, "time_per_iteration": 2.755326747894287 }, { "auxiliary_loss_clip": 0.01018872, "auxiliary_loss_mlp": 0.0100062, "balance_loss_clip": 1.00675297, "balance_loss_mlp": 0.99959439, "epoch": 0.8268750939425823, "flos": 65782423244160.0, "grad_norm": 0.7684012049495671, "language_loss": 0.57412326, "learning_rate": 3.0615950503194986e-07, "loss": 0.59431815, "num_input_tokens_seen": 296683850, "step": 13753, "time_per_iteration": 3.1750407218933105 }, { "auxiliary_loss_clip": 0.0099067, "auxiliary_loss_mlp": 0.00751461, "balance_loss_clip": 1.01540029, "balance_loss_mlp": 0.99955767, "epoch": 0.8269352171952503, "flos": 52981455242880.0, "grad_norm": 0.6979413175863002, "language_loss": 0.54908955, "learning_rate": 3.0595245260672563e-07, "loss": 0.56651086, "num_input_tokens_seen": 296741420, "step": 13754, "time_per_iteration": 4.901344299316406 }, { "auxiliary_loss_clip": 0.0106662, "auxiliary_loss_mlp": 0.01033194, "balance_loss_clip": 1.03270113, "balance_loss_mlp": 1.02221489, "epoch": 0.8269953404479182, "flos": 23076125487360.0, "grad_norm": 1.746367517796231, "language_loss": 0.69104445, "learning_rate": 3.0574546441995354e-07, "loss": 0.71204263, "num_input_tokens_seen": 296759620, "step": 13755, "time_per_iteration": 3.0003440380096436 }, { "auxiliary_loss_clip": 0.01062261, "auxiliary_loss_mlp": 0.01029699, "balance_loss_clip": 1.03354418, "balance_loss_mlp": 1.01864886, "epoch": 0.8270554637005862, "flos": 14209996763520.0, "grad_norm": 1.955736447357461, "language_loss": 0.70088506, "learning_rate": 3.0553854047948324e-07, "loss": 0.72180462, "num_input_tokens_seen": 296777275, "step": 13756, "time_per_iteration": 4.257762432098389 }, { "auxiliary_loss_clip": 0.01102671, "auxiliary_loss_mlp": 0.01033469, "balance_loss_clip": 1.04094052, "balance_loss_mlp": 1.02107131, "epoch": 0.8271155869532542, "flos": 21762046327680.0, "grad_norm": 1.737700331339717, "language_loss": 0.72146094, "learning_rate": 3.053316807931623e-07, "loss": 0.74282235, "num_input_tokens_seen": 296796655, "step": 13757, "time_per_iteration": 2.6405348777770996 }, { "auxiliary_loss_clip": 0.01101277, "auxiliary_loss_mlp": 0.01035132, "balance_loss_clip": 1.03689456, "balance_loss_mlp": 1.02129722, "epoch": 0.8271757102059222, "flos": 15120475729920.0, "grad_norm": 2.690067923112346, "language_loss": 0.6930806, "learning_rate": 3.0512488536883283e-07, "loss": 0.7144447, "num_input_tokens_seen": 296813705, "step": 13758, "time_per_iteration": 2.6009304523468018 }, { "auxiliary_loss_clip": 0.01083685, "auxiliary_loss_mlp": 0.01028318, "balance_loss_clip": 1.03558612, "balance_loss_mlp": 1.01677251, "epoch": 0.8272358334585901, "flos": 24133730561280.0, "grad_norm": 1.5602292181836888, "language_loss": 0.69900572, "learning_rate": 3.0491815421433775e-07, "loss": 0.72012579, "num_input_tokens_seen": 296833985, "step": 13759, "time_per_iteration": 2.6815199851989746 }, { "auxiliary_loss_clip": 0.01087619, "auxiliary_loss_mlp": 0.0103009, "balance_loss_clip": 1.03719068, "balance_loss_mlp": 1.01779902, "epoch": 0.8272959567112581, "flos": 18990712396800.0, "grad_norm": 1.7630032179390376, "language_loss": 0.70951492, "learning_rate": 3.047114873375161e-07, "loss": 0.73069203, "num_input_tokens_seen": 296850150, "step": 13760, "time_per_iteration": 2.6415457725524902 }, { "auxiliary_loss_clip": 0.0106558, "auxiliary_loss_mlp": 0.01031388, "balance_loss_clip": 1.03384495, "balance_loss_mlp": 1.01930583, "epoch": 0.827356079963926, "flos": 20631614428800.0, "grad_norm": 44.683792034890395, "language_loss": 0.77058452, "learning_rate": 3.0450488474620505e-07, "loss": 0.79155421, "num_input_tokens_seen": 296869585, "step": 13761, "time_per_iteration": 2.658909320831299 }, { "auxiliary_loss_clip": 0.01075197, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.03739285, "balance_loss_mlp": 1.0216558, "epoch": 0.827416203216594, "flos": 22416625825920.0, "grad_norm": 1.6365377494190674, "language_loss": 0.70046437, "learning_rate": 3.042983464482387e-07, "loss": 0.72154659, "num_input_tokens_seen": 296887710, "step": 13762, "time_per_iteration": 2.6890883445739746 }, { "auxiliary_loss_clip": 0.01056694, "auxiliary_loss_mlp": 0.01032, "balance_loss_clip": 1.03542447, "balance_loss_mlp": 1.0196439, "epoch": 0.827476326469262, "flos": 19026192055680.0, "grad_norm": 2.3529843833311297, "language_loss": 0.70278549, "learning_rate": 3.0409187245144853e-07, "loss": 0.72367239, "num_input_tokens_seen": 296906265, "step": 13763, "time_per_iteration": 2.7008395195007324 }, { "auxiliary_loss_clip": 0.01013794, "auxiliary_loss_mlp": 0.00999838, "balance_loss_clip": 1.01678598, "balance_loss_mlp": 0.99868739, "epoch": 0.82753644972193, "flos": 68500575089280.0, "grad_norm": 0.8946836161965805, "language_loss": 0.65109873, "learning_rate": 3.038854627636651e-07, "loss": 0.67123502, "num_input_tokens_seen": 296971290, "step": 13764, "time_per_iteration": 3.350186586380005 }, { "auxiliary_loss_clip": 0.01100213, "auxiliary_loss_mlp": 0.01033972, "balance_loss_clip": 1.03844428, "balance_loss_mlp": 1.02069247, "epoch": 0.8275965729745979, "flos": 18405404277120.0, "grad_norm": 1.9854785426124901, "language_loss": 0.77840686, "learning_rate": 3.0367911739271423e-07, "loss": 0.79974878, "num_input_tokens_seen": 296989060, "step": 13765, "time_per_iteration": 2.6723389625549316 }, { "auxiliary_loss_clip": 0.01056381, "auxiliary_loss_mlp": 0.01029974, "balance_loss_clip": 1.03462076, "balance_loss_mlp": 1.01668835, "epoch": 0.8276566962272659, "flos": 28512067063680.0, "grad_norm": 1.6645003745934188, "language_loss": 0.62420988, "learning_rate": 3.034728363464214e-07, "loss": 0.64507341, "num_input_tokens_seen": 297011300, "step": 13766, "time_per_iteration": 2.811694383621216 }, { "auxiliary_loss_clip": 0.01073861, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.03385091, "balance_loss_mlp": 1.01828325, "epoch": 0.8277168194799339, "flos": 20230240878720.0, "grad_norm": 1.6477178817747764, "language_loss": 0.82427168, "learning_rate": 3.03266619632609e-07, "loss": 0.84532011, "num_input_tokens_seen": 297030350, "step": 13767, "time_per_iteration": 2.716275453567505 }, { "auxiliary_loss_clip": 0.01082823, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.03814888, "balance_loss_mlp": 1.01580667, "epoch": 0.8277769427326018, "flos": 28476623318400.0, "grad_norm": 1.6672913040668584, "language_loss": 0.6903677, "learning_rate": 3.030604672590964e-07, "loss": 0.71148169, "num_input_tokens_seen": 297049710, "step": 13768, "time_per_iteration": 2.688441753387451 }, { "auxiliary_loss_clip": 0.0103987, "auxiliary_loss_mlp": 0.0103503, "balance_loss_clip": 1.02953947, "balance_loss_mlp": 1.02242327, "epoch": 0.8278370659852698, "flos": 27197628768000.0, "grad_norm": 2.022593721700604, "language_loss": 0.74887329, "learning_rate": 3.028543792337006e-07, "loss": 0.76962233, "num_input_tokens_seen": 297070510, "step": 13769, "time_per_iteration": 2.765038251876831 }, { "auxiliary_loss_clip": 0.01084819, "auxiliary_loss_mlp": 0.01030015, "balance_loss_clip": 1.03507888, "balance_loss_mlp": 1.01786184, "epoch": 0.8278971892379378, "flos": 37816126404480.0, "grad_norm": 1.6742460818696816, "language_loss": 0.74587786, "learning_rate": 3.0264835556423675e-07, "loss": 0.76702625, "num_input_tokens_seen": 297092585, "step": 13770, "time_per_iteration": 2.78021502494812 }, { "auxiliary_loss_clip": 0.010808, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.03744841, "balance_loss_mlp": 1.0202477, "epoch": 0.8279573124906058, "flos": 22560160573440.0, "grad_norm": 2.613906237758894, "language_loss": 0.75822175, "learning_rate": 3.0244239625851785e-07, "loss": 0.77936077, "num_input_tokens_seen": 297110055, "step": 13771, "time_per_iteration": 2.6900837421417236 }, { "auxiliary_loss_clip": 0.01109049, "auxiliary_loss_mlp": 0.01035265, "balance_loss_clip": 1.03709054, "balance_loss_mlp": 1.02323627, "epoch": 0.8280174357432737, "flos": 36064619418240.0, "grad_norm": 1.6606339233038099, "language_loss": 0.72508442, "learning_rate": 3.0223650132435284e-07, "loss": 0.74652761, "num_input_tokens_seen": 297132170, "step": 13772, "time_per_iteration": 2.7568705081939697 }, { "auxiliary_loss_clip": 0.01087016, "auxiliary_loss_mlp": 0.01030121, "balance_loss_clip": 1.03733611, "balance_loss_mlp": 1.01710296, "epoch": 0.8280775589959417, "flos": 22961067246720.0, "grad_norm": 2.2592902165659154, "language_loss": 0.75143635, "learning_rate": 3.0203067076955035e-07, "loss": 0.77260774, "num_input_tokens_seen": 297149515, "step": 13773, "time_per_iteration": 2.683868646621704 }, { "auxiliary_loss_clip": 0.01062538, "auxiliary_loss_mlp": 0.01034452, "balance_loss_clip": 1.03560376, "balance_loss_mlp": 1.02264452, "epoch": 0.8281376822486096, "flos": 26063282286720.0, "grad_norm": 1.872449151264808, "language_loss": 0.75778252, "learning_rate": 3.01824904601915e-07, "loss": 0.77875245, "num_input_tokens_seen": 297170320, "step": 13774, "time_per_iteration": 2.7567591667175293 }, { "auxiliary_loss_clip": 0.01081331, "auxiliary_loss_mlp": 0.00770591, "balance_loss_clip": 1.03898907, "balance_loss_mlp": 1.00031459, "epoch": 0.8281978055012776, "flos": 20667776446080.0, "grad_norm": 1.8056323038896689, "language_loss": 0.74878412, "learning_rate": 3.01619202829249e-07, "loss": 0.76730335, "num_input_tokens_seen": 297189935, "step": 13775, "time_per_iteration": 2.74230694770813 }, { "auxiliary_loss_clip": 0.01112679, "auxiliary_loss_mlp": 0.01030499, "balance_loss_clip": 1.0371238, "balance_loss_mlp": 1.01723146, "epoch": 0.8282579287539455, "flos": 29315281040640.0, "grad_norm": 2.0814301454392994, "language_loss": 0.73856264, "learning_rate": 3.01413565459353e-07, "loss": 0.75999445, "num_input_tokens_seen": 297210885, "step": 13776, "time_per_iteration": 2.684095621109009 }, { "auxiliary_loss_clip": 0.01053766, "auxiliary_loss_mlp": 0.01037151, "balance_loss_clip": 1.0285629, "balance_loss_mlp": 1.02321506, "epoch": 0.8283180520066136, "flos": 15706178899200.0, "grad_norm": 1.9371446657055744, "language_loss": 0.77532077, "learning_rate": 3.0120799250002483e-07, "loss": 0.79622996, "num_input_tokens_seen": 297228500, "step": 13777, "time_per_iteration": 2.7644686698913574 }, { "auxiliary_loss_clip": 0.01096655, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.03806889, "balance_loss_mlp": 1.01883733, "epoch": 0.8283781752592815, "flos": 24791470456320.0, "grad_norm": 1.6926504608706043, "language_loss": 0.82732141, "learning_rate": 3.010024839590604e-07, "loss": 0.8485868, "num_input_tokens_seen": 297249470, "step": 13778, "time_per_iteration": 2.7171225547790527 }, { "auxiliary_loss_clip": 0.01092306, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.03395522, "balance_loss_mlp": 1.01303005, "epoch": 0.8284382985119495, "flos": 18982811404800.0, "grad_norm": 1.8413358549591246, "language_loss": 0.74458718, "learning_rate": 3.0079703984425187e-07, "loss": 0.76576483, "num_input_tokens_seen": 297265970, "step": 13779, "time_per_iteration": 2.626110553741455 }, { "auxiliary_loss_clip": 0.0100526, "auxiliary_loss_mlp": 0.01000579, "balance_loss_clip": 1.01090991, "balance_loss_mlp": 0.99951804, "epoch": 0.8284984217646175, "flos": 61034460814080.0, "grad_norm": 0.7702655263685751, "language_loss": 0.56672931, "learning_rate": 3.0059166016338954e-07, "loss": 0.5867877, "num_input_tokens_seen": 297325525, "step": 13780, "time_per_iteration": 3.212908983230591 }, { "auxiliary_loss_clip": 0.01067858, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.03421974, "balance_loss_mlp": 1.01657593, "epoch": 0.8285585450172854, "flos": 19714635100800.0, "grad_norm": 1.699800901130364, "language_loss": 0.79739404, "learning_rate": 3.0038634492426205e-07, "loss": 0.81836849, "num_input_tokens_seen": 297345025, "step": 13781, "time_per_iteration": 2.655301809310913 }, { "auxiliary_loss_clip": 0.01065725, "auxiliary_loss_mlp": 0.01033354, "balance_loss_clip": 1.03596509, "balance_loss_mlp": 1.01966882, "epoch": 0.8286186682699535, "flos": 21688896280320.0, "grad_norm": 1.8730371598803492, "language_loss": 0.75640142, "learning_rate": 3.001810941346543e-07, "loss": 0.77739221, "num_input_tokens_seen": 297363570, "step": 13782, "time_per_iteration": 2.6944918632507324 }, { "auxiliary_loss_clip": 0.01095829, "auxiliary_loss_mlp": 0.01033027, "balance_loss_clip": 1.03388703, "balance_loss_mlp": 1.02099264, "epoch": 0.8286787915226214, "flos": 25775566346880.0, "grad_norm": 1.6561193474083664, "language_loss": 0.76484203, "learning_rate": 2.9997590780234983e-07, "loss": 0.78613055, "num_input_tokens_seen": 297385385, "step": 13783, "time_per_iteration": 2.6690874099731445 }, { "auxiliary_loss_clip": 0.01107918, "auxiliary_loss_mlp": 0.01028274, "balance_loss_clip": 1.03614211, "balance_loss_mlp": 1.01590598, "epoch": 0.8287389147752894, "flos": 21288348743040.0, "grad_norm": 1.6914982205488613, "language_loss": 0.73518729, "learning_rate": 2.997707859351304e-07, "loss": 0.75654924, "num_input_tokens_seen": 297403950, "step": 13784, "time_per_iteration": 2.6368956565856934 }, { "auxiliary_loss_clip": 0.01100253, "auxiliary_loss_mlp": 0.01035986, "balance_loss_clip": 1.03504157, "balance_loss_mlp": 1.02221763, "epoch": 0.8287990380279573, "flos": 33544875323520.0, "grad_norm": 6.002474127634083, "language_loss": 0.69880319, "learning_rate": 2.99565728540772e-07, "loss": 0.72016555, "num_input_tokens_seen": 297424565, "step": 13785, "time_per_iteration": 2.7842202186584473 }, { "auxiliary_loss_clip": 0.01085403, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.03928435, "balance_loss_mlp": 1.02327418, "epoch": 0.8288591612806253, "flos": 22966346545920.0, "grad_norm": 1.401854742726992, "language_loss": 0.68165773, "learning_rate": 2.993607356270516e-07, "loss": 0.7028681, "num_input_tokens_seen": 297445180, "step": 13786, "time_per_iteration": 2.6792120933532715 }, { "auxiliary_loss_clip": 0.01069299, "auxiliary_loss_mlp": 0.01035995, "balance_loss_clip": 1.03638959, "balance_loss_mlp": 1.02368629, "epoch": 0.8289192845332932, "flos": 18588979710720.0, "grad_norm": 1.8600312404195347, "language_loss": 0.77116591, "learning_rate": 2.991558072017426e-07, "loss": 0.7922188, "num_input_tokens_seen": 297463790, "step": 13787, "time_per_iteration": 2.7485241889953613 }, { "auxiliary_loss_clip": 0.01090466, "auxiliary_loss_mlp": 0.01033116, "balance_loss_clip": 1.03657961, "balance_loss_mlp": 1.02168417, "epoch": 0.8289794077859612, "flos": 15450423085440.0, "grad_norm": 1.668975764455463, "language_loss": 0.80241442, "learning_rate": 2.989509432726163e-07, "loss": 0.82365024, "num_input_tokens_seen": 297480100, "step": 13788, "time_per_iteration": 2.646430730819702 }, { "auxiliary_loss_clip": 0.01083639, "auxiliary_loss_mlp": 0.01033973, "balance_loss_clip": 1.03628707, "balance_loss_mlp": 1.02209973, "epoch": 0.8290395310386292, "flos": 28877853214080.0, "grad_norm": 1.718363547417138, "language_loss": 0.71454132, "learning_rate": 2.9874614384744014e-07, "loss": 0.73571742, "num_input_tokens_seen": 297499890, "step": 13789, "time_per_iteration": 2.6843364238739014 }, { "auxiliary_loss_clip": 0.01076455, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.032372, "balance_loss_mlp": 1.01604366, "epoch": 0.8290996542912972, "flos": 36576274700160.0, "grad_norm": 2.586850358316563, "language_loss": 0.68054211, "learning_rate": 2.985414089339813e-07, "loss": 0.7015934, "num_input_tokens_seen": 297521440, "step": 13790, "time_per_iteration": 4.365084171295166 }, { "auxiliary_loss_clip": 0.01099215, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 1.03627872, "balance_loss_mlp": 1.01633167, "epoch": 0.8291597775439651, "flos": 23623009032960.0, "grad_norm": 1.629598209312908, "language_loss": 0.77366352, "learning_rate": 2.9833673854000265e-07, "loss": 0.7949574, "num_input_tokens_seen": 297539920, "step": 13791, "time_per_iteration": 4.515652894973755 }, { "auxiliary_loss_clip": 0.01083692, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.03688049, "balance_loss_mlp": 1.01825666, "epoch": 0.8292199007966331, "flos": 21397481239680.0, "grad_norm": 1.4251720115143436, "language_loss": 0.70067787, "learning_rate": 2.981321326732651e-07, "loss": 0.72182631, "num_input_tokens_seen": 297560000, "step": 13792, "time_per_iteration": 2.7335619926452637 }, { "auxiliary_loss_clip": 0.0108758, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 1.03578472, "balance_loss_mlp": 1.01971602, "epoch": 0.829280024049301, "flos": 28767607395840.0, "grad_norm": 1.529482170283821, "language_loss": 0.64886749, "learning_rate": 2.9792759134152736e-07, "loss": 0.67006767, "num_input_tokens_seen": 297579300, "step": 13793, "time_per_iteration": 4.254675388336182 }, { "auxiliary_loss_clip": 0.01052518, "auxiliary_loss_mlp": 0.01038478, "balance_loss_clip": 1.03231871, "balance_loss_mlp": 1.02323079, "epoch": 0.829340147301969, "flos": 19938071652480.0, "grad_norm": 1.91807865555319, "language_loss": 0.66570354, "learning_rate": 2.977231145525461e-07, "loss": 0.6866135, "num_input_tokens_seen": 297598095, "step": 13794, "time_per_iteration": 2.6897053718566895 }, { "auxiliary_loss_clip": 0.01108178, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.03576493, "balance_loss_mlp": 1.0234766, "epoch": 0.829400270554637, "flos": 25228575060480.0, "grad_norm": 2.1693553604990132, "language_loss": 0.66396624, "learning_rate": 2.975187023140757e-07, "loss": 0.68541253, "num_input_tokens_seen": 297615955, "step": 13795, "time_per_iteration": 2.609815835952759 }, { "auxiliary_loss_clip": 0.0101707, "auxiliary_loss_mlp": 0.01041895, "balance_loss_clip": 1.031497, "balance_loss_mlp": 1.02748859, "epoch": 0.829460393807305, "flos": 24463570176000.0, "grad_norm": 1.7274097985625807, "language_loss": 0.66617584, "learning_rate": 2.973143546338661e-07, "loss": 0.68676549, "num_input_tokens_seen": 297636285, "step": 13796, "time_per_iteration": 4.47485876083374 }, { "auxiliary_loss_clip": 0.01060431, "auxiliary_loss_mlp": 0.0104346, "balance_loss_clip": 1.03264594, "balance_loss_mlp": 1.02998924, "epoch": 0.829520517059973, "flos": 15122486891520.0, "grad_norm": 1.7688571213307858, "language_loss": 0.7208361, "learning_rate": 2.971100715196666e-07, "loss": 0.74187499, "num_input_tokens_seen": 297653315, "step": 13797, "time_per_iteration": 2.996868133544922 }, { "auxiliary_loss_clip": 0.01042783, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.03644705, "balance_loss_mlp": 1.02056766, "epoch": 0.8295806403126409, "flos": 21579979265280.0, "grad_norm": 2.64934630097921, "language_loss": 0.72061169, "learning_rate": 2.969058529792243e-07, "loss": 0.74136508, "num_input_tokens_seen": 297673480, "step": 13798, "time_per_iteration": 2.8359265327453613 }, { "auxiliary_loss_clip": 0.01069075, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.03152323, "balance_loss_mlp": 1.01987529, "epoch": 0.8296407635653089, "flos": 21726566668800.0, "grad_norm": 1.5432798793202427, "language_loss": 0.76292628, "learning_rate": 2.967016990202822e-07, "loss": 0.78393966, "num_input_tokens_seen": 297693250, "step": 13799, "time_per_iteration": 2.693103790283203 }, { "auxiliary_loss_clip": 0.01108566, "auxiliary_loss_mlp": 0.01033314, "balance_loss_clip": 1.03785658, "balance_loss_mlp": 1.02094579, "epoch": 0.8297008868179768, "flos": 11181147252480.0, "grad_norm": 1.9112775618394213, "language_loss": 0.67614651, "learning_rate": 2.9649760965058245e-07, "loss": 0.69756532, "num_input_tokens_seen": 297710975, "step": 13800, "time_per_iteration": 2.6247994899749756 }, { "auxiliary_loss_clip": 0.01074439, "auxiliary_loss_mlp": 0.01033186, "balance_loss_clip": 1.03878558, "balance_loss_mlp": 1.01930976, "epoch": 0.8297610100706448, "flos": 20664041431680.0, "grad_norm": 2.709008705792723, "language_loss": 0.74460614, "learning_rate": 2.9629358487786515e-07, "loss": 0.76568246, "num_input_tokens_seen": 297730860, "step": 13801, "time_per_iteration": 2.7638845443725586 }, { "auxiliary_loss_clip": 0.01063708, "auxiliary_loss_mlp": 0.0102829, "balance_loss_clip": 1.03407621, "balance_loss_mlp": 1.01658368, "epoch": 0.8298211333233128, "flos": 20376325491840.0, "grad_norm": 1.5797415663470742, "language_loss": 0.73625791, "learning_rate": 2.9608962470986476e-07, "loss": 0.75717783, "num_input_tokens_seen": 297749765, "step": 13802, "time_per_iteration": 2.7499916553497314 }, { "auxiliary_loss_clip": 0.01088515, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.03459883, "balance_loss_mlp": 1.01764071, "epoch": 0.8298812565759808, "flos": 21508696725120.0, "grad_norm": 1.4712858328123304, "language_loss": 0.74977744, "learning_rate": 2.9588572915431644e-07, "loss": 0.77095926, "num_input_tokens_seen": 297770380, "step": 13803, "time_per_iteration": 2.7700328826904297 }, { "auxiliary_loss_clip": 0.01099479, "auxiliary_loss_mlp": 0.01034049, "balance_loss_clip": 1.03803515, "balance_loss_mlp": 1.02196717, "epoch": 0.8299413798286487, "flos": 22818681734400.0, "grad_norm": 1.629212800491102, "language_loss": 0.79214036, "learning_rate": 2.9568189821895215e-07, "loss": 0.81347561, "num_input_tokens_seen": 297789440, "step": 13804, "time_per_iteration": 2.668266773223877 }, { "auxiliary_loss_clip": 0.01109225, "auxiliary_loss_mlp": 0.01032372, "balance_loss_clip": 1.03797591, "balance_loss_mlp": 1.0205344, "epoch": 0.8300015030813167, "flos": 29679199683840.0, "grad_norm": 2.3697694156081157, "language_loss": 0.72845703, "learning_rate": 2.954781319115016e-07, "loss": 0.74987304, "num_input_tokens_seen": 297810425, "step": 13805, "time_per_iteration": 2.68404221534729 }, { "auxiliary_loss_clip": 0.01102118, "auxiliary_loss_mlp": 0.00771001, "balance_loss_clip": 1.03904784, "balance_loss_mlp": 1.00029325, "epoch": 0.8300616263339846, "flos": 19719483436800.0, "grad_norm": 2.0648930657274462, "language_loss": 0.77626657, "learning_rate": 2.952744302396906e-07, "loss": 0.79499781, "num_input_tokens_seen": 297827680, "step": 13806, "time_per_iteration": 2.6478402614593506 }, { "auxiliary_loss_clip": 0.0110212, "auxiliary_loss_mlp": 0.01033184, "balance_loss_clip": 1.03748512, "balance_loss_mlp": 1.01954055, "epoch": 0.8301217495866526, "flos": 19901945548800.0, "grad_norm": 1.8834407676447842, "language_loss": 0.63916278, "learning_rate": 2.950707932112444e-07, "loss": 0.66051579, "num_input_tokens_seen": 297848005, "step": 13807, "time_per_iteration": 2.6306519508361816 }, { "auxiliary_loss_clip": 0.01097082, "auxiliary_loss_mlp": 0.01029652, "balance_loss_clip": 1.0383265, "balance_loss_mlp": 1.01728976, "epoch": 0.8301818728393207, "flos": 19715784336000.0, "grad_norm": 1.9692323669369614, "language_loss": 0.72846484, "learning_rate": 2.948672208338847e-07, "loss": 0.74973214, "num_input_tokens_seen": 297866730, "step": 13808, "time_per_iteration": 2.640733480453491 }, { "auxiliary_loss_clip": 0.0109338, "auxiliary_loss_mlp": 0.01046632, "balance_loss_clip": 1.03866029, "balance_loss_mlp": 1.03264272, "epoch": 0.8302419960919886, "flos": 28293658416000.0, "grad_norm": 1.7739722668753906, "language_loss": 0.66351604, "learning_rate": 2.9466371311533046e-07, "loss": 0.6849162, "num_input_tokens_seen": 297886390, "step": 13809, "time_per_iteration": 2.751115322113037 }, { "auxiliary_loss_clip": 0.011108, "auxiliary_loss_mlp": 0.01024776, "balance_loss_clip": 1.03813148, "balance_loss_mlp": 1.01287341, "epoch": 0.8303021193446566, "flos": 18223444955520.0, "grad_norm": 1.8449229056789198, "language_loss": 0.74058008, "learning_rate": 2.9446027006329896e-07, "loss": 0.76193583, "num_input_tokens_seen": 297905110, "step": 13810, "time_per_iteration": 2.506547451019287 }, { "auxiliary_loss_clip": 0.01076467, "auxiliary_loss_mlp": 0.01036006, "balance_loss_clip": 1.03609502, "balance_loss_mlp": 1.02471638, "epoch": 0.8303622425973245, "flos": 23111425578240.0, "grad_norm": 1.5651865455038416, "language_loss": 0.81083822, "learning_rate": 2.94256891685505e-07, "loss": 0.83196294, "num_input_tokens_seen": 297925460, "step": 13811, "time_per_iteration": 2.7325217723846436 }, { "auxiliary_loss_clip": 0.01076005, "auxiliary_loss_mlp": 0.01045296, "balance_loss_clip": 1.0357846, "balance_loss_mlp": 1.03202796, "epoch": 0.8304223658499925, "flos": 19572860119680.0, "grad_norm": 2.992999936529954, "language_loss": 0.73513645, "learning_rate": 2.9405357798966156e-07, "loss": 0.75634944, "num_input_tokens_seen": 297941760, "step": 13812, "time_per_iteration": 2.7724623680114746 }, { "auxiliary_loss_clip": 0.01081692, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.03739822, "balance_loss_mlp": 1.01693439, "epoch": 0.8304824891026604, "flos": 24426115269120.0, "grad_norm": 1.8080417533170523, "language_loss": 0.78173685, "learning_rate": 2.9385032898347664e-07, "loss": 0.80284357, "num_input_tokens_seen": 297959745, "step": 13813, "time_per_iteration": 2.7371325492858887 }, { "auxiliary_loss_clip": 0.01054685, "auxiliary_loss_mlp": 0.00771129, "balance_loss_clip": 1.03353238, "balance_loss_mlp": 1.00019467, "epoch": 0.8305426123553284, "flos": 22381792611840.0, "grad_norm": 1.8015570621783799, "language_loss": 0.71141535, "learning_rate": 2.93647144674658e-07, "loss": 0.7296735, "num_input_tokens_seen": 297977665, "step": 13814, "time_per_iteration": 2.8410873413085938 }, { "auxiliary_loss_clip": 0.01117986, "auxiliary_loss_mlp": 0.01044096, "balance_loss_clip": 1.03891778, "balance_loss_mlp": 1.02902818, "epoch": 0.8306027356079964, "flos": 14903575453440.0, "grad_norm": 2.331626844380792, "language_loss": 0.67776018, "learning_rate": 2.9344402507091116e-07, "loss": 0.69938099, "num_input_tokens_seen": 297993525, "step": 13815, "time_per_iteration": 2.607855796813965 }, { "auxiliary_loss_clip": 0.01097003, "auxiliary_loss_mlp": 0.01033309, "balance_loss_clip": 1.03770578, "balance_loss_mlp": 1.02078068, "epoch": 0.8306628588606644, "flos": 19644573623040.0, "grad_norm": 1.9297971278174, "language_loss": 0.75907093, "learning_rate": 2.9324097017993745e-07, "loss": 0.78037405, "num_input_tokens_seen": 298012920, "step": 13816, "time_per_iteration": 2.632202625274658 }, { "auxiliary_loss_clip": 0.01074394, "auxiliary_loss_mlp": 0.01036444, "balance_loss_clip": 1.03376317, "balance_loss_mlp": 1.02446318, "epoch": 0.8307229821133323, "flos": 24389737770240.0, "grad_norm": 1.9005747270922144, "language_loss": 0.81343293, "learning_rate": 2.930379800094371e-07, "loss": 0.83454132, "num_input_tokens_seen": 298033310, "step": 13817, "time_per_iteration": 2.8131661415100098 }, { "auxiliary_loss_clip": 0.01101146, "auxiliary_loss_mlp": 0.01040878, "balance_loss_clip": 1.03882217, "balance_loss_mlp": 1.02748489, "epoch": 0.8307831053660003, "flos": 20996933702400.0, "grad_norm": 1.505220062902958, "language_loss": 0.78014338, "learning_rate": 2.9283505456710875e-07, "loss": 0.80156362, "num_input_tokens_seen": 298053530, "step": 13818, "time_per_iteration": 2.6576030254364014 }, { "auxiliary_loss_clip": 0.01093761, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.03938222, "balance_loss_mlp": 1.02312958, "epoch": 0.8308432286186682, "flos": 21397301671680.0, "grad_norm": 1.8020045024766413, "language_loss": 0.819812, "learning_rate": 2.926321938606453e-07, "loss": 0.84110707, "num_input_tokens_seen": 298069305, "step": 13819, "time_per_iteration": 2.6772990226745605 }, { "auxiliary_loss_clip": 0.01020743, "auxiliary_loss_mlp": 0.0100494, "balance_loss_clip": 1.00830984, "balance_loss_mlp": 1.00400436, "epoch": 0.8309033518713362, "flos": 62533656714240.0, "grad_norm": 0.7602438257539984, "language_loss": 0.56127542, "learning_rate": 2.924293978977399e-07, "loss": 0.58153224, "num_input_tokens_seen": 298125830, "step": 13820, "time_per_iteration": 3.193361520767212 }, { "auxiliary_loss_clip": 0.01095529, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.0352664, "balance_loss_mlp": 1.01369286, "epoch": 0.8309634751240043, "flos": 16979104051200.0, "grad_norm": 1.789990737596213, "language_loss": 0.67907584, "learning_rate": 2.922266666860831e-07, "loss": 0.70028925, "num_input_tokens_seen": 298142320, "step": 13821, "time_per_iteration": 2.661176919937134 }, { "auxiliary_loss_clip": 0.01043861, "auxiliary_loss_mlp": 0.01036485, "balance_loss_clip": 1.029109, "balance_loss_mlp": 1.02242458, "epoch": 0.8310235983766722, "flos": 22674464628480.0, "grad_norm": 1.7649942540467223, "language_loss": 0.69191265, "learning_rate": 2.920240002333625e-07, "loss": 0.7127161, "num_input_tokens_seen": 298161845, "step": 13822, "time_per_iteration": 2.9704768657684326 }, { "auxiliary_loss_clip": 0.01059895, "auxiliary_loss_mlp": 0.01035644, "balance_loss_clip": 1.03586471, "balance_loss_mlp": 1.02335334, "epoch": 0.8310837216293402, "flos": 30811463176320.0, "grad_norm": 1.6650310845720533, "language_loss": 0.62025028, "learning_rate": 2.918213985472631e-07, "loss": 0.64120567, "num_input_tokens_seen": 298184165, "step": 13823, "time_per_iteration": 2.8505992889404297 }, { "auxiliary_loss_clip": 0.01009787, "auxiliary_loss_mlp": 0.00999982, "balance_loss_clip": 1.00688207, "balance_loss_mlp": 0.9989447, "epoch": 0.8311438448820081, "flos": 71276074997760.0, "grad_norm": 0.9240644196901294, "language_loss": 0.61982203, "learning_rate": 2.916188616354669e-07, "loss": 0.63991976, "num_input_tokens_seen": 298251720, "step": 13824, "time_per_iteration": 3.28657603263855 }, { "auxiliary_loss_clip": 0.01110797, "auxiliary_loss_mlp": 0.01030392, "balance_loss_clip": 1.03885794, "balance_loss_mlp": 1.01815486, "epoch": 0.8312039681346761, "flos": 20887082933760.0, "grad_norm": 1.7761437257032648, "language_loss": 0.73975819, "learning_rate": 2.914163895056552e-07, "loss": 0.76117009, "num_input_tokens_seen": 298271910, "step": 13825, "time_per_iteration": 2.6168012619018555 }, { "auxiliary_loss_clip": 0.01060453, "auxiliary_loss_mlp": 0.0077103, "balance_loss_clip": 1.03461838, "balance_loss_mlp": 1.00020123, "epoch": 0.831264091387344, "flos": 17017528625280.0, "grad_norm": 1.9546255724089596, "language_loss": 0.80497503, "learning_rate": 2.9121398216550486e-07, "loss": 0.82328987, "num_input_tokens_seen": 298288105, "step": 13826, "time_per_iteration": 2.6546146869659424 }, { "auxiliary_loss_clip": 0.01110653, "auxiliary_loss_mlp": 0.01033321, "balance_loss_clip": 1.03793025, "balance_loss_mlp": 1.02049446, "epoch": 0.831324214640012, "flos": 24419578993920.0, "grad_norm": 1.5280583431221222, "language_loss": 0.67963809, "learning_rate": 2.910116396226914e-07, "loss": 0.70107782, "num_input_tokens_seen": 298307600, "step": 13827, "time_per_iteration": 2.5905277729034424 }, { "auxiliary_loss_clip": 0.01098107, "auxiliary_loss_mlp": 0.01030102, "balance_loss_clip": 1.03539395, "balance_loss_mlp": 1.01871204, "epoch": 0.83138433789268, "flos": 13545576938880.0, "grad_norm": 1.973600288976441, "language_loss": 0.74098945, "learning_rate": 2.9080936188488834e-07, "loss": 0.76227152, "num_input_tokens_seen": 298323055, "step": 13828, "time_per_iteration": 2.6251087188720703 }, { "auxiliary_loss_clip": 0.01073913, "auxiliary_loss_mlp": 0.01033531, "balance_loss_clip": 1.03275013, "balance_loss_mlp": 1.0203644, "epoch": 0.831444461145348, "flos": 44492386561920.0, "grad_norm": 2.63988910993405, "language_loss": 0.67159581, "learning_rate": 2.906071489597657e-07, "loss": 0.69267023, "num_input_tokens_seen": 298346950, "step": 13829, "time_per_iteration": 3.220686435699463 }, { "auxiliary_loss_clip": 0.01085933, "auxiliary_loss_mlp": 0.01031507, "balance_loss_clip": 1.03704345, "balance_loss_mlp": 1.01854897, "epoch": 0.8315045843980159, "flos": 22705024124160.0, "grad_norm": 1.6177335267963915, "language_loss": 0.82913047, "learning_rate": 2.9040500085499054e-07, "loss": 0.8503049, "num_input_tokens_seen": 298366315, "step": 13830, "time_per_iteration": 6.03197717666626 }, { "auxiliary_loss_clip": 0.01097952, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.03697491, "balance_loss_mlp": 1.02208698, "epoch": 0.8315647076506839, "flos": 16873491087360.0, "grad_norm": 2.1932847563543247, "language_loss": 0.73822612, "learning_rate": 2.9020291757822925e-07, "loss": 0.75954819, "num_input_tokens_seen": 298385185, "step": 13831, "time_per_iteration": 2.665022611618042 }, { "auxiliary_loss_clip": 0.01111975, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.03963041, "balance_loss_mlp": 1.02083445, "epoch": 0.8316248309033518, "flos": 13808730954240.0, "grad_norm": 1.6071595034037367, "language_loss": 0.7129162, "learning_rate": 2.9000089913714523e-07, "loss": 0.73437387, "num_input_tokens_seen": 298402335, "step": 13832, "time_per_iteration": 2.647451400756836 }, { "auxiliary_loss_clip": 0.0108072, "auxiliary_loss_mlp": 0.0103351, "balance_loss_clip": 1.03389788, "balance_loss_mlp": 1.02102256, "epoch": 0.8316849541560198, "flos": 23512511819520.0, "grad_norm": 1.6195807094532317, "language_loss": 0.84269989, "learning_rate": 2.897989455393979e-07, "loss": 0.86384219, "num_input_tokens_seen": 298423370, "step": 13833, "time_per_iteration": 4.226484298706055 }, { "auxiliary_loss_clip": 0.010921, "auxiliary_loss_mlp": 0.01036257, "balance_loss_clip": 1.03806973, "balance_loss_mlp": 1.02329278, "epoch": 0.8317450774086879, "flos": 23771356202880.0, "grad_norm": 2.0307476796649917, "language_loss": 0.76316315, "learning_rate": 2.8959705679264625e-07, "loss": 0.78444666, "num_input_tokens_seen": 298444835, "step": 13834, "time_per_iteration": 2.814335584640503 }, { "auxiliary_loss_clip": 0.01105662, "auxiliary_loss_mlp": 0.00769799, "balance_loss_clip": 1.03617358, "balance_loss_mlp": 1.00016499, "epoch": 0.8318052006613558, "flos": 16215535710720.0, "grad_norm": 1.877967943064554, "language_loss": 0.79689634, "learning_rate": 2.893952329045459e-07, "loss": 0.81565094, "num_input_tokens_seen": 298461845, "step": 13835, "time_per_iteration": 4.108726978302002 }, { "auxiliary_loss_clip": 0.01103663, "auxiliary_loss_mlp": 0.01037899, "balance_loss_clip": 1.03955829, "balance_loss_mlp": 1.02354026, "epoch": 0.8318653239140238, "flos": 19974556892160.0, "grad_norm": 1.8066423967351954, "language_loss": 0.80604517, "learning_rate": 2.8919347388274905e-07, "loss": 0.82746077, "num_input_tokens_seen": 298479095, "step": 13836, "time_per_iteration": 2.624318838119507 }, { "auxiliary_loss_clip": 0.01088523, "auxiliary_loss_mlp": 0.01031373, "balance_loss_clip": 1.03795898, "balance_loss_mlp": 1.01932681, "epoch": 0.8319254471666917, "flos": 17704714694400.0, "grad_norm": 1.9385404559381145, "language_loss": 0.77292264, "learning_rate": 2.8899177973490727e-07, "loss": 0.79412162, "num_input_tokens_seen": 298494475, "step": 13837, "time_per_iteration": 2.662458896636963 }, { "auxiliary_loss_clip": 0.01114063, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 1.03759873, "balance_loss_mlp": 1.01654339, "epoch": 0.8319855704193597, "flos": 19536554448000.0, "grad_norm": 1.6836751176353142, "language_loss": 0.83425492, "learning_rate": 2.887901504686685e-07, "loss": 0.85569924, "num_input_tokens_seen": 298513185, "step": 13838, "time_per_iteration": 2.533836603164673 }, { "auxiliary_loss_clip": 0.01081066, "auxiliary_loss_mlp": 0.01035142, "balance_loss_clip": 1.03288436, "balance_loss_mlp": 1.02131331, "epoch": 0.8320456936720276, "flos": 21178067011200.0, "grad_norm": 2.4451044719374613, "language_loss": 0.74250424, "learning_rate": 2.885885860916795e-07, "loss": 0.76366633, "num_input_tokens_seen": 298531885, "step": 13839, "time_per_iteration": 2.6616058349609375 }, { "auxiliary_loss_clip": 0.01096452, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.03666425, "balance_loss_mlp": 1.02004433, "epoch": 0.8321058169246957, "flos": 33250874503680.0, "grad_norm": 1.4805288033952766, "language_loss": 0.67812371, "learning_rate": 2.8838708661158253e-07, "loss": 0.69941914, "num_input_tokens_seen": 298554905, "step": 13840, "time_per_iteration": 2.735732078552246 }, { "auxiliary_loss_clip": 0.01054107, "auxiliary_loss_mlp": 0.01039263, "balance_loss_clip": 1.03295565, "balance_loss_mlp": 1.02507687, "epoch": 0.8321659401773636, "flos": 14208129256320.0, "grad_norm": 1.9499790502126348, "language_loss": 0.79567152, "learning_rate": 2.8818565203601843e-07, "loss": 0.81660521, "num_input_tokens_seen": 298571185, "step": 13841, "time_per_iteration": 2.6811771392822266 }, { "auxiliary_loss_clip": 0.0106104, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.03763831, "balance_loss_mlp": 1.0179522, "epoch": 0.8322260634300316, "flos": 15158253859200.0, "grad_norm": 1.7496340078851804, "language_loss": 0.68060827, "learning_rate": 2.879842823726262e-07, "loss": 0.70152342, "num_input_tokens_seen": 298588505, "step": 13842, "time_per_iteration": 2.8322203159332275 }, { "auxiliary_loss_clip": 0.0108993, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.03790903, "balance_loss_mlp": 1.01888657, "epoch": 0.8322861866826995, "flos": 25300827267840.0, "grad_norm": 1.5488311429576032, "language_loss": 0.73103952, "learning_rate": 2.8778297762904124e-07, "loss": 0.75226128, "num_input_tokens_seen": 298609295, "step": 13843, "time_per_iteration": 2.886599540710449 }, { "auxiliary_loss_clip": 0.01077287, "auxiliary_loss_mlp": 0.01027519, "balance_loss_clip": 1.03611994, "balance_loss_mlp": 1.01505589, "epoch": 0.8323463099353675, "flos": 17019360218880.0, "grad_norm": 1.8098235185512692, "language_loss": 0.77365232, "learning_rate": 2.875817378128975e-07, "loss": 0.79470038, "num_input_tokens_seen": 298625765, "step": 13844, "time_per_iteration": 2.7069430351257324 }, { "auxiliary_loss_clip": 0.01007928, "auxiliary_loss_mlp": 0.01001333, "balance_loss_clip": 1.00663698, "balance_loss_mlp": 1.00036097, "epoch": 0.8324064331880354, "flos": 55607889709440.0, "grad_norm": 0.7847120872391591, "language_loss": 0.55208087, "learning_rate": 2.8738056293182624e-07, "loss": 0.57217348, "num_input_tokens_seen": 298683005, "step": 13845, "time_per_iteration": 3.0783231258392334 }, { "auxiliary_loss_clip": 0.01102275, "auxiliary_loss_mlp": 0.01044708, "balance_loss_clip": 1.03761721, "balance_loss_mlp": 1.0314219, "epoch": 0.8324665564407034, "flos": 26138623063680.0, "grad_norm": 1.6009211364700722, "language_loss": 0.75140607, "learning_rate": 2.871794529934555e-07, "loss": 0.77287591, "num_input_tokens_seen": 298703060, "step": 13846, "time_per_iteration": 2.676182508468628 }, { "auxiliary_loss_clip": 0.01056649, "auxiliary_loss_mlp": 0.01033367, "balance_loss_clip": 1.03160548, "balance_loss_mlp": 1.01809657, "epoch": 0.8325266796933715, "flos": 22049187649920.0, "grad_norm": 1.6388328541738983, "language_loss": 0.78896999, "learning_rate": 2.8697840800541115e-07, "loss": 0.80987018, "num_input_tokens_seen": 298721765, "step": 13847, "time_per_iteration": 2.7297866344451904 }, { "auxiliary_loss_clip": 0.01052928, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.0369612, "balance_loss_mlp": 1.01901376, "epoch": 0.8325868029460394, "flos": 22816634659200.0, "grad_norm": 2.65968337371303, "language_loss": 0.74193573, "learning_rate": 2.867774279753175e-07, "loss": 0.76277208, "num_input_tokens_seen": 298740825, "step": 13848, "time_per_iteration": 2.740797758102417 }, { "auxiliary_loss_clip": 0.0110005, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.03858709, "balance_loss_mlp": 1.0153321, "epoch": 0.8326469261987074, "flos": 14757454926720.0, "grad_norm": 1.7578930460196398, "language_loss": 0.63396668, "learning_rate": 2.8657651291079554e-07, "loss": 0.65524411, "num_input_tokens_seen": 298758515, "step": 13849, "time_per_iteration": 2.5713930130004883 }, { "auxiliary_loss_clip": 0.01084755, "auxiliary_loss_mlp": 0.01033475, "balance_loss_clip": 1.0322125, "balance_loss_mlp": 1.0203917, "epoch": 0.8327070494513753, "flos": 22926126291840.0, "grad_norm": 2.0835174192024533, "language_loss": 0.79707754, "learning_rate": 2.863756628194638e-07, "loss": 0.81825984, "num_input_tokens_seen": 298776375, "step": 13850, "time_per_iteration": 2.6037027835845947 }, { "auxiliary_loss_clip": 0.0106844, "auxiliary_loss_mlp": 0.01032788, "balance_loss_clip": 1.03266466, "balance_loss_mlp": 1.02161264, "epoch": 0.8327671727040433, "flos": 20665334321280.0, "grad_norm": 1.589654785001457, "language_loss": 0.7825923, "learning_rate": 2.8617487770893877e-07, "loss": 0.80360448, "num_input_tokens_seen": 298795135, "step": 13851, "time_per_iteration": 2.669689416885376 }, { "auxiliary_loss_clip": 0.01021321, "auxiliary_loss_mlp": 0.01003693, "balance_loss_clip": 1.00839996, "balance_loss_mlp": 1.00260222, "epoch": 0.8328272959567112, "flos": 56060760384000.0, "grad_norm": 0.7603079247900993, "language_loss": 0.55759335, "learning_rate": 2.859741575868344e-07, "loss": 0.57784349, "num_input_tokens_seen": 298855475, "step": 13852, "time_per_iteration": 3.171971321105957 }, { "auxiliary_loss_clip": 0.01096762, "auxiliary_loss_mlp": 0.01030025, "balance_loss_clip": 1.03631687, "balance_loss_mlp": 1.01785994, "epoch": 0.8328874192093793, "flos": 32303084284800.0, "grad_norm": 1.490710672408854, "language_loss": 0.67185426, "learning_rate": 2.8577350246076125e-07, "loss": 0.69312215, "num_input_tokens_seen": 298875875, "step": 13853, "time_per_iteration": 2.705221176147461 }, { "auxiliary_loss_clip": 0.01082363, "auxiliary_loss_mlp": 0.01035053, "balance_loss_clip": 1.03677762, "balance_loss_mlp": 1.02310205, "epoch": 0.8329475424620472, "flos": 23512691387520.0, "grad_norm": 1.8131340833394713, "language_loss": 0.7809993, "learning_rate": 2.855729123383286e-07, "loss": 0.80217344, "num_input_tokens_seen": 298895950, "step": 13854, "time_per_iteration": 2.6784071922302246 }, { "auxiliary_loss_clip": 0.01029094, "auxiliary_loss_mlp": 0.00999528, "balance_loss_clip": 1.00678289, "balance_loss_mlp": 0.99855083, "epoch": 0.8330076657147152, "flos": 67840680378240.0, "grad_norm": 0.7605812264158395, "language_loss": 0.58664268, "learning_rate": 2.8537238722714295e-07, "loss": 0.60692888, "num_input_tokens_seen": 298955770, "step": 13855, "time_per_iteration": 3.0027222633361816 }, { "auxiliary_loss_clip": 0.01098543, "auxiliary_loss_mlp": 0.01027235, "balance_loss_clip": 1.03760314, "balance_loss_mlp": 1.01511717, "epoch": 0.8330677889673831, "flos": 22892801448960.0, "grad_norm": 1.6486782153606043, "language_loss": 0.71799862, "learning_rate": 2.8517192713480853e-07, "loss": 0.73925638, "num_input_tokens_seen": 298976545, "step": 13856, "time_per_iteration": 2.6572425365448 }, { "auxiliary_loss_clip": 0.01098496, "auxiliary_loss_mlp": 0.01031016, "balance_loss_clip": 1.03601694, "balance_loss_mlp": 1.01861823, "epoch": 0.8331279122200511, "flos": 27345042184320.0, "grad_norm": 1.530155897456529, "language_loss": 0.75503182, "learning_rate": 2.8497153206892677e-07, "loss": 0.77632695, "num_input_tokens_seen": 298996750, "step": 13857, "time_per_iteration": 2.709289073944092 }, { "auxiliary_loss_clip": 0.0106038, "auxiliary_loss_mlp": 0.01024239, "balance_loss_clip": 1.0359571, "balance_loss_mlp": 1.01319456, "epoch": 0.833188035472719, "flos": 19938179393280.0, "grad_norm": 1.5089034219469146, "language_loss": 0.7372514, "learning_rate": 2.847712020370958e-07, "loss": 0.75809759, "num_input_tokens_seen": 299014895, "step": 13858, "time_per_iteration": 2.771655321121216 }, { "auxiliary_loss_clip": 0.01112772, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.03744984, "balance_loss_mlp": 1.01981604, "epoch": 0.833248158725387, "flos": 15232624968960.0, "grad_norm": 1.9106405712672399, "language_loss": 0.73376054, "learning_rate": 2.8457093704691316e-07, "loss": 0.75522184, "num_input_tokens_seen": 299032855, "step": 13859, "time_per_iteration": 2.5690972805023193 }, { "auxiliary_loss_clip": 0.01093273, "auxiliary_loss_mlp": 0.01025326, "balance_loss_clip": 1.03597152, "balance_loss_mlp": 1.01405454, "epoch": 0.8333082819780551, "flos": 24535535074560.0, "grad_norm": 1.588476401883647, "language_loss": 0.79069161, "learning_rate": 2.8437073710597205e-07, "loss": 0.81187761, "num_input_tokens_seen": 299052055, "step": 13860, "time_per_iteration": 2.687077283859253 }, { "auxiliary_loss_clip": 0.0103731, "auxiliary_loss_mlp": 0.01031939, "balance_loss_clip": 1.03546524, "balance_loss_mlp": 1.01915944, "epoch": 0.833368405230723, "flos": 31467407391360.0, "grad_norm": 1.5993206787679535, "language_loss": 0.8204006, "learning_rate": 2.841706022218644e-07, "loss": 0.84109306, "num_input_tokens_seen": 299075285, "step": 13861, "time_per_iteration": 3.007451295852661 }, { "auxiliary_loss_clip": 0.01112118, "auxiliary_loss_mlp": 0.0103305, "balance_loss_clip": 1.03988099, "balance_loss_mlp": 1.02040219, "epoch": 0.833428528483391, "flos": 14902713527040.0, "grad_norm": 1.7332412626678735, "language_loss": 0.78811872, "learning_rate": 2.839705324021806e-07, "loss": 0.80957043, "num_input_tokens_seen": 299092520, "step": 13862, "time_per_iteration": 2.7910513877868652 }, { "auxiliary_loss_clip": 0.01099183, "auxiliary_loss_mlp": 0.01035326, "balance_loss_clip": 1.03552341, "balance_loss_mlp": 1.02280307, "epoch": 0.8334886517360589, "flos": 22199833290240.0, "grad_norm": 1.8555893155682146, "language_loss": 0.75250399, "learning_rate": 2.83770527654505e-07, "loss": 0.77384913, "num_input_tokens_seen": 299109450, "step": 13863, "time_per_iteration": 2.623645782470703 }, { "auxiliary_loss_clip": 0.01049642, "auxiliary_loss_mlp": 0.00771776, "balance_loss_clip": 1.03239465, "balance_loss_mlp": 1.00020719, "epoch": 0.8335487749887269, "flos": 30372562892160.0, "grad_norm": 1.9984651067343642, "language_loss": 0.75399351, "learning_rate": 2.835705879864232e-07, "loss": 0.77220774, "num_input_tokens_seen": 299129540, "step": 13864, "time_per_iteration": 2.8347368240356445 }, { "auxiliary_loss_clip": 0.01086549, "auxiliary_loss_mlp": 0.01034445, "balance_loss_clip": 1.03666651, "balance_loss_mlp": 1.02171326, "epoch": 0.8336088982413948, "flos": 24681152810880.0, "grad_norm": 1.9805001513042173, "language_loss": 0.69349921, "learning_rate": 2.833707134055168e-07, "loss": 0.71470916, "num_input_tokens_seen": 299148670, "step": 13865, "time_per_iteration": 2.7639873027801514 }, { "auxiliary_loss_clip": 0.01099811, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.03819227, "balance_loss_mlp": 1.01979089, "epoch": 0.8336690214940629, "flos": 38177207873280.0, "grad_norm": 5.1442614378118625, "language_loss": 0.75333238, "learning_rate": 2.831709039193653e-07, "loss": 0.7746557, "num_input_tokens_seen": 299169330, "step": 13866, "time_per_iteration": 2.777001142501831 }, { "auxiliary_loss_clip": 0.01008617, "auxiliary_loss_mlp": 0.01009028, "balance_loss_clip": 1.00722134, "balance_loss_mlp": 1.00765133, "epoch": 0.8337291447467308, "flos": 55565119589760.0, "grad_norm": 0.870724565539336, "language_loss": 0.63078576, "learning_rate": 2.8297115953554465e-07, "loss": 0.65096223, "num_input_tokens_seen": 299220980, "step": 13867, "time_per_iteration": 3.081568956375122 }, { "auxiliary_loss_clip": 0.01083895, "auxiliary_loss_mlp": 0.01028872, "balance_loss_clip": 1.03740549, "balance_loss_mlp": 1.01767826, "epoch": 0.8337892679993988, "flos": 24133550993280.0, "grad_norm": 1.7649595884410185, "language_loss": 0.71936655, "learning_rate": 2.827714802616301e-07, "loss": 0.74049425, "num_input_tokens_seen": 299240130, "step": 13868, "time_per_iteration": 2.652420997619629 }, { "auxiliary_loss_clip": 0.0108564, "auxiliary_loss_mlp": 0.01032875, "balance_loss_clip": 1.03956676, "balance_loss_mlp": 1.02057862, "epoch": 0.8338493912520667, "flos": 28183915388160.0, "grad_norm": 1.3896296545352977, "language_loss": 0.80381906, "learning_rate": 2.8257186610519325e-07, "loss": 0.82500416, "num_input_tokens_seen": 299260705, "step": 13869, "time_per_iteration": 4.254533529281616 }, { "auxiliary_loss_clip": 0.01100488, "auxiliary_loss_mlp": 0.01032723, "balance_loss_clip": 1.03849924, "balance_loss_mlp": 1.02023017, "epoch": 0.8339095145047347, "flos": 22158356060160.0, "grad_norm": 1.5584172404732568, "language_loss": 0.82560688, "learning_rate": 2.823723170738028e-07, "loss": 0.84693897, "num_input_tokens_seen": 299278925, "step": 13870, "time_per_iteration": 4.364636421203613 }, { "auxiliary_loss_clip": 0.01078884, "auxiliary_loss_mlp": 0.0102765, "balance_loss_clip": 1.03682613, "balance_loss_mlp": 1.01443601, "epoch": 0.8339696377574026, "flos": 17307112072320.0, "grad_norm": 2.7050320038401146, "language_loss": 0.7043367, "learning_rate": 2.821728331750264e-07, "loss": 0.72540206, "num_input_tokens_seen": 299291580, "step": 13871, "time_per_iteration": 2.650563955307007 }, { "auxiliary_loss_clip": 0.01097514, "auxiliary_loss_mlp": 0.0103365, "balance_loss_clip": 1.03766418, "balance_loss_mlp": 1.02192545, "epoch": 0.8340297610100706, "flos": 20668351063680.0, "grad_norm": 1.6604394599481103, "language_loss": 0.6898998, "learning_rate": 2.8197341441642853e-07, "loss": 0.7112115, "num_input_tokens_seen": 299310385, "step": 13872, "time_per_iteration": 4.172610759735107 }, { "auxiliary_loss_clip": 0.01086882, "auxiliary_loss_mlp": 0.01026823, "balance_loss_clip": 1.03666329, "balance_loss_mlp": 1.01506281, "epoch": 0.8340898842627387, "flos": 20515442866560.0, "grad_norm": 1.969935634979257, "language_loss": 0.73773992, "learning_rate": 2.817740608055712e-07, "loss": 0.75887698, "num_input_tokens_seen": 299327660, "step": 13873, "time_per_iteration": 2.7069506645202637 }, { "auxiliary_loss_clip": 0.01087674, "auxiliary_loss_mlp": 0.01035501, "balance_loss_clip": 1.03668487, "balance_loss_mlp": 1.02100515, "epoch": 0.8341500075154066, "flos": 21425850005760.0, "grad_norm": 2.086638333931779, "language_loss": 0.75528133, "learning_rate": 2.81574772350013e-07, "loss": 0.7765131, "num_input_tokens_seen": 299343685, "step": 13874, "time_per_iteration": 4.401844263076782 }, { "auxiliary_loss_clip": 0.0108051, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.0355829, "balance_loss_mlp": 1.01691747, "epoch": 0.8342101307680746, "flos": 22090988102400.0, "grad_norm": 2.2988326749129766, "language_loss": 0.66232169, "learning_rate": 2.813755490573118e-07, "loss": 0.68341494, "num_input_tokens_seen": 299363305, "step": 13875, "time_per_iteration": 2.7391769886016846 }, { "auxiliary_loss_clip": 0.010648, "auxiliary_loss_mlp": 0.01036714, "balance_loss_clip": 1.03338897, "balance_loss_mlp": 1.02434039, "epoch": 0.8342702540207425, "flos": 21871466133120.0, "grad_norm": 1.700714112258655, "language_loss": 0.79729408, "learning_rate": 2.8117639093502243e-07, "loss": 0.81830925, "num_input_tokens_seen": 299382630, "step": 13876, "time_per_iteration": 2.8024299144744873 }, { "auxiliary_loss_clip": 0.01093328, "auxiliary_loss_mlp": 0.01038156, "balance_loss_clip": 1.03557479, "balance_loss_mlp": 1.02462614, "epoch": 0.8343303772734105, "flos": 22528487756160.0, "grad_norm": 1.934148297226032, "language_loss": 0.87182283, "learning_rate": 2.8097729799069615e-07, "loss": 0.89313757, "num_input_tokens_seen": 299402385, "step": 13877, "time_per_iteration": 2.652780055999756 }, { "auxiliary_loss_clip": 0.01064054, "auxiliary_loss_mlp": 0.01029724, "balance_loss_clip": 1.03309846, "balance_loss_mlp": 1.01811349, "epoch": 0.8343905005260784, "flos": 14939773384320.0, "grad_norm": 2.0462356502158445, "language_loss": 0.69456965, "learning_rate": 2.807782702318828e-07, "loss": 0.71550739, "num_input_tokens_seen": 299419820, "step": 13878, "time_per_iteration": 2.642768144607544 }, { "auxiliary_loss_clip": 0.01084966, "auxiliary_loss_mlp": 0.01028475, "balance_loss_clip": 1.03594303, "balance_loss_mlp": 1.01660752, "epoch": 0.8344506237787465, "flos": 15012456554880.0, "grad_norm": 2.2290576221184537, "language_loss": 0.790878, "learning_rate": 2.805793076661309e-07, "loss": 0.81201237, "num_input_tokens_seen": 299436265, "step": 13879, "time_per_iteration": 2.6227519512176514 }, { "auxiliary_loss_clip": 0.01061568, "auxiliary_loss_mlp": 0.01031482, "balance_loss_clip": 1.03857195, "balance_loss_mlp": 1.02046072, "epoch": 0.8345107470314144, "flos": 17560389847680.0, "grad_norm": 2.053274894813819, "language_loss": 0.8324911, "learning_rate": 2.803804103009828e-07, "loss": 0.85342157, "num_input_tokens_seen": 299451660, "step": 13880, "time_per_iteration": 2.7081100940704346 }, { "auxiliary_loss_clip": 0.01089609, "auxiliary_loss_mlp": 0.0102994, "balance_loss_clip": 1.035254, "balance_loss_mlp": 1.01767302, "epoch": 0.8345708702840824, "flos": 25187277398400.0, "grad_norm": 1.577577271354365, "language_loss": 0.78032011, "learning_rate": 2.80181578143982e-07, "loss": 0.80151558, "num_input_tokens_seen": 299472070, "step": 13881, "time_per_iteration": 2.672635793685913 }, { "auxiliary_loss_clip": 0.010645, "auxiliary_loss_mlp": 0.01024591, "balance_loss_clip": 1.03461313, "balance_loss_mlp": 1.01385057, "epoch": 0.8346309935367503, "flos": 15083559527040.0, "grad_norm": 2.2926708400629137, "language_loss": 0.78564227, "learning_rate": 2.7998281120266807e-07, "loss": 0.80653316, "num_input_tokens_seen": 299486725, "step": 13882, "time_per_iteration": 2.6480295658111572 }, { "auxiliary_loss_clip": 0.01070114, "auxiliary_loss_mlp": 0.01053336, "balance_loss_clip": 1.03278971, "balance_loss_mlp": 1.03948951, "epoch": 0.8346911167894183, "flos": 22930615491840.0, "grad_norm": 1.6147247688158133, "language_loss": 0.80761689, "learning_rate": 2.79784109484579e-07, "loss": 0.82885134, "num_input_tokens_seen": 299505435, "step": 13883, "time_per_iteration": 2.6793839931488037 }, { "auxiliary_loss_clip": 0.01096684, "auxiliary_loss_mlp": 0.01036312, "balance_loss_clip": 1.03590465, "balance_loss_mlp": 1.02373528, "epoch": 0.8347512400420862, "flos": 20193037367040.0, "grad_norm": 4.685073844907577, "language_loss": 0.74089235, "learning_rate": 2.795854729972482e-07, "loss": 0.76222229, "num_input_tokens_seen": 299523555, "step": 13884, "time_per_iteration": 2.604556083679199 }, { "auxiliary_loss_clip": 0.01095519, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.03934622, "balance_loss_mlp": 1.02086711, "epoch": 0.8348113632947542, "flos": 25954832148480.0, "grad_norm": 1.6937955935304687, "language_loss": 0.7016691, "learning_rate": 2.7938690174820913e-07, "loss": 0.72297329, "num_input_tokens_seen": 299541660, "step": 13885, "time_per_iteration": 2.6773464679718018 }, { "auxiliary_loss_clip": 0.01077954, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.03690195, "balance_loss_mlp": 1.02092576, "epoch": 0.8348714865474223, "flos": 34204554552960.0, "grad_norm": 1.8731982804534555, "language_loss": 0.6992318, "learning_rate": 2.791883957449912e-07, "loss": 0.72034615, "num_input_tokens_seen": 299562465, "step": 13886, "time_per_iteration": 2.8285069465637207 }, { "auxiliary_loss_clip": 0.01073957, "auxiliary_loss_mlp": 0.01033794, "balance_loss_clip": 1.03586638, "balance_loss_mlp": 1.01972771, "epoch": 0.8349316098000902, "flos": 24390132819840.0, "grad_norm": 2.5697448718102414, "language_loss": 0.79508579, "learning_rate": 2.7898995499512134e-07, "loss": 0.81616336, "num_input_tokens_seen": 299582700, "step": 13887, "time_per_iteration": 2.7178754806518555 }, { "auxiliary_loss_clip": 0.01092328, "auxiliary_loss_mlp": 0.00771043, "balance_loss_clip": 1.03849149, "balance_loss_mlp": 1.00030017, "epoch": 0.8349917330527582, "flos": 23032744836480.0, "grad_norm": 2.693894693314375, "language_loss": 0.64530712, "learning_rate": 2.7879157950612467e-07, "loss": 0.66394079, "num_input_tokens_seen": 299600310, "step": 13888, "time_per_iteration": 2.6735687255859375 }, { "auxiliary_loss_clip": 0.01088596, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.03663945, "balance_loss_mlp": 1.01663125, "epoch": 0.8350518563054261, "flos": 13625873792640.0, "grad_norm": 2.0620816016550436, "language_loss": 0.66669202, "learning_rate": 2.785932692855244e-07, "loss": 0.68786466, "num_input_tokens_seen": 299617025, "step": 13889, "time_per_iteration": 2.680638551712036 }, { "auxiliary_loss_clip": 0.01090008, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.03369141, "balance_loss_mlp": 1.01666367, "epoch": 0.8351119795580941, "flos": 21579799697280.0, "grad_norm": 2.20971990347348, "language_loss": 0.6832096, "learning_rate": 2.783950243408399e-07, "loss": 0.70439726, "num_input_tokens_seen": 299633050, "step": 13890, "time_per_iteration": 2.627889394760132 }, { "auxiliary_loss_clip": 0.01088958, "auxiliary_loss_mlp": 0.01036104, "balance_loss_clip": 1.03766465, "balance_loss_mlp": 1.02320004, "epoch": 0.835172102810762, "flos": 20038297576320.0, "grad_norm": 2.518146173573676, "language_loss": 0.59095812, "learning_rate": 2.7819684467958817e-07, "loss": 0.61220872, "num_input_tokens_seen": 299646445, "step": 13891, "time_per_iteration": 2.7173044681549072 }, { "auxiliary_loss_clip": 0.01099806, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.03822279, "balance_loss_mlp": 1.01823068, "epoch": 0.8352322260634301, "flos": 25111577485440.0, "grad_norm": 1.6267311876614727, "language_loss": 0.71812761, "learning_rate": 2.779987303092846e-07, "loss": 0.7394256, "num_input_tokens_seen": 299662665, "step": 13892, "time_per_iteration": 2.662322998046875 }, { "auxiliary_loss_clip": 0.01106347, "auxiliary_loss_mlp": 0.01034997, "balance_loss_clip": 1.03654015, "balance_loss_mlp": 1.02241993, "epoch": 0.835292349316098, "flos": 24863758577280.0, "grad_norm": 1.986327047613025, "language_loss": 0.65929645, "learning_rate": 2.7780068123744207e-07, "loss": 0.68070984, "num_input_tokens_seen": 299683585, "step": 13893, "time_per_iteration": 2.666810989379883 }, { "auxiliary_loss_clip": 0.01079282, "auxiliary_loss_mlp": 0.01024568, "balance_loss_clip": 1.03549695, "balance_loss_mlp": 1.01279628, "epoch": 0.835352472568766, "flos": 19865568049920.0, "grad_norm": 2.066965169500514, "language_loss": 0.78525186, "learning_rate": 2.7760269747156996e-07, "loss": 0.80629033, "num_input_tokens_seen": 299702680, "step": 13894, "time_per_iteration": 2.656261920928955 }, { "auxiliary_loss_clip": 0.01089446, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.03555644, "balance_loss_mlp": 1.01722014, "epoch": 0.8354125958214339, "flos": 22054754257920.0, "grad_norm": 1.734895870039155, "language_loss": 0.72428441, "learning_rate": 2.7740477901917625e-07, "loss": 0.74547529, "num_input_tokens_seen": 299721050, "step": 13895, "time_per_iteration": 2.5912013053894043 }, { "auxiliary_loss_clip": 0.01098522, "auxiliary_loss_mlp": 0.01043176, "balance_loss_clip": 1.0375011, "balance_loss_mlp": 1.02959836, "epoch": 0.8354727190741019, "flos": 21397804462080.0, "grad_norm": 2.180746282209792, "language_loss": 0.72239274, "learning_rate": 2.772069258877667e-07, "loss": 0.7438097, "num_input_tokens_seen": 299738255, "step": 13896, "time_per_iteration": 2.816459894180298 }, { "auxiliary_loss_clip": 0.01096666, "auxiliary_loss_mlp": 0.01033423, "balance_loss_clip": 1.03551006, "balance_loss_mlp": 1.02084064, "epoch": 0.8355328423267698, "flos": 50840997834240.0, "grad_norm": 2.4314822364196456, "language_loss": 0.5891223, "learning_rate": 2.770091380848423e-07, "loss": 0.61042321, "num_input_tokens_seen": 299761315, "step": 13897, "time_per_iteration": 2.854132652282715 }, { "auxiliary_loss_clip": 0.01029051, "auxiliary_loss_mlp": 0.00750932, "balance_loss_clip": 1.00681758, "balance_loss_mlp": 0.99963111, "epoch": 0.8355929655794379, "flos": 65551052764800.0, "grad_norm": 0.6926411996792173, "language_loss": 0.57645589, "learning_rate": 2.7681141561790423e-07, "loss": 0.59425569, "num_input_tokens_seen": 299828735, "step": 13898, "time_per_iteration": 3.189154624938965 }, { "auxiliary_loss_clip": 0.01095352, "auxiliary_loss_mlp": 0.01038588, "balance_loss_clip": 1.03804767, "balance_loss_mlp": 1.02465272, "epoch": 0.8356530888321058, "flos": 19170516902400.0, "grad_norm": 1.982321802271085, "language_loss": 0.79983473, "learning_rate": 2.7661375849444967e-07, "loss": 0.8211742, "num_input_tokens_seen": 299848395, "step": 13899, "time_per_iteration": 2.6372761726379395 }, { "auxiliary_loss_clip": 0.01110341, "auxiliary_loss_mlp": 0.01035111, "balance_loss_clip": 1.03744435, "balance_loss_mlp": 1.0235889, "epoch": 0.8357132120847738, "flos": 44126672238720.0, "grad_norm": 3.0475154129794473, "language_loss": 0.69246173, "learning_rate": 2.764161667219749e-07, "loss": 0.7139163, "num_input_tokens_seen": 299871665, "step": 13900, "time_per_iteration": 2.7809805870056152 }, { "auxiliary_loss_clip": 0.01086706, "auxiliary_loss_mlp": 0.01030844, "balance_loss_clip": 1.03770447, "balance_loss_mlp": 1.01880407, "epoch": 0.8357733353374418, "flos": 24389701856640.0, "grad_norm": 1.5605306335935556, "language_loss": 0.71076608, "learning_rate": 2.762186403079716e-07, "loss": 0.73194158, "num_input_tokens_seen": 299891960, "step": 13901, "time_per_iteration": 2.6282958984375 }, { "auxiliary_loss_clip": 0.01065898, "auxiliary_loss_mlp": 0.01039549, "balance_loss_clip": 1.03284073, "balance_loss_mlp": 1.02650762, "epoch": 0.8358334585901097, "flos": 20916313626240.0, "grad_norm": 2.0190709128744686, "language_loss": 0.79701173, "learning_rate": 2.7602117925992963e-07, "loss": 0.81806624, "num_input_tokens_seen": 299905070, "step": 13902, "time_per_iteration": 2.690213441848755 }, { "auxiliary_loss_clip": 0.01096183, "auxiliary_loss_mlp": 0.01031756, "balance_loss_clip": 1.03576422, "balance_loss_mlp": 1.01979947, "epoch": 0.8358935818427777, "flos": 19244169740160.0, "grad_norm": 1.8244739444872173, "language_loss": 0.62556911, "learning_rate": 2.758237835853379e-07, "loss": 0.6468485, "num_input_tokens_seen": 299925130, "step": 13903, "time_per_iteration": 2.6231348514556885 }, { "auxiliary_loss_clip": 0.01084825, "auxiliary_loss_mlp": 0.01036663, "balance_loss_clip": 1.03508985, "balance_loss_mlp": 1.02401519, "epoch": 0.8359537050954456, "flos": 24134053783680.0, "grad_norm": 4.416778142718545, "language_loss": 0.7411294, "learning_rate": 2.7562645329168054e-07, "loss": 0.7623443, "num_input_tokens_seen": 299943845, "step": 13904, "time_per_iteration": 2.7428109645843506 }, { "auxiliary_loss_clip": 0.01082834, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.03411317, "balance_loss_mlp": 1.017802, "epoch": 0.8360138283481137, "flos": 16180415187840.0, "grad_norm": 1.704840597436559, "language_loss": 0.72898692, "learning_rate": 2.7542918838644104e-07, "loss": 0.75012279, "num_input_tokens_seen": 299961620, "step": 13905, "time_per_iteration": 2.7568368911743164 }, { "auxiliary_loss_clip": 0.01096191, "auxiliary_loss_mlp": 0.01036518, "balance_loss_clip": 1.03658271, "balance_loss_mlp": 1.02507973, "epoch": 0.8360739516007816, "flos": 22198899536640.0, "grad_norm": 1.8034628053468047, "language_loss": 0.66455811, "learning_rate": 2.752319888771e-07, "loss": 0.68588519, "num_input_tokens_seen": 299982170, "step": 13906, "time_per_iteration": 2.6989848613739014 }, { "auxiliary_loss_clip": 0.01096481, "auxiliary_loss_mlp": 0.01028681, "balance_loss_clip": 1.03535354, "balance_loss_mlp": 1.01639068, "epoch": 0.8361340748534496, "flos": 20923137210240.0, "grad_norm": 2.553726632874823, "language_loss": 0.74047542, "learning_rate": 2.7503485477113475e-07, "loss": 0.76172698, "num_input_tokens_seen": 300001330, "step": 13907, "time_per_iteration": 2.5955686569213867 }, { "auxiliary_loss_clip": 0.0107652, "auxiliary_loss_mlp": 0.0103429, "balance_loss_clip": 1.03481364, "balance_loss_mlp": 1.02162361, "epoch": 0.8361941981061175, "flos": 26173599932160.0, "grad_norm": 11.039148102509154, "language_loss": 0.75409931, "learning_rate": 2.7483778607602005e-07, "loss": 0.7752074, "num_input_tokens_seen": 300020645, "step": 13908, "time_per_iteration": 2.696906566619873 }, { "auxiliary_loss_clip": 0.01097882, "auxiliary_loss_mlp": 0.01031312, "balance_loss_clip": 1.03590965, "balance_loss_mlp": 1.01825249, "epoch": 0.8362543213587855, "flos": 24419363512320.0, "grad_norm": 1.9388338218951495, "language_loss": 0.71320546, "learning_rate": 2.7464078279922964e-07, "loss": 0.73449743, "num_input_tokens_seen": 300039945, "step": 13909, "time_per_iteration": 5.753490686416626 }, { "auxiliary_loss_clip": 0.01112711, "auxiliary_loss_mlp": 0.00771249, "balance_loss_clip": 1.03798199, "balance_loss_mlp": 1.00026917, "epoch": 0.8363144446114534, "flos": 17202396948480.0, "grad_norm": 1.7414813953695425, "language_loss": 0.73090255, "learning_rate": 2.744438449482338e-07, "loss": 0.74974209, "num_input_tokens_seen": 300058260, "step": 13910, "time_per_iteration": 2.6283226013183594 }, { "auxiliary_loss_clip": 0.01095006, "auxiliary_loss_mlp": 0.00772614, "balance_loss_clip": 1.0360173, "balance_loss_mlp": 1.00014329, "epoch": 0.8363745678641215, "flos": 19279398003840.0, "grad_norm": 2.750713587824779, "language_loss": 0.73741031, "learning_rate": 2.742469725305001e-07, "loss": 0.75608653, "num_input_tokens_seen": 300076720, "step": 13911, "time_per_iteration": 4.149497985839844 }, { "auxiliary_loss_clip": 0.01090915, "auxiliary_loss_mlp": 0.01037296, "balance_loss_clip": 1.03743172, "balance_loss_mlp": 1.02490461, "epoch": 0.8364346911167894, "flos": 11874869596800.0, "grad_norm": 2.172823280625671, "language_loss": 0.78602064, "learning_rate": 2.740501655534946e-07, "loss": 0.80730277, "num_input_tokens_seen": 300092950, "step": 13912, "time_per_iteration": 2.7247042655944824 }, { "auxiliary_loss_clip": 0.01099282, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.03660643, "balance_loss_mlp": 1.01849961, "epoch": 0.8364948143694574, "flos": 20225212974720.0, "grad_norm": 1.9881992667595267, "language_loss": 0.7914685, "learning_rate": 2.738534240246797e-07, "loss": 0.81276178, "num_input_tokens_seen": 300110950, "step": 13913, "time_per_iteration": 4.134316682815552 }, { "auxiliary_loss_clip": 0.01097532, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.03647411, "balance_loss_mlp": 1.0179075, "epoch": 0.8365549376221254, "flos": 21612909058560.0, "grad_norm": 2.642445797747624, "language_loss": 0.73418862, "learning_rate": 2.736567479515153e-07, "loss": 0.75547129, "num_input_tokens_seen": 300128705, "step": 13914, "time_per_iteration": 2.571171760559082 }, { "auxiliary_loss_clip": 0.01062932, "auxiliary_loss_mlp": 0.0103246, "balance_loss_clip": 1.03763103, "balance_loss_mlp": 1.01987803, "epoch": 0.8366150608747933, "flos": 23294210912640.0, "grad_norm": 1.590677583762713, "language_loss": 0.71320194, "learning_rate": 2.7346013734146025e-07, "loss": 0.73415583, "num_input_tokens_seen": 300148635, "step": 13915, "time_per_iteration": 2.751453161239624 }, { "auxiliary_loss_clip": 0.01080426, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 1.03791106, "balance_loss_mlp": 1.01852822, "epoch": 0.8366751841274613, "flos": 15267673664640.0, "grad_norm": 1.8965589808135064, "language_loss": 0.71970236, "learning_rate": 2.7326359220197035e-07, "loss": 0.74080902, "num_input_tokens_seen": 300165490, "step": 13916, "time_per_iteration": 2.6807093620300293 }, { "auxiliary_loss_clip": 0.01077533, "auxiliary_loss_mlp": 0.00770081, "balance_loss_clip": 1.03652239, "balance_loss_mlp": 1.00017905, "epoch": 0.8367353073801292, "flos": 13224931205760.0, "grad_norm": 2.314558822643351, "language_loss": 0.74767375, "learning_rate": 2.7306711254049755e-07, "loss": 0.76614988, "num_input_tokens_seen": 300182130, "step": 13917, "time_per_iteration": 2.6898746490478516 }, { "auxiliary_loss_clip": 0.01107617, "auxiliary_loss_mlp": 0.01034237, "balance_loss_clip": 1.03919959, "balance_loss_mlp": 1.02238786, "epoch": 0.8367954306327973, "flos": 24205084928640.0, "grad_norm": 1.7520480918143468, "language_loss": 0.79073501, "learning_rate": 2.728706983644933e-07, "loss": 0.81215358, "num_input_tokens_seen": 300203050, "step": 13918, "time_per_iteration": 2.585444450378418 }, { "auxiliary_loss_clip": 0.01069111, "auxiliary_loss_mlp": 0.01035129, "balance_loss_clip": 1.03858578, "balance_loss_mlp": 1.02256465, "epoch": 0.8368555538854652, "flos": 24534744975360.0, "grad_norm": 1.689646886321145, "language_loss": 0.67851698, "learning_rate": 2.7267434968140457e-07, "loss": 0.69955939, "num_input_tokens_seen": 300224380, "step": 13919, "time_per_iteration": 2.7965781688690186 }, { "auxiliary_loss_clip": 0.0109041, "auxiliary_loss_mlp": 0.01036947, "balance_loss_clip": 1.03292394, "balance_loss_mlp": 1.02389407, "epoch": 0.8369156771381332, "flos": 20259363830400.0, "grad_norm": 1.776956438502091, "language_loss": 0.73908985, "learning_rate": 2.7247806649867835e-07, "loss": 0.76036346, "num_input_tokens_seen": 300242915, "step": 13920, "time_per_iteration": 2.636904716491699 }, { "auxiliary_loss_clip": 0.01088456, "auxiliary_loss_mlp": 0.01029889, "balance_loss_clip": 1.03454947, "balance_loss_mlp": 1.0174973, "epoch": 0.8369758003908011, "flos": 21835555511040.0, "grad_norm": 1.8125965247419975, "language_loss": 0.68985099, "learning_rate": 2.722818488237566e-07, "loss": 0.71103442, "num_input_tokens_seen": 300261905, "step": 13921, "time_per_iteration": 2.649538278579712 }, { "auxiliary_loss_clip": 0.01101931, "auxiliary_loss_mlp": 0.01031907, "balance_loss_clip": 1.03782213, "balance_loss_mlp": 1.01993847, "epoch": 0.8370359236434691, "flos": 21719312121600.0, "grad_norm": 1.96204567174708, "language_loss": 0.85527766, "learning_rate": 2.720856966640801e-07, "loss": 0.876616, "num_input_tokens_seen": 300281145, "step": 13922, "time_per_iteration": 2.6043357849121094 }, { "auxiliary_loss_clip": 0.01068346, "auxiliary_loss_mlp": 0.00769545, "balance_loss_clip": 1.0355072, "balance_loss_mlp": 1.00012457, "epoch": 0.837096046896137, "flos": 23148880485120.0, "grad_norm": 1.4962135590682923, "language_loss": 0.71717429, "learning_rate": 2.71889610027088e-07, "loss": 0.73555321, "num_input_tokens_seen": 300301610, "step": 13923, "time_per_iteration": 2.6874313354492188 }, { "auxiliary_loss_clip": 0.01082449, "auxiliary_loss_mlp": 0.01029737, "balance_loss_clip": 1.03654337, "balance_loss_mlp": 1.01662445, "epoch": 0.8371561701488051, "flos": 24492872695680.0, "grad_norm": 1.885906995135632, "language_loss": 0.759628, "learning_rate": 2.7169358892021433e-07, "loss": 0.78074992, "num_input_tokens_seen": 300319420, "step": 13924, "time_per_iteration": 2.671105146408081 }, { "auxiliary_loss_clip": 0.01084333, "auxiliary_loss_mlp": 0.01027444, "balance_loss_clip": 1.03405309, "balance_loss_mlp": 1.01530862, "epoch": 0.837216293401473, "flos": 29206723161600.0, "grad_norm": 1.5298544720059444, "language_loss": 0.64247084, "learning_rate": 2.7149763335089293e-07, "loss": 0.66358864, "num_input_tokens_seen": 300341325, "step": 13925, "time_per_iteration": 2.6903226375579834 }, { "auxiliary_loss_clip": 0.01086129, "auxiliary_loss_mlp": 0.01032025, "balance_loss_clip": 1.03692162, "balance_loss_mlp": 1.01949048, "epoch": 0.837276416654141, "flos": 25265275781760.0, "grad_norm": 2.05791983020966, "language_loss": 0.74643993, "learning_rate": 2.713017433265543e-07, "loss": 0.7676214, "num_input_tokens_seen": 300361620, "step": 13926, "time_per_iteration": 2.7113802433013916 }, { "auxiliary_loss_clip": 0.01099802, "auxiliary_loss_mlp": 0.01036252, "balance_loss_clip": 1.03915524, "balance_loss_mlp": 1.02321601, "epoch": 0.837336539906809, "flos": 13882024656000.0, "grad_norm": 5.046746125844788, "language_loss": 0.71061361, "learning_rate": 2.711059188546274e-07, "loss": 0.73197412, "num_input_tokens_seen": 300378675, "step": 13927, "time_per_iteration": 2.5931549072265625 }, { "auxiliary_loss_clip": 0.01001985, "auxiliary_loss_mlp": 0.01002351, "balance_loss_clip": 1.00909257, "balance_loss_mlp": 1.00123012, "epoch": 0.8373966631594769, "flos": 68870599044480.0, "grad_norm": 0.7031143541961051, "language_loss": 0.58787715, "learning_rate": 2.7091015994253695e-07, "loss": 0.60792047, "num_input_tokens_seen": 300449740, "step": 13928, "time_per_iteration": 3.2960739135742188 }, { "auxiliary_loss_clip": 0.01071961, "auxiliary_loss_mlp": 0.01042566, "balance_loss_clip": 1.03641248, "balance_loss_mlp": 1.02829027, "epoch": 0.8374567864121449, "flos": 20448972748800.0, "grad_norm": 1.77650313447611, "language_loss": 0.69560969, "learning_rate": 2.707144665977068e-07, "loss": 0.71675503, "num_input_tokens_seen": 300470000, "step": 13929, "time_per_iteration": 2.6807215213775635 }, { "auxiliary_loss_clip": 0.01098571, "auxiliary_loss_mlp": 0.01027466, "balance_loss_clip": 1.03639913, "balance_loss_mlp": 1.01444805, "epoch": 0.8375169096648128, "flos": 41904197101440.0, "grad_norm": 1.5497858215784133, "language_loss": 0.66676092, "learning_rate": 2.705188388275574e-07, "loss": 0.68802124, "num_input_tokens_seen": 300494975, "step": 13930, "time_per_iteration": 2.861119031906128 }, { "auxiliary_loss_clip": 0.01066352, "auxiliary_loss_mlp": 0.01027411, "balance_loss_clip": 1.03803921, "balance_loss_mlp": 1.01527548, "epoch": 0.8375770329174809, "flos": 20009354192640.0, "grad_norm": 1.6161268751514244, "language_loss": 0.71101642, "learning_rate": 2.703232766395067e-07, "loss": 0.7319541, "num_input_tokens_seen": 300513175, "step": 13931, "time_per_iteration": 2.8232531547546387 }, { "auxiliary_loss_clip": 0.01072605, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.03191161, "balance_loss_mlp": 1.02047718, "epoch": 0.8376371561701488, "flos": 22783597125120.0, "grad_norm": 2.2209749295259913, "language_loss": 0.71790922, "learning_rate": 2.701277800409705e-07, "loss": 0.73896432, "num_input_tokens_seen": 300533770, "step": 13932, "time_per_iteration": 2.7237002849578857 }, { "auxiliary_loss_clip": 0.0104491, "auxiliary_loss_mlp": 0.01034703, "balance_loss_clip": 1.03452373, "balance_loss_mlp": 1.02334225, "epoch": 0.8376972794228168, "flos": 23914459987200.0, "grad_norm": 2.3684193578736785, "language_loss": 0.66962039, "learning_rate": 2.699323490393628e-07, "loss": 0.69041657, "num_input_tokens_seen": 300552995, "step": 13933, "time_per_iteration": 2.926781415939331 }, { "auxiliary_loss_clip": 0.01079254, "auxiliary_loss_mlp": 0.01043444, "balance_loss_clip": 1.03600967, "balance_loss_mlp": 1.0309329, "epoch": 0.8377574026754847, "flos": 13734718980480.0, "grad_norm": 1.9196886282794297, "language_loss": 0.76411772, "learning_rate": 2.697369836420933e-07, "loss": 0.78534472, "num_input_tokens_seen": 300570275, "step": 13934, "time_per_iteration": 2.8570826053619385 }, { "auxiliary_loss_clip": 0.01100527, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.04098976, "balance_loss_mlp": 1.01738369, "epoch": 0.8378175259281527, "flos": 21651333632640.0, "grad_norm": 1.505075221616912, "language_loss": 0.77353156, "learning_rate": 2.6954168385657115e-07, "loss": 0.79483283, "num_input_tokens_seen": 300590875, "step": 13935, "time_per_iteration": 2.6582868099212646 }, { "auxiliary_loss_clip": 0.01070099, "auxiliary_loss_mlp": 0.01027318, "balance_loss_clip": 1.03629911, "balance_loss_mlp": 1.01469421, "epoch": 0.8378776491808206, "flos": 15448806973440.0, "grad_norm": 5.23588368234973, "language_loss": 0.56080019, "learning_rate": 2.6934644969020135e-07, "loss": 0.58177441, "num_input_tokens_seen": 300607490, "step": 13936, "time_per_iteration": 2.684828042984009 }, { "auxiliary_loss_clip": 0.01090807, "auxiliary_loss_mlp": 0.0103317, "balance_loss_clip": 1.03235912, "balance_loss_mlp": 1.02025938, "epoch": 0.8379377724334887, "flos": 14720395069440.0, "grad_norm": 2.797405460790855, "language_loss": 0.89294749, "learning_rate": 2.691512811503882e-07, "loss": 0.91418725, "num_input_tokens_seen": 300623635, "step": 13937, "time_per_iteration": 2.5899250507354736 }, { "auxiliary_loss_clip": 0.01099019, "auxiliary_loss_mlp": 0.01026723, "balance_loss_clip": 1.03668594, "balance_loss_mlp": 1.01458192, "epoch": 0.8379978956861566, "flos": 24535247765760.0, "grad_norm": 2.44657354170433, "language_loss": 0.81838822, "learning_rate": 2.689561782445313e-07, "loss": 0.83964562, "num_input_tokens_seen": 300643835, "step": 13938, "time_per_iteration": 2.634232521057129 }, { "auxiliary_loss_clip": 0.01101448, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.03746319, "balance_loss_mlp": 1.01988757, "epoch": 0.8380580189388246, "flos": 18952611045120.0, "grad_norm": 1.6841429045668255, "language_loss": 0.7044903, "learning_rate": 2.6876114098002965e-07, "loss": 0.72583556, "num_input_tokens_seen": 300662500, "step": 13939, "time_per_iteration": 2.61344575881958 }, { "auxiliary_loss_clip": 0.01078321, "auxiliary_loss_mlp": 0.01039086, "balance_loss_clip": 1.03616691, "balance_loss_mlp": 1.02573454, "epoch": 0.8381181421914926, "flos": 26540283922560.0, "grad_norm": 6.593424997719047, "language_loss": 0.76224637, "learning_rate": 2.6856616936428e-07, "loss": 0.78342044, "num_input_tokens_seen": 300681480, "step": 13940, "time_per_iteration": 2.6947879791259766 }, { "auxiliary_loss_clip": 0.01093556, "auxiliary_loss_mlp": 0.01034451, "balance_loss_clip": 1.0350914, "balance_loss_mlp": 1.02206564, "epoch": 0.8381782654441605, "flos": 23291481479040.0, "grad_norm": 1.6698207376849759, "language_loss": 0.76370448, "learning_rate": 2.6837126340467374e-07, "loss": 0.78498459, "num_input_tokens_seen": 300699165, "step": 13941, "time_per_iteration": 2.629971742630005 }, { "auxiliary_loss_clip": 0.01068862, "auxiliary_loss_mlp": 0.01030093, "balance_loss_clip": 1.03516936, "balance_loss_mlp": 1.01728964, "epoch": 0.8382383886968285, "flos": 26758800311040.0, "grad_norm": 2.4302037617265793, "language_loss": 0.73204666, "learning_rate": 2.6817642310860276e-07, "loss": 0.75303626, "num_input_tokens_seen": 300714615, "step": 13942, "time_per_iteration": 2.741283893585205 }, { "auxiliary_loss_clip": 0.01067172, "auxiliary_loss_mlp": 0.01039608, "balance_loss_clip": 1.03562307, "balance_loss_mlp": 1.02545786, "epoch": 0.8382985119494964, "flos": 26104544035200.0, "grad_norm": 1.6102800053781703, "language_loss": 0.79528558, "learning_rate": 2.679816484834554e-07, "loss": 0.81635338, "num_input_tokens_seen": 300734860, "step": 13943, "time_per_iteration": 2.7648844718933105 }, { "auxiliary_loss_clip": 0.01057583, "auxiliary_loss_mlp": 0.01030261, "balance_loss_clip": 1.03292835, "balance_loss_mlp": 1.01832187, "epoch": 0.8383586352021645, "flos": 16435129507200.0, "grad_norm": 1.9936529414882505, "language_loss": 0.85062182, "learning_rate": 2.6778693953661766e-07, "loss": 0.87150025, "num_input_tokens_seen": 300752735, "step": 13944, "time_per_iteration": 2.702016592025757 }, { "auxiliary_loss_clip": 0.01009407, "auxiliary_loss_mlp": 0.00750919, "balance_loss_clip": 1.00603545, "balance_loss_mlp": 0.99966449, "epoch": 0.8384187584548324, "flos": 64195532288640.0, "grad_norm": 0.6194539078330007, "language_loss": 0.50268608, "learning_rate": 2.6759229627547263e-07, "loss": 0.5202893, "num_input_tokens_seen": 300820760, "step": 13945, "time_per_iteration": 3.2719228267669678 }, { "auxiliary_loss_clip": 0.01067358, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.03898573, "balance_loss_mlp": 1.01964068, "epoch": 0.8384788817075004, "flos": 22382905933440.0, "grad_norm": 2.0041104630114726, "language_loss": 0.64992464, "learning_rate": 2.673977187074017e-07, "loss": 0.67091638, "num_input_tokens_seen": 300840025, "step": 13946, "time_per_iteration": 2.7332231998443604 }, { "auxiliary_loss_clip": 0.01060162, "auxiliary_loss_mlp": 0.01032335, "balance_loss_clip": 1.03414798, "balance_loss_mlp": 1.01927578, "epoch": 0.8385390049601683, "flos": 29496845312640.0, "grad_norm": 1.5282176707936672, "language_loss": 0.67431152, "learning_rate": 2.672032068397829e-07, "loss": 0.69523644, "num_input_tokens_seen": 300860380, "step": 13947, "time_per_iteration": 2.8148739337921143 }, { "auxiliary_loss_clip": 0.01084671, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.03683496, "balance_loss_mlp": 1.02082467, "epoch": 0.8385991282128363, "flos": 32707797799680.0, "grad_norm": 2.1167215710156566, "language_loss": 0.70042205, "learning_rate": 2.6700876067999176e-07, "loss": 0.72161293, "num_input_tokens_seen": 300881895, "step": 13948, "time_per_iteration": 4.3659327030181885 }, { "auxiliary_loss_clip": 0.01084202, "auxiliary_loss_mlp": 0.01032949, "balance_loss_clip": 1.03514576, "balance_loss_mlp": 1.02195239, "epoch": 0.8386592514655042, "flos": 25441022050560.0, "grad_norm": 2.5602152463146033, "language_loss": 0.85150999, "learning_rate": 2.6681438023540194e-07, "loss": 0.8726815, "num_input_tokens_seen": 300901575, "step": 13949, "time_per_iteration": 4.24399995803833 }, { "auxiliary_loss_clip": 0.01081801, "auxiliary_loss_mlp": 0.01029076, "balance_loss_clip": 1.03833914, "balance_loss_mlp": 1.01670778, "epoch": 0.8387193747181723, "flos": 22015898720640.0, "grad_norm": 4.303340454207266, "language_loss": 0.69926894, "learning_rate": 2.66620065513385e-07, "loss": 0.72037774, "num_input_tokens_seen": 300919735, "step": 13950, "time_per_iteration": 4.277710914611816 }, { "auxiliary_loss_clip": 0.01091242, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.03648567, "balance_loss_mlp": 1.01687598, "epoch": 0.8387794979708402, "flos": 18150223080960.0, "grad_norm": 1.697645904301953, "language_loss": 0.6442672, "learning_rate": 2.6642581652130913e-07, "loss": 0.6654734, "num_input_tokens_seen": 300939150, "step": 13951, "time_per_iteration": 2.564544439315796 }, { "auxiliary_loss_clip": 0.01100469, "auxiliary_loss_mlp": 0.01030012, "balance_loss_clip": 1.03913283, "balance_loss_mlp": 1.01795959, "epoch": 0.8388396212235082, "flos": 25411216740480.0, "grad_norm": 1.428691785600865, "language_loss": 0.69986898, "learning_rate": 2.662316332665393e-07, "loss": 0.72117376, "num_input_tokens_seen": 300959730, "step": 13952, "time_per_iteration": 4.1969122886657715 }, { "auxiliary_loss_clip": 0.01096336, "auxiliary_loss_mlp": 0.01032886, "balance_loss_clip": 1.03714609, "balance_loss_mlp": 1.02128077, "epoch": 0.8388997444761762, "flos": 22273055164800.0, "grad_norm": 1.8744192088426839, "language_loss": 0.72788477, "learning_rate": 2.6603751575643987e-07, "loss": 0.74917698, "num_input_tokens_seen": 300976120, "step": 13953, "time_per_iteration": 2.6013376712799072 }, { "auxiliary_loss_clip": 0.01036141, "auxiliary_loss_mlp": 0.01033482, "balance_loss_clip": 1.03169441, "balance_loss_mlp": 1.01992166, "epoch": 0.8389598677288441, "flos": 19573219255680.0, "grad_norm": 1.9148205474215427, "language_loss": 0.68345833, "learning_rate": 2.6584346399837176e-07, "loss": 0.70415455, "num_input_tokens_seen": 300995080, "step": 13954, "time_per_iteration": 2.7236297130584717 }, { "auxiliary_loss_clip": 0.0108771, "auxiliary_loss_mlp": 0.01035111, "balance_loss_clip": 1.03767776, "balance_loss_mlp": 1.0240128, "epoch": 0.8390199909815121, "flos": 17384715406080.0, "grad_norm": 1.7636414088599872, "language_loss": 0.7324779, "learning_rate": 2.656494779996932e-07, "loss": 0.7537061, "num_input_tokens_seen": 301012920, "step": 13955, "time_per_iteration": 2.661045551300049 }, { "auxiliary_loss_clip": 0.01043432, "auxiliary_loss_mlp": 0.01032322, "balance_loss_clip": 1.03135204, "balance_loss_mlp": 1.019346, "epoch": 0.83908011423418, "flos": 24639639667200.0, "grad_norm": 8.047952352869046, "language_loss": 0.66471386, "learning_rate": 2.6545555776775995e-07, "loss": 0.68547142, "num_input_tokens_seen": 301028875, "step": 13956, "time_per_iteration": 2.7914817333221436 }, { "auxiliary_loss_clip": 0.01099865, "auxiliary_loss_mlp": 0.01036313, "balance_loss_clip": 1.03744364, "balance_loss_mlp": 1.02332473, "epoch": 0.8391402374868481, "flos": 24718356322560.0, "grad_norm": 2.4631411130881995, "language_loss": 0.79544741, "learning_rate": 2.6526170330992667e-07, "loss": 0.81680918, "num_input_tokens_seen": 301050115, "step": 13957, "time_per_iteration": 2.7476260662078857 }, { "auxiliary_loss_clip": 0.00983967, "auxiliary_loss_mlp": 0.01019247, "balance_loss_clip": 1.01336145, "balance_loss_mlp": 1.01760185, "epoch": 0.839200360739516, "flos": 56871695784960.0, "grad_norm": 0.7593984964089096, "language_loss": 0.53379953, "learning_rate": 2.6506791463354283e-07, "loss": 0.5538317, "num_input_tokens_seen": 301114155, "step": 13958, "time_per_iteration": 3.488459825515747 }, { "auxiliary_loss_clip": 0.01098132, "auxiliary_loss_mlp": 0.01032752, "balance_loss_clip": 1.03722978, "balance_loss_mlp": 1.01981759, "epoch": 0.839260483992184, "flos": 18332792933760.0, "grad_norm": 1.8527919164572209, "language_loss": 0.72979414, "learning_rate": 2.648741917459574e-07, "loss": 0.75110304, "num_input_tokens_seen": 301133150, "step": 13959, "time_per_iteration": 3.048078775405884 }, { "auxiliary_loss_clip": 0.01075035, "auxiliary_loss_mlp": 0.01024754, "balance_loss_clip": 1.037763, "balance_loss_mlp": 1.01298177, "epoch": 0.8393206072448519, "flos": 27087921653760.0, "grad_norm": 2.0364412666865843, "language_loss": 0.55541557, "learning_rate": 2.646805346545169e-07, "loss": 0.57641345, "num_input_tokens_seen": 301153600, "step": 13960, "time_per_iteration": 2.835035800933838 }, { "auxiliary_loss_clip": 0.01002229, "auxiliary_loss_mlp": 0.01000998, "balance_loss_clip": 1.0077697, "balance_loss_mlp": 1.00003195, "epoch": 0.8393807304975199, "flos": 61521192057600.0, "grad_norm": 0.7763315867784596, "language_loss": 0.60705209, "learning_rate": 2.6448694336656397e-07, "loss": 0.62708437, "num_input_tokens_seen": 301214335, "step": 13961, "time_per_iteration": 3.3663535118103027 }, { "auxiliary_loss_clip": 0.01052805, "auxiliary_loss_mlp": 0.010396, "balance_loss_clip": 1.02986741, "balance_loss_mlp": 1.02657616, "epoch": 0.8394408537501878, "flos": 14894848448640.0, "grad_norm": 2.557584362268972, "language_loss": 0.68461823, "learning_rate": 2.642934178894405e-07, "loss": 0.70554227, "num_input_tokens_seen": 301228960, "step": 13962, "time_per_iteration": 2.6838927268981934 }, { "auxiliary_loss_clip": 0.01077301, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.03520894, "balance_loss_mlp": 1.01575601, "epoch": 0.8395009770028559, "flos": 17412186332160.0, "grad_norm": 1.9087314512083013, "language_loss": 0.72709483, "learning_rate": 2.640999582304841e-07, "loss": 0.74814475, "num_input_tokens_seen": 301245875, "step": 13963, "time_per_iteration": 2.7063026428222656 }, { "auxiliary_loss_clip": 0.01085945, "auxiliary_loss_mlp": 0.01034113, "balance_loss_clip": 1.035007, "balance_loss_mlp": 1.02194226, "epoch": 0.8395611002555238, "flos": 27924747782400.0, "grad_norm": 1.5783482209537385, "language_loss": 0.76520944, "learning_rate": 2.6390656439703173e-07, "loss": 0.78640997, "num_input_tokens_seen": 301265550, "step": 13964, "time_per_iteration": 2.7841615676879883 }, { "auxiliary_loss_clip": 0.01089552, "auxiliary_loss_mlp": 0.01036408, "balance_loss_clip": 1.03722572, "balance_loss_mlp": 1.02287793, "epoch": 0.8396212235081918, "flos": 11100922225920.0, "grad_norm": 2.0757447568639633, "language_loss": 0.78032225, "learning_rate": 2.637132363964161e-07, "loss": 0.80158186, "num_input_tokens_seen": 301282035, "step": 13965, "time_per_iteration": 2.67738938331604 }, { "auxiliary_loss_clip": 0.01092348, "auxiliary_loss_mlp": 0.01032536, "balance_loss_clip": 1.03630924, "balance_loss_mlp": 1.02068114, "epoch": 0.8396813467608598, "flos": 35735641729920.0, "grad_norm": 1.499677295305954, "language_loss": 0.65898132, "learning_rate": 2.635199742359684e-07, "loss": 0.68023014, "num_input_tokens_seen": 301305210, "step": 13966, "time_per_iteration": 2.7493228912353516 }, { "auxiliary_loss_clip": 0.01086107, "auxiliary_loss_mlp": 0.01032854, "balance_loss_clip": 1.03722155, "balance_loss_mlp": 1.02049196, "epoch": 0.8397414700135277, "flos": 26176724415360.0, "grad_norm": 1.9100754434512948, "language_loss": 0.74755192, "learning_rate": 2.633267779230177e-07, "loss": 0.76874149, "num_input_tokens_seen": 301324885, "step": 13967, "time_per_iteration": 2.6640665531158447 }, { "auxiliary_loss_clip": 0.01081249, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.0370276, "balance_loss_mlp": 1.01756883, "epoch": 0.8398015932661957, "flos": 18333116156160.0, "grad_norm": 1.8492234177580402, "language_loss": 0.82993788, "learning_rate": 2.6313364746488974e-07, "loss": 0.85104513, "num_input_tokens_seen": 301343070, "step": 13968, "time_per_iteration": 2.5900182723999023 }, { "auxiliary_loss_clip": 0.01083656, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.03804672, "balance_loss_mlp": 1.01986384, "epoch": 0.8398617165188637, "flos": 17379507934080.0, "grad_norm": 2.094652231343387, "language_loss": 0.7729916, "learning_rate": 2.629405828689075e-07, "loss": 0.7941494, "num_input_tokens_seen": 301359280, "step": 13969, "time_per_iteration": 2.611394166946411 }, { "auxiliary_loss_clip": 0.01090762, "auxiliary_loss_mlp": 0.01030453, "balance_loss_clip": 1.03660858, "balance_loss_mlp": 1.01741195, "epoch": 0.8399218397715317, "flos": 22929681738240.0, "grad_norm": 2.0003618837908244, "language_loss": 0.77181804, "learning_rate": 2.627475841423923e-07, "loss": 0.79303014, "num_input_tokens_seen": 301376465, "step": 13970, "time_per_iteration": 2.6121816635131836 }, { "auxiliary_loss_clip": 0.01087144, "auxiliary_loss_mlp": 0.01038704, "balance_loss_clip": 1.03651595, "balance_loss_mlp": 1.02689075, "epoch": 0.8399819630241996, "flos": 23149562843520.0, "grad_norm": 2.097520356637354, "language_loss": 0.71949625, "learning_rate": 2.625546512926633e-07, "loss": 0.74075466, "num_input_tokens_seen": 301396000, "step": 13971, "time_per_iteration": 2.6382222175598145 }, { "auxiliary_loss_clip": 0.01085619, "auxiliary_loss_mlp": 0.01031304, "balance_loss_clip": 1.03411746, "balance_loss_mlp": 1.01840544, "epoch": 0.8400420862768676, "flos": 16397423205120.0, "grad_norm": 1.7445644136224228, "language_loss": 0.77706194, "learning_rate": 2.623617843270358e-07, "loss": 0.79823112, "num_input_tokens_seen": 301413160, "step": 13972, "time_per_iteration": 2.637141227722168 }, { "auxiliary_loss_clip": 0.01041674, "auxiliary_loss_mlp": 0.01037854, "balance_loss_clip": 1.03100634, "balance_loss_mlp": 1.02458596, "epoch": 0.8401022095295355, "flos": 21287486816640.0, "grad_norm": 1.304807190993185, "language_loss": 0.68481863, "learning_rate": 2.6216898325282333e-07, "loss": 0.70561385, "num_input_tokens_seen": 301433325, "step": 13973, "time_per_iteration": 2.7618348598480225 }, { "auxiliary_loss_clip": 0.01088741, "auxiliary_loss_mlp": 0.01030742, "balance_loss_clip": 1.03717828, "balance_loss_mlp": 1.01786125, "epoch": 0.8401623327822035, "flos": 17311313963520.0, "grad_norm": 2.2621035315363858, "language_loss": 0.78135633, "learning_rate": 2.619762480773382e-07, "loss": 0.80255115, "num_input_tokens_seen": 301450265, "step": 13974, "time_per_iteration": 2.6674814224243164 }, { "auxiliary_loss_clip": 0.01095006, "auxiliary_loss_mlp": 0.01030636, "balance_loss_clip": 1.03827214, "balance_loss_mlp": 1.01826859, "epoch": 0.8402224560348714, "flos": 22236677665920.0, "grad_norm": 1.513610095867281, "language_loss": 0.7256, "learning_rate": 2.617835788078868e-07, "loss": 0.74685645, "num_input_tokens_seen": 301470760, "step": 13975, "time_per_iteration": 2.838907241821289 }, { "auxiliary_loss_clip": 0.01089044, "auxiliary_loss_mlp": 0.01025952, "balance_loss_clip": 1.03686631, "balance_loss_mlp": 1.01353598, "epoch": 0.8402825792875395, "flos": 20229953569920.0, "grad_norm": 7.753004351668279, "language_loss": 0.72390342, "learning_rate": 2.6159097545177645e-07, "loss": 0.74505341, "num_input_tokens_seen": 301489425, "step": 13976, "time_per_iteration": 2.726900100708008 }, { "auxiliary_loss_clip": 0.01107341, "auxiliary_loss_mlp": 0.00769496, "balance_loss_clip": 1.03678119, "balance_loss_mlp": 1.00013971, "epoch": 0.8403427025402074, "flos": 23289973107840.0, "grad_norm": 1.8413083341315597, "language_loss": 0.71979779, "learning_rate": 2.61398438016311e-07, "loss": 0.7385661, "num_input_tokens_seen": 301508885, "step": 13977, "time_per_iteration": 2.630323886871338 }, { "auxiliary_loss_clip": 0.01096098, "auxiliary_loss_mlp": 0.01032339, "balance_loss_clip": 1.03397727, "balance_loss_mlp": 1.02011466, "epoch": 0.8404028257928754, "flos": 32675586278400.0, "grad_norm": 1.7930366312861392, "language_loss": 0.68852651, "learning_rate": 2.6120596650879043e-07, "loss": 0.70981085, "num_input_tokens_seen": 301533780, "step": 13978, "time_per_iteration": 2.7467479705810547 }, { "auxiliary_loss_clip": 0.01071792, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.03347301, "balance_loss_mlp": 1.01965952, "epoch": 0.8404629490455434, "flos": 16180522928640.0, "grad_norm": 1.8105667854844871, "language_loss": 0.77938527, "learning_rate": 2.610135609365145e-07, "loss": 0.80042142, "num_input_tokens_seen": 301551775, "step": 13979, "time_per_iteration": 2.6985831260681152 }, { "auxiliary_loss_clip": 0.0109651, "auxiliary_loss_mlp": 0.01030498, "balance_loss_clip": 1.04012775, "balance_loss_mlp": 1.01822543, "epoch": 0.8405230722982113, "flos": 15194451790080.0, "grad_norm": 2.0614080045574714, "language_loss": 0.77732342, "learning_rate": 2.60821221306778e-07, "loss": 0.79859352, "num_input_tokens_seen": 301570495, "step": 13980, "time_per_iteration": 2.5943267345428467 }, { "auxiliary_loss_clip": 0.01073604, "auxiliary_loss_mlp": 0.01029935, "balance_loss_clip": 1.03548491, "balance_loss_mlp": 1.01782358, "epoch": 0.8405831955508793, "flos": 27812418975360.0, "grad_norm": 5.854551943090863, "language_loss": 0.86627793, "learning_rate": 2.606289476268757e-07, "loss": 0.88731331, "num_input_tokens_seen": 301591705, "step": 13981, "time_per_iteration": 2.742199182510376 }, { "auxiliary_loss_clip": 0.01097126, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.03606057, "balance_loss_mlp": 1.02131581, "epoch": 0.8406433188035473, "flos": 23769452782080.0, "grad_norm": 2.935607999333321, "language_loss": 0.67501163, "learning_rate": 2.6043673990409745e-07, "loss": 0.69631881, "num_input_tokens_seen": 301611670, "step": 13982, "time_per_iteration": 2.6252353191375732 }, { "auxiliary_loss_clip": 0.01061743, "auxiliary_loss_mlp": 0.01041459, "balance_loss_clip": 1.03561366, "balance_loss_mlp": 1.02742803, "epoch": 0.8407034420562153, "flos": 29205681667200.0, "grad_norm": 3.073966172290324, "language_loss": 0.67936915, "learning_rate": 2.602445981457324e-07, "loss": 0.70040119, "num_input_tokens_seen": 301632540, "step": 13983, "time_per_iteration": 2.7724905014038086 }, { "auxiliary_loss_clip": 0.01069644, "auxiliary_loss_mlp": 0.01032871, "balance_loss_clip": 1.03083551, "balance_loss_mlp": 1.01959062, "epoch": 0.8407635653088832, "flos": 26360084367360.0, "grad_norm": 1.7674861859482776, "language_loss": 0.79221404, "learning_rate": 2.6005252235906684e-07, "loss": 0.8132391, "num_input_tokens_seen": 301651480, "step": 13984, "time_per_iteration": 2.7457640171051025 }, { "auxiliary_loss_clip": 0.01094285, "auxiliary_loss_mlp": 0.01033616, "balance_loss_clip": 1.03387666, "balance_loss_mlp": 1.02156985, "epoch": 0.8408236885615512, "flos": 21468799693440.0, "grad_norm": 2.394750373647798, "language_loss": 0.59764493, "learning_rate": 2.598605125513842e-07, "loss": 0.6189239, "num_input_tokens_seen": 301670010, "step": 13985, "time_per_iteration": 2.6200110912323 }, { "auxiliary_loss_clip": 0.01067816, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.03496993, "balance_loss_mlp": 1.01653397, "epoch": 0.8408838118142191, "flos": 22963724853120.0, "grad_norm": 1.5708649671091988, "language_loss": 0.82083929, "learning_rate": 2.5966856872996467e-07, "loss": 0.84181106, "num_input_tokens_seen": 301689785, "step": 13986, "time_per_iteration": 2.728940725326538 }, { "auxiliary_loss_clip": 0.01088746, "auxiliary_loss_mlp": 0.0077024, "balance_loss_clip": 1.03921962, "balance_loss_mlp": 1.00023127, "epoch": 0.8409439350668871, "flos": 26800026145920.0, "grad_norm": 1.4303842163720517, "language_loss": 0.6583513, "learning_rate": 2.5947669090208755e-07, "loss": 0.67694116, "num_input_tokens_seen": 301712225, "step": 13987, "time_per_iteration": 4.393038988113403 }, { "auxiliary_loss_clip": 0.01109413, "auxiliary_loss_mlp": 0.00770439, "balance_loss_clip": 1.03814602, "balance_loss_mlp": 1.00023389, "epoch": 0.841004058319555, "flos": 26578672583040.0, "grad_norm": 2.100312722202425, "language_loss": 0.67510009, "learning_rate": 2.5928487907502906e-07, "loss": 0.69389856, "num_input_tokens_seen": 301730955, "step": 13988, "time_per_iteration": 4.25507664680481 }, { "auxiliary_loss_clip": 0.01099532, "auxiliary_loss_mlp": 0.01036728, "balance_loss_clip": 1.04084682, "balance_loss_mlp": 1.02341866, "epoch": 0.8410641815722231, "flos": 14501878680960.0, "grad_norm": 2.2740432778318143, "language_loss": 0.81379843, "learning_rate": 2.590931332560622e-07, "loss": 0.83516109, "num_input_tokens_seen": 301746930, "step": 13989, "time_per_iteration": 2.584982395172119 }, { "auxiliary_loss_clip": 0.01096831, "auxiliary_loss_mlp": 0.0103085, "balance_loss_clip": 1.03519654, "balance_loss_mlp": 1.01829755, "epoch": 0.841124304824891, "flos": 29166682475520.0, "grad_norm": 1.6804070387823404, "language_loss": 0.75063282, "learning_rate": 2.5890145345245826e-07, "loss": 0.77190965, "num_input_tokens_seen": 301766945, "step": 13990, "time_per_iteration": 4.359278440475464 }, { "auxiliary_loss_clip": 0.01093958, "auxiliary_loss_mlp": 0.01031281, "balance_loss_clip": 1.03545666, "balance_loss_mlp": 1.01897252, "epoch": 0.841184428077559, "flos": 22412028885120.0, "grad_norm": 1.7221123856133072, "language_loss": 0.80666637, "learning_rate": 2.5870983967148597e-07, "loss": 0.82791877, "num_input_tokens_seen": 301785460, "step": 13991, "time_per_iteration": 4.206341743469238 }, { "auxiliary_loss_clip": 0.01070481, "auxiliary_loss_mlp": 0.01033414, "balance_loss_clip": 1.0353756, "balance_loss_mlp": 1.0215528, "epoch": 0.841244551330227, "flos": 22962791099520.0, "grad_norm": 2.0828857174593263, "language_loss": 0.70396578, "learning_rate": 2.585182919204105e-07, "loss": 0.72500479, "num_input_tokens_seen": 301804180, "step": 13992, "time_per_iteration": 2.692427158355713 }, { "auxiliary_loss_clip": 0.01075291, "auxiliary_loss_mlp": 0.01027217, "balance_loss_clip": 1.03414965, "balance_loss_mlp": 1.01490271, "epoch": 0.8413046745828949, "flos": 21032736583680.0, "grad_norm": 3.8503455300269427, "language_loss": 0.76960343, "learning_rate": 2.583268102064959e-07, "loss": 0.79062855, "num_input_tokens_seen": 301823670, "step": 13993, "time_per_iteration": 2.704113006591797 }, { "auxiliary_loss_clip": 0.01102579, "auxiliary_loss_mlp": 0.01036453, "balance_loss_clip": 1.0354774, "balance_loss_mlp": 1.02206421, "epoch": 0.841364797835563, "flos": 27052082858880.0, "grad_norm": 6.277502737271607, "language_loss": 0.74242276, "learning_rate": 2.5813539453700393e-07, "loss": 0.76381308, "num_input_tokens_seen": 301845890, "step": 13994, "time_per_iteration": 2.6997077465057373 }, { "auxiliary_loss_clip": 0.01094097, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.0345273, "balance_loss_mlp": 1.01830745, "epoch": 0.8414249210882309, "flos": 17895688329600.0, "grad_norm": 1.5616526510476096, "language_loss": 0.5941689, "learning_rate": 2.5794404491919163e-07, "loss": 0.61540318, "num_input_tokens_seen": 301863985, "step": 13995, "time_per_iteration": 2.6176936626434326 }, { "auxiliary_loss_clip": 0.01095561, "auxiliary_loss_mlp": 0.01031608, "balance_loss_clip": 1.035285, "balance_loss_mlp": 1.01885819, "epoch": 0.8414850443408989, "flos": 25441201618560.0, "grad_norm": 2.764700779392295, "language_loss": 0.71651798, "learning_rate": 2.577527613603163e-07, "loss": 0.73778963, "num_input_tokens_seen": 301882765, "step": 13996, "time_per_iteration": 2.596438407897949 }, { "auxiliary_loss_clip": 0.0108265, "auxiliary_loss_mlp": 0.01030717, "balance_loss_clip": 1.03388953, "balance_loss_mlp": 1.01880229, "epoch": 0.8415451675935668, "flos": 23220055284480.0, "grad_norm": 1.7462285917475873, "language_loss": 0.64240086, "learning_rate": 2.5756154386763017e-07, "loss": 0.66353452, "num_input_tokens_seen": 301902720, "step": 13997, "time_per_iteration": 2.7167398929595947 }, { "auxiliary_loss_clip": 0.01087567, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.03863931, "balance_loss_mlp": 1.02296853, "epoch": 0.8416052908462348, "flos": 18546496899840.0, "grad_norm": 1.8459858361991137, "language_loss": 0.82516265, "learning_rate": 2.5737039244838565e-07, "loss": 0.84639835, "num_input_tokens_seen": 301921245, "step": 13998, "time_per_iteration": 2.6906321048736572 }, { "auxiliary_loss_clip": 0.0110001, "auxiliary_loss_mlp": 0.00769946, "balance_loss_clip": 1.03833914, "balance_loss_mlp": 1.00016832, "epoch": 0.8416654140989027, "flos": 26105190480000.0, "grad_norm": 2.037627492824348, "language_loss": 0.80260479, "learning_rate": 2.5717930710982984e-07, "loss": 0.82130432, "num_input_tokens_seen": 301942320, "step": 13999, "time_per_iteration": 2.679971218109131 }, { "auxiliary_loss_clip": 0.01098013, "auxiliary_loss_mlp": 0.01033879, "balance_loss_clip": 1.03585649, "balance_loss_mlp": 1.02033651, "epoch": 0.8417255373515707, "flos": 26433270328320.0, "grad_norm": 2.9164172994343946, "language_loss": 0.66541272, "learning_rate": 2.569882878592096e-07, "loss": 0.68673158, "num_input_tokens_seen": 301963110, "step": 14000, "time_per_iteration": 2.6393961906433105 }, { "auxiliary_loss_clip": 0.011048, "auxiliary_loss_mlp": 0.01028492, "balance_loss_clip": 1.03878963, "balance_loss_mlp": 1.01545656, "epoch": 0.8417856606042387, "flos": 24717745791360.0, "grad_norm": 1.439326835594235, "language_loss": 0.79285717, "learning_rate": 2.5679733470376885e-07, "loss": 0.81419003, "num_input_tokens_seen": 301984915, "step": 14001, "time_per_iteration": 2.6358094215393066 }, { "auxiliary_loss_clip": 0.01045692, "auxiliary_loss_mlp": 0.01031649, "balance_loss_clip": 1.0337944, "balance_loss_mlp": 1.01975203, "epoch": 0.8418457838569067, "flos": 20850849089280.0, "grad_norm": 1.7593384488852517, "language_loss": 0.78821921, "learning_rate": 2.5660644765074703e-07, "loss": 0.80899262, "num_input_tokens_seen": 302004095, "step": 14002, "time_per_iteration": 2.7560184001922607 }, { "auxiliary_loss_clip": 0.01062189, "auxiliary_loss_mlp": 0.00769355, "balance_loss_clip": 1.03507459, "balance_loss_mlp": 1.00019288, "epoch": 0.8419059071095746, "flos": 28660629715200.0, "grad_norm": 1.490278429458478, "language_loss": 0.78022176, "learning_rate": 2.5641562670738334e-07, "loss": 0.79853719, "num_input_tokens_seen": 302027250, "step": 14003, "time_per_iteration": 2.792100429534912 }, { "auxiliary_loss_clip": 0.01083114, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.03756177, "balance_loss_mlp": 1.01619506, "epoch": 0.8419660303622426, "flos": 21653596189440.0, "grad_norm": 4.275398079582637, "language_loss": 0.65523028, "learning_rate": 2.5622487188091436e-07, "loss": 0.67634964, "num_input_tokens_seen": 302046950, "step": 14004, "time_per_iteration": 2.676882028579712 }, { "auxiliary_loss_clip": 0.01098301, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.03571546, "balance_loss_mlp": 1.01909709, "epoch": 0.8420261536149106, "flos": 25301114576640.0, "grad_norm": 2.012358102157947, "language_loss": 0.76216292, "learning_rate": 2.560341831785724e-07, "loss": 0.7834748, "num_input_tokens_seen": 302065470, "step": 14005, "time_per_iteration": 2.6246840953826904 }, { "auxiliary_loss_clip": 0.01072567, "auxiliary_loss_mlp": 0.00770849, "balance_loss_clip": 1.03307796, "balance_loss_mlp": 1.00026453, "epoch": 0.8420862768675785, "flos": 18763397176320.0, "grad_norm": 1.64958735251114, "language_loss": 0.77457279, "learning_rate": 2.5584356060758906e-07, "loss": 0.7930069, "num_input_tokens_seen": 302083190, "step": 14006, "time_per_iteration": 2.686828136444092 }, { "auxiliary_loss_clip": 0.01098645, "auxiliary_loss_mlp": 0.01036893, "balance_loss_clip": 1.03723645, "balance_loss_mlp": 1.02451313, "epoch": 0.8421464001202466, "flos": 18328052338560.0, "grad_norm": 2.595732898924613, "language_loss": 0.76791775, "learning_rate": 2.556530041751932e-07, "loss": 0.78927308, "num_input_tokens_seen": 302098820, "step": 14007, "time_per_iteration": 2.5972254276275635 }, { "auxiliary_loss_clip": 0.01081698, "auxiliary_loss_mlp": 0.01034223, "balance_loss_clip": 1.03605211, "balance_loss_mlp": 1.02137184, "epoch": 0.8422065233729145, "flos": 31537181560320.0, "grad_norm": 2.375931901998386, "language_loss": 0.65710688, "learning_rate": 2.554625138886102e-07, "loss": 0.67826605, "num_input_tokens_seen": 302117075, "step": 14008, "time_per_iteration": 2.700505256652832 }, { "auxiliary_loss_clip": 0.01019521, "auxiliary_loss_mlp": 0.01001888, "balance_loss_clip": 1.00692487, "balance_loss_mlp": 1.00089824, "epoch": 0.8422666466255825, "flos": 64298128510080.0, "grad_norm": 0.7086546699989251, "language_loss": 0.5692749, "learning_rate": 2.552720897550631e-07, "loss": 0.58948898, "num_input_tokens_seen": 302179735, "step": 14009, "time_per_iteration": 3.2387187480926514 }, { "auxiliary_loss_clip": 0.01039857, "auxiliary_loss_mlp": 0.01034967, "balance_loss_clip": 1.03280532, "balance_loss_mlp": 1.02329016, "epoch": 0.8423267698782504, "flos": 24316731377280.0, "grad_norm": 1.394156026072437, "language_loss": 0.77893424, "learning_rate": 2.5508173178177304e-07, "loss": 0.79968244, "num_input_tokens_seen": 302202055, "step": 14010, "time_per_iteration": 2.8507986068725586 }, { "auxiliary_loss_clip": 0.01113646, "auxiliary_loss_mlp": 0.01037844, "balance_loss_clip": 1.03962326, "balance_loss_mlp": 1.0242126, "epoch": 0.8423868931309184, "flos": 18296092212480.0, "grad_norm": 1.6155120229975741, "language_loss": 0.72607601, "learning_rate": 2.548914399759592e-07, "loss": 0.7475909, "num_input_tokens_seen": 302221360, "step": 14011, "time_per_iteration": 2.614745855331421 }, { "auxiliary_loss_clip": 0.01093355, "auxiliary_loss_mlp": 0.01039684, "balance_loss_clip": 1.0365963, "balance_loss_mlp": 1.02718472, "epoch": 0.8424470163835863, "flos": 23550218121600.0, "grad_norm": 1.762716946245802, "language_loss": 0.84175313, "learning_rate": 2.5470121434483636e-07, "loss": 0.86308348, "num_input_tokens_seen": 302240715, "step": 14012, "time_per_iteration": 2.872255325317383 }, { "auxiliary_loss_clip": 0.01100527, "auxiliary_loss_mlp": 0.01030747, "balance_loss_clip": 1.03485525, "balance_loss_mlp": 1.02031064, "epoch": 0.8425071396362543, "flos": 23769488695680.0, "grad_norm": 1.7021120391885685, "language_loss": 0.67887056, "learning_rate": 2.5451105489561884e-07, "loss": 0.70018327, "num_input_tokens_seen": 302260950, "step": 14013, "time_per_iteration": 2.603848457336426 }, { "auxiliary_loss_clip": 0.01115809, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.03945398, "balance_loss_mlp": 1.02304602, "epoch": 0.8425672628889223, "flos": 16178906816640.0, "grad_norm": 3.415074767080469, "language_loss": 0.78946209, "learning_rate": 2.5432096163551644e-07, "loss": 0.81098223, "num_input_tokens_seen": 302277500, "step": 14014, "time_per_iteration": 2.555556297302246 }, { "auxiliary_loss_clip": 0.01077492, "auxiliary_loss_mlp": 0.00770145, "balance_loss_clip": 1.03449714, "balance_loss_mlp": 1.00027716, "epoch": 0.8426273861415903, "flos": 23149131880320.0, "grad_norm": 1.667905320409494, "language_loss": 0.67027128, "learning_rate": 2.5413093457173884e-07, "loss": 0.68874758, "num_input_tokens_seen": 302297930, "step": 14015, "time_per_iteration": 2.7183566093444824 }, { "auxiliary_loss_clip": 0.011092, "auxiliary_loss_mlp": 0.01031339, "balance_loss_clip": 1.03803563, "balance_loss_mlp": 1.0183686, "epoch": 0.8426875093942582, "flos": 17457757712640.0, "grad_norm": 5.511316765631844, "language_loss": 0.76168728, "learning_rate": 2.5394097371149036e-07, "loss": 0.78309268, "num_input_tokens_seen": 302315735, "step": 14016, "time_per_iteration": 2.5260772705078125 }, { "auxiliary_loss_clip": 0.01086806, "auxiliary_loss_mlp": 0.01032087, "balance_loss_clip": 1.03610539, "balance_loss_mlp": 1.01919413, "epoch": 0.8427476326469262, "flos": 19640551299840.0, "grad_norm": 1.8433329789592472, "language_loss": 0.79657745, "learning_rate": 2.5375107906197544e-07, "loss": 0.81776643, "num_input_tokens_seen": 302332790, "step": 14017, "time_per_iteration": 2.630877733230591 }, { "auxiliary_loss_clip": 0.01087127, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.03714514, "balance_loss_mlp": 1.02009821, "epoch": 0.8428077558995941, "flos": 11941160146560.0, "grad_norm": 2.433761002198627, "language_loss": 0.63508832, "learning_rate": 2.5356125063039525e-07, "loss": 0.65627795, "num_input_tokens_seen": 302346490, "step": 14018, "time_per_iteration": 2.600435256958008 }, { "auxiliary_loss_clip": 0.0109746, "auxiliary_loss_mlp": 0.01036009, "balance_loss_clip": 1.03713536, "balance_loss_mlp": 1.02413559, "epoch": 0.8428678791522621, "flos": 10451729767680.0, "grad_norm": 2.058952264097869, "language_loss": 0.79526985, "learning_rate": 2.5337148842394687e-07, "loss": 0.81660461, "num_input_tokens_seen": 302363235, "step": 14019, "time_per_iteration": 2.606147289276123 }, { "auxiliary_loss_clip": 0.01066617, "auxiliary_loss_mlp": 0.01042966, "balance_loss_clip": 1.0320183, "balance_loss_mlp": 1.02731419, "epoch": 0.8429280024049302, "flos": 28767248259840.0, "grad_norm": 1.8880951217635216, "language_loss": 0.78381932, "learning_rate": 2.531817924498265e-07, "loss": 0.80491519, "num_input_tokens_seen": 302383270, "step": 14020, "time_per_iteration": 2.761439561843872 }, { "auxiliary_loss_clip": 0.01094532, "auxiliary_loss_mlp": 0.01027321, "balance_loss_clip": 1.03691518, "balance_loss_mlp": 1.01528084, "epoch": 0.8429881256575981, "flos": 19537093152000.0, "grad_norm": 1.619318951916878, "language_loss": 0.71194899, "learning_rate": 2.5299216271522805e-07, "loss": 0.73316747, "num_input_tokens_seen": 302401355, "step": 14021, "time_per_iteration": 2.5756282806396484 }, { "auxiliary_loss_clip": 0.01082102, "auxiliary_loss_mlp": 0.01039813, "balance_loss_clip": 1.03787649, "balance_loss_mlp": 1.02695012, "epoch": 0.8430482489102661, "flos": 24790931752320.0, "grad_norm": 1.6917414821142582, "language_loss": 0.69565594, "learning_rate": 2.5280259922734125e-07, "loss": 0.71687508, "num_input_tokens_seen": 302419515, "step": 14022, "time_per_iteration": 2.654576301574707 }, { "auxiliary_loss_clip": 0.01053571, "auxiliary_loss_mlp": 0.01034251, "balance_loss_clip": 1.03549337, "balance_loss_mlp": 1.02110815, "epoch": 0.843108372162934, "flos": 21544248211200.0, "grad_norm": 2.0704880658750264, "language_loss": 0.72135806, "learning_rate": 2.526131019933553e-07, "loss": 0.74223632, "num_input_tokens_seen": 302438280, "step": 14023, "time_per_iteration": 2.763561248779297 }, { "auxiliary_loss_clip": 0.01097817, "auxiliary_loss_mlp": 0.01036537, "balance_loss_clip": 1.03748226, "balance_loss_mlp": 1.02365077, "epoch": 0.843168495415602, "flos": 24608792862720.0, "grad_norm": 3.1279379432314496, "language_loss": 0.66840017, "learning_rate": 2.524236710204559e-07, "loss": 0.68974364, "num_input_tokens_seen": 302460860, "step": 14024, "time_per_iteration": 2.6798737049102783 }, { "auxiliary_loss_clip": 0.01094098, "auxiliary_loss_mlp": 0.01033274, "balance_loss_clip": 1.0358882, "balance_loss_mlp": 1.02064943, "epoch": 0.8432286186682699, "flos": 15122738286720.0, "grad_norm": 1.7785534436425128, "language_loss": 0.80463433, "learning_rate": 2.522343063158261e-07, "loss": 0.82590806, "num_input_tokens_seen": 302476980, "step": 14025, "time_per_iteration": 2.5957210063934326 }, { "auxiliary_loss_clip": 0.01094269, "auxiliary_loss_mlp": 0.01032342, "balance_loss_clip": 1.03669548, "balance_loss_mlp": 1.02171469, "epoch": 0.843288741920938, "flos": 20301882554880.0, "grad_norm": 1.7599057641282252, "language_loss": 0.77854842, "learning_rate": 2.5204500788664606e-07, "loss": 0.79981452, "num_input_tokens_seen": 302496380, "step": 14026, "time_per_iteration": 4.200474500656128 }, { "auxiliary_loss_clip": 0.01082991, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.03349411, "balance_loss_mlp": 1.02325487, "epoch": 0.8433488651736059, "flos": 23332096782720.0, "grad_norm": 1.3875448337644876, "language_loss": 0.8256402, "learning_rate": 2.518557757400945e-07, "loss": 0.84682918, "num_input_tokens_seen": 302516845, "step": 14027, "time_per_iteration": 2.649754524230957 }, { "auxiliary_loss_clip": 0.01083401, "auxiliary_loss_mlp": 0.01029357, "balance_loss_clip": 1.03570163, "balance_loss_mlp": 1.01768661, "epoch": 0.8434089884262739, "flos": 39458105844480.0, "grad_norm": 1.6608095267116312, "language_loss": 0.56683648, "learning_rate": 2.5166660988334754e-07, "loss": 0.58796406, "num_input_tokens_seen": 302538865, "step": 14028, "time_per_iteration": 4.3750526905059814 }, { "auxiliary_loss_clip": 0.01082684, "auxiliary_loss_mlp": 0.0102599, "balance_loss_clip": 1.03466272, "balance_loss_mlp": 1.01414621, "epoch": 0.8434691116789418, "flos": 23768842250880.0, "grad_norm": 2.5757916535354304, "language_loss": 0.64079869, "learning_rate": 2.51477510323578e-07, "loss": 0.66188538, "num_input_tokens_seen": 302557970, "step": 14029, "time_per_iteration": 4.223181962966919 }, { "auxiliary_loss_clip": 0.01105336, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.037485, "balance_loss_mlp": 1.02116728, "epoch": 0.8435292349316098, "flos": 22671411972480.0, "grad_norm": 2.654906642720587, "language_loss": 0.7511518, "learning_rate": 2.51288477067956e-07, "loss": 0.77253079, "num_input_tokens_seen": 302578915, "step": 14030, "time_per_iteration": 4.182165145874023 }, { "auxiliary_loss_clip": 0.01087432, "auxiliary_loss_mlp": 0.01035541, "balance_loss_clip": 1.03771615, "balance_loss_mlp": 1.02353668, "epoch": 0.8435893581842777, "flos": 18843622202880.0, "grad_norm": 1.6649016404625991, "language_loss": 0.83075505, "learning_rate": 2.510995101236502e-07, "loss": 0.85198474, "num_input_tokens_seen": 302596300, "step": 14031, "time_per_iteration": 2.6392641067504883 }, { "auxiliary_loss_clip": 0.01084779, "auxiliary_loss_mlp": 0.01029478, "balance_loss_clip": 1.03526592, "balance_loss_mlp": 1.01829624, "epoch": 0.8436494814369457, "flos": 20704225772160.0, "grad_norm": 1.8193190780443504, "language_loss": 0.80412525, "learning_rate": 2.509106094978266e-07, "loss": 0.82526779, "num_input_tokens_seen": 302614975, "step": 14032, "time_per_iteration": 2.640856981277466 }, { "auxiliary_loss_clip": 0.01072594, "auxiliary_loss_mlp": 0.01035806, "balance_loss_clip": 1.03194261, "balance_loss_mlp": 1.02130389, "epoch": 0.8437096046896138, "flos": 22674177319680.0, "grad_norm": 1.5175948868756235, "language_loss": 0.75642312, "learning_rate": 2.507217751976478e-07, "loss": 0.77750713, "num_input_tokens_seen": 302636415, "step": 14033, "time_per_iteration": 2.6690027713775635 }, { "auxiliary_loss_clip": 0.01070256, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.03320062, "balance_loss_mlp": 1.02597451, "epoch": 0.8437697279422817, "flos": 16180127879040.0, "grad_norm": 1.777468155857912, "language_loss": 0.83613944, "learning_rate": 2.505330072302743e-07, "loss": 0.85721743, "num_input_tokens_seen": 302653605, "step": 14034, "time_per_iteration": 2.765951156616211 }, { "auxiliary_loss_clip": 0.01074581, "auxiliary_loss_mlp": 0.01032138, "balance_loss_clip": 1.03461361, "balance_loss_mlp": 1.01791012, "epoch": 0.8438298511949497, "flos": 28765847629440.0, "grad_norm": 1.4932135863758922, "language_loss": 0.78466785, "learning_rate": 2.503443056028656e-07, "loss": 0.80573499, "num_input_tokens_seen": 302673965, "step": 14035, "time_per_iteration": 2.76178240776062 }, { "auxiliary_loss_clip": 0.01093632, "auxiliary_loss_mlp": 0.01036882, "balance_loss_clip": 1.03451896, "balance_loss_mlp": 1.02403092, "epoch": 0.8438899744476176, "flos": 33724284779520.0, "grad_norm": 1.361908156116711, "language_loss": 0.72181344, "learning_rate": 2.501556703225751e-07, "loss": 0.74311858, "num_input_tokens_seen": 302695560, "step": 14036, "time_per_iteration": 2.719937562942505 }, { "auxiliary_loss_clip": 0.01103676, "auxiliary_loss_mlp": 0.01025959, "balance_loss_clip": 1.03616214, "balance_loss_mlp": 1.01573718, "epoch": 0.8439500977002856, "flos": 25110787386240.0, "grad_norm": 1.7256131227226181, "language_loss": 0.69647789, "learning_rate": 2.49967101396557e-07, "loss": 0.71777427, "num_input_tokens_seen": 302713480, "step": 14037, "time_per_iteration": 2.581303596496582 }, { "auxiliary_loss_clip": 0.01107935, "auxiliary_loss_mlp": 0.01026894, "balance_loss_clip": 1.03714299, "balance_loss_mlp": 1.01509237, "epoch": 0.8440102209529535, "flos": 32850362880000.0, "grad_norm": 1.8136551952010338, "language_loss": 0.69107378, "learning_rate": 2.4977859883196227e-07, "loss": 0.71242201, "num_input_tokens_seen": 302736860, "step": 14038, "time_per_iteration": 2.6723809242248535 }, { "auxiliary_loss_clip": 0.01051869, "auxiliary_loss_mlp": 0.01039696, "balance_loss_clip": 1.03102171, "balance_loss_mlp": 1.02648771, "epoch": 0.8440703442056215, "flos": 23730202195200.0, "grad_norm": 1.5801528390801436, "language_loss": 0.76572794, "learning_rate": 2.49590162635938e-07, "loss": 0.78664356, "num_input_tokens_seen": 302757745, "step": 14039, "time_per_iteration": 2.721997022628784 }, { "auxiliary_loss_clip": 0.0111525, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.03972554, "balance_loss_mlp": 1.01560223, "epoch": 0.8441304674582895, "flos": 20193719725440.0, "grad_norm": 1.8472982875687889, "language_loss": 0.79579616, "learning_rate": 2.4940179281563046e-07, "loss": 0.81723017, "num_input_tokens_seen": 302774885, "step": 14040, "time_per_iteration": 2.531126022338867 }, { "auxiliary_loss_clip": 0.01077191, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.03676081, "balance_loss_mlp": 1.02646196, "epoch": 0.8441905907109575, "flos": 20219897761920.0, "grad_norm": 1.9576992195932046, "language_loss": 0.69267452, "learning_rate": 2.492134893781821e-07, "loss": 0.71383929, "num_input_tokens_seen": 302791035, "step": 14041, "time_per_iteration": 2.749387741088867 }, { "auxiliary_loss_clip": 0.01087824, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.03482831, "balance_loss_mlp": 1.02474546, "epoch": 0.8442507139636254, "flos": 13516453987200.0, "grad_norm": 1.879973628715058, "language_loss": 0.68978488, "learning_rate": 2.490252523307341e-07, "loss": 0.71103394, "num_input_tokens_seen": 302808650, "step": 14042, "time_per_iteration": 2.656613826751709 }, { "auxiliary_loss_clip": 0.01085316, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.03706896, "balance_loss_mlp": 1.02187014, "epoch": 0.8443108372162934, "flos": 18220212731520.0, "grad_norm": 3.089358843777989, "language_loss": 0.74717695, "learning_rate": 2.4883708168042373e-07, "loss": 0.76836395, "num_input_tokens_seen": 302824605, "step": 14043, "time_per_iteration": 2.633385181427002 }, { "auxiliary_loss_clip": 0.01107453, "auxiliary_loss_mlp": 0.00769638, "balance_loss_clip": 1.03682041, "balance_loss_mlp": 1.00010276, "epoch": 0.8443709604689613, "flos": 16105110324480.0, "grad_norm": 2.7829513982165306, "language_loss": 0.7167477, "learning_rate": 2.486489774343865e-07, "loss": 0.73551863, "num_input_tokens_seen": 302840170, "step": 14044, "time_per_iteration": 2.5848615169525146 }, { "auxiliary_loss_clip": 0.01085792, "auxiliary_loss_mlp": 0.01029904, "balance_loss_clip": 1.03579986, "balance_loss_mlp": 1.01772702, "epoch": 0.8444310837216293, "flos": 18512130562560.0, "grad_norm": 1.5511958815264777, "language_loss": 0.74899876, "learning_rate": 2.484609395997559e-07, "loss": 0.77015567, "num_input_tokens_seen": 302858320, "step": 14045, "time_per_iteration": 2.6302268505096436 }, { "auxiliary_loss_clip": 0.01086761, "auxiliary_loss_mlp": 0.00769733, "balance_loss_clip": 1.03393674, "balance_loss_mlp": 1.00021636, "epoch": 0.8444912069742974, "flos": 14939845211520.0, "grad_norm": 1.9839329932661167, "language_loss": 0.78436804, "learning_rate": 2.4827296818366216e-07, "loss": 0.80293298, "num_input_tokens_seen": 302875255, "step": 14046, "time_per_iteration": 2.6413092613220215 }, { "auxiliary_loss_clip": 0.01081685, "auxiliary_loss_mlp": 0.01035179, "balance_loss_clip": 1.03393447, "balance_loss_mlp": 1.02033782, "epoch": 0.8445513302269653, "flos": 20120318282880.0, "grad_norm": 2.2863023721610842, "language_loss": 0.7816503, "learning_rate": 2.4808506319323255e-07, "loss": 0.80281889, "num_input_tokens_seen": 302894690, "step": 14047, "time_per_iteration": 2.6660380363464355 }, { "auxiliary_loss_clip": 0.01086084, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.03934455, "balance_loss_mlp": 1.01988101, "epoch": 0.8446114534796333, "flos": 31170928533120.0, "grad_norm": 1.8722756124069524, "language_loss": 0.72262931, "learning_rate": 2.478972246355935e-07, "loss": 0.74380839, "num_input_tokens_seen": 302912405, "step": 14048, "time_per_iteration": 2.750633716583252 }, { "auxiliary_loss_clip": 0.01029086, "auxiliary_loss_mlp": 0.01033735, "balance_loss_clip": 1.03568673, "balance_loss_mlp": 1.02149785, "epoch": 0.8446715767323012, "flos": 23948323534080.0, "grad_norm": 1.6779262102032728, "language_loss": 0.73663235, "learning_rate": 2.477094525178667e-07, "loss": 0.75726056, "num_input_tokens_seen": 302932525, "step": 14049, "time_per_iteration": 3.1203606128692627 }, { "auxiliary_loss_clip": 0.01019667, "auxiliary_loss_mlp": 0.00751068, "balance_loss_clip": 1.00710368, "balance_loss_mlp": 0.99964279, "epoch": 0.8447316999849692, "flos": 67984897484160.0, "grad_norm": 0.890275680771782, "language_loss": 0.60581625, "learning_rate": 2.475217468471729e-07, "loss": 0.62352359, "num_input_tokens_seen": 302991285, "step": 14050, "time_per_iteration": 3.2392361164093018 }, { "auxiliary_loss_clip": 0.01082426, "auxiliary_loss_mlp": 0.00772259, "balance_loss_clip": 1.03367877, "balance_loss_mlp": 1.00022018, "epoch": 0.8447918232376371, "flos": 22418924296320.0, "grad_norm": 2.8956487608781036, "language_loss": 0.72659081, "learning_rate": 2.473341076306303e-07, "loss": 0.74513769, "num_input_tokens_seen": 303009515, "step": 14051, "time_per_iteration": 2.6661341190338135 }, { "auxiliary_loss_clip": 0.01095777, "auxiliary_loss_mlp": 0.01027878, "balance_loss_clip": 1.03622103, "balance_loss_mlp": 1.01606417, "epoch": 0.8448519464903052, "flos": 23694147918720.0, "grad_norm": 1.811318817116214, "language_loss": 0.74613708, "learning_rate": 2.471465348753547e-07, "loss": 0.76737368, "num_input_tokens_seen": 303026905, "step": 14052, "time_per_iteration": 2.7032968997955322 }, { "auxiliary_loss_clip": 0.0107808, "auxiliary_loss_mlp": 0.01028694, "balance_loss_clip": 1.03693604, "balance_loss_mlp": 1.01800132, "epoch": 0.8449120697429731, "flos": 13735904129280.0, "grad_norm": 2.027055068247027, "language_loss": 0.73807508, "learning_rate": 2.469590285884575e-07, "loss": 0.75914282, "num_input_tokens_seen": 303045245, "step": 14053, "time_per_iteration": 2.6814658641815186 }, { "auxiliary_loss_clip": 0.01092814, "auxiliary_loss_mlp": 0.01029071, "balance_loss_clip": 1.03634143, "balance_loss_mlp": 1.01660776, "epoch": 0.8449721929956411, "flos": 20886795624960.0, "grad_norm": 1.652927903849228, "language_loss": 0.73763537, "learning_rate": 2.467715887770494e-07, "loss": 0.75885427, "num_input_tokens_seen": 303065205, "step": 14054, "time_per_iteration": 2.7116918563842773 }, { "auxiliary_loss_clip": 0.01101862, "auxiliary_loss_mlp": 0.0103134, "balance_loss_clip": 1.03724992, "balance_loss_mlp": 1.01904297, "epoch": 0.845032316248309, "flos": 33216939129600.0, "grad_norm": 1.366097555200395, "language_loss": 0.77969533, "learning_rate": 2.4658421544823895e-07, "loss": 0.8010273, "num_input_tokens_seen": 303088250, "step": 14055, "time_per_iteration": 2.73816180229187 }, { "auxiliary_loss_clip": 0.01096569, "auxiliary_loss_mlp": 0.01036149, "balance_loss_clip": 1.03655601, "balance_loss_mlp": 1.02406144, "epoch": 0.845092439500977, "flos": 23585230903680.0, "grad_norm": 1.670368654954579, "language_loss": 0.72893804, "learning_rate": 2.463969086091302e-07, "loss": 0.75026524, "num_input_tokens_seen": 303109280, "step": 14056, "time_per_iteration": 2.70538592338562 }, { "auxiliary_loss_clip": 0.01102046, "auxiliary_loss_mlp": 0.01036064, "balance_loss_clip": 1.03741002, "balance_loss_mlp": 1.02301073, "epoch": 0.8451525627536449, "flos": 13333920048000.0, "grad_norm": 2.25806090872676, "language_loss": 0.67116416, "learning_rate": 2.4620966826682686e-07, "loss": 0.69254524, "num_input_tokens_seen": 303126075, "step": 14057, "time_per_iteration": 2.6500983238220215 }, { "auxiliary_loss_clip": 0.01061896, "auxiliary_loss_mlp": 0.01031163, "balance_loss_clip": 1.03401434, "balance_loss_mlp": 1.01830649, "epoch": 0.8452126860063129, "flos": 27817985583360.0, "grad_norm": 14.445297752057405, "language_loss": 0.77819413, "learning_rate": 2.460224944284284e-07, "loss": 0.79912472, "num_input_tokens_seen": 303146920, "step": 14058, "time_per_iteration": 2.7543530464172363 }, { "auxiliary_loss_clip": 0.01110927, "auxiliary_loss_mlp": 0.0103341, "balance_loss_clip": 1.03813863, "balance_loss_mlp": 1.02150726, "epoch": 0.845272809258981, "flos": 27124694202240.0, "grad_norm": 1.550470920076943, "language_loss": 0.69772273, "learning_rate": 2.45835387101033e-07, "loss": 0.7191661, "num_input_tokens_seen": 303167885, "step": 14059, "time_per_iteration": 2.6287412643432617 }, { "auxiliary_loss_clip": 0.01112261, "auxiliary_loss_mlp": 0.01036279, "balance_loss_clip": 1.03764248, "balance_loss_mlp": 1.02282071, "epoch": 0.8453329325116489, "flos": 18332577452160.0, "grad_norm": 2.0803365731251278, "language_loss": 0.57748783, "learning_rate": 2.4564834629173516e-07, "loss": 0.59897316, "num_input_tokens_seen": 303185000, "step": 14060, "time_per_iteration": 2.5504209995269775 }, { "auxiliary_loss_clip": 0.0108835, "auxiliary_loss_mlp": 0.01037716, "balance_loss_clip": 1.0332464, "balance_loss_mlp": 1.02378607, "epoch": 0.8453930557643169, "flos": 22675254727680.0, "grad_norm": 1.5991086812981428, "language_loss": 0.756387, "learning_rate": 2.454613720076277e-07, "loss": 0.77764767, "num_input_tokens_seen": 303205210, "step": 14061, "time_per_iteration": 2.6448512077331543 }, { "auxiliary_loss_clip": 0.010831, "auxiliary_loss_mlp": 0.0102804, "balance_loss_clip": 1.0347625, "balance_loss_mlp": 1.01493907, "epoch": 0.8454531790169848, "flos": 22487261921280.0, "grad_norm": 2.9339159034227134, "language_loss": 0.71316195, "learning_rate": 2.452744642558013e-07, "loss": 0.73427337, "num_input_tokens_seen": 303224655, "step": 14062, "time_per_iteration": 2.6758151054382324 }, { "auxiliary_loss_clip": 0.00988143, "auxiliary_loss_mlp": 0.00999448, "balance_loss_clip": 1.01111102, "balance_loss_mlp": 0.99836904, "epoch": 0.8455133022696528, "flos": 58277848481280.0, "grad_norm": 0.6380346194484136, "language_loss": 0.52619612, "learning_rate": 2.450876230433432e-07, "loss": 0.54607201, "num_input_tokens_seen": 303289645, "step": 14063, "time_per_iteration": 3.2946317195892334 }, { "auxiliary_loss_clip": 0.01065561, "auxiliary_loss_mlp": 0.01025729, "balance_loss_clip": 1.03565407, "balance_loss_mlp": 1.01469028, "epoch": 0.8455734255223207, "flos": 21361283308800.0, "grad_norm": 1.9822663620489593, "language_loss": 0.82145214, "learning_rate": 2.449008483773378e-07, "loss": 0.84236503, "num_input_tokens_seen": 303308350, "step": 14064, "time_per_iteration": 2.6607656478881836 }, { "auxiliary_loss_clip": 0.01101966, "auxiliary_loss_mlp": 0.01033412, "balance_loss_clip": 1.03909707, "balance_loss_mlp": 1.02036476, "epoch": 0.8456335487749888, "flos": 20449260057600.0, "grad_norm": 1.7113113897930685, "language_loss": 0.72365153, "learning_rate": 2.447141402648685e-07, "loss": 0.74500531, "num_input_tokens_seen": 303325230, "step": 14065, "time_per_iteration": 4.209578037261963 }, { "auxiliary_loss_clip": 0.0107366, "auxiliary_loss_mlp": 0.01027599, "balance_loss_clip": 1.03522468, "balance_loss_mlp": 1.01634598, "epoch": 0.8456936720276567, "flos": 28840901097600.0, "grad_norm": 1.7659052654385863, "language_loss": 0.77673888, "learning_rate": 2.445274987130146e-07, "loss": 0.79775143, "num_input_tokens_seen": 303345810, "step": 14066, "time_per_iteration": 2.7587270736694336 }, { "auxiliary_loss_clip": 0.01072656, "auxiliary_loss_mlp": 0.01030619, "balance_loss_clip": 1.03802919, "balance_loss_mlp": 1.01832891, "epoch": 0.8457537952803247, "flos": 22672884430080.0, "grad_norm": 1.438108024739344, "language_loss": 0.69719791, "learning_rate": 2.4434092372885363e-07, "loss": 0.71823066, "num_input_tokens_seen": 303365140, "step": 14067, "time_per_iteration": 4.787655353546143 }, { "auxiliary_loss_clip": 0.01071366, "auxiliary_loss_mlp": 0.01028654, "balance_loss_clip": 1.03298759, "balance_loss_mlp": 1.01651883, "epoch": 0.8458139185329926, "flos": 33802929607680.0, "grad_norm": 2.205817987023731, "language_loss": 0.71166551, "learning_rate": 2.4415441531946144e-07, "loss": 0.73266566, "num_input_tokens_seen": 303386150, "step": 14068, "time_per_iteration": 4.351239204406738 }, { "auxiliary_loss_clip": 0.00992733, "auxiliary_loss_mlp": 0.01001464, "balance_loss_clip": 1.00806594, "balance_loss_mlp": 1.00047481, "epoch": 0.8458740417856606, "flos": 70295929603200.0, "grad_norm": 0.6926239489661511, "language_loss": 0.604882, "learning_rate": 2.4396797349190976e-07, "loss": 0.62482405, "num_input_tokens_seen": 303453770, "step": 14069, "time_per_iteration": 5.011239290237427 }, { "auxiliary_loss_clip": 0.01085111, "auxiliary_loss_mlp": 0.01030137, "balance_loss_clip": 1.0371033, "balance_loss_mlp": 1.01844859, "epoch": 0.8459341650383285, "flos": 24170862245760.0, "grad_norm": 1.60147052022308, "language_loss": 0.74564326, "learning_rate": 2.4378159825326804e-07, "loss": 0.76679569, "num_input_tokens_seen": 303474520, "step": 14070, "time_per_iteration": 2.651233196258545 }, { "auxiliary_loss_clip": 0.01061032, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.03419256, "balance_loss_mlp": 1.01784265, "epoch": 0.8459942882909965, "flos": 38181158369280.0, "grad_norm": 3.793310499972189, "language_loss": 0.66902626, "learning_rate": 2.435952896106039e-07, "loss": 0.68994129, "num_input_tokens_seen": 303497345, "step": 14071, "time_per_iteration": 2.863940954208374 }, { "auxiliary_loss_clip": 0.01019962, "auxiliary_loss_mlp": 0.00751058, "balance_loss_clip": 1.00760365, "balance_loss_mlp": 0.99957699, "epoch": 0.8460544115436646, "flos": 64118252177280.0, "grad_norm": 0.7316338741504227, "language_loss": 0.61046565, "learning_rate": 2.4340904757098313e-07, "loss": 0.62817585, "num_input_tokens_seen": 303554890, "step": 14072, "time_per_iteration": 3.041468858718872 }, { "auxiliary_loss_clip": 0.01069973, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.0375272, "balance_loss_mlp": 1.01801336, "epoch": 0.8461145347963325, "flos": 24170826332160.0, "grad_norm": 2.698687896203199, "language_loss": 0.72609383, "learning_rate": 2.4322287214146664e-07, "loss": 0.74710786, "num_input_tokens_seen": 303574380, "step": 14073, "time_per_iteration": 2.7544729709625244 }, { "auxiliary_loss_clip": 0.01091999, "auxiliary_loss_mlp": 0.01034187, "balance_loss_clip": 1.0379231, "balance_loss_mlp": 1.02047241, "epoch": 0.8461746580490005, "flos": 34893787697280.0, "grad_norm": 2.1945547327589976, "language_loss": 0.78341836, "learning_rate": 2.430367633291155e-07, "loss": 0.80468023, "num_input_tokens_seen": 303594910, "step": 14074, "time_per_iteration": 2.8085241317749023 }, { "auxiliary_loss_clip": 0.01099175, "auxiliary_loss_mlp": 0.01030891, "balance_loss_clip": 1.03856003, "balance_loss_mlp": 1.01867247, "epoch": 0.8462347813016684, "flos": 25557014044800.0, "grad_norm": 2.0589509569637143, "language_loss": 0.75481176, "learning_rate": 2.4285072114098583e-07, "loss": 0.77611244, "num_input_tokens_seen": 303613520, "step": 14075, "time_per_iteration": 2.6737287044525146 }, { "auxiliary_loss_clip": 0.01084327, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.03593063, "balance_loss_mlp": 1.0144732, "epoch": 0.8462949045543364, "flos": 21325336773120.0, "grad_norm": 2.312181037526072, "language_loss": 0.73324478, "learning_rate": 2.4266474558413355e-07, "loss": 0.75435758, "num_input_tokens_seen": 303631225, "step": 14076, "time_per_iteration": 2.6550984382629395 }, { "auxiliary_loss_clip": 0.01091988, "auxiliary_loss_mlp": 0.01032969, "balance_loss_clip": 1.03691387, "balance_loss_mlp": 1.02119696, "epoch": 0.8463550278070043, "flos": 22637440684800.0, "grad_norm": 2.1680045577224543, "language_loss": 0.78016102, "learning_rate": 2.4247883666560945e-07, "loss": 0.80141062, "num_input_tokens_seen": 303649175, "step": 14077, "time_per_iteration": 2.7090940475463867 }, { "auxiliary_loss_clip": 0.01075749, "auxiliary_loss_mlp": 0.0103478, "balance_loss_clip": 1.03529286, "balance_loss_mlp": 1.022102, "epoch": 0.8464151510596724, "flos": 13005588804480.0, "grad_norm": 3.514395307267888, "language_loss": 0.75203717, "learning_rate": 2.422929943924643e-07, "loss": 0.77314246, "num_input_tokens_seen": 303665915, "step": 14078, "time_per_iteration": 2.668720245361328 }, { "auxiliary_loss_clip": 0.01069196, "auxiliary_loss_mlp": 0.01025183, "balance_loss_clip": 1.0366447, "balance_loss_mlp": 1.01232052, "epoch": 0.8464752743123403, "flos": 15704921923200.0, "grad_norm": 3.2183911644001237, "language_loss": 0.85171533, "learning_rate": 2.4210721877174565e-07, "loss": 0.87265909, "num_input_tokens_seen": 303679985, "step": 14079, "time_per_iteration": 2.7119951248168945 }, { "auxiliary_loss_clip": 0.01084378, "auxiliary_loss_mlp": 0.01037369, "balance_loss_clip": 1.03693473, "balance_loss_mlp": 1.02382135, "epoch": 0.8465353975650083, "flos": 21653955325440.0, "grad_norm": 1.8047778580763019, "language_loss": 0.5904004, "learning_rate": 2.419215098104965e-07, "loss": 0.61161786, "num_input_tokens_seen": 303698470, "step": 14080, "time_per_iteration": 2.6963582038879395 }, { "auxiliary_loss_clip": 0.01084298, "auxiliary_loss_mlp": 0.01029709, "balance_loss_clip": 1.03614235, "balance_loss_mlp": 1.01678658, "epoch": 0.8465955208176762, "flos": 18515650095360.0, "grad_norm": 2.4057964944506174, "language_loss": 0.65874493, "learning_rate": 2.4173586751576014e-07, "loss": 0.67988491, "num_input_tokens_seen": 303716415, "step": 14081, "time_per_iteration": 2.68113112449646 }, { "auxiliary_loss_clip": 0.01096638, "auxiliary_loss_mlp": 0.01035444, "balance_loss_clip": 1.03579867, "balance_loss_mlp": 1.0226047, "epoch": 0.8466556440703442, "flos": 24200559815040.0, "grad_norm": 1.8302638990848867, "language_loss": 0.72922373, "learning_rate": 2.41550291894576e-07, "loss": 0.75054455, "num_input_tokens_seen": 303734490, "step": 14082, "time_per_iteration": 2.6815195083618164 }, { "auxiliary_loss_clip": 0.01055673, "auxiliary_loss_mlp": 0.01036867, "balance_loss_clip": 1.03194714, "balance_loss_mlp": 1.02327132, "epoch": 0.8467157673230121, "flos": 20375894528640.0, "grad_norm": 2.0055254774422666, "language_loss": 0.76295221, "learning_rate": 2.413647829539809e-07, "loss": 0.78387761, "num_input_tokens_seen": 303752310, "step": 14083, "time_per_iteration": 2.7438621520996094 }, { "auxiliary_loss_clip": 0.01061542, "auxiliary_loss_mlp": 0.01034056, "balance_loss_clip": 1.03273213, "balance_loss_mlp": 1.01982224, "epoch": 0.8467758905756801, "flos": 28473642489600.0, "grad_norm": 1.7272953079134175, "language_loss": 0.65860844, "learning_rate": 2.411793407010092e-07, "loss": 0.67956436, "num_input_tokens_seen": 303776065, "step": 14084, "time_per_iteration": 2.7735166549682617 }, { "auxiliary_loss_clip": 0.01067401, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.03815413, "balance_loss_mlp": 1.01846731, "epoch": 0.8468360138283482, "flos": 11692551139200.0, "grad_norm": 2.7040264249801766, "language_loss": 0.69605839, "learning_rate": 2.409939651426938e-07, "loss": 0.71703851, "num_input_tokens_seen": 303793500, "step": 14085, "time_per_iteration": 2.773153781890869 }, { "auxiliary_loss_clip": 0.01066275, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.03402877, "balance_loss_mlp": 1.01666391, "epoch": 0.8468961370810161, "flos": 24607859109120.0, "grad_norm": 1.5869517214902362, "language_loss": 0.71034825, "learning_rate": 2.408086562860634e-07, "loss": 0.73129559, "num_input_tokens_seen": 303814835, "step": 14086, "time_per_iteration": 2.778090476989746 }, { "auxiliary_loss_clip": 0.01091608, "auxiliary_loss_mlp": 0.01032202, "balance_loss_clip": 1.03516686, "balance_loss_mlp": 1.01981008, "epoch": 0.8469562603336841, "flos": 19609812236160.0, "grad_norm": 1.9986258796414704, "language_loss": 0.74891198, "learning_rate": 2.4062341413814445e-07, "loss": 0.77015007, "num_input_tokens_seen": 303834505, "step": 14087, "time_per_iteration": 2.659958600997925 }, { "auxiliary_loss_clip": 0.01080191, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.03761494, "balance_loss_mlp": 1.01394975, "epoch": 0.847016383586352, "flos": 22638949056000.0, "grad_norm": 1.3437593766156344, "language_loss": 0.74087977, "learning_rate": 2.4043823870596227e-07, "loss": 0.76194739, "num_input_tokens_seen": 303855050, "step": 14088, "time_per_iteration": 2.820697784423828 }, { "auxiliary_loss_clip": 0.01099232, "auxiliary_loss_mlp": 0.01032056, "balance_loss_clip": 1.03691757, "balance_loss_mlp": 1.01979518, "epoch": 0.84707650683902, "flos": 20960161153920.0, "grad_norm": 2.494250359435125, "language_loss": 0.7231648, "learning_rate": 2.402531299965387e-07, "loss": 0.74447769, "num_input_tokens_seen": 303875635, "step": 14089, "time_per_iteration": 2.6343815326690674 }, { "auxiliary_loss_clip": 0.01108775, "auxiliary_loss_mlp": 0.01028953, "balance_loss_clip": 1.03952324, "balance_loss_mlp": 1.01720452, "epoch": 0.8471366300916879, "flos": 24093007516800.0, "grad_norm": 1.3722946087239658, "language_loss": 0.79204518, "learning_rate": 2.400680880168928e-07, "loss": 0.81342244, "num_input_tokens_seen": 303896750, "step": 14090, "time_per_iteration": 2.6099236011505127 }, { "auxiliary_loss_clip": 0.01053519, "auxiliary_loss_mlp": 0.01040145, "balance_loss_clip": 1.03225899, "balance_loss_mlp": 1.02603018, "epoch": 0.847196753344356, "flos": 18332900674560.0, "grad_norm": 2.9954684546587553, "language_loss": 0.76710737, "learning_rate": 2.3988311277404085e-07, "loss": 0.78804398, "num_input_tokens_seen": 303915435, "step": 14091, "time_per_iteration": 2.780735492706299 }, { "auxiliary_loss_clip": 0.01028625, "auxiliary_loss_mlp": 0.01002892, "balance_loss_clip": 1.00622869, "balance_loss_mlp": 1.00184846, "epoch": 0.8472568765970239, "flos": 49567536956160.0, "grad_norm": 0.8179269563899582, "language_loss": 0.59413207, "learning_rate": 2.396982042749982e-07, "loss": 0.61444724, "num_input_tokens_seen": 303977245, "step": 14092, "time_per_iteration": 3.1960132122039795 }, { "auxiliary_loss_clip": 0.01081941, "auxiliary_loss_mlp": 0.01036569, "balance_loss_clip": 1.03254557, "balance_loss_mlp": 1.02275276, "epoch": 0.8473169998496919, "flos": 19279074781440.0, "grad_norm": 1.7883321120809321, "language_loss": 0.70245391, "learning_rate": 2.395133625267756e-07, "loss": 0.72363901, "num_input_tokens_seen": 303996055, "step": 14093, "time_per_iteration": 2.6437125205993652 }, { "auxiliary_loss_clip": 0.01105171, "auxiliary_loss_mlp": 0.01025923, "balance_loss_clip": 1.03583193, "balance_loss_mlp": 1.01443684, "epoch": 0.8473771231023598, "flos": 17675555829120.0, "grad_norm": 2.1012559182302866, "language_loss": 0.83147365, "learning_rate": 2.3932858753638263e-07, "loss": 0.85278457, "num_input_tokens_seen": 304012205, "step": 14094, "time_per_iteration": 2.5802862644195557 }, { "auxiliary_loss_clip": 0.01089017, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.03741288, "balance_loss_mlp": 1.01977515, "epoch": 0.8474372463550278, "flos": 26359761144960.0, "grad_norm": 2.04303085122746, "language_loss": 0.71497333, "learning_rate": 2.3914387931082626e-07, "loss": 0.7361744, "num_input_tokens_seen": 304033475, "step": 14095, "time_per_iteration": 2.6501832008361816 }, { "auxiliary_loss_clip": 0.01094545, "auxiliary_loss_mlp": 0.00769791, "balance_loss_clip": 1.03552461, "balance_loss_mlp": 1.00019312, "epoch": 0.8474973696076957, "flos": 23402050519680.0, "grad_norm": 1.904609327228077, "language_loss": 0.80488968, "learning_rate": 2.3895923785711105e-07, "loss": 0.82353306, "num_input_tokens_seen": 304051845, "step": 14096, "time_per_iteration": 2.644343376159668 }, { "auxiliary_loss_clip": 0.0110016, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.03743386, "balance_loss_mlp": 1.01790023, "epoch": 0.8475574928603637, "flos": 25075666863360.0, "grad_norm": 8.034016369371804, "language_loss": 0.77681863, "learning_rate": 2.387746631822374e-07, "loss": 0.79813015, "num_input_tokens_seen": 304069965, "step": 14097, "time_per_iteration": 2.793025255203247 }, { "auxiliary_loss_clip": 0.0107883, "auxiliary_loss_mlp": 0.0102726, "balance_loss_clip": 1.03687394, "balance_loss_mlp": 1.01560712, "epoch": 0.8476176161130318, "flos": 19966691813760.0, "grad_norm": 1.7033024624087645, "language_loss": 0.802845, "learning_rate": 2.385901552932048e-07, "loss": 0.82390594, "num_input_tokens_seen": 304086805, "step": 14098, "time_per_iteration": 2.675039052963257 }, { "auxiliary_loss_clip": 0.01092536, "auxiliary_loss_mlp": 0.00770177, "balance_loss_clip": 1.03604007, "balance_loss_mlp": 1.00013864, "epoch": 0.8476777393656997, "flos": 21285834791040.0, "grad_norm": 1.8975178373976451, "language_loss": 0.71665621, "learning_rate": 2.3840571419701062e-07, "loss": 0.73528326, "num_input_tokens_seen": 304105865, "step": 14099, "time_per_iteration": 2.5827932357788086 }, { "auxiliary_loss_clip": 0.01094872, "auxiliary_loss_mlp": 0.01032372, "balance_loss_clip": 1.03322709, "balance_loss_mlp": 1.01883566, "epoch": 0.8477378626183677, "flos": 29971476650880.0, "grad_norm": 2.118472556405624, "language_loss": 0.63617903, "learning_rate": 2.3822133990064787e-07, "loss": 0.65745145, "num_input_tokens_seen": 304128300, "step": 14100, "time_per_iteration": 2.723047971725464 }, { "auxiliary_loss_clip": 0.01099377, "auxiliary_loss_mlp": 0.01033502, "balance_loss_clip": 1.03651261, "balance_loss_mlp": 1.02066314, "epoch": 0.8477979858710356, "flos": 24237727413120.0, "grad_norm": 2.0266984363046876, "language_loss": 0.73806208, "learning_rate": 2.380370324111085e-07, "loss": 0.75939089, "num_input_tokens_seen": 304143695, "step": 14101, "time_per_iteration": 2.6568257808685303 }, { "auxiliary_loss_clip": 0.01098555, "auxiliary_loss_mlp": 0.01028521, "balance_loss_clip": 1.03505516, "balance_loss_mlp": 1.01662445, "epoch": 0.8478581091237036, "flos": 25593678852480.0, "grad_norm": 1.6724420871950227, "language_loss": 0.71237093, "learning_rate": 2.3785279173538163e-07, "loss": 0.73364168, "num_input_tokens_seen": 304165800, "step": 14102, "time_per_iteration": 2.72493052482605 }, { "auxiliary_loss_clip": 0.01084921, "auxiliary_loss_mlp": 0.01033048, "balance_loss_clip": 1.03555894, "balance_loss_mlp": 1.01940477, "epoch": 0.8479182323763715, "flos": 12057116227200.0, "grad_norm": 2.4752629302772426, "language_loss": 0.81727469, "learning_rate": 2.3766861788045366e-07, "loss": 0.83845437, "num_input_tokens_seen": 304182910, "step": 14103, "time_per_iteration": 2.723888874053955 }, { "auxiliary_loss_clip": 0.01109645, "auxiliary_loss_mlp": 0.01030985, "balance_loss_clip": 1.03859901, "balance_loss_mlp": 1.01881981, "epoch": 0.8479783556290396, "flos": 21433391861760.0, "grad_norm": 1.9133517562586435, "language_loss": 0.78571969, "learning_rate": 2.374845108533079e-07, "loss": 0.80712605, "num_input_tokens_seen": 304200175, "step": 14104, "time_per_iteration": 4.045240879058838 }, { "auxiliary_loss_clip": 0.01101779, "auxiliary_loss_mlp": 0.01037373, "balance_loss_clip": 1.03928828, "balance_loss_mlp": 1.02440929, "epoch": 0.8480384788817075, "flos": 19642634288640.0, "grad_norm": 1.8032304310085965, "language_loss": 0.78830254, "learning_rate": 2.3730047066092607e-07, "loss": 0.80969405, "num_input_tokens_seen": 304217775, "step": 14105, "time_per_iteration": 2.5720746517181396 }, { "auxiliary_loss_clip": 0.01083671, "auxiliary_loss_mlp": 0.01037196, "balance_loss_clip": 1.03580463, "balance_loss_mlp": 1.02209163, "epoch": 0.8480986021343755, "flos": 22489201255680.0, "grad_norm": 1.7624192448776133, "language_loss": 0.50159001, "learning_rate": 2.3711649731028749e-07, "loss": 0.52279866, "num_input_tokens_seen": 304235760, "step": 14106, "time_per_iteration": 4.288937330245972 }, { "auxiliary_loss_clip": 0.01077376, "auxiliary_loss_mlp": 0.01035024, "balance_loss_clip": 1.03691649, "balance_loss_mlp": 1.0228827, "epoch": 0.8481587253870434, "flos": 22090557139200.0, "grad_norm": 2.145828005559372, "language_loss": 0.75445443, "learning_rate": 2.3693259080836792e-07, "loss": 0.7755785, "num_input_tokens_seen": 304253985, "step": 14107, "time_per_iteration": 2.6221656799316406 }, { "auxiliary_loss_clip": 0.01076318, "auxiliary_loss_mlp": 0.0102835, "balance_loss_clip": 1.03518283, "balance_loss_mlp": 1.01601171, "epoch": 0.8482188486397114, "flos": 33582689366400.0, "grad_norm": 1.5246182504502446, "language_loss": 0.73586017, "learning_rate": 2.3674875116214087e-07, "loss": 0.75690687, "num_input_tokens_seen": 304276785, "step": 14108, "time_per_iteration": 5.8729071617126465 }, { "auxiliary_loss_clip": 0.01106391, "auxiliary_loss_mlp": 0.01029487, "balance_loss_clip": 1.03722811, "balance_loss_mlp": 1.01592076, "epoch": 0.8482789718923793, "flos": 20919402195840.0, "grad_norm": 1.650722642214462, "language_loss": 0.72323227, "learning_rate": 2.3656497837857836e-07, "loss": 0.74459112, "num_input_tokens_seen": 304296310, "step": 14109, "time_per_iteration": 2.633683443069458 }, { "auxiliary_loss_clip": 0.01039152, "auxiliary_loss_mlp": 0.01036217, "balance_loss_clip": 1.03288758, "balance_loss_mlp": 1.02361703, "epoch": 0.8483390951450474, "flos": 12896204912640.0, "grad_norm": 2.505097141687178, "language_loss": 0.74121177, "learning_rate": 2.3638127246464811e-07, "loss": 0.76196551, "num_input_tokens_seen": 304311715, "step": 14110, "time_per_iteration": 2.7661683559417725 }, { "auxiliary_loss_clip": 0.0105041, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.03519773, "balance_loss_mlp": 1.01922786, "epoch": 0.8483992183977154, "flos": 25081628520960.0, "grad_norm": 1.695497905568318, "language_loss": 0.75963587, "learning_rate": 2.3619763342731658e-07, "loss": 0.7804544, "num_input_tokens_seen": 304331910, "step": 14111, "time_per_iteration": 2.809145450592041 }, { "auxiliary_loss_clip": 0.01107437, "auxiliary_loss_mlp": 0.01029784, "balance_loss_clip": 1.03754044, "balance_loss_mlp": 1.018435, "epoch": 0.8484593416503833, "flos": 25557445008000.0, "grad_norm": 3.4199751822671955, "language_loss": 0.67615312, "learning_rate": 2.3601406127354772e-07, "loss": 0.69752538, "num_input_tokens_seen": 304351405, "step": 14112, "time_per_iteration": 2.576991081237793 }, { "auxiliary_loss_clip": 0.01093257, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.03299069, "balance_loss_mlp": 1.0202558, "epoch": 0.8485194649030513, "flos": 27198454780800.0, "grad_norm": 1.4428256767877636, "language_loss": 0.73642004, "learning_rate": 2.3583055601030312e-07, "loss": 0.75767583, "num_input_tokens_seen": 304372935, "step": 14113, "time_per_iteration": 2.6638875007629395 }, { "auxiliary_loss_clip": 0.01071779, "auxiliary_loss_mlp": 0.01031894, "balance_loss_clip": 1.03808439, "balance_loss_mlp": 1.01990139, "epoch": 0.8485795881557192, "flos": 24205910941440.0, "grad_norm": 16.765212760184376, "language_loss": 0.66891378, "learning_rate": 2.3564711764454003e-07, "loss": 0.68995047, "num_input_tokens_seen": 304393070, "step": 14114, "time_per_iteration": 2.71993088722229 }, { "auxiliary_loss_clip": 0.01111702, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.03860688, "balance_loss_mlp": 1.01900554, "epoch": 0.8486397114083872, "flos": 21141653598720.0, "grad_norm": 1.6329871649970922, "language_loss": 0.78943914, "learning_rate": 2.3546374618321495e-07, "loss": 0.81087166, "num_input_tokens_seen": 304411195, "step": 14115, "time_per_iteration": 2.5624794960021973 }, { "auxiliary_loss_clip": 0.01110202, "auxiliary_loss_mlp": 0.01033883, "balance_loss_clip": 1.03798008, "balance_loss_mlp": 1.02150321, "epoch": 0.8486998346610551, "flos": 19974772373760.0, "grad_norm": 1.8401751033694997, "language_loss": 0.78926462, "learning_rate": 2.3528044163328187e-07, "loss": 0.81070548, "num_input_tokens_seen": 304429425, "step": 14116, "time_per_iteration": 2.5436830520629883 }, { "auxiliary_loss_clip": 0.01101053, "auxiliary_loss_mlp": 0.0102868, "balance_loss_clip": 1.037081, "balance_loss_mlp": 1.01596665, "epoch": 0.8487599579137232, "flos": 19792310261760.0, "grad_norm": 1.863909931261485, "language_loss": 0.68563157, "learning_rate": 2.3509720400169076e-07, "loss": 0.70692891, "num_input_tokens_seen": 304447460, "step": 14117, "time_per_iteration": 2.580505609512329 }, { "auxiliary_loss_clip": 0.01089877, "auxiliary_loss_mlp": 0.01028249, "balance_loss_clip": 1.0346086, "balance_loss_mlp": 1.01571453, "epoch": 0.8488200811663911, "flos": 26396030903040.0, "grad_norm": 1.9028577306172345, "language_loss": 0.65127873, "learning_rate": 2.3491403329539096e-07, "loss": 0.67246002, "num_input_tokens_seen": 304468230, "step": 14118, "time_per_iteration": 2.670452356338501 }, { "auxiliary_loss_clip": 0.01066258, "auxiliary_loss_mlp": 0.01029959, "balance_loss_clip": 1.0340513, "balance_loss_mlp": 1.01815736, "epoch": 0.8488802044190591, "flos": 16359285939840.0, "grad_norm": 1.7651382162143987, "language_loss": 0.7343511, "learning_rate": 2.3473092952132757e-07, "loss": 0.75531328, "num_input_tokens_seen": 304484860, "step": 14119, "time_per_iteration": 2.681450128555298 }, { "auxiliary_loss_clip": 0.01076463, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.03407943, "balance_loss_mlp": 1.01667559, "epoch": 0.848940327671727, "flos": 19208869649280.0, "grad_norm": 2.0775692345074486, "language_loss": 0.77856582, "learning_rate": 2.345478926864446e-07, "loss": 0.7996341, "num_input_tokens_seen": 304503575, "step": 14120, "time_per_iteration": 2.6914706230163574 }, { "auxiliary_loss_clip": 0.01094944, "auxiliary_loss_mlp": 0.01029569, "balance_loss_clip": 1.03799891, "balance_loss_mlp": 1.01668835, "epoch": 0.849000450924395, "flos": 21871178824320.0, "grad_norm": 1.7424170846480949, "language_loss": 0.75571072, "learning_rate": 2.3436492279768227e-07, "loss": 0.7769559, "num_input_tokens_seen": 304525005, "step": 14121, "time_per_iteration": 2.6244821548461914 }, { "auxiliary_loss_clip": 0.00992252, "auxiliary_loss_mlp": 0.00999952, "balance_loss_clip": 1.00927687, "balance_loss_mlp": 0.9989683, "epoch": 0.8490605741770629, "flos": 71166475624320.0, "grad_norm": 1.3338181745049902, "language_loss": 0.60148805, "learning_rate": 2.3418201986197883e-07, "loss": 0.62141007, "num_input_tokens_seen": 304585220, "step": 14122, "time_per_iteration": 3.219271659851074 }, { "auxiliary_loss_clip": 0.01098712, "auxiliary_loss_mlp": 0.01031259, "balance_loss_clip": 1.03732467, "balance_loss_mlp": 1.01909947, "epoch": 0.849120697429731, "flos": 24973357950720.0, "grad_norm": 1.7666847271822834, "language_loss": 0.79593515, "learning_rate": 2.3399918388627048e-07, "loss": 0.81723487, "num_input_tokens_seen": 304604665, "step": 14123, "time_per_iteration": 2.696174144744873 }, { "auxiliary_loss_clip": 0.01095036, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 1.03751028, "balance_loss_mlp": 1.01858711, "epoch": 0.8491808206823989, "flos": 23032277959680.0, "grad_norm": 1.9536438787496402, "language_loss": 0.82910216, "learning_rate": 2.3381641487749016e-07, "loss": 0.85035485, "num_input_tokens_seen": 304620600, "step": 14124, "time_per_iteration": 2.7340493202209473 }, { "auxiliary_loss_clip": 0.01064676, "auxiliary_loss_mlp": 0.01034224, "balance_loss_clip": 1.03637028, "balance_loss_mlp": 1.0209558, "epoch": 0.8492409439350669, "flos": 23878549365120.0, "grad_norm": 1.89858398727176, "language_loss": 0.72199571, "learning_rate": 2.3363371284256805e-07, "loss": 0.74298477, "num_input_tokens_seen": 304639540, "step": 14125, "time_per_iteration": 2.736920118331909 }, { "auxiliary_loss_clip": 0.01114158, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 1.03876936, "balance_loss_mlp": 1.0216043, "epoch": 0.8493010671877349, "flos": 22419893963520.0, "grad_norm": 1.649969149423374, "language_loss": 0.73402816, "learning_rate": 2.3345107778843288e-07, "loss": 0.75552237, "num_input_tokens_seen": 304660595, "step": 14126, "time_per_iteration": 2.5707271099090576 }, { "auxiliary_loss_clip": 0.01061518, "auxiliary_loss_mlp": 0.01039889, "balance_loss_clip": 1.03374052, "balance_loss_mlp": 1.0265317, "epoch": 0.8493611904404028, "flos": 17529435302400.0, "grad_norm": 1.4324709138124028, "language_loss": 0.67603076, "learning_rate": 2.3326850972200928e-07, "loss": 0.69704485, "num_input_tokens_seen": 304679580, "step": 14127, "time_per_iteration": 2.7047815322875977 }, { "auxiliary_loss_clip": 0.01075849, "auxiliary_loss_mlp": 0.00772172, "balance_loss_clip": 1.03386354, "balance_loss_mlp": 1.00027514, "epoch": 0.8494213136930708, "flos": 19462937523840.0, "grad_norm": 2.2394682768750727, "language_loss": 0.68882221, "learning_rate": 2.330860086502211e-07, "loss": 0.70730239, "num_input_tokens_seen": 304698385, "step": 14128, "time_per_iteration": 2.714137077331543 }, { "auxiliary_loss_clip": 0.01082408, "auxiliary_loss_mlp": 0.01032133, "balance_loss_clip": 1.03493214, "balance_loss_mlp": 1.01906157, "epoch": 0.8494814369457387, "flos": 18770292587520.0, "grad_norm": 1.7314045833982252, "language_loss": 0.77983749, "learning_rate": 2.3290357457998855e-07, "loss": 0.80098283, "num_input_tokens_seen": 304715430, "step": 14129, "time_per_iteration": 2.6516494750976562 }, { "auxiliary_loss_clip": 0.01044399, "auxiliary_loss_mlp": 0.01036739, "balance_loss_clip": 1.03597188, "balance_loss_mlp": 1.02454424, "epoch": 0.8495415601984068, "flos": 23331486251520.0, "grad_norm": 1.784130753830601, "language_loss": 0.67886949, "learning_rate": 2.3272120751823031e-07, "loss": 0.69968086, "num_input_tokens_seen": 304734345, "step": 14130, "time_per_iteration": 2.8086585998535156 }, { "auxiliary_loss_clip": 0.01099002, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 1.03699052, "balance_loss_mlp": 1.02243376, "epoch": 0.8496016834510747, "flos": 26612859352320.0, "grad_norm": 1.7932919190030374, "language_loss": 0.71109772, "learning_rate": 2.3253890747186e-07, "loss": 0.732436, "num_input_tokens_seen": 304755030, "step": 14131, "time_per_iteration": 2.704787254333496 }, { "auxiliary_loss_clip": 0.01079775, "auxiliary_loss_mlp": 0.01028208, "balance_loss_clip": 1.03795338, "balance_loss_mlp": 1.0159409, "epoch": 0.8496618067037427, "flos": 25480380378240.0, "grad_norm": 1.8086796883087504, "language_loss": 0.68577588, "learning_rate": 2.3235667444779162e-07, "loss": 0.70685571, "num_input_tokens_seen": 304774320, "step": 14132, "time_per_iteration": 2.7110090255737305 }, { "auxiliary_loss_clip": 0.01105556, "auxiliary_loss_mlp": 0.01035286, "balance_loss_clip": 1.03522205, "balance_loss_mlp": 1.0235796, "epoch": 0.8497219299564106, "flos": 25374587846400.0, "grad_norm": 1.7573733285315933, "language_loss": 0.70354646, "learning_rate": 2.3217450845293564e-07, "loss": 0.7249549, "num_input_tokens_seen": 304795355, "step": 14133, "time_per_iteration": 2.567920684814453 }, { "auxiliary_loss_clip": 0.00997066, "auxiliary_loss_mlp": 0.00751378, "balance_loss_clip": 1.01175642, "balance_loss_mlp": 0.99961358, "epoch": 0.8497820532090786, "flos": 67780279658880.0, "grad_norm": 0.7450619720676375, "language_loss": 0.57556748, "learning_rate": 2.3199240949419918e-07, "loss": 0.59305191, "num_input_tokens_seen": 304863915, "step": 14134, "time_per_iteration": 3.3689846992492676 }, { "auxiliary_loss_clip": 0.0107422, "auxiliary_loss_mlp": 0.01027994, "balance_loss_clip": 1.03716052, "balance_loss_mlp": 1.01549459, "epoch": 0.8498421764617465, "flos": 23440546920960.0, "grad_norm": 2.466409087633597, "language_loss": 0.78983986, "learning_rate": 2.3181037757848787e-07, "loss": 0.81086206, "num_input_tokens_seen": 304881555, "step": 14135, "time_per_iteration": 2.7446372509002686 }, { "auxiliary_loss_clip": 0.01097445, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.03782988, "balance_loss_mlp": 1.01817632, "epoch": 0.8499022997144146, "flos": 17712615686400.0, "grad_norm": 2.7995527616505966, "language_loss": 0.63055122, "learning_rate": 2.316284127127044e-07, "loss": 0.65183342, "num_input_tokens_seen": 304898760, "step": 14136, "time_per_iteration": 2.5907950401306152 }, { "auxiliary_loss_clip": 0.01101166, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.03812134, "balance_loss_mlp": 1.01783872, "epoch": 0.8499624229670825, "flos": 18588512833920.0, "grad_norm": 1.700183635273407, "language_loss": 0.84176117, "learning_rate": 2.3144651490374835e-07, "loss": 0.86308306, "num_input_tokens_seen": 304915465, "step": 14137, "time_per_iteration": 2.605083703994751 }, { "auxiliary_loss_clip": 0.010792, "auxiliary_loss_mlp": 0.01027843, "balance_loss_clip": 1.03870046, "balance_loss_mlp": 1.01687622, "epoch": 0.8500225462197505, "flos": 24345854328960.0, "grad_norm": 2.180201293156008, "language_loss": 0.78512466, "learning_rate": 2.3126468415851773e-07, "loss": 0.80619514, "num_input_tokens_seen": 304933190, "step": 14138, "time_per_iteration": 2.70701003074646 }, { "auxiliary_loss_clip": 0.01098762, "auxiliary_loss_mlp": 0.01028021, "balance_loss_clip": 1.03806686, "balance_loss_mlp": 1.0162487, "epoch": 0.8500826694724185, "flos": 16545518979840.0, "grad_norm": 1.5977485951908699, "language_loss": 0.64826471, "learning_rate": 2.310829204839073e-07, "loss": 0.66953254, "num_input_tokens_seen": 304951110, "step": 14139, "time_per_iteration": 2.5747222900390625 }, { "auxiliary_loss_clip": 0.01067444, "auxiliary_loss_mlp": 0.01031881, "balance_loss_clip": 1.03539836, "balance_loss_mlp": 1.02024066, "epoch": 0.8501427927250864, "flos": 16289404030080.0, "grad_norm": 1.8080135201880956, "language_loss": 0.7064625, "learning_rate": 2.3090122388681043e-07, "loss": 0.72745574, "num_input_tokens_seen": 304969095, "step": 14140, "time_per_iteration": 2.7031800746917725 }, { "auxiliary_loss_clip": 0.01073027, "auxiliary_loss_mlp": 0.01034541, "balance_loss_clip": 1.03628802, "balance_loss_mlp": 1.02165496, "epoch": 0.8502029159777544, "flos": 26687912820480.0, "grad_norm": 2.024190780090597, "language_loss": 0.64177513, "learning_rate": 2.3071959437411648e-07, "loss": 0.6628508, "num_input_tokens_seen": 304989315, "step": 14141, "time_per_iteration": 2.780942916870117 }, { "auxiliary_loss_clip": 0.01079122, "auxiliary_loss_mlp": 0.01034336, "balance_loss_clip": 1.03825319, "balance_loss_mlp": 1.02206933, "epoch": 0.8502630392304223, "flos": 35590778179200.0, "grad_norm": 1.598166482791058, "language_loss": 0.70859313, "learning_rate": 2.3053803195271214e-07, "loss": 0.72972775, "num_input_tokens_seen": 305011020, "step": 14142, "time_per_iteration": 2.8212552070617676 }, { "auxiliary_loss_clip": 0.01061273, "auxiliary_loss_mlp": 0.0103314, "balance_loss_clip": 1.03280842, "balance_loss_mlp": 1.0207963, "epoch": 0.8503231624830904, "flos": 21649466125440.0, "grad_norm": 1.747417790949646, "language_loss": 0.6528132, "learning_rate": 2.3035653662948375e-07, "loss": 0.67375731, "num_input_tokens_seen": 305033550, "step": 14143, "time_per_iteration": 2.785883665084839 }, { "auxiliary_loss_clip": 0.01081279, "auxiliary_loss_mlp": 0.00770514, "balance_loss_clip": 1.03600597, "balance_loss_mlp": 1.00017881, "epoch": 0.8503832857357583, "flos": 22417451838720.0, "grad_norm": 2.048866472556172, "language_loss": 0.68279046, "learning_rate": 2.3017510841131216e-07, "loss": 0.70130837, "num_input_tokens_seen": 305052885, "step": 14144, "time_per_iteration": 4.240123748779297 }, { "auxiliary_loss_clip": 0.01042348, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.03315759, "balance_loss_mlp": 1.02033496, "epoch": 0.8504434089884263, "flos": 18697968552960.0, "grad_norm": 2.1174262858689628, "language_loss": 0.6438145, "learning_rate": 2.299937473050777e-07, "loss": 0.66457748, "num_input_tokens_seen": 305071995, "step": 14145, "time_per_iteration": 4.4199535846710205 }, { "auxiliary_loss_clip": 0.01087485, "auxiliary_loss_mlp": 0.01031784, "balance_loss_clip": 1.03486562, "balance_loss_mlp": 1.01891518, "epoch": 0.8505035322410942, "flos": 20007989475840.0, "grad_norm": 1.8443246841114695, "language_loss": 0.8561762, "learning_rate": 2.2981245331765842e-07, "loss": 0.87736893, "num_input_tokens_seen": 305090190, "step": 14146, "time_per_iteration": 2.6533970832824707 }, { "auxiliary_loss_clip": 0.0110625, "auxiliary_loss_mlp": 0.01027868, "balance_loss_clip": 1.03480434, "balance_loss_mlp": 1.01580358, "epoch": 0.8505636554937622, "flos": 20812173120000.0, "grad_norm": 1.6073228623135094, "language_loss": 0.84023243, "learning_rate": 2.2963122645592814e-07, "loss": 0.86157364, "num_input_tokens_seen": 305109355, "step": 14147, "time_per_iteration": 4.045815706253052 }, { "auxiliary_loss_clip": 0.01099865, "auxiliary_loss_mlp": 0.01031152, "balance_loss_clip": 1.0365082, "balance_loss_mlp": 1.01814604, "epoch": 0.8506237787464301, "flos": 14174445277440.0, "grad_norm": 3.171596156329527, "language_loss": 0.85552716, "learning_rate": 2.2945006672675894e-07, "loss": 0.87683737, "num_input_tokens_seen": 305124165, "step": 14148, "time_per_iteration": 4.178382635116577 }, { "auxiliary_loss_clip": 0.01086687, "auxiliary_loss_mlp": 0.01033466, "balance_loss_clip": 1.03529978, "balance_loss_mlp": 1.02051425, "epoch": 0.8506839019990982, "flos": 23258372117760.0, "grad_norm": 1.5915072699945274, "language_loss": 0.71948111, "learning_rate": 2.292689741370204e-07, "loss": 0.7406826, "num_input_tokens_seen": 305143940, "step": 14149, "time_per_iteration": 2.7413246631622314 }, { "auxiliary_loss_clip": 0.01087525, "auxiliary_loss_mlp": 0.01029431, "balance_loss_clip": 1.03729534, "balance_loss_mlp": 1.0173254, "epoch": 0.8507440252517661, "flos": 23659206963840.0, "grad_norm": 1.895927290429812, "language_loss": 0.76037747, "learning_rate": 2.290879486935804e-07, "loss": 0.78154701, "num_input_tokens_seen": 305163505, "step": 14150, "time_per_iteration": 2.8601326942443848 }, { "auxiliary_loss_clip": 0.01068558, "auxiliary_loss_mlp": 0.01033535, "balance_loss_clip": 1.03508079, "balance_loss_mlp": 1.02081537, "epoch": 0.8508041485044341, "flos": 18661339658880.0, "grad_norm": 1.9437397223028954, "language_loss": 0.72261739, "learning_rate": 2.2890699040330231e-07, "loss": 0.74363828, "num_input_tokens_seen": 305182325, "step": 14151, "time_per_iteration": 2.7191174030303955 }, { "auxiliary_loss_clip": 0.00989017, "auxiliary_loss_mlp": 0.01001335, "balance_loss_clip": 1.01485205, "balance_loss_mlp": 1.00013149, "epoch": 0.8508642717571021, "flos": 52510918055040.0, "grad_norm": 0.8877797296007555, "language_loss": 0.5956288, "learning_rate": 2.2872609927304909e-07, "loss": 0.61553234, "num_input_tokens_seen": 305230775, "step": 14152, "time_per_iteration": 3.0959417819976807 }, { "auxiliary_loss_clip": 0.01012053, "auxiliary_loss_mlp": 0.01000683, "balance_loss_clip": 1.00869, "balance_loss_mlp": 0.99963391, "epoch": 0.85092439500977, "flos": 69297145050240.0, "grad_norm": 0.6913266470704932, "language_loss": 0.61156118, "learning_rate": 2.285452753096797e-07, "loss": 0.63168854, "num_input_tokens_seen": 305296000, "step": 14153, "time_per_iteration": 3.1953656673431396 }, { "auxiliary_loss_clip": 0.01099193, "auxiliary_loss_mlp": 0.01032395, "balance_loss_clip": 1.03656745, "balance_loss_mlp": 1.0191927, "epoch": 0.850984518262438, "flos": 24389737770240.0, "grad_norm": 1.8650933862340224, "language_loss": 0.80833215, "learning_rate": 2.2836451852005067e-07, "loss": 0.82964802, "num_input_tokens_seen": 305314705, "step": 14154, "time_per_iteration": 2.6398138999938965 }, { "auxiliary_loss_clip": 0.01070524, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.0340178, "balance_loss_mlp": 1.02010489, "epoch": 0.851044641515106, "flos": 23294821443840.0, "grad_norm": 3.9078640909935753, "language_loss": 0.79612941, "learning_rate": 2.281838289110165e-07, "loss": 0.8171469, "num_input_tokens_seen": 305333870, "step": 14155, "time_per_iteration": 2.7668473720550537 }, { "auxiliary_loss_clip": 0.01075246, "auxiliary_loss_mlp": 0.0103132, "balance_loss_clip": 1.03424454, "balance_loss_mlp": 1.01889825, "epoch": 0.851104764767774, "flos": 22050085489920.0, "grad_norm": 2.1664550070392172, "language_loss": 0.70601416, "learning_rate": 2.2800320648942904e-07, "loss": 0.72707975, "num_input_tokens_seen": 305352780, "step": 14156, "time_per_iteration": 2.712688684463501 }, { "auxiliary_loss_clip": 0.01067563, "auxiliary_loss_mlp": 0.01030898, "balance_loss_clip": 1.03651905, "balance_loss_mlp": 1.01922178, "epoch": 0.8511648880204419, "flos": 20704728562560.0, "grad_norm": 2.295507051871608, "language_loss": 0.73186374, "learning_rate": 2.278226512621386e-07, "loss": 0.75284839, "num_input_tokens_seen": 305371370, "step": 14157, "time_per_iteration": 2.702608108520508 }, { "auxiliary_loss_clip": 0.01040081, "auxiliary_loss_mlp": 0.010238, "balance_loss_clip": 1.03516209, "balance_loss_mlp": 1.01280284, "epoch": 0.8512250112731099, "flos": 24024669891840.0, "grad_norm": 2.053663216507987, "language_loss": 0.800686, "learning_rate": 2.2764216323598995e-07, "loss": 0.82132483, "num_input_tokens_seen": 305387955, "step": 14158, "time_per_iteration": 2.8139398097991943 }, { "auxiliary_loss_clip": 0.01094324, "auxiliary_loss_mlp": 0.01036878, "balance_loss_clip": 1.03557563, "balance_loss_mlp": 1.02236986, "epoch": 0.8512851345257778, "flos": 22015467757440.0, "grad_norm": 2.1721071422061446, "language_loss": 0.79100662, "learning_rate": 2.27461742417828e-07, "loss": 0.81231868, "num_input_tokens_seen": 305406285, "step": 14159, "time_per_iteration": 2.5417728424072266 }, { "auxiliary_loss_clip": 0.01089601, "auxiliary_loss_mlp": 0.0103497, "balance_loss_clip": 1.0372653, "balance_loss_mlp": 1.02239358, "epoch": 0.8513452577784458, "flos": 14830209924480.0, "grad_norm": 2.049292713518449, "language_loss": 0.71023905, "learning_rate": 2.2728138881449488e-07, "loss": 0.73148477, "num_input_tokens_seen": 305424500, "step": 14160, "time_per_iteration": 2.5549099445343018 }, { "auxiliary_loss_clip": 0.01104724, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 1.03866696, "balance_loss_mlp": 1.01627553, "epoch": 0.8514053810311137, "flos": 33035662166400.0, "grad_norm": 2.7738458222833637, "language_loss": 0.7019136, "learning_rate": 2.2710110243282866e-07, "loss": 0.72325301, "num_input_tokens_seen": 305442990, "step": 14161, "time_per_iteration": 2.5866782665252686 }, { "auxiliary_loss_clip": 0.01097425, "auxiliary_loss_mlp": 0.01030584, "balance_loss_clip": 1.03306913, "balance_loss_mlp": 1.01881218, "epoch": 0.8514655042837818, "flos": 27564456412800.0, "grad_norm": 2.41119817546413, "language_loss": 0.77940011, "learning_rate": 2.2692088327966653e-07, "loss": 0.80068016, "num_input_tokens_seen": 305463065, "step": 14162, "time_per_iteration": 2.7035062313079834 }, { "auxiliary_loss_clip": 0.01099345, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.03699732, "balance_loss_mlp": 1.02044845, "epoch": 0.8515256275364497, "flos": 35556052705920.0, "grad_norm": 1.8591590026423754, "language_loss": 0.77019423, "learning_rate": 2.2674073136184235e-07, "loss": 0.79152089, "num_input_tokens_seen": 305489070, "step": 14163, "time_per_iteration": 2.750953435897827 }, { "auxiliary_loss_clip": 0.01013801, "auxiliary_loss_mlp": 0.01003898, "balance_loss_clip": 1.01090002, "balance_loss_mlp": 1.00288486, "epoch": 0.8515857507891177, "flos": 70207372621440.0, "grad_norm": 0.6897705551367352, "language_loss": 0.54935861, "learning_rate": 2.2656064668618735e-07, "loss": 0.56953561, "num_input_tokens_seen": 305551490, "step": 14164, "time_per_iteration": 3.223865509033203 }, { "auxiliary_loss_clip": 0.01099487, "auxiliary_loss_mlp": 0.01033823, "balance_loss_clip": 1.03638053, "balance_loss_mlp": 1.02158666, "epoch": 0.8516458740417857, "flos": 22675290641280.0, "grad_norm": 1.9828346759864348, "language_loss": 0.7308625, "learning_rate": 2.2638062925953005e-07, "loss": 0.7521956, "num_input_tokens_seen": 305570535, "step": 14165, "time_per_iteration": 2.683063268661499 }, { "auxiliary_loss_clip": 0.01070656, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.03600621, "balance_loss_mlp": 1.01849699, "epoch": 0.8517059972944536, "flos": 22747435107840.0, "grad_norm": 1.5369280358332664, "language_loss": 0.6716156, "learning_rate": 2.26200679088697e-07, "loss": 0.6926313, "num_input_tokens_seen": 305590800, "step": 14166, "time_per_iteration": 2.7411108016967773 }, { "auxiliary_loss_clip": 0.01084994, "auxiliary_loss_mlp": 0.01034282, "balance_loss_clip": 1.03303361, "balance_loss_mlp": 1.02188396, "epoch": 0.8517661205471216, "flos": 21689147675520.0, "grad_norm": 1.785592393708889, "language_loss": 0.73291379, "learning_rate": 2.260207961805125e-07, "loss": 0.75410652, "num_input_tokens_seen": 305609495, "step": 14167, "time_per_iteration": 2.6664106845855713 }, { "auxiliary_loss_clip": 0.01109416, "auxiliary_loss_mlp": 0.0103133, "balance_loss_clip": 1.03773403, "balance_loss_mlp": 1.01968884, "epoch": 0.8518262437997896, "flos": 25374839241600.0, "grad_norm": 1.6176439709713288, "language_loss": 0.80560851, "learning_rate": 2.258409805417969e-07, "loss": 0.827016, "num_input_tokens_seen": 305629420, "step": 14168, "time_per_iteration": 2.59899640083313 }, { "auxiliary_loss_clip": 0.01106516, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.03524876, "balance_loss_mlp": 1.01554823, "epoch": 0.8518863670524576, "flos": 27235406897280.0, "grad_norm": 1.781183741177256, "language_loss": 0.76068074, "learning_rate": 2.2566123217936893e-07, "loss": 0.7820183, "num_input_tokens_seen": 305649835, "step": 14169, "time_per_iteration": 2.634589672088623 }, { "auxiliary_loss_clip": 0.01112356, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.03858984, "balance_loss_mlp": 1.01746488, "epoch": 0.8519464903051255, "flos": 20959514709120.0, "grad_norm": 1.6296067224566693, "language_loss": 0.63455546, "learning_rate": 2.254815511000452e-07, "loss": 0.65598321, "num_input_tokens_seen": 305668840, "step": 14170, "time_per_iteration": 2.556849718093872 }, { "auxiliary_loss_clip": 0.0109011, "auxiliary_loss_mlp": 0.01029143, "balance_loss_clip": 1.03445804, "balance_loss_mlp": 1.0168829, "epoch": 0.8520066135577935, "flos": 18441745862400.0, "grad_norm": 2.158149023964769, "language_loss": 0.8638401, "learning_rate": 2.253019373106384e-07, "loss": 0.88503265, "num_input_tokens_seen": 305686955, "step": 14171, "time_per_iteration": 2.6308727264404297 }, { "auxiliary_loss_clip": 0.01094344, "auxiliary_loss_mlp": 0.01038693, "balance_loss_clip": 1.0366205, "balance_loss_mlp": 1.02613449, "epoch": 0.8520667368104614, "flos": 29130233149440.0, "grad_norm": 1.7172812217189943, "language_loss": 0.54943144, "learning_rate": 2.2512239081796003e-07, "loss": 0.5707618, "num_input_tokens_seen": 305706290, "step": 14172, "time_per_iteration": 2.6792733669281006 }, { "auxiliary_loss_clip": 0.01082291, "auxiliary_loss_mlp": 0.01028716, "balance_loss_clip": 1.0339576, "balance_loss_mlp": 1.01860142, "epoch": 0.8521268600631294, "flos": 16034366488320.0, "grad_norm": 2.2874699102047824, "language_loss": 0.6964975, "learning_rate": 2.2494291162881862e-07, "loss": 0.71760762, "num_input_tokens_seen": 305723835, "step": 14173, "time_per_iteration": 2.656660795211792 }, { "auxiliary_loss_clip": 0.0108799, "auxiliary_loss_mlp": 0.0077035, "balance_loss_clip": 1.03576326, "balance_loss_mlp": 1.0002656, "epoch": 0.8521869833157973, "flos": 22454870832000.0, "grad_norm": 2.469794290307129, "language_loss": 0.77085257, "learning_rate": 2.247634997500205e-07, "loss": 0.78943598, "num_input_tokens_seen": 305741655, "step": 14174, "time_per_iteration": 2.6629743576049805 }, { "auxiliary_loss_clip": 0.01074547, "auxiliary_loss_mlp": 0.00771408, "balance_loss_clip": 1.03330672, "balance_loss_mlp": 1.00036669, "epoch": 0.8522471065684654, "flos": 24972029147520.0, "grad_norm": 3.681847019850499, "language_loss": 0.8197754, "learning_rate": 2.245841551883676e-07, "loss": 0.83823496, "num_input_tokens_seen": 305761890, "step": 14175, "time_per_iteration": 2.6883835792541504 }, { "auxiliary_loss_clip": 0.01112836, "auxiliary_loss_mlp": 0.01035663, "balance_loss_clip": 1.03869867, "balance_loss_mlp": 1.02256155, "epoch": 0.8523072298211333, "flos": 17710604524800.0, "grad_norm": 7.221526535208017, "language_loss": 0.65591013, "learning_rate": 2.2440487795066153e-07, "loss": 0.67739511, "num_input_tokens_seen": 305779190, "step": 14176, "time_per_iteration": 2.513249397277832 }, { "auxiliary_loss_clip": 0.01083655, "auxiliary_loss_mlp": 0.00769903, "balance_loss_clip": 1.03461874, "balance_loss_mlp": 1.00019979, "epoch": 0.8523673530738013, "flos": 25446193608960.0, "grad_norm": 1.6790468369786946, "language_loss": 0.7851091, "learning_rate": 2.2422566804370068e-07, "loss": 0.80364466, "num_input_tokens_seen": 305799870, "step": 14177, "time_per_iteration": 2.6671228408813477 }, { "auxiliary_loss_clip": 0.0108583, "auxiliary_loss_mlp": 0.01029951, "balance_loss_clip": 1.03573, "balance_loss_mlp": 1.01741612, "epoch": 0.8524274763264693, "flos": 31429593348480.0, "grad_norm": 1.9646253723972047, "language_loss": 0.73313767, "learning_rate": 2.2404652547428026e-07, "loss": 0.75429547, "num_input_tokens_seen": 305819695, "step": 14178, "time_per_iteration": 2.713926315307617 }, { "auxiliary_loss_clip": 0.01074008, "auxiliary_loss_mlp": 0.01037664, "balance_loss_clip": 1.03707623, "balance_loss_mlp": 1.02537966, "epoch": 0.8524875995791372, "flos": 17712651600000.0, "grad_norm": 1.8623872713684044, "language_loss": 0.74955928, "learning_rate": 2.238674502491935e-07, "loss": 0.77067608, "num_input_tokens_seen": 305837270, "step": 14179, "time_per_iteration": 2.6611170768737793 }, { "auxiliary_loss_clip": 0.01109256, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.03910112, "balance_loss_mlp": 1.01806116, "epoch": 0.8525477228318052, "flos": 21687316081920.0, "grad_norm": 2.060347527701932, "language_loss": 0.816504, "learning_rate": 2.2368844237523165e-07, "loss": 0.83789647, "num_input_tokens_seen": 305855250, "step": 14180, "time_per_iteration": 2.6562328338623047 }, { "auxiliary_loss_clip": 0.01051532, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.03316164, "balance_loss_mlp": 1.02265859, "epoch": 0.8526078460844732, "flos": 24827057856000.0, "grad_norm": 6.706307974363978, "language_loss": 0.60821462, "learning_rate": 2.235095018591815e-07, "loss": 0.62907696, "num_input_tokens_seen": 305875660, "step": 14181, "time_per_iteration": 2.7725861072540283 }, { "auxiliary_loss_clip": 0.011084, "auxiliary_loss_mlp": 0.01033406, "balance_loss_clip": 1.03824615, "balance_loss_mlp": 1.02208114, "epoch": 0.8526679693371412, "flos": 13516418073600.0, "grad_norm": 2.1617391285888314, "language_loss": 0.72616804, "learning_rate": 2.2333062870782894e-07, "loss": 0.74758613, "num_input_tokens_seen": 305892415, "step": 14182, "time_per_iteration": 2.5392303466796875 }, { "auxiliary_loss_clip": 0.01056951, "auxiliary_loss_mlp": 0.01033126, "balance_loss_clip": 1.03387702, "balance_loss_mlp": 1.0208416, "epoch": 0.8527280925898091, "flos": 23514092017920.0, "grad_norm": 1.4656692633945465, "language_loss": 0.70735776, "learning_rate": 2.2315182292795697e-07, "loss": 0.72825855, "num_input_tokens_seen": 305912665, "step": 14183, "time_per_iteration": 4.254406213760376 }, { "auxiliary_loss_clip": 0.01081461, "auxiliary_loss_mlp": 0.01031438, "balance_loss_clip": 1.03771853, "balance_loss_mlp": 1.01956463, "epoch": 0.8527882158424771, "flos": 20303031790080.0, "grad_norm": 1.7895488338576169, "language_loss": 0.72972029, "learning_rate": 2.2297308452634644e-07, "loss": 0.75084925, "num_input_tokens_seen": 305931515, "step": 14184, "time_per_iteration": 4.304826974868774 }, { "auxiliary_loss_clip": 0.01109825, "auxiliary_loss_mlp": 0.01033313, "balance_loss_clip": 1.0379746, "balance_loss_mlp": 1.02064705, "epoch": 0.852848339095145, "flos": 17202504689280.0, "grad_norm": 1.7597843900192167, "language_loss": 0.7711637, "learning_rate": 2.2279441350977457e-07, "loss": 0.79259503, "num_input_tokens_seen": 305949965, "step": 14185, "time_per_iteration": 2.5977091789245605 }, { "auxiliary_loss_clip": 0.0106691, "auxiliary_loss_mlp": 0.0102992, "balance_loss_clip": 1.03286219, "balance_loss_mlp": 1.01596713, "epoch": 0.852908462347813, "flos": 18368990864640.0, "grad_norm": 2.425914836353015, "language_loss": 0.79841149, "learning_rate": 2.2261580988501637e-07, "loss": 0.81937975, "num_input_tokens_seen": 305967820, "step": 14186, "time_per_iteration": 4.160691738128662 }, { "auxiliary_loss_clip": 0.01085946, "auxiliary_loss_mlp": 0.01029366, "balance_loss_clip": 1.03557575, "balance_loss_mlp": 1.01655054, "epoch": 0.8529685856004809, "flos": 18624890332800.0, "grad_norm": 1.6292428132802597, "language_loss": 0.62476075, "learning_rate": 2.224372736588449e-07, "loss": 0.64591384, "num_input_tokens_seen": 305985505, "step": 14187, "time_per_iteration": 4.218466758728027 }, { "auxiliary_loss_clip": 0.01056813, "auxiliary_loss_mlp": 0.01030511, "balance_loss_clip": 1.03186965, "balance_loss_mlp": 1.01697493, "epoch": 0.853028708853149, "flos": 29607665748480.0, "grad_norm": 1.8450425087178943, "language_loss": 0.76632512, "learning_rate": 2.2225880483803005e-07, "loss": 0.78719831, "num_input_tokens_seen": 306005220, "step": 14188, "time_per_iteration": 2.756181240081787 }, { "auxiliary_loss_clip": 0.01098789, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.03644693, "balance_loss_mlp": 1.01655281, "epoch": 0.8530888321058169, "flos": 26353153042560.0, "grad_norm": 1.4873919798576203, "language_loss": 0.7809422, "learning_rate": 2.2208040342933932e-07, "loss": 0.80222559, "num_input_tokens_seen": 306023785, "step": 14189, "time_per_iteration": 2.6410326957702637 }, { "auxiliary_loss_clip": 0.01086145, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.03470349, "balance_loss_mlp": 1.01997268, "epoch": 0.8531489553584849, "flos": 20521979141760.0, "grad_norm": 2.5651447957757005, "language_loss": 0.7962172, "learning_rate": 2.2190206943953793e-07, "loss": 0.81740683, "num_input_tokens_seen": 306041600, "step": 14190, "time_per_iteration": 2.6400444507598877 }, { "auxiliary_loss_clip": 0.01059317, "auxiliary_loss_mlp": 0.01029769, "balance_loss_clip": 1.03576827, "balance_loss_mlp": 1.01700187, "epoch": 0.8532090786111529, "flos": 20704297599360.0, "grad_norm": 2.162987125122954, "language_loss": 0.75559723, "learning_rate": 2.2172380287538894e-07, "loss": 0.77648813, "num_input_tokens_seen": 306060345, "step": 14191, "time_per_iteration": 2.691556692123413 }, { "auxiliary_loss_clip": 0.01098409, "auxiliary_loss_mlp": 0.01030091, "balance_loss_clip": 1.0377655, "balance_loss_mlp": 1.01723993, "epoch": 0.8532692018638208, "flos": 19828903242240.0, "grad_norm": 1.957341316580574, "language_loss": 0.69267607, "learning_rate": 2.2154560374365073e-07, "loss": 0.71396106, "num_input_tokens_seen": 306078285, "step": 14192, "time_per_iteration": 2.631347894668579 }, { "auxiliary_loss_clip": 0.01101694, "auxiliary_loss_mlp": 0.01035412, "balance_loss_clip": 1.03725314, "balance_loss_mlp": 1.02091622, "epoch": 0.8533293251164888, "flos": 20996790048000.0, "grad_norm": 2.1501600362317643, "language_loss": 0.63451266, "learning_rate": 2.2136747205108164e-07, "loss": 0.65588367, "num_input_tokens_seen": 306093760, "step": 14193, "time_per_iteration": 2.626577377319336 }, { "auxiliary_loss_clip": 0.01081646, "auxiliary_loss_mlp": 0.01028086, "balance_loss_clip": 1.03549838, "balance_loss_mlp": 1.01570606, "epoch": 0.8533894483691568, "flos": 22419606654720.0, "grad_norm": 2.0851012965746905, "language_loss": 0.76840144, "learning_rate": 2.211894078044365e-07, "loss": 0.78949881, "num_input_tokens_seen": 306112595, "step": 14194, "time_per_iteration": 2.6441872119903564 }, { "auxiliary_loss_clip": 0.01110242, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.03740048, "balance_loss_mlp": 1.01674914, "epoch": 0.8534495716218248, "flos": 21616536332160.0, "grad_norm": 2.217628380709863, "language_loss": 0.69469094, "learning_rate": 2.2101141101046705e-07, "loss": 0.71607888, "num_input_tokens_seen": 306131800, "step": 14195, "time_per_iteration": 2.679945707321167 }, { "auxiliary_loss_clip": 0.01082724, "auxiliary_loss_mlp": 0.01032645, "balance_loss_clip": 1.03602624, "balance_loss_mlp": 1.01968741, "epoch": 0.8535096948744927, "flos": 22346277039360.0, "grad_norm": 1.838474831161169, "language_loss": 0.85432625, "learning_rate": 2.2083348167592343e-07, "loss": 0.87547994, "num_input_tokens_seen": 306150590, "step": 14196, "time_per_iteration": 2.6546883583068848 }, { "auxiliary_loss_clip": 0.01011396, "auxiliary_loss_mlp": 0.01001419, "balance_loss_clip": 1.00853372, "balance_loss_mlp": 1.00029302, "epoch": 0.8535698181271607, "flos": 52762507891200.0, "grad_norm": 0.7576235506473017, "language_loss": 0.55055857, "learning_rate": 2.2065561980755243e-07, "loss": 0.5706867, "num_input_tokens_seen": 306205850, "step": 14197, "time_per_iteration": 3.1265292167663574 }, { "auxiliary_loss_clip": 0.01072451, "auxiliary_loss_mlp": 0.00769866, "balance_loss_clip": 1.03453422, "balance_loss_mlp": 1.0002501, "epoch": 0.8536299413798286, "flos": 19062892776960.0, "grad_norm": 1.7349390720233626, "language_loss": 0.81448388, "learning_rate": 2.2047782541209826e-07, "loss": 0.83290708, "num_input_tokens_seen": 306225220, "step": 14198, "time_per_iteration": 2.709376573562622 }, { "auxiliary_loss_clip": 0.01107145, "auxiliary_loss_mlp": 0.01028426, "balance_loss_clip": 1.03658509, "balance_loss_mlp": 1.01760149, "epoch": 0.8536900646324966, "flos": 49344743871360.0, "grad_norm": 3.58688569610331, "language_loss": 0.6833868, "learning_rate": 2.203000984963035e-07, "loss": 0.70474249, "num_input_tokens_seen": 306249865, "step": 14199, "time_per_iteration": 2.8150370121002197 }, { "auxiliary_loss_clip": 0.01070955, "auxiliary_loss_mlp": 0.01028243, "balance_loss_clip": 1.03376341, "balance_loss_mlp": 1.01713872, "epoch": 0.8537501878851645, "flos": 21762333636480.0, "grad_norm": 1.5671357707792795, "language_loss": 0.86500955, "learning_rate": 2.201224390669072e-07, "loss": 0.88600153, "num_input_tokens_seen": 306270215, "step": 14200, "time_per_iteration": 2.6922430992126465 }, { "auxiliary_loss_clip": 0.01079411, "auxiliary_loss_mlp": 0.01028505, "balance_loss_clip": 1.03768003, "balance_loss_mlp": 1.01712668, "epoch": 0.8538103111378326, "flos": 22269176496000.0, "grad_norm": 1.8836819449837667, "language_loss": 0.77679044, "learning_rate": 2.1994484713064666e-07, "loss": 0.79786962, "num_input_tokens_seen": 306288960, "step": 14201, "time_per_iteration": 2.686408758163452 }, { "auxiliary_loss_clip": 0.01080739, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.03589165, "balance_loss_mlp": 1.01757574, "epoch": 0.8538704343905005, "flos": 20303929630080.0, "grad_norm": 2.780473134009725, "language_loss": 0.6885708, "learning_rate": 2.19767322694256e-07, "loss": 0.70967031, "num_input_tokens_seen": 306308735, "step": 14202, "time_per_iteration": 2.6336662769317627 }, { "auxiliary_loss_clip": 0.01099521, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.037709, "balance_loss_mlp": 1.02389884, "epoch": 0.8539305576431685, "flos": 24755164784640.0, "grad_norm": 1.9187950545950658, "language_loss": 0.80178666, "learning_rate": 2.195898657644666e-07, "loss": 0.82314396, "num_input_tokens_seen": 306329015, "step": 14203, "time_per_iteration": 2.6216869354248047 }, { "auxiliary_loss_clip": 0.01090886, "auxiliary_loss_mlp": 0.01032011, "balance_loss_clip": 1.03592849, "balance_loss_mlp": 1.01897538, "epoch": 0.8539906808958365, "flos": 26687625511680.0, "grad_norm": 2.0827727006300543, "language_loss": 0.66570961, "learning_rate": 2.1941247634800808e-07, "loss": 0.68693864, "num_input_tokens_seen": 306349085, "step": 14204, "time_per_iteration": 2.7057762145996094 }, { "auxiliary_loss_clip": 0.01111148, "auxiliary_loss_mlp": 0.01032529, "balance_loss_clip": 1.03801191, "balance_loss_mlp": 1.01958251, "epoch": 0.8540508041485044, "flos": 13365521038080.0, "grad_norm": 2.906237255185196, "language_loss": 0.59810114, "learning_rate": 2.1923515445160667e-07, "loss": 0.61953795, "num_input_tokens_seen": 306365385, "step": 14205, "time_per_iteration": 2.573305368423462 }, { "auxiliary_loss_clip": 0.0108658, "auxiliary_loss_mlp": 0.0102928, "balance_loss_clip": 1.03708744, "balance_loss_mlp": 1.01709652, "epoch": 0.8541109274011724, "flos": 32780876019840.0, "grad_norm": 3.4708591258451116, "language_loss": 0.72213638, "learning_rate": 2.1905790008198655e-07, "loss": 0.74329495, "num_input_tokens_seen": 306384585, "step": 14206, "time_per_iteration": 2.7664809226989746 }, { "auxiliary_loss_clip": 0.01100148, "auxiliary_loss_mlp": 0.01027694, "balance_loss_clip": 1.03798437, "balance_loss_mlp": 1.01563621, "epoch": 0.8541710506538404, "flos": 17639286071040.0, "grad_norm": 2.7591002381259617, "language_loss": 0.76277685, "learning_rate": 2.1888071324586987e-07, "loss": 0.78405529, "num_input_tokens_seen": 306401565, "step": 14207, "time_per_iteration": 2.5857670307159424 }, { "auxiliary_loss_clip": 0.01110866, "auxiliary_loss_mlp": 0.01031023, "balance_loss_clip": 1.03753805, "balance_loss_mlp": 1.01777935, "epoch": 0.8542311739065084, "flos": 20263062931200.0, "grad_norm": 1.7437874977291616, "language_loss": 0.85243803, "learning_rate": 2.1870359394997485e-07, "loss": 0.8738569, "num_input_tokens_seen": 306419995, "step": 14208, "time_per_iteration": 2.5491318702697754 }, { "auxiliary_loss_clip": 0.01090714, "auxiliary_loss_mlp": 0.0102915, "balance_loss_clip": 1.03670692, "balance_loss_mlp": 1.01759243, "epoch": 0.8542912971591763, "flos": 17785657992960.0, "grad_norm": 1.579396751637571, "language_loss": 0.66011345, "learning_rate": 2.1852654220101785e-07, "loss": 0.68131208, "num_input_tokens_seen": 306439240, "step": 14209, "time_per_iteration": 2.619147539138794 }, { "auxiliary_loss_clip": 0.01062026, "auxiliary_loss_mlp": 0.01025767, "balance_loss_clip": 1.03395295, "balance_loss_mlp": 1.01391149, "epoch": 0.8543514204118443, "flos": 26979507429120.0, "grad_norm": 2.0420847855392297, "language_loss": 0.70425576, "learning_rate": 2.1834955800571287e-07, "loss": 0.72513366, "num_input_tokens_seen": 306458425, "step": 14210, "time_per_iteration": 2.7978549003601074 }, { "auxiliary_loss_clip": 0.01085485, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.03576684, "balance_loss_mlp": 1.0193646, "epoch": 0.8544115436645122, "flos": 24024598064640.0, "grad_norm": 1.6548708543912152, "language_loss": 0.70239341, "learning_rate": 2.1817264137077141e-07, "loss": 0.7235651, "num_input_tokens_seen": 306477210, "step": 14211, "time_per_iteration": 2.766183614730835 }, { "auxiliary_loss_clip": 0.01090016, "auxiliary_loss_mlp": 0.0103377, "balance_loss_clip": 1.03690755, "balance_loss_mlp": 1.02137232, "epoch": 0.8544716669171802, "flos": 16617986668800.0, "grad_norm": 2.2883331624161687, "language_loss": 0.81601977, "learning_rate": 2.1799579230290166e-07, "loss": 0.83725762, "num_input_tokens_seen": 306495820, "step": 14212, "time_per_iteration": 2.6845991611480713 }, { "auxiliary_loss_clip": 0.01073343, "auxiliary_loss_mlp": 0.01033296, "balance_loss_clip": 1.03170538, "balance_loss_mlp": 1.01963472, "epoch": 0.8545317901698481, "flos": 40005779489280.0, "grad_norm": 1.7913118059444788, "language_loss": 0.66273463, "learning_rate": 2.178190108088105e-07, "loss": 0.68380105, "num_input_tokens_seen": 306516420, "step": 14213, "time_per_iteration": 2.8582568168640137 }, { "auxiliary_loss_clip": 0.01107415, "auxiliary_loss_mlp": 0.01029384, "balance_loss_clip": 1.03667092, "balance_loss_mlp": 1.01733816, "epoch": 0.8545919134225162, "flos": 19902520166400.0, "grad_norm": 1.7812973298458348, "language_loss": 0.78218639, "learning_rate": 2.1764229689520098e-07, "loss": 0.80355442, "num_input_tokens_seen": 306534785, "step": 14214, "time_per_iteration": 2.5741806030273438 }, { "auxiliary_loss_clip": 0.01090515, "auxiliary_loss_mlp": 0.01030143, "balance_loss_clip": 1.03572309, "balance_loss_mlp": 1.01646936, "epoch": 0.8546520366751841, "flos": 18952970181120.0, "grad_norm": 2.3620228976169013, "language_loss": 0.66771472, "learning_rate": 2.1746565056877397e-07, "loss": 0.68892121, "num_input_tokens_seen": 306552440, "step": 14215, "time_per_iteration": 2.682720422744751 }, { "auxiliary_loss_clip": 0.01108233, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.03707683, "balance_loss_mlp": 1.01554906, "epoch": 0.8547121599278521, "flos": 35621445415680.0, "grad_norm": 1.6345629270986273, "language_loss": 0.62375963, "learning_rate": 2.172890718362279e-07, "loss": 0.64511889, "num_input_tokens_seen": 306573600, "step": 14216, "time_per_iteration": 2.675818681716919 }, { "auxiliary_loss_clip": 0.01073815, "auxiliary_loss_mlp": 0.01034717, "balance_loss_clip": 1.03433871, "balance_loss_mlp": 1.0223552, "epoch": 0.8547722831805201, "flos": 16910048154240.0, "grad_norm": 2.187084459340775, "language_loss": 0.6559574, "learning_rate": 2.17112560704259e-07, "loss": 0.67704272, "num_input_tokens_seen": 306592840, "step": 14217, "time_per_iteration": 2.6645264625549316 }, { "auxiliary_loss_clip": 0.01095964, "auxiliary_loss_mlp": 0.0103166, "balance_loss_clip": 1.03822827, "balance_loss_mlp": 1.01984668, "epoch": 0.854832406433188, "flos": 23002616304000.0, "grad_norm": 1.691658565151652, "language_loss": 0.64885128, "learning_rate": 2.1693611717956072e-07, "loss": 0.67012751, "num_input_tokens_seen": 306613210, "step": 14218, "time_per_iteration": 2.659118890762329 }, { "auxiliary_loss_clip": 0.01094891, "auxiliary_loss_mlp": 0.01035498, "balance_loss_clip": 1.03430879, "balance_loss_mlp": 1.02195024, "epoch": 0.854892529685856, "flos": 20412595249920.0, "grad_norm": 1.722487926122784, "language_loss": 0.70405877, "learning_rate": 2.167597412688238e-07, "loss": 0.72536266, "num_input_tokens_seen": 306631620, "step": 14219, "time_per_iteration": 2.6162991523742676 }, { "auxiliary_loss_clip": 0.01085887, "auxiliary_loss_mlp": 0.01039141, "balance_loss_clip": 1.03332317, "balance_loss_mlp": 1.02628446, "epoch": 0.854952652938524, "flos": 16398716094720.0, "grad_norm": 2.7265350217211397, "language_loss": 0.67212754, "learning_rate": 2.1658343297873549e-07, "loss": 0.69337785, "num_input_tokens_seen": 306646695, "step": 14220, "time_per_iteration": 2.618908166885376 }, { "auxiliary_loss_clip": 0.01105252, "auxiliary_loss_mlp": 0.01030801, "balance_loss_clip": 1.03653455, "balance_loss_mlp": 1.01895165, "epoch": 0.855012776191192, "flos": 21178677542400.0, "grad_norm": 1.9488426413623547, "language_loss": 0.71819413, "learning_rate": 2.164071923159827e-07, "loss": 0.73955464, "num_input_tokens_seen": 306665465, "step": 14221, "time_per_iteration": 2.547293186187744 }, { "auxiliary_loss_clip": 0.01077738, "auxiliary_loss_mlp": 0.01041646, "balance_loss_clip": 1.03548348, "balance_loss_mlp": 1.02897441, "epoch": 0.8550728994438599, "flos": 26140993361280.0, "grad_norm": 1.7974681861069632, "language_loss": 0.59693348, "learning_rate": 2.1623101928724763e-07, "loss": 0.61812735, "num_input_tokens_seen": 306685950, "step": 14222, "time_per_iteration": 4.256742477416992 }, { "auxiliary_loss_clip": 0.01079753, "auxiliary_loss_mlp": 0.01032489, "balance_loss_clip": 1.03260887, "balance_loss_mlp": 1.01989484, "epoch": 0.8551330226965279, "flos": 22786793435520.0, "grad_norm": 1.5392521458494535, "language_loss": 0.84364492, "learning_rate": 2.1605491389921093e-07, "loss": 0.86476731, "num_input_tokens_seen": 306705740, "step": 14223, "time_per_iteration": 2.6583445072174072 }, { "auxiliary_loss_clip": 0.01097669, "auxiliary_loss_mlp": 0.01031894, "balance_loss_clip": 1.03776193, "balance_loss_mlp": 1.01984763, "epoch": 0.8551931459491958, "flos": 22419032037120.0, "grad_norm": 1.7057034680905072, "language_loss": 0.74193132, "learning_rate": 2.158788761585515e-07, "loss": 0.76322699, "num_input_tokens_seen": 306725065, "step": 14224, "time_per_iteration": 4.2042076587677 }, { "auxiliary_loss_clip": 0.01081831, "auxiliary_loss_mlp": 0.00772053, "balance_loss_clip": 1.03394115, "balance_loss_mlp": 1.00025678, "epoch": 0.8552532692018638, "flos": 19573183342080.0, "grad_norm": 1.8055208702511056, "language_loss": 0.75255108, "learning_rate": 2.1570290607194307e-07, "loss": 0.77108991, "num_input_tokens_seen": 306743630, "step": 14225, "time_per_iteration": 4.162761449813843 }, { "auxiliary_loss_clip": 0.01047716, "auxiliary_loss_mlp": 0.01039572, "balance_loss_clip": 1.0343529, "balance_loss_mlp": 1.02750206, "epoch": 0.8553133924545318, "flos": 26432767537920.0, "grad_norm": 1.8461972921962662, "language_loss": 0.77405238, "learning_rate": 2.1552700364605925e-07, "loss": 0.79492527, "num_input_tokens_seen": 306763105, "step": 14226, "time_per_iteration": 2.7609846591949463 }, { "auxiliary_loss_clip": 0.01112703, "auxiliary_loss_mlp": 0.01038329, "balance_loss_clip": 1.03843546, "balance_loss_mlp": 1.02525818, "epoch": 0.8553735157071998, "flos": 16362446336640.0, "grad_norm": 18.71502714000466, "language_loss": 0.54893303, "learning_rate": 2.153511688875702e-07, "loss": 0.57044339, "num_input_tokens_seen": 306779875, "step": 14227, "time_per_iteration": 4.112335443496704 }, { "auxiliary_loss_clip": 0.01077046, "auxiliary_loss_mlp": 0.0077063, "balance_loss_clip": 1.03572583, "balance_loss_mlp": 1.00020015, "epoch": 0.8554336389598677, "flos": 20887334328960.0, "grad_norm": 1.839893156700162, "language_loss": 0.6559819, "learning_rate": 2.151754018031442e-07, "loss": 0.67445874, "num_input_tokens_seen": 306800015, "step": 14228, "time_per_iteration": 2.6349892616271973 }, { "auxiliary_loss_clip": 0.01076617, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 1.03681397, "balance_loss_mlp": 1.02012038, "epoch": 0.8554937622125357, "flos": 21284721469440.0, "grad_norm": 2.007233284435357, "language_loss": 0.73960888, "learning_rate": 2.1499970239944542e-07, "loss": 0.76070321, "num_input_tokens_seen": 306814160, "step": 14229, "time_per_iteration": 2.653921365737915 }, { "auxiliary_loss_clip": 0.01096335, "auxiliary_loss_mlp": 0.01031192, "balance_loss_clip": 1.03618884, "balance_loss_mlp": 1.01952744, "epoch": 0.8555538854652037, "flos": 22413178120320.0, "grad_norm": 2.129951857800807, "language_loss": 0.72556508, "learning_rate": 2.1482407068313724e-07, "loss": 0.74684036, "num_input_tokens_seen": 306833310, "step": 14230, "time_per_iteration": 2.611541509628296 }, { "auxiliary_loss_clip": 0.01094442, "auxiliary_loss_mlp": 0.01030708, "balance_loss_clip": 1.03460538, "balance_loss_mlp": 1.01829863, "epoch": 0.8556140087178716, "flos": 20193719725440.0, "grad_norm": 2.23514067772812, "language_loss": 0.82251632, "learning_rate": 2.1464850666087897e-07, "loss": 0.84376776, "num_input_tokens_seen": 306851345, "step": 14231, "time_per_iteration": 2.6085596084594727 }, { "auxiliary_loss_clip": 0.01100487, "auxiliary_loss_mlp": 0.01033167, "balance_loss_clip": 1.03759503, "balance_loss_mlp": 1.01945221, "epoch": 0.8556741319705397, "flos": 22638123043200.0, "grad_norm": 2.1730018430212175, "language_loss": 0.67839086, "learning_rate": 2.1447301033932796e-07, "loss": 0.69972742, "num_input_tokens_seen": 306871040, "step": 14232, "time_per_iteration": 2.619722843170166 }, { "auxiliary_loss_clip": 0.01088548, "auxiliary_loss_mlp": 0.01032023, "balance_loss_clip": 1.03769374, "balance_loss_mlp": 1.01942301, "epoch": 0.8557342552232076, "flos": 23549320281600.0, "grad_norm": 1.4620803714373924, "language_loss": 0.66840327, "learning_rate": 2.1429758172513955e-07, "loss": 0.68960893, "num_input_tokens_seen": 306891625, "step": 14233, "time_per_iteration": 2.645831346511841 }, { "auxiliary_loss_clip": 0.01096889, "auxiliary_loss_mlp": 0.01034394, "balance_loss_clip": 1.03637278, "balance_loss_mlp": 1.02236605, "epoch": 0.8557943784758756, "flos": 19609884063360.0, "grad_norm": 2.026925189610044, "language_loss": 0.76869869, "learning_rate": 2.1412222082496556e-07, "loss": 0.79001153, "num_input_tokens_seen": 306910020, "step": 14234, "time_per_iteration": 2.58845853805542 }, { "auxiliary_loss_clip": 0.01001494, "auxiliary_loss_mlp": 0.01021829, "balance_loss_clip": 1.00670254, "balance_loss_mlp": 1.02035093, "epoch": 0.8558545017285435, "flos": 70641891446400.0, "grad_norm": 0.7646124593211208, "language_loss": 0.57967913, "learning_rate": 2.1394692764545684e-07, "loss": 0.59991229, "num_input_tokens_seen": 306969505, "step": 14235, "time_per_iteration": 3.2275688648223877 }, { "auxiliary_loss_clip": 0.0101382, "auxiliary_loss_mlp": 0.01002617, "balance_loss_clip": 1.01051199, "balance_loss_mlp": 1.00143075, "epoch": 0.8559146249812115, "flos": 56649983086080.0, "grad_norm": 0.8551315667817418, "language_loss": 0.56688058, "learning_rate": 2.1377170219325858e-07, "loss": 0.58704495, "num_input_tokens_seen": 307027710, "step": 14236, "time_per_iteration": 3.086979866027832 }, { "auxiliary_loss_clip": 0.01086537, "auxiliary_loss_mlp": 0.01035642, "balance_loss_clip": 1.035743, "balance_loss_mlp": 1.02300572, "epoch": 0.8559747482338794, "flos": 22888240421760.0, "grad_norm": 1.785861454279873, "language_loss": 0.70469606, "learning_rate": 2.1359654447501673e-07, "loss": 0.72591788, "num_input_tokens_seen": 307045515, "step": 14237, "time_per_iteration": 2.615514039993286 }, { "auxiliary_loss_clip": 0.01085764, "auxiliary_loss_mlp": 0.01029104, "balance_loss_clip": 1.03411865, "balance_loss_mlp": 1.01737368, "epoch": 0.8560348714865474, "flos": 22601925112320.0, "grad_norm": 2.6092090917428465, "language_loss": 0.63390237, "learning_rate": 2.1342145449737314e-07, "loss": 0.65505099, "num_input_tokens_seen": 307064470, "step": 14238, "time_per_iteration": 2.8091626167297363 }, { "auxiliary_loss_clip": 0.01104641, "auxiliary_loss_mlp": 0.01032794, "balance_loss_clip": 1.03615522, "balance_loss_mlp": 1.02233911, "epoch": 0.8560949947392154, "flos": 17931455297280.0, "grad_norm": 1.7164911082437782, "language_loss": 0.69517374, "learning_rate": 2.1324643226696648e-07, "loss": 0.71654809, "num_input_tokens_seen": 307083900, "step": 14239, "time_per_iteration": 2.57605242729187 }, { "auxiliary_loss_clip": 0.01111794, "auxiliary_loss_mlp": 0.0103739, "balance_loss_clip": 1.0377574, "balance_loss_mlp": 1.02455664, "epoch": 0.8561551179918834, "flos": 31026208636800.0, "grad_norm": 2.169346981343539, "language_loss": 0.66365606, "learning_rate": 2.1307147779043455e-07, "loss": 0.68514788, "num_input_tokens_seen": 307104590, "step": 14240, "time_per_iteration": 2.6193511486053467 }, { "auxiliary_loss_clip": 0.01068263, "auxiliary_loss_mlp": 0.01040061, "balance_loss_clip": 1.0336616, "balance_loss_mlp": 1.02518964, "epoch": 0.8562152412445513, "flos": 30665198995200.0, "grad_norm": 1.6476205784607059, "language_loss": 0.62131298, "learning_rate": 2.1289659107441182e-07, "loss": 0.64239621, "num_input_tokens_seen": 307125580, "step": 14241, "time_per_iteration": 2.7614312171936035 }, { "auxiliary_loss_clip": 0.01112623, "auxiliary_loss_mlp": 0.01036862, "balance_loss_clip": 1.03619862, "balance_loss_mlp": 1.02343321, "epoch": 0.8562753644972193, "flos": 31576144838400.0, "grad_norm": 2.266500331980379, "language_loss": 0.74537355, "learning_rate": 2.1272177212552855e-07, "loss": 0.76686835, "num_input_tokens_seen": 307147625, "step": 14242, "time_per_iteration": 2.6258413791656494 }, { "auxiliary_loss_clip": 0.01043356, "auxiliary_loss_mlp": 0.01049301, "balance_loss_clip": 1.0376476, "balance_loss_mlp": 1.03507984, "epoch": 0.8563354877498872, "flos": 26213640618240.0, "grad_norm": 2.248077645392886, "language_loss": 0.7636081, "learning_rate": 2.1254702095041498e-07, "loss": 0.78453457, "num_input_tokens_seen": 307164665, "step": 14243, "time_per_iteration": 2.819819927215576 }, { "auxiliary_loss_clip": 0.01088321, "auxiliary_loss_mlp": 0.00769311, "balance_loss_clip": 1.03758311, "balance_loss_mlp": 1.00028658, "epoch": 0.8563956110025552, "flos": 24134341092480.0, "grad_norm": 2.314406650865767, "language_loss": 0.68075836, "learning_rate": 2.123723375556974e-07, "loss": 0.69933462, "num_input_tokens_seen": 307182530, "step": 14244, "time_per_iteration": 2.668156147003174 }, { "auxiliary_loss_clip": 0.01020209, "auxiliary_loss_mlp": 0.01006142, "balance_loss_clip": 1.0066725, "balance_loss_mlp": 1.00496769, "epoch": 0.8564557342552233, "flos": 56271986311680.0, "grad_norm": 0.7568226385522613, "language_loss": 0.58461487, "learning_rate": 2.1219772194800046e-07, "loss": 0.60487843, "num_input_tokens_seen": 307241240, "step": 14245, "time_per_iteration": 3.0361111164093018 }, { "auxiliary_loss_clip": 0.01102848, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.03873086, "balance_loss_mlp": 1.01862907, "epoch": 0.8565158575078912, "flos": 23440618748160.0, "grad_norm": 1.7549005151263664, "language_loss": 0.77337581, "learning_rate": 2.1202317413394488e-07, "loss": 0.79471886, "num_input_tokens_seen": 307261485, "step": 14246, "time_per_iteration": 2.630526542663574 }, { "auxiliary_loss_clip": 0.01082478, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.03102589, "balance_loss_mlp": 1.01518607, "epoch": 0.8565759807605592, "flos": 20375930442240.0, "grad_norm": 1.8941484357847163, "language_loss": 0.81755006, "learning_rate": 2.1184869412014938e-07, "loss": 0.83865154, "num_input_tokens_seen": 307279160, "step": 14247, "time_per_iteration": 2.637540578842163 }, { "auxiliary_loss_clip": 0.01088373, "auxiliary_loss_mlp": 0.01031625, "balance_loss_clip": 1.03624964, "balance_loss_mlp": 1.01832116, "epoch": 0.8566361040132271, "flos": 18807101049600.0, "grad_norm": 1.8985078396153772, "language_loss": 0.77648062, "learning_rate": 2.1167428191323112e-07, "loss": 0.79768062, "num_input_tokens_seen": 307297920, "step": 14248, "time_per_iteration": 2.637140989303589 }, { "auxiliary_loss_clip": 0.01059574, "auxiliary_loss_mlp": 0.01038673, "balance_loss_clip": 1.03150558, "balance_loss_mlp": 1.02398682, "epoch": 0.8566962272658951, "flos": 24535355506560.0, "grad_norm": 1.8205967022668303, "language_loss": 0.78117526, "learning_rate": 2.1149993751980278e-07, "loss": 0.8021577, "num_input_tokens_seen": 307318320, "step": 14249, "time_per_iteration": 2.747084856033325 }, { "auxiliary_loss_clip": 0.01082913, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.03500676, "balance_loss_mlp": 1.02062345, "epoch": 0.856756350518563, "flos": 23178506227200.0, "grad_norm": 1.834584951570381, "language_loss": 0.78369069, "learning_rate": 2.1132566094647597e-07, "loss": 0.80485034, "num_input_tokens_seen": 307336720, "step": 14250, "time_per_iteration": 2.6694507598876953 }, { "auxiliary_loss_clip": 0.01085775, "auxiliary_loss_mlp": 0.01030606, "balance_loss_clip": 1.03689909, "balance_loss_mlp": 1.01948988, "epoch": 0.856816473771231, "flos": 20808581760000.0, "grad_norm": 1.7702839302991833, "language_loss": 0.79165637, "learning_rate": 2.1115145219985942e-07, "loss": 0.81282026, "num_input_tokens_seen": 307354120, "step": 14251, "time_per_iteration": 2.61769962310791 }, { "auxiliary_loss_clip": 0.01071172, "auxiliary_loss_mlp": 0.01031805, "balance_loss_clip": 1.03706813, "balance_loss_mlp": 1.01999116, "epoch": 0.856876597023899, "flos": 20228157889920.0, "grad_norm": 2.063789660652868, "language_loss": 0.61335462, "learning_rate": 2.1097731128656005e-07, "loss": 0.63438439, "num_input_tokens_seen": 307373165, "step": 14252, "time_per_iteration": 2.730942964553833 }, { "auxiliary_loss_clip": 0.01088715, "auxiliary_loss_mlp": 0.01037397, "balance_loss_clip": 1.04091692, "balance_loss_mlp": 1.02395606, "epoch": 0.856936720276567, "flos": 18296128126080.0, "grad_norm": 1.8690578228710872, "language_loss": 0.69612849, "learning_rate": 2.1080323821317924e-07, "loss": 0.71738964, "num_input_tokens_seen": 307391000, "step": 14253, "time_per_iteration": 2.6573426723480225 }, { "auxiliary_loss_clip": 0.01013485, "auxiliary_loss_mlp": 0.01001116, "balance_loss_clip": 1.00999308, "balance_loss_mlp": 1.0000428, "epoch": 0.8569968435292349, "flos": 69878394933120.0, "grad_norm": 0.7842094362693159, "language_loss": 0.59178007, "learning_rate": 2.1062923298631907e-07, "loss": 0.61192608, "num_input_tokens_seen": 307452865, "step": 14254, "time_per_iteration": 3.2271313667297363 }, { "auxiliary_loss_clip": 0.0108384, "auxiliary_loss_mlp": 0.01034196, "balance_loss_clip": 1.0343616, "balance_loss_mlp": 1.02042699, "epoch": 0.8570569667819029, "flos": 25848572739840.0, "grad_norm": 1.7290830197798204, "language_loss": 0.80958641, "learning_rate": 2.1045529561257825e-07, "loss": 0.8307668, "num_input_tokens_seen": 307471940, "step": 14255, "time_per_iteration": 2.6941943168640137 }, { "auxiliary_loss_clip": 0.011065, "auxiliary_loss_mlp": 0.01024921, "balance_loss_clip": 1.03668928, "balance_loss_mlp": 1.01289284, "epoch": 0.8571170900345708, "flos": 23257115141760.0, "grad_norm": 1.9710027507831065, "language_loss": 0.67309523, "learning_rate": 2.1028142609855126e-07, "loss": 0.69440937, "num_input_tokens_seen": 307488745, "step": 14256, "time_per_iteration": 2.719081163406372 }, { "auxiliary_loss_clip": 0.01099477, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 1.037992, "balance_loss_mlp": 1.01950645, "epoch": 0.8571772132872388, "flos": 18917670090240.0, "grad_norm": 2.031884958657008, "language_loss": 0.70139217, "learning_rate": 2.1010762445083218e-07, "loss": 0.72269881, "num_input_tokens_seen": 307506855, "step": 14257, "time_per_iteration": 2.600598096847534 }, { "auxiliary_loss_clip": 0.01073361, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.03339398, "balance_loss_mlp": 1.01963735, "epoch": 0.8572373365399069, "flos": 33250120318080.0, "grad_norm": 2.468248667135471, "language_loss": 0.77000117, "learning_rate": 2.0993389067601197e-07, "loss": 0.79105937, "num_input_tokens_seen": 307526115, "step": 14258, "time_per_iteration": 2.757704973220825 }, { "auxiliary_loss_clip": 0.01096583, "auxiliary_loss_mlp": 0.00769575, "balance_loss_clip": 1.0357585, "balance_loss_mlp": 1.00029516, "epoch": 0.8572974597925748, "flos": 23327535755520.0, "grad_norm": 1.474412147771869, "language_loss": 0.6799866, "learning_rate": 2.0976022478067735e-07, "loss": 0.69864815, "num_input_tokens_seen": 307545230, "step": 14259, "time_per_iteration": 2.6122398376464844 }, { "auxiliary_loss_clip": 0.010953, "auxiliary_loss_mlp": 0.0103545, "balance_loss_clip": 1.03352249, "balance_loss_mlp": 1.02250957, "epoch": 0.8573575830452428, "flos": 24535858296960.0, "grad_norm": 1.6836228896931322, "language_loss": 0.77251399, "learning_rate": 2.0958662677141437e-07, "loss": 0.79382151, "num_input_tokens_seen": 307564900, "step": 14260, "time_per_iteration": 2.6170718669891357 }, { "auxiliary_loss_clip": 0.01083087, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.03345168, "balance_loss_mlp": 1.0186913, "epoch": 0.8574177062979107, "flos": 24165403378560.0, "grad_norm": 1.694275563361149, "language_loss": 0.74151957, "learning_rate": 2.09413096654806e-07, "loss": 0.76266909, "num_input_tokens_seen": 307583500, "step": 14261, "time_per_iteration": 4.178469181060791 }, { "auxiliary_loss_clip": 0.0109609, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.03748691, "balance_loss_mlp": 1.01923871, "epoch": 0.8574778295505787, "flos": 17930737025280.0, "grad_norm": 1.9745240066766159, "language_loss": 0.78983176, "learning_rate": 2.0923963443743276e-07, "loss": 0.81111997, "num_input_tokens_seen": 307601430, "step": 14262, "time_per_iteration": 2.646378993988037 }, { "auxiliary_loss_clip": 0.0107326, "auxiliary_loss_mlp": 0.01032783, "balance_loss_clip": 1.03582883, "balance_loss_mlp": 1.02097511, "epoch": 0.8575379528032466, "flos": 21580697537280.0, "grad_norm": 1.674172506907798, "language_loss": 0.67816055, "learning_rate": 2.0906624012587203e-07, "loss": 0.69922101, "num_input_tokens_seen": 307621495, "step": 14263, "time_per_iteration": 4.332361698150635 }, { "auxiliary_loss_clip": 0.01072214, "auxiliary_loss_mlp": 0.00770907, "balance_loss_clip": 1.03429055, "balance_loss_mlp": 1.00025988, "epoch": 0.8575980760559146, "flos": 21761579450880.0, "grad_norm": 1.4408705629721363, "language_loss": 0.79718733, "learning_rate": 2.088929137266986e-07, "loss": 0.81561852, "num_input_tokens_seen": 307640840, "step": 14264, "time_per_iteration": 2.753828287124634 }, { "auxiliary_loss_clip": 0.01071482, "auxiliary_loss_mlp": 0.01039645, "balance_loss_clip": 1.03247488, "balance_loss_mlp": 1.02618599, "epoch": 0.8576581993085826, "flos": 34386442047360.0, "grad_norm": 1.2896850911673399, "language_loss": 0.69861013, "learning_rate": 2.0871965524648582e-07, "loss": 0.71972132, "num_input_tokens_seen": 307663820, "step": 14265, "time_per_iteration": 4.417909145355225 }, { "auxiliary_loss_clip": 0.01105479, "auxiliary_loss_mlp": 0.01028348, "balance_loss_clip": 1.03650212, "balance_loss_mlp": 1.01695776, "epoch": 0.8577183225612506, "flos": 23222497409280.0, "grad_norm": 1.6592250093825642, "language_loss": 0.66188025, "learning_rate": 2.085464646918027e-07, "loss": 0.68321854, "num_input_tokens_seen": 307682385, "step": 14266, "time_per_iteration": 2.6130142211914062 }, { "auxiliary_loss_clip": 0.01087662, "auxiliary_loss_mlp": 0.01032473, "balance_loss_clip": 1.03722739, "balance_loss_mlp": 1.02009344, "epoch": 0.8577784458139185, "flos": 28804164462720.0, "grad_norm": 1.6281862094757322, "language_loss": 0.75571585, "learning_rate": 2.0837334206921731e-07, "loss": 0.77691722, "num_input_tokens_seen": 307704680, "step": 14267, "time_per_iteration": 4.48302960395813 }, { "auxiliary_loss_clip": 0.01095891, "auxiliary_loss_mlp": 0.01032819, "balance_loss_clip": 1.03645444, "balance_loss_mlp": 1.02119589, "epoch": 0.8578385690665865, "flos": 19755573626880.0, "grad_norm": 1.7702696425064848, "language_loss": 0.87967706, "learning_rate": 2.082002873852946e-07, "loss": 0.9009642, "num_input_tokens_seen": 307723245, "step": 14268, "time_per_iteration": 2.7304728031158447 }, { "auxiliary_loss_clip": 0.01098203, "auxiliary_loss_mlp": 0.01036419, "balance_loss_clip": 1.03701484, "balance_loss_mlp": 1.02400303, "epoch": 0.8578986923192544, "flos": 20704082117760.0, "grad_norm": 2.207459116191671, "language_loss": 0.72899628, "learning_rate": 2.0802730064659667e-07, "loss": 0.75034249, "num_input_tokens_seen": 307742510, "step": 14269, "time_per_iteration": 2.686720848083496 }, { "auxiliary_loss_clip": 0.01099494, "auxiliary_loss_mlp": 0.01032209, "balance_loss_clip": 1.03617907, "balance_loss_mlp": 1.01991236, "epoch": 0.8579588155719224, "flos": 36101715189120.0, "grad_norm": 1.7486556391574948, "language_loss": 0.66497004, "learning_rate": 2.0785438185968252e-07, "loss": 0.68628705, "num_input_tokens_seen": 307766030, "step": 14270, "time_per_iteration": 2.759577751159668 }, { "auxiliary_loss_clip": 0.01082271, "auxiliary_loss_mlp": 0.0103104, "balance_loss_clip": 1.0320828, "balance_loss_mlp": 1.01854658, "epoch": 0.8580189388245905, "flos": 22853479034880.0, "grad_norm": 1.9964784224395893, "language_loss": 0.73861098, "learning_rate": 2.0768153103110997e-07, "loss": 0.75974405, "num_input_tokens_seen": 307785800, "step": 14271, "time_per_iteration": 2.6652464866638184 }, { "auxiliary_loss_clip": 0.00990812, "auxiliary_loss_mlp": 0.00751033, "balance_loss_clip": 1.00730669, "balance_loss_mlp": 0.99962157, "epoch": 0.8580790620772584, "flos": 69642104290560.0, "grad_norm": 0.808728293182982, "language_loss": 0.595052, "learning_rate": 2.0750874816743358e-07, "loss": 0.61247051, "num_input_tokens_seen": 307850995, "step": 14272, "time_per_iteration": 3.3493616580963135 }, { "auxiliary_loss_clip": 0.0108737, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 1.03556502, "balance_loss_mlp": 1.02342474, "epoch": 0.8581391853299264, "flos": 13334243270400.0, "grad_norm": 1.7519497448745491, "language_loss": 0.75282109, "learning_rate": 2.0733603327520499e-07, "loss": 0.7740711, "num_input_tokens_seen": 307868585, "step": 14273, "time_per_iteration": 2.6791751384735107 }, { "auxiliary_loss_clip": 0.01097542, "auxiliary_loss_mlp": 0.01029287, "balance_loss_clip": 1.03653765, "balance_loss_mlp": 1.01684737, "epoch": 0.8581993085825943, "flos": 19645651031040.0, "grad_norm": 1.8670463657155183, "language_loss": 0.82038534, "learning_rate": 2.0716338636097385e-07, "loss": 0.84165359, "num_input_tokens_seen": 307886820, "step": 14274, "time_per_iteration": 2.617358446121216 }, { "auxiliary_loss_clip": 0.0101945, "auxiliary_loss_mlp": 0.01002494, "balance_loss_clip": 1.00673366, "balance_loss_mlp": 1.00137389, "epoch": 0.8582594318352623, "flos": 55825077294720.0, "grad_norm": 0.7943422785901219, "language_loss": 0.60750306, "learning_rate": 2.0699080743128672e-07, "loss": 0.6277225, "num_input_tokens_seen": 307944020, "step": 14275, "time_per_iteration": 3.2472341060638428 }, { "auxiliary_loss_clip": 0.01096248, "auxiliary_loss_mlp": 0.01028909, "balance_loss_clip": 1.03805137, "balance_loss_mlp": 1.01562345, "epoch": 0.8583195550879302, "flos": 24279563779200.0, "grad_norm": 2.0431646133306764, "language_loss": 0.59516066, "learning_rate": 2.0681829649268768e-07, "loss": 0.61641222, "num_input_tokens_seen": 307961055, "step": 14276, "time_per_iteration": 2.7009382247924805 }, { "auxiliary_loss_clip": 0.0108586, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.03556418, "balance_loss_mlp": 1.02205861, "epoch": 0.8583796783405983, "flos": 13444129952640.0, "grad_norm": 2.25300331444078, "language_loss": 0.76484519, "learning_rate": 2.0664585355171838e-07, "loss": 0.786044, "num_input_tokens_seen": 307978690, "step": 14277, "time_per_iteration": 2.6383044719696045 }, { "auxiliary_loss_clip": 0.01085815, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.0350821, "balance_loss_mlp": 1.01708126, "epoch": 0.8584398015932662, "flos": 16180271533440.0, "grad_norm": 1.614064459915635, "language_loss": 0.83699441, "learning_rate": 2.0647347861491803e-07, "loss": 0.85814643, "num_input_tokens_seen": 307995870, "step": 14278, "time_per_iteration": 2.690840721130371 }, { "auxiliary_loss_clip": 0.01087706, "auxiliary_loss_mlp": 0.01030174, "balance_loss_clip": 1.03669083, "balance_loss_mlp": 1.01709092, "epoch": 0.8584999248459342, "flos": 17450431338240.0, "grad_norm": 2.022696664220824, "language_loss": 0.74557948, "learning_rate": 2.0630117168882366e-07, "loss": 0.76675826, "num_input_tokens_seen": 308013645, "step": 14279, "time_per_iteration": 2.6342451572418213 }, { "auxiliary_loss_clip": 0.01107856, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.03726792, "balance_loss_mlp": 1.0206275, "epoch": 0.8585600480986021, "flos": 23441013797760.0, "grad_norm": 2.2412241965372095, "language_loss": 0.66438127, "learning_rate": 2.0612893277996845e-07, "loss": 0.68578601, "num_input_tokens_seen": 308032490, "step": 14280, "time_per_iteration": 2.586599349975586 }, { "auxiliary_loss_clip": 0.01095719, "auxiliary_loss_mlp": 0.01028274, "balance_loss_clip": 1.03592777, "balance_loss_mlp": 1.01651978, "epoch": 0.8586201713512701, "flos": 19937927998080.0, "grad_norm": 1.86716453090562, "language_loss": 0.62667966, "learning_rate": 2.0595676189488343e-07, "loss": 0.64791965, "num_input_tokens_seen": 308052110, "step": 14281, "time_per_iteration": 2.6187994480133057 }, { "auxiliary_loss_clip": 0.01084456, "auxiliary_loss_mlp": 0.00770032, "balance_loss_clip": 1.03628945, "balance_loss_mlp": 1.00014341, "epoch": 0.858680294603938, "flos": 15304769435520.0, "grad_norm": 3.7299826958950493, "language_loss": 0.73169029, "learning_rate": 2.0578465904009845e-07, "loss": 0.7502352, "num_input_tokens_seen": 308070660, "step": 14282, "time_per_iteration": 2.7070963382720947 }, { "auxiliary_loss_clip": 0.01080016, "auxiliary_loss_mlp": 0.01030436, "balance_loss_clip": 1.0322001, "balance_loss_mlp": 1.01892662, "epoch": 0.858740417856606, "flos": 22711237176960.0, "grad_norm": 1.8508205946022054, "language_loss": 0.75599825, "learning_rate": 2.0561262422213832e-07, "loss": 0.77710283, "num_input_tokens_seen": 308089520, "step": 14283, "time_per_iteration": 2.70784854888916 }, { "auxiliary_loss_clip": 0.01093289, "auxiliary_loss_mlp": 0.01032374, "balance_loss_clip": 1.03351057, "balance_loss_mlp": 1.01973772, "epoch": 0.8588005411092741, "flos": 34054303962240.0, "grad_norm": 1.810517869683259, "language_loss": 0.60200775, "learning_rate": 2.0544065744752736e-07, "loss": 0.62326431, "num_input_tokens_seen": 308111545, "step": 14284, "time_per_iteration": 2.804454803466797 }, { "auxiliary_loss_clip": 0.01080997, "auxiliary_loss_mlp": 0.01030838, "balance_loss_clip": 1.03671587, "balance_loss_mlp": 1.01877391, "epoch": 0.858860664361942, "flos": 28913584268160.0, "grad_norm": 1.9759393563383274, "language_loss": 0.75834155, "learning_rate": 2.0526875872278749e-07, "loss": 0.77945989, "num_input_tokens_seen": 308129690, "step": 14285, "time_per_iteration": 2.717355489730835 }, { "auxiliary_loss_clip": 0.01096428, "auxiliary_loss_mlp": 0.01034968, "balance_loss_clip": 1.03976953, "balance_loss_mlp": 1.0222249, "epoch": 0.85892078761461, "flos": 19792525743360.0, "grad_norm": 2.2818993237689242, "language_loss": 0.7433964, "learning_rate": 2.0509692805443524e-07, "loss": 0.76471031, "num_input_tokens_seen": 308147410, "step": 14286, "time_per_iteration": 2.60193133354187 }, { "auxiliary_loss_clip": 0.01009396, "auxiliary_loss_mlp": 0.00750956, "balance_loss_clip": 1.0070982, "balance_loss_mlp": 0.99964851, "epoch": 0.8589809108672779, "flos": 67106630039040.0, "grad_norm": 0.7818074542698659, "language_loss": 0.4943513, "learning_rate": 2.0492516544898718e-07, "loss": 0.51195478, "num_input_tokens_seen": 308204875, "step": 14287, "time_per_iteration": 3.223233461380005 }, { "auxiliary_loss_clip": 0.01099243, "auxiliary_loss_mlp": 0.01030496, "balance_loss_clip": 1.03820276, "balance_loss_mlp": 1.01868248, "epoch": 0.8590410341199459, "flos": 29716259541120.0, "grad_norm": 2.0753846040431574, "language_loss": 0.79119551, "learning_rate": 2.0475347091295704e-07, "loss": 0.81249291, "num_input_tokens_seen": 308225690, "step": 14288, "time_per_iteration": 2.8012468814849854 }, { "auxiliary_loss_clip": 0.01070856, "auxiliary_loss_mlp": 0.01034723, "balance_loss_clip": 1.03844345, "balance_loss_mlp": 1.02160382, "epoch": 0.8591011573726138, "flos": 23987430466560.0, "grad_norm": 2.333742079437343, "language_loss": 0.80807364, "learning_rate": 2.045818444528553e-07, "loss": 0.82912946, "num_input_tokens_seen": 308245255, "step": 14289, "time_per_iteration": 2.677363634109497 }, { "auxiliary_loss_clip": 0.01101023, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.03798854, "balance_loss_mlp": 1.01974702, "epoch": 0.8591612806252819, "flos": 14428656806400.0, "grad_norm": 1.8411584307927096, "language_loss": 0.65171552, "learning_rate": 2.0441028607518973e-07, "loss": 0.67304742, "num_input_tokens_seen": 308261755, "step": 14290, "time_per_iteration": 2.6130077838897705 }, { "auxiliary_loss_clip": 0.01088699, "auxiliary_loss_mlp": 0.01029909, "balance_loss_clip": 1.0362072, "balance_loss_mlp": 1.01692736, "epoch": 0.8592214038779498, "flos": 31577150419200.0, "grad_norm": 1.9868152248145707, "language_loss": 0.55034781, "learning_rate": 2.0423879578646642e-07, "loss": 0.57153386, "num_input_tokens_seen": 308285145, "step": 14291, "time_per_iteration": 2.7079780101776123 }, { "auxiliary_loss_clip": 0.0110119, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.03754354, "balance_loss_mlp": 1.01885247, "epoch": 0.8592815271306178, "flos": 17457290835840.0, "grad_norm": 2.056468770778706, "language_loss": 0.71314991, "learning_rate": 2.0406737359318792e-07, "loss": 0.73447263, "num_input_tokens_seen": 308304130, "step": 14292, "time_per_iteration": 2.595897674560547 }, { "auxiliary_loss_clip": 0.01098184, "auxiliary_loss_mlp": 0.01034158, "balance_loss_clip": 1.03526211, "balance_loss_mlp": 1.02187383, "epoch": 0.8593416503832857, "flos": 25411360394880.0, "grad_norm": 1.470631901330716, "language_loss": 0.71314609, "learning_rate": 2.038960195018542e-07, "loss": 0.73446953, "num_input_tokens_seen": 308324670, "step": 14293, "time_per_iteration": 2.652717351913452 }, { "auxiliary_loss_clip": 0.01080648, "auxiliary_loss_mlp": 0.01034358, "balance_loss_clip": 1.03720033, "balance_loss_mlp": 1.02217507, "epoch": 0.8594017736359537, "flos": 20996646393600.0, "grad_norm": 1.543293476083091, "language_loss": 0.6855827, "learning_rate": 2.0372473351896358e-07, "loss": 0.70673275, "num_input_tokens_seen": 308344215, "step": 14294, "time_per_iteration": 2.6766042709350586 }, { "auxiliary_loss_clip": 0.01104946, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.03467357, "balance_loss_mlp": 1.01901507, "epoch": 0.8594618968886216, "flos": 22091059929600.0, "grad_norm": 1.9038081617192384, "language_loss": 0.77887809, "learning_rate": 2.0355351565101087e-07, "loss": 0.80023754, "num_input_tokens_seen": 308360520, "step": 14295, "time_per_iteration": 2.6753733158111572 }, { "auxiliary_loss_clip": 0.01085392, "auxiliary_loss_mlp": 0.01037375, "balance_loss_clip": 1.03575659, "balance_loss_mlp": 1.02281332, "epoch": 0.8595220201412896, "flos": 11656245467520.0, "grad_norm": 2.8815633850100095, "language_loss": 0.69029182, "learning_rate": 2.0338236590448975e-07, "loss": 0.71151948, "num_input_tokens_seen": 308376865, "step": 14296, "time_per_iteration": 2.6722471714019775 }, { "auxiliary_loss_clip": 0.01081568, "auxiliary_loss_mlp": 0.01033296, "balance_loss_clip": 1.03467476, "balance_loss_mlp": 1.02070773, "epoch": 0.8595821433939577, "flos": 25040366772480.0, "grad_norm": 2.176741931564133, "language_loss": 0.78606057, "learning_rate": 2.0321128428588842e-07, "loss": 0.80720925, "num_input_tokens_seen": 308395870, "step": 14297, "time_per_iteration": 2.6577630043029785 }, { "auxiliary_loss_clip": 0.01091905, "auxiliary_loss_mlp": 0.01032076, "balance_loss_clip": 1.03271341, "balance_loss_mlp": 1.02086425, "epoch": 0.8596422666466256, "flos": 28511528359680.0, "grad_norm": 2.673036998280705, "language_loss": 0.67951548, "learning_rate": 2.030402708016954e-07, "loss": 0.7007553, "num_input_tokens_seen": 308417250, "step": 14298, "time_per_iteration": 2.7069945335388184 }, { "auxiliary_loss_clip": 0.01083251, "auxiliary_loss_mlp": 0.01035882, "balance_loss_clip": 1.0348295, "balance_loss_mlp": 1.02360308, "epoch": 0.8597023898992936, "flos": 13589137157760.0, "grad_norm": 2.2714540225430775, "language_loss": 0.68807364, "learning_rate": 2.0286932545839576e-07, "loss": 0.70926499, "num_input_tokens_seen": 308434565, "step": 14299, "time_per_iteration": 2.637234687805176 }, { "auxiliary_loss_clip": 0.01080144, "auxiliary_loss_mlp": 0.01036834, "balance_loss_clip": 1.03766489, "balance_loss_mlp": 1.02434683, "epoch": 0.8597625131519615, "flos": 32300821728000.0, "grad_norm": 2.455453131727374, "language_loss": 0.71315849, "learning_rate": 2.0269844826247096e-07, "loss": 0.73432827, "num_input_tokens_seen": 308450040, "step": 14300, "time_per_iteration": 4.307279109954834 }, { "auxiliary_loss_clip": 0.01080749, "auxiliary_loss_mlp": 0.01035226, "balance_loss_clip": 1.03184569, "balance_loss_mlp": 1.02227378, "epoch": 0.8598226364046295, "flos": 28730367970560.0, "grad_norm": 2.4178089215843377, "language_loss": 0.69498658, "learning_rate": 2.0252763922040116e-07, "loss": 0.71614629, "num_input_tokens_seen": 308470545, "step": 14301, "time_per_iteration": 2.7081966400146484 }, { "auxiliary_loss_clip": 0.01056383, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.03381944, "balance_loss_mlp": 1.02151251, "epoch": 0.8598827596572974, "flos": 21871825269120.0, "grad_norm": 1.627550751133936, "language_loss": 0.74207568, "learning_rate": 2.023568983386641e-07, "loss": 0.76297808, "num_input_tokens_seen": 308490020, "step": 14302, "time_per_iteration": 2.711632251739502 }, { "auxiliary_loss_clip": 0.01092554, "auxiliary_loss_mlp": 0.01030522, "balance_loss_clip": 1.03438914, "balance_loss_mlp": 1.01904821, "epoch": 0.8599428829099655, "flos": 23767297966080.0, "grad_norm": 1.6910498368057518, "language_loss": 0.83883357, "learning_rate": 2.02186225623733e-07, "loss": 0.86006427, "num_input_tokens_seen": 308509065, "step": 14303, "time_per_iteration": 4.2169249057769775 }, { "auxiliary_loss_clip": 0.01096255, "auxiliary_loss_mlp": 0.01036748, "balance_loss_clip": 1.03428876, "balance_loss_mlp": 1.02355707, "epoch": 0.8600030061626334, "flos": 16212770363520.0, "grad_norm": 2.148560231945797, "language_loss": 0.7746321, "learning_rate": 2.0201562108208025e-07, "loss": 0.7959621, "num_input_tokens_seen": 308524725, "step": 14304, "time_per_iteration": 4.171972990036011 }, { "auxiliary_loss_clip": 0.01110849, "auxiliary_loss_mlp": 0.01035361, "balance_loss_clip": 1.03822732, "balance_loss_mlp": 1.02181315, "epoch": 0.8600631294153014, "flos": 15669370437120.0, "grad_norm": 1.95456339418458, "language_loss": 0.54470098, "learning_rate": 2.0184508472017537e-07, "loss": 0.56616312, "num_input_tokens_seen": 308543525, "step": 14305, "time_per_iteration": 2.594041585922241 }, { "auxiliary_loss_clip": 0.01108772, "auxiliary_loss_mlp": 0.01029221, "balance_loss_clip": 1.03798604, "balance_loss_mlp": 1.01622105, "epoch": 0.8601232526679693, "flos": 17493093717120.0, "grad_norm": 1.9212568904782885, "language_loss": 0.84086001, "learning_rate": 2.0167461654448558e-07, "loss": 0.86223984, "num_input_tokens_seen": 308557995, "step": 14306, "time_per_iteration": 4.086545467376709 }, { "auxiliary_loss_clip": 0.01097083, "auxiliary_loss_mlp": 0.00769534, "balance_loss_clip": 1.03672814, "balance_loss_mlp": 1.00017905, "epoch": 0.8601833759206373, "flos": 26985935963520.0, "grad_norm": 1.3981944993349464, "language_loss": 0.71432567, "learning_rate": 2.01504216561474e-07, "loss": 0.73299187, "num_input_tokens_seen": 308582750, "step": 14307, "time_per_iteration": 2.7123961448669434 }, { "auxiliary_loss_clip": 0.01096964, "auxiliary_loss_mlp": 0.00771884, "balance_loss_clip": 1.03435898, "balance_loss_mlp": 1.00030386, "epoch": 0.8602434991733052, "flos": 25229760209280.0, "grad_norm": 1.8399000779871275, "language_loss": 0.636989, "learning_rate": 2.0133388477760316e-07, "loss": 0.6556775, "num_input_tokens_seen": 308603770, "step": 14308, "time_per_iteration": 2.6409523487091064 }, { "auxiliary_loss_clip": 0.01010709, "auxiliary_loss_mlp": 0.01001248, "balance_loss_clip": 1.00715673, "balance_loss_mlp": 1.00013912, "epoch": 0.8603036224259732, "flos": 71015363107200.0, "grad_norm": 0.6173812153983712, "language_loss": 0.48415971, "learning_rate": 2.0116362119933172e-07, "loss": 0.50427926, "num_input_tokens_seen": 308667735, "step": 14309, "time_per_iteration": 3.2728710174560547 }, { "auxiliary_loss_clip": 0.01054401, "auxiliary_loss_mlp": 0.01034995, "balance_loss_clip": 1.03519821, "balance_loss_mlp": 1.02176285, "epoch": 0.8603637456786413, "flos": 20300625578880.0, "grad_norm": 1.8566830795066585, "language_loss": 0.67076862, "learning_rate": 2.0099342583311563e-07, "loss": 0.69166255, "num_input_tokens_seen": 308686300, "step": 14310, "time_per_iteration": 2.7875287532806396 }, { "auxiliary_loss_clip": 0.01040328, "auxiliary_loss_mlp": 0.01035443, "balance_loss_clip": 1.02937603, "balance_loss_mlp": 1.02352858, "epoch": 0.8604238689313092, "flos": 21835842819840.0, "grad_norm": 1.7678336453099173, "language_loss": 0.7815913, "learning_rate": 2.0082329868540905e-07, "loss": 0.80234909, "num_input_tokens_seen": 308705825, "step": 14311, "time_per_iteration": 2.779208183288574 }, { "auxiliary_loss_clip": 0.01096237, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.03626657, "balance_loss_mlp": 1.0191319, "epoch": 0.8604839921839772, "flos": 18004210295040.0, "grad_norm": 2.0823633926297087, "language_loss": 0.72099596, "learning_rate": 2.006532397626639e-07, "loss": 0.74227214, "num_input_tokens_seen": 308723340, "step": 14312, "time_per_iteration": 2.572300672531128 }, { "auxiliary_loss_clip": 0.01079744, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.03377199, "balance_loss_mlp": 1.02101254, "epoch": 0.8605441154366451, "flos": 16252164604800.0, "grad_norm": 4.48770964436052, "language_loss": 0.77972746, "learning_rate": 2.0048324907132797e-07, "loss": 0.80085838, "num_input_tokens_seen": 308741280, "step": 14313, "time_per_iteration": 2.6455512046813965 }, { "auxiliary_loss_clip": 0.01084267, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.03463316, "balance_loss_mlp": 1.02147377, "epoch": 0.8606042386893131, "flos": 32267065921920.0, "grad_norm": 1.4772487181933294, "language_loss": 0.7305848, "learning_rate": 2.003133266178474e-07, "loss": 0.75178033, "num_input_tokens_seen": 308762875, "step": 14314, "time_per_iteration": 2.760899782180786 }, { "auxiliary_loss_clip": 0.01085045, "auxiliary_loss_mlp": 0.01033377, "balance_loss_clip": 1.03471231, "balance_loss_mlp": 1.02096725, "epoch": 0.860664361941981, "flos": 20229774001920.0, "grad_norm": 1.8071847940662549, "language_loss": 0.68796486, "learning_rate": 2.001434724086657e-07, "loss": 0.70914906, "num_input_tokens_seen": 308780315, "step": 14315, "time_per_iteration": 2.649801254272461 }, { "auxiliary_loss_clip": 0.01096879, "auxiliary_loss_mlp": 0.01032695, "balance_loss_clip": 1.03672695, "balance_loss_mlp": 1.02085114, "epoch": 0.8607244851946491, "flos": 25191622944000.0, "grad_norm": 1.885182281921848, "language_loss": 0.71844518, "learning_rate": 1.9997368645022418e-07, "loss": 0.73974097, "num_input_tokens_seen": 308799435, "step": 14316, "time_per_iteration": 2.7529983520507812 }, { "auxiliary_loss_clip": 0.01090676, "auxiliary_loss_mlp": 0.01030269, "balance_loss_clip": 1.04007196, "balance_loss_mlp": 1.0183723, "epoch": 0.860784608447317, "flos": 20482082110080.0, "grad_norm": 1.9094680545566136, "language_loss": 0.82880986, "learning_rate": 1.9980396874896056e-07, "loss": 0.85001934, "num_input_tokens_seen": 308817730, "step": 14317, "time_per_iteration": 2.6640453338623047 }, { "auxiliary_loss_clip": 0.01090255, "auxiliary_loss_mlp": 0.01030325, "balance_loss_clip": 1.03797185, "balance_loss_mlp": 1.01819539, "epoch": 0.860844731699985, "flos": 50476037696640.0, "grad_norm": 1.6214847591514705, "language_loss": 0.67348385, "learning_rate": 1.996343193113108e-07, "loss": 0.69468963, "num_input_tokens_seen": 308841735, "step": 14318, "time_per_iteration": 2.869259834289551 }, { "auxiliary_loss_clip": 0.01094097, "auxiliary_loss_mlp": 0.01027928, "balance_loss_clip": 1.0362227, "balance_loss_mlp": 1.01671052, "epoch": 0.8609048549526529, "flos": 41172768455040.0, "grad_norm": 1.558793225555784, "language_loss": 0.71354842, "learning_rate": 1.9946473814370911e-07, "loss": 0.73476869, "num_input_tokens_seen": 308865050, "step": 14319, "time_per_iteration": 2.844249963760376 }, { "auxiliary_loss_clip": 0.0109006, "auxiliary_loss_mlp": 0.00769912, "balance_loss_clip": 1.03683519, "balance_loss_mlp": 1.00023806, "epoch": 0.8609649782053209, "flos": 23951196622080.0, "grad_norm": 1.8769046861773884, "language_loss": 0.67780548, "learning_rate": 1.992952252525839e-07, "loss": 0.69640523, "num_input_tokens_seen": 308885375, "step": 14320, "time_per_iteration": 2.6762452125549316 }, { "auxiliary_loss_clip": 0.01080757, "auxiliary_loss_mlp": 0.01037726, "balance_loss_clip": 1.03380013, "balance_loss_mlp": 1.02343893, "epoch": 0.8610251014579888, "flos": 23112574813440.0, "grad_norm": 5.062268799214488, "language_loss": 0.79499638, "learning_rate": 1.9912578064436446e-07, "loss": 0.81618118, "num_input_tokens_seen": 308904700, "step": 14321, "time_per_iteration": 2.7844552993774414 }, { "auxiliary_loss_clip": 0.01092256, "auxiliary_loss_mlp": 0.00770223, "balance_loss_clip": 1.03433347, "balance_loss_mlp": 1.00014472, "epoch": 0.8610852247106568, "flos": 19426811420160.0, "grad_norm": 1.8063677075547142, "language_loss": 0.7084378, "learning_rate": 1.9895640432547567e-07, "loss": 0.72706258, "num_input_tokens_seen": 308922985, "step": 14322, "time_per_iteration": 2.6614699363708496 }, { "auxiliary_loss_clip": 0.01087983, "auxiliary_loss_mlp": 0.01039264, "balance_loss_clip": 1.03474808, "balance_loss_mlp": 1.02575755, "epoch": 0.8611453479633249, "flos": 19312076401920.0, "grad_norm": 1.9402477905188305, "language_loss": 0.56338006, "learning_rate": 1.9878709630234102e-07, "loss": 0.58465254, "num_input_tokens_seen": 308940765, "step": 14323, "time_per_iteration": 2.639302968978882 }, { "auxiliary_loss_clip": 0.01071823, "auxiliary_loss_mlp": 0.01026562, "balance_loss_clip": 1.03276682, "balance_loss_mlp": 1.01427782, "epoch": 0.8612054712159928, "flos": 23253667436160.0, "grad_norm": 2.0929228827532413, "language_loss": 0.75493181, "learning_rate": 1.986178565813801e-07, "loss": 0.77591568, "num_input_tokens_seen": 308960110, "step": 14324, "time_per_iteration": 2.6960513591766357 }, { "auxiliary_loss_clip": 0.01063342, "auxiliary_loss_mlp": 0.01035964, "balance_loss_clip": 1.03498292, "balance_loss_mlp": 1.02134275, "epoch": 0.8612655944686608, "flos": 16028440744320.0, "grad_norm": 2.114341094605167, "language_loss": 0.66620868, "learning_rate": 1.9844868516901036e-07, "loss": 0.68720174, "num_input_tokens_seen": 308976665, "step": 14325, "time_per_iteration": 2.704503297805786 }, { "auxiliary_loss_clip": 0.01099873, "auxiliary_loss_mlp": 0.01030402, "balance_loss_clip": 1.03732955, "balance_loss_mlp": 1.01800442, "epoch": 0.8613257177213287, "flos": 22492720788480.0, "grad_norm": 1.7540053494594063, "language_loss": 0.64823282, "learning_rate": 1.982795820716472e-07, "loss": 0.66953552, "num_input_tokens_seen": 308997015, "step": 14326, "time_per_iteration": 2.634575843811035 }, { "auxiliary_loss_clip": 0.01085647, "auxiliary_loss_mlp": 0.01033577, "balance_loss_clip": 1.03369999, "balance_loss_mlp": 1.02078009, "epoch": 0.8613858409739967, "flos": 17238056175360.0, "grad_norm": 1.9850234614136824, "language_loss": 0.84380805, "learning_rate": 1.9811054729570253e-07, "loss": 0.86500031, "num_input_tokens_seen": 309015250, "step": 14327, "time_per_iteration": 2.653275728225708 }, { "auxiliary_loss_clip": 0.01098118, "auxiliary_loss_mlp": 0.01031945, "balance_loss_clip": 1.0356977, "balance_loss_mlp": 1.01924908, "epoch": 0.8614459642266646, "flos": 22821123859200.0, "grad_norm": 2.1125726227452186, "language_loss": 0.7496419, "learning_rate": 1.9794158084758661e-07, "loss": 0.77094257, "num_input_tokens_seen": 309034140, "step": 14328, "time_per_iteration": 2.644585132598877 }, { "auxiliary_loss_clip": 0.01096938, "auxiliary_loss_mlp": 0.01027149, "balance_loss_clip": 1.03526139, "balance_loss_mlp": 1.01539493, "epoch": 0.8615060874793327, "flos": 26504301473280.0, "grad_norm": 1.8484016306146063, "language_loss": 0.80306005, "learning_rate": 1.9777268273370673e-07, "loss": 0.82430089, "num_input_tokens_seen": 309055075, "step": 14329, "time_per_iteration": 2.723478078842163 }, { "auxiliary_loss_clip": 0.01083147, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.03760588, "balance_loss_mlp": 1.01930857, "epoch": 0.8615662107320006, "flos": 24061011477120.0, "grad_norm": 2.0863615030267937, "language_loss": 0.76824546, "learning_rate": 1.9760385296046757e-07, "loss": 0.78939486, "num_input_tokens_seen": 309074650, "step": 14330, "time_per_iteration": 2.812311887741089 }, { "auxiliary_loss_clip": 0.01096755, "auxiliary_loss_mlp": 0.01030872, "balance_loss_clip": 1.03553391, "balance_loss_mlp": 1.01855159, "epoch": 0.8616263339846686, "flos": 24165044242560.0, "grad_norm": 1.8215281342853327, "language_loss": 0.64920008, "learning_rate": 1.974350915342702e-07, "loss": 0.67047632, "num_input_tokens_seen": 309094385, "step": 14331, "time_per_iteration": 2.6918468475341797 }, { "auxiliary_loss_clip": 0.01086033, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.03811228, "balance_loss_mlp": 1.02118349, "epoch": 0.8616864572373365, "flos": 21724340025600.0, "grad_norm": 1.6141703486069339, "language_loss": 0.760149, "learning_rate": 1.9726639846151506e-07, "loss": 0.7813338, "num_input_tokens_seen": 309111815, "step": 14332, "time_per_iteration": 2.7376909255981445 }, { "auxiliary_loss_clip": 0.01096761, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 1.03702247, "balance_loss_mlp": 1.01777267, "epoch": 0.8617465804900045, "flos": 23766651521280.0, "grad_norm": 1.7306536007075406, "language_loss": 0.67241013, "learning_rate": 1.9709777374859904e-07, "loss": 0.69368982, "num_input_tokens_seen": 309131385, "step": 14333, "time_per_iteration": 2.6434760093688965 }, { "auxiliary_loss_clip": 0.01086243, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.03663921, "balance_loss_mlp": 1.02411211, "epoch": 0.8618067037426724, "flos": 37703941251840.0, "grad_norm": 1.6353598696173437, "language_loss": 0.62017745, "learning_rate": 1.969292174019157e-07, "loss": 0.64142239, "num_input_tokens_seen": 309155020, "step": 14334, "time_per_iteration": 2.758512258529663 }, { "auxiliary_loss_clip": 0.01080188, "auxiliary_loss_mlp": 0.01048728, "balance_loss_clip": 1.0376997, "balance_loss_mlp": 1.03463769, "epoch": 0.8618668269953405, "flos": 21471026336640.0, "grad_norm": 4.004935288615531, "language_loss": 0.69439906, "learning_rate": 1.967607294278577e-07, "loss": 0.71568823, "num_input_tokens_seen": 309172865, "step": 14335, "time_per_iteration": 2.69771671295166 }, { "auxiliary_loss_clip": 0.01100982, "auxiliary_loss_mlp": 0.01035912, "balance_loss_clip": 1.03802538, "balance_loss_mlp": 1.02374029, "epoch": 0.8619269502480085, "flos": 22232691256320.0, "grad_norm": 3.0287384377889297, "language_loss": 0.82912672, "learning_rate": 1.965923098328135e-07, "loss": 0.85049564, "num_input_tokens_seen": 309193575, "step": 14336, "time_per_iteration": 2.6209864616394043 }, { "auxiliary_loss_clip": 0.01112224, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.03766823, "balance_loss_mlp": 1.02133584, "epoch": 0.8619870735006764, "flos": 22710626645760.0, "grad_norm": 2.099074168500333, "language_loss": 0.67489713, "learning_rate": 1.9642395862316907e-07, "loss": 0.69635832, "num_input_tokens_seen": 309212680, "step": 14337, "time_per_iteration": 2.6033341884613037 }, { "auxiliary_loss_clip": 0.01069511, "auxiliary_loss_mlp": 0.01033217, "balance_loss_clip": 1.03057778, "balance_loss_mlp": 1.02058089, "epoch": 0.8620471967533444, "flos": 37520293991040.0, "grad_norm": 1.5608583142668484, "language_loss": 0.6694777, "learning_rate": 1.962556758053089e-07, "loss": 0.69050497, "num_input_tokens_seen": 309234485, "step": 14338, "time_per_iteration": 2.775123119354248 }, { "auxiliary_loss_clip": 0.01086678, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.03658581, "balance_loss_mlp": 1.02030885, "epoch": 0.8621073200060123, "flos": 19682459493120.0, "grad_norm": 1.9189965100666158, "language_loss": 0.62008345, "learning_rate": 1.9608746138561448e-07, "loss": 0.64126867, "num_input_tokens_seen": 309253630, "step": 14339, "time_per_iteration": 2.696450710296631 }, { "auxiliary_loss_clip": 0.01086707, "auxiliary_loss_mlp": 0.00770344, "balance_loss_clip": 1.03489327, "balance_loss_mlp": 1.00020528, "epoch": 0.8621674432586803, "flos": 14536855549440.0, "grad_norm": 1.8496565464342125, "language_loss": 0.62634254, "learning_rate": 1.9591931537046458e-07, "loss": 0.64491296, "num_input_tokens_seen": 309270950, "step": 14340, "time_per_iteration": 4.219670295715332 }, { "auxiliary_loss_clip": 0.01060496, "auxiliary_loss_mlp": 0.0102529, "balance_loss_clip": 1.03393662, "balance_loss_mlp": 1.01384556, "epoch": 0.8622275665113482, "flos": 20740100480640.0, "grad_norm": 1.5540537291722216, "language_loss": 0.79882658, "learning_rate": 1.9575123776623493e-07, "loss": 0.81968445, "num_input_tokens_seen": 309288780, "step": 14341, "time_per_iteration": 2.7992727756500244 }, { "auxiliary_loss_clip": 0.01092904, "auxiliary_loss_mlp": 0.01032082, "balance_loss_clip": 1.0364188, "balance_loss_mlp": 1.02028048, "epoch": 0.8622876897640163, "flos": 24715914197760.0, "grad_norm": 1.6849671618732158, "language_loss": 0.74542058, "learning_rate": 1.9558322857929887e-07, "loss": 0.76667047, "num_input_tokens_seen": 309310875, "step": 14342, "time_per_iteration": 4.3738038539886475 }, { "auxiliary_loss_clip": 0.01069834, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.03554666, "balance_loss_mlp": 1.01579463, "epoch": 0.8623478130166842, "flos": 17457362663040.0, "grad_norm": 1.6056166986401446, "language_loss": 0.68522966, "learning_rate": 1.95415287816028e-07, "loss": 0.7062155, "num_input_tokens_seen": 309329900, "step": 14343, "time_per_iteration": 4.237400770187378 }, { "auxiliary_loss_clip": 0.01096424, "auxiliary_loss_mlp": 0.01042074, "balance_loss_clip": 1.03559923, "balance_loss_mlp": 1.02879965, "epoch": 0.8624079362693522, "flos": 18109176814080.0, "grad_norm": 1.6148942161800302, "language_loss": 0.6802907, "learning_rate": 1.9524741548278967e-07, "loss": 0.70167565, "num_input_tokens_seen": 309347870, "step": 14344, "time_per_iteration": 4.1997270584106445 }, { "auxiliary_loss_clip": 0.01067509, "auxiliary_loss_mlp": 0.01046204, "balance_loss_clip": 1.0338335, "balance_loss_mlp": 1.03233421, "epoch": 0.8624680595220201, "flos": 30666455971200.0, "grad_norm": 1.5830249885479915, "language_loss": 0.81282222, "learning_rate": 1.9507961158595054e-07, "loss": 0.83395934, "num_input_tokens_seen": 309371695, "step": 14345, "time_per_iteration": 2.7645456790924072 }, { "auxiliary_loss_clip": 0.01103951, "auxiliary_loss_mlp": 0.01034874, "balance_loss_clip": 1.03953946, "balance_loss_mlp": 1.02208841, "epoch": 0.8625281827746881, "flos": 37998588516480.0, "grad_norm": 1.9141588154194698, "language_loss": 0.50585526, "learning_rate": 1.9491187613187355e-07, "loss": 0.52724349, "num_input_tokens_seen": 309394645, "step": 14346, "time_per_iteration": 2.7219948768615723 }, { "auxiliary_loss_clip": 0.01029718, "auxiliary_loss_mlp": 0.01032771, "balance_loss_clip": 1.03116322, "balance_loss_mlp": 1.01971757, "epoch": 0.862588306027356, "flos": 26249730808320.0, "grad_norm": 1.6259000305173057, "language_loss": 0.75161147, "learning_rate": 1.9474420912691913e-07, "loss": 0.77223635, "num_input_tokens_seen": 309413170, "step": 14347, "time_per_iteration": 2.8139262199401855 }, { "auxiliary_loss_clip": 0.01082643, "auxiliary_loss_mlp": 0.01030127, "balance_loss_clip": 1.03561497, "balance_loss_mlp": 1.01690078, "epoch": 0.862648429280024, "flos": 25878809013120.0, "grad_norm": 2.1840928220647684, "language_loss": 0.80749428, "learning_rate": 1.945766105774449e-07, "loss": 0.82862198, "num_input_tokens_seen": 309431315, "step": 14348, "time_per_iteration": 2.656729221343994 }, { "auxiliary_loss_clip": 0.01091404, "auxiliary_loss_mlp": 0.01029077, "balance_loss_clip": 1.03467631, "balance_loss_mlp": 1.01720428, "epoch": 0.862708552532692, "flos": 37816413713280.0, "grad_norm": 1.8503371551245635, "language_loss": 0.66269898, "learning_rate": 1.9440908048980665e-07, "loss": 0.68390381, "num_input_tokens_seen": 309453020, "step": 14349, "time_per_iteration": 2.799384832382202 }, { "auxiliary_loss_clip": 0.0109691, "auxiliary_loss_mlp": 0.01036094, "balance_loss_clip": 1.03515387, "balance_loss_mlp": 1.02395201, "epoch": 0.86276867578536, "flos": 19091800247040.0, "grad_norm": 2.6246269667941906, "language_loss": 0.7027539, "learning_rate": 1.942416188703573e-07, "loss": 0.7240839, "num_input_tokens_seen": 309469780, "step": 14350, "time_per_iteration": 2.5943920612335205 }, { "auxiliary_loss_clip": 0.0108035, "auxiliary_loss_mlp": 0.01033768, "balance_loss_clip": 1.03473318, "balance_loss_mlp": 1.02111983, "epoch": 0.862828799038028, "flos": 22164281804160.0, "grad_norm": 1.8551444377087964, "language_loss": 0.76769114, "learning_rate": 1.9407422572544618e-07, "loss": 0.78883231, "num_input_tokens_seen": 309489610, "step": 14351, "time_per_iteration": 2.6581666469573975 }, { "auxiliary_loss_clip": 0.01096886, "auxiliary_loss_mlp": 0.0103006, "balance_loss_clip": 1.0370357, "balance_loss_mlp": 1.01837111, "epoch": 0.8628889222906959, "flos": 23145576433920.0, "grad_norm": 3.863289439771493, "language_loss": 0.85162789, "learning_rate": 1.9390690106142204e-07, "loss": 0.87289739, "num_input_tokens_seen": 309508295, "step": 14352, "time_per_iteration": 2.6280806064605713 }, { "auxiliary_loss_clip": 0.01022246, "auxiliary_loss_mlp": 0.0100272, "balance_loss_clip": 1.0090481, "balance_loss_mlp": 1.00167739, "epoch": 0.8629490455433639, "flos": 57817762151040.0, "grad_norm": 0.7895499485816829, "language_loss": 0.61935335, "learning_rate": 1.9373964488462913e-07, "loss": 0.63960302, "num_input_tokens_seen": 309567960, "step": 14353, "time_per_iteration": 3.146935224533081 }, { "auxiliary_loss_clip": 0.01107884, "auxiliary_loss_mlp": 0.01030033, "balance_loss_clip": 1.038095, "balance_loss_mlp": 1.0188508, "epoch": 0.8630091687960318, "flos": 15919667383680.0, "grad_norm": 1.6638505981636493, "language_loss": 0.81754172, "learning_rate": 1.9357245720140948e-07, "loss": 0.83892089, "num_input_tokens_seen": 309586050, "step": 14354, "time_per_iteration": 2.566462993621826 }, { "auxiliary_loss_clip": 0.01086608, "auxiliary_loss_mlp": 0.01027335, "balance_loss_clip": 1.03349864, "balance_loss_mlp": 1.01475871, "epoch": 0.8630692920486999, "flos": 17961691570560.0, "grad_norm": 2.0513019933105827, "language_loss": 0.85992026, "learning_rate": 1.934053380181031e-07, "loss": 0.88105971, "num_input_tokens_seen": 309602910, "step": 14355, "time_per_iteration": 2.5831828117370605 }, { "auxiliary_loss_clip": 0.01069864, "auxiliary_loss_mlp": 0.01030488, "balance_loss_clip": 1.03425539, "balance_loss_mlp": 1.0177269, "epoch": 0.8631294153013678, "flos": 22455158140800.0, "grad_norm": 4.854829851946411, "language_loss": 0.58569849, "learning_rate": 1.9323828734104763e-07, "loss": 0.60670203, "num_input_tokens_seen": 309621175, "step": 14356, "time_per_iteration": 2.65341854095459 }, { "auxiliary_loss_clip": 0.01064009, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.03384709, "balance_loss_mlp": 1.02203846, "epoch": 0.8631895385540358, "flos": 16837005847680.0, "grad_norm": 1.8090879268972078, "language_loss": 0.77420521, "learning_rate": 1.9307130517657756e-07, "loss": 0.79520273, "num_input_tokens_seen": 309639395, "step": 14357, "time_per_iteration": 2.710195302963257 }, { "auxiliary_loss_clip": 0.01098594, "auxiliary_loss_mlp": 0.01033653, "balance_loss_clip": 1.03671813, "balance_loss_mlp": 1.02122521, "epoch": 0.8632496618067037, "flos": 18697214367360.0, "grad_norm": 12.099648120671757, "language_loss": 0.77500695, "learning_rate": 1.9290439153102468e-07, "loss": 0.79632944, "num_input_tokens_seen": 309657265, "step": 14358, "time_per_iteration": 2.6657116413116455 }, { "auxiliary_loss_clip": 0.01071096, "auxiliary_loss_mlp": 0.0103447, "balance_loss_clip": 1.03174829, "balance_loss_mlp": 1.02037358, "epoch": 0.8633097850593717, "flos": 24279922915200.0, "grad_norm": 1.4851174588982734, "language_loss": 0.75020039, "learning_rate": 1.9273754641071816e-07, "loss": 0.77125597, "num_input_tokens_seen": 309678610, "step": 14359, "time_per_iteration": 2.6872808933258057 }, { "auxiliary_loss_clip": 0.01045653, "auxiliary_loss_mlp": 0.01028999, "balance_loss_clip": 1.03073585, "balance_loss_mlp": 1.01629102, "epoch": 0.8633699083120396, "flos": 21178569801600.0, "grad_norm": 1.864228118741394, "language_loss": 0.70209599, "learning_rate": 1.9257076982198517e-07, "loss": 0.72284251, "num_input_tokens_seen": 309697710, "step": 14360, "time_per_iteration": 2.8204243183135986 }, { "auxiliary_loss_clip": 0.01079991, "auxiliary_loss_mlp": 0.01034146, "balance_loss_clip": 1.03886342, "balance_loss_mlp": 1.02069938, "epoch": 0.8634300315647077, "flos": 19244888012160.0, "grad_norm": 1.7674774133909552, "language_loss": 0.7663061, "learning_rate": 1.9240406177114953e-07, "loss": 0.78744745, "num_input_tokens_seen": 309715985, "step": 14361, "time_per_iteration": 2.7079758644104004 }, { "auxiliary_loss_clip": 0.01028441, "auxiliary_loss_mlp": 0.01002241, "balance_loss_clip": 1.00602102, "balance_loss_mlp": 1.00118601, "epoch": 0.8634901548173756, "flos": 66195648282240.0, "grad_norm": 0.9560869661193441, "language_loss": 0.58801341, "learning_rate": 1.922374222645329e-07, "loss": 0.60832024, "num_input_tokens_seen": 309779930, "step": 14362, "time_per_iteration": 3.145829677581787 }, { "auxiliary_loss_clip": 0.01042985, "auxiliary_loss_mlp": 0.01031427, "balance_loss_clip": 1.03692436, "balance_loss_mlp": 1.01842105, "epoch": 0.8635502780700436, "flos": 24789531121920.0, "grad_norm": 1.852310617760456, "language_loss": 0.80515075, "learning_rate": 1.9207085130845524e-07, "loss": 0.82589483, "num_input_tokens_seen": 309800580, "step": 14363, "time_per_iteration": 2.862398147583008 }, { "auxiliary_loss_clip": 0.01082251, "auxiliary_loss_mlp": 0.0104491, "balance_loss_clip": 1.03282666, "balance_loss_mlp": 1.02994919, "epoch": 0.8636104013227116, "flos": 25189970918400.0, "grad_norm": 2.305599711448788, "language_loss": 0.72819698, "learning_rate": 1.9190434890923112e-07, "loss": 0.74946856, "num_input_tokens_seen": 309821725, "step": 14364, "time_per_iteration": 2.7694895267486572 }, { "auxiliary_loss_clip": 0.01084893, "auxiliary_loss_mlp": 0.01037595, "balance_loss_clip": 1.033113, "balance_loss_mlp": 1.02479792, "epoch": 0.8636705245753795, "flos": 23878441624320.0, "grad_norm": 3.709270849116724, "language_loss": 0.71231377, "learning_rate": 1.917379150731755e-07, "loss": 0.73353863, "num_input_tokens_seen": 309841565, "step": 14365, "time_per_iteration": 2.6591691970825195 }, { "auxiliary_loss_clip": 0.01084976, "auxiliary_loss_mlp": 0.01048634, "balance_loss_clip": 1.03588641, "balance_loss_mlp": 1.03338158, "epoch": 0.8637306478280475, "flos": 23110455911040.0, "grad_norm": 2.5553133795092853, "language_loss": 0.7095083, "learning_rate": 1.915715498065993e-07, "loss": 0.73084438, "num_input_tokens_seen": 309858635, "step": 14366, "time_per_iteration": 2.654860019683838 }, { "auxiliary_loss_clip": 0.01080294, "auxiliary_loss_mlp": 0.01025551, "balance_loss_clip": 1.03619814, "balance_loss_mlp": 1.01414287, "epoch": 0.8637907710807154, "flos": 21906802137600.0, "grad_norm": 1.7096438755629864, "language_loss": 0.81546772, "learning_rate": 1.9140525311581146e-07, "loss": 0.83652616, "num_input_tokens_seen": 309877885, "step": 14367, "time_per_iteration": 2.658378839492798 }, { "auxiliary_loss_clip": 0.01084703, "auxiliary_loss_mlp": 0.01029547, "balance_loss_clip": 1.03672993, "balance_loss_mlp": 1.01633847, "epoch": 0.8638508943333835, "flos": 23580526222080.0, "grad_norm": 1.893928917899102, "language_loss": 0.61735493, "learning_rate": 1.9123902500711743e-07, "loss": 0.63849741, "num_input_tokens_seen": 309893140, "step": 14368, "time_per_iteration": 2.7563858032226562 }, { "auxiliary_loss_clip": 0.01100198, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.03874695, "balance_loss_mlp": 1.02039003, "epoch": 0.8639110175860514, "flos": 25775853655680.0, "grad_norm": 1.8793002534030256, "language_loss": 0.76034266, "learning_rate": 1.91072865486821e-07, "loss": 0.78166956, "num_input_tokens_seen": 309914175, "step": 14369, "time_per_iteration": 2.720898389816284 }, { "auxiliary_loss_clip": 0.01084672, "auxiliary_loss_mlp": 0.01036631, "balance_loss_clip": 1.03559625, "balance_loss_mlp": 1.02341676, "epoch": 0.8639711408387194, "flos": 23369443948800.0, "grad_norm": 1.7853455922645574, "language_loss": 0.64685416, "learning_rate": 1.9090677456122294e-07, "loss": 0.66806722, "num_input_tokens_seen": 309932395, "step": 14370, "time_per_iteration": 2.7746939659118652 }, { "auxiliary_loss_clip": 0.01051431, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.03813696, "balance_loss_mlp": 1.02186131, "epoch": 0.8640312640913873, "flos": 22127221946880.0, "grad_norm": 1.6691251892121577, "language_loss": 0.66381669, "learning_rate": 1.907407522366209e-07, "loss": 0.68467391, "num_input_tokens_seen": 309951720, "step": 14371, "time_per_iteration": 2.7832515239715576 }, { "auxiliary_loss_clip": 0.01010679, "auxiliary_loss_mlp": 0.0100181, "balance_loss_clip": 1.00754333, "balance_loss_mlp": 1.00070095, "epoch": 0.8640913873440553, "flos": 57571735944960.0, "grad_norm": 0.8715418299752374, "language_loss": 0.56873655, "learning_rate": 1.905747985193107e-07, "loss": 0.58886147, "num_input_tokens_seen": 310006120, "step": 14372, "time_per_iteration": 3.080965042114258 }, { "auxiliary_loss_clip": 0.01107085, "auxiliary_loss_mlp": 0.01031651, "balance_loss_clip": 1.03817725, "balance_loss_mlp": 1.01909208, "epoch": 0.8641515105967232, "flos": 23987430466560.0, "grad_norm": 1.722968636798083, "language_loss": 0.79519123, "learning_rate": 1.9040891341558597e-07, "loss": 0.81657857, "num_input_tokens_seen": 310026740, "step": 14373, "time_per_iteration": 2.635335683822632 }, { "auxiliary_loss_clip": 0.01110837, "auxiliary_loss_mlp": 0.01028883, "balance_loss_clip": 1.03787744, "balance_loss_mlp": 1.01607943, "epoch": 0.8642116338493913, "flos": 19062749122560.0, "grad_norm": 1.6653536401221238, "language_loss": 0.63377726, "learning_rate": 1.9024309693173656e-07, "loss": 0.65517449, "num_input_tokens_seen": 310044135, "step": 14374, "time_per_iteration": 2.5494918823242188 }, { "auxiliary_loss_clip": 0.01077851, "auxiliary_loss_mlp": 0.01034525, "balance_loss_clip": 1.03636634, "balance_loss_mlp": 1.02211523, "epoch": 0.8642717571020592, "flos": 18254148105600.0, "grad_norm": 1.7168843124571862, "language_loss": 0.77189004, "learning_rate": 1.9007734907404993e-07, "loss": 0.79301381, "num_input_tokens_seen": 310061560, "step": 14375, "time_per_iteration": 2.677976131439209 }, { "auxiliary_loss_clip": 0.01064524, "auxiliary_loss_mlp": 0.00770411, "balance_loss_clip": 1.03405952, "balance_loss_mlp": 1.00014496, "epoch": 0.8643318803547272, "flos": 57663270777600.0, "grad_norm": 1.8714174217127035, "language_loss": 0.60663325, "learning_rate": 1.899116698488117e-07, "loss": 0.6249826, "num_input_tokens_seen": 310087310, "step": 14376, "time_per_iteration": 3.0315792560577393 }, { "auxiliary_loss_clip": 0.01065318, "auxiliary_loss_mlp": 0.01037856, "balance_loss_clip": 1.0328449, "balance_loss_mlp": 1.02571476, "epoch": 0.8643920036073952, "flos": 19609524927360.0, "grad_norm": 1.4665083491596096, "language_loss": 0.66321123, "learning_rate": 1.8974605926230457e-07, "loss": 0.68424296, "num_input_tokens_seen": 310106260, "step": 14377, "time_per_iteration": 2.661478042602539 }, { "auxiliary_loss_clip": 0.01082246, "auxiliary_loss_mlp": 0.0104227, "balance_loss_clip": 1.03249764, "balance_loss_mlp": 1.02842414, "epoch": 0.8644521268600631, "flos": 20850346298880.0, "grad_norm": 1.6699792562126987, "language_loss": 0.70700777, "learning_rate": 1.8958051732080804e-07, "loss": 0.72825295, "num_input_tokens_seen": 310125305, "step": 14378, "time_per_iteration": 2.6440517902374268 }, { "auxiliary_loss_clip": 0.0101912, "auxiliary_loss_mlp": 0.01001905, "balance_loss_clip": 1.0065546, "balance_loss_mlp": 1.00082636, "epoch": 0.8645122501127311, "flos": 66719550101760.0, "grad_norm": 0.8082600022248976, "language_loss": 0.60236883, "learning_rate": 1.894150440305995e-07, "loss": 0.6225791, "num_input_tokens_seen": 310189270, "step": 14379, "time_per_iteration": 3.1792728900909424 }, { "auxiliary_loss_clip": 0.01077548, "auxiliary_loss_mlp": 0.01032454, "balance_loss_clip": 1.03373933, "balance_loss_mlp": 1.02031279, "epoch": 0.864572373365399, "flos": 21690009601920.0, "grad_norm": 1.8837339678348841, "language_loss": 0.74800771, "learning_rate": 1.8924963939795478e-07, "loss": 0.76910776, "num_input_tokens_seen": 310208395, "step": 14380, "time_per_iteration": 4.324819803237915 }, { "auxiliary_loss_clip": 0.01080903, "auxiliary_loss_mlp": 0.01036416, "balance_loss_clip": 1.03346038, "balance_loss_mlp": 1.02307606, "epoch": 0.8646324966180671, "flos": 20266402896000.0, "grad_norm": 1.9558839364360057, "language_loss": 0.75436544, "learning_rate": 1.8908430342914473e-07, "loss": 0.77553868, "num_input_tokens_seen": 310227415, "step": 14381, "time_per_iteration": 4.3003315925598145 }, { "auxiliary_loss_clip": 0.01085169, "auxiliary_loss_mlp": 0.01035093, "balance_loss_clip": 1.03721309, "balance_loss_mlp": 1.02337468, "epoch": 0.864692619870735, "flos": 11946188050560.0, "grad_norm": 2.531870478420468, "language_loss": 0.84684384, "learning_rate": 1.8891903613043892e-07, "loss": 0.86804652, "num_input_tokens_seen": 310242625, "step": 14382, "time_per_iteration": 4.235616683959961 }, { "auxiliary_loss_clip": 0.01101073, "auxiliary_loss_mlp": 0.01035119, "balance_loss_clip": 1.03812909, "balance_loss_mlp": 1.02230954, "epoch": 0.864752743123403, "flos": 21470703114240.0, "grad_norm": 2.020788387095791, "language_loss": 0.75921559, "learning_rate": 1.8875383750810504e-07, "loss": 0.78057754, "num_input_tokens_seen": 310260585, "step": 14383, "time_per_iteration": 2.743340015411377 }, { "auxiliary_loss_clip": 0.01089565, "auxiliary_loss_mlp": 0.0103368, "balance_loss_clip": 1.03891516, "balance_loss_mlp": 1.02116311, "epoch": 0.8648128663760709, "flos": 19530018172800.0, "grad_norm": 1.8560596447894047, "language_loss": 0.85428023, "learning_rate": 1.8858870756840738e-07, "loss": 0.87551272, "num_input_tokens_seen": 310277210, "step": 14384, "time_per_iteration": 4.140477418899536 }, { "auxiliary_loss_clip": 0.01093344, "auxiliary_loss_mlp": 0.01030627, "balance_loss_clip": 1.0340718, "balance_loss_mlp": 1.01875997, "epoch": 0.8648729896287389, "flos": 21287953693440.0, "grad_norm": 1.6613358165771401, "language_loss": 0.8117463, "learning_rate": 1.884236463176072e-07, "loss": 0.832986, "num_input_tokens_seen": 310296610, "step": 14385, "time_per_iteration": 2.563424825668335 }, { "auxiliary_loss_clip": 0.01094427, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.040411, "balance_loss_mlp": 1.02252555, "epoch": 0.8649331128814068, "flos": 24604483230720.0, "grad_norm": 2.3388483586087303, "language_loss": 0.72581172, "learning_rate": 1.8825865376196437e-07, "loss": 0.74711078, "num_input_tokens_seen": 310316830, "step": 14386, "time_per_iteration": 2.667926549911499 }, { "auxiliary_loss_clip": 0.01093992, "auxiliary_loss_mlp": 0.01041396, "balance_loss_clip": 1.03530121, "balance_loss_mlp": 1.02797318, "epoch": 0.8649932361340749, "flos": 15377811742080.0, "grad_norm": 4.510791763694996, "language_loss": 0.81868196, "learning_rate": 1.8809372990773476e-07, "loss": 0.8400358, "num_input_tokens_seen": 310334355, "step": 14387, "time_per_iteration": 2.660701036453247 }, { "auxiliary_loss_clip": 0.01106932, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.0378803, "balance_loss_mlp": 1.01641643, "epoch": 0.8650533593867428, "flos": 19901227276800.0, "grad_norm": 2.104447554520212, "language_loss": 0.68797326, "learning_rate": 1.8792887476117224e-07, "loss": 0.70932555, "num_input_tokens_seen": 310352900, "step": 14388, "time_per_iteration": 2.5773561000823975 }, { "auxiliary_loss_clip": 0.01073211, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.03666544, "balance_loss_mlp": 1.02510452, "epoch": 0.8651134826394108, "flos": 25626931868160.0, "grad_norm": 2.8952711176553043, "language_loss": 0.90358889, "learning_rate": 1.877640883285283e-07, "loss": 0.92468345, "num_input_tokens_seen": 310372855, "step": 14389, "time_per_iteration": 2.712479591369629 }, { "auxiliary_loss_clip": 0.01065736, "auxiliary_loss_mlp": 0.00769129, "balance_loss_clip": 1.03819394, "balance_loss_mlp": 1.0002389, "epoch": 0.8651736058920788, "flos": 18734525619840.0, "grad_norm": 1.556328693404614, "language_loss": 0.70784509, "learning_rate": 1.8759937061605212e-07, "loss": 0.72619373, "num_input_tokens_seen": 310391595, "step": 14390, "time_per_iteration": 2.7250664234161377 }, { "auxiliary_loss_clip": 0.0110984, "auxiliary_loss_mlp": 0.01034761, "balance_loss_clip": 1.03667974, "balance_loss_mlp": 1.02206492, "epoch": 0.8652337291447467, "flos": 20776765288320.0, "grad_norm": 3.0242900770440158, "language_loss": 0.82031155, "learning_rate": 1.8743472162998941e-07, "loss": 0.84175754, "num_input_tokens_seen": 310410090, "step": 14391, "time_per_iteration": 2.5874016284942627 }, { "auxiliary_loss_clip": 0.00999016, "auxiliary_loss_mlp": 0.00998272, "balance_loss_clip": 1.00931406, "balance_loss_mlp": 0.99692518, "epoch": 0.8652938523974147, "flos": 64227887464320.0, "grad_norm": 0.800129032908664, "language_loss": 0.67961007, "learning_rate": 1.8727014137658337e-07, "loss": 0.69958293, "num_input_tokens_seen": 310470055, "step": 14392, "time_per_iteration": 3.141786813735962 }, { "auxiliary_loss_clip": 0.01102797, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.03694808, "balance_loss_mlp": 1.01924479, "epoch": 0.8653539756500827, "flos": 18040587793920.0, "grad_norm": 1.8856474230308053, "language_loss": 0.75999135, "learning_rate": 1.8710562986207523e-07, "loss": 0.78134412, "num_input_tokens_seen": 310487665, "step": 14393, "time_per_iteration": 2.6403071880340576 }, { "auxiliary_loss_clip": 0.01085265, "auxiliary_loss_mlp": 0.01035656, "balance_loss_clip": 1.03292179, "balance_loss_mlp": 1.02319884, "epoch": 0.8654140989027507, "flos": 17382416935680.0, "grad_norm": 1.8766276101061499, "language_loss": 0.73443645, "learning_rate": 1.8694118709270357e-07, "loss": 0.75564563, "num_input_tokens_seen": 310506130, "step": 14394, "time_per_iteration": 2.589737892150879 }, { "auxiliary_loss_clip": 0.01098893, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.03559685, "balance_loss_mlp": 1.01642823, "epoch": 0.8654742221554186, "flos": 53284862448000.0, "grad_norm": 25.445187757651638, "language_loss": 0.65340948, "learning_rate": 1.867768130747036e-07, "loss": 0.67469549, "num_input_tokens_seen": 310532445, "step": 14395, "time_per_iteration": 2.8686017990112305 }, { "auxiliary_loss_clip": 0.01091975, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.03594851, "balance_loss_mlp": 1.02362514, "epoch": 0.8655343454080866, "flos": 23914711382400.0, "grad_norm": 3.648513206821013, "language_loss": 0.68270028, "learning_rate": 1.8661250781430838e-07, "loss": 0.70398188, "num_input_tokens_seen": 310552300, "step": 14396, "time_per_iteration": 2.691372871398926 }, { "auxiliary_loss_clip": 0.01102693, "auxiliary_loss_mlp": 0.01036564, "balance_loss_clip": 1.03977966, "balance_loss_mlp": 1.02393413, "epoch": 0.8655944686607545, "flos": 24097209408000.0, "grad_norm": 2.1296548078090067, "language_loss": 0.6985743, "learning_rate": 1.8644827131774954e-07, "loss": 0.71996689, "num_input_tokens_seen": 310572710, "step": 14397, "time_per_iteration": 2.6537063121795654 }, { "auxiliary_loss_clip": 0.01092627, "auxiliary_loss_mlp": 0.01029547, "balance_loss_clip": 1.03830481, "balance_loss_mlp": 1.01773953, "epoch": 0.8656545919134225, "flos": 23112718467840.0, "grad_norm": 1.7708020135557936, "language_loss": 0.63645488, "learning_rate": 1.86284103591253e-07, "loss": 0.65767658, "num_input_tokens_seen": 310592460, "step": 14398, "time_per_iteration": 2.721609592437744 }, { "auxiliary_loss_clip": 0.01072146, "auxiliary_loss_mlp": 0.01040273, "balance_loss_clip": 1.03550839, "balance_loss_mlp": 1.02659369, "epoch": 0.8657147151660904, "flos": 21141761339520.0, "grad_norm": 2.410679040433659, "language_loss": 0.76115006, "learning_rate": 1.8612000464104517e-07, "loss": 0.78227425, "num_input_tokens_seen": 310609375, "step": 14399, "time_per_iteration": 2.6792304515838623 }, { "auxiliary_loss_clip": 0.01091264, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.03629327, "balance_loss_mlp": 1.0173502, "epoch": 0.8657748384187585, "flos": 16289439943680.0, "grad_norm": 2.1842250603302906, "language_loss": 0.93539166, "learning_rate": 1.8595597447334855e-07, "loss": 0.95659292, "num_input_tokens_seen": 310627405, "step": 14400, "time_per_iteration": 2.557438850402832 }, { "auxiliary_loss_clip": 0.01044413, "auxiliary_loss_mlp": 0.01038088, "balance_loss_clip": 1.0341754, "balance_loss_mlp": 1.02537465, "epoch": 0.8658349616714264, "flos": 30843890179200.0, "grad_norm": 1.8571085521140969, "language_loss": 0.67723525, "learning_rate": 1.8579201309438353e-07, "loss": 0.69806027, "num_input_tokens_seen": 310649945, "step": 14401, "time_per_iteration": 2.8091368675231934 }, { "auxiliary_loss_clip": 0.01099417, "auxiliary_loss_mlp": 0.01031502, "balance_loss_clip": 1.03662825, "balance_loss_mlp": 1.01880038, "epoch": 0.8658950849240944, "flos": 18952862440320.0, "grad_norm": 2.157466322300169, "language_loss": 0.73613071, "learning_rate": 1.8562812051036714e-07, "loss": 0.75743997, "num_input_tokens_seen": 310668285, "step": 14402, "time_per_iteration": 2.570737838745117 }, { "auxiliary_loss_clip": 0.01036456, "auxiliary_loss_mlp": 0.01033626, "balance_loss_clip": 1.03347492, "balance_loss_mlp": 1.02177048, "epoch": 0.8659552081767624, "flos": 23364344217600.0, "grad_norm": 1.7804756809265996, "language_loss": 0.74911118, "learning_rate": 1.8546429672751397e-07, "loss": 0.76981199, "num_input_tokens_seen": 310687015, "step": 14403, "time_per_iteration": 2.8824269771575928 }, { "auxiliary_loss_clip": 0.0108389, "auxiliary_loss_mlp": 0.01034677, "balance_loss_clip": 1.03559339, "balance_loss_mlp": 1.02145052, "epoch": 0.8660153314294303, "flos": 23841992298240.0, "grad_norm": 1.9785439757020915, "language_loss": 0.73294771, "learning_rate": 1.853005417520368e-07, "loss": 0.75413334, "num_input_tokens_seen": 310707580, "step": 14404, "time_per_iteration": 2.691854238510132 }, { "auxiliary_loss_clip": 0.01070251, "auxiliary_loss_mlp": 0.01036432, "balance_loss_clip": 1.03529263, "balance_loss_mlp": 1.02364087, "epoch": 0.8660754546820983, "flos": 23112467072640.0, "grad_norm": 1.6230968808599193, "language_loss": 0.70621324, "learning_rate": 1.851368555901447e-07, "loss": 0.72728002, "num_input_tokens_seen": 310727300, "step": 14405, "time_per_iteration": 2.6545495986938477 }, { "auxiliary_loss_clip": 0.01099979, "auxiliary_loss_mlp": 0.0077033, "balance_loss_clip": 1.03619599, "balance_loss_mlp": 1.00023413, "epoch": 0.8661355779347663, "flos": 14391991998720.0, "grad_norm": 1.8683678221955426, "language_loss": 0.66598046, "learning_rate": 1.8497323824804467e-07, "loss": 0.68468356, "num_input_tokens_seen": 310744935, "step": 14406, "time_per_iteration": 2.6244313716888428 }, { "auxiliary_loss_clip": 0.01087721, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.0369488, "balance_loss_mlp": 1.01565766, "epoch": 0.8661957011874343, "flos": 21870137329920.0, "grad_norm": 1.713289909017667, "language_loss": 0.82678503, "learning_rate": 1.8480968973194177e-07, "loss": 0.84793234, "num_input_tokens_seen": 310765085, "step": 14407, "time_per_iteration": 2.7246527671813965 }, { "auxiliary_loss_clip": 0.01097432, "auxiliary_loss_mlp": 0.01038578, "balance_loss_clip": 1.03706372, "balance_loss_mlp": 1.02623403, "epoch": 0.8662558244401022, "flos": 21835160461440.0, "grad_norm": 1.640288408492858, "language_loss": 0.70144266, "learning_rate": 1.8464621004803748e-07, "loss": 0.72280276, "num_input_tokens_seen": 310783260, "step": 14408, "time_per_iteration": 2.688714027404785 }, { "auxiliary_loss_clip": 0.01088368, "auxiliary_loss_mlp": 0.01034051, "balance_loss_clip": 1.036026, "balance_loss_mlp": 1.02254152, "epoch": 0.8663159476927702, "flos": 17384104874880.0, "grad_norm": 1.9035272419543303, "language_loss": 0.7693873, "learning_rate": 1.844827992025304e-07, "loss": 0.79061151, "num_input_tokens_seen": 310801970, "step": 14409, "time_per_iteration": 2.668154239654541 }, { "auxiliary_loss_clip": 0.01101925, "auxiliary_loss_mlp": 0.01034118, "balance_loss_clip": 1.03869689, "balance_loss_mlp": 1.02009869, "epoch": 0.8663760709454381, "flos": 22747722416640.0, "grad_norm": 1.696612134520476, "language_loss": 0.77045894, "learning_rate": 1.8431945720161757e-07, "loss": 0.79181939, "num_input_tokens_seen": 310822070, "step": 14410, "time_per_iteration": 2.6069350242614746 }, { "auxiliary_loss_clip": 0.0106574, "auxiliary_loss_mlp": 0.0103477, "balance_loss_clip": 1.03402448, "balance_loss_mlp": 1.02225292, "epoch": 0.8664361941981061, "flos": 17376850327680.0, "grad_norm": 1.9481665792177514, "language_loss": 0.77590597, "learning_rate": 1.8415618405149315e-07, "loss": 0.79691112, "num_input_tokens_seen": 310838355, "step": 14411, "time_per_iteration": 2.6132922172546387 }, { "auxiliary_loss_clip": 0.01078109, "auxiliary_loss_mlp": 0.01035885, "balance_loss_clip": 1.03366232, "balance_loss_mlp": 1.02461982, "epoch": 0.866496317450774, "flos": 16034438315520.0, "grad_norm": 1.750688188601547, "language_loss": 0.74020624, "learning_rate": 1.8399297975834794e-07, "loss": 0.76134622, "num_input_tokens_seen": 310856055, "step": 14412, "time_per_iteration": 2.6058592796325684 }, { "auxiliary_loss_clip": 0.01090356, "auxiliary_loss_mlp": 0.00771287, "balance_loss_clip": 1.03415728, "balance_loss_mlp": 1.0002377, "epoch": 0.8665564407034421, "flos": 20814830726400.0, "grad_norm": 1.7730290452974458, "language_loss": 0.6952216, "learning_rate": 1.83829844328371e-07, "loss": 0.71383798, "num_input_tokens_seen": 310876695, "step": 14413, "time_per_iteration": 2.614438056945801 }, { "auxiliary_loss_clip": 0.01098326, "auxiliary_loss_mlp": 0.01035601, "balance_loss_clip": 1.03807211, "balance_loss_mlp": 1.02280378, "epoch": 0.86661656395611, "flos": 15815167741440.0, "grad_norm": 2.2624919572268603, "language_loss": 0.62299776, "learning_rate": 1.8366677776774874e-07, "loss": 0.64433706, "num_input_tokens_seen": 310893880, "step": 14414, "time_per_iteration": 2.5781359672546387 }, { "auxiliary_loss_clip": 0.01078873, "auxiliary_loss_mlp": 0.00769848, "balance_loss_clip": 1.03693521, "balance_loss_mlp": 1.00018334, "epoch": 0.866676687208778, "flos": 23036910814080.0, "grad_norm": 1.633402194861805, "language_loss": 0.6382761, "learning_rate": 1.8350378008266377e-07, "loss": 0.65676332, "num_input_tokens_seen": 310914145, "step": 14415, "time_per_iteration": 2.718871831893921 }, { "auxiliary_loss_clip": 0.01001561, "auxiliary_loss_mlp": 0.01003608, "balance_loss_clip": 1.00817573, "balance_loss_mlp": 1.00249326, "epoch": 0.866736810461446, "flos": 63802275212160.0, "grad_norm": 0.7984060732990605, "language_loss": 0.60386515, "learning_rate": 1.8334085127929754e-07, "loss": 0.62391675, "num_input_tokens_seen": 310972825, "step": 14416, "time_per_iteration": 3.32995343208313 }, { "auxiliary_loss_clip": 0.01101132, "auxiliary_loss_mlp": 0.00771613, "balance_loss_clip": 1.03657961, "balance_loss_mlp": 1.00021935, "epoch": 0.8667969337141139, "flos": 20449367798400.0, "grad_norm": 1.8418559136989974, "language_loss": 0.74591923, "learning_rate": 1.831779913638285e-07, "loss": 0.76464671, "num_input_tokens_seen": 310992050, "step": 14417, "time_per_iteration": 2.6240720748901367 }, { "auxiliary_loss_clip": 0.0108446, "auxiliary_loss_mlp": 0.01035619, "balance_loss_clip": 1.03623867, "balance_loss_mlp": 1.02401364, "epoch": 0.866857056966782, "flos": 21653703930240.0, "grad_norm": 1.6010496631035476, "language_loss": 0.75304806, "learning_rate": 1.830152003424319e-07, "loss": 0.77424884, "num_input_tokens_seen": 311011105, "step": 14418, "time_per_iteration": 2.6442039012908936 }, { "auxiliary_loss_clip": 0.01096633, "auxiliary_loss_mlp": 0.01034851, "balance_loss_clip": 1.0357796, "balance_loss_mlp": 1.02292967, "epoch": 0.8669171802194499, "flos": 22852832590080.0, "grad_norm": 1.669621966476557, "language_loss": 0.68341649, "learning_rate": 1.8285247822128126e-07, "loss": 0.70473135, "num_input_tokens_seen": 311032080, "step": 14419, "time_per_iteration": 2.623978853225708 }, { "auxiliary_loss_clip": 0.01099318, "auxiliary_loss_mlp": 0.01031681, "balance_loss_clip": 1.03616405, "balance_loss_mlp": 1.02020669, "epoch": 0.8669773034721179, "flos": 18734166483840.0, "grad_norm": 1.6685720418473156, "language_loss": 0.78522211, "learning_rate": 1.826898250065465e-07, "loss": 0.80653214, "num_input_tokens_seen": 311049735, "step": 14420, "time_per_iteration": 4.198700189590454 }, { "auxiliary_loss_clip": 0.01093862, "auxiliary_loss_mlp": 0.01032049, "balance_loss_clip": 1.03496552, "balance_loss_mlp": 1.01974106, "epoch": 0.8670374267247858, "flos": 18916018064640.0, "grad_norm": 1.5087342244931736, "language_loss": 0.83599997, "learning_rate": 1.8252724070439586e-07, "loss": 0.85725909, "num_input_tokens_seen": 311067675, "step": 14421, "time_per_iteration": 4.208746910095215 }, { "auxiliary_loss_clip": 0.01006687, "auxiliary_loss_mlp": 0.00999775, "balance_loss_clip": 1.00802314, "balance_loss_mlp": 0.99845761, "epoch": 0.8670975499774538, "flos": 48814527214080.0, "grad_norm": 0.7509779369384021, "language_loss": 0.49057785, "learning_rate": 1.823647253209941e-07, "loss": 0.51064241, "num_input_tokens_seen": 311126605, "step": 14422, "time_per_iteration": 4.777186870574951 }, { "auxiliary_loss_clip": 0.01087105, "auxiliary_loss_mlp": 0.00769697, "balance_loss_clip": 1.03720963, "balance_loss_mlp": 1.00028849, "epoch": 0.8671576732301217, "flos": 26136145025280.0, "grad_norm": 1.670233296430545, "language_loss": 0.73442525, "learning_rate": 1.8220227886250417e-07, "loss": 0.75299329, "num_input_tokens_seen": 311147325, "step": 14423, "time_per_iteration": 4.283585786819458 }, { "auxiliary_loss_clip": 0.01061427, "auxiliary_loss_mlp": 0.0103548, "balance_loss_clip": 1.03110516, "balance_loss_mlp": 1.02256989, "epoch": 0.8672177964827897, "flos": 18367446579840.0, "grad_norm": 1.5662705117653968, "language_loss": 0.76781297, "learning_rate": 1.8203990133508684e-07, "loss": 0.78878212, "num_input_tokens_seen": 311165385, "step": 14424, "time_per_iteration": 2.645517110824585 }, { "auxiliary_loss_clip": 0.01066724, "auxiliary_loss_mlp": 0.01040643, "balance_loss_clip": 1.03161621, "balance_loss_mlp": 1.02800703, "epoch": 0.8672779197354576, "flos": 28545355992960.0, "grad_norm": 1.9458194171790135, "language_loss": 0.71327066, "learning_rate": 1.8187759274489767e-07, "loss": 0.73434436, "num_input_tokens_seen": 311185860, "step": 14425, "time_per_iteration": 2.7444801330566406 }, { "auxiliary_loss_clip": 0.01100034, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.03743434, "balance_loss_mlp": 1.02065444, "epoch": 0.8673380429881257, "flos": 22382474970240.0, "grad_norm": 1.755018176625315, "language_loss": 0.6806134, "learning_rate": 1.817153530980926e-07, "loss": 0.70195293, "num_input_tokens_seen": 311205810, "step": 14426, "time_per_iteration": 2.5805845260620117 }, { "auxiliary_loss_clip": 0.01065339, "auxiliary_loss_mlp": 0.01027987, "balance_loss_clip": 1.03625464, "balance_loss_mlp": 1.01546359, "epoch": 0.8673981662407936, "flos": 20996430912000.0, "grad_norm": 1.8587393637126561, "language_loss": 0.70647991, "learning_rate": 1.815531824008234e-07, "loss": 0.72741318, "num_input_tokens_seen": 311226080, "step": 14427, "time_per_iteration": 2.685107469558716 }, { "auxiliary_loss_clip": 0.01080277, "auxiliary_loss_mlp": 0.0103208, "balance_loss_clip": 1.03615725, "balance_loss_mlp": 1.02000976, "epoch": 0.8674582894934616, "flos": 24426797627520.0, "grad_norm": 1.894860167096284, "language_loss": 0.68146193, "learning_rate": 1.8139108065924004e-07, "loss": 0.70258546, "num_input_tokens_seen": 311246380, "step": 14428, "time_per_iteration": 2.7677488327026367 }, { "auxiliary_loss_clip": 0.01080543, "auxiliary_loss_mlp": 0.01029001, "balance_loss_clip": 1.0359962, "balance_loss_mlp": 1.01683569, "epoch": 0.8675184127461296, "flos": 20737514701440.0, "grad_norm": 2.892495609398215, "language_loss": 0.70616251, "learning_rate": 1.812290478794889e-07, "loss": 0.72725797, "num_input_tokens_seen": 311266465, "step": 14429, "time_per_iteration": 2.624802827835083 }, { "auxiliary_loss_clip": 0.010878, "auxiliary_loss_mlp": 0.01030213, "balance_loss_clip": 1.03670454, "balance_loss_mlp": 1.01785088, "epoch": 0.8675785359987975, "flos": 19135647774720.0, "grad_norm": 1.8760175406026705, "language_loss": 0.66803014, "learning_rate": 1.810670840677151e-07, "loss": 0.6892103, "num_input_tokens_seen": 311285075, "step": 14430, "time_per_iteration": 2.6141793727874756 }, { "auxiliary_loss_clip": 0.01064719, "auxiliary_loss_mlp": 0.01037459, "balance_loss_clip": 1.03474712, "balance_loss_mlp": 1.02360034, "epoch": 0.8676386592514655, "flos": 22710662559360.0, "grad_norm": 1.8772851475850807, "language_loss": 0.69439894, "learning_rate": 1.8090518923005948e-07, "loss": 0.71542072, "num_input_tokens_seen": 311303230, "step": 14431, "time_per_iteration": 2.760996103286743 }, { "auxiliary_loss_clip": 0.01097351, "auxiliary_loss_mlp": 0.01040167, "balance_loss_clip": 1.03582358, "balance_loss_mlp": 1.02768576, "epoch": 0.8676987825041335, "flos": 14209853109120.0, "grad_norm": 2.630424540057507, "language_loss": 0.63210046, "learning_rate": 1.8074336337266116e-07, "loss": 0.65347564, "num_input_tokens_seen": 311318070, "step": 14432, "time_per_iteration": 2.5565524101257324 }, { "auxiliary_loss_clip": 0.0109965, "auxiliary_loss_mlp": 0.01039814, "balance_loss_clip": 1.03807235, "balance_loss_mlp": 1.02821505, "epoch": 0.8677589057568015, "flos": 13589927256960.0, "grad_norm": 1.9324335266361277, "language_loss": 0.78167832, "learning_rate": 1.8058160650165656e-07, "loss": 0.80307293, "num_input_tokens_seen": 311334885, "step": 14433, "time_per_iteration": 2.603163242340088 }, { "auxiliary_loss_clip": 0.01010943, "auxiliary_loss_mlp": 0.01002541, "balance_loss_clip": 1.00770855, "balance_loss_mlp": 1.00159311, "epoch": 0.8678190290094694, "flos": 68933657370240.0, "grad_norm": 0.7061148841104811, "language_loss": 0.5846473, "learning_rate": 1.804199186231805e-07, "loss": 0.6047821, "num_input_tokens_seen": 311399780, "step": 14434, "time_per_iteration": 3.2711222171783447 }, { "auxiliary_loss_clip": 0.01084546, "auxiliary_loss_mlp": 0.01034123, "balance_loss_clip": 1.03522635, "balance_loss_mlp": 1.02258372, "epoch": 0.8678791522621374, "flos": 32557726776960.0, "grad_norm": 1.9678570849349808, "language_loss": 0.80160731, "learning_rate": 1.802582997433628e-07, "loss": 0.82279408, "num_input_tokens_seen": 311419610, "step": 14435, "time_per_iteration": 2.729384660720825 }, { "auxiliary_loss_clip": 0.0108652, "auxiliary_loss_mlp": 0.00771159, "balance_loss_clip": 1.03368807, "balance_loss_mlp": 1.00019312, "epoch": 0.8679392755148053, "flos": 35042637657600.0, "grad_norm": 2.323598256693539, "language_loss": 0.62088466, "learning_rate": 1.8009674986833322e-07, "loss": 0.63946146, "num_input_tokens_seen": 311440045, "step": 14436, "time_per_iteration": 2.7514889240264893 }, { "auxiliary_loss_clip": 0.01084626, "auxiliary_loss_mlp": 0.01030406, "balance_loss_clip": 1.03650117, "balance_loss_mlp": 1.01762128, "epoch": 0.8679993987674733, "flos": 18552494471040.0, "grad_norm": 2.2152793164861477, "language_loss": 0.70417553, "learning_rate": 1.7993526900421706e-07, "loss": 0.72532582, "num_input_tokens_seen": 311456660, "step": 14437, "time_per_iteration": 2.682568311691284 }, { "auxiliary_loss_clip": 0.01073964, "auxiliary_loss_mlp": 0.01026899, "balance_loss_clip": 1.03458905, "balance_loss_mlp": 1.01451957, "epoch": 0.8680595220201412, "flos": 27454390162560.0, "grad_norm": 1.9672371609341477, "language_loss": 0.80644393, "learning_rate": 1.797738571571381e-07, "loss": 0.8274526, "num_input_tokens_seen": 311475460, "step": 14438, "time_per_iteration": 2.7269651889801025 }, { "auxiliary_loss_clip": 0.01089468, "auxiliary_loss_mlp": 0.01024249, "balance_loss_clip": 1.035025, "balance_loss_mlp": 1.01237011, "epoch": 0.8681196452728093, "flos": 19208797822080.0, "grad_norm": 1.7527538645260887, "language_loss": 0.67584556, "learning_rate": 1.7961251433321656e-07, "loss": 0.69698274, "num_input_tokens_seen": 311494575, "step": 14439, "time_per_iteration": 2.581627130508423 }, { "auxiliary_loss_clip": 0.01096234, "auxiliary_loss_mlp": 0.01034212, "balance_loss_clip": 1.03661394, "balance_loss_mlp": 1.02268469, "epoch": 0.8681797685254772, "flos": 37560442417920.0, "grad_norm": 1.484819058237711, "language_loss": 0.63649923, "learning_rate": 1.7945124053857085e-07, "loss": 0.65780365, "num_input_tokens_seen": 311515805, "step": 14440, "time_per_iteration": 2.761298656463623 }, { "auxiliary_loss_clip": 0.01095909, "auxiliary_loss_mlp": 0.01034623, "balance_loss_clip": 1.03644252, "balance_loss_mlp": 1.02241611, "epoch": 0.8682398917781452, "flos": 23289937194240.0, "grad_norm": 1.7310260750928266, "language_loss": 0.66075879, "learning_rate": 1.7929003577931722e-07, "loss": 0.68206406, "num_input_tokens_seen": 311536000, "step": 14441, "time_per_iteration": 2.5800838470458984 }, { "auxiliary_loss_clip": 0.01091494, "auxiliary_loss_mlp": 0.01025353, "balance_loss_clip": 1.0385139, "balance_loss_mlp": 1.0138557, "epoch": 0.8683000150308132, "flos": 21872794936320.0, "grad_norm": 1.681496330113871, "language_loss": 0.66083562, "learning_rate": 1.7912890006156722e-07, "loss": 0.68200409, "num_input_tokens_seen": 311556220, "step": 14442, "time_per_iteration": 2.642595052719116 }, { "auxiliary_loss_clip": 0.01084435, "auxiliary_loss_mlp": 0.01033988, "balance_loss_clip": 1.03615665, "balance_loss_mlp": 1.02031493, "epoch": 0.8683601382834811, "flos": 14647209108480.0, "grad_norm": 1.780014180776551, "language_loss": 0.72400081, "learning_rate": 1.7896783339143195e-07, "loss": 0.74518502, "num_input_tokens_seen": 311572530, "step": 14443, "time_per_iteration": 2.621661901473999 }, { "auxiliary_loss_clip": 0.01109856, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.0374794, "balance_loss_mlp": 1.01575971, "epoch": 0.8684202615361492, "flos": 26359904799360.0, "grad_norm": 1.7034879908488254, "language_loss": 0.83455396, "learning_rate": 1.7880683577501877e-07, "loss": 0.85593581, "num_input_tokens_seen": 311591105, "step": 14444, "time_per_iteration": 2.5682990550994873 }, { "auxiliary_loss_clip": 0.01071317, "auxiliary_loss_mlp": 0.01030828, "balance_loss_clip": 1.03839469, "balance_loss_mlp": 1.0183413, "epoch": 0.8684803847888171, "flos": 20704010290560.0, "grad_norm": 1.882585411960033, "language_loss": 0.77276009, "learning_rate": 1.7864590721843342e-07, "loss": 0.79378152, "num_input_tokens_seen": 311608350, "step": 14445, "time_per_iteration": 2.6933975219726562 }, { "auxiliary_loss_clip": 0.01097793, "auxiliary_loss_mlp": 0.01031892, "balance_loss_clip": 1.0368073, "balance_loss_mlp": 1.01954842, "epoch": 0.8685405080414851, "flos": 22638123043200.0, "grad_norm": 1.8974232570725826, "language_loss": 0.68224823, "learning_rate": 1.7848504772777728e-07, "loss": 0.70354509, "num_input_tokens_seen": 311626380, "step": 14446, "time_per_iteration": 2.6505656242370605 }, { "auxiliary_loss_clip": 0.01093238, "auxiliary_loss_mlp": 0.01034173, "balance_loss_clip": 1.03448546, "balance_loss_mlp": 1.0214951, "epoch": 0.868600631294153, "flos": 24822065865600.0, "grad_norm": 1.831558393609818, "language_loss": 0.83143735, "learning_rate": 1.7832425730915102e-07, "loss": 0.85271144, "num_input_tokens_seen": 311644345, "step": 14447, "time_per_iteration": 2.5855720043182373 }, { "auxiliary_loss_clip": 0.01028885, "auxiliary_loss_mlp": 0.01028809, "balance_loss_clip": 1.03098965, "balance_loss_mlp": 1.01697183, "epoch": 0.868660754546821, "flos": 25113983696640.0, "grad_norm": 1.612042145706922, "language_loss": 0.74218094, "learning_rate": 1.781635359686515e-07, "loss": 0.76275784, "num_input_tokens_seen": 311663340, "step": 14448, "time_per_iteration": 2.75423002243042 }, { "auxiliary_loss_clip": 0.01081834, "auxiliary_loss_mlp": 0.01032637, "balance_loss_clip": 1.03381288, "balance_loss_mlp": 1.01907682, "epoch": 0.8687208777994889, "flos": 12677832178560.0, "grad_norm": 1.9294306155040917, "language_loss": 0.79997855, "learning_rate": 1.7800288371237303e-07, "loss": 0.82112324, "num_input_tokens_seen": 311679860, "step": 14449, "time_per_iteration": 2.6481199264526367 }, { "auxiliary_loss_clip": 0.0100162, "auxiliary_loss_mlp": 0.01004017, "balance_loss_clip": 1.00803828, "balance_loss_mlp": 1.0030396, "epoch": 0.8687810010521569, "flos": 65617235573760.0, "grad_norm": 0.811742362179789, "language_loss": 0.60572553, "learning_rate": 1.7784230054640758e-07, "loss": 0.62578189, "num_input_tokens_seen": 311738135, "step": 14450, "time_per_iteration": 3.1744225025177 }, { "auxiliary_loss_clip": 0.01084674, "auxiliary_loss_mlp": 0.01030905, "balance_loss_clip": 1.03782833, "balance_loss_mlp": 1.01882339, "epoch": 0.8688411243048249, "flos": 24244012293120.0, "grad_norm": 1.7384604154685417, "language_loss": 0.76132762, "learning_rate": 1.7768178647684517e-07, "loss": 0.78248346, "num_input_tokens_seen": 311756975, "step": 14451, "time_per_iteration": 2.71647310256958 }, { "auxiliary_loss_clip": 0.01093999, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.03542089, "balance_loss_mlp": 1.01485705, "epoch": 0.8689012475574929, "flos": 18221828843520.0, "grad_norm": 3.077369236554663, "language_loss": 0.71929884, "learning_rate": 1.7752134150977205e-07, "loss": 0.74050885, "num_input_tokens_seen": 311771830, "step": 14452, "time_per_iteration": 2.6421010494232178 }, { "auxiliary_loss_clip": 0.01086249, "auxiliary_loss_mlp": 0.00770837, "balance_loss_clip": 1.03687978, "balance_loss_mlp": 1.00033617, "epoch": 0.8689613708101608, "flos": 19646728439040.0, "grad_norm": 1.4971300186991454, "language_loss": 0.72101021, "learning_rate": 1.7736096565127201e-07, "loss": 0.73958105, "num_input_tokens_seen": 311790130, "step": 14453, "time_per_iteration": 2.6629247665405273 }, { "auxiliary_loss_clip": 0.01096295, "auxiliary_loss_mlp": 0.01035101, "balance_loss_clip": 1.03675365, "balance_loss_mlp": 1.02261996, "epoch": 0.8690214940628288, "flos": 11728749070080.0, "grad_norm": 3.182912447217293, "language_loss": 0.73198676, "learning_rate": 1.7720065890742664e-07, "loss": 0.75330073, "num_input_tokens_seen": 311808360, "step": 14454, "time_per_iteration": 2.6625709533691406 }, { "auxiliary_loss_clip": 0.01109645, "auxiliary_loss_mlp": 0.01031456, "balance_loss_clip": 1.03889263, "balance_loss_mlp": 1.01947582, "epoch": 0.8690816173154968, "flos": 34936450076160.0, "grad_norm": 2.5283080783573615, "language_loss": 0.59421092, "learning_rate": 1.7704042128431552e-07, "loss": 0.61562192, "num_input_tokens_seen": 311831325, "step": 14455, "time_per_iteration": 2.716947078704834 }, { "auxiliary_loss_clip": 0.01088564, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.03601408, "balance_loss_mlp": 1.0188961, "epoch": 0.8691417405681647, "flos": 11614804151040.0, "grad_norm": 2.476455717843228, "language_loss": 0.80191058, "learning_rate": 1.7688025278801378e-07, "loss": 0.823107, "num_input_tokens_seen": 311848090, "step": 14456, "time_per_iteration": 2.608692169189453 }, { "auxiliary_loss_clip": 0.01050256, "auxiliary_loss_mlp": 0.01043748, "balance_loss_clip": 1.03250086, "balance_loss_mlp": 1.02862024, "epoch": 0.8692018638208328, "flos": 24608038677120.0, "grad_norm": 3.350924717538294, "language_loss": 0.74652326, "learning_rate": 1.7672015342459568e-07, "loss": 0.76746327, "num_input_tokens_seen": 311867855, "step": 14457, "time_per_iteration": 2.8124382495880127 }, { "auxiliary_loss_clip": 0.0104746, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.03249383, "balance_loss_mlp": 1.01879406, "epoch": 0.8692619870735007, "flos": 25995124229760.0, "grad_norm": 1.6659706537885548, "language_loss": 0.78279102, "learning_rate": 1.765601232001328e-07, "loss": 0.80357122, "num_input_tokens_seen": 311888675, "step": 14458, "time_per_iteration": 2.7865068912506104 }, { "auxiliary_loss_clip": 0.0109921, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.0370791, "balance_loss_mlp": 1.02152491, "epoch": 0.8693221103261687, "flos": 18041808856320.0, "grad_norm": 1.8944843149653803, "language_loss": 0.70788461, "learning_rate": 1.7640016212069187e-07, "loss": 0.72922325, "num_input_tokens_seen": 311907310, "step": 14459, "time_per_iteration": 4.2408952713012695 }, { "auxiliary_loss_clip": 0.01082625, "auxiliary_loss_mlp": 0.01030636, "balance_loss_clip": 1.03549707, "balance_loss_mlp": 1.01960313, "epoch": 0.8693822335788366, "flos": 27492347859840.0, "grad_norm": 1.4467054831762125, "language_loss": 0.73848921, "learning_rate": 1.762402701923398e-07, "loss": 0.75962174, "num_input_tokens_seen": 311929635, "step": 14460, "time_per_iteration": 4.442849636077881 }, { "auxiliary_loss_clip": 0.01092251, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.03765035, "balance_loss_mlp": 1.02094245, "epoch": 0.8694423568315046, "flos": 24097712198400.0, "grad_norm": 1.8288235329592715, "language_loss": 0.64751619, "learning_rate": 1.7608044742113947e-07, "loss": 0.66877288, "num_input_tokens_seen": 311948800, "step": 14461, "time_per_iteration": 2.68937087059021 }, { "auxiliary_loss_clip": 0.01093111, "auxiliary_loss_mlp": 0.01033043, "balance_loss_clip": 1.03253245, "balance_loss_mlp": 1.0203414, "epoch": 0.8695024800841725, "flos": 18362131367040.0, "grad_norm": 2.5518242110711933, "language_loss": 0.82737637, "learning_rate": 1.7592069381315123e-07, "loss": 0.84863782, "num_input_tokens_seen": 311964090, "step": 14462, "time_per_iteration": 5.744420289993286 }, { "auxiliary_loss_clip": 0.01096615, "auxiliary_loss_mlp": 0.01033503, "balance_loss_clip": 1.03401327, "balance_loss_mlp": 1.02065229, "epoch": 0.8695626033368405, "flos": 14027750133120.0, "grad_norm": 1.890249833404203, "language_loss": 0.65323138, "learning_rate": 1.757610093744335e-07, "loss": 0.67453253, "num_input_tokens_seen": 311981460, "step": 14463, "time_per_iteration": 2.601334810256958 }, { "auxiliary_loss_clip": 0.01091864, "auxiliary_loss_mlp": 0.01035596, "balance_loss_clip": 1.03908527, "balance_loss_mlp": 1.02291179, "epoch": 0.8696227265895085, "flos": 16836862193280.0, "grad_norm": 2.1647226205532206, "language_loss": 0.66890931, "learning_rate": 1.7560139411104058e-07, "loss": 0.690184, "num_input_tokens_seen": 312000115, "step": 14464, "time_per_iteration": 2.6851119995117188 }, { "auxiliary_loss_clip": 0.01090151, "auxiliary_loss_mlp": 0.01034839, "balance_loss_clip": 1.03739452, "balance_loss_mlp": 1.02226293, "epoch": 0.8696828498421765, "flos": 21799070271360.0, "grad_norm": 2.2457253344226245, "language_loss": 0.62439811, "learning_rate": 1.7544184802902607e-07, "loss": 0.64564812, "num_input_tokens_seen": 312020770, "step": 14465, "time_per_iteration": 2.79040265083313 }, { "auxiliary_loss_clip": 0.01091695, "auxiliary_loss_mlp": 0.01041479, "balance_loss_clip": 1.03505969, "balance_loss_mlp": 1.03027892, "epoch": 0.8697429730948444, "flos": 22894812610560.0, "grad_norm": 1.5293603652202958, "language_loss": 0.84881204, "learning_rate": 1.7528237113443934e-07, "loss": 0.87014377, "num_input_tokens_seen": 312041870, "step": 14466, "time_per_iteration": 2.636146306991577 }, { "auxiliary_loss_clip": 0.0108122, "auxiliary_loss_mlp": 0.01044083, "balance_loss_clip": 1.03755033, "balance_loss_mlp": 1.02939653, "epoch": 0.8698030963475124, "flos": 24717458482560.0, "grad_norm": 2.8453884595631846, "language_loss": 0.61869633, "learning_rate": 1.7512296343332779e-07, "loss": 0.63994938, "num_input_tokens_seen": 312058210, "step": 14467, "time_per_iteration": 2.6638076305389404 }, { "auxiliary_loss_clip": 0.01103261, "auxiliary_loss_mlp": 0.01028354, "balance_loss_clip": 1.03525686, "balance_loss_mlp": 1.01705909, "epoch": 0.8698632196001803, "flos": 28442221067520.0, "grad_norm": 1.4153650067531596, "language_loss": 0.68961638, "learning_rate": 1.7496362493173655e-07, "loss": 0.71093249, "num_input_tokens_seen": 312082665, "step": 14468, "time_per_iteration": 2.6570017337799072 }, { "auxiliary_loss_clip": 0.01083749, "auxiliary_loss_mlp": 0.01030565, "balance_loss_clip": 1.03446794, "balance_loss_mlp": 1.01894248, "epoch": 0.8699233428528483, "flos": 27636457224960.0, "grad_norm": 1.5754041648724575, "language_loss": 0.71199894, "learning_rate": 1.7480435563570773e-07, "loss": 0.73314214, "num_input_tokens_seen": 312101960, "step": 14469, "time_per_iteration": 2.6813437938690186 }, { "auxiliary_loss_clip": 0.01091595, "auxiliary_loss_mlp": 0.01032263, "balance_loss_clip": 1.03561163, "balance_loss_mlp": 1.0210638, "epoch": 0.8699834661055164, "flos": 20045659864320.0, "grad_norm": 1.885135452961054, "language_loss": 0.84151506, "learning_rate": 1.7464515555128024e-07, "loss": 0.86275363, "num_input_tokens_seen": 312117125, "step": 14470, "time_per_iteration": 2.6702113151550293 }, { "auxiliary_loss_clip": 0.01081371, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.03428483, "balance_loss_mlp": 1.02214813, "epoch": 0.8700435893581843, "flos": 23732787974400.0, "grad_norm": 1.7089523138026592, "language_loss": 0.72859287, "learning_rate": 1.7448602468449148e-07, "loss": 0.74975377, "num_input_tokens_seen": 312135775, "step": 14471, "time_per_iteration": 2.695295572280884 }, { "auxiliary_loss_clip": 0.01107843, "auxiliary_loss_mlp": 0.01025754, "balance_loss_clip": 1.03751683, "balance_loss_mlp": 1.01464319, "epoch": 0.8701037126108523, "flos": 23548422441600.0, "grad_norm": 1.3968254989831368, "language_loss": 0.78822994, "learning_rate": 1.7432696304137573e-07, "loss": 0.80956596, "num_input_tokens_seen": 312156070, "step": 14472, "time_per_iteration": 2.570103883743286 }, { "auxiliary_loss_clip": 0.01091602, "auxiliary_loss_mlp": 0.00771146, "balance_loss_clip": 1.03555846, "balance_loss_mlp": 1.00026262, "epoch": 0.8701638358635202, "flos": 18843442634880.0, "grad_norm": 2.053518578575987, "language_loss": 0.72808838, "learning_rate": 1.741679706279644e-07, "loss": 0.74671578, "num_input_tokens_seen": 312174380, "step": 14473, "time_per_iteration": 2.5629189014434814 }, { "auxiliary_loss_clip": 0.01111529, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.03829002, "balance_loss_mlp": 1.02074575, "epoch": 0.8702239591161882, "flos": 27928339142400.0, "grad_norm": 1.5975251132862047, "language_loss": 0.72459877, "learning_rate": 1.7400904745028644e-07, "loss": 0.74604738, "num_input_tokens_seen": 312195130, "step": 14474, "time_per_iteration": 2.629110097885132 }, { "auxiliary_loss_clip": 0.01084584, "auxiliary_loss_mlp": 0.01037278, "balance_loss_clip": 1.03387856, "balance_loss_mlp": 1.02389669, "epoch": 0.8702840823688561, "flos": 17233997938560.0, "grad_norm": 1.7683975899654203, "language_loss": 0.67307568, "learning_rate": 1.7385019351436925e-07, "loss": 0.69429433, "num_input_tokens_seen": 312212300, "step": 14475, "time_per_iteration": 2.7122128009796143 }, { "auxiliary_loss_clip": 0.01107714, "auxiliary_loss_mlp": 0.01025637, "balance_loss_clip": 1.03506005, "balance_loss_mlp": 1.01282167, "epoch": 0.8703442056215241, "flos": 19427565605760.0, "grad_norm": 1.7492617051008474, "language_loss": 0.77730834, "learning_rate": 1.736914088262349e-07, "loss": 0.79864192, "num_input_tokens_seen": 312231735, "step": 14476, "time_per_iteration": 2.6359400749206543 }, { "auxiliary_loss_clip": 0.01090317, "auxiliary_loss_mlp": 0.01034999, "balance_loss_clip": 1.03377438, "balance_loss_mlp": 1.02168965, "epoch": 0.8704043288741921, "flos": 22273845264000.0, "grad_norm": 1.9949328659253254, "language_loss": 0.72224838, "learning_rate": 1.7353269339190525e-07, "loss": 0.74350154, "num_input_tokens_seen": 312253060, "step": 14477, "time_per_iteration": 2.7253026962280273 }, { "auxiliary_loss_clip": 0.01100703, "auxiliary_loss_mlp": 0.01029682, "balance_loss_clip": 1.03792751, "balance_loss_mlp": 1.01752841, "epoch": 0.8704644521268601, "flos": 16648725732480.0, "grad_norm": 1.8285670196603703, "language_loss": 0.59689963, "learning_rate": 1.7337404721739946e-07, "loss": 0.61820352, "num_input_tokens_seen": 312269460, "step": 14478, "time_per_iteration": 2.6406443119049072 }, { "auxiliary_loss_clip": 0.01099279, "auxiliary_loss_mlp": 0.01028306, "balance_loss_clip": 1.04014349, "balance_loss_mlp": 1.01780367, "epoch": 0.870524575379528, "flos": 24280210224000.0, "grad_norm": 1.716825353140286, "language_loss": 0.71369159, "learning_rate": 1.732154703087323e-07, "loss": 0.73496747, "num_input_tokens_seen": 312289830, "step": 14479, "time_per_iteration": 2.6733837127685547 }, { "auxiliary_loss_clip": 0.01084359, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.03538418, "balance_loss_mlp": 1.01857221, "epoch": 0.870584698632196, "flos": 28768684803840.0, "grad_norm": 1.4964038489812062, "language_loss": 0.70916605, "learning_rate": 1.7305696267191805e-07, "loss": 0.73032045, "num_input_tokens_seen": 312311320, "step": 14480, "time_per_iteration": 2.724393367767334 }, { "auxiliary_loss_clip": 0.01056493, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.03123474, "balance_loss_mlp": 1.0217936, "epoch": 0.8706448218848639, "flos": 32449635774720.0, "grad_norm": 1.7419679363065612, "language_loss": 0.70210093, "learning_rate": 1.728985243129666e-07, "loss": 0.72300369, "num_input_tokens_seen": 312332095, "step": 14481, "time_per_iteration": 2.9082820415496826 }, { "auxiliary_loss_clip": 0.01096033, "auxiliary_loss_mlp": 0.0103073, "balance_loss_clip": 1.03603554, "balance_loss_mlp": 1.01895249, "epoch": 0.8707049451375319, "flos": 22748009725440.0, "grad_norm": 1.9715155189450182, "language_loss": 0.76938367, "learning_rate": 1.7274015523788643e-07, "loss": 0.79065132, "num_input_tokens_seen": 312351225, "step": 14482, "time_per_iteration": 2.663579225540161 }, { "auxiliary_loss_clip": 0.0108459, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.03461742, "balance_loss_mlp": 1.02019787, "epoch": 0.8707650683902, "flos": 15851976203520.0, "grad_norm": 1.9099743094346329, "language_loss": 0.76708519, "learning_rate": 1.7258185545268234e-07, "loss": 0.78826237, "num_input_tokens_seen": 312369730, "step": 14483, "time_per_iteration": 2.6323695182800293 }, { "auxiliary_loss_clip": 0.01102699, "auxiliary_loss_mlp": 0.01038639, "balance_loss_clip": 1.03712118, "balance_loss_mlp": 1.02540636, "epoch": 0.8708251916428679, "flos": 16468131127680.0, "grad_norm": 2.2142588001680856, "language_loss": 0.61881113, "learning_rate": 1.7242362496335749e-07, "loss": 0.64022452, "num_input_tokens_seen": 312386780, "step": 14484, "time_per_iteration": 2.710033893585205 }, { "auxiliary_loss_clip": 0.01108847, "auxiliary_loss_mlp": 0.01031795, "balance_loss_clip": 1.03816152, "balance_loss_mlp": 1.01980281, "epoch": 0.8708853148955359, "flos": 15377847655680.0, "grad_norm": 2.077574729557336, "language_loss": 0.68238926, "learning_rate": 1.7226546377591222e-07, "loss": 0.70379567, "num_input_tokens_seen": 312404875, "step": 14485, "time_per_iteration": 2.5754683017730713 }, { "auxiliary_loss_clip": 0.01050138, "auxiliary_loss_mlp": 0.00770399, "balance_loss_clip": 1.03129363, "balance_loss_mlp": 1.00021982, "epoch": 0.8709454381482038, "flos": 30551325903360.0, "grad_norm": 1.7252030737684174, "language_loss": 0.62990439, "learning_rate": 1.7210737189634373e-07, "loss": 0.64810973, "num_input_tokens_seen": 312425280, "step": 14486, "time_per_iteration": 2.9066638946533203 }, { "auxiliary_loss_clip": 0.01111488, "auxiliary_loss_mlp": 0.01033225, "balance_loss_clip": 1.03683174, "balance_loss_mlp": 1.02015388, "epoch": 0.8710055614008718, "flos": 22601422321920.0, "grad_norm": 1.8160916488481187, "language_loss": 0.61385965, "learning_rate": 1.7194934933064653e-07, "loss": 0.63530672, "num_input_tokens_seen": 312443835, "step": 14487, "time_per_iteration": 2.5739262104034424 }, { "auxiliary_loss_clip": 0.01081023, "auxiliary_loss_mlp": 0.00768637, "balance_loss_clip": 1.03572392, "balance_loss_mlp": 1.00022483, "epoch": 0.8710656846535397, "flos": 18443146492800.0, "grad_norm": 2.0123613366122126, "language_loss": 0.67942166, "learning_rate": 1.7179139608481318e-07, "loss": 0.6979183, "num_input_tokens_seen": 312460830, "step": 14488, "time_per_iteration": 2.7428195476531982 }, { "auxiliary_loss_clip": 0.01092486, "auxiliary_loss_mlp": 0.007699, "balance_loss_clip": 1.03904903, "balance_loss_mlp": 1.00028038, "epoch": 0.8711258079062077, "flos": 16503862181760.0, "grad_norm": 1.8864520858010565, "language_loss": 0.85530466, "learning_rate": 1.716335121648338e-07, "loss": 0.87392855, "num_input_tokens_seen": 312477575, "step": 14489, "time_per_iteration": 2.647411346435547 }, { "auxiliary_loss_clip": 0.01102857, "auxiliary_loss_mlp": 0.01030869, "balance_loss_clip": 1.03787231, "balance_loss_mlp": 1.01791716, "epoch": 0.8711859311588757, "flos": 15663336952320.0, "grad_norm": 11.279745936995974, "language_loss": 0.75571835, "learning_rate": 1.7147569757669445e-07, "loss": 0.77705562, "num_input_tokens_seen": 312492140, "step": 14490, "time_per_iteration": 2.602102041244507 }, { "auxiliary_loss_clip": 0.01100977, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.03637326, "balance_loss_mlp": 1.01625216, "epoch": 0.8712460544115437, "flos": 15557544420480.0, "grad_norm": 2.2840810833035157, "language_loss": 0.7581045, "learning_rate": 1.7131795232638012e-07, "loss": 0.77940881, "num_input_tokens_seen": 312508400, "step": 14491, "time_per_iteration": 2.600862503051758 }, { "auxiliary_loss_clip": 0.01080925, "auxiliary_loss_mlp": 0.01026616, "balance_loss_clip": 1.04117799, "balance_loss_mlp": 1.01437354, "epoch": 0.8713061776642116, "flos": 16763568491520.0, "grad_norm": 1.774399528748011, "language_loss": 0.67152178, "learning_rate": 1.711602764198723e-07, "loss": 0.69259721, "num_input_tokens_seen": 312525915, "step": 14492, "time_per_iteration": 2.666191577911377 }, { "auxiliary_loss_clip": 0.01095889, "auxiliary_loss_mlp": 0.01032256, "balance_loss_clip": 1.03753376, "balance_loss_mlp": 1.02096081, "epoch": 0.8713663009168796, "flos": 24279887001600.0, "grad_norm": 1.7247817112541417, "language_loss": 0.6931386, "learning_rate": 1.7100266986314992e-07, "loss": 0.71442008, "num_input_tokens_seen": 312544735, "step": 14493, "time_per_iteration": 2.6735992431640625 }, { "auxiliary_loss_clip": 0.01112164, "auxiliary_loss_mlp": 0.01033694, "balance_loss_clip": 1.03958261, "balance_loss_mlp": 1.02021742, "epoch": 0.8714264241695475, "flos": 23795594904960.0, "grad_norm": 2.938022699932479, "language_loss": 0.8914628, "learning_rate": 1.7084513266218936e-07, "loss": 0.91292143, "num_input_tokens_seen": 312557910, "step": 14494, "time_per_iteration": 2.5774879455566406 }, { "auxiliary_loss_clip": 0.01074718, "auxiliary_loss_mlp": 0.01032805, "balance_loss_clip": 1.03785324, "balance_loss_mlp": 1.02117586, "epoch": 0.8714865474222155, "flos": 37997942071680.0, "grad_norm": 1.9797291272398052, "language_loss": 0.59116101, "learning_rate": 1.7068766482296514e-07, "loss": 0.61223626, "num_input_tokens_seen": 312580360, "step": 14495, "time_per_iteration": 2.8289716243743896 }, { "auxiliary_loss_clip": 0.01076759, "auxiliary_loss_mlp": 0.01037611, "balance_loss_clip": 1.03488982, "balance_loss_mlp": 1.02474201, "epoch": 0.8715466706748836, "flos": 22455696844800.0, "grad_norm": 2.176188158663058, "language_loss": 0.80262101, "learning_rate": 1.7053026635144762e-07, "loss": 0.82376468, "num_input_tokens_seen": 312597550, "step": 14496, "time_per_iteration": 2.6638436317443848 }, { "auxiliary_loss_clip": 0.01083126, "auxiliary_loss_mlp": 0.01037102, "balance_loss_clip": 1.03796446, "balance_loss_mlp": 1.02335715, "epoch": 0.8716067939275515, "flos": 21215126868480.0, "grad_norm": 2.0021272743800536, "language_loss": 0.78574479, "learning_rate": 1.7037293725360624e-07, "loss": 0.80694699, "num_input_tokens_seen": 312616435, "step": 14497, "time_per_iteration": 2.6190896034240723 }, { "auxiliary_loss_clip": 0.01111391, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.03765655, "balance_loss_mlp": 1.01795101, "epoch": 0.8716669171802195, "flos": 22997732054400.0, "grad_norm": 1.9670976270372313, "language_loss": 0.67136586, "learning_rate": 1.70215677535406e-07, "loss": 0.69278824, "num_input_tokens_seen": 312632770, "step": 14498, "time_per_iteration": 4.060052394866943 }, { "auxiliary_loss_clip": 0.01070213, "auxiliary_loss_mlp": 0.01031582, "balance_loss_clip": 1.03320992, "balance_loss_mlp": 1.01950634, "epoch": 0.8717270404328874, "flos": 29784058462080.0, "grad_norm": 1.6975392941334262, "language_loss": 0.57051951, "learning_rate": 1.700584872028108e-07, "loss": 0.59153748, "num_input_tokens_seen": 312651900, "step": 14499, "time_per_iteration": 4.371240615844727 }, { "auxiliary_loss_clip": 0.01067535, "auxiliary_loss_mlp": 0.01035634, "balance_loss_clip": 1.03329492, "balance_loss_mlp": 1.02273571, "epoch": 0.8717871636855554, "flos": 22018125363840.0, "grad_norm": 2.018070377597452, "language_loss": 0.79869312, "learning_rate": 1.6990136626178097e-07, "loss": 0.8197248, "num_input_tokens_seen": 312671380, "step": 14500, "time_per_iteration": 2.641244888305664 }, { "auxiliary_loss_clip": 0.01093156, "auxiliary_loss_mlp": 0.0103193, "balance_loss_clip": 1.03767002, "balance_loss_mlp": 1.01997352, "epoch": 0.8718472869382233, "flos": 16654256426880.0, "grad_norm": 1.9117037031727822, "language_loss": 0.72699761, "learning_rate": 1.6974431471827466e-07, "loss": 0.74824846, "num_input_tokens_seen": 312689215, "step": 14501, "time_per_iteration": 5.817331552505493 }, { "auxiliary_loss_clip": 0.01072933, "auxiliary_loss_mlp": 0.0102922, "balance_loss_clip": 1.03339136, "balance_loss_mlp": 1.01612496, "epoch": 0.8719074101908914, "flos": 19495328613120.0, "grad_norm": 2.7665364794887934, "language_loss": 0.64852804, "learning_rate": 1.695873325782482e-07, "loss": 0.66954952, "num_input_tokens_seen": 312706400, "step": 14502, "time_per_iteration": 2.730670690536499 }, { "auxiliary_loss_clip": 0.01083793, "auxiliary_loss_mlp": 0.01040001, "balance_loss_clip": 1.03453636, "balance_loss_mlp": 1.02594066, "epoch": 0.8719675334435593, "flos": 33070890430080.0, "grad_norm": 1.7549915055892822, "language_loss": 0.68897182, "learning_rate": 1.6943041984765262e-07, "loss": 0.71020973, "num_input_tokens_seen": 312727985, "step": 14503, "time_per_iteration": 2.7599282264709473 }, { "auxiliary_loss_clip": 0.01085187, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.03614664, "balance_loss_mlp": 1.01606762, "epoch": 0.8720276566962273, "flos": 13626268842240.0, "grad_norm": 2.4452833389757833, "language_loss": 0.69641596, "learning_rate": 1.6927357653243912e-07, "loss": 0.71755278, "num_input_tokens_seen": 312745025, "step": 14504, "time_per_iteration": 2.651085376739502 }, { "auxiliary_loss_clip": 0.01095546, "auxiliary_loss_mlp": 0.00770191, "balance_loss_clip": 1.03598738, "balance_loss_mlp": 1.00016737, "epoch": 0.8720877799488952, "flos": 23514163845120.0, "grad_norm": 2.77338091149224, "language_loss": 0.7014603, "learning_rate": 1.691168026385552e-07, "loss": 0.72011769, "num_input_tokens_seen": 312764170, "step": 14505, "time_per_iteration": 2.6669936180114746 }, { "auxiliary_loss_clip": 0.010867, "auxiliary_loss_mlp": 0.01028087, "balance_loss_clip": 1.03689265, "balance_loss_mlp": 1.01638639, "epoch": 0.8721479032015632, "flos": 20814148368000.0, "grad_norm": 2.005999921971975, "language_loss": 0.78253883, "learning_rate": 1.6896009817194545e-07, "loss": 0.80368668, "num_input_tokens_seen": 312783830, "step": 14506, "time_per_iteration": 2.657680034637451 }, { "auxiliary_loss_clip": 0.01088712, "auxiliary_loss_mlp": 0.01028485, "balance_loss_clip": 1.03430939, "balance_loss_mlp": 1.01588416, "epoch": 0.8722080264542311, "flos": 19463655795840.0, "grad_norm": 2.6356366496590775, "language_loss": 0.73982906, "learning_rate": 1.6880346313855221e-07, "loss": 0.76100105, "num_input_tokens_seen": 312802015, "step": 14507, "time_per_iteration": 2.6549437046051025 }, { "auxiliary_loss_clip": 0.01050345, "auxiliary_loss_mlp": 0.01041227, "balance_loss_clip": 1.03153408, "balance_loss_mlp": 1.02601552, "epoch": 0.8722681497068991, "flos": 21761866759680.0, "grad_norm": 2.186590491088002, "language_loss": 0.72111464, "learning_rate": 1.686468975443156e-07, "loss": 0.74203038, "num_input_tokens_seen": 312820650, "step": 14508, "time_per_iteration": 2.7782466411590576 }, { "auxiliary_loss_clip": 0.01091843, "auxiliary_loss_mlp": 0.01035384, "balance_loss_clip": 1.03782344, "balance_loss_mlp": 1.02198446, "epoch": 0.8723282729595672, "flos": 28877134942080.0, "grad_norm": 9.271419619391889, "language_loss": 0.68848205, "learning_rate": 1.6849040139517202e-07, "loss": 0.70975429, "num_input_tokens_seen": 312841310, "step": 14509, "time_per_iteration": 2.729306221008301 }, { "auxiliary_loss_clip": 0.01084143, "auxiliary_loss_mlp": 0.01032677, "balance_loss_clip": 1.03603458, "balance_loss_mlp": 1.02049422, "epoch": 0.8723883962122351, "flos": 26469145036800.0, "grad_norm": 1.83494283279599, "language_loss": 0.58361018, "learning_rate": 1.683339746970558e-07, "loss": 0.60477841, "num_input_tokens_seen": 312862100, "step": 14510, "time_per_iteration": 2.712592363357544 }, { "auxiliary_loss_clip": 0.01115632, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.03837419, "balance_loss_mlp": 1.01929033, "epoch": 0.8724485194649031, "flos": 20521476351360.0, "grad_norm": 2.9455639532360003, "language_loss": 0.67271483, "learning_rate": 1.6817761745589865e-07, "loss": 0.69419849, "num_input_tokens_seen": 312880220, "step": 14511, "time_per_iteration": 2.6101818084716797 }, { "auxiliary_loss_clip": 0.01066568, "auxiliary_loss_mlp": 0.01035139, "balance_loss_clip": 1.03755903, "balance_loss_mlp": 1.02190125, "epoch": 0.872508642717571, "flos": 24353360271360.0, "grad_norm": 1.5822238245751863, "language_loss": 0.81579173, "learning_rate": 1.6802132967763027e-07, "loss": 0.8368088, "num_input_tokens_seen": 312900765, "step": 14512, "time_per_iteration": 2.8737993240356445 }, { "auxiliary_loss_clip": 0.01013613, "auxiliary_loss_mlp": 0.01001982, "balance_loss_clip": 1.01023149, "balance_loss_mlp": 1.00103402, "epoch": 0.872568765970239, "flos": 61410012485760.0, "grad_norm": 0.7938781120275261, "language_loss": 0.58586168, "learning_rate": 1.6786511136817617e-07, "loss": 0.60601765, "num_input_tokens_seen": 312955840, "step": 14513, "time_per_iteration": 3.0974059104919434 }, { "auxiliary_loss_clip": 0.01099507, "auxiliary_loss_mlp": 0.01033855, "balance_loss_clip": 1.03738713, "balance_loss_mlp": 1.02111769, "epoch": 0.8726288892229069, "flos": 22598046443520.0, "grad_norm": 1.742471393477679, "language_loss": 0.76562905, "learning_rate": 1.6770896253346112e-07, "loss": 0.78696269, "num_input_tokens_seen": 312973565, "step": 14514, "time_per_iteration": 2.6420650482177734 }, { "auxiliary_loss_clip": 0.01103565, "auxiliary_loss_mlp": 0.0102728, "balance_loss_clip": 1.03866398, "balance_loss_mlp": 1.01560926, "epoch": 0.872689012475575, "flos": 25885201633920.0, "grad_norm": 1.9498734403168592, "language_loss": 0.6555599, "learning_rate": 1.675528831794055e-07, "loss": 0.67686838, "num_input_tokens_seen": 312994660, "step": 14515, "time_per_iteration": 2.6264796257019043 }, { "auxiliary_loss_clip": 0.01097256, "auxiliary_loss_mlp": 0.01035195, "balance_loss_clip": 1.03490353, "balance_loss_mlp": 1.02188492, "epoch": 0.8727491357282429, "flos": 21506721477120.0, "grad_norm": 2.001096470926363, "language_loss": 0.79334152, "learning_rate": 1.6739687331192842e-07, "loss": 0.81466603, "num_input_tokens_seen": 313009860, "step": 14516, "time_per_iteration": 2.620288133621216 }, { "auxiliary_loss_clip": 0.01112304, "auxiliary_loss_mlp": 0.01034072, "balance_loss_clip": 1.0381372, "balance_loss_mlp": 1.02127457, "epoch": 0.8728092589809109, "flos": 19207504932480.0, "grad_norm": 2.0299342762070927, "language_loss": 0.72229123, "learning_rate": 1.672409329369453e-07, "loss": 0.74375498, "num_input_tokens_seen": 313027025, "step": 14517, "time_per_iteration": 2.5668914318084717 }, { "auxiliary_loss_clip": 0.0106993, "auxiliary_loss_mlp": 0.01024167, "balance_loss_clip": 1.03314495, "balance_loss_mlp": 1.01283014, "epoch": 0.8728693822335788, "flos": 20595308757120.0, "grad_norm": 2.054216166652221, "language_loss": 0.72725064, "learning_rate": 1.6708506206036966e-07, "loss": 0.74819165, "num_input_tokens_seen": 313046830, "step": 14518, "time_per_iteration": 2.6475393772125244 }, { "auxiliary_loss_clip": 0.01081214, "auxiliary_loss_mlp": 0.01038057, "balance_loss_clip": 1.03350496, "balance_loss_mlp": 1.02506304, "epoch": 0.8729295054862468, "flos": 21728613744000.0, "grad_norm": 1.3596830366410917, "language_loss": 0.743572, "learning_rate": 1.6692926068811275e-07, "loss": 0.76476473, "num_input_tokens_seen": 313067715, "step": 14519, "time_per_iteration": 2.6572721004486084 }, { "auxiliary_loss_clip": 0.01099689, "auxiliary_loss_mlp": 0.01031709, "balance_loss_clip": 1.03680825, "balance_loss_mlp": 1.0181669, "epoch": 0.8729896287389147, "flos": 17673436926720.0, "grad_norm": 2.5396553116313205, "language_loss": 0.76397449, "learning_rate": 1.6677352882608142e-07, "loss": 0.78528845, "num_input_tokens_seen": 313082305, "step": 14520, "time_per_iteration": 2.5867063999176025 }, { "auxiliary_loss_clip": 0.01086668, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.03518891, "balance_loss_mlp": 1.02296638, "epoch": 0.8730497519915827, "flos": 24571804832640.0, "grad_norm": 1.6038658913961292, "language_loss": 0.82005751, "learning_rate": 1.666178664801816e-07, "loss": 0.84128582, "num_input_tokens_seen": 313101190, "step": 14521, "time_per_iteration": 2.7092795372009277 }, { "auxiliary_loss_clip": 0.01097676, "auxiliary_loss_mlp": 0.01032217, "balance_loss_clip": 1.03878248, "balance_loss_mlp": 1.01914012, "epoch": 0.8731098752442508, "flos": 13443734903040.0, "grad_norm": 1.8658353480537415, "language_loss": 0.76242197, "learning_rate": 1.6646227365631616e-07, "loss": 0.78372091, "num_input_tokens_seen": 313118965, "step": 14522, "time_per_iteration": 2.5802886486053467 }, { "auxiliary_loss_clip": 0.01094482, "auxiliary_loss_mlp": 0.00769289, "balance_loss_clip": 1.03429079, "balance_loss_mlp": 1.0001862, "epoch": 0.8731699984969187, "flos": 23474446381440.0, "grad_norm": 3.16869295355315, "language_loss": 0.75775874, "learning_rate": 1.66306750360385e-07, "loss": 0.77639639, "num_input_tokens_seen": 313139280, "step": 14523, "time_per_iteration": 2.684039831161499 }, { "auxiliary_loss_clip": 0.01097173, "auxiliary_loss_mlp": 0.01032086, "balance_loss_clip": 1.03595114, "balance_loss_mlp": 1.01999831, "epoch": 0.8732301217495867, "flos": 17712651600000.0, "grad_norm": 2.782713247138861, "language_loss": 0.78118378, "learning_rate": 1.6615129659828542e-07, "loss": 0.80247641, "num_input_tokens_seen": 313156655, "step": 14524, "time_per_iteration": 2.5906875133514404 }, { "auxiliary_loss_clip": 0.01089545, "auxiliary_loss_mlp": 0.01031376, "balance_loss_clip": 1.03745615, "balance_loss_mlp": 1.02009869, "epoch": 0.8732902450022546, "flos": 22054359208320.0, "grad_norm": 4.924039303176845, "language_loss": 0.77730787, "learning_rate": 1.6599591237591272e-07, "loss": 0.79851705, "num_input_tokens_seen": 313174050, "step": 14525, "time_per_iteration": 2.6270298957824707 }, { "auxiliary_loss_clip": 0.01020522, "auxiliary_loss_mlp": 0.01034516, "balance_loss_clip": 1.03363109, "balance_loss_mlp": 1.02209401, "epoch": 0.8733503682549226, "flos": 22272983337600.0, "grad_norm": 2.157402662097444, "language_loss": 0.6920954, "learning_rate": 1.6584059769915902e-07, "loss": 0.71264577, "num_input_tokens_seen": 313192765, "step": 14526, "time_per_iteration": 3.1794915199279785 }, { "auxiliary_loss_clip": 0.01059512, "auxiliary_loss_mlp": 0.01041504, "balance_loss_clip": 1.03597927, "balance_loss_mlp": 1.02804565, "epoch": 0.8734104915075905, "flos": 23364344217600.0, "grad_norm": 2.126615018801638, "language_loss": 0.6124419, "learning_rate": 1.6568535257391326e-07, "loss": 0.63345206, "num_input_tokens_seen": 313210925, "step": 14527, "time_per_iteration": 2.93717098236084 }, { "auxiliary_loss_clip": 0.01102101, "auxiliary_loss_mlp": 0.01036741, "balance_loss_clip": 1.04113436, "balance_loss_mlp": 1.02263236, "epoch": 0.8734706147602586, "flos": 17712292464000.0, "grad_norm": 1.9327506841110211, "language_loss": 0.65617096, "learning_rate": 1.6553017700606265e-07, "loss": 0.67755938, "num_input_tokens_seen": 313228250, "step": 14528, "time_per_iteration": 2.5247788429260254 }, { "auxiliary_loss_clip": 0.01080324, "auxiliary_loss_mlp": 0.01027224, "balance_loss_clip": 1.03828454, "balance_loss_mlp": 1.01499307, "epoch": 0.8735307380129265, "flos": 22049367217920.0, "grad_norm": 2.128650528943947, "language_loss": 0.89494413, "learning_rate": 1.6537507100149205e-07, "loss": 0.91601956, "num_input_tokens_seen": 313247880, "step": 14529, "time_per_iteration": 2.800915241241455 }, { "auxiliary_loss_clip": 0.01085933, "auxiliary_loss_mlp": 0.01031322, "balance_loss_clip": 1.03527832, "balance_loss_mlp": 1.01898432, "epoch": 0.8735908612655945, "flos": 25338425829120.0, "grad_norm": 1.740049553302022, "language_loss": 0.84358543, "learning_rate": 1.6522003456608258e-07, "loss": 0.8647579, "num_input_tokens_seen": 313266790, "step": 14530, "time_per_iteration": 2.7246882915496826 }, { "auxiliary_loss_clip": 0.01086126, "auxiliary_loss_mlp": 0.01038129, "balance_loss_clip": 1.03533483, "balance_loss_mlp": 1.02629161, "epoch": 0.8736509845182624, "flos": 21540908246400.0, "grad_norm": 2.065068159593715, "language_loss": 0.74541724, "learning_rate": 1.650650677057128e-07, "loss": 0.7666598, "num_input_tokens_seen": 313286805, "step": 14531, "time_per_iteration": 2.7866251468658447 }, { "auxiliary_loss_clip": 0.01094848, "auxiliary_loss_mlp": 0.0103322, "balance_loss_clip": 1.0341115, "balance_loss_mlp": 1.02093542, "epoch": 0.8737111077709304, "flos": 22017227523840.0, "grad_norm": 2.6296616466434655, "language_loss": 0.6131202, "learning_rate": 1.6491017042625966e-07, "loss": 0.6344009, "num_input_tokens_seen": 313305415, "step": 14532, "time_per_iteration": 2.677741289138794 }, { "auxiliary_loss_clip": 0.01018177, "auxiliary_loss_mlp": 0.01004849, "balance_loss_clip": 1.005548, "balance_loss_mlp": 1.00377011, "epoch": 0.8737712310235983, "flos": 70066315912320.0, "grad_norm": 0.9206045969458919, "language_loss": 0.58650947, "learning_rate": 1.6475534273359704e-07, "loss": 0.60673976, "num_input_tokens_seen": 313369940, "step": 14533, "time_per_iteration": 4.089330434799194 }, { "auxiliary_loss_clip": 0.01079874, "auxiliary_loss_mlp": 0.01032746, "balance_loss_clip": 1.03403592, "balance_loss_mlp": 1.02048564, "epoch": 0.8738313542762663, "flos": 28658331244800.0, "grad_norm": 1.49408783242758, "language_loss": 0.76831782, "learning_rate": 1.646005846335954e-07, "loss": 0.78944403, "num_input_tokens_seen": 313390965, "step": 14534, "time_per_iteration": 2.702711582183838 }, { "auxiliary_loss_clip": 0.0108079, "auxiliary_loss_mlp": 0.01033546, "balance_loss_clip": 1.03330386, "balance_loss_mlp": 1.02107036, "epoch": 0.8738914775289344, "flos": 22346384780160.0, "grad_norm": 1.7135543711038013, "language_loss": 0.75193512, "learning_rate": 1.6444589613212357e-07, "loss": 0.77307844, "num_input_tokens_seen": 313409680, "step": 14535, "time_per_iteration": 2.6537675857543945 }, { "auxiliary_loss_clip": 0.01107851, "auxiliary_loss_mlp": 0.01033358, "balance_loss_clip": 1.03563666, "balance_loss_mlp": 1.02093053, "epoch": 0.8739516007816023, "flos": 31759648444800.0, "grad_norm": 2.0846644532444625, "language_loss": 0.74546909, "learning_rate": 1.64291277235048e-07, "loss": 0.76688123, "num_input_tokens_seen": 313431335, "step": 14536, "time_per_iteration": 2.6706697940826416 }, { "auxiliary_loss_clip": 0.01087464, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.03460896, "balance_loss_mlp": 1.01939237, "epoch": 0.8740117240342703, "flos": 21211715076480.0, "grad_norm": 1.8068501761157092, "language_loss": 0.63835013, "learning_rate": 1.641367279482304e-07, "loss": 0.65953726, "num_input_tokens_seen": 313449225, "step": 14537, "time_per_iteration": 4.280652761459351 }, { "auxiliary_loss_clip": 0.01094433, "auxiliary_loss_mlp": 0.01028086, "balance_loss_clip": 1.03392243, "balance_loss_mlp": 1.01478267, "epoch": 0.8740718472869382, "flos": 25186666867200.0, "grad_norm": 1.8076510907949124, "language_loss": 0.57990402, "learning_rate": 1.6398224827753216e-07, "loss": 0.60112923, "num_input_tokens_seen": 313467715, "step": 14538, "time_per_iteration": 4.291844844818115 }, { "auxiliary_loss_clip": 0.01096418, "auxiliary_loss_mlp": 0.01025884, "balance_loss_clip": 1.03719354, "balance_loss_mlp": 1.0136714, "epoch": 0.8741319705396062, "flos": 19500931134720.0, "grad_norm": 1.7388451814310184, "language_loss": 0.68716401, "learning_rate": 1.6382783822881142e-07, "loss": 0.70838702, "num_input_tokens_seen": 313486805, "step": 14539, "time_per_iteration": 2.5990817546844482 }, { "auxiliary_loss_clip": 0.01101524, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.03593516, "balance_loss_mlp": 1.01815796, "epoch": 0.8741920937922741, "flos": 14100900180480.0, "grad_norm": 2.0449241273671355, "language_loss": 0.74361241, "learning_rate": 1.6367349780792262e-07, "loss": 0.76493561, "num_input_tokens_seen": 313504880, "step": 14540, "time_per_iteration": 2.6135077476501465 }, { "auxiliary_loss_clip": 0.01082066, "auxiliary_loss_mlp": 0.0103892, "balance_loss_clip": 1.03429246, "balance_loss_mlp": 1.02535379, "epoch": 0.8742522170449422, "flos": 27709858667520.0, "grad_norm": 2.2042306692212947, "language_loss": 0.78727126, "learning_rate": 1.635192270207193e-07, "loss": 0.8084811, "num_input_tokens_seen": 313524995, "step": 14541, "time_per_iteration": 5.828189849853516 }, { "auxiliary_loss_clip": 0.01068115, "auxiliary_loss_mlp": 0.01034781, "balance_loss_clip": 1.03299069, "balance_loss_mlp": 1.02049947, "epoch": 0.8743123402976101, "flos": 21142587352320.0, "grad_norm": 2.5163397271017724, "language_loss": 0.66620183, "learning_rate": 1.6336502587305035e-07, "loss": 0.68723083, "num_input_tokens_seen": 313541740, "step": 14542, "time_per_iteration": 2.7577908039093018 }, { "auxiliary_loss_clip": 0.01027438, "auxiliary_loss_mlp": 0.0100168, "balance_loss_clip": 1.00493681, "balance_loss_mlp": 1.00071454, "epoch": 0.8743724635502781, "flos": 60870024351360.0, "grad_norm": 0.7818261146678972, "language_loss": 0.54485422, "learning_rate": 1.632108943707642e-07, "loss": 0.56514537, "num_input_tokens_seen": 313593445, "step": 14543, "time_per_iteration": 2.908863067626953 }, { "auxiliary_loss_clip": 0.01084752, "auxiliary_loss_mlp": 0.01035429, "balance_loss_clip": 1.0375371, "balance_loss_mlp": 1.02258444, "epoch": 0.874432586802946, "flos": 28109292883200.0, "grad_norm": 2.3839087640585457, "language_loss": 0.69428027, "learning_rate": 1.6305683251970458e-07, "loss": 0.71548212, "num_input_tokens_seen": 313615640, "step": 14544, "time_per_iteration": 2.6920766830444336 }, { "auxiliary_loss_clip": 0.01064253, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.03769612, "balance_loss_mlp": 1.01798081, "epoch": 0.874492710055614, "flos": 23550289948800.0, "grad_norm": 1.7246009574285497, "language_loss": 0.75945365, "learning_rate": 1.62902840325714e-07, "loss": 0.78038573, "num_input_tokens_seen": 313635550, "step": 14545, "time_per_iteration": 2.7786312103271484 }, { "auxiliary_loss_clip": 0.01097234, "auxiliary_loss_mlp": 0.00771469, "balance_loss_clip": 1.03498626, "balance_loss_mlp": 1.00026131, "epoch": 0.8745528333082819, "flos": 40915647924480.0, "grad_norm": 10.499099096665093, "language_loss": 0.66618592, "learning_rate": 1.6274891779463217e-07, "loss": 0.68487293, "num_input_tokens_seen": 313659275, "step": 14546, "time_per_iteration": 2.8346989154815674 }, { "auxiliary_loss_clip": 0.01109602, "auxiliary_loss_mlp": 0.01030191, "balance_loss_clip": 1.03745484, "balance_loss_mlp": 1.01785886, "epoch": 0.87461295656095, "flos": 23622901292160.0, "grad_norm": 1.5789135583569807, "language_loss": 0.7296229, "learning_rate": 1.6259506493229536e-07, "loss": 0.75102079, "num_input_tokens_seen": 313680595, "step": 14547, "time_per_iteration": 2.659517526626587 }, { "auxiliary_loss_clip": 0.01115124, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.03795385, "balance_loss_mlp": 1.02661061, "epoch": 0.874673079813618, "flos": 38794116983040.0, "grad_norm": 3.3678360175538087, "language_loss": 0.69317234, "learning_rate": 1.6244128174453752e-07, "loss": 0.71472466, "num_input_tokens_seen": 313699730, "step": 14548, "time_per_iteration": 2.754931926727295 }, { "auxiliary_loss_clip": 0.01090989, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.03693557, "balance_loss_mlp": 1.02118921, "epoch": 0.8747332030662859, "flos": 23696159080320.0, "grad_norm": 2.005045026903121, "language_loss": 0.70676434, "learning_rate": 1.6228756823719093e-07, "loss": 0.72801286, "num_input_tokens_seen": 313720090, "step": 14549, "time_per_iteration": 2.8153107166290283 }, { "auxiliary_loss_clip": 0.01101259, "auxiliary_loss_mlp": 0.00772545, "balance_loss_clip": 1.0357511, "balance_loss_mlp": 1.00031376, "epoch": 0.8747933263189539, "flos": 24462456854400.0, "grad_norm": 2.512472286488796, "language_loss": 0.84052968, "learning_rate": 1.6213392441608352e-07, "loss": 0.85926771, "num_input_tokens_seen": 313736795, "step": 14550, "time_per_iteration": 2.6691277027130127 }, { "auxiliary_loss_clip": 0.01100072, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.03686762, "balance_loss_mlp": 1.02883005, "epoch": 0.8748534495716218, "flos": 13809161917440.0, "grad_norm": 1.6392278362685582, "language_loss": 0.71681327, "learning_rate": 1.6198035028704183e-07, "loss": 0.7382248, "num_input_tokens_seen": 313754820, "step": 14551, "time_per_iteration": 2.6196999549865723 }, { "auxiliary_loss_clip": 0.01098688, "auxiliary_loss_mlp": 0.00770542, "balance_loss_clip": 1.03751254, "balance_loss_mlp": 1.00018144, "epoch": 0.8749135728242898, "flos": 29862092759040.0, "grad_norm": 5.521178940955395, "language_loss": 0.64576298, "learning_rate": 1.6182684585588934e-07, "loss": 0.66445529, "num_input_tokens_seen": 313775830, "step": 14552, "time_per_iteration": 2.7710392475128174 }, { "auxiliary_loss_clip": 0.01078604, "auxiliary_loss_mlp": 0.01028395, "balance_loss_clip": 1.03420365, "balance_loss_mlp": 1.01439333, "epoch": 0.8749736960769577, "flos": 24133479166080.0, "grad_norm": 5.011357337141667, "language_loss": 0.79550266, "learning_rate": 1.616734111284479e-07, "loss": 0.81657255, "num_input_tokens_seen": 313795745, "step": 14553, "time_per_iteration": 2.7544870376586914 }, { "auxiliary_loss_clip": 0.01093009, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.03364944, "balance_loss_mlp": 1.02119756, "epoch": 0.8750338193296258, "flos": 17202540602880.0, "grad_norm": 2.1871328119231337, "language_loss": 0.70039916, "learning_rate": 1.6152004611053416e-07, "loss": 0.72166622, "num_input_tokens_seen": 313813895, "step": 14554, "time_per_iteration": 2.5449023246765137 }, { "auxiliary_loss_clip": 0.01091308, "auxiliary_loss_mlp": 0.00770366, "balance_loss_clip": 1.03953791, "balance_loss_mlp": 1.00012708, "epoch": 0.8750939425822937, "flos": 23733218937600.0, "grad_norm": 1.5371757112217883, "language_loss": 0.83528662, "learning_rate": 1.6136675080796457e-07, "loss": 0.85390329, "num_input_tokens_seen": 313834225, "step": 14555, "time_per_iteration": 2.712270498275757 }, { "auxiliary_loss_clip": 0.01097341, "auxiliary_loss_mlp": 0.01034286, "balance_loss_clip": 1.03663278, "balance_loss_mlp": 1.02133369, "epoch": 0.8751540658349617, "flos": 26541684552960.0, "grad_norm": 1.5869522480564469, "language_loss": 0.71009433, "learning_rate": 1.6121352522655252e-07, "loss": 0.73141062, "num_input_tokens_seen": 313854430, "step": 14556, "time_per_iteration": 2.626359462738037 }, { "auxiliary_loss_clip": 0.01093494, "auxiliary_loss_mlp": 0.01036101, "balance_loss_clip": 1.03601527, "balance_loss_mlp": 1.02195692, "epoch": 0.8752141890876296, "flos": 19386806647680.0, "grad_norm": 1.8472844895882763, "language_loss": 0.76663041, "learning_rate": 1.6106036937210732e-07, "loss": 0.78792638, "num_input_tokens_seen": 313871600, "step": 14557, "time_per_iteration": 2.7687621116638184 }, { "auxiliary_loss_clip": 0.01072231, "auxiliary_loss_mlp": 0.01039476, "balance_loss_clip": 1.03658962, "balance_loss_mlp": 1.02650011, "epoch": 0.8752743123402976, "flos": 25374408278400.0, "grad_norm": 1.8980752716365015, "language_loss": 0.83232927, "learning_rate": 1.6090728325043767e-07, "loss": 0.85344636, "num_input_tokens_seen": 313891570, "step": 14558, "time_per_iteration": 2.7216644287109375 }, { "auxiliary_loss_clip": 0.01027546, "auxiliary_loss_mlp": 0.01003435, "balance_loss_clip": 1.00482631, "balance_loss_mlp": 1.00239205, "epoch": 0.8753344355929655, "flos": 59952398578560.0, "grad_norm": 0.8156616177259552, "language_loss": 0.56093448, "learning_rate": 1.6075426686734784e-07, "loss": 0.58124429, "num_input_tokens_seen": 313951290, "step": 14559, "time_per_iteration": 3.1608095169067383 }, { "auxiliary_loss_clip": 0.01099027, "auxiliary_loss_mlp": 0.01035007, "balance_loss_clip": 1.03775668, "balance_loss_mlp": 1.02299678, "epoch": 0.8753945588456336, "flos": 17894646835200.0, "grad_norm": 2.769429121490499, "language_loss": 0.66112006, "learning_rate": 1.606013202286407e-07, "loss": 0.68246031, "num_input_tokens_seen": 313968645, "step": 14560, "time_per_iteration": 2.62923526763916 }, { "auxiliary_loss_clip": 0.011089, "auxiliary_loss_mlp": 0.01030691, "balance_loss_clip": 1.03712916, "balance_loss_mlp": 1.0187583, "epoch": 0.8754546820983016, "flos": 30914885410560.0, "grad_norm": 3.865819478454591, "language_loss": 0.78949714, "learning_rate": 1.6044844334011541e-07, "loss": 0.810893, "num_input_tokens_seen": 313987580, "step": 14561, "time_per_iteration": 2.6706154346466064 }, { "auxiliary_loss_clip": 0.01109674, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.03582835, "balance_loss_mlp": 1.01984262, "epoch": 0.8755148053509695, "flos": 20631075724800.0, "grad_norm": 1.9781083362240712, "language_loss": 0.77276206, "learning_rate": 1.6029563620756982e-07, "loss": 0.79419112, "num_input_tokens_seen": 314004460, "step": 14562, "time_per_iteration": 2.5154237747192383 }, { "auxiliary_loss_clip": 0.01103173, "auxiliary_loss_mlp": 0.01027848, "balance_loss_clip": 1.03530455, "balance_loss_mlp": 1.0163027, "epoch": 0.8755749286036375, "flos": 34969739005440.0, "grad_norm": 1.5352533116826146, "language_loss": 0.71789098, "learning_rate": 1.601428988367981e-07, "loss": 0.73920125, "num_input_tokens_seen": 314026855, "step": 14563, "time_per_iteration": 2.743906021118164 }, { "auxiliary_loss_clip": 0.01114581, "auxiliary_loss_mlp": 0.01034001, "balance_loss_clip": 1.04004955, "balance_loss_mlp": 1.0215075, "epoch": 0.8756350518563054, "flos": 18186456925440.0, "grad_norm": 2.1781284121642304, "language_loss": 0.65630162, "learning_rate": 1.5999023123359235e-07, "loss": 0.67778742, "num_input_tokens_seen": 314042830, "step": 14564, "time_per_iteration": 2.601315498352051 }, { "auxiliary_loss_clip": 0.01095159, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.03489327, "balance_loss_mlp": 1.02443063, "epoch": 0.8756951751089734, "flos": 20084012611200.0, "grad_norm": 1.7268939160144312, "language_loss": 0.7091375, "learning_rate": 1.598376334037408e-07, "loss": 0.73045349, "num_input_tokens_seen": 314062225, "step": 14565, "time_per_iteration": 2.67029070854187 }, { "auxiliary_loss_clip": 0.01092949, "auxiliary_loss_mlp": 0.01036021, "balance_loss_clip": 1.03708506, "balance_loss_mlp": 1.02246666, "epoch": 0.8757552983616413, "flos": 27525241739520.0, "grad_norm": 1.5872462776777525, "language_loss": 0.77823293, "learning_rate": 1.5968510535303102e-07, "loss": 0.79952264, "num_input_tokens_seen": 314082325, "step": 14566, "time_per_iteration": 2.728349447250366 }, { "auxiliary_loss_clip": 0.01087655, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.03929698, "balance_loss_mlp": 1.01946163, "epoch": 0.8758154216143094, "flos": 18073014796800.0, "grad_norm": 1.606606930203952, "language_loss": 0.71347201, "learning_rate": 1.5953264708724624e-07, "loss": 0.73466635, "num_input_tokens_seen": 314100310, "step": 14567, "time_per_iteration": 2.6560468673706055 }, { "auxiliary_loss_clip": 0.01089483, "auxiliary_loss_mlp": 0.00770872, "balance_loss_clip": 1.0368377, "balance_loss_mlp": 1.00015092, "epoch": 0.8758755448669773, "flos": 25045681985280.0, "grad_norm": 1.924193327132232, "language_loss": 0.74096954, "learning_rate": 1.5938025861216776e-07, "loss": 0.7595731, "num_input_tokens_seen": 314121330, "step": 14568, "time_per_iteration": 2.669600248336792 }, { "auxiliary_loss_clip": 0.0106924, "auxiliary_loss_mlp": 0.01031283, "balance_loss_clip": 1.03213978, "balance_loss_mlp": 1.01898623, "epoch": 0.8759356681196453, "flos": 22856818999680.0, "grad_norm": 2.753044994093851, "language_loss": 0.86606205, "learning_rate": 1.5922793993357475e-07, "loss": 0.88706732, "num_input_tokens_seen": 314139875, "step": 14569, "time_per_iteration": 2.7353930473327637 }, { "auxiliary_loss_clip": 0.01069957, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.0342617, "balance_loss_mlp": 1.02065396, "epoch": 0.8759957913723132, "flos": 21032521102080.0, "grad_norm": 1.818630760471602, "language_loss": 0.74142909, "learning_rate": 1.5907569105724284e-07, "loss": 0.76245314, "num_input_tokens_seen": 314157850, "step": 14570, "time_per_iteration": 2.699028253555298 }, { "auxiliary_loss_clip": 0.01100775, "auxiliary_loss_mlp": 0.00770915, "balance_loss_clip": 1.03732276, "balance_loss_mlp": 1.00026119, "epoch": 0.8760559146249812, "flos": 20010467514240.0, "grad_norm": 1.5893457614137378, "language_loss": 0.67510492, "learning_rate": 1.5892351198894472e-07, "loss": 0.69382179, "num_input_tokens_seen": 314176720, "step": 14571, "time_per_iteration": 2.617493152618408 }, { "auxiliary_loss_clip": 0.01069948, "auxiliary_loss_mlp": 0.01027892, "balance_loss_clip": 1.03497171, "balance_loss_mlp": 1.01635253, "epoch": 0.8761160378776491, "flos": 19974161842560.0, "grad_norm": 2.1723550236606486, "language_loss": 0.62609088, "learning_rate": 1.5877140273445156e-07, "loss": 0.64706922, "num_input_tokens_seen": 314196645, "step": 14572, "time_per_iteration": 2.80468487739563 }, { "auxiliary_loss_clip": 0.01095539, "auxiliary_loss_mlp": 0.01029157, "balance_loss_clip": 1.03618896, "balance_loss_mlp": 1.01790953, "epoch": 0.8761761611303172, "flos": 28804415857920.0, "grad_norm": 1.6603874349444352, "language_loss": 0.73751938, "learning_rate": 1.5861936329953162e-07, "loss": 0.75876629, "num_input_tokens_seen": 314217430, "step": 14573, "time_per_iteration": 2.8996636867523193 }, { "auxiliary_loss_clip": 0.01058502, "auxiliary_loss_mlp": 0.0076881, "balance_loss_clip": 1.03545105, "balance_loss_mlp": 1.00015557, "epoch": 0.8762362843829851, "flos": 18332505624960.0, "grad_norm": 1.9024608944750214, "language_loss": 0.72550857, "learning_rate": 1.5846739368994966e-07, "loss": 0.74378169, "num_input_tokens_seen": 314235310, "step": 14574, "time_per_iteration": 2.7545413970947266 }, { "auxiliary_loss_clip": 0.01095926, "auxiliary_loss_mlp": 0.01036229, "balance_loss_clip": 1.03621411, "balance_loss_mlp": 1.02418888, "epoch": 0.8762964076356531, "flos": 15779149378560.0, "grad_norm": 1.8502793872644558, "language_loss": 0.76065028, "learning_rate": 1.5831549391146903e-07, "loss": 0.78197181, "num_input_tokens_seen": 314252355, "step": 14575, "time_per_iteration": 2.5257208347320557 }, { "auxiliary_loss_clip": 0.01081299, "auxiliary_loss_mlp": 0.01038665, "balance_loss_clip": 1.03579473, "balance_loss_mlp": 1.02677417, "epoch": 0.8763565308883211, "flos": 33176754789120.0, "grad_norm": 1.9305081146362895, "language_loss": 0.66477948, "learning_rate": 1.5816366396984916e-07, "loss": 0.68597913, "num_input_tokens_seen": 314272755, "step": 14576, "time_per_iteration": 2.7134413719177246 }, { "auxiliary_loss_clip": 0.01078146, "auxiliary_loss_mlp": 0.01034504, "balance_loss_clip": 1.03182101, "balance_loss_mlp": 1.02249372, "epoch": 0.876416654140989, "flos": 15888102307200.0, "grad_norm": 2.865595040791599, "language_loss": 0.668244, "learning_rate": 1.5801190387084806e-07, "loss": 0.68937051, "num_input_tokens_seen": 314291365, "step": 14577, "time_per_iteration": 5.730209589004517 }, { "auxiliary_loss_clip": 0.01099421, "auxiliary_loss_mlp": 0.01031703, "balance_loss_clip": 1.03849435, "balance_loss_mlp": 1.01906085, "epoch": 0.876476777393657, "flos": 25885237547520.0, "grad_norm": 2.277451139554719, "language_loss": 0.71319246, "learning_rate": 1.5786021362021962e-07, "loss": 0.73450363, "num_input_tokens_seen": 314310075, "step": 14578, "time_per_iteration": 2.6785285472869873 }, { "auxiliary_loss_clip": 0.01110348, "auxiliary_loss_mlp": 0.01034332, "balance_loss_clip": 1.03671813, "balance_loss_mlp": 1.02167737, "epoch": 0.876536900646325, "flos": 13589675861760.0, "grad_norm": 2.477066541201799, "language_loss": 0.7168777, "learning_rate": 1.5770859322371676e-07, "loss": 0.73832452, "num_input_tokens_seen": 314325695, "step": 14579, "time_per_iteration": 4.083740472793579 }, { "auxiliary_loss_clip": 0.01075998, "auxiliary_loss_mlp": 0.01036896, "balance_loss_clip": 1.0316453, "balance_loss_mlp": 1.02358592, "epoch": 0.876597023898993, "flos": 12203344494720.0, "grad_norm": 1.7087182635635378, "language_loss": 0.70119214, "learning_rate": 1.5755704268708912e-07, "loss": 0.72232103, "num_input_tokens_seen": 314343605, "step": 14580, "time_per_iteration": 4.30855393409729 }, { "auxiliary_loss_clip": 0.01105953, "auxiliary_loss_mlp": 0.00769599, "balance_loss_clip": 1.03692436, "balance_loss_mlp": 1.00017405, "epoch": 0.8766571471516609, "flos": 25336773803520.0, "grad_norm": 1.6590992493321417, "language_loss": 0.65825737, "learning_rate": 1.5740556201608256e-07, "loss": 0.67701292, "num_input_tokens_seen": 314364275, "step": 14581, "time_per_iteration": 2.6293153762817383 }, { "auxiliary_loss_clip": 0.0108123, "auxiliary_loss_mlp": 0.01033696, "balance_loss_clip": 1.03592646, "balance_loss_mlp": 1.0222156, "epoch": 0.8767172704043289, "flos": 30113287545600.0, "grad_norm": 1.6719712227937835, "language_loss": 0.7391513, "learning_rate": 1.572541512164416e-07, "loss": 0.76030058, "num_input_tokens_seen": 314385140, "step": 14582, "time_per_iteration": 2.8180127143859863 }, { "auxiliary_loss_clip": 0.01106807, "auxiliary_loss_mlp": 0.00770216, "balance_loss_clip": 1.03510261, "balance_loss_mlp": 1.00013459, "epoch": 0.8767773936569968, "flos": 19281157770240.0, "grad_norm": 1.8898145602887721, "language_loss": 0.66737789, "learning_rate": 1.5710281029390826e-07, "loss": 0.68614811, "num_input_tokens_seen": 314403715, "step": 14583, "time_per_iteration": 2.68875789642334 }, { "auxiliary_loss_clip": 0.011013, "auxiliary_loss_mlp": 0.00770347, "balance_loss_clip": 1.03735173, "balance_loss_mlp": 1.00024498, "epoch": 0.8768375169096648, "flos": 21247230648960.0, "grad_norm": 1.7289325254896564, "language_loss": 0.7945081, "learning_rate": 1.5695153925422067e-07, "loss": 0.81322455, "num_input_tokens_seen": 314421880, "step": 14584, "time_per_iteration": 2.6574294567108154 }, { "auxiliary_loss_clip": 0.01078304, "auxiliary_loss_mlp": 0.010306, "balance_loss_clip": 1.03573895, "balance_loss_mlp": 1.01824355, "epoch": 0.8768976401623327, "flos": 23295539715840.0, "grad_norm": 2.4589506169652147, "language_loss": 0.72250307, "learning_rate": 1.5680033810311555e-07, "loss": 0.74359208, "num_input_tokens_seen": 314441585, "step": 14585, "time_per_iteration": 2.755363702774048 }, { "auxiliary_loss_clip": 0.01087385, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.03488159, "balance_loss_mlp": 1.01720476, "epoch": 0.8769577634150008, "flos": 21361247395200.0, "grad_norm": 1.8198293013572575, "language_loss": 0.74285269, "learning_rate": 1.5664920684632654e-07, "loss": 0.76402736, "num_input_tokens_seen": 314459020, "step": 14586, "time_per_iteration": 2.7154970169067383 }, { "auxiliary_loss_clip": 0.01107064, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.0354507, "balance_loss_mlp": 1.0183115, "epoch": 0.8770178866676687, "flos": 23514056104320.0, "grad_norm": 1.7048029370407318, "language_loss": 0.78917754, "learning_rate": 1.564981454895844e-07, "loss": 0.81055439, "num_input_tokens_seen": 314478935, "step": 14587, "time_per_iteration": 2.659623384475708 }, { "auxiliary_loss_clip": 0.010977, "auxiliary_loss_mlp": 0.01033091, "balance_loss_clip": 1.0367806, "balance_loss_mlp": 1.01905441, "epoch": 0.8770780099203367, "flos": 19719052473600.0, "grad_norm": 1.5723021986517474, "language_loss": 0.73480511, "learning_rate": 1.5634715403861697e-07, "loss": 0.75611293, "num_input_tokens_seen": 314497635, "step": 14588, "time_per_iteration": 2.6490304470062256 }, { "auxiliary_loss_clip": 0.01042159, "auxiliary_loss_mlp": 0.0077056, "balance_loss_clip": 1.03166127, "balance_loss_mlp": 1.00015152, "epoch": 0.8771381331730047, "flos": 21395901041280.0, "grad_norm": 1.8014093436247518, "language_loss": 0.66976607, "learning_rate": 1.5619623249915016e-07, "loss": 0.68789327, "num_input_tokens_seen": 314515445, "step": 14589, "time_per_iteration": 2.7724153995513916 }, { "auxiliary_loss_clip": 0.01098134, "auxiliary_loss_mlp": 0.0103223, "balance_loss_clip": 1.03780675, "balance_loss_mlp": 1.01989174, "epoch": 0.8771982564256726, "flos": 20261770041600.0, "grad_norm": 2.6258083956029776, "language_loss": 0.70362616, "learning_rate": 1.5604538087690732e-07, "loss": 0.72492981, "num_input_tokens_seen": 314533040, "step": 14590, "time_per_iteration": 2.6688060760498047 }, { "auxiliary_loss_clip": 0.01085853, "auxiliary_loss_mlp": 0.01041073, "balance_loss_clip": 1.03592718, "balance_loss_mlp": 1.0268271, "epoch": 0.8772583796783406, "flos": 12489372495360.0, "grad_norm": 2.102125056445036, "language_loss": 0.74291348, "learning_rate": 1.558945991776086e-07, "loss": 0.76418269, "num_input_tokens_seen": 314548280, "step": 14591, "time_per_iteration": 2.644625425338745 }, { "auxiliary_loss_clip": 0.01104605, "auxiliary_loss_mlp": 0.01027114, "balance_loss_clip": 1.03682637, "balance_loss_mlp": 1.01522839, "epoch": 0.8773185029310085, "flos": 15921103927680.0, "grad_norm": 1.6170050772781672, "language_loss": 0.79845113, "learning_rate": 1.5574388740697096e-07, "loss": 0.81976831, "num_input_tokens_seen": 314565345, "step": 14592, "time_per_iteration": 2.604241132736206 }, { "auxiliary_loss_clip": 0.01106487, "auxiliary_loss_mlp": 0.0103183, "balance_loss_clip": 1.03708172, "balance_loss_mlp": 1.02006376, "epoch": 0.8773786261836766, "flos": 21504530747520.0, "grad_norm": 1.5930198030485112, "language_loss": 0.82747221, "learning_rate": 1.5559324557071052e-07, "loss": 0.84885532, "num_input_tokens_seen": 314584190, "step": 14593, "time_per_iteration": 2.5794694423675537 }, { "auxiliary_loss_clip": 0.0109194, "auxiliary_loss_mlp": 0.01028175, "balance_loss_clip": 1.03585052, "balance_loss_mlp": 1.0158962, "epoch": 0.8774387494363445, "flos": 26761493831040.0, "grad_norm": 1.3623421288990831, "language_loss": 0.76057625, "learning_rate": 1.5544267367453845e-07, "loss": 0.78177738, "num_input_tokens_seen": 314605625, "step": 14594, "time_per_iteration": 2.66890025138855 }, { "auxiliary_loss_clip": 0.01057614, "auxiliary_loss_mlp": 0.01040841, "balance_loss_clip": 1.03001809, "balance_loss_mlp": 1.02620232, "epoch": 0.8774988726890125, "flos": 18478841633280.0, "grad_norm": 2.047444074315711, "language_loss": 0.77807617, "learning_rate": 1.552921717241651e-07, "loss": 0.7990607, "num_input_tokens_seen": 314622630, "step": 14595, "time_per_iteration": 2.8318984508514404 }, { "auxiliary_loss_clip": 0.01075529, "auxiliary_loss_mlp": 0.01033164, "balance_loss_clip": 1.03548956, "balance_loss_mlp": 1.02087939, "epoch": 0.8775589959416804, "flos": 24426366664320.0, "grad_norm": 1.536994143649266, "language_loss": 0.70930111, "learning_rate": 1.5514173972529743e-07, "loss": 0.7303881, "num_input_tokens_seen": 314642460, "step": 14596, "time_per_iteration": 2.7869088649749756 }, { "auxiliary_loss_clip": 0.01074468, "auxiliary_loss_mlp": 0.01024808, "balance_loss_clip": 1.03594506, "balance_loss_mlp": 1.01340532, "epoch": 0.8776191191943484, "flos": 23440151871360.0, "grad_norm": 1.7123074266942537, "language_loss": 0.85920203, "learning_rate": 1.5499137768364067e-07, "loss": 0.88019478, "num_input_tokens_seen": 314659875, "step": 14597, "time_per_iteration": 2.741469383239746 }, { "auxiliary_loss_clip": 0.01095944, "auxiliary_loss_mlp": 0.01030022, "balance_loss_clip": 1.0365026, "balance_loss_mlp": 1.01824403, "epoch": 0.8776792424470163, "flos": 26830872950400.0, "grad_norm": 1.6418502548107807, "language_loss": 0.72893673, "learning_rate": 1.5484108560489494e-07, "loss": 0.7501964, "num_input_tokens_seen": 314680260, "step": 14598, "time_per_iteration": 2.679743766784668 }, { "auxiliary_loss_clip": 0.01093166, "auxiliary_loss_mlp": 0.00771018, "balance_loss_clip": 1.03571749, "balance_loss_mlp": 1.00025177, "epoch": 0.8777393656996844, "flos": 15626169354240.0, "grad_norm": 2.1344366739736915, "language_loss": 0.77418303, "learning_rate": 1.5469086349476036e-07, "loss": 0.79282486, "num_input_tokens_seen": 314696260, "step": 14599, "time_per_iteration": 2.645653486251831 }, { "auxiliary_loss_clip": 0.01077971, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.03576493, "balance_loss_mlp": 1.0187732, "epoch": 0.8777994889523523, "flos": 18879999701760.0, "grad_norm": 2.045708434317711, "language_loss": 0.67680991, "learning_rate": 1.545407113589332e-07, "loss": 0.69789433, "num_input_tokens_seen": 314714215, "step": 14600, "time_per_iteration": 2.67521333694458 }, { "auxiliary_loss_clip": 0.01098236, "auxiliary_loss_mlp": 0.01040238, "balance_loss_clip": 1.03573418, "balance_loss_mlp": 1.02782202, "epoch": 0.8778596122050203, "flos": 48826516400640.0, "grad_norm": 1.696137650912348, "language_loss": 0.69482052, "learning_rate": 1.543906292031072e-07, "loss": 0.71620524, "num_input_tokens_seen": 314735700, "step": 14601, "time_per_iteration": 2.852067708969116 }, { "auxiliary_loss_clip": 0.01102467, "auxiliary_loss_mlp": 0.01033846, "balance_loss_clip": 1.03806257, "balance_loss_mlp": 1.0211978, "epoch": 0.8779197354576883, "flos": 25660184883840.0, "grad_norm": 1.8150997518302137, "language_loss": 0.72907132, "learning_rate": 1.542406170329733e-07, "loss": 0.75043446, "num_input_tokens_seen": 314753335, "step": 14602, "time_per_iteration": 2.666530132293701 }, { "auxiliary_loss_clip": 0.01106598, "auxiliary_loss_mlp": 0.01033593, "balance_loss_clip": 1.03583145, "balance_loss_mlp": 1.02214909, "epoch": 0.8779798587103562, "flos": 18843227153280.0, "grad_norm": 2.0286896900141103, "language_loss": 0.70824677, "learning_rate": 1.5409067485422056e-07, "loss": 0.72964865, "num_input_tokens_seen": 314770800, "step": 14603, "time_per_iteration": 2.6004815101623535 }, { "auxiliary_loss_clip": 0.01011292, "auxiliary_loss_mlp": 0.01001925, "balance_loss_clip": 1.00817752, "balance_loss_mlp": 1.00094128, "epoch": 0.8780399819630242, "flos": 68613119377920.0, "grad_norm": 0.7394752492120941, "language_loss": 0.54153609, "learning_rate": 1.539408026725344e-07, "loss": 0.56166828, "num_input_tokens_seen": 314837275, "step": 14604, "time_per_iteration": 3.240145683288574 }, { "auxiliary_loss_clip": 0.01001285, "auxiliary_loss_mlp": 0.01016546, "balance_loss_clip": 1.00654078, "balance_loss_mlp": 1.01528251, "epoch": 0.8781001052156922, "flos": 65734807766400.0, "grad_norm": 0.7095982216693757, "language_loss": 0.59140944, "learning_rate": 1.537910004935976e-07, "loss": 0.61158776, "num_input_tokens_seen": 314902220, "step": 14605, "time_per_iteration": 3.193176507949829 }, { "auxiliary_loss_clip": 0.01068364, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.03649735, "balance_loss_mlp": 1.02195311, "epoch": 0.8781602284683602, "flos": 22049654526720.0, "grad_norm": 1.640207436482767, "language_loss": 0.85104489, "learning_rate": 1.536412683230912e-07, "loss": 0.87207323, "num_input_tokens_seen": 314921645, "step": 14606, "time_per_iteration": 2.7456490993499756 }, { "auxiliary_loss_clip": 0.01111634, "auxiliary_loss_mlp": 0.01031457, "balance_loss_clip": 1.03835487, "balance_loss_mlp": 1.01814675, "epoch": 0.8782203517210281, "flos": 17562939713280.0, "grad_norm": 2.106006869176157, "language_loss": 0.70568335, "learning_rate": 1.534916061666931e-07, "loss": 0.72711432, "num_input_tokens_seen": 314939390, "step": 14607, "time_per_iteration": 2.5896804332733154 }, { "auxiliary_loss_clip": 0.01086458, "auxiliary_loss_mlp": 0.01041468, "balance_loss_clip": 1.03582692, "balance_loss_mlp": 1.03008974, "epoch": 0.8782804749736961, "flos": 25520421064320.0, "grad_norm": 1.8237368142749963, "language_loss": 0.72306776, "learning_rate": 1.533420140300785e-07, "loss": 0.74434698, "num_input_tokens_seen": 314959205, "step": 14608, "time_per_iteration": 2.741672992706299 }, { "auxiliary_loss_clip": 0.01099239, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.03510618, "balance_loss_mlp": 1.0248003, "epoch": 0.878340598226364, "flos": 21798747048960.0, "grad_norm": 1.955140517106729, "language_loss": 0.87650675, "learning_rate": 1.5319249191891936e-07, "loss": 0.89787567, "num_input_tokens_seen": 314977485, "step": 14609, "time_per_iteration": 2.7044589519500732 }, { "auxiliary_loss_clip": 0.01064019, "auxiliary_loss_mlp": 0.01031489, "balance_loss_clip": 1.03733373, "balance_loss_mlp": 1.01938868, "epoch": 0.878400721479032, "flos": 21102403011840.0, "grad_norm": 1.6056637569887062, "language_loss": 0.70521188, "learning_rate": 1.5304303983888643e-07, "loss": 0.72616696, "num_input_tokens_seen": 314997830, "step": 14610, "time_per_iteration": 2.803408145904541 }, { "auxiliary_loss_clip": 0.01090443, "auxiliary_loss_mlp": 0.00770344, "balance_loss_clip": 1.03708553, "balance_loss_mlp": 1.0002346, "epoch": 0.8784608447316999, "flos": 20923532259840.0, "grad_norm": 5.657745869325684, "language_loss": 0.80772901, "learning_rate": 1.5289365779564612e-07, "loss": 0.82633686, "num_input_tokens_seen": 315016480, "step": 14611, "time_per_iteration": 2.660065174102783 }, { "auxiliary_loss_clip": 0.01108968, "auxiliary_loss_mlp": 0.01032438, "balance_loss_clip": 1.03722143, "balance_loss_mlp": 1.01999247, "epoch": 0.878520967984368, "flos": 23330660238720.0, "grad_norm": 1.533059433053689, "language_loss": 0.76187742, "learning_rate": 1.5274434579486338e-07, "loss": 0.78329152, "num_input_tokens_seen": 315036135, "step": 14612, "time_per_iteration": 2.6014697551727295 }, { "auxiliary_loss_clip": 0.01056207, "auxiliary_loss_mlp": 0.010328, "balance_loss_clip": 1.03447223, "balance_loss_mlp": 1.02104592, "epoch": 0.8785810912370359, "flos": 25518984520320.0, "grad_norm": 1.467098748610033, "language_loss": 0.72364843, "learning_rate": 1.525951038422002e-07, "loss": 0.74453855, "num_input_tokens_seen": 315057995, "step": 14613, "time_per_iteration": 2.865140676498413 }, { "auxiliary_loss_clip": 0.0100752, "auxiliary_loss_mlp": 0.01000964, "balance_loss_clip": 1.01305175, "balance_loss_mlp": 0.9998787, "epoch": 0.8786412144897039, "flos": 61841047691520.0, "grad_norm": 1.0274738596365884, "language_loss": 0.64512694, "learning_rate": 1.5244593194331667e-07, "loss": 0.6652118, "num_input_tokens_seen": 315104010, "step": 14614, "time_per_iteration": 3.0442123413085938 }, { "auxiliary_loss_clip": 0.01027601, "auxiliary_loss_mlp": 0.01004471, "balance_loss_clip": 1.00515628, "balance_loss_mlp": 1.00352311, "epoch": 0.8787013377423719, "flos": 70989364638720.0, "grad_norm": 0.6570239019291962, "language_loss": 0.58545709, "learning_rate": 1.5229683010386762e-07, "loss": 0.6057778, "num_input_tokens_seen": 315174550, "step": 14615, "time_per_iteration": 3.2077083587646484 }, { "auxiliary_loss_clip": 0.01059951, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.03297782, "balance_loss_mlp": 1.01675916, "epoch": 0.8787614609950398, "flos": 17347404153600.0, "grad_norm": 3.0128650645072503, "language_loss": 0.72307193, "learning_rate": 1.5214779832950807e-07, "loss": 0.74396092, "num_input_tokens_seen": 315191825, "step": 14616, "time_per_iteration": 4.491776704788208 }, { "auxiliary_loss_clip": 0.01028184, "auxiliary_loss_mlp": 0.01002892, "balance_loss_clip": 1.00566876, "balance_loss_mlp": 1.00189614, "epoch": 0.8788215842477078, "flos": 72511401588480.0, "grad_norm": 0.8039335760257915, "language_loss": 0.5797807, "learning_rate": 1.5199883662588953e-07, "loss": 0.60009146, "num_input_tokens_seen": 315255075, "step": 14617, "time_per_iteration": 3.238430976867676 }, { "auxiliary_loss_clip": 0.01081125, "auxiliary_loss_mlp": 0.01037319, "balance_loss_clip": 1.03397489, "balance_loss_mlp": 1.02404451, "epoch": 0.8788817075003758, "flos": 24827452905600.0, "grad_norm": 1.7430695626814152, "language_loss": 0.83371663, "learning_rate": 1.5184994499865987e-07, "loss": 0.85490113, "num_input_tokens_seen": 315273995, "step": 14618, "time_per_iteration": 2.6718552112579346 }, { "auxiliary_loss_clip": 0.01081904, "auxiliary_loss_mlp": 0.01028474, "balance_loss_clip": 1.03612018, "balance_loss_mlp": 1.0165534, "epoch": 0.8789418307530438, "flos": 22638769488000.0, "grad_norm": 1.5514378708700263, "language_loss": 0.69016528, "learning_rate": 1.5170112345346598e-07, "loss": 0.71126908, "num_input_tokens_seen": 315294485, "step": 14619, "time_per_iteration": 5.7080559730529785 }, { "auxiliary_loss_clip": 0.01067003, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.03445745, "balance_loss_mlp": 1.02513099, "epoch": 0.8790019540057117, "flos": 19785738072960.0, "grad_norm": 3.788287535500063, "language_loss": 0.77142107, "learning_rate": 1.5155237199595016e-07, "loss": 0.79246336, "num_input_tokens_seen": 315310420, "step": 14620, "time_per_iteration": 2.7002434730529785 }, { "auxiliary_loss_clip": 0.0108692, "auxiliary_loss_mlp": 0.01031499, "balance_loss_clip": 1.03867853, "balance_loss_mlp": 1.01823735, "epoch": 0.8790620772583797, "flos": 20229774001920.0, "grad_norm": 1.6265722719383797, "language_loss": 0.79121077, "learning_rate": 1.514036906317542e-07, "loss": 0.81239492, "num_input_tokens_seen": 315330110, "step": 14621, "time_per_iteration": 2.706190824508667 }, { "auxiliary_loss_clip": 0.01088315, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.03491962, "balance_loss_mlp": 1.02115011, "epoch": 0.8791222005110476, "flos": 24130785646080.0, "grad_norm": 1.9310015183922709, "language_loss": 0.66529369, "learning_rate": 1.5125507936651506e-07, "loss": 0.68651378, "num_input_tokens_seen": 315350080, "step": 14622, "time_per_iteration": 2.7165491580963135 }, { "auxiliary_loss_clip": 0.01082524, "auxiliary_loss_mlp": 0.01036818, "balance_loss_clip": 1.03749692, "balance_loss_mlp": 1.02424717, "epoch": 0.8791823237637156, "flos": 21614201948160.0, "grad_norm": 1.9313985868431403, "language_loss": 0.72802383, "learning_rate": 1.511065382058687e-07, "loss": 0.74921727, "num_input_tokens_seen": 315366360, "step": 14623, "time_per_iteration": 2.747246026992798 }, { "auxiliary_loss_clip": 0.01055452, "auxiliary_loss_mlp": 0.01032326, "balance_loss_clip": 1.03043795, "balance_loss_mlp": 1.02029753, "epoch": 0.8792424470163835, "flos": 24243401761920.0, "grad_norm": 1.9152762624748565, "language_loss": 0.78623891, "learning_rate": 1.5095806715544801e-07, "loss": 0.80711675, "num_input_tokens_seen": 315385890, "step": 14624, "time_per_iteration": 2.8343048095703125 }, { "auxiliary_loss_clip": 0.01097982, "auxiliary_loss_mlp": 0.01037468, "balance_loss_clip": 1.03468323, "balance_loss_mlp": 1.02431333, "epoch": 0.8793025702690516, "flos": 24893204751360.0, "grad_norm": 1.7619469810650616, "language_loss": 0.79745495, "learning_rate": 1.5080966622088265e-07, "loss": 0.81880945, "num_input_tokens_seen": 315403400, "step": 14625, "time_per_iteration": 2.660099983215332 }, { "auxiliary_loss_clip": 0.01083648, "auxiliary_loss_mlp": 0.01039128, "balance_loss_clip": 1.03540492, "balance_loss_mlp": 1.02714157, "epoch": 0.8793626935217195, "flos": 25373115388800.0, "grad_norm": 1.6785159518142898, "language_loss": 0.74372435, "learning_rate": 1.5066133540779967e-07, "loss": 0.76495212, "num_input_tokens_seen": 315423670, "step": 14626, "time_per_iteration": 2.676588535308838 }, { "auxiliary_loss_clip": 0.01098546, "auxiliary_loss_mlp": 0.01032578, "balance_loss_clip": 1.03614759, "balance_loss_mlp": 1.019876, "epoch": 0.8794228167743875, "flos": 34678000742400.0, "grad_norm": 3.563179851520993, "language_loss": 0.71319157, "learning_rate": 1.505130747218246e-07, "loss": 0.73450279, "num_input_tokens_seen": 315446265, "step": 14627, "time_per_iteration": 2.7192656993865967 }, { "auxiliary_loss_clip": 0.01081037, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.04066432, "balance_loss_mlp": 1.01920116, "epoch": 0.8794829400270555, "flos": 19464014931840.0, "grad_norm": 1.8203006608438008, "language_loss": 0.72041732, "learning_rate": 1.5036488416857873e-07, "loss": 0.74155003, "num_input_tokens_seen": 315464655, "step": 14628, "time_per_iteration": 2.6673803329467773 }, { "auxiliary_loss_clip": 0.01077339, "auxiliary_loss_mlp": 0.01034101, "balance_loss_clip": 1.03383875, "balance_loss_mlp": 1.02086902, "epoch": 0.8795430632797234, "flos": 15231403906560.0, "grad_norm": 2.5577360809446312, "language_loss": 0.69041932, "learning_rate": 1.5021676375368175e-07, "loss": 0.71153378, "num_input_tokens_seen": 315481090, "step": 14629, "time_per_iteration": 2.6587491035461426 }, { "auxiliary_loss_clip": 0.01082842, "auxiliary_loss_mlp": 0.0103309, "balance_loss_clip": 1.03334451, "balance_loss_mlp": 1.02162218, "epoch": 0.8796031865323914, "flos": 27744727795200.0, "grad_norm": 1.5244181147754692, "language_loss": 0.68586159, "learning_rate": 1.5006871348275053e-07, "loss": 0.70702088, "num_input_tokens_seen": 315502010, "step": 14630, "time_per_iteration": 2.6706295013427734 }, { "auxiliary_loss_clip": 0.01081928, "auxiliary_loss_mlp": 0.01033864, "balance_loss_clip": 1.03443193, "balance_loss_mlp": 1.02096558, "epoch": 0.8796633097850594, "flos": 31285412156160.0, "grad_norm": 1.7384017818460198, "language_loss": 0.74517637, "learning_rate": 1.499207333613999e-07, "loss": 0.7663343, "num_input_tokens_seen": 315523040, "step": 14631, "time_per_iteration": 2.7020559310913086 }, { "auxiliary_loss_clip": 0.01085004, "auxiliary_loss_mlp": 0.00769583, "balance_loss_clip": 1.03570437, "balance_loss_mlp": 1.00020719, "epoch": 0.8797234330377274, "flos": 24243150366720.0, "grad_norm": 2.2960657953969434, "language_loss": 0.69393373, "learning_rate": 1.4977282339523954e-07, "loss": 0.71247965, "num_input_tokens_seen": 315541865, "step": 14632, "time_per_iteration": 2.75093674659729 }, { "auxiliary_loss_clip": 0.01087331, "auxiliary_loss_mlp": 0.01027739, "balance_loss_clip": 1.03704596, "balance_loss_mlp": 1.01637244, "epoch": 0.8797835562903953, "flos": 24167414540160.0, "grad_norm": 1.8690741277115708, "language_loss": 0.65338004, "learning_rate": 1.4962498358987929e-07, "loss": 0.67453068, "num_input_tokens_seen": 315561470, "step": 14633, "time_per_iteration": 2.6868348121643066 }, { "auxiliary_loss_clip": 0.01075776, "auxiliary_loss_mlp": 0.01034988, "balance_loss_clip": 1.03406906, "balance_loss_mlp": 1.0226382, "epoch": 0.8798436795430633, "flos": 19284677303040.0, "grad_norm": 1.4189442310597726, "language_loss": 0.84372133, "learning_rate": 1.4947721395092528e-07, "loss": 0.864829, "num_input_tokens_seen": 315583140, "step": 14634, "time_per_iteration": 2.711578845977783 }, { "auxiliary_loss_clip": 0.01085532, "auxiliary_loss_mlp": 0.00770557, "balance_loss_clip": 1.03607786, "balance_loss_mlp": 1.00022292, "epoch": 0.8799038027957312, "flos": 28179390274560.0, "grad_norm": 1.6380725692975024, "language_loss": 0.79907227, "learning_rate": 1.4932951448398056e-07, "loss": 0.81763315, "num_input_tokens_seen": 315601935, "step": 14635, "time_per_iteration": 2.7726967334747314 }, { "auxiliary_loss_clip": 0.01081031, "auxiliary_loss_mlp": 0.01025598, "balance_loss_clip": 1.03709126, "balance_loss_mlp": 1.01331937, "epoch": 0.8799639260483992, "flos": 24644703484800.0, "grad_norm": 1.9658310023555117, "language_loss": 0.65064734, "learning_rate": 1.4918188519464648e-07, "loss": 0.67171359, "num_input_tokens_seen": 315619995, "step": 14636, "time_per_iteration": 2.686582565307617 }, { "auxiliary_loss_clip": 0.01082702, "auxiliary_loss_mlp": 0.01038782, "balance_loss_clip": 1.03411579, "balance_loss_mlp": 1.02539492, "epoch": 0.8800240493010671, "flos": 22200479735040.0, "grad_norm": 1.4477537955972881, "language_loss": 0.70313036, "learning_rate": 1.4903432608852074e-07, "loss": 0.72434527, "num_input_tokens_seen": 315637895, "step": 14637, "time_per_iteration": 2.6938488483428955 }, { "auxiliary_loss_clip": 0.01087054, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.03786731, "balance_loss_mlp": 1.01791048, "epoch": 0.8800841725537352, "flos": 14246086953600.0, "grad_norm": 2.359329981837555, "language_loss": 0.66048372, "learning_rate": 1.4888683717119843e-07, "loss": 0.6816507, "num_input_tokens_seen": 315655520, "step": 14638, "time_per_iteration": 2.633389472961426 }, { "auxiliary_loss_clip": 0.01097569, "auxiliary_loss_mlp": 0.01029703, "balance_loss_clip": 1.03652537, "balance_loss_mlp": 1.01738858, "epoch": 0.8801442958064031, "flos": 37415794348800.0, "grad_norm": 2.0860441545932247, "language_loss": 0.57805324, "learning_rate": 1.4873941844827286e-07, "loss": 0.59932595, "num_input_tokens_seen": 315678955, "step": 14639, "time_per_iteration": 2.762080669403076 }, { "auxiliary_loss_clip": 0.01081797, "auxiliary_loss_mlp": 0.0103561, "balance_loss_clip": 1.03559208, "balance_loss_mlp": 1.0227828, "epoch": 0.8802044190590711, "flos": 25047334010880.0, "grad_norm": 1.6274947606267138, "language_loss": 0.74253106, "learning_rate": 1.4859206992533402e-07, "loss": 0.76370513, "num_input_tokens_seen": 315700360, "step": 14640, "time_per_iteration": 2.6815481185913086 }, { "auxiliary_loss_clip": 0.010844, "auxiliary_loss_mlp": 0.01043439, "balance_loss_clip": 1.03346467, "balance_loss_mlp": 1.03030789, "epoch": 0.8802645423117391, "flos": 24133874215680.0, "grad_norm": 2.333395940952266, "language_loss": 0.69967985, "learning_rate": 1.4844479160796985e-07, "loss": 0.72095823, "num_input_tokens_seen": 315719270, "step": 14641, "time_per_iteration": 2.749075174331665 }, { "auxiliary_loss_clip": 0.01095024, "auxiliary_loss_mlp": 0.01030792, "balance_loss_clip": 1.03647685, "balance_loss_mlp": 1.01772094, "epoch": 0.880324665564407, "flos": 17931203902080.0, "grad_norm": 2.1703882572052837, "language_loss": 0.84749234, "learning_rate": 1.4829758350176457e-07, "loss": 0.86875057, "num_input_tokens_seen": 315737425, "step": 14642, "time_per_iteration": 2.5922858715057373 }, { "auxiliary_loss_clip": 0.0107106, "auxiliary_loss_mlp": 0.01034269, "balance_loss_clip": 1.04185271, "balance_loss_mlp": 1.02141285, "epoch": 0.880384788817075, "flos": 21287630471040.0, "grad_norm": 1.7146284056948287, "language_loss": 0.78968871, "learning_rate": 1.4815044561230038e-07, "loss": 0.81074202, "num_input_tokens_seen": 315755725, "step": 14643, "time_per_iteration": 2.7133426666259766 }, { "auxiliary_loss_clip": 0.01091961, "auxiliary_loss_mlp": 0.010299, "balance_loss_clip": 1.03380251, "balance_loss_mlp": 1.01829529, "epoch": 0.880444912069743, "flos": 12458489777280.0, "grad_norm": 1.637601444546806, "language_loss": 0.72898597, "learning_rate": 1.4800337794515705e-07, "loss": 0.75020456, "num_input_tokens_seen": 315773835, "step": 14644, "time_per_iteration": 2.644477367401123 }, { "auxiliary_loss_clip": 0.01111824, "auxiliary_loss_mlp": 0.00770434, "balance_loss_clip": 1.03767347, "balance_loss_mlp": 1.00029421, "epoch": 0.880505035322411, "flos": 13625945619840.0, "grad_norm": 1.899004626318215, "language_loss": 0.79560626, "learning_rate": 1.47856380505911e-07, "loss": 0.81442887, "num_input_tokens_seen": 315790615, "step": 14645, "time_per_iteration": 2.5354764461517334 }, { "auxiliary_loss_clip": 0.01092346, "auxiliary_loss_mlp": 0.01037836, "balance_loss_clip": 1.03347158, "balance_loss_mlp": 1.02530098, "epoch": 0.8805651585750789, "flos": 23183067254400.0, "grad_norm": 1.7052158673760782, "language_loss": 0.64392948, "learning_rate": 1.477094533001364e-07, "loss": 0.66523129, "num_input_tokens_seen": 315811010, "step": 14646, "time_per_iteration": 2.579423427581787 }, { "auxiliary_loss_clip": 0.01080209, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.03776836, "balance_loss_mlp": 1.02298915, "epoch": 0.8806252818277469, "flos": 14903000835840.0, "grad_norm": 2.7067451127953874, "language_loss": 0.77432781, "learning_rate": 1.475625963334055e-07, "loss": 0.79549778, "num_input_tokens_seen": 315828130, "step": 14647, "time_per_iteration": 2.6500446796417236 }, { "auxiliary_loss_clip": 0.01106216, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.03662324, "balance_loss_mlp": 1.01965976, "epoch": 0.8806854050804148, "flos": 17639178330240.0, "grad_norm": 2.4139145058404976, "language_loss": 0.75048065, "learning_rate": 1.4741580961128652e-07, "loss": 0.77185583, "num_input_tokens_seen": 315844900, "step": 14648, "time_per_iteration": 2.5998997688293457 }, { "auxiliary_loss_clip": 0.01087799, "auxiliary_loss_mlp": 0.01031948, "balance_loss_clip": 1.03425181, "balance_loss_mlp": 1.01994991, "epoch": 0.8807455283330828, "flos": 25332392344320.0, "grad_norm": 1.6348853786721524, "language_loss": 0.65398651, "learning_rate": 1.4726909313934522e-07, "loss": 0.67518401, "num_input_tokens_seen": 315863745, "step": 14649, "time_per_iteration": 2.7652242183685303 }, { "auxiliary_loss_clip": 0.010729, "auxiliary_loss_mlp": 0.01033475, "balance_loss_clip": 1.03727496, "balance_loss_mlp": 1.02036798, "epoch": 0.8808056515857507, "flos": 25265168040960.0, "grad_norm": 1.3476131678952612, "language_loss": 0.62504375, "learning_rate": 1.4712244692314578e-07, "loss": 0.64610744, "num_input_tokens_seen": 315885765, "step": 14650, "time_per_iteration": 2.77528715133667 }, { "auxiliary_loss_clip": 0.01081061, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.03366303, "balance_loss_mlp": 1.02105451, "epoch": 0.8808657748384188, "flos": 26578852151040.0, "grad_norm": 1.497639019636266, "language_loss": 0.72776234, "learning_rate": 1.4697587096824914e-07, "loss": 0.74890018, "num_input_tokens_seen": 315907340, "step": 14651, "time_per_iteration": 2.755974769592285 }, { "auxiliary_loss_clip": 0.01102624, "auxiliary_loss_mlp": 0.01034556, "balance_loss_clip": 1.03813457, "balance_loss_mlp": 1.0211333, "epoch": 0.8809258980910867, "flos": 18661231918080.0, "grad_norm": 1.7947734047574024, "language_loss": 0.71671438, "learning_rate": 1.4682936528021284e-07, "loss": 0.73808622, "num_input_tokens_seen": 315924935, "step": 14652, "time_per_iteration": 2.6478350162506104 }, { "auxiliary_loss_clip": 0.01088537, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.03456485, "balance_loss_mlp": 1.01757014, "epoch": 0.8809860213437547, "flos": 19792274348160.0, "grad_norm": 2.741119431069501, "language_loss": 0.74593818, "learning_rate": 1.4668292986459286e-07, "loss": 0.76711941, "num_input_tokens_seen": 315943165, "step": 14653, "time_per_iteration": 2.657860517501831 }, { "auxiliary_loss_clip": 0.01111355, "auxiliary_loss_mlp": 0.01031409, "balance_loss_clip": 1.03685915, "balance_loss_mlp": 1.01822495, "epoch": 0.8810461445964227, "flos": 17894467267200.0, "grad_norm": 1.7692800722324005, "language_loss": 0.71231246, "learning_rate": 1.465365647269421e-07, "loss": 0.73374015, "num_input_tokens_seen": 315961340, "step": 14654, "time_per_iteration": 2.6377742290496826 }, { "auxiliary_loss_clip": 0.01062842, "auxiliary_loss_mlp": 0.01038906, "balance_loss_clip": 1.03567505, "balance_loss_mlp": 1.02497637, "epoch": 0.8811062678490906, "flos": 29163917128320.0, "grad_norm": 1.6194615705289337, "language_loss": 0.71497536, "learning_rate": 1.4639026987281012e-07, "loss": 0.73599279, "num_input_tokens_seen": 315981335, "step": 14655, "time_per_iteration": 4.449506044387817 }, { "auxiliary_loss_clip": 0.01059688, "auxiliary_loss_mlp": 0.01035269, "balance_loss_clip": 1.03264832, "balance_loss_mlp": 1.02179229, "epoch": 0.8811663911017587, "flos": 20338834671360.0, "grad_norm": 2.1016384343696246, "language_loss": 0.81381142, "learning_rate": 1.462440453077449e-07, "loss": 0.83476096, "num_input_tokens_seen": 316001325, "step": 14656, "time_per_iteration": 4.342563629150391 }, { "auxiliary_loss_clip": 0.01084679, "auxiliary_loss_mlp": 0.01034904, "balance_loss_clip": 1.03799617, "balance_loss_mlp": 1.02292371, "epoch": 0.8812265143544266, "flos": 25885704424320.0, "grad_norm": 1.9594168695096041, "language_loss": 0.68740302, "learning_rate": 1.460978910372914e-07, "loss": 0.70859885, "num_input_tokens_seen": 316022540, "step": 14657, "time_per_iteration": 2.75775408744812 }, { "auxiliary_loss_clip": 0.01086792, "auxiliary_loss_mlp": 0.01036523, "balance_loss_clip": 1.03888392, "balance_loss_mlp": 1.02426791, "epoch": 0.8812866376070946, "flos": 27195509865600.0, "grad_norm": 2.309045431146604, "language_loss": 0.84054673, "learning_rate": 1.4595180706699207e-07, "loss": 0.86177993, "num_input_tokens_seen": 316037735, "step": 14658, "time_per_iteration": 4.1529762744903564 }, { "auxiliary_loss_clip": 0.01094436, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.03857708, "balance_loss_mlp": 1.02275574, "epoch": 0.8813467608597625, "flos": 23807194997760.0, "grad_norm": 1.9486638108186574, "language_loss": 0.77363259, "learning_rate": 1.4580579340238554e-07, "loss": 0.79493624, "num_input_tokens_seen": 316058105, "step": 14659, "time_per_iteration": 4.211735010147095 }, { "auxiliary_loss_clip": 0.01085864, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.03627634, "balance_loss_mlp": 1.0214169, "epoch": 0.8814068841124305, "flos": 21105455667840.0, "grad_norm": 2.1180822282078235, "language_loss": 0.60540521, "learning_rate": 1.4565985004900894e-07, "loss": 0.62660819, "num_input_tokens_seen": 316074415, "step": 14660, "time_per_iteration": 2.6319613456726074 }, { "auxiliary_loss_clip": 0.01094829, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.04060745, "balance_loss_mlp": 1.02493942, "epoch": 0.8814670073650984, "flos": 24716991605760.0, "grad_norm": 1.6496161205890179, "language_loss": 0.77789259, "learning_rate": 1.455139770123972e-07, "loss": 0.79922473, "num_input_tokens_seen": 316094405, "step": 14661, "time_per_iteration": 2.633333444595337 }, { "auxiliary_loss_clip": 0.01068997, "auxiliary_loss_mlp": 0.01045562, "balance_loss_clip": 1.03819084, "balance_loss_mlp": 1.03196073, "epoch": 0.8815271306177664, "flos": 22966274718720.0, "grad_norm": 2.4670359855209374, "language_loss": 0.76707077, "learning_rate": 1.45368174298081e-07, "loss": 0.78821635, "num_input_tokens_seen": 316113390, "step": 14662, "time_per_iteration": 2.645803451538086 }, { "auxiliary_loss_clip": 0.01059478, "auxiliary_loss_mlp": 0.01029767, "balance_loss_clip": 1.03322673, "balance_loss_mlp": 1.01856136, "epoch": 0.8815872538704344, "flos": 19460064435840.0, "grad_norm": 2.046618055728614, "language_loss": 0.73941565, "learning_rate": 1.4522244191158929e-07, "loss": 0.76030809, "num_input_tokens_seen": 316131085, "step": 14663, "time_per_iteration": 2.7289090156555176 }, { "auxiliary_loss_clip": 0.01099377, "auxiliary_loss_mlp": 0.00769769, "balance_loss_clip": 1.03778672, "balance_loss_mlp": 1.00022185, "epoch": 0.8816473771231024, "flos": 32156604622080.0, "grad_norm": 2.211377651108035, "language_loss": 0.69977838, "learning_rate": 1.450767798584489e-07, "loss": 0.71846986, "num_input_tokens_seen": 316151440, "step": 14664, "time_per_iteration": 2.679704427719116 }, { "auxiliary_loss_clip": 0.01028116, "auxiliary_loss_mlp": 0.01040404, "balance_loss_clip": 1.03083181, "balance_loss_mlp": 1.02833962, "epoch": 0.8817075003757703, "flos": 19682279925120.0, "grad_norm": 1.499474682944125, "language_loss": 0.80967414, "learning_rate": 1.449311881441828e-07, "loss": 0.83035928, "num_input_tokens_seen": 316170750, "step": 14665, "time_per_iteration": 2.818871021270752 }, { "auxiliary_loss_clip": 0.01085891, "auxiliary_loss_mlp": 0.01035568, "balance_loss_clip": 1.03590584, "balance_loss_mlp": 1.0237484, "epoch": 0.8817676236284383, "flos": 15668616251520.0, "grad_norm": 2.192576285565641, "language_loss": 0.5833683, "learning_rate": 1.447856667743117e-07, "loss": 0.60458285, "num_input_tokens_seen": 316187265, "step": 14666, "time_per_iteration": 2.6670124530792236 }, { "auxiliary_loss_clip": 0.01101515, "auxiliary_loss_mlp": 0.01031699, "balance_loss_clip": 1.03911185, "balance_loss_mlp": 1.01791823, "epoch": 0.8818277468811063, "flos": 17895185539200.0, "grad_norm": 2.486999206259205, "language_loss": 0.83586216, "learning_rate": 1.4464021575435403e-07, "loss": 0.8571943, "num_input_tokens_seen": 316206555, "step": 14667, "time_per_iteration": 2.6268537044525146 }, { "auxiliary_loss_clip": 0.01109075, "auxiliary_loss_mlp": 0.01032153, "balance_loss_clip": 1.03729033, "balance_loss_mlp": 1.01920688, "epoch": 0.8818878701337742, "flos": 18770508069120.0, "grad_norm": 1.817207647136482, "language_loss": 0.62429118, "learning_rate": 1.4449483508982563e-07, "loss": 0.64570343, "num_input_tokens_seen": 316225210, "step": 14668, "time_per_iteration": 2.552854061126709 }, { "auxiliary_loss_clip": 0.01095167, "auxiliary_loss_mlp": 0.01031398, "balance_loss_clip": 1.03637564, "balance_loss_mlp": 1.02023387, "epoch": 0.8819479933864423, "flos": 17712292464000.0, "grad_norm": 2.79196460175423, "language_loss": 0.57027191, "learning_rate": 1.4434952478623918e-07, "loss": 0.59153754, "num_input_tokens_seen": 316242685, "step": 14669, "time_per_iteration": 2.565288782119751 }, { "auxiliary_loss_clip": 0.0110705, "auxiliary_loss_mlp": 0.01032118, "balance_loss_clip": 1.03566611, "balance_loss_mlp": 1.01975608, "epoch": 0.8820081166391102, "flos": 11728749070080.0, "grad_norm": 1.8986730900413675, "language_loss": 0.71354139, "learning_rate": 1.442042848491043e-07, "loss": 0.73493308, "num_input_tokens_seen": 316260935, "step": 14670, "time_per_iteration": 2.563056707382202 }, { "auxiliary_loss_clip": 0.01090236, "auxiliary_loss_mlp": 0.01032669, "balance_loss_clip": 1.03279638, "balance_loss_mlp": 1.02009296, "epoch": 0.8820682398917782, "flos": 27490372611840.0, "grad_norm": 1.913343870820924, "language_loss": 0.73558605, "learning_rate": 1.44059115283929e-07, "loss": 0.75681508, "num_input_tokens_seen": 316281190, "step": 14671, "time_per_iteration": 2.648991346359253 }, { "auxiliary_loss_clip": 0.0108854, "auxiliary_loss_mlp": 0.01032215, "balance_loss_clip": 1.03446448, "balance_loss_mlp": 1.01891685, "epoch": 0.8821283631444461, "flos": 16873850223360.0, "grad_norm": 2.5015746427003878, "language_loss": 0.84854722, "learning_rate": 1.43914016096218e-07, "loss": 0.86975479, "num_input_tokens_seen": 316297115, "step": 14672, "time_per_iteration": 2.582524061203003 }, { "auxiliary_loss_clip": 0.01071178, "auxiliary_loss_mlp": 0.01030273, "balance_loss_clip": 1.03337216, "balance_loss_mlp": 1.01805353, "epoch": 0.8821884863971141, "flos": 24280964409600.0, "grad_norm": 1.630028849005291, "language_loss": 0.7247709, "learning_rate": 1.4376898729147336e-07, "loss": 0.74578547, "num_input_tokens_seen": 316318235, "step": 14673, "time_per_iteration": 2.7013115882873535 }, { "auxiliary_loss_clip": 0.01008529, "auxiliary_loss_mlp": 0.01000562, "balance_loss_clip": 1.00525308, "balance_loss_mlp": 0.99949533, "epoch": 0.882248609649782, "flos": 59432342492160.0, "grad_norm": 0.8079833493209833, "language_loss": 0.49358672, "learning_rate": 1.4362402887519487e-07, "loss": 0.5136776, "num_input_tokens_seen": 316384705, "step": 14674, "time_per_iteration": 3.268969774246216 }, { "auxiliary_loss_clip": 0.01083711, "auxiliary_loss_mlp": 0.00770966, "balance_loss_clip": 1.03282237, "balance_loss_mlp": 1.00024939, "epoch": 0.88230873290245, "flos": 19937784343680.0, "grad_norm": 2.0037273036642578, "language_loss": 0.76279628, "learning_rate": 1.4347914085287971e-07, "loss": 0.78134304, "num_input_tokens_seen": 316401165, "step": 14675, "time_per_iteration": 2.6139438152313232 }, { "auxiliary_loss_clip": 0.01083195, "auxiliary_loss_mlp": 0.01035716, "balance_loss_clip": 1.03536808, "balance_loss_mlp": 1.02411079, "epoch": 0.882368856155118, "flos": 16362769559040.0, "grad_norm": 1.8400865500932602, "language_loss": 0.79260898, "learning_rate": 1.4333432323002105e-07, "loss": 0.81379807, "num_input_tokens_seen": 316418780, "step": 14676, "time_per_iteration": 2.6346635818481445 }, { "auxiliary_loss_clip": 0.00997838, "auxiliary_loss_mlp": 0.01005545, "balance_loss_clip": 1.01021266, "balance_loss_mlp": 1.00431693, "epoch": 0.882428979407786, "flos": 70594563277440.0, "grad_norm": 0.7902692138186003, "language_loss": 0.54692107, "learning_rate": 1.431895760121109e-07, "loss": 0.56695491, "num_input_tokens_seen": 316482030, "step": 14677, "time_per_iteration": 3.293663501739502 }, { "auxiliary_loss_clip": 0.01105406, "auxiliary_loss_mlp": 0.0103004, "balance_loss_clip": 1.03478503, "balance_loss_mlp": 1.01775551, "epoch": 0.8824891026604539, "flos": 18150294908160.0, "grad_norm": 2.2487393421780673, "language_loss": 0.64326406, "learning_rate": 1.4304489920463847e-07, "loss": 0.66461849, "num_input_tokens_seen": 316499175, "step": 14678, "time_per_iteration": 2.5656399726867676 }, { "auxiliary_loss_clip": 0.01087368, "auxiliary_loss_mlp": 0.01031922, "balance_loss_clip": 1.03426218, "balance_loss_mlp": 1.01929188, "epoch": 0.8825492259131219, "flos": 27232713377280.0, "grad_norm": 1.973421047113739, "language_loss": 0.71194983, "learning_rate": 1.4290029281308936e-07, "loss": 0.73314273, "num_input_tokens_seen": 316519495, "step": 14679, "time_per_iteration": 2.717034339904785 }, { "auxiliary_loss_clip": 0.01084094, "auxiliary_loss_mlp": 0.01031717, "balance_loss_clip": 1.03604484, "balance_loss_mlp": 1.02098179, "epoch": 0.8826093491657898, "flos": 22274419881600.0, "grad_norm": 1.9561596241675088, "language_loss": 0.63978046, "learning_rate": 1.4275575684294694e-07, "loss": 0.66093856, "num_input_tokens_seen": 316538180, "step": 14680, "time_per_iteration": 2.6951301097869873 }, { "auxiliary_loss_clip": 0.01107228, "auxiliary_loss_mlp": 0.01033016, "balance_loss_clip": 1.03680277, "balance_loss_mlp": 1.0208087, "epoch": 0.8826694724184578, "flos": 14204753377920.0, "grad_norm": 2.3967020475044767, "language_loss": 0.77099824, "learning_rate": 1.4261129129969328e-07, "loss": 0.79240072, "num_input_tokens_seen": 316551750, "step": 14681, "time_per_iteration": 2.5262744426727295 }, { "auxiliary_loss_clip": 0.01087034, "auxiliary_loss_mlp": 0.01034558, "balance_loss_clip": 1.03454781, "balance_loss_mlp": 1.02127814, "epoch": 0.8827295956711259, "flos": 20631686256000.0, "grad_norm": 1.7532857738520948, "language_loss": 0.72604549, "learning_rate": 1.424668961888047e-07, "loss": 0.74726152, "num_input_tokens_seen": 316570680, "step": 14682, "time_per_iteration": 2.632432699203491 }, { "auxiliary_loss_clip": 0.01069185, "auxiliary_loss_mlp": 0.0103165, "balance_loss_clip": 1.03995907, "balance_loss_mlp": 1.01723146, "epoch": 0.8827897189237938, "flos": 18513064316160.0, "grad_norm": 1.9501054227353172, "language_loss": 0.74376327, "learning_rate": 1.4232257151575765e-07, "loss": 0.76477158, "num_input_tokens_seen": 316588635, "step": 14683, "time_per_iteration": 2.7173256874084473 }, { "auxiliary_loss_clip": 0.01074481, "auxiliary_loss_mlp": 0.01032656, "balance_loss_clip": 1.03458118, "balance_loss_mlp": 1.01993001, "epoch": 0.8828498421764618, "flos": 22747399194240.0, "grad_norm": 1.85393754134711, "language_loss": 0.65667385, "learning_rate": 1.4217831728602492e-07, "loss": 0.67774516, "num_input_tokens_seen": 316607550, "step": 14684, "time_per_iteration": 2.7330434322357178 }, { "auxiliary_loss_clip": 0.0109236, "auxiliary_loss_mlp": 0.01029425, "balance_loss_clip": 1.03487706, "balance_loss_mlp": 1.01779604, "epoch": 0.8829099654291297, "flos": 15012384727680.0, "grad_norm": 1.9479646224303804, "language_loss": 0.69623429, "learning_rate": 1.4203413350507677e-07, "loss": 0.71745217, "num_input_tokens_seen": 316624460, "step": 14685, "time_per_iteration": 2.5940215587615967 }, { "auxiliary_loss_clip": 0.01057757, "auxiliary_loss_mlp": 0.0103887, "balance_loss_clip": 1.03562188, "balance_loss_mlp": 1.02445173, "epoch": 0.8829700886817977, "flos": 16720546976640.0, "grad_norm": 1.9726343446445405, "language_loss": 0.74293447, "learning_rate": 1.418900201783806e-07, "loss": 0.76390076, "num_input_tokens_seen": 316640765, "step": 14686, "time_per_iteration": 2.724073886871338 }, { "auxiliary_loss_clip": 0.01055836, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 1.03210068, "balance_loss_mlp": 1.01602983, "epoch": 0.8830302119344656, "flos": 15263256291840.0, "grad_norm": 1.8765198803907357, "language_loss": 0.63015836, "learning_rate": 1.417459773114007e-07, "loss": 0.65099931, "num_input_tokens_seen": 316656120, "step": 14687, "time_per_iteration": 2.707498550415039 }, { "auxiliary_loss_clip": 0.01100271, "auxiliary_loss_mlp": 0.01038773, "balance_loss_clip": 1.03685296, "balance_loss_mlp": 1.02611268, "epoch": 0.8830903351871336, "flos": 28617751854720.0, "grad_norm": 1.78215533171273, "language_loss": 0.69295615, "learning_rate": 1.4160200490959984e-07, "loss": 0.71434665, "num_input_tokens_seen": 316676095, "step": 14688, "time_per_iteration": 2.6418840885162354 }, { "auxiliary_loss_clip": 0.0109326, "auxiliary_loss_mlp": 0.01027423, "balance_loss_clip": 1.03498924, "balance_loss_mlp": 1.01532912, "epoch": 0.8831504584398016, "flos": 28001632844160.0, "grad_norm": 1.920117351658533, "language_loss": 0.66948056, "learning_rate": 1.4145810297843697e-07, "loss": 0.69068736, "num_input_tokens_seen": 316696235, "step": 14689, "time_per_iteration": 2.572154998779297 }, { "auxiliary_loss_clip": 0.01082065, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.03897274, "balance_loss_mlp": 1.01591396, "epoch": 0.8832105816924696, "flos": 26579642250240.0, "grad_norm": 1.390214083666347, "language_loss": 0.74641317, "learning_rate": 1.4131427152336905e-07, "loss": 0.76751149, "num_input_tokens_seen": 316719680, "step": 14690, "time_per_iteration": 2.7160091400146484 }, { "auxiliary_loss_clip": 0.0108565, "auxiliary_loss_mlp": 0.01037391, "balance_loss_clip": 1.0344497, "balance_loss_mlp": 1.02380705, "epoch": 0.8832707049451375, "flos": 24898771359360.0, "grad_norm": 1.4286133557095182, "language_loss": 0.72746867, "learning_rate": 1.4117051054985018e-07, "loss": 0.74869907, "num_input_tokens_seen": 316739830, "step": 14691, "time_per_iteration": 2.650376558303833 }, { "auxiliary_loss_clip": 0.01076966, "auxiliary_loss_mlp": 0.0102843, "balance_loss_clip": 1.03778577, "balance_loss_mlp": 1.01508439, "epoch": 0.8833308281978055, "flos": 15451141357440.0, "grad_norm": 2.0965604291100277, "language_loss": 0.51753283, "learning_rate": 1.4102682006333243e-07, "loss": 0.53858674, "num_input_tokens_seen": 316758105, "step": 14692, "time_per_iteration": 2.656104564666748 }, { "auxiliary_loss_clip": 0.01072794, "auxiliary_loss_mlp": 0.01033738, "balance_loss_clip": 1.03685403, "balance_loss_mlp": 1.02114379, "epoch": 0.8833909514504734, "flos": 20301523418880.0, "grad_norm": 2.5366757871087264, "language_loss": 0.60396338, "learning_rate": 1.4088320006926346e-07, "loss": 0.62502873, "num_input_tokens_seen": 316777455, "step": 14693, "time_per_iteration": 2.6937055587768555 }, { "auxiliary_loss_clip": 0.01104793, "auxiliary_loss_mlp": 0.01027886, "balance_loss_clip": 1.03680062, "balance_loss_mlp": 1.01657307, "epoch": 0.8834510747031414, "flos": 20374027021440.0, "grad_norm": 1.6196331469074723, "language_loss": 0.75283146, "learning_rate": 1.407396505730898e-07, "loss": 0.77415824, "num_input_tokens_seen": 316796300, "step": 14694, "time_per_iteration": 2.577456474304199 }, { "auxiliary_loss_clip": 0.01092067, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.03433728, "balance_loss_mlp": 1.02011991, "epoch": 0.8835111979558095, "flos": 29752026508800.0, "grad_norm": 1.8847177158288673, "language_loss": 0.72582275, "learning_rate": 1.4059617158025527e-07, "loss": 0.74705863, "num_input_tokens_seen": 316819090, "step": 14695, "time_per_iteration": 5.806610822677612 }, { "auxiliary_loss_clip": 0.01092613, "auxiliary_loss_mlp": 0.01026382, "balance_loss_clip": 1.03546548, "balance_loss_mlp": 1.01503897, "epoch": 0.8835713212084774, "flos": 24134556574080.0, "grad_norm": 1.7805903977771496, "language_loss": 0.80249125, "learning_rate": 1.404527630961998e-07, "loss": 0.82368124, "num_input_tokens_seen": 316839250, "step": 14696, "time_per_iteration": 2.6262238025665283 }, { "auxiliary_loss_clip": 0.01070594, "auxiliary_loss_mlp": 0.01033129, "balance_loss_clip": 1.03721249, "balance_loss_mlp": 1.02114844, "epoch": 0.8836314444611454, "flos": 27672331933440.0, "grad_norm": 1.3961231216590477, "language_loss": 0.74813706, "learning_rate": 1.4030942512636236e-07, "loss": 0.76917428, "num_input_tokens_seen": 316861315, "step": 14697, "time_per_iteration": 4.375631809234619 }, { "auxiliary_loss_clip": 0.01087263, "auxiliary_loss_mlp": 0.01030892, "balance_loss_clip": 1.03708208, "balance_loss_mlp": 1.01885819, "epoch": 0.8836915677138133, "flos": 16836969934080.0, "grad_norm": 2.288430876034272, "language_loss": 0.72242546, "learning_rate": 1.401661576761779e-07, "loss": 0.74360704, "num_input_tokens_seen": 316879325, "step": 14698, "time_per_iteration": 4.223493576049805 }, { "auxiliary_loss_clip": 0.01018409, "auxiliary_loss_mlp": 0.00999712, "balance_loss_clip": 1.00626993, "balance_loss_mlp": 0.99860901, "epoch": 0.8837516909664813, "flos": 69310540823040.0, "grad_norm": 0.8057459170171036, "language_loss": 0.53683382, "learning_rate": 1.4002296075107856e-07, "loss": 0.55701506, "num_input_tokens_seen": 316936425, "step": 14699, "time_per_iteration": 3.2273147106170654 }, { "auxiliary_loss_clip": 0.01087948, "auxiliary_loss_mlp": 0.01031005, "balance_loss_clip": 1.03542256, "balance_loss_mlp": 1.01808274, "epoch": 0.8838118142191492, "flos": 21324726241920.0, "grad_norm": 1.773127577183959, "language_loss": 0.76996839, "learning_rate": 1.3987983435649508e-07, "loss": 0.79115796, "num_input_tokens_seen": 316956360, "step": 14700, "time_per_iteration": 2.7143490314483643 }, { "auxiliary_loss_clip": 0.01074827, "auxiliary_loss_mlp": 0.01031532, "balance_loss_clip": 1.03586185, "balance_loss_mlp": 1.01926565, "epoch": 0.8838719374718172, "flos": 21470559459840.0, "grad_norm": 1.7340083630316034, "language_loss": 0.72865736, "learning_rate": 1.3973677849785494e-07, "loss": 0.74972093, "num_input_tokens_seen": 316975295, "step": 14701, "time_per_iteration": 2.6882786750793457 }, { "auxiliary_loss_clip": 0.01086251, "auxiliary_loss_mlp": 0.01036661, "balance_loss_clip": 1.03455663, "balance_loss_mlp": 1.02270126, "epoch": 0.8839320607244852, "flos": 26468929555200.0, "grad_norm": 1.8625463465240368, "language_loss": 0.71503305, "learning_rate": 1.3959379318058262e-07, "loss": 0.73626214, "num_input_tokens_seen": 316994520, "step": 14702, "time_per_iteration": 2.72592830657959 }, { "auxiliary_loss_clip": 0.01071764, "auxiliary_loss_mlp": 0.01044197, "balance_loss_clip": 1.03413224, "balance_loss_mlp": 1.02983212, "epoch": 0.8839921839771532, "flos": 45222270923520.0, "grad_norm": 1.7559603641053307, "language_loss": 0.71454448, "learning_rate": 1.3945087841010006e-07, "loss": 0.73570406, "num_input_tokens_seen": 317018095, "step": 14703, "time_per_iteration": 2.9277431964874268 }, { "auxiliary_loss_clip": 0.01065783, "auxiliary_loss_mlp": 0.01031109, "balance_loss_clip": 1.03791165, "balance_loss_mlp": 1.01922941, "epoch": 0.8840523072298211, "flos": 20006876154240.0, "grad_norm": 2.1820279654831474, "language_loss": 0.6694417, "learning_rate": 1.3930803419182645e-07, "loss": 0.69041061, "num_input_tokens_seen": 317035755, "step": 14704, "time_per_iteration": 2.8294484615325928 }, { "auxiliary_loss_clip": 0.01087087, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.03454638, "balance_loss_mlp": 1.0141418, "epoch": 0.8841124304824891, "flos": 24426007528320.0, "grad_norm": 1.887740201159673, "language_loss": 0.70546407, "learning_rate": 1.3916526053117905e-07, "loss": 0.72659326, "num_input_tokens_seen": 317055765, "step": 14705, "time_per_iteration": 2.7231884002685547 }, { "auxiliary_loss_clip": 0.01086994, "auxiliary_loss_mlp": 0.01032189, "balance_loss_clip": 1.03693652, "balance_loss_mlp": 1.02126944, "epoch": 0.884172553735157, "flos": 31284622056960.0, "grad_norm": 1.4798383584085324, "language_loss": 0.70781028, "learning_rate": 1.3902255743357104e-07, "loss": 0.72900212, "num_input_tokens_seen": 317077955, "step": 14706, "time_per_iteration": 2.817166805267334 }, { "auxiliary_loss_clip": 0.0109745, "auxiliary_loss_mlp": 0.01031235, "balance_loss_clip": 1.03596139, "balance_loss_mlp": 1.0189985, "epoch": 0.884232676987825, "flos": 21391160446080.0, "grad_norm": 1.9611948964387604, "language_loss": 0.74633074, "learning_rate": 1.3887992490441413e-07, "loss": 0.76761764, "num_input_tokens_seen": 317095825, "step": 14707, "time_per_iteration": 2.692667007446289 }, { "auxiliary_loss_clip": 0.01001598, "auxiliary_loss_mlp": 0.01000676, "balance_loss_clip": 1.00856423, "balance_loss_mlp": 0.99968618, "epoch": 0.8842928002404931, "flos": 57911451799680.0, "grad_norm": 0.8872244282469403, "language_loss": 0.60417277, "learning_rate": 1.387373629491173e-07, "loss": 0.62419552, "num_input_tokens_seen": 317152875, "step": 14708, "time_per_iteration": 3.083991765975952 }, { "auxiliary_loss_clip": 0.01077587, "auxiliary_loss_mlp": 0.01032084, "balance_loss_clip": 1.03236675, "balance_loss_mlp": 1.02057397, "epoch": 0.884352923493161, "flos": 41463896186880.0, "grad_norm": 4.7609896272216305, "language_loss": 0.67469186, "learning_rate": 1.3859487157308625e-07, "loss": 0.6957885, "num_input_tokens_seen": 317176725, "step": 14709, "time_per_iteration": 2.8194525241851807 }, { "auxiliary_loss_clip": 0.01091628, "auxiliary_loss_mlp": 0.01036852, "balance_loss_clip": 1.03700888, "balance_loss_mlp": 1.02251136, "epoch": 0.884413046745829, "flos": 46541234332800.0, "grad_norm": 1.664665419544956, "language_loss": 0.62438279, "learning_rate": 1.3845245078172373e-07, "loss": 0.64566755, "num_input_tokens_seen": 317206880, "step": 14710, "time_per_iteration": 2.9080650806427 }, { "auxiliary_loss_clip": 0.01074046, "auxiliary_loss_mlp": 0.01025213, "balance_loss_clip": 1.03497434, "balance_loss_mlp": 1.01367342, "epoch": 0.8844731699984969, "flos": 19135324552320.0, "grad_norm": 5.507655560622358, "language_loss": 0.63936687, "learning_rate": 1.38310100580431e-07, "loss": 0.66035938, "num_input_tokens_seen": 317224135, "step": 14711, "time_per_iteration": 2.7565457820892334 }, { "auxiliary_loss_clip": 0.0107192, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.03220356, "balance_loss_mlp": 1.01972961, "epoch": 0.8845332932511649, "flos": 23260634674560.0, "grad_norm": 2.576105371894639, "language_loss": 0.76215911, "learning_rate": 1.38167820974606e-07, "loss": 0.78320837, "num_input_tokens_seen": 317244505, "step": 14712, "time_per_iteration": 2.7664034366607666 }, { "auxiliary_loss_clip": 0.01048291, "auxiliary_loss_mlp": 0.01029784, "balance_loss_clip": 1.02974892, "balance_loss_mlp": 1.01695108, "epoch": 0.8845934165038328, "flos": 17564591738880.0, "grad_norm": 2.26538239437818, "language_loss": 0.80963331, "learning_rate": 1.3802561196964368e-07, "loss": 0.83041406, "num_input_tokens_seen": 317257830, "step": 14713, "time_per_iteration": 2.7584569454193115 }, { "auxiliary_loss_clip": 0.01084824, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.03427589, "balance_loss_mlp": 1.01581335, "epoch": 0.8846535397565009, "flos": 27485739757440.0, "grad_norm": 1.3779261727690353, "language_loss": 0.55363518, "learning_rate": 1.3788347357093688e-07, "loss": 0.57476938, "num_input_tokens_seen": 317278430, "step": 14714, "time_per_iteration": 2.733762502670288 }, { "auxiliary_loss_clip": 0.01053317, "auxiliary_loss_mlp": 0.01038776, "balance_loss_clip": 1.03666592, "balance_loss_mlp": 1.02476311, "epoch": 0.8847136630091688, "flos": 28761430256640.0, "grad_norm": 1.7611265846696629, "language_loss": 0.74193525, "learning_rate": 1.377414057838755e-07, "loss": 0.76285625, "num_input_tokens_seen": 317295970, "step": 14715, "time_per_iteration": 2.841095447540283 }, { "auxiliary_loss_clip": 0.01098367, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.03592944, "balance_loss_mlp": 1.0190537, "epoch": 0.8847737862618368, "flos": 23476924419840.0, "grad_norm": 2.015334930490414, "language_loss": 0.75194365, "learning_rate": 1.375994086138461e-07, "loss": 0.77323675, "num_input_tokens_seen": 317316185, "step": 14716, "time_per_iteration": 2.661020517349243 }, { "auxiliary_loss_clip": 0.01075664, "auxiliary_loss_mlp": 0.0103605, "balance_loss_clip": 1.03667819, "balance_loss_mlp": 1.02395606, "epoch": 0.8848339095145047, "flos": 18660872782080.0, "grad_norm": 1.993706294910503, "language_loss": 0.71433997, "learning_rate": 1.3745748206623397e-07, "loss": 0.73545712, "num_input_tokens_seen": 317333275, "step": 14717, "time_per_iteration": 2.7001688480377197 }, { "auxiliary_loss_clip": 0.01093455, "auxiliary_loss_mlp": 0.01033322, "balance_loss_clip": 1.0350275, "balance_loss_mlp": 1.02174115, "epoch": 0.8848940327671727, "flos": 32270298145920.0, "grad_norm": 2.3327665948166643, "language_loss": 0.73770732, "learning_rate": 1.373156261464208e-07, "loss": 0.75897503, "num_input_tokens_seen": 317351245, "step": 14718, "time_per_iteration": 2.677098274230957 }, { "auxiliary_loss_clip": 0.01058475, "auxiliary_loss_mlp": 0.01029814, "balance_loss_clip": 1.03630209, "balance_loss_mlp": 1.01655793, "epoch": 0.8849541560198406, "flos": 24021832717440.0, "grad_norm": 2.0713842614778755, "language_loss": 0.78531897, "learning_rate": 1.3717384085978602e-07, "loss": 0.80620188, "num_input_tokens_seen": 317370740, "step": 14719, "time_per_iteration": 2.8046772480010986 }, { "auxiliary_loss_clip": 0.01108831, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.03627968, "balance_loss_mlp": 1.01376843, "epoch": 0.8850142792725086, "flos": 16873060124160.0, "grad_norm": 1.562689851566494, "language_loss": 0.71582258, "learning_rate": 1.3703212621170579e-07, "loss": 0.73717248, "num_input_tokens_seen": 317388370, "step": 14720, "time_per_iteration": 2.6795947551727295 }, { "auxiliary_loss_clip": 0.01087567, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.03469348, "balance_loss_mlp": 1.02011943, "epoch": 0.8850744025251767, "flos": 24024059360640.0, "grad_norm": 1.9018606695741462, "language_loss": 0.82328093, "learning_rate": 1.3689048220755383e-07, "loss": 0.84448266, "num_input_tokens_seen": 317407390, "step": 14721, "time_per_iteration": 2.7234106063842773 }, { "auxiliary_loss_clip": 0.01087774, "auxiliary_loss_mlp": 0.01030218, "balance_loss_clip": 1.03554928, "balance_loss_mlp": 1.01725388, "epoch": 0.8851345257778446, "flos": 47955575329920.0, "grad_norm": 2.0019899609402994, "language_loss": 0.6242708, "learning_rate": 1.3674890885270186e-07, "loss": 0.64545077, "num_input_tokens_seen": 317430825, "step": 14722, "time_per_iteration": 2.94286847114563 }, { "auxiliary_loss_clip": 0.01098996, "auxiliary_loss_mlp": 0.01030621, "balance_loss_clip": 1.03618419, "balance_loss_mlp": 1.01827097, "epoch": 0.8851946490305126, "flos": 36611000173440.0, "grad_norm": 2.1418673941566815, "language_loss": 0.68605435, "learning_rate": 1.3660740615251754e-07, "loss": 0.70735055, "num_input_tokens_seen": 317451905, "step": 14723, "time_per_iteration": 2.733093023300171 }, { "auxiliary_loss_clip": 0.01073469, "auxiliary_loss_mlp": 0.01037418, "balance_loss_clip": 1.03269005, "balance_loss_mlp": 1.02493691, "epoch": 0.8852547722831805, "flos": 21544248211200.0, "grad_norm": 1.6603204159034268, "language_loss": 0.77997786, "learning_rate": 1.3646597411236703e-07, "loss": 0.80108678, "num_input_tokens_seen": 317470030, "step": 14724, "time_per_iteration": 2.667952299118042 }, { "auxiliary_loss_clip": 0.01018949, "auxiliary_loss_mlp": 0.01000573, "balance_loss_clip": 1.00656819, "balance_loss_mlp": 0.9996435, "epoch": 0.8853148955358485, "flos": 63059246472960.0, "grad_norm": 0.79872504573919, "language_loss": 0.58856809, "learning_rate": 1.363246127376143e-07, "loss": 0.60876334, "num_input_tokens_seen": 317527460, "step": 14725, "time_per_iteration": 3.0969929695129395 }, { "auxiliary_loss_clip": 0.010877, "auxiliary_loss_mlp": 0.00772122, "balance_loss_clip": 1.0332129, "balance_loss_mlp": 1.00029242, "epoch": 0.8853750187885164, "flos": 18149828031360.0, "grad_norm": 1.9516180183005214, "language_loss": 0.69201702, "learning_rate": 1.3618332203361837e-07, "loss": 0.71061528, "num_input_tokens_seen": 317544070, "step": 14726, "time_per_iteration": 2.6915600299835205 }, { "auxiliary_loss_clip": 0.01095197, "auxiliary_loss_mlp": 0.00770245, "balance_loss_clip": 1.03544807, "balance_loss_mlp": 1.00021529, "epoch": 0.8854351420411845, "flos": 39570542392320.0, "grad_norm": 1.2107511197334673, "language_loss": 0.69623214, "learning_rate": 1.3604210200573785e-07, "loss": 0.71488655, "num_input_tokens_seen": 317570275, "step": 14727, "time_per_iteration": 2.7665956020355225 }, { "auxiliary_loss_clip": 0.01088033, "auxiliary_loss_mlp": 0.01032809, "balance_loss_clip": 1.03910947, "balance_loss_mlp": 1.02020836, "epoch": 0.8854952652938524, "flos": 23769309127680.0, "grad_norm": 1.5740836195645216, "language_loss": 0.69980741, "learning_rate": 1.3590095265932733e-07, "loss": 0.72101581, "num_input_tokens_seen": 317590160, "step": 14728, "time_per_iteration": 2.765291929244995 }, { "auxiliary_loss_clip": 0.0107448, "auxiliary_loss_mlp": 0.01027952, "balance_loss_clip": 1.03473592, "balance_loss_mlp": 1.01644814, "epoch": 0.8855553885465204, "flos": 18290310122880.0, "grad_norm": 2.332652743923133, "language_loss": 0.66558629, "learning_rate": 1.3575987399973987e-07, "loss": 0.68661064, "num_input_tokens_seen": 317608340, "step": 14729, "time_per_iteration": 2.7198948860168457 }, { "auxiliary_loss_clip": 0.01079258, "auxiliary_loss_mlp": 0.01037721, "balance_loss_clip": 1.03742743, "balance_loss_mlp": 1.02642, "epoch": 0.8856155117991883, "flos": 36867402432000.0, "grad_norm": 1.6722891950677918, "language_loss": 0.62810826, "learning_rate": 1.3561886603232453e-07, "loss": 0.64927804, "num_input_tokens_seen": 317629910, "step": 14730, "time_per_iteration": 2.8442556858062744 }, { "auxiliary_loss_clip": 0.01071976, "auxiliary_loss_mlp": 0.01031648, "balance_loss_clip": 1.03443062, "balance_loss_mlp": 1.01946437, "epoch": 0.8856756350518563, "flos": 22163886754560.0, "grad_norm": 1.401332014115865, "language_loss": 0.79437548, "learning_rate": 1.3547792876242904e-07, "loss": 0.81541169, "num_input_tokens_seen": 317650265, "step": 14731, "time_per_iteration": 2.762430429458618 }, { "auxiliary_loss_clip": 0.01072107, "auxiliary_loss_mlp": 0.01033343, "balance_loss_clip": 1.0311588, "balance_loss_mlp": 1.02106476, "epoch": 0.8857357583045242, "flos": 20740962407040.0, "grad_norm": 1.5976657601317488, "language_loss": 0.82999492, "learning_rate": 1.3533706219539708e-07, "loss": 0.85104942, "num_input_tokens_seen": 317669045, "step": 14732, "time_per_iteration": 2.7181379795074463 }, { "auxiliary_loss_clip": 0.01009214, "auxiliary_loss_mlp": 0.01003697, "balance_loss_clip": 1.00654268, "balance_loss_mlp": 1.00273728, "epoch": 0.8857958815571922, "flos": 69892329409920.0, "grad_norm": 0.9009578672979854, "language_loss": 0.5992915, "learning_rate": 1.3519626633657045e-07, "loss": 0.61942059, "num_input_tokens_seen": 317728065, "step": 14733, "time_per_iteration": 4.828664064407349 }, { "auxiliary_loss_clip": 0.01109749, "auxiliary_loss_mlp": 0.0077073, "balance_loss_clip": 1.03790414, "balance_loss_mlp": 1.00016737, "epoch": 0.8858560048098603, "flos": 15121948187520.0, "grad_norm": 4.085770877577171, "language_loss": 0.66732299, "learning_rate": 1.3505554119128838e-07, "loss": 0.68612778, "num_input_tokens_seen": 317746120, "step": 14734, "time_per_iteration": 4.0870819091796875 }, { "auxiliary_loss_clip": 0.01081595, "auxiliary_loss_mlp": 0.01037825, "balance_loss_clip": 1.03617239, "balance_loss_mlp": 1.02644062, "epoch": 0.8859161280625282, "flos": 16611019430400.0, "grad_norm": 1.9769334143757535, "language_loss": 0.75267172, "learning_rate": 1.3491488676488682e-07, "loss": 0.77386594, "num_input_tokens_seen": 317762280, "step": 14735, "time_per_iteration": 2.596672534942627 }, { "auxiliary_loss_clip": 0.01070336, "auxiliary_loss_mlp": 0.0103395, "balance_loss_clip": 1.03347635, "balance_loss_mlp": 1.02087295, "epoch": 0.8859762513151962, "flos": 18694484933760.0, "grad_norm": 1.9172644356964386, "language_loss": 0.70264298, "learning_rate": 1.3477430306270066e-07, "loss": 0.72368586, "num_input_tokens_seen": 317780615, "step": 14736, "time_per_iteration": 4.219033479690552 }, { "auxiliary_loss_clip": 0.01077332, "auxiliary_loss_mlp": 0.01031715, "balance_loss_clip": 1.03754532, "balance_loss_mlp": 1.01955533, "epoch": 0.8860363745678641, "flos": 19536877670400.0, "grad_norm": 5.918141742658791, "language_loss": 0.84637642, "learning_rate": 1.3463379009005892e-07, "loss": 0.86746687, "num_input_tokens_seen": 317798830, "step": 14737, "time_per_iteration": 4.119691848754883 }, { "auxiliary_loss_clip": 0.01084938, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.03715491, "balance_loss_mlp": 1.02060747, "epoch": 0.8860964978205321, "flos": 35954912304000.0, "grad_norm": 2.9176785944040087, "language_loss": 0.67942357, "learning_rate": 1.3449334785229093e-07, "loss": 0.70061862, "num_input_tokens_seen": 317819235, "step": 14738, "time_per_iteration": 2.865959882736206 }, { "auxiliary_loss_clip": 0.01101518, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.03650808, "balance_loss_mlp": 1.0173068, "epoch": 0.8861566210732, "flos": 21212577002880.0, "grad_norm": 1.8188122899172712, "language_loss": 0.75242293, "learning_rate": 1.343529763547222e-07, "loss": 0.77374303, "num_input_tokens_seen": 317836785, "step": 14739, "time_per_iteration": 2.6084749698638916 }, { "auxiliary_loss_clip": 0.01096641, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.03646207, "balance_loss_mlp": 1.02176273, "epoch": 0.886216744325868, "flos": 14609071843200.0, "grad_norm": 2.373250938513307, "language_loss": 0.87370729, "learning_rate": 1.3421267560267559e-07, "loss": 0.89500761, "num_input_tokens_seen": 317854225, "step": 14740, "time_per_iteration": 2.6357059478759766 }, { "auxiliary_loss_clip": 0.01058963, "auxiliary_loss_mlp": 0.01034429, "balance_loss_clip": 1.03261304, "balance_loss_mlp": 1.02202511, "epoch": 0.886276867578536, "flos": 26651643062400.0, "grad_norm": 1.7903686918676003, "language_loss": 0.63587701, "learning_rate": 1.34072445601471e-07, "loss": 0.656811, "num_input_tokens_seen": 317874865, "step": 14741, "time_per_iteration": 2.7529678344726562 }, { "auxiliary_loss_clip": 0.01108743, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.03720188, "balance_loss_mlp": 1.01766753, "epoch": 0.886336990831204, "flos": 16764071281920.0, "grad_norm": 1.7833239303064403, "language_loss": 0.72917497, "learning_rate": 1.3393228635642717e-07, "loss": 0.75055945, "num_input_tokens_seen": 317892830, "step": 14742, "time_per_iteration": 2.5617966651916504 }, { "auxiliary_loss_clip": 0.01097185, "auxiliary_loss_mlp": 0.00770206, "balance_loss_clip": 1.0359509, "balance_loss_mlp": 1.00016761, "epoch": 0.8863971140838719, "flos": 25265275781760.0, "grad_norm": 1.894504945703206, "language_loss": 0.59785163, "learning_rate": 1.3379219787285733e-07, "loss": 0.61652559, "num_input_tokens_seen": 317911780, "step": 14743, "time_per_iteration": 2.7500805854797363 }, { "auxiliary_loss_clip": 0.01079179, "auxiliary_loss_mlp": 0.01033862, "balance_loss_clip": 1.03562689, "balance_loss_mlp": 1.02005744, "epoch": 0.8864572373365399, "flos": 23404313076480.0, "grad_norm": 1.5564571259362694, "language_loss": 0.60083222, "learning_rate": 1.3365218015607437e-07, "loss": 0.62196267, "num_input_tokens_seen": 317932855, "step": 14744, "time_per_iteration": 2.770298957824707 }, { "auxiliary_loss_clip": 0.01092438, "auxiliary_loss_mlp": 0.0077049, "balance_loss_clip": 1.03708875, "balance_loss_mlp": 1.00017428, "epoch": 0.8865173605892078, "flos": 18548759456640.0, "grad_norm": 1.674319681826978, "language_loss": 0.76905382, "learning_rate": 1.3351223321138762e-07, "loss": 0.78768307, "num_input_tokens_seen": 317952090, "step": 14745, "time_per_iteration": 2.5852930545806885 }, { "auxiliary_loss_clip": 0.01107283, "auxiliary_loss_mlp": 0.00770198, "balance_loss_clip": 1.03665972, "balance_loss_mlp": 1.00020969, "epoch": 0.8865774838418758, "flos": 19025868833280.0, "grad_norm": 2.096197494867565, "language_loss": 0.77457786, "learning_rate": 1.3337235704410454e-07, "loss": 0.79335266, "num_input_tokens_seen": 317970370, "step": 14746, "time_per_iteration": 2.573580026626587 }, { "auxiliary_loss_clip": 0.01086009, "auxiliary_loss_mlp": 0.01035282, "balance_loss_clip": 1.0389545, "balance_loss_mlp": 1.02199042, "epoch": 0.8866376070945439, "flos": 22163168482560.0, "grad_norm": 2.0671047150936674, "language_loss": 0.76368475, "learning_rate": 1.3323255165952873e-07, "loss": 0.78489769, "num_input_tokens_seen": 317989125, "step": 14747, "time_per_iteration": 2.624581813812256 }, { "auxiliary_loss_clip": 0.01082631, "auxiliary_loss_mlp": 0.007697, "balance_loss_clip": 1.03356695, "balance_loss_mlp": 1.00016332, "epoch": 0.8866977303472118, "flos": 20704261685760.0, "grad_norm": 1.7191098225964694, "language_loss": 0.82627869, "learning_rate": 1.3309281706296127e-07, "loss": 0.84480202, "num_input_tokens_seen": 318007820, "step": 14748, "time_per_iteration": 2.67641282081604 }, { "auxiliary_loss_clip": 0.01099108, "auxiliary_loss_mlp": 0.01035329, "balance_loss_clip": 1.03823555, "balance_loss_mlp": 1.02254343, "epoch": 0.8867578535998798, "flos": 48794448533760.0, "grad_norm": 1.734559291294961, "language_loss": 0.77452302, "learning_rate": 1.3295315325970148e-07, "loss": 0.79586738, "num_input_tokens_seen": 318030435, "step": 14749, "time_per_iteration": 2.84780216217041 }, { "auxiliary_loss_clip": 0.01044507, "auxiliary_loss_mlp": 0.00770609, "balance_loss_clip": 1.0361824, "balance_loss_mlp": 1.0002166, "epoch": 0.8868179768525477, "flos": 21105312013440.0, "grad_norm": 1.9998873550656093, "language_loss": 0.69549012, "learning_rate": 1.328135602550451e-07, "loss": 0.71364129, "num_input_tokens_seen": 318049465, "step": 14750, "time_per_iteration": 2.714163064956665 }, { "auxiliary_loss_clip": 0.01097015, "auxiliary_loss_mlp": 0.01037086, "balance_loss_clip": 1.03601015, "balance_loss_mlp": 1.02531457, "epoch": 0.8868781001052157, "flos": 21830922656640.0, "grad_norm": 1.7739396110359793, "language_loss": 0.59205437, "learning_rate": 1.3267403805428546e-07, "loss": 0.61339533, "num_input_tokens_seen": 318067760, "step": 14751, "time_per_iteration": 2.627380609512329 }, { "auxiliary_loss_clip": 0.01109091, "auxiliary_loss_mlp": 0.01032178, "balance_loss_clip": 1.03742659, "balance_loss_mlp": 1.01954198, "epoch": 0.8869382233578836, "flos": 13516418073600.0, "grad_norm": 2.24908964745291, "language_loss": 0.81063259, "learning_rate": 1.3253458666271344e-07, "loss": 0.83204532, "num_input_tokens_seen": 318082785, "step": 14752, "time_per_iteration": 2.548123836517334 }, { "auxiliary_loss_clip": 0.01090623, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.0372467, "balance_loss_mlp": 1.01752663, "epoch": 0.8869983466105517, "flos": 22704988210560.0, "grad_norm": 2.2048651718571963, "language_loss": 0.80242121, "learning_rate": 1.3239520608561793e-07, "loss": 0.82363582, "num_input_tokens_seen": 318101925, "step": 14753, "time_per_iteration": 2.634328842163086 }, { "auxiliary_loss_clip": 0.01106619, "auxiliary_loss_mlp": 0.01033372, "balance_loss_clip": 1.03586936, "balance_loss_mlp": 1.02094483, "epoch": 0.8870584698632196, "flos": 15340751884800.0, "grad_norm": 1.7988782645876313, "language_loss": 0.65307128, "learning_rate": 1.3225589632828248e-07, "loss": 0.67447126, "num_input_tokens_seen": 318119945, "step": 14754, "time_per_iteration": 2.5431594848632812 }, { "auxiliary_loss_clip": 0.01110421, "auxiliary_loss_mlp": 0.01031473, "balance_loss_clip": 1.03804612, "balance_loss_mlp": 1.01891458, "epoch": 0.8871185931158876, "flos": 26615624699520.0, "grad_norm": 2.066769305763262, "language_loss": 0.7433095, "learning_rate": 1.3211665739599065e-07, "loss": 0.76472843, "num_input_tokens_seen": 318139685, "step": 14755, "time_per_iteration": 2.5941274166107178 }, { "auxiliary_loss_clip": 0.01084027, "auxiliary_loss_mlp": 0.01033231, "balance_loss_clip": 1.03191829, "balance_loss_mlp": 1.01927161, "epoch": 0.8871787163685555, "flos": 21799034357760.0, "grad_norm": 1.4611791846416269, "language_loss": 0.77831644, "learning_rate": 1.3197748929402262e-07, "loss": 0.79948902, "num_input_tokens_seen": 318160375, "step": 14756, "time_per_iteration": 2.7859320640563965 }, { "auxiliary_loss_clip": 0.01089134, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.0377419, "balance_loss_mlp": 1.02150822, "epoch": 0.8872388396212235, "flos": 14902964922240.0, "grad_norm": 2.1242136336639414, "language_loss": 0.76514637, "learning_rate": 1.3183839202765535e-07, "loss": 0.78638029, "num_input_tokens_seen": 318177995, "step": 14757, "time_per_iteration": 2.637052059173584 }, { "auxiliary_loss_clip": 0.01048992, "auxiliary_loss_mlp": 0.01036807, "balance_loss_clip": 1.03180897, "balance_loss_mlp": 1.02424812, "epoch": 0.8872989628738914, "flos": 26432157006720.0, "grad_norm": 1.8638847565120873, "language_loss": 0.68011022, "learning_rate": 1.316993656021632e-07, "loss": 0.70096827, "num_input_tokens_seen": 318197030, "step": 14758, "time_per_iteration": 2.852785348892212 }, { "auxiliary_loss_clip": 0.01108807, "auxiliary_loss_mlp": 0.01035484, "balance_loss_clip": 1.03694987, "balance_loss_mlp": 1.02170336, "epoch": 0.8873590861265594, "flos": 48142562555520.0, "grad_norm": 3.8430422864269356, "language_loss": 0.69252694, "learning_rate": 1.3156041002281915e-07, "loss": 0.71396983, "num_input_tokens_seen": 318221780, "step": 14759, "time_per_iteration": 2.795743942260742 }, { "auxiliary_loss_clip": 0.01106874, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.03578842, "balance_loss_mlp": 1.0204078, "epoch": 0.8874192093792275, "flos": 18332972501760.0, "grad_norm": 1.7718328909299519, "language_loss": 0.74552894, "learning_rate": 1.3142152529489092e-07, "loss": 0.76692903, "num_input_tokens_seen": 318239710, "step": 14760, "time_per_iteration": 2.5467581748962402 }, { "auxiliary_loss_clip": 0.01090454, "auxiliary_loss_mlp": 0.01034099, "balance_loss_clip": 1.03724909, "balance_loss_mlp": 1.02152801, "epoch": 0.8874793326318954, "flos": 17894215872000.0, "grad_norm": 2.98069622772717, "language_loss": 0.76240933, "learning_rate": 1.3128271142364565e-07, "loss": 0.78365493, "num_input_tokens_seen": 318257425, "step": 14761, "time_per_iteration": 2.641578197479248 }, { "auxiliary_loss_clip": 0.01110247, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.03677571, "balance_loss_mlp": 1.02415276, "epoch": 0.8875394558845634, "flos": 31102231772160.0, "grad_norm": 1.7387210735055314, "language_loss": 0.61797994, "learning_rate": 1.3114396841434717e-07, "loss": 0.63944948, "num_input_tokens_seen": 318278485, "step": 14762, "time_per_iteration": 2.6031076908111572 }, { "auxiliary_loss_clip": 0.0109514, "auxiliary_loss_mlp": 0.01034017, "balance_loss_clip": 1.03448653, "balance_loss_mlp": 1.02041471, "epoch": 0.8875995791372313, "flos": 21142048648320.0, "grad_norm": 1.7775042808478863, "language_loss": 0.63881463, "learning_rate": 1.3100529627225697e-07, "loss": 0.66010618, "num_input_tokens_seen": 318297560, "step": 14763, "time_per_iteration": 2.5757639408111572 }, { "auxiliary_loss_clip": 0.01082921, "auxiliary_loss_mlp": 0.00770724, "balance_loss_clip": 1.03658664, "balance_loss_mlp": 1.00031114, "epoch": 0.8876597023898993, "flos": 17455136019840.0, "grad_norm": 2.009886034280031, "language_loss": 0.71068102, "learning_rate": 1.3086669500263335e-07, "loss": 0.72921747, "num_input_tokens_seen": 318313060, "step": 14764, "time_per_iteration": 2.6084272861480713 }, { "auxiliary_loss_clip": 0.01113096, "auxiliary_loss_mlp": 0.01036581, "balance_loss_clip": 1.03770447, "balance_loss_mlp": 1.02350986, "epoch": 0.8877198256425672, "flos": 22707933125760.0, "grad_norm": 2.026362447668411, "language_loss": 0.66558039, "learning_rate": 1.3072816461073166e-07, "loss": 0.68707716, "num_input_tokens_seen": 318332030, "step": 14765, "time_per_iteration": 2.547609806060791 }, { "auxiliary_loss_clip": 0.01068364, "auxiliary_loss_mlp": 0.01027425, "balance_loss_clip": 1.0361414, "balance_loss_mlp": 1.01615393, "epoch": 0.8877799488952353, "flos": 24535104111360.0, "grad_norm": 1.776562659783939, "language_loss": 0.7677201, "learning_rate": 1.3058970510180568e-07, "loss": 0.78867799, "num_input_tokens_seen": 318351090, "step": 14766, "time_per_iteration": 2.6800858974456787 }, { "auxiliary_loss_clip": 0.01076267, "auxiliary_loss_mlp": 0.01031608, "balance_loss_clip": 1.03301024, "balance_loss_mlp": 1.01951444, "epoch": 0.8878400721479032, "flos": 20959191486720.0, "grad_norm": 1.9295075111293745, "language_loss": 0.73348194, "learning_rate": 1.3045131648110496e-07, "loss": 0.75456071, "num_input_tokens_seen": 318372000, "step": 14767, "time_per_iteration": 2.605175256729126 }, { "auxiliary_loss_clip": 0.0110506, "auxiliary_loss_mlp": 0.0103329, "balance_loss_clip": 1.03636575, "balance_loss_mlp": 1.02166677, "epoch": 0.8879001954005712, "flos": 25295260659840.0, "grad_norm": 1.7081054740283463, "language_loss": 0.70993221, "learning_rate": 1.303129987538778e-07, "loss": 0.73131573, "num_input_tokens_seen": 318391530, "step": 14768, "time_per_iteration": 2.5900521278381348 }, { "auxiliary_loss_clip": 0.01093069, "auxiliary_loss_mlp": 0.01031049, "balance_loss_clip": 1.03431153, "balance_loss_mlp": 1.01872909, "epoch": 0.8879603186532391, "flos": 23185329811200.0, "grad_norm": 1.9932230097119548, "language_loss": 0.70054102, "learning_rate": 1.3017475192536932e-07, "loss": 0.72178221, "num_input_tokens_seen": 318410690, "step": 14769, "time_per_iteration": 2.5676157474517822 }, { "auxiliary_loss_clip": 0.01080083, "auxiliary_loss_mlp": 0.01031718, "balance_loss_clip": 1.03361869, "balance_loss_mlp": 1.01996374, "epoch": 0.8880204419059071, "flos": 13655427707520.0, "grad_norm": 2.022851777751632, "language_loss": 0.67168438, "learning_rate": 1.3003657600082174e-07, "loss": 0.69280243, "num_input_tokens_seen": 318427380, "step": 14770, "time_per_iteration": 2.6081535816192627 }, { "auxiliary_loss_clip": 0.01094329, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.03698134, "balance_loss_mlp": 1.01974094, "epoch": 0.888080565158575, "flos": 20631865824000.0, "grad_norm": 1.758420734888644, "language_loss": 0.65032512, "learning_rate": 1.2989847098547424e-07, "loss": 0.67159081, "num_input_tokens_seen": 318448530, "step": 14771, "time_per_iteration": 2.6046431064605713 }, { "auxiliary_loss_clip": 0.01084735, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.03336012, "balance_loss_mlp": 1.01646304, "epoch": 0.888140688411243, "flos": 28620014411520.0, "grad_norm": 1.5574826798475248, "language_loss": 0.82247543, "learning_rate": 1.2976043688456396e-07, "loss": 0.84360957, "num_input_tokens_seen": 318468655, "step": 14772, "time_per_iteration": 2.7616019248962402 }, { "auxiliary_loss_clip": 0.01079313, "auxiliary_loss_mlp": 0.01024388, "balance_loss_clip": 1.03151107, "balance_loss_mlp": 1.01318812, "epoch": 0.8882008116639111, "flos": 25520241496320.0, "grad_norm": 1.4234953861106903, "language_loss": 0.76511365, "learning_rate": 1.296224737033258e-07, "loss": 0.78615069, "num_input_tokens_seen": 318488740, "step": 14773, "time_per_iteration": 6.201860427856445 }, { "auxiliary_loss_clip": 0.01083069, "auxiliary_loss_mlp": 0.01026892, "balance_loss_clip": 1.03498697, "balance_loss_mlp": 1.01539993, "epoch": 0.888260934916579, "flos": 27673696650240.0, "grad_norm": 1.9867965850384985, "language_loss": 0.75016356, "learning_rate": 1.294845814469907e-07, "loss": 0.77126318, "num_input_tokens_seen": 318508810, "step": 14774, "time_per_iteration": 2.675410270690918 }, { "auxiliary_loss_clip": 0.0106342, "auxiliary_loss_mlp": 0.00770109, "balance_loss_clip": 1.03600156, "balance_loss_mlp": 1.0002929, "epoch": 0.888321058169247, "flos": 21611077464960.0, "grad_norm": 2.763852995715363, "language_loss": 0.72647572, "learning_rate": 1.2934676012078783e-07, "loss": 0.74481106, "num_input_tokens_seen": 318526860, "step": 14775, "time_per_iteration": 2.768602132797241 }, { "auxiliary_loss_clip": 0.01106903, "auxiliary_loss_mlp": 0.01032645, "balance_loss_clip": 1.03619862, "balance_loss_mlp": 1.02074754, "epoch": 0.8883811814219149, "flos": 18149109759360.0, "grad_norm": 1.6801831073555262, "language_loss": 0.79828447, "learning_rate": 1.292090097299432e-07, "loss": 0.81967992, "num_input_tokens_seen": 318545180, "step": 14776, "time_per_iteration": 5.694887399673462 }, { "auxiliary_loss_clip": 0.01103137, "auxiliary_loss_mlp": 0.01037272, "balance_loss_clip": 1.03596711, "balance_loss_mlp": 1.02439141, "epoch": 0.8884413046745829, "flos": 28324648874880.0, "grad_norm": 2.2946403260680746, "language_loss": 0.69125223, "learning_rate": 1.290713302796802e-07, "loss": 0.71265632, "num_input_tokens_seen": 318564350, "step": 14777, "time_per_iteration": 2.6711583137512207 }, { "auxiliary_loss_clip": 0.01091804, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.0316937, "balance_loss_mlp": 1.0206517, "epoch": 0.8885014279272508, "flos": 15158756649600.0, "grad_norm": 1.744756226696034, "language_loss": 0.71044743, "learning_rate": 1.2893372177522e-07, "loss": 0.73169839, "num_input_tokens_seen": 318582275, "step": 14778, "time_per_iteration": 2.5861656665802 }, { "auxiliary_loss_clip": 0.01107976, "auxiliary_loss_mlp": 0.01029742, "balance_loss_clip": 1.035954, "balance_loss_mlp": 1.01773167, "epoch": 0.8885615511799189, "flos": 19099593498240.0, "grad_norm": 3.336773779105202, "language_loss": 0.77618229, "learning_rate": 1.287961842217804e-07, "loss": 0.79755944, "num_input_tokens_seen": 318601230, "step": 14779, "time_per_iteration": 2.5533976554870605 }, { "auxiliary_loss_clip": 0.01005115, "auxiliary_loss_mlp": 0.00999201, "balance_loss_clip": 1.00668931, "balance_loss_mlp": 0.99814647, "epoch": 0.8886216744325868, "flos": 51186567605760.0, "grad_norm": 0.8737021693090686, "language_loss": 0.56777793, "learning_rate": 1.2865871762457747e-07, "loss": 0.58782107, "num_input_tokens_seen": 318645595, "step": 14780, "time_per_iteration": 2.964052438735962 }, { "auxiliary_loss_clip": 0.01028008, "auxiliary_loss_mlp": 0.01000581, "balance_loss_clip": 1.00549233, "balance_loss_mlp": 0.99967527, "epoch": 0.8886817976852548, "flos": 61612981263360.0, "grad_norm": 0.7941416089367529, "language_loss": 0.62353128, "learning_rate": 1.2852132198882326e-07, "loss": 0.64381719, "num_input_tokens_seen": 318707850, "step": 14781, "time_per_iteration": 3.181043863296509 }, { "auxiliary_loss_clip": 0.00963643, "auxiliary_loss_mlp": 0.01006454, "balance_loss_clip": 1.01169443, "balance_loss_mlp": 1.00542259, "epoch": 0.8887419209379227, "flos": 60646946935680.0, "grad_norm": 0.7977280372936163, "language_loss": 0.58126575, "learning_rate": 1.2838399731972805e-07, "loss": 0.60096675, "num_input_tokens_seen": 318764915, "step": 14782, "time_per_iteration": 3.2847399711608887 }, { "auxiliary_loss_clip": 0.01106535, "auxiliary_loss_mlp": 0.01029737, "balance_loss_clip": 1.03703415, "balance_loss_mlp": 1.01808405, "epoch": 0.8888020441905907, "flos": 29205861235200.0, "grad_norm": 1.567088080659984, "language_loss": 0.65746784, "learning_rate": 1.2824674362249922e-07, "loss": 0.67883062, "num_input_tokens_seen": 318785660, "step": 14783, "time_per_iteration": 2.841909646987915 }, { "auxiliary_loss_clip": 0.0111198, "auxiliary_loss_mlp": 0.01035797, "balance_loss_clip": 1.03731346, "balance_loss_mlp": 1.02278554, "epoch": 0.8888621674432586, "flos": 22162701605760.0, "grad_norm": 1.5542908685815622, "language_loss": 0.77494425, "learning_rate": 1.281095609023415e-07, "loss": 0.796422, "num_input_tokens_seen": 318806080, "step": 14784, "time_per_iteration": 2.597027540206909 }, { "auxiliary_loss_clip": 0.01083474, "auxiliary_loss_mlp": 0.01034386, "balance_loss_clip": 1.03568983, "balance_loss_mlp": 1.02146983, "epoch": 0.8889222906959267, "flos": 27672834723840.0, "grad_norm": 15.751964718050344, "language_loss": 0.6070298, "learning_rate": 1.279724491644565e-07, "loss": 0.6282084, "num_input_tokens_seen": 318826445, "step": 14785, "time_per_iteration": 2.7380104064941406 }, { "auxiliary_loss_clip": 0.01073801, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.03463125, "balance_loss_mlp": 1.02198935, "epoch": 0.8889824139485947, "flos": 14168627274240.0, "grad_norm": 1.8296614273320466, "language_loss": 0.65093189, "learning_rate": 1.278354084140445e-07, "loss": 0.67202044, "num_input_tokens_seen": 318843915, "step": 14786, "time_per_iteration": 2.774667978286743 }, { "auxiliary_loss_clip": 0.01076771, "auxiliary_loss_mlp": 0.00771472, "balance_loss_clip": 1.03597903, "balance_loss_mlp": 1.00018907, "epoch": 0.8890425372012626, "flos": 12853003829760.0, "grad_norm": 2.7089037879672624, "language_loss": 0.85490113, "learning_rate": 1.276984386563009e-07, "loss": 0.87338352, "num_input_tokens_seen": 318859670, "step": 14787, "time_per_iteration": 2.6649672985076904 }, { "auxiliary_loss_clip": 0.01084573, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.03664386, "balance_loss_mlp": 1.01775646, "epoch": 0.8891026604539306, "flos": 21689291329920.0, "grad_norm": 2.1717922675442094, "language_loss": 0.70967633, "learning_rate": 1.2756153989642027e-07, "loss": 0.73082221, "num_input_tokens_seen": 318877855, "step": 14788, "time_per_iteration": 2.832113027572632 }, { "auxiliary_loss_clip": 0.01105551, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.03675187, "balance_loss_mlp": 1.01768684, "epoch": 0.8891627837065985, "flos": 21871430219520.0, "grad_norm": 1.719821366869133, "language_loss": 0.69946039, "learning_rate": 1.274247121395935e-07, "loss": 0.72081137, "num_input_tokens_seen": 318896045, "step": 14789, "time_per_iteration": 2.6089062690734863 }, { "auxiliary_loss_clip": 0.01100862, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.03966713, "balance_loss_mlp": 1.01853967, "epoch": 0.8892229069592665, "flos": 21580230660480.0, "grad_norm": 1.4843336736816757, "language_loss": 0.70594078, "learning_rate": 1.2728795539100956e-07, "loss": 0.72725689, "num_input_tokens_seen": 318915515, "step": 14790, "time_per_iteration": 2.6216959953308105 }, { "auxiliary_loss_clip": 0.01088486, "auxiliary_loss_mlp": 0.01027627, "balance_loss_clip": 1.03701544, "balance_loss_mlp": 1.01623666, "epoch": 0.8892830302119344, "flos": 23075981832960.0, "grad_norm": 1.8356781695474516, "language_loss": 0.72947121, "learning_rate": 1.2715126965585387e-07, "loss": 0.75063235, "num_input_tokens_seen": 318934305, "step": 14791, "time_per_iteration": 2.7145907878875732 }, { "auxiliary_loss_clip": 0.01078142, "auxiliary_loss_mlp": 0.01033106, "balance_loss_clip": 1.03699768, "balance_loss_mlp": 1.02080894, "epoch": 0.8893431534646025, "flos": 23072139077760.0, "grad_norm": 1.7972192952628998, "language_loss": 0.74159795, "learning_rate": 1.2701465493931008e-07, "loss": 0.76271045, "num_input_tokens_seen": 318953880, "step": 14792, "time_per_iteration": 2.689258575439453 }, { "auxiliary_loss_clip": 0.01041593, "auxiliary_loss_mlp": 0.01037472, "balance_loss_clip": 1.03244281, "balance_loss_mlp": 1.02338743, "epoch": 0.8894032767172704, "flos": 22454978572800.0, "grad_norm": 1.9651444821716726, "language_loss": 0.66043746, "learning_rate": 1.2687811124655801e-07, "loss": 0.68122816, "num_input_tokens_seen": 318971395, "step": 14793, "time_per_iteration": 2.73183012008667 }, { "auxiliary_loss_clip": 0.01079264, "auxiliary_loss_mlp": 0.01031003, "balance_loss_clip": 1.03588605, "balance_loss_mlp": 1.01774693, "epoch": 0.8894633999699384, "flos": 25338246261120.0, "grad_norm": 1.671450826366533, "language_loss": 0.71594059, "learning_rate": 1.2674163858277552e-07, "loss": 0.73704326, "num_input_tokens_seen": 318990580, "step": 14794, "time_per_iteration": 2.7042224407196045 }, { "auxiliary_loss_clip": 0.01099154, "auxiliary_loss_mlp": 0.01034078, "balance_loss_clip": 1.03871417, "balance_loss_mlp": 1.02107775, "epoch": 0.8895235232226063, "flos": 20994096528000.0, "grad_norm": 1.7160866842792333, "language_loss": 0.75350553, "learning_rate": 1.2660523695313785e-07, "loss": 0.77483785, "num_input_tokens_seen": 319010040, "step": 14795, "time_per_iteration": 2.5956714153289795 }, { "auxiliary_loss_clip": 0.01003077, "auxiliary_loss_mlp": 0.00999947, "balance_loss_clip": 1.00997567, "balance_loss_mlp": 0.99892819, "epoch": 0.8895836464752743, "flos": 69732956764800.0, "grad_norm": 0.7671992564200865, "language_loss": 0.56051087, "learning_rate": 1.2646890636281727e-07, "loss": 0.58054101, "num_input_tokens_seen": 319063860, "step": 14796, "time_per_iteration": 3.078346014022827 }, { "auxiliary_loss_clip": 0.01111208, "auxiliary_loss_mlp": 0.01030403, "balance_loss_clip": 1.03733194, "balance_loss_mlp": 1.01666403, "epoch": 0.8896437697279422, "flos": 23221815050880.0, "grad_norm": 1.7603233439489925, "language_loss": 0.70576537, "learning_rate": 1.263326468169843e-07, "loss": 0.72718143, "num_input_tokens_seen": 319082335, "step": 14797, "time_per_iteration": 2.576277017593384 }, { "auxiliary_loss_clip": 0.01017004, "auxiliary_loss_mlp": 0.01002028, "balance_loss_clip": 1.01229072, "balance_loss_mlp": 1.00102699, "epoch": 0.8897038929806103, "flos": 70752711882240.0, "grad_norm": 0.7794431422590221, "language_loss": 0.5794524, "learning_rate": 1.2619645832080417e-07, "loss": 0.59964275, "num_input_tokens_seen": 319147075, "step": 14798, "time_per_iteration": 3.218555212020874 }, { "auxiliary_loss_clip": 0.01097846, "auxiliary_loss_mlp": 0.01029628, "balance_loss_clip": 1.03578901, "balance_loss_mlp": 1.01621103, "epoch": 0.8897640162332782, "flos": 19245103493760.0, "grad_norm": 1.822201303291812, "language_loss": 0.7947073, "learning_rate": 1.2606034087944251e-07, "loss": 0.81598198, "num_input_tokens_seen": 319166630, "step": 14799, "time_per_iteration": 2.6169159412384033 }, { "auxiliary_loss_clip": 0.01018703, "auxiliary_loss_mlp": 0.01003426, "balance_loss_clip": 1.0060674, "balance_loss_mlp": 1.00247824, "epoch": 0.8898241394859462, "flos": 41356275039360.0, "grad_norm": 0.8879772067683966, "language_loss": 0.58123994, "learning_rate": 1.2592429449806053e-07, "loss": 0.60146117, "num_input_tokens_seen": 319221865, "step": 14800, "time_per_iteration": 3.090841054916382 }, { "auxiliary_loss_clip": 0.01099994, "auxiliary_loss_mlp": 0.01034134, "balance_loss_clip": 1.03885424, "balance_loss_mlp": 1.02245724, "epoch": 0.8898842627386142, "flos": 18986295024000.0, "grad_norm": 1.5949008184751121, "language_loss": 0.66234601, "learning_rate": 1.2578831918181698e-07, "loss": 0.68368721, "num_input_tokens_seen": 319240710, "step": 14801, "time_per_iteration": 2.5842556953430176 }, { "auxiliary_loss_clip": 0.01073781, "auxiliary_loss_mlp": 0.01036561, "balance_loss_clip": 1.03613853, "balance_loss_mlp": 1.02248251, "epoch": 0.8899443859912821, "flos": 13217173868160.0, "grad_norm": 2.7903408199323496, "language_loss": 0.7563743, "learning_rate": 1.256524149358682e-07, "loss": 0.77747774, "num_input_tokens_seen": 319256495, "step": 14802, "time_per_iteration": 2.6613779067993164 }, { "auxiliary_loss_clip": 0.01091905, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.03725505, "balance_loss_mlp": 1.01900768, "epoch": 0.8900045092439501, "flos": 22674680110080.0, "grad_norm": 1.8379867635089826, "language_loss": 0.73482311, "learning_rate": 1.2551658176536805e-07, "loss": 0.75605106, "num_input_tokens_seen": 319273620, "step": 14803, "time_per_iteration": 2.675278425216675 }, { "auxiliary_loss_clip": 0.01081084, "auxiliary_loss_mlp": 0.01036017, "balance_loss_clip": 1.03560674, "balance_loss_mlp": 1.02347028, "epoch": 0.890064632496618, "flos": 21141617685120.0, "grad_norm": 1.8905881524985035, "language_loss": 0.71867836, "learning_rate": 1.2538081967546664e-07, "loss": 0.73984939, "num_input_tokens_seen": 319291720, "step": 14804, "time_per_iteration": 2.637640953063965 }, { "auxiliary_loss_clip": 0.01093595, "auxiliary_loss_mlp": 0.0103033, "balance_loss_clip": 1.03525758, "balance_loss_mlp": 1.01756275, "epoch": 0.8901247557492861, "flos": 23397058529280.0, "grad_norm": 1.8040298487747064, "language_loss": 0.81148362, "learning_rate": 1.252451286713123e-07, "loss": 0.8327229, "num_input_tokens_seen": 319310380, "step": 14805, "time_per_iteration": 2.6288270950317383 }, { "auxiliary_loss_clip": 0.01100196, "auxiliary_loss_mlp": 0.01029911, "balance_loss_clip": 1.03652012, "balance_loss_mlp": 1.01704848, "epoch": 0.890184879001954, "flos": 29169591477120.0, "grad_norm": 2.314720607634321, "language_loss": 0.67655379, "learning_rate": 1.251095087580505e-07, "loss": 0.69785488, "num_input_tokens_seen": 319331765, "step": 14806, "time_per_iteration": 2.701447010040283 }, { "auxiliary_loss_clip": 0.01082875, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.03327703, "balance_loss_mlp": 1.0191853, "epoch": 0.890245002254622, "flos": 14427830793600.0, "grad_norm": 1.860806449184193, "language_loss": 0.6715759, "learning_rate": 1.2497395994082438e-07, "loss": 0.6927194, "num_input_tokens_seen": 319349135, "step": 14807, "time_per_iteration": 2.6722195148468018 }, { "auxiliary_loss_clip": 0.01082528, "auxiliary_loss_mlp": 0.01030238, "balance_loss_clip": 1.03432024, "balance_loss_mlp": 1.01869881, "epoch": 0.8903051255072899, "flos": 22382187661440.0, "grad_norm": 1.7718809355965226, "language_loss": 0.75224829, "learning_rate": 1.248384822247732e-07, "loss": 0.77337593, "num_input_tokens_seen": 319368410, "step": 14808, "time_per_iteration": 2.640336036682129 }, { "auxiliary_loss_clip": 0.0107632, "auxiliary_loss_mlp": 0.01030363, "balance_loss_clip": 1.0358547, "balance_loss_mlp": 1.01811981, "epoch": 0.8903652487599579, "flos": 20777375819520.0, "grad_norm": 12.27562140526227, "language_loss": 0.81525707, "learning_rate": 1.2470307561503513e-07, "loss": 0.83632386, "num_input_tokens_seen": 319387535, "step": 14809, "time_per_iteration": 2.6590049266815186 }, { "auxiliary_loss_clip": 0.01099147, "auxiliary_loss_mlp": 0.0103237, "balance_loss_clip": 1.03634763, "balance_loss_mlp": 1.02048481, "epoch": 0.8904253720126258, "flos": 24424499157120.0, "grad_norm": 2.1293350080998747, "language_loss": 0.68579054, "learning_rate": 1.2456774011674442e-07, "loss": 0.70710576, "num_input_tokens_seen": 319407210, "step": 14810, "time_per_iteration": 2.601858139038086 }, { "auxiliary_loss_clip": 0.01074787, "auxiliary_loss_mlp": 0.01028878, "balance_loss_clip": 1.03349328, "balance_loss_mlp": 1.01603925, "epoch": 0.8904854952652939, "flos": 19463871277440.0, "grad_norm": 2.1159045694256124, "language_loss": 0.70389724, "learning_rate": 1.2443247573503257e-07, "loss": 0.72493392, "num_input_tokens_seen": 319425340, "step": 14811, "time_per_iteration": 2.652963876724243 }, { "auxiliary_loss_clip": 0.01077147, "auxiliary_loss_mlp": 0.00770711, "balance_loss_clip": 1.03590763, "balance_loss_mlp": 1.00018835, "epoch": 0.8905456185179618, "flos": 50800741666560.0, "grad_norm": 2.4983735528249182, "language_loss": 0.66081208, "learning_rate": 1.2429728247502924e-07, "loss": 0.67929065, "num_input_tokens_seen": 319448150, "step": 14812, "time_per_iteration": 4.636792182922363 }, { "auxiliary_loss_clip": 0.01060766, "auxiliary_loss_mlp": 0.01031191, "balance_loss_clip": 1.03516841, "balance_loss_mlp": 1.01957428, "epoch": 0.8906057417706298, "flos": 17784867893760.0, "grad_norm": 1.7995676770850613, "language_loss": 0.68747163, "learning_rate": 1.24162160341861e-07, "loss": 0.70839119, "num_input_tokens_seen": 319466115, "step": 14813, "time_per_iteration": 4.193687200546265 }, { "auxiliary_loss_clip": 0.01084515, "auxiliary_loss_mlp": 0.01040982, "balance_loss_clip": 1.03238082, "balance_loss_mlp": 1.02447116, "epoch": 0.8906658650232978, "flos": 21944867575680.0, "grad_norm": 3.876084846753058, "language_loss": 0.75659066, "learning_rate": 1.2402710934065198e-07, "loss": 0.77784562, "num_input_tokens_seen": 319485255, "step": 14814, "time_per_iteration": 2.6463520526885986 }, { "auxiliary_loss_clip": 0.01100125, "auxiliary_loss_mlp": 0.01030982, "balance_loss_clip": 1.03604758, "balance_loss_mlp": 1.01783299, "epoch": 0.8907259882759657, "flos": 21287810039040.0, "grad_norm": 2.0688857636131734, "language_loss": 0.74374747, "learning_rate": 1.2389212947652229e-07, "loss": 0.76505852, "num_input_tokens_seen": 319501800, "step": 14815, "time_per_iteration": 4.110440492630005 }, { "auxiliary_loss_clip": 0.01068212, "auxiliary_loss_mlp": 0.01031366, "balance_loss_clip": 1.03145206, "balance_loss_mlp": 1.01870036, "epoch": 0.8907861115286337, "flos": 20120426023680.0, "grad_norm": 2.123609537525354, "language_loss": 0.75087738, "learning_rate": 1.237572207545914e-07, "loss": 0.77187324, "num_input_tokens_seen": 319520415, "step": 14816, "time_per_iteration": 4.275893926620483 }, { "auxiliary_loss_clip": 0.01086936, "auxiliary_loss_mlp": 0.01031641, "balance_loss_clip": 1.03456652, "balance_loss_mlp": 1.01913631, "epoch": 0.8908462347813016, "flos": 20084156265600.0, "grad_norm": 1.7646380277651805, "language_loss": 0.77968502, "learning_rate": 1.2362238317997476e-07, "loss": 0.80087078, "num_input_tokens_seen": 319538410, "step": 14817, "time_per_iteration": 2.694972515106201 }, { "auxiliary_loss_clip": 0.01001525, "auxiliary_loss_mlp": 0.01001251, "balance_loss_clip": 1.00726986, "balance_loss_mlp": 1.00008297, "epoch": 0.8909063580339697, "flos": 65503649790720.0, "grad_norm": 0.7456782467309502, "language_loss": 0.56431699, "learning_rate": 1.2348761675778517e-07, "loss": 0.58434474, "num_input_tokens_seen": 319602565, "step": 14818, "time_per_iteration": 3.234703540802002 }, { "auxiliary_loss_clip": 0.01059509, "auxiliary_loss_mlp": 0.01034162, "balance_loss_clip": 1.0355022, "balance_loss_mlp": 1.02152014, "epoch": 0.8909664812866376, "flos": 29863062426240.0, "grad_norm": 1.7646144877343908, "language_loss": 0.64705229, "learning_rate": 1.2335292149313325e-07, "loss": 0.66798902, "num_input_tokens_seen": 319624645, "step": 14819, "time_per_iteration": 2.7950870990753174 }, { "auxiliary_loss_clip": 0.01097653, "auxiliary_loss_mlp": 0.01030768, "balance_loss_clip": 1.03588057, "balance_loss_mlp": 1.01794112, "epoch": 0.8910266045393056, "flos": 25447127362560.0, "grad_norm": 2.2154062344071312, "language_loss": 0.78340304, "learning_rate": 1.2321829739112731e-07, "loss": 0.80468726, "num_input_tokens_seen": 319644040, "step": 14820, "time_per_iteration": 2.6286323070526123 }, { "auxiliary_loss_clip": 0.01070015, "auxiliary_loss_mlp": 0.00769687, "balance_loss_clip": 1.03580856, "balance_loss_mlp": 1.00026464, "epoch": 0.8910867277919735, "flos": 24499121662080.0, "grad_norm": 1.856207333364825, "language_loss": 0.76575708, "learning_rate": 1.2308374445687087e-07, "loss": 0.78415406, "num_input_tokens_seen": 319663930, "step": 14821, "time_per_iteration": 2.710040330886841 }, { "auxiliary_loss_clip": 0.01014485, "auxiliary_loss_mlp": 0.00751361, "balance_loss_clip": 1.00564671, "balance_loss_mlp": 0.99960148, "epoch": 0.8911468510446415, "flos": 60688136856960.0, "grad_norm": 0.7925502121917717, "language_loss": 0.59283942, "learning_rate": 1.2294926269546712e-07, "loss": 0.61049783, "num_input_tokens_seen": 319721245, "step": 14822, "time_per_iteration": 3.042881727218628 }, { "auxiliary_loss_clip": 0.0109278, "auxiliary_loss_mlp": 0.0103595, "balance_loss_clip": 1.03620601, "balance_loss_mlp": 1.02346885, "epoch": 0.8912069742973094, "flos": 25337492075520.0, "grad_norm": 2.0091476458751845, "language_loss": 0.69135273, "learning_rate": 1.2281485211201515e-07, "loss": 0.71263999, "num_input_tokens_seen": 319741200, "step": 14823, "time_per_iteration": 2.6208603382110596 }, { "auxiliary_loss_clip": 0.01089302, "auxiliary_loss_mlp": 0.01035298, "balance_loss_clip": 1.03342748, "balance_loss_mlp": 1.02241755, "epoch": 0.8912670975499775, "flos": 18223516782720.0, "grad_norm": 1.5978850394355568, "language_loss": 0.69198072, "learning_rate": 1.2268051271161262e-07, "loss": 0.71322668, "num_input_tokens_seen": 319759265, "step": 14824, "time_per_iteration": 2.508863687515259 }, { "auxiliary_loss_clip": 0.01058099, "auxiliary_loss_mlp": 0.0103706, "balance_loss_clip": 1.03319716, "balance_loss_mlp": 1.02307105, "epoch": 0.8913272208026454, "flos": 26504481041280.0, "grad_norm": 1.9932021748393736, "language_loss": 0.70705098, "learning_rate": 1.2254624449935303e-07, "loss": 0.72800255, "num_input_tokens_seen": 319777560, "step": 14825, "time_per_iteration": 2.654224157333374 }, { "auxiliary_loss_clip": 0.01085791, "auxiliary_loss_mlp": 0.01032779, "balance_loss_clip": 1.03422439, "balance_loss_mlp": 1.01951671, "epoch": 0.8913873440553134, "flos": 18802324540800.0, "grad_norm": 1.827676363511503, "language_loss": 0.71464586, "learning_rate": 1.2241204748032786e-07, "loss": 0.7358315, "num_input_tokens_seen": 319794125, "step": 14826, "time_per_iteration": 2.5119738578796387 }, { "auxiliary_loss_clip": 0.0109572, "auxiliary_loss_mlp": 0.01028162, "balance_loss_clip": 1.03623497, "balance_loss_mlp": 1.01646793, "epoch": 0.8914474673079814, "flos": 20884892204160.0, "grad_norm": 2.0101591277509243, "language_loss": 0.75315851, "learning_rate": 1.2227792165962615e-07, "loss": 0.77439737, "num_input_tokens_seen": 319810310, "step": 14827, "time_per_iteration": 2.4767954349517822 }, { "auxiliary_loss_clip": 0.01100376, "auxiliary_loss_mlp": 0.0103277, "balance_loss_clip": 1.03736746, "balance_loss_mlp": 1.02037859, "epoch": 0.8915075905606493, "flos": 20952439729920.0, "grad_norm": 2.25546419546836, "language_loss": 0.78480828, "learning_rate": 1.221438670423336e-07, "loss": 0.80613977, "num_input_tokens_seen": 319828505, "step": 14828, "time_per_iteration": 2.4681639671325684 }, { "auxiliary_loss_clip": 0.01068483, "auxiliary_loss_mlp": 0.01032448, "balance_loss_clip": 1.0356158, "balance_loss_mlp": 1.01987755, "epoch": 0.8915677138133173, "flos": 23076305055360.0, "grad_norm": 1.7213243632049227, "language_loss": 0.75276792, "learning_rate": 1.2200988363353392e-07, "loss": 0.77377725, "num_input_tokens_seen": 319848680, "step": 14829, "time_per_iteration": 2.6100480556488037 }, { "auxiliary_loss_clip": 0.01108879, "auxiliary_loss_mlp": 0.01034466, "balance_loss_clip": 1.03637552, "balance_loss_mlp": 1.02299213, "epoch": 0.8916278370659853, "flos": 23440259612160.0, "grad_norm": 1.5513516933839315, "language_loss": 0.84576946, "learning_rate": 1.2187597143830773e-07, "loss": 0.867203, "num_input_tokens_seen": 319868835, "step": 14830, "time_per_iteration": 2.5831005573272705 }, { "auxiliary_loss_clip": 0.01093236, "auxiliary_loss_mlp": 0.01030146, "balance_loss_clip": 1.03435206, "balance_loss_mlp": 1.01864195, "epoch": 0.8916879603186533, "flos": 25160488830720.0, "grad_norm": 1.3477965843038384, "language_loss": 0.74875772, "learning_rate": 1.2174213046173299e-07, "loss": 0.76999158, "num_input_tokens_seen": 319891585, "step": 14831, "time_per_iteration": 2.7232887744903564 }, { "auxiliary_loss_clip": 0.01100471, "auxiliary_loss_mlp": 0.01029141, "balance_loss_clip": 1.03624547, "balance_loss_mlp": 1.01663041, "epoch": 0.8917480835713212, "flos": 20229845829120.0, "grad_norm": 1.8265908258617016, "language_loss": 0.72934276, "learning_rate": 1.216083607088847e-07, "loss": 0.75063884, "num_input_tokens_seen": 319910315, "step": 14832, "time_per_iteration": 2.616689443588257 }, { "auxiliary_loss_clip": 0.01045927, "auxiliary_loss_mlp": 0.00770458, "balance_loss_clip": 1.03222537, "balance_loss_mlp": 1.00019884, "epoch": 0.8918082068239892, "flos": 26101922342400.0, "grad_norm": 3.1162015685797972, "language_loss": 0.66912735, "learning_rate": 1.214746621848355e-07, "loss": 0.68729126, "num_input_tokens_seen": 319932275, "step": 14833, "time_per_iteration": 2.8316352367401123 }, { "auxiliary_loss_clip": 0.01106023, "auxiliary_loss_mlp": 0.01034649, "balance_loss_clip": 1.03974128, "balance_loss_mlp": 1.02139854, "epoch": 0.8918683300766571, "flos": 24831439315200.0, "grad_norm": 1.9997597617659202, "language_loss": 0.73976004, "learning_rate": 1.2134103489465575e-07, "loss": 0.76116675, "num_input_tokens_seen": 319955335, "step": 14834, "time_per_iteration": 2.7026243209838867 }, { "auxiliary_loss_clip": 0.01065475, "auxiliary_loss_mlp": 0.0103389, "balance_loss_clip": 1.03502977, "balance_loss_mlp": 1.02165282, "epoch": 0.8919284533293251, "flos": 22305158945280.0, "grad_norm": 1.9340437806838273, "language_loss": 0.78773081, "learning_rate": 1.2120747884341188e-07, "loss": 0.80872452, "num_input_tokens_seen": 319973990, "step": 14835, "time_per_iteration": 2.64371395111084 }, { "auxiliary_loss_clip": 0.01103945, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.03464007, "balance_loss_mlp": 1.01960659, "epoch": 0.891988576581993, "flos": 30373532559360.0, "grad_norm": 1.6176322749361627, "language_loss": 0.74194962, "learning_rate": 1.210739940361689e-07, "loss": 0.76329982, "num_input_tokens_seen": 319995555, "step": 14836, "time_per_iteration": 2.6271843910217285 }, { "auxiliary_loss_clip": 0.01087557, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 1.03471708, "balance_loss_mlp": 1.01970363, "epoch": 0.8920486998346611, "flos": 15552947479680.0, "grad_norm": 3.2025172292231625, "language_loss": 0.68644428, "learning_rate": 1.2094058047798838e-07, "loss": 0.7076422, "num_input_tokens_seen": 320012385, "step": 14837, "time_per_iteration": 2.612969160079956 }, { "auxiliary_loss_clip": 0.01050841, "auxiliary_loss_mlp": 0.0103232, "balance_loss_clip": 1.03323007, "balance_loss_mlp": 1.01922536, "epoch": 0.892108823087329, "flos": 21214983214080.0, "grad_norm": 1.653711357861068, "language_loss": 0.67707741, "learning_rate": 1.2080723817392913e-07, "loss": 0.697909, "num_input_tokens_seen": 320032390, "step": 14838, "time_per_iteration": 2.7335948944091797 }, { "auxiliary_loss_clip": 0.01096545, "auxiliary_loss_mlp": 0.01031606, "balance_loss_clip": 1.03442597, "balance_loss_mlp": 1.0184747, "epoch": 0.892168946339997, "flos": 21978982517760.0, "grad_norm": 2.2756639024172722, "language_loss": 0.76234394, "learning_rate": 1.2067396712904777e-07, "loss": 0.78362542, "num_input_tokens_seen": 320052885, "step": 14839, "time_per_iteration": 2.6222732067108154 }, { "auxiliary_loss_clip": 0.00999654, "auxiliary_loss_mlp": 0.00751271, "balance_loss_clip": 1.00644863, "balance_loss_mlp": 0.99961644, "epoch": 0.892229069592665, "flos": 67475289277440.0, "grad_norm": 0.6958789552427521, "language_loss": 0.49386242, "learning_rate": 1.205407673483978e-07, "loss": 0.51137161, "num_input_tokens_seen": 320113685, "step": 14840, "time_per_iteration": 3.1971607208251953 }, { "auxiliary_loss_clip": 0.0111346, "auxiliary_loss_mlp": 0.01031899, "balance_loss_clip": 1.03685474, "balance_loss_mlp": 1.01813662, "epoch": 0.8922891928453329, "flos": 19459561645440.0, "grad_norm": 2.2275620590123575, "language_loss": 0.64040601, "learning_rate": 1.2040763883703074e-07, "loss": 0.66185963, "num_input_tokens_seen": 320130810, "step": 14841, "time_per_iteration": 2.5630903244018555 }, { "auxiliary_loss_clip": 0.01073374, "auxiliary_loss_mlp": 0.00768866, "balance_loss_clip": 1.03585565, "balance_loss_mlp": 1.00014949, "epoch": 0.8923493160980009, "flos": 23367396873600.0, "grad_norm": 1.4260666189370539, "language_loss": 0.68198895, "learning_rate": 1.2027458159999438e-07, "loss": 0.70041138, "num_input_tokens_seen": 320152170, "step": 14842, "time_per_iteration": 2.7487709522247314 }, { "auxiliary_loss_clip": 0.01107456, "auxiliary_loss_mlp": 0.01036165, "balance_loss_clip": 1.03805566, "balance_loss_mlp": 1.02464318, "epoch": 0.8924094393506689, "flos": 26177047637760.0, "grad_norm": 2.0828434512728387, "language_loss": 0.80424309, "learning_rate": 1.2014159564233373e-07, "loss": 0.8256793, "num_input_tokens_seen": 320172360, "step": 14843, "time_per_iteration": 2.6367337703704834 }, { "auxiliary_loss_clip": 0.01084909, "auxiliary_loss_mlp": 0.01033483, "balance_loss_clip": 1.03361225, "balance_loss_mlp": 1.01991701, "epoch": 0.8924695626033369, "flos": 22018520413440.0, "grad_norm": 2.382089830168308, "language_loss": 0.68838096, "learning_rate": 1.2000868096909257e-07, "loss": 0.70956492, "num_input_tokens_seen": 320192130, "step": 14844, "time_per_iteration": 2.6400132179260254 }, { "auxiliary_loss_clip": 0.01064131, "auxiliary_loss_mlp": 0.0103004, "balance_loss_clip": 1.03404808, "balance_loss_mlp": 1.01779175, "epoch": 0.8925296858560048, "flos": 14793940166400.0, "grad_norm": 2.2436852387053134, "language_loss": 0.91622436, "learning_rate": 1.1987583758531038e-07, "loss": 0.93716609, "num_input_tokens_seen": 320207760, "step": 14845, "time_per_iteration": 2.74336314201355 }, { "auxiliary_loss_clip": 0.01089634, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.03469348, "balance_loss_mlp": 1.01985955, "epoch": 0.8925898091086728, "flos": 22346636175360.0, "grad_norm": 1.8155448981855211, "language_loss": 0.72219133, "learning_rate": 1.1974306549602476e-07, "loss": 0.74340343, "num_input_tokens_seen": 320225325, "step": 14846, "time_per_iteration": 2.628924608230591 }, { "auxiliary_loss_clip": 0.01084746, "auxiliary_loss_mlp": 0.01033411, "balance_loss_clip": 1.03907979, "balance_loss_mlp": 1.02118051, "epoch": 0.8926499323613407, "flos": 45806322067200.0, "grad_norm": 2.129136173165777, "language_loss": 0.56949878, "learning_rate": 1.1961036470627094e-07, "loss": 0.5906803, "num_input_tokens_seen": 320247645, "step": 14847, "time_per_iteration": 2.8942604064941406 }, { "auxiliary_loss_clip": 0.01071094, "auxiliary_loss_mlp": 0.01034217, "balance_loss_clip": 1.03545833, "balance_loss_mlp": 1.0223918, "epoch": 0.8927100556140087, "flos": 22127042378880.0, "grad_norm": 2.417347097790333, "language_loss": 0.76218295, "learning_rate": 1.1947773522108052e-07, "loss": 0.78323603, "num_input_tokens_seen": 320266005, "step": 14848, "time_per_iteration": 2.703596830368042 }, { "auxiliary_loss_clip": 0.01043101, "auxiliary_loss_mlp": 0.01046178, "balance_loss_clip": 1.0295552, "balance_loss_mlp": 1.03208137, "epoch": 0.8927701788666766, "flos": 28330143655680.0, "grad_norm": 2.5994238973384554, "language_loss": 0.69254899, "learning_rate": 1.1934517704548251e-07, "loss": 0.71344179, "num_input_tokens_seen": 320285555, "step": 14849, "time_per_iteration": 2.7903876304626465 }, { "auxiliary_loss_clip": 0.01099654, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.03864908, "balance_loss_mlp": 1.02075911, "epoch": 0.8928303021193447, "flos": 25294973351040.0, "grad_norm": 1.9684228103737367, "language_loss": 0.80747259, "learning_rate": 1.1921269018450364e-07, "loss": 0.8287974, "num_input_tokens_seen": 320305395, "step": 14850, "time_per_iteration": 2.6187615394592285 }, { "auxiliary_loss_clip": 0.01087788, "auxiliary_loss_mlp": 0.01037651, "balance_loss_clip": 1.03636372, "balance_loss_mlp": 1.02547944, "epoch": 0.8928904253720126, "flos": 22236713579520.0, "grad_norm": 1.6645229603446685, "language_loss": 0.74605459, "learning_rate": 1.1908027464316872e-07, "loss": 0.76730895, "num_input_tokens_seen": 320324220, "step": 14851, "time_per_iteration": 2.6631858348846436 }, { "auxiliary_loss_clip": 0.0108452, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.03504527, "balance_loss_mlp": 1.01692009, "epoch": 0.8929505486246806, "flos": 27092374940160.0, "grad_norm": 1.5560164927466833, "language_loss": 0.78718781, "learning_rate": 1.1894793042649775e-07, "loss": 0.80832505, "num_input_tokens_seen": 320347195, "step": 14852, "time_per_iteration": 5.973539113998413 }, { "auxiliary_loss_clip": 0.01091326, "auxiliary_loss_mlp": 0.01033169, "balance_loss_clip": 1.03806448, "balance_loss_mlp": 1.0212301, "epoch": 0.8930106718773486, "flos": 23039352938880.0, "grad_norm": 2.4577931840380596, "language_loss": 0.69120765, "learning_rate": 1.1881565753951006e-07, "loss": 0.71245253, "num_input_tokens_seen": 320366850, "step": 14853, "time_per_iteration": 2.6630473136901855 }, { "auxiliary_loss_clip": 0.01060947, "auxiliary_loss_mlp": 0.01032204, "balance_loss_clip": 1.03697348, "balance_loss_mlp": 1.01997924, "epoch": 0.8930707951300165, "flos": 35626652887680.0, "grad_norm": 1.537977130569083, "language_loss": 0.67207319, "learning_rate": 1.1868345598722118e-07, "loss": 0.69300473, "num_input_tokens_seen": 320388895, "step": 14854, "time_per_iteration": 4.400064945220947 }, { "auxiliary_loss_clip": 0.01081067, "auxiliary_loss_mlp": 0.01040836, "balance_loss_clip": 1.03309155, "balance_loss_mlp": 1.02784824, "epoch": 0.8931309183826845, "flos": 23039891642880.0, "grad_norm": 1.6742794068707105, "language_loss": 0.74868983, "learning_rate": 1.1855132577464399e-07, "loss": 0.76990891, "num_input_tokens_seen": 320408520, "step": 14855, "time_per_iteration": 4.200139284133911 }, { "auxiliary_loss_clip": 0.01086542, "auxiliary_loss_mlp": 0.01032762, "balance_loss_clip": 1.03601122, "balance_loss_mlp": 1.02056086, "epoch": 0.8931910416353525, "flos": 26504624695680.0, "grad_norm": 2.5885445984861613, "language_loss": 0.64431441, "learning_rate": 1.1841926690678893e-07, "loss": 0.6655075, "num_input_tokens_seen": 320427400, "step": 14856, "time_per_iteration": 2.657810926437378 }, { "auxiliary_loss_clip": 0.0110682, "auxiliary_loss_mlp": 0.0102884, "balance_loss_clip": 1.03531027, "balance_loss_mlp": 1.01715207, "epoch": 0.8932511648880205, "flos": 24973609345920.0, "grad_norm": 1.6750308846874502, "language_loss": 0.66575366, "learning_rate": 1.1828727938866378e-07, "loss": 0.68711025, "num_input_tokens_seen": 320447570, "step": 14857, "time_per_iteration": 2.644740343093872 }, { "auxiliary_loss_clip": 0.01068637, "auxiliary_loss_mlp": 0.01038826, "balance_loss_clip": 1.04051542, "balance_loss_mlp": 1.02599871, "epoch": 0.8933112881406884, "flos": 24460733001600.0, "grad_norm": 2.232767512472365, "language_loss": 0.75065112, "learning_rate": 1.1815536322527408e-07, "loss": 0.77172571, "num_input_tokens_seen": 320464405, "step": 14858, "time_per_iteration": 2.7609682083129883 }, { "auxiliary_loss_clip": 0.01096177, "auxiliary_loss_mlp": 0.0102937, "balance_loss_clip": 1.03594685, "balance_loss_mlp": 1.01651311, "epoch": 0.8933714113933564, "flos": 28293083798400.0, "grad_norm": 1.825199606882533, "language_loss": 0.69551903, "learning_rate": 1.1802351842162139e-07, "loss": 0.71677446, "num_input_tokens_seen": 320485525, "step": 14859, "time_per_iteration": 2.6836822032928467 }, { "auxiliary_loss_clip": 0.01056346, "auxiliary_loss_mlp": 0.01028474, "balance_loss_clip": 1.03371429, "balance_loss_mlp": 1.0170536, "epoch": 0.8934315346460243, "flos": 21434864319360.0, "grad_norm": 1.6309895409207762, "language_loss": 0.75540131, "learning_rate": 1.1789174498270526e-07, "loss": 0.77624959, "num_input_tokens_seen": 320506725, "step": 14860, "time_per_iteration": 2.76859450340271 }, { "auxiliary_loss_clip": 0.01086873, "auxiliary_loss_mlp": 0.01033583, "balance_loss_clip": 1.03512859, "balance_loss_mlp": 1.02008855, "epoch": 0.8934916578986923, "flos": 23769596436480.0, "grad_norm": 4.01916529302481, "language_loss": 0.57677805, "learning_rate": 1.1776004291352303e-07, "loss": 0.59798259, "num_input_tokens_seen": 320525425, "step": 14861, "time_per_iteration": 2.661344289779663 }, { "auxiliary_loss_clip": 0.01078056, "auxiliary_loss_mlp": 0.01033158, "balance_loss_clip": 1.03267503, "balance_loss_mlp": 1.02077198, "epoch": 0.8935517811513602, "flos": 18916161719040.0, "grad_norm": 1.9140763695424603, "language_loss": 0.63545376, "learning_rate": 1.176284122190685e-07, "loss": 0.6565659, "num_input_tokens_seen": 320543010, "step": 14862, "time_per_iteration": 2.5856823921203613 }, { "auxiliary_loss_clip": 0.01092562, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.03338671, "balance_loss_mlp": 1.01455998, "epoch": 0.8936119044040283, "flos": 24061370613120.0, "grad_norm": 2.1334167708433323, "language_loss": 0.78088272, "learning_rate": 1.1749685290433298e-07, "loss": 0.80207497, "num_input_tokens_seen": 320562180, "step": 14863, "time_per_iteration": 2.611900806427002 }, { "auxiliary_loss_clip": 0.01080768, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.03352034, "balance_loss_mlp": 1.02448487, "epoch": 0.8936720276566962, "flos": 21324079797120.0, "grad_norm": 1.7735911629661039, "language_loss": 0.71075487, "learning_rate": 1.1736536497430627e-07, "loss": 0.73193425, "num_input_tokens_seen": 320580395, "step": 14864, "time_per_iteration": 2.691619873046875 }, { "auxiliary_loss_clip": 0.01101616, "auxiliary_loss_mlp": 0.01037124, "balance_loss_clip": 1.03658938, "balance_loss_mlp": 1.02402878, "epoch": 0.8937321509093642, "flos": 18406122549120.0, "grad_norm": 2.399528351176047, "language_loss": 0.76093769, "learning_rate": 1.1723394843397283e-07, "loss": 0.78232509, "num_input_tokens_seen": 320599505, "step": 14865, "time_per_iteration": 2.6147727966308594 }, { "auxiliary_loss_clip": 0.01069542, "auxiliary_loss_mlp": 0.01032163, "balance_loss_clip": 1.03533304, "balance_loss_mlp": 1.02058804, "epoch": 0.8937922741620322, "flos": 22054754257920.0, "grad_norm": 1.8011765216812077, "language_loss": 0.72078204, "learning_rate": 1.1710260328831668e-07, "loss": 0.74179912, "num_input_tokens_seen": 320619825, "step": 14866, "time_per_iteration": 2.7329297065734863 }, { "auxiliary_loss_clip": 0.01100829, "auxiliary_loss_mlp": 0.01029076, "balance_loss_clip": 1.0382688, "balance_loss_mlp": 1.01533771, "epoch": 0.8938523974147001, "flos": 25664386775040.0, "grad_norm": 1.830929850281708, "language_loss": 0.83762133, "learning_rate": 1.1697132954231869e-07, "loss": 0.8589204, "num_input_tokens_seen": 320638515, "step": 14867, "time_per_iteration": 2.668128728866577 }, { "auxiliary_loss_clip": 0.0109843, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.03669333, "balance_loss_mlp": 1.01795816, "epoch": 0.8939125206673681, "flos": 25742852035200.0, "grad_norm": 1.586495908389307, "language_loss": 0.80449593, "learning_rate": 1.168401272009567e-07, "loss": 0.82577085, "num_input_tokens_seen": 320659430, "step": 14868, "time_per_iteration": 2.680034637451172 }, { "auxiliary_loss_clip": 0.01083053, "auxiliary_loss_mlp": 0.01033575, "balance_loss_clip": 1.03728485, "balance_loss_mlp": 1.0209384, "epoch": 0.8939726439200361, "flos": 27344503480320.0, "grad_norm": 1.8649016797962312, "language_loss": 0.7731384, "learning_rate": 1.167089962692056e-07, "loss": 0.79430467, "num_input_tokens_seen": 320679295, "step": 14869, "time_per_iteration": 2.745805263519287 }, { "auxiliary_loss_clip": 0.01097268, "auxiliary_loss_mlp": 0.00769412, "balance_loss_clip": 1.03609347, "balance_loss_mlp": 1.00023556, "epoch": 0.8940327671727041, "flos": 20338834671360.0, "grad_norm": 1.4278023080407176, "language_loss": 0.65314829, "learning_rate": 1.1657793675203853e-07, "loss": 0.67181504, "num_input_tokens_seen": 320697535, "step": 14870, "time_per_iteration": 2.6284589767456055 }, { "auxiliary_loss_clip": 0.00993024, "auxiliary_loss_mlp": 0.0102124, "balance_loss_clip": 1.00702477, "balance_loss_mlp": 1.01970196, "epoch": 0.894092890425372, "flos": 58410573235200.0, "grad_norm": 0.7966327834428544, "language_loss": 0.55929744, "learning_rate": 1.1644694865442461e-07, "loss": 0.57944012, "num_input_tokens_seen": 320758635, "step": 14871, "time_per_iteration": 3.3122901916503906 }, { "auxiliary_loss_clip": 0.01091917, "auxiliary_loss_mlp": 0.0103181, "balance_loss_clip": 1.03682566, "balance_loss_mlp": 1.02005625, "epoch": 0.89415301367804, "flos": 19829657427840.0, "grad_norm": 1.9266754384359623, "language_loss": 0.76406336, "learning_rate": 1.16316031981331e-07, "loss": 0.78530067, "num_input_tokens_seen": 320777175, "step": 14872, "time_per_iteration": 2.6247551441192627 }, { "auxiliary_loss_clip": 0.01094372, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.03704429, "balance_loss_mlp": 1.02015495, "epoch": 0.8942131369307079, "flos": 25775781828480.0, "grad_norm": 1.648018727425323, "language_loss": 0.67068255, "learning_rate": 1.1618518673772215e-07, "loss": 0.69193947, "num_input_tokens_seen": 320797670, "step": 14873, "time_per_iteration": 2.6552417278289795 }, { "auxiliary_loss_clip": 0.01105979, "auxiliary_loss_mlp": 0.01034727, "balance_loss_clip": 1.03645134, "balance_loss_mlp": 1.02241898, "epoch": 0.8942732601833759, "flos": 23149024139520.0, "grad_norm": 1.5958829385063367, "language_loss": 0.59345031, "learning_rate": 1.1605441292856033e-07, "loss": 0.61485744, "num_input_tokens_seen": 320817410, "step": 14874, "time_per_iteration": 2.5860843658447266 }, { "auxiliary_loss_clip": 0.01078313, "auxiliary_loss_mlp": 0.01032167, "balance_loss_clip": 1.03743887, "balance_loss_mlp": 1.01903629, "epoch": 0.8943333834360438, "flos": 27855548231040.0, "grad_norm": 1.8290237697003595, "language_loss": 0.75576758, "learning_rate": 1.1592371055880356e-07, "loss": 0.7768724, "num_input_tokens_seen": 320836745, "step": 14875, "time_per_iteration": 2.7420151233673096 }, { "auxiliary_loss_clip": 0.01079183, "auxiliary_loss_mlp": 0.0103557, "balance_loss_clip": 1.03446269, "balance_loss_mlp": 1.0205195, "epoch": 0.8943935066887119, "flos": 22163958581760.0, "grad_norm": 2.3429928427333926, "language_loss": 0.77405798, "learning_rate": 1.1579307963340857e-07, "loss": 0.79520553, "num_input_tokens_seen": 320853305, "step": 14876, "time_per_iteration": 2.816397190093994 }, { "auxiliary_loss_clip": 0.01096244, "auxiliary_loss_mlp": 0.01025808, "balance_loss_clip": 1.0358882, "balance_loss_mlp": 1.01482916, "epoch": 0.8944536299413798, "flos": 21470056669440.0, "grad_norm": 1.6703549010755179, "language_loss": 0.78432184, "learning_rate": 1.156625201573287e-07, "loss": 0.80554235, "num_input_tokens_seen": 320872885, "step": 14877, "time_per_iteration": 2.7098886966705322 }, { "auxiliary_loss_clip": 0.01059905, "auxiliary_loss_mlp": 0.01039763, "balance_loss_clip": 1.03192687, "balance_loss_mlp": 1.02515423, "epoch": 0.8945137531940478, "flos": 17748777703680.0, "grad_norm": 2.0737748491478465, "language_loss": 0.7512145, "learning_rate": 1.155320321355151e-07, "loss": 0.77221119, "num_input_tokens_seen": 320889755, "step": 14878, "time_per_iteration": 2.6619186401367188 }, { "auxiliary_loss_clip": 0.01094053, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 1.03389883, "balance_loss_mlp": 1.01564312, "epoch": 0.8945738764467158, "flos": 21142264129920.0, "grad_norm": 1.682076176326582, "language_loss": 0.76145089, "learning_rate": 1.1540161557291539e-07, "loss": 0.78269374, "num_input_tokens_seen": 320907860, "step": 14879, "time_per_iteration": 2.5775701999664307 }, { "auxiliary_loss_clip": 0.01078076, "auxiliary_loss_mlp": 0.0103149, "balance_loss_clip": 1.03829026, "balance_loss_mlp": 1.01952147, "epoch": 0.8946339996993837, "flos": 14903000835840.0, "grad_norm": 1.842392268931871, "language_loss": 0.74446988, "learning_rate": 1.1527127047447538e-07, "loss": 0.76556557, "num_input_tokens_seen": 320925825, "step": 14880, "time_per_iteration": 2.665179967880249 }, { "auxiliary_loss_clip": 0.0109132, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.03410816, "balance_loss_mlp": 1.01687312, "epoch": 0.8946941229520518, "flos": 27382173868800.0, "grad_norm": 1.5269173028094163, "language_loss": 0.82799721, "learning_rate": 1.1514099684513822e-07, "loss": 0.84920919, "num_input_tokens_seen": 320946165, "step": 14881, "time_per_iteration": 2.6503562927246094 }, { "auxiliary_loss_clip": 0.01067605, "auxiliary_loss_mlp": 0.00770988, "balance_loss_clip": 1.03390133, "balance_loss_mlp": 1.00022626, "epoch": 0.8947542462047197, "flos": 31796277338880.0, "grad_norm": 1.614884288144251, "language_loss": 0.67639142, "learning_rate": 1.1501079468984287e-07, "loss": 0.69477737, "num_input_tokens_seen": 320969330, "step": 14882, "time_per_iteration": 2.7693512439727783 }, { "auxiliary_loss_clip": 0.01085159, "auxiliary_loss_mlp": 0.0103542, "balance_loss_clip": 1.03287458, "balance_loss_mlp": 1.0205431, "epoch": 0.8948143694573877, "flos": 20883599314560.0, "grad_norm": 2.045453824962206, "language_loss": 0.74976206, "learning_rate": 1.1488066401352691e-07, "loss": 0.77096784, "num_input_tokens_seen": 320985055, "step": 14883, "time_per_iteration": 2.6624233722686768 }, { "auxiliary_loss_clip": 0.01080827, "auxiliary_loss_mlp": 0.01033735, "balance_loss_clip": 1.03383732, "balance_loss_mlp": 1.02153993, "epoch": 0.8948744927100556, "flos": 28215552291840.0, "grad_norm": 1.5810148244458424, "language_loss": 0.72292316, "learning_rate": 1.147506048211253e-07, "loss": 0.74406874, "num_input_tokens_seen": 321004720, "step": 14884, "time_per_iteration": 2.6995975971221924 }, { "auxiliary_loss_clip": 0.01076203, "auxiliary_loss_mlp": 0.01030683, "balance_loss_clip": 1.03102303, "balance_loss_mlp": 1.0188036, "epoch": 0.8949346159627236, "flos": 21902672073600.0, "grad_norm": 1.6922147897555293, "language_loss": 0.75564313, "learning_rate": 1.1462061711756987e-07, "loss": 0.77671194, "num_input_tokens_seen": 321022350, "step": 14885, "time_per_iteration": 2.628843069076538 }, { "auxiliary_loss_clip": 0.01081812, "auxiliary_loss_mlp": 0.01031169, "balance_loss_clip": 1.03561521, "balance_loss_mlp": 1.01841331, "epoch": 0.8949947392153915, "flos": 21359128492800.0, "grad_norm": 1.9911058650536606, "language_loss": 0.81962872, "learning_rate": 1.1449070090778911e-07, "loss": 0.84075844, "num_input_tokens_seen": 321040450, "step": 14886, "time_per_iteration": 2.6610560417175293 }, { "auxiliary_loss_clip": 0.01047486, "auxiliary_loss_mlp": 0.01027777, "balance_loss_clip": 1.03327608, "balance_loss_mlp": 1.01596951, "epoch": 0.8950548624680595, "flos": 52445342799360.0, "grad_norm": 1.5558434759275688, "language_loss": 0.63781691, "learning_rate": 1.1436085619671043e-07, "loss": 0.65856951, "num_input_tokens_seen": 321063970, "step": 14887, "time_per_iteration": 3.0324647426605225 }, { "auxiliary_loss_clip": 0.01088528, "auxiliary_loss_mlp": 0.01035221, "balance_loss_clip": 1.03487492, "balance_loss_mlp": 1.02251327, "epoch": 0.8951149857207275, "flos": 20121323863680.0, "grad_norm": 1.8589921868531927, "language_loss": 0.60964525, "learning_rate": 1.1423108298925698e-07, "loss": 0.63088268, "num_input_tokens_seen": 321083840, "step": 14888, "time_per_iteration": 2.745520830154419 }, { "auxiliary_loss_clip": 0.01110592, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.0367682, "balance_loss_mlp": 1.01834917, "epoch": 0.8951751089733955, "flos": 29862631463040.0, "grad_norm": 2.002662666178723, "language_loss": 0.70275199, "learning_rate": 1.1410138129034952e-07, "loss": 0.72416401, "num_input_tokens_seen": 321104165, "step": 14889, "time_per_iteration": 2.6176459789276123 }, { "auxiliary_loss_clip": 0.01096532, "auxiliary_loss_mlp": 0.00770989, "balance_loss_clip": 1.03800106, "balance_loss_mlp": 1.00024951, "epoch": 0.8952352322260634, "flos": 15262789415040.0, "grad_norm": 2.7797851150305615, "language_loss": 0.71586537, "learning_rate": 1.1397175110490676e-07, "loss": 0.73454058, "num_input_tokens_seen": 321117290, "step": 14890, "time_per_iteration": 2.5783839225769043 }, { "auxiliary_loss_clip": 0.0102349, "auxiliary_loss_mlp": 0.00773622, "balance_loss_clip": 1.02805948, "balance_loss_mlp": 1.00013435, "epoch": 0.8952953554787314, "flos": 26798338206720.0, "grad_norm": 1.483485143798382, "language_loss": 0.75744319, "learning_rate": 1.1384219243784454e-07, "loss": 0.77541423, "num_input_tokens_seen": 321137115, "step": 14891, "time_per_iteration": 6.244478225708008 }, { "auxiliary_loss_clip": 0.01051483, "auxiliary_loss_mlp": 0.01035049, "balance_loss_clip": 1.03069568, "balance_loss_mlp": 1.02226329, "epoch": 0.8953554787313994, "flos": 14137205852160.0, "grad_norm": 2.0123273105882586, "language_loss": 0.76453358, "learning_rate": 1.1371270529407517e-07, "loss": 0.7853989, "num_input_tokens_seen": 321154490, "step": 14892, "time_per_iteration": 3.087535858154297 }, { "auxiliary_loss_clip": 0.01093667, "auxiliary_loss_mlp": 0.01032529, "balance_loss_clip": 1.03796649, "balance_loss_mlp": 1.02048337, "epoch": 0.8954156019840673, "flos": 25703314139520.0, "grad_norm": 3.335988726881917, "language_loss": 0.81619698, "learning_rate": 1.1358328967850895e-07, "loss": 0.83745897, "num_input_tokens_seen": 321175625, "step": 14893, "time_per_iteration": 4.313986778259277 }, { "auxiliary_loss_clip": 0.01061423, "auxiliary_loss_mlp": 0.01032555, "balance_loss_clip": 1.03349638, "balance_loss_mlp": 1.02072382, "epoch": 0.8954757252367354, "flos": 21907987286400.0, "grad_norm": 1.880542691848622, "language_loss": 0.74994141, "learning_rate": 1.1345394559605348e-07, "loss": 0.77088118, "num_input_tokens_seen": 321193895, "step": 14894, "time_per_iteration": 2.9463634490966797 }, { "auxiliary_loss_clip": 0.01097897, "auxiliary_loss_mlp": 0.0103303, "balance_loss_clip": 1.03915453, "balance_loss_mlp": 1.01995826, "epoch": 0.8955358484894033, "flos": 12970396454400.0, "grad_norm": 1.9665552489767175, "language_loss": 0.66606176, "learning_rate": 1.1332467305161352e-07, "loss": 0.68737108, "num_input_tokens_seen": 321211610, "step": 14895, "time_per_iteration": 4.159812927246094 }, { "auxiliary_loss_clip": 0.01099951, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.03752875, "balance_loss_mlp": 1.01608145, "epoch": 0.8955959717420713, "flos": 17273966797440.0, "grad_norm": 1.671045590451987, "language_loss": 0.67131901, "learning_rate": 1.1319547205009094e-07, "loss": 0.69261479, "num_input_tokens_seen": 321229805, "step": 14896, "time_per_iteration": 2.5856170654296875 }, { "auxiliary_loss_clip": 0.01099928, "auxiliary_loss_mlp": 0.01033038, "balance_loss_clip": 1.03831029, "balance_loss_mlp": 1.0207119, "epoch": 0.8956560949947392, "flos": 14793868339200.0, "grad_norm": 1.8809584975485838, "language_loss": 0.75465834, "learning_rate": 1.1306634259638492e-07, "loss": 0.77598798, "num_input_tokens_seen": 321247165, "step": 14897, "time_per_iteration": 2.657931089401245 }, { "auxiliary_loss_clip": 0.00994794, "auxiliary_loss_mlp": 0.00751908, "balance_loss_clip": 1.00807071, "balance_loss_mlp": 0.99958485, "epoch": 0.8957162182474072, "flos": 63607817957760.0, "grad_norm": 0.7439840356253357, "language_loss": 0.55338937, "learning_rate": 1.129372846953931e-07, "loss": 0.57085639, "num_input_tokens_seen": 321308425, "step": 14898, "time_per_iteration": 3.3162429332733154 }, { "auxiliary_loss_clip": 0.01109726, "auxiliary_loss_mlp": 0.00771113, "balance_loss_clip": 1.03748989, "balance_loss_mlp": 1.00012457, "epoch": 0.8957763415000751, "flos": 25009843190400.0, "grad_norm": 1.500591280857772, "language_loss": 0.70237386, "learning_rate": 1.12808298352008e-07, "loss": 0.72118223, "num_input_tokens_seen": 321329295, "step": 14899, "time_per_iteration": 2.6604552268981934 }, { "auxiliary_loss_clip": 0.01054108, "auxiliary_loss_mlp": 0.0103587, "balance_loss_clip": 1.03760815, "balance_loss_mlp": 1.02217865, "epoch": 0.8958364647527431, "flos": 19828615933440.0, "grad_norm": 1.672513533456995, "language_loss": 0.73965251, "learning_rate": 1.1267938357112106e-07, "loss": 0.76055229, "num_input_tokens_seen": 321347580, "step": 14900, "time_per_iteration": 2.7858917713165283 }, { "auxiliary_loss_clip": 0.00999101, "auxiliary_loss_mlp": 0.01000333, "balance_loss_clip": 1.01374125, "balance_loss_mlp": 0.99923056, "epoch": 0.895896588005411, "flos": 65537190115200.0, "grad_norm": 0.793037706766976, "language_loss": 0.61771894, "learning_rate": 1.1255054035762124e-07, "loss": 0.63771325, "num_input_tokens_seen": 321407820, "step": 14901, "time_per_iteration": 3.225350856781006 }, { "auxiliary_loss_clip": 0.01099179, "auxiliary_loss_mlp": 0.01029055, "balance_loss_clip": 1.03669286, "balance_loss_mlp": 1.01680589, "epoch": 0.8959567112580791, "flos": 25591021246080.0, "grad_norm": 1.6768583776386496, "language_loss": 0.70434642, "learning_rate": 1.1242176871639441e-07, "loss": 0.72562879, "num_input_tokens_seen": 321426745, "step": 14902, "time_per_iteration": 2.629722833633423 }, { "auxiliary_loss_clip": 0.01080163, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.03510499, "balance_loss_mlp": 1.01877761, "epoch": 0.896016834510747, "flos": 24201780877440.0, "grad_norm": 1.8587409033889455, "language_loss": 0.78276879, "learning_rate": 1.1229306865232313e-07, "loss": 0.80387414, "num_input_tokens_seen": 321446165, "step": 14903, "time_per_iteration": 2.6630077362060547 }, { "auxiliary_loss_clip": 0.01085975, "auxiliary_loss_mlp": 0.01034269, "balance_loss_clip": 1.03611159, "balance_loss_mlp": 1.02067935, "epoch": 0.896076957763415, "flos": 23075945919360.0, "grad_norm": 1.7273682997495312, "language_loss": 0.73095953, "learning_rate": 1.121644401702877e-07, "loss": 0.75216204, "num_input_tokens_seen": 321465285, "step": 14904, "time_per_iteration": 2.656641721725464 }, { "auxiliary_loss_clip": 0.01097461, "auxiliary_loss_mlp": 0.01028056, "balance_loss_clip": 1.03512216, "balance_loss_mlp": 1.0144484, "epoch": 0.8961370810160829, "flos": 22236605838720.0, "grad_norm": 1.972644186412881, "language_loss": 0.74508619, "learning_rate": 1.12035883275166e-07, "loss": 0.76634133, "num_input_tokens_seen": 321483670, "step": 14905, "time_per_iteration": 2.5795624256134033 }, { "auxiliary_loss_clip": 0.01096538, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.03503621, "balance_loss_mlp": 1.02032363, "epoch": 0.8961972042687509, "flos": 23072318645760.0, "grad_norm": 2.276578769172911, "language_loss": 0.76414752, "learning_rate": 1.1190739797183279e-07, "loss": 0.78543842, "num_input_tokens_seen": 321501190, "step": 14906, "time_per_iteration": 2.608065605163574 }, { "auxiliary_loss_clip": 0.0109916, "auxiliary_loss_mlp": 0.01034258, "balance_loss_clip": 1.03820026, "balance_loss_mlp": 1.02151513, "epoch": 0.896257327521419, "flos": 18185882307840.0, "grad_norm": 1.6230699036233884, "language_loss": 0.7410239, "learning_rate": 1.1177898426515996e-07, "loss": 0.76235807, "num_input_tokens_seen": 321518540, "step": 14907, "time_per_iteration": 2.5740091800689697 }, { "auxiliary_loss_clip": 0.01098288, "auxiliary_loss_mlp": 0.0103583, "balance_loss_clip": 1.03720152, "balance_loss_mlp": 1.0237062, "epoch": 0.8963174507740869, "flos": 17895472848000.0, "grad_norm": 3.741927314180935, "language_loss": 0.82670319, "learning_rate": 1.1165064216001785e-07, "loss": 0.84804434, "num_input_tokens_seen": 321536555, "step": 14908, "time_per_iteration": 2.5786521434783936 }, { "auxiliary_loss_clip": 0.01086384, "auxiliary_loss_mlp": 0.01031261, "balance_loss_clip": 1.0361675, "balance_loss_mlp": 1.01765347, "epoch": 0.8963775740267549, "flos": 21032269706880.0, "grad_norm": 2.161346134185943, "language_loss": 0.70245093, "learning_rate": 1.1152237166127232e-07, "loss": 0.72362739, "num_input_tokens_seen": 321557655, "step": 14909, "time_per_iteration": 2.652540445327759 }, { "auxiliary_loss_clip": 0.01076255, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.03869569, "balance_loss_mlp": 1.02353907, "epoch": 0.8964376972794228, "flos": 23179619548800.0, "grad_norm": 16.444537313084084, "language_loss": 0.7209096, "learning_rate": 1.113941727737877e-07, "loss": 0.74203527, "num_input_tokens_seen": 321576160, "step": 14910, "time_per_iteration": 2.6874682903289795 }, { "auxiliary_loss_clip": 0.01095164, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.03482211, "balance_loss_mlp": 1.01633346, "epoch": 0.8964978205320908, "flos": 24972998814720.0, "grad_norm": 2.519588986898142, "language_loss": 0.6361804, "learning_rate": 1.1126604550242502e-07, "loss": 0.65741479, "num_input_tokens_seen": 321596205, "step": 14911, "time_per_iteration": 2.594196081161499 }, { "auxiliary_loss_clip": 0.01082355, "auxiliary_loss_mlp": 0.00770688, "balance_loss_clip": 1.03677964, "balance_loss_mlp": 1.00020111, "epoch": 0.8965579437847587, "flos": 19172025273600.0, "grad_norm": 1.6361676394072804, "language_loss": 0.74929178, "learning_rate": 1.111379898520437e-07, "loss": 0.76782227, "num_input_tokens_seen": 321614800, "step": 14912, "time_per_iteration": 2.620948076248169 }, { "auxiliary_loss_clip": 0.01083336, "auxiliary_loss_mlp": 0.01037867, "balance_loss_clip": 1.03508806, "balance_loss_mlp": 1.02545714, "epoch": 0.8966180670374267, "flos": 24276690691200.0, "grad_norm": 1.791048209942099, "language_loss": 0.81890047, "learning_rate": 1.1101000582749876e-07, "loss": 0.84011245, "num_input_tokens_seen": 321633445, "step": 14913, "time_per_iteration": 2.6343531608581543 }, { "auxiliary_loss_clip": 0.01101255, "auxiliary_loss_mlp": 0.01035811, "balance_loss_clip": 1.03797901, "balance_loss_mlp": 1.02259076, "epoch": 0.8966781902900947, "flos": 13553190622080.0, "grad_norm": 3.5493075869596176, "language_loss": 0.61391163, "learning_rate": 1.1088209343364407e-07, "loss": 0.63528228, "num_input_tokens_seen": 321650890, "step": 14914, "time_per_iteration": 2.611363649368286 }, { "auxiliary_loss_clip": 0.01005981, "auxiliary_loss_mlp": 0.00999937, "balance_loss_clip": 1.00627279, "balance_loss_mlp": 0.99880487, "epoch": 0.8967383135427627, "flos": 65066114223360.0, "grad_norm": 2.6237376103475905, "language_loss": 0.5505228, "learning_rate": 1.1075425267532956e-07, "loss": 0.57058197, "num_input_tokens_seen": 321710960, "step": 14915, "time_per_iteration": 3.191149950027466 }, { "auxiliary_loss_clip": 0.01068433, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.03356564, "balance_loss_mlp": 1.02107704, "epoch": 0.8967984367954306, "flos": 29713027317120.0, "grad_norm": 1.506591711885427, "language_loss": 0.71458489, "learning_rate": 1.1062648355740289e-07, "loss": 0.73559618, "num_input_tokens_seen": 321733290, "step": 14916, "time_per_iteration": 2.7623350620269775 }, { "auxiliary_loss_clip": 0.01087907, "auxiliary_loss_mlp": 0.01030723, "balance_loss_clip": 1.03692842, "balance_loss_mlp": 1.01904035, "epoch": 0.8968585600480986, "flos": 25702488126720.0, "grad_norm": 1.8194206716370875, "language_loss": 0.77866107, "learning_rate": 1.1049878608470931e-07, "loss": 0.79984742, "num_input_tokens_seen": 321753120, "step": 14917, "time_per_iteration": 2.6854681968688965 }, { "auxiliary_loss_clip": 0.01102374, "auxiliary_loss_mlp": 0.01041532, "balance_loss_clip": 1.03805685, "balance_loss_mlp": 1.02815735, "epoch": 0.8969186833007665, "flos": 30044698525440.0, "grad_norm": 1.9817396257364666, "language_loss": 0.6853829, "learning_rate": 1.1037116026209137e-07, "loss": 0.70682192, "num_input_tokens_seen": 321772840, "step": 14918, "time_per_iteration": 2.6850335597991943 }, { "auxiliary_loss_clip": 0.01059733, "auxiliary_loss_mlp": 0.010324, "balance_loss_clip": 1.0353421, "balance_loss_mlp": 1.02087831, "epoch": 0.8969788065534345, "flos": 22818143030400.0, "grad_norm": 1.8968334913567422, "language_loss": 0.83584672, "learning_rate": 1.102436060943881e-07, "loss": 0.85676813, "num_input_tokens_seen": 321791020, "step": 14919, "time_per_iteration": 2.7944953441619873 }, { "auxiliary_loss_clip": 0.0110904, "auxiliary_loss_mlp": 0.00771505, "balance_loss_clip": 1.03592348, "balance_loss_mlp": 1.00021255, "epoch": 0.8970389298061026, "flos": 13261488272640.0, "grad_norm": 6.002098471284828, "language_loss": 0.72274148, "learning_rate": 1.1011612358643696e-07, "loss": 0.74154693, "num_input_tokens_seen": 321810075, "step": 14920, "time_per_iteration": 2.641122579574585 }, { "auxiliary_loss_clip": 0.01096514, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.03508401, "balance_loss_mlp": 1.01865947, "epoch": 0.8970990530587705, "flos": 10266071345280.0, "grad_norm": 2.2258437639369753, "language_loss": 0.90893173, "learning_rate": 1.0998871274307164e-07, "loss": 0.93021685, "num_input_tokens_seen": 321822635, "step": 14921, "time_per_iteration": 2.5753695964813232 }, { "auxiliary_loss_clip": 0.0105106, "auxiliary_loss_mlp": 0.0103667, "balance_loss_clip": 1.03290153, "balance_loss_mlp": 1.02218616, "epoch": 0.8971591763114385, "flos": 20302708567680.0, "grad_norm": 1.7221269856692987, "language_loss": 0.73712015, "learning_rate": 1.0986137356912384e-07, "loss": 0.75799739, "num_input_tokens_seen": 321841130, "step": 14922, "time_per_iteration": 2.796809673309326 }, { "auxiliary_loss_clip": 0.01059125, "auxiliary_loss_mlp": 0.01039549, "balance_loss_clip": 1.0326159, "balance_loss_mlp": 1.02563119, "epoch": 0.8972192995641064, "flos": 23257043314560.0, "grad_norm": 1.7526261778537016, "language_loss": 0.70386976, "learning_rate": 1.097341060694219e-07, "loss": 0.7248565, "num_input_tokens_seen": 321859855, "step": 14923, "time_per_iteration": 2.716149091720581 }, { "auxiliary_loss_clip": 0.01087701, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.03695786, "balance_loss_mlp": 1.01585746, "epoch": 0.8972794228167744, "flos": 18369601395840.0, "grad_norm": 2.5800290587382606, "language_loss": 0.7121672, "learning_rate": 1.0960691024879221e-07, "loss": 0.73333609, "num_input_tokens_seen": 321877990, "step": 14924, "time_per_iteration": 2.6310861110687256 }, { "auxiliary_loss_clip": 0.01094366, "auxiliary_loss_mlp": 0.01035713, "balance_loss_clip": 1.03357565, "balance_loss_mlp": 1.02411425, "epoch": 0.8973395460694423, "flos": 23952058548480.0, "grad_norm": 1.524405249104344, "language_loss": 0.720016, "learning_rate": 1.0947978611205844e-07, "loss": 0.7413168, "num_input_tokens_seen": 321898120, "step": 14925, "time_per_iteration": 2.665548324584961 }, { "auxiliary_loss_clip": 0.01087294, "auxiliary_loss_mlp": 0.00773098, "balance_loss_clip": 1.03590477, "balance_loss_mlp": 1.0001936, "epoch": 0.8973996693221103, "flos": 24970843998720.0, "grad_norm": 1.8138493402848186, "language_loss": 0.82518828, "learning_rate": 1.0935273366404008e-07, "loss": 0.84379226, "num_input_tokens_seen": 321918140, "step": 14926, "time_per_iteration": 2.6425201892852783 }, { "auxiliary_loss_clip": 0.01054597, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.03193653, "balance_loss_mlp": 1.0189445, "epoch": 0.8974597925747783, "flos": 25738937452800.0, "grad_norm": 1.4975243359696364, "language_loss": 0.7919172, "learning_rate": 1.092257529095555e-07, "loss": 0.81277287, "num_input_tokens_seen": 321938580, "step": 14927, "time_per_iteration": 2.760615825653076 }, { "auxiliary_loss_clip": 0.01081394, "auxiliary_loss_mlp": 0.01029342, "balance_loss_clip": 1.03361082, "balance_loss_mlp": 1.01776123, "epoch": 0.8975199158274463, "flos": 38071918131840.0, "grad_norm": 1.6317289116194253, "language_loss": 0.66483474, "learning_rate": 1.0909884385341994e-07, "loss": 0.68594205, "num_input_tokens_seen": 321961135, "step": 14928, "time_per_iteration": 2.778822898864746 }, { "auxiliary_loss_clip": 0.01087431, "auxiliary_loss_mlp": 0.01043461, "balance_loss_clip": 1.03568482, "balance_loss_mlp": 1.02808905, "epoch": 0.8975800390801142, "flos": 25411683617280.0, "grad_norm": 5.313736387639944, "language_loss": 0.70643723, "learning_rate": 1.0897200650044602e-07, "loss": 0.72774613, "num_input_tokens_seen": 321980945, "step": 14929, "time_per_iteration": 2.7232232093811035 }, { "auxiliary_loss_clip": 0.01089831, "auxiliary_loss_mlp": 0.01031432, "balance_loss_clip": 1.03910744, "balance_loss_mlp": 1.01977956, "epoch": 0.8976401623327822, "flos": 21759604202880.0, "grad_norm": 1.7936229016193426, "language_loss": 0.68214059, "learning_rate": 1.0884524085544256e-07, "loss": 0.70335329, "num_input_tokens_seen": 322000350, "step": 14930, "time_per_iteration": 4.204017162322998 }, { "auxiliary_loss_clip": 0.01078251, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.03327012, "balance_loss_mlp": 1.01617682, "epoch": 0.8977002855854501, "flos": 13845323934720.0, "grad_norm": 3.3144307660697994, "language_loss": 0.74537098, "learning_rate": 1.0871854692321769e-07, "loss": 0.76643896, "num_input_tokens_seen": 322018980, "step": 14931, "time_per_iteration": 4.21280837059021 }, { "auxiliary_loss_clip": 0.01098516, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.0380764, "balance_loss_mlp": 1.01730156, "epoch": 0.8977604088381181, "flos": 19427529692160.0, "grad_norm": 1.8135529971721605, "language_loss": 0.62872756, "learning_rate": 1.0859192470857492e-07, "loss": 0.6500026, "num_input_tokens_seen": 322037675, "step": 14932, "time_per_iteration": 2.5633347034454346 }, { "auxiliary_loss_clip": 0.01091207, "auxiliary_loss_mlp": 0.01028215, "balance_loss_clip": 1.03397417, "balance_loss_mlp": 1.01719403, "epoch": 0.8978205320907862, "flos": 22742083981440.0, "grad_norm": 1.6627829242988799, "language_loss": 0.7173481, "learning_rate": 1.0846537421631552e-07, "loss": 0.73854238, "num_input_tokens_seen": 322055130, "step": 14933, "time_per_iteration": 5.648598909378052 }, { "auxiliary_loss_clip": 0.01061803, "auxiliary_loss_mlp": 0.01036521, "balance_loss_clip": 1.03099751, "balance_loss_mlp": 1.02254987, "epoch": 0.8978806553434541, "flos": 21360529123200.0, "grad_norm": 1.8892940748793305, "language_loss": 0.74708331, "learning_rate": 1.0833889545123898e-07, "loss": 0.76806653, "num_input_tokens_seen": 322074850, "step": 14934, "time_per_iteration": 2.7452452182769775 }, { "auxiliary_loss_clip": 0.01063828, "auxiliary_loss_mlp": 0.01038833, "balance_loss_clip": 1.03115392, "balance_loss_mlp": 1.02425992, "epoch": 0.8979407785961221, "flos": 20924178704640.0, "grad_norm": 1.7395229013410125, "language_loss": 0.60459125, "learning_rate": 1.0821248841814123e-07, "loss": 0.62561786, "num_input_tokens_seen": 322093315, "step": 14935, "time_per_iteration": 2.6802937984466553 }, { "auxiliary_loss_clip": 0.01067049, "auxiliary_loss_mlp": 0.0102824, "balance_loss_clip": 1.03403175, "balance_loss_mlp": 1.01568127, "epoch": 0.89800090184879, "flos": 25228934196480.0, "grad_norm": 2.3833073137139773, "language_loss": 0.76938522, "learning_rate": 1.0808615312181512e-07, "loss": 0.79033804, "num_input_tokens_seen": 322112555, "step": 14936, "time_per_iteration": 2.6882402896881104 }, { "auxiliary_loss_clip": 0.01084705, "auxiliary_loss_mlp": 0.01033293, "balance_loss_clip": 1.0342505, "balance_loss_mlp": 1.02111554, "epoch": 0.898061025101458, "flos": 22562674525440.0, "grad_norm": 1.7261485222993433, "language_loss": 0.74040693, "learning_rate": 1.0795988956705193e-07, "loss": 0.7615869, "num_input_tokens_seen": 322130440, "step": 14937, "time_per_iteration": 2.6710762977600098 }, { "auxiliary_loss_clip": 0.01000999, "auxiliary_loss_mlp": 0.00999075, "balance_loss_clip": 1.00671172, "balance_loss_mlp": 0.9980852, "epoch": 0.8981211483541259, "flos": 56192551384320.0, "grad_norm": 0.843865572085313, "language_loss": 0.63512671, "learning_rate": 1.0783369775863915e-07, "loss": 0.65512741, "num_input_tokens_seen": 322187295, "step": 14938, "time_per_iteration": 3.0942494869232178 }, { "auxiliary_loss_clip": 0.01085887, "auxiliary_loss_mlp": 0.01026506, "balance_loss_clip": 1.03574538, "balance_loss_mlp": 1.0140903, "epoch": 0.898181271606794, "flos": 16392718523520.0, "grad_norm": 2.1860479541490268, "language_loss": 0.79759568, "learning_rate": 1.0770757770136251e-07, "loss": 0.81871951, "num_input_tokens_seen": 322202965, "step": 14939, "time_per_iteration": 2.663742780685425 }, { "auxiliary_loss_clip": 0.01000054, "auxiliary_loss_mlp": 0.01000102, "balance_loss_clip": 1.00716364, "balance_loss_mlp": 0.99917239, "epoch": 0.8982413948594619, "flos": 63440259989760.0, "grad_norm": 0.7229819252676494, "language_loss": 0.52847624, "learning_rate": 1.0758152940000375e-07, "loss": 0.54847777, "num_input_tokens_seen": 322269490, "step": 14940, "time_per_iteration": 3.3590850830078125 }, { "auxiliary_loss_clip": 0.01109001, "auxiliary_loss_mlp": 0.01032379, "balance_loss_clip": 1.03646505, "balance_loss_mlp": 1.01890242, "epoch": 0.8983015181121299, "flos": 21835340029440.0, "grad_norm": 1.950102596930943, "language_loss": 0.77829498, "learning_rate": 1.0745555285934327e-07, "loss": 0.79970872, "num_input_tokens_seen": 322288060, "step": 14941, "time_per_iteration": 2.744305372238159 }, { "auxiliary_loss_clip": 0.01098003, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.0353359, "balance_loss_mlp": 1.02308095, "epoch": 0.8983616413647978, "flos": 28949961767040.0, "grad_norm": 2.416025895078288, "language_loss": 0.73365378, "learning_rate": 1.0732964808415834e-07, "loss": 0.75499552, "num_input_tokens_seen": 322307930, "step": 14942, "time_per_iteration": 2.754950523376465 }, { "auxiliary_loss_clip": 0.01087926, "auxiliary_loss_mlp": 0.01039089, "balance_loss_clip": 1.03435743, "balance_loss_mlp": 1.0259459, "epoch": 0.8984217646174658, "flos": 17785083375360.0, "grad_norm": 3.391273382759864, "language_loss": 0.79918504, "learning_rate": 1.0720381507922205e-07, "loss": 0.82045519, "num_input_tokens_seen": 322326155, "step": 14943, "time_per_iteration": 2.7248191833496094 }, { "auxiliary_loss_clip": 0.01085525, "auxiliary_loss_mlp": 0.01032768, "balance_loss_clip": 1.0354557, "balance_loss_mlp": 1.01938701, "epoch": 0.8984818878701337, "flos": 23404528558080.0, "grad_norm": 1.5192187964233135, "language_loss": 0.71140742, "learning_rate": 1.0707805384930701e-07, "loss": 0.73259044, "num_input_tokens_seen": 322345850, "step": 14944, "time_per_iteration": 2.6967763900756836 }, { "auxiliary_loss_clip": 0.01069595, "auxiliary_loss_mlp": 0.01033426, "balance_loss_clip": 1.03214753, "balance_loss_mlp": 1.01949, "epoch": 0.8985420111228017, "flos": 22346061557760.0, "grad_norm": 2.216725804590017, "language_loss": 0.76311302, "learning_rate": 1.0695236439918187e-07, "loss": 0.78414327, "num_input_tokens_seen": 322364715, "step": 14945, "time_per_iteration": 2.6679043769836426 }, { "auxiliary_loss_clip": 0.01114813, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.03778219, "balance_loss_mlp": 1.01778316, "epoch": 0.8986021343754698, "flos": 21392776558080.0, "grad_norm": 2.148577693611771, "language_loss": 0.73464406, "learning_rate": 1.0682674673361302e-07, "loss": 0.75610334, "num_input_tokens_seen": 322383570, "step": 14946, "time_per_iteration": 2.5922229290008545 }, { "auxiliary_loss_clip": 0.0105656, "auxiliary_loss_mlp": 0.01032137, "balance_loss_clip": 1.03178859, "balance_loss_mlp": 1.01898205, "epoch": 0.8986622576281377, "flos": 21325372686720.0, "grad_norm": 1.8845669239623069, "language_loss": 0.64757031, "learning_rate": 1.0670120085736334e-07, "loss": 0.66845727, "num_input_tokens_seen": 322401375, "step": 14947, "time_per_iteration": 2.7270290851593018 }, { "auxiliary_loss_clip": 0.01087566, "auxiliary_loss_mlp": 0.01034127, "balance_loss_clip": 1.03707767, "balance_loss_mlp": 1.02179492, "epoch": 0.8987223808808057, "flos": 23988292392960.0, "grad_norm": 2.018840894039702, "language_loss": 0.70409435, "learning_rate": 1.0657572677519411e-07, "loss": 0.72531128, "num_input_tokens_seen": 322421890, "step": 14948, "time_per_iteration": 2.712301254272461 }, { "auxiliary_loss_clip": 0.01076508, "auxiliary_loss_mlp": 0.01028981, "balance_loss_clip": 1.03520036, "balance_loss_mlp": 1.01646447, "epoch": 0.8987825041334736, "flos": 41500956044160.0, "grad_norm": 1.7555603952219132, "language_loss": 0.7477864, "learning_rate": 1.0645032449186309e-07, "loss": 0.76884139, "num_input_tokens_seen": 322445730, "step": 14949, "time_per_iteration": 2.8739330768585205 }, { "auxiliary_loss_clip": 0.01067975, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.03525853, "balance_loss_mlp": 1.01840615, "epoch": 0.8988426273861416, "flos": 27564276844800.0, "grad_norm": 1.6443346508458696, "language_loss": 0.75822496, "learning_rate": 1.0632499401212513e-07, "loss": 0.77922982, "num_input_tokens_seen": 322464595, "step": 14950, "time_per_iteration": 2.801135301589966 }, { "auxiliary_loss_clip": 0.01082227, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.03504586, "balance_loss_mlp": 1.01929891, "epoch": 0.8989027506388095, "flos": 17092653920640.0, "grad_norm": 1.910819021087814, "language_loss": 0.66423386, "learning_rate": 1.0619973534073334e-07, "loss": 0.68537182, "num_input_tokens_seen": 322483305, "step": 14951, "time_per_iteration": 2.722646951675415 }, { "auxiliary_loss_clip": 0.01110481, "auxiliary_loss_mlp": 0.01030196, "balance_loss_clip": 1.03482461, "balance_loss_mlp": 1.01835823, "epoch": 0.8989628738914776, "flos": 20555124416640.0, "grad_norm": 1.9405005215432696, "language_loss": 0.73878247, "learning_rate": 1.0607454848243769e-07, "loss": 0.76018929, "num_input_tokens_seen": 322501905, "step": 14952, "time_per_iteration": 2.638542413711548 }, { "auxiliary_loss_clip": 0.01108749, "auxiliary_loss_mlp": 0.01033676, "balance_loss_clip": 1.03708589, "balance_loss_mlp": 1.02110505, "epoch": 0.8990229971441455, "flos": 16251087196800.0, "grad_norm": 2.2604855154768595, "language_loss": 0.56825626, "learning_rate": 1.0594943344198481e-07, "loss": 0.58968055, "num_input_tokens_seen": 322518135, "step": 14953, "time_per_iteration": 2.674570083618164 }, { "auxiliary_loss_clip": 0.01083928, "auxiliary_loss_mlp": 0.01033798, "balance_loss_clip": 1.03378558, "balance_loss_mlp": 1.021281, "epoch": 0.8990831203968135, "flos": 21981316901760.0, "grad_norm": 2.264909455658383, "language_loss": 0.82036901, "learning_rate": 1.0582439022411915e-07, "loss": 0.8415463, "num_input_tokens_seen": 322537905, "step": 14954, "time_per_iteration": 2.6860923767089844 }, { "auxiliary_loss_clip": 0.01107036, "auxiliary_loss_mlp": 0.01032255, "balance_loss_clip": 1.0373497, "balance_loss_mlp": 1.01929116, "epoch": 0.8991432436494814, "flos": 27447171528960.0, "grad_norm": 1.822158313950773, "language_loss": 0.59985012, "learning_rate": 1.0569941883358224e-07, "loss": 0.621243, "num_input_tokens_seen": 322557945, "step": 14955, "time_per_iteration": 2.645461082458496 }, { "auxiliary_loss_clip": 0.01097918, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.03783774, "balance_loss_mlp": 1.02125764, "epoch": 0.8992033669021494, "flos": 21579835610880.0, "grad_norm": 2.0084560499241486, "language_loss": 0.54700983, "learning_rate": 1.0557451927511341e-07, "loss": 0.56831801, "num_input_tokens_seen": 322575765, "step": 14956, "time_per_iteration": 2.6565489768981934 }, { "auxiliary_loss_clip": 0.01063944, "auxiliary_loss_mlp": 0.01036733, "balance_loss_clip": 1.03451157, "balance_loss_mlp": 1.02390623, "epoch": 0.8992634901548173, "flos": 28584211530240.0, "grad_norm": 1.8401685244545993, "language_loss": 0.79821646, "learning_rate": 1.0544969155344863e-07, "loss": 0.81922328, "num_input_tokens_seen": 322595665, "step": 14957, "time_per_iteration": 2.797804117202759 }, { "auxiliary_loss_clip": 0.0111253, "auxiliary_loss_mlp": 0.01031569, "balance_loss_clip": 1.03749204, "balance_loss_mlp": 1.01827729, "epoch": 0.8993236134074853, "flos": 19867435557120.0, "grad_norm": 2.4531476671988663, "language_loss": 0.78357041, "learning_rate": 1.0532493567332123e-07, "loss": 0.80501139, "num_input_tokens_seen": 322614755, "step": 14958, "time_per_iteration": 2.6688661575317383 }, { "auxiliary_loss_clip": 0.0104078, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.03928471, "balance_loss_mlp": 1.01852262, "epoch": 0.8993837366601534, "flos": 19390649402880.0, "grad_norm": 2.9878038930671362, "language_loss": 0.74742228, "learning_rate": 1.0520025163946277e-07, "loss": 0.76813352, "num_input_tokens_seen": 322633425, "step": 14959, "time_per_iteration": 2.8125593662261963 }, { "auxiliary_loss_clip": 0.01103112, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.03359628, "balance_loss_mlp": 1.01983559, "epoch": 0.8994438599128213, "flos": 18551740285440.0, "grad_norm": 2.0035831193239684, "language_loss": 0.68275356, "learning_rate": 1.0507563945660015e-07, "loss": 0.70410562, "num_input_tokens_seen": 322652065, "step": 14960, "time_per_iteration": 2.5540730953216553 }, { "auxiliary_loss_clip": 0.01084725, "auxiliary_loss_mlp": 0.010279, "balance_loss_clip": 1.03622437, "balance_loss_mlp": 1.01650357, "epoch": 0.8995039831654893, "flos": 24427587726720.0, "grad_norm": 1.431119232973545, "language_loss": 0.65543896, "learning_rate": 1.049510991294591e-07, "loss": 0.67656523, "num_input_tokens_seen": 322673275, "step": 14961, "time_per_iteration": 2.7903378009796143 }, { "auxiliary_loss_clip": 0.01084623, "auxiliary_loss_mlp": 0.01027938, "balance_loss_clip": 1.03403842, "balance_loss_mlp": 1.01648808, "epoch": 0.8995641064181572, "flos": 21251324799360.0, "grad_norm": 1.6157800679699814, "language_loss": 0.83261824, "learning_rate": 1.0482663066276254e-07, "loss": 0.85374379, "num_input_tokens_seen": 322693375, "step": 14962, "time_per_iteration": 2.640796661376953 }, { "auxiliary_loss_clip": 0.01090281, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.0377152, "balance_loss_mlp": 1.01667809, "epoch": 0.8996242296708252, "flos": 23513661054720.0, "grad_norm": 2.0695727885892095, "language_loss": 0.76674181, "learning_rate": 1.047022340612298e-07, "loss": 0.7879492, "num_input_tokens_seen": 322712615, "step": 14963, "time_per_iteration": 2.6461212635040283 }, { "auxiliary_loss_clip": 0.00991703, "auxiliary_loss_mlp": 0.01005224, "balance_loss_clip": 1.01595902, "balance_loss_mlp": 1.00418079, "epoch": 0.8996843529234931, "flos": 62403230430720.0, "grad_norm": 0.7797202356654998, "language_loss": 0.57483667, "learning_rate": 1.0457790932957867e-07, "loss": 0.59480596, "num_input_tokens_seen": 322766855, "step": 14964, "time_per_iteration": 3.1848866939544678 }, { "auxiliary_loss_clip": 0.0110498, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.03950953, "balance_loss_mlp": 1.02064347, "epoch": 0.8997444761761612, "flos": 24236829573120.0, "grad_norm": 3.314723962162985, "language_loss": 0.6772269, "learning_rate": 1.0445365647252269e-07, "loss": 0.69861603, "num_input_tokens_seen": 322781130, "step": 14965, "time_per_iteration": 2.6162235736846924 }, { "auxiliary_loss_clip": 0.01110984, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.03775227, "balance_loss_mlp": 1.01822209, "epoch": 0.8998045994288291, "flos": 21361103740800.0, "grad_norm": 2.9087273995519136, "language_loss": 0.71626663, "learning_rate": 1.0432947549477433e-07, "loss": 0.73767936, "num_input_tokens_seen": 322800310, "step": 14966, "time_per_iteration": 2.5625483989715576 }, { "auxiliary_loss_clip": 0.01076915, "auxiliary_loss_mlp": 0.0103174, "balance_loss_clip": 1.0351249, "balance_loss_mlp": 1.01843047, "epoch": 0.8998647226814971, "flos": 28986159697920.0, "grad_norm": 1.8489899153137084, "language_loss": 0.73536384, "learning_rate": 1.0420536640104205e-07, "loss": 0.75645041, "num_input_tokens_seen": 322820955, "step": 14967, "time_per_iteration": 2.785755157470703 }, { "auxiliary_loss_clip": 0.01064386, "auxiliary_loss_mlp": 0.00770622, "balance_loss_clip": 1.03535485, "balance_loss_mlp": 1.00016153, "epoch": 0.899924845934165, "flos": 13625909706240.0, "grad_norm": 1.9571169995533768, "language_loss": 0.72163457, "learning_rate": 1.040813291960323e-07, "loss": 0.73998475, "num_input_tokens_seen": 322838780, "step": 14968, "time_per_iteration": 2.7936058044433594 }, { "auxiliary_loss_clip": 0.01093703, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.03627658, "balance_loss_mlp": 1.02080774, "epoch": 0.899984969186833, "flos": 20882629647360.0, "grad_norm": 1.942509479538182, "language_loss": 0.71323812, "learning_rate": 1.0395736388444864e-07, "loss": 0.73450446, "num_input_tokens_seen": 322856710, "step": 14969, "time_per_iteration": 4.1407389640808105 }, { "auxiliary_loss_clip": 0.01111967, "auxiliary_loss_mlp": 0.01031075, "balance_loss_clip": 1.039024, "balance_loss_mlp": 1.01810515, "epoch": 0.9000450924395009, "flos": 20921808407040.0, "grad_norm": 1.978725901368175, "language_loss": 0.75983673, "learning_rate": 1.0383347047099201e-07, "loss": 0.78126717, "num_input_tokens_seen": 322876070, "step": 14970, "time_per_iteration": 4.195037603378296 }, { "auxiliary_loss_clip": 0.01101891, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.036654, "balance_loss_mlp": 1.0206151, "epoch": 0.900105215692169, "flos": 17165049782400.0, "grad_norm": 1.6764087084105503, "language_loss": 0.73020303, "learning_rate": 1.0370964896035972e-07, "loss": 0.75154805, "num_input_tokens_seen": 322895095, "step": 14971, "time_per_iteration": 2.5875184535980225 }, { "auxiliary_loss_clip": 0.0107201, "auxiliary_loss_mlp": 0.01031462, "balance_loss_clip": 1.03537893, "balance_loss_mlp": 1.01815248, "epoch": 0.900165338944837, "flos": 19931930426880.0, "grad_norm": 2.0581551062194703, "language_loss": 0.8157441, "learning_rate": 1.035858993572476e-07, "loss": 0.83677888, "num_input_tokens_seen": 322911845, "step": 14972, "time_per_iteration": 4.170926094055176 }, { "auxiliary_loss_clip": 0.01080845, "auxiliary_loss_mlp": 0.01030543, "balance_loss_clip": 1.03386259, "balance_loss_mlp": 1.01756763, "epoch": 0.9002254621975049, "flos": 16107085572480.0, "grad_norm": 5.44647111318727, "language_loss": 0.8157503, "learning_rate": 1.0346222166634855e-07, "loss": 0.83686423, "num_input_tokens_seen": 322928170, "step": 14973, "time_per_iteration": 4.245764493942261 }, { "auxiliary_loss_clip": 0.01108859, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.03745437, "balance_loss_mlp": 1.02064097, "epoch": 0.9002855854501729, "flos": 28476120528000.0, "grad_norm": 1.8931986793937958, "language_loss": 0.58183479, "learning_rate": 1.0333861589235193e-07, "loss": 0.60326004, "num_input_tokens_seen": 322948165, "step": 14974, "time_per_iteration": 2.6841914653778076 }, { "auxiliary_loss_clip": 0.01112242, "auxiliary_loss_mlp": 0.01034905, "balance_loss_clip": 1.04007757, "balance_loss_mlp": 1.02229297, "epoch": 0.9003457087028408, "flos": 25630307746560.0, "grad_norm": 1.7431363980937327, "language_loss": 0.63522345, "learning_rate": 1.0321508203994489e-07, "loss": 0.65669495, "num_input_tokens_seen": 322968880, "step": 14975, "time_per_iteration": 2.620419979095459 }, { "auxiliary_loss_clip": 0.01098045, "auxiliary_loss_mlp": 0.01032816, "balance_loss_clip": 1.03662229, "balance_loss_mlp": 1.02011395, "epoch": 0.9004058319555088, "flos": 24389414547840.0, "grad_norm": 1.7931871131687724, "language_loss": 0.73011506, "learning_rate": 1.0309162011381257e-07, "loss": 0.75142372, "num_input_tokens_seen": 322989395, "step": 14976, "time_per_iteration": 2.6519412994384766 }, { "auxiliary_loss_clip": 0.01092647, "auxiliary_loss_mlp": 0.01031748, "balance_loss_clip": 1.03749645, "balance_loss_mlp": 1.01954126, "epoch": 0.9004659552081767, "flos": 29059345658880.0, "grad_norm": 1.7534579820157172, "language_loss": 0.69598532, "learning_rate": 1.0296823011863565e-07, "loss": 0.71722925, "num_input_tokens_seen": 323009060, "step": 14977, "time_per_iteration": 2.6647446155548096 }, { "auxiliary_loss_clip": 0.01082206, "auxiliary_loss_mlp": 0.00771481, "balance_loss_clip": 1.03483725, "balance_loss_mlp": 1.00023878, "epoch": 0.9005260784608448, "flos": 16763855800320.0, "grad_norm": 2.4242637443808603, "language_loss": 0.65483779, "learning_rate": 1.0284491205909351e-07, "loss": 0.67337465, "num_input_tokens_seen": 323027530, "step": 14978, "time_per_iteration": 2.6061410903930664 }, { "auxiliary_loss_clip": 0.01078235, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.03480434, "balance_loss_mlp": 1.02505875, "epoch": 0.9005862017135127, "flos": 20376002269440.0, "grad_norm": 1.675464257364332, "language_loss": 0.78981739, "learning_rate": 1.0272166593986286e-07, "loss": 0.81098592, "num_input_tokens_seen": 323045370, "step": 14979, "time_per_iteration": 2.6818509101867676 }, { "auxiliary_loss_clip": 0.01008335, "auxiliary_loss_mlp": 0.01001784, "balance_loss_clip": 1.00541806, "balance_loss_mlp": 1.00071073, "epoch": 0.9006463249661807, "flos": 67580255796480.0, "grad_norm": 0.7182102286721572, "language_loss": 0.535707, "learning_rate": 1.0259849176561642e-07, "loss": 0.55580819, "num_input_tokens_seen": 323105660, "step": 14980, "time_per_iteration": 3.2093987464904785 }, { "auxiliary_loss_clip": 0.01103101, "auxiliary_loss_mlp": 0.01041025, "balance_loss_clip": 1.03967285, "balance_loss_mlp": 1.0275898, "epoch": 0.9007064482188486, "flos": 28293335193600.0, "grad_norm": 1.66637606590706, "language_loss": 0.82372773, "learning_rate": 1.0247538954102553e-07, "loss": 0.84516907, "num_input_tokens_seen": 323126365, "step": 14981, "time_per_iteration": 2.650113582611084 }, { "auxiliary_loss_clip": 0.01066706, "auxiliary_loss_mlp": 0.01032966, "balance_loss_clip": 1.03680325, "balance_loss_mlp": 1.02073503, "epoch": 0.9007665714715166, "flos": 21616320850560.0, "grad_norm": 2.004462394579591, "language_loss": 0.81781876, "learning_rate": 1.0235235927075758e-07, "loss": 0.83881551, "num_input_tokens_seen": 323145655, "step": 14982, "time_per_iteration": 2.7423040866851807 }, { "auxiliary_loss_clip": 0.01075107, "auxiliary_loss_mlp": 0.01040244, "balance_loss_clip": 1.03167033, "balance_loss_mlp": 1.0271908, "epoch": 0.9008266947241845, "flos": 26541864120960.0, "grad_norm": 1.9584785414964334, "language_loss": 0.71540499, "learning_rate": 1.0222940095947885e-07, "loss": 0.73655844, "num_input_tokens_seen": 323164540, "step": 14983, "time_per_iteration": 2.7024779319763184 }, { "auxiliary_loss_clip": 0.01097308, "auxiliary_loss_mlp": 0.0102736, "balance_loss_clip": 1.03790069, "balance_loss_mlp": 1.01611233, "epoch": 0.9008868179768525, "flos": 23110527738240.0, "grad_norm": 1.3086245920828656, "language_loss": 0.74951446, "learning_rate": 1.0210651461185115e-07, "loss": 0.77076113, "num_input_tokens_seen": 323186960, "step": 14984, "time_per_iteration": 2.813418388366699 }, { "auxiliary_loss_clip": 0.01104396, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.03461576, "balance_loss_mlp": 1.02068746, "epoch": 0.9009469412295206, "flos": 19060809788160.0, "grad_norm": 1.7050072282727156, "language_loss": 0.70293552, "learning_rate": 1.0198370023253456e-07, "loss": 0.72430742, "num_input_tokens_seen": 323206135, "step": 14985, "time_per_iteration": 2.767937183380127 }, { "auxiliary_loss_clip": 0.01087695, "auxiliary_loss_mlp": 0.01033923, "balance_loss_clip": 1.03466606, "balance_loss_mlp": 1.02110791, "epoch": 0.9010070644821885, "flos": 23222281927680.0, "grad_norm": 2.1137974022402575, "language_loss": 0.70276654, "learning_rate": 1.0186095782618643e-07, "loss": 0.72398281, "num_input_tokens_seen": 323225980, "step": 14986, "time_per_iteration": 2.7246689796447754 }, { "auxiliary_loss_clip": 0.01096893, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.03352499, "balance_loss_mlp": 1.01991105, "epoch": 0.9010671877348565, "flos": 17384823146880.0, "grad_norm": 1.658677803041605, "language_loss": 0.76861989, "learning_rate": 1.0173828739746104e-07, "loss": 0.78991318, "num_input_tokens_seen": 323243700, "step": 14987, "time_per_iteration": 2.5764570236206055 }, { "auxiliary_loss_clip": 0.01092941, "auxiliary_loss_mlp": 0.0103351, "balance_loss_clip": 1.03674459, "balance_loss_mlp": 1.02103519, "epoch": 0.9011273109875244, "flos": 21908166854400.0, "grad_norm": 1.956004475384015, "language_loss": 0.73540664, "learning_rate": 1.0161568895100981e-07, "loss": 0.75667119, "num_input_tokens_seen": 323261535, "step": 14988, "time_per_iteration": 2.558128595352173 }, { "auxiliary_loss_clip": 0.01086646, "auxiliary_loss_mlp": 0.01032746, "balance_loss_clip": 1.03845191, "balance_loss_mlp": 1.01903129, "epoch": 0.9011874342401924, "flos": 24060831909120.0, "grad_norm": 1.6803307482650078, "language_loss": 0.69135392, "learning_rate": 1.0149316249148188e-07, "loss": 0.7125479, "num_input_tokens_seen": 323281855, "step": 14989, "time_per_iteration": 2.650520086288452 }, { "auxiliary_loss_clip": 0.01109667, "auxiliary_loss_mlp": 0.0102853, "balance_loss_clip": 1.03716099, "balance_loss_mlp": 1.01638818, "epoch": 0.9012475574928603, "flos": 16758791982720.0, "grad_norm": 1.8703364751087568, "language_loss": 0.79935807, "learning_rate": 1.0137070802352376e-07, "loss": 0.8207401, "num_input_tokens_seen": 323299505, "step": 14990, "time_per_iteration": 2.5482540130615234 }, { "auxiliary_loss_clip": 0.0107379, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.03743267, "balance_loss_mlp": 1.01825941, "epoch": 0.9013076807455284, "flos": 19971109186560.0, "grad_norm": 1.999297573895168, "language_loss": 0.78150022, "learning_rate": 1.0124832555177842e-07, "loss": 0.8025521, "num_input_tokens_seen": 323318365, "step": 14991, "time_per_iteration": 2.7104129791259766 }, { "auxiliary_loss_clip": 0.00995246, "auxiliary_loss_mlp": 0.00751703, "balance_loss_clip": 1.00523067, "balance_loss_mlp": 0.9995659, "epoch": 0.9013678039981963, "flos": 65180274624000.0, "grad_norm": 0.7792478224468473, "language_loss": 0.60261661, "learning_rate": 1.0112601508088726e-07, "loss": 0.62008613, "num_input_tokens_seen": 323371835, "step": 14992, "time_per_iteration": 3.123297691345215 }, { "auxiliary_loss_clip": 0.0109359, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.03499448, "balance_loss_mlp": 1.01605463, "epoch": 0.9014279272508643, "flos": 20521224956160.0, "grad_norm": 2.279288260696507, "language_loss": 0.82825989, "learning_rate": 1.0100377661548764e-07, "loss": 0.84947795, "num_input_tokens_seen": 323388495, "step": 14993, "time_per_iteration": 2.574572801589966 }, { "auxiliary_loss_clip": 0.01107431, "auxiliary_loss_mlp": 0.01034277, "balance_loss_clip": 1.03556728, "balance_loss_mlp": 1.02142596, "epoch": 0.9014880505035322, "flos": 17309051406720.0, "grad_norm": 2.2716926772447286, "language_loss": 0.73481464, "learning_rate": 1.0088161016021502e-07, "loss": 0.75623167, "num_input_tokens_seen": 323405280, "step": 14994, "time_per_iteration": 2.538275957107544 }, { "auxiliary_loss_clip": 0.01093439, "auxiliary_loss_mlp": 0.01026115, "balance_loss_clip": 1.03458691, "balance_loss_mlp": 1.01476073, "epoch": 0.9015481737562002, "flos": 28402862739840.0, "grad_norm": 1.830419215860498, "language_loss": 0.64486498, "learning_rate": 1.0075951571970187e-07, "loss": 0.66606051, "num_input_tokens_seen": 323425310, "step": 14995, "time_per_iteration": 2.623666286468506 }, { "auxiliary_loss_clip": 0.01069201, "auxiliary_loss_mlp": 0.01033584, "balance_loss_clip": 1.03037524, "balance_loss_mlp": 1.01956463, "epoch": 0.9016082970088681, "flos": 29752672953600.0, "grad_norm": 1.8771395079815063, "language_loss": 0.66334212, "learning_rate": 1.0063749329857873e-07, "loss": 0.68436992, "num_input_tokens_seen": 323447805, "step": 14996, "time_per_iteration": 2.781064510345459 }, { "auxiliary_loss_clip": 0.01095585, "auxiliary_loss_mlp": 0.01028339, "balance_loss_clip": 1.03407955, "balance_loss_mlp": 1.01678181, "epoch": 0.9016684202615362, "flos": 23513230091520.0, "grad_norm": 1.8246548425287972, "language_loss": 0.66247928, "learning_rate": 1.0051554290147168e-07, "loss": 0.68371856, "num_input_tokens_seen": 323467150, "step": 14997, "time_per_iteration": 2.71907114982605 }, { "auxiliary_loss_clip": 0.01080625, "auxiliary_loss_mlp": 0.01037629, "balance_loss_clip": 1.03254914, "balance_loss_mlp": 1.02483201, "epoch": 0.9017285435142042, "flos": 16979247705600.0, "grad_norm": 1.8261489433850353, "language_loss": 0.77650619, "learning_rate": 1.0039366453300613e-07, "loss": 0.79768866, "num_input_tokens_seen": 323484250, "step": 14998, "time_per_iteration": 2.6528589725494385 }, { "auxiliary_loss_clip": 0.0110937, "auxiliary_loss_mlp": 0.01029624, "balance_loss_clip": 1.03644896, "balance_loss_mlp": 1.0174706, "epoch": 0.9017886667668721, "flos": 21393351175680.0, "grad_norm": 1.6740643670552307, "language_loss": 0.75225437, "learning_rate": 1.0027185819780281e-07, "loss": 0.77364427, "num_input_tokens_seen": 323502910, "step": 14999, "time_per_iteration": 2.5951831340789795 }, { "auxiliary_loss_clip": 0.01045283, "auxiliary_loss_mlp": 0.01030998, "balance_loss_clip": 1.03599596, "balance_loss_mlp": 1.01742625, "epoch": 0.9018487900195401, "flos": 20996574566400.0, "grad_norm": 8.557112820549731, "language_loss": 0.75833976, "learning_rate": 1.0015012390048117e-07, "loss": 0.77910256, "num_input_tokens_seen": 323521820, "step": 15000, "time_per_iteration": 2.7700390815734863 }, { "auxiliary_loss_clip": 0.01090367, "auxiliary_loss_mlp": 0.01028007, "balance_loss_clip": 1.03579473, "balance_loss_mlp": 1.01666379, "epoch": 0.901908913272208, "flos": 53358443458560.0, "grad_norm": 2.1350900173970153, "language_loss": 0.80694187, "learning_rate": 1.0002846164565704e-07, "loss": 0.8281256, "num_input_tokens_seen": 323543200, "step": 15001, "time_per_iteration": 2.914686918258667 }, { "auxiliary_loss_clip": 0.01076218, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.03696334, "balance_loss_mlp": 1.01640391, "epoch": 0.901969036524876, "flos": 22089838867200.0, "grad_norm": 1.603585496644282, "language_loss": 0.78372264, "learning_rate": 9.990687143794407e-08, "loss": 0.80476493, "num_input_tokens_seen": 323563075, "step": 15002, "time_per_iteration": 2.7641050815582275 }, { "auxiliary_loss_clip": 0.01082464, "auxiliary_loss_mlp": 0.01045075, "balance_loss_clip": 1.03580308, "balance_loss_mlp": 1.03049612, "epoch": 0.9020291597775439, "flos": 23835025059840.0, "grad_norm": 2.0404977120917147, "language_loss": 0.68748105, "learning_rate": 9.978535328195347e-08, "loss": 0.70875645, "num_input_tokens_seen": 323579065, "step": 15003, "time_per_iteration": 2.771782875061035 }, { "auxiliary_loss_clip": 0.01085032, "auxiliary_loss_mlp": 0.0103817, "balance_loss_clip": 1.03330088, "balance_loss_mlp": 1.02506328, "epoch": 0.902089283030212, "flos": 18326005263360.0, "grad_norm": 1.8107770462670902, "language_loss": 0.85949785, "learning_rate": 9.9663907182292e-08, "loss": 0.88072991, "num_input_tokens_seen": 323594835, "step": 15004, "time_per_iteration": 2.666977882385254 }, { "auxiliary_loss_clip": 0.0107511, "auxiliary_loss_mlp": 0.01035534, "balance_loss_clip": 1.03480256, "balance_loss_mlp": 1.02221882, "epoch": 0.9021494062828799, "flos": 24170359455360.0, "grad_norm": 2.562101889878063, "language_loss": 0.71954483, "learning_rate": 9.954253314356575e-08, "loss": 0.74065125, "num_input_tokens_seen": 323611475, "step": 15005, "time_per_iteration": 2.759964942932129 }, { "auxiliary_loss_clip": 0.01100393, "auxiliary_loss_mlp": 0.01030904, "balance_loss_clip": 1.03423667, "balance_loss_mlp": 1.01778543, "epoch": 0.9022095295355479, "flos": 21616859554560.0, "grad_norm": 1.9300274914184496, "language_loss": 0.70556152, "learning_rate": 9.942123117037748e-08, "loss": 0.72687459, "num_input_tokens_seen": 323629730, "step": 15006, "time_per_iteration": 2.6384735107421875 }, { "auxiliary_loss_clip": 0.01086555, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.03485382, "balance_loss_mlp": 1.01679754, "epoch": 0.9022696527882158, "flos": 18726229578240.0, "grad_norm": 3.0054319156686264, "language_loss": 0.84866273, "learning_rate": 9.930000126732618e-08, "loss": 0.86981565, "num_input_tokens_seen": 323646000, "step": 15007, "time_per_iteration": 2.648921489715576 }, { "auxiliary_loss_clip": 0.01079211, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.03299296, "balance_loss_mlp": 1.01809239, "epoch": 0.9023297760408838, "flos": 26761206522240.0, "grad_norm": 1.9317011784213973, "language_loss": 0.7883476, "learning_rate": 9.917884343900928e-08, "loss": 0.80944914, "num_input_tokens_seen": 323667250, "step": 15008, "time_per_iteration": 4.242715120315552 }, { "auxiliary_loss_clip": 0.01063806, "auxiliary_loss_mlp": 0.01033642, "balance_loss_clip": 1.03497434, "balance_loss_mlp": 1.0214889, "epoch": 0.9023898992935517, "flos": 20522553759360.0, "grad_norm": 1.866627762329264, "language_loss": 0.73153245, "learning_rate": 9.905775769002156e-08, "loss": 0.75250691, "num_input_tokens_seen": 323687150, "step": 15009, "time_per_iteration": 2.691822052001953 }, { "auxiliary_loss_clip": 0.01107314, "auxiliary_loss_mlp": 0.01035667, "balance_loss_clip": 1.0361793, "balance_loss_mlp": 1.02314413, "epoch": 0.9024500225462198, "flos": 17456644391040.0, "grad_norm": 1.76387616724559, "language_loss": 0.73348868, "learning_rate": 9.893674402495399e-08, "loss": 0.75491852, "num_input_tokens_seen": 323703660, "step": 15010, "time_per_iteration": 4.291422128677368 }, { "auxiliary_loss_clip": 0.0108209, "auxiliary_loss_mlp": 0.01035862, "balance_loss_clip": 1.03634191, "balance_loss_mlp": 1.02284431, "epoch": 0.9025101457988878, "flos": 20813609664000.0, "grad_norm": 2.097335794667479, "language_loss": 0.74242449, "learning_rate": 9.881580244839538e-08, "loss": 0.76360393, "num_input_tokens_seen": 323722060, "step": 15011, "time_per_iteration": 4.15416693687439 }, { "auxiliary_loss_clip": 0.01101836, "auxiliary_loss_mlp": 0.01031616, "balance_loss_clip": 1.03616571, "balance_loss_mlp": 1.01824617, "epoch": 0.9025702690515557, "flos": 19026371623680.0, "grad_norm": 10.830412851776218, "language_loss": 0.72975504, "learning_rate": 9.869493296493204e-08, "loss": 0.75108945, "num_input_tokens_seen": 323740645, "step": 15012, "time_per_iteration": 4.172262668609619 }, { "auxiliary_loss_clip": 0.01073966, "auxiliary_loss_mlp": 0.01038584, "balance_loss_clip": 1.03479862, "balance_loss_mlp": 1.02719402, "epoch": 0.9026303923042237, "flos": 19682818629120.0, "grad_norm": 1.6805885971222159, "language_loss": 0.69541949, "learning_rate": 9.857413557914763e-08, "loss": 0.71654499, "num_input_tokens_seen": 323758905, "step": 15013, "time_per_iteration": 2.6801204681396484 }, { "auxiliary_loss_clip": 0.01092922, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.03412437, "balance_loss_mlp": 1.01987374, "epoch": 0.9026905155568916, "flos": 24608110504320.0, "grad_norm": 1.451081928504829, "language_loss": 0.73157448, "learning_rate": 9.845341029562249e-08, "loss": 0.75282216, "num_input_tokens_seen": 323780595, "step": 15014, "time_per_iteration": 2.6699087619781494 }, { "auxiliary_loss_clip": 0.01107905, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.03593612, "balance_loss_mlp": 1.01995111, "epoch": 0.9027506388095596, "flos": 20521799573760.0, "grad_norm": 1.9727005096909034, "language_loss": 0.72401255, "learning_rate": 9.833275711893474e-08, "loss": 0.74541688, "num_input_tokens_seen": 323798160, "step": 15015, "time_per_iteration": 2.536134958267212 }, { "auxiliary_loss_clip": 0.01083409, "auxiliary_loss_mlp": 0.01034606, "balance_loss_clip": 1.03356743, "balance_loss_mlp": 1.02245855, "epoch": 0.9028107620622275, "flos": 22784494965120.0, "grad_norm": 2.2967609307485213, "language_loss": 0.6894691, "learning_rate": 9.821217605365895e-08, "loss": 0.71064925, "num_input_tokens_seen": 323816810, "step": 15016, "time_per_iteration": 2.696544647216797 }, { "auxiliary_loss_clip": 0.01105993, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.03578448, "balance_loss_mlp": 1.0165025, "epoch": 0.9028708853148956, "flos": 25410534382080.0, "grad_norm": 1.8623697779544957, "language_loss": 0.7037698, "learning_rate": 9.809166710436855e-08, "loss": 0.72510922, "num_input_tokens_seen": 323836900, "step": 15017, "time_per_iteration": 2.595538377761841 }, { "auxiliary_loss_clip": 0.01086858, "auxiliary_loss_mlp": 0.01033508, "balance_loss_clip": 1.03965449, "balance_loss_mlp": 1.02197492, "epoch": 0.9029310085675635, "flos": 21871322478720.0, "grad_norm": 1.936832914018773, "language_loss": 0.69448954, "learning_rate": 9.797123027563237e-08, "loss": 0.71569324, "num_input_tokens_seen": 323855325, "step": 15018, "time_per_iteration": 2.6294448375701904 }, { "auxiliary_loss_clip": 0.01097184, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.0363183, "balance_loss_mlp": 1.01848841, "epoch": 0.9029911318202315, "flos": 26214394803840.0, "grad_norm": 2.670057075172495, "language_loss": 0.68977821, "learning_rate": 9.785086557201782e-08, "loss": 0.71105969, "num_input_tokens_seen": 323875650, "step": 15019, "time_per_iteration": 2.7993857860565186 }, { "auxiliary_loss_clip": 0.01105429, "auxiliary_loss_mlp": 0.0103295, "balance_loss_clip": 1.03574717, "balance_loss_mlp": 1.02123153, "epoch": 0.9030512550728994, "flos": 15961360095360.0, "grad_norm": 1.9111353110117102, "language_loss": 0.72140992, "learning_rate": 9.773057299808951e-08, "loss": 0.74279368, "num_input_tokens_seen": 323892920, "step": 15020, "time_per_iteration": 2.588925361633301 }, { "auxiliary_loss_clip": 0.01094641, "auxiliary_loss_mlp": 0.01030177, "balance_loss_clip": 1.03369665, "balance_loss_mlp": 1.01739788, "epoch": 0.9031113783255674, "flos": 23987610034560.0, "grad_norm": 1.5881960753658597, "language_loss": 0.74275625, "learning_rate": 9.7610352558408e-08, "loss": 0.76400447, "num_input_tokens_seen": 323913835, "step": 15021, "time_per_iteration": 2.588358163833618 }, { "auxiliary_loss_clip": 0.01112744, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.03767323, "balance_loss_mlp": 1.01803565, "epoch": 0.9031715015782353, "flos": 22237216369920.0, "grad_norm": 2.206963784071178, "language_loss": 0.7280935, "learning_rate": 9.749020425753251e-08, "loss": 0.74953449, "num_input_tokens_seen": 323933440, "step": 15022, "time_per_iteration": 2.536369562149048 }, { "auxiliary_loss_clip": 0.01068128, "auxiliary_loss_mlp": 0.01027903, "balance_loss_clip": 1.03312647, "balance_loss_mlp": 1.01603556, "epoch": 0.9032316248309034, "flos": 26323168164480.0, "grad_norm": 2.626283652398094, "language_loss": 0.72459871, "learning_rate": 9.737012810001943e-08, "loss": 0.74555898, "num_input_tokens_seen": 323954090, "step": 15023, "time_per_iteration": 2.7086663246154785 }, { "auxiliary_loss_clip": 0.01095012, "auxiliary_loss_mlp": 0.01033688, "balance_loss_clip": 1.03661966, "balance_loss_mlp": 1.02148056, "epoch": 0.9032917480835713, "flos": 22636686499200.0, "grad_norm": 1.615390594189699, "language_loss": 0.82334167, "learning_rate": 9.725012409042155e-08, "loss": 0.84462869, "num_input_tokens_seen": 323974040, "step": 15024, "time_per_iteration": 2.6185879707336426 }, { "auxiliary_loss_clip": 0.01099161, "auxiliary_loss_mlp": 0.01028549, "balance_loss_clip": 1.03624964, "balance_loss_mlp": 1.01650262, "epoch": 0.9033518713362393, "flos": 23878764846720.0, "grad_norm": 1.6458847486672181, "language_loss": 0.69518673, "learning_rate": 9.713019223328966e-08, "loss": 0.7164638, "num_input_tokens_seen": 323996125, "step": 15025, "time_per_iteration": 2.6076362133026123 }, { "auxiliary_loss_clip": 0.01073996, "auxiliary_loss_mlp": 0.01035637, "balance_loss_clip": 1.03491449, "balance_loss_mlp": 1.02332294, "epoch": 0.9034119945889073, "flos": 26905279973760.0, "grad_norm": 1.5601899591487556, "language_loss": 0.76521379, "learning_rate": 9.70103325331717e-08, "loss": 0.78631014, "num_input_tokens_seen": 324017645, "step": 15026, "time_per_iteration": 2.7674145698547363 }, { "auxiliary_loss_clip": 0.01098222, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.03840423, "balance_loss_mlp": 1.01899886, "epoch": 0.9034721178415752, "flos": 20850166730880.0, "grad_norm": 2.0222752747400192, "language_loss": 0.68377501, "learning_rate": 9.68905449946129e-08, "loss": 0.70506608, "num_input_tokens_seen": 324036875, "step": 15027, "time_per_iteration": 2.6653904914855957 }, { "auxiliary_loss_clip": 0.01052551, "auxiliary_loss_mlp": 0.01041128, "balance_loss_clip": 1.03196084, "balance_loss_mlp": 1.02769923, "epoch": 0.9035322410942432, "flos": 22234307368320.0, "grad_norm": 1.6548540409634305, "language_loss": 0.75698447, "learning_rate": 9.677082962215477e-08, "loss": 0.7779212, "num_input_tokens_seen": 324057045, "step": 15028, "time_per_iteration": 2.7179388999938965 }, { "auxiliary_loss_clip": 0.01052919, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.03367233, "balance_loss_mlp": 1.02507436, "epoch": 0.9035923643469111, "flos": 25923410726400.0, "grad_norm": 1.805593039358967, "language_loss": 0.69399357, "learning_rate": 9.665118642033765e-08, "loss": 0.71489739, "num_input_tokens_seen": 324079735, "step": 15029, "time_per_iteration": 2.813114643096924 }, { "auxiliary_loss_clip": 0.01096672, "auxiliary_loss_mlp": 0.01034607, "balance_loss_clip": 1.03852797, "balance_loss_mlp": 1.02123141, "epoch": 0.9036524875995792, "flos": 20339804338560.0, "grad_norm": 1.8345501751502649, "language_loss": 0.7369951, "learning_rate": 9.653161539369858e-08, "loss": 0.75830793, "num_input_tokens_seen": 324097785, "step": 15030, "time_per_iteration": 2.696516990661621 }, { "auxiliary_loss_clip": 0.01101797, "auxiliary_loss_mlp": 0.01030355, "balance_loss_clip": 1.03739715, "balance_loss_mlp": 1.01790965, "epoch": 0.9037126108522471, "flos": 40114624677120.0, "grad_norm": 2.451430150859209, "language_loss": 0.6831615, "learning_rate": 9.641211654677151e-08, "loss": 0.70448303, "num_input_tokens_seen": 324121625, "step": 15031, "time_per_iteration": 2.776313543319702 }, { "auxiliary_loss_clip": 0.01085756, "auxiliary_loss_mlp": 0.01028456, "balance_loss_clip": 1.03706944, "balance_loss_mlp": 1.01662993, "epoch": 0.9037727341049151, "flos": 23332024955520.0, "grad_norm": 1.492349301530935, "language_loss": 0.76186407, "learning_rate": 9.629268988408723e-08, "loss": 0.78300619, "num_input_tokens_seen": 324142535, "step": 15032, "time_per_iteration": 2.722729206085205 }, { "auxiliary_loss_clip": 0.01110023, "auxiliary_loss_mlp": 0.01033692, "balance_loss_clip": 1.03756511, "balance_loss_mlp": 1.02144957, "epoch": 0.903832857357583, "flos": 12822659815680.0, "grad_norm": 1.761761043861274, "language_loss": 0.75420368, "learning_rate": 9.617333541017502e-08, "loss": 0.77564085, "num_input_tokens_seen": 324159610, "step": 15033, "time_per_iteration": 2.6790883541107178 }, { "auxiliary_loss_clip": 0.01074569, "auxiliary_loss_mlp": 0.01038477, "balance_loss_clip": 1.03108501, "balance_loss_mlp": 1.02516127, "epoch": 0.903892980610251, "flos": 25703026830720.0, "grad_norm": 1.9648590511752269, "language_loss": 0.73957044, "learning_rate": 9.605405312956105e-08, "loss": 0.76070094, "num_input_tokens_seen": 324182510, "step": 15034, "time_per_iteration": 2.7564845085144043 }, { "auxiliary_loss_clip": 0.01076984, "auxiliary_loss_mlp": 0.01032868, "balance_loss_clip": 1.03676867, "balance_loss_mlp": 1.02031517, "epoch": 0.9039531038629189, "flos": 14684089397760.0, "grad_norm": 2.177722949634339, "language_loss": 0.6356231, "learning_rate": 9.593484304676791e-08, "loss": 0.65672159, "num_input_tokens_seen": 324200555, "step": 15035, "time_per_iteration": 2.714242935180664 }, { "auxiliary_loss_clip": 0.01109298, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 1.0378021, "balance_loss_mlp": 1.01890254, "epoch": 0.904013227115587, "flos": 24024921287040.0, "grad_norm": 2.5713675612269897, "language_loss": 0.61697221, "learning_rate": 9.581570516631643e-08, "loss": 0.63838875, "num_input_tokens_seen": 324220255, "step": 15036, "time_per_iteration": 2.6531126499176025 }, { "auxiliary_loss_clip": 0.01057116, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.03590751, "balance_loss_mlp": 1.02079058, "epoch": 0.9040733503682549, "flos": 22856459863680.0, "grad_norm": 1.6688110224130346, "language_loss": 0.82059491, "learning_rate": 9.569663949272455e-08, "loss": 0.84149683, "num_input_tokens_seen": 324237855, "step": 15037, "time_per_iteration": 2.667306661605835 }, { "auxiliary_loss_clip": 0.01111291, "auxiliary_loss_mlp": 0.01029139, "balance_loss_clip": 1.03720188, "balance_loss_mlp": 1.01677668, "epoch": 0.9041334736209229, "flos": 19974951941760.0, "grad_norm": 1.9034264024294631, "language_loss": 0.67595971, "learning_rate": 9.557764603050667e-08, "loss": 0.69736397, "num_input_tokens_seen": 324257050, "step": 15038, "time_per_iteration": 2.546713352203369 }, { "auxiliary_loss_clip": 0.01085126, "auxiliary_loss_mlp": 0.0103871, "balance_loss_clip": 1.03417087, "balance_loss_mlp": 1.02606213, "epoch": 0.9041935968735909, "flos": 17530548624000.0, "grad_norm": 2.007069946827801, "language_loss": 0.7516647, "learning_rate": 9.545872478417494e-08, "loss": 0.77290308, "num_input_tokens_seen": 324275510, "step": 15039, "time_per_iteration": 2.6198740005493164 }, { "auxiliary_loss_clip": 0.01082867, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.03606772, "balance_loss_mlp": 1.01828885, "epoch": 0.9042537201262588, "flos": 22780149419520.0, "grad_norm": 1.4865254834014996, "language_loss": 0.70274264, "learning_rate": 9.533987575823977e-08, "loss": 0.7238735, "num_input_tokens_seen": 324295150, "step": 15040, "time_per_iteration": 2.6253907680511475 }, { "auxiliary_loss_clip": 0.01073575, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.03373194, "balance_loss_mlp": 1.01905835, "epoch": 0.9043138433789268, "flos": 20595416497920.0, "grad_norm": 1.5884049488424423, "language_loss": 0.67547166, "learning_rate": 9.522109895720709e-08, "loss": 0.69651759, "num_input_tokens_seen": 324313855, "step": 15041, "time_per_iteration": 2.6538193225860596 }, { "auxiliary_loss_clip": 0.01096511, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.0354538, "balance_loss_mlp": 1.02016878, "epoch": 0.9043739666315948, "flos": 32962978995840.0, "grad_norm": 1.757404597325935, "language_loss": 0.57556689, "learning_rate": 9.510239438558155e-08, "loss": 0.59686273, "num_input_tokens_seen": 324338465, "step": 15042, "time_per_iteration": 2.7718114852905273 }, { "auxiliary_loss_clip": 0.01010523, "auxiliary_loss_mlp": 0.00751383, "balance_loss_clip": 1.00739527, "balance_loss_mlp": 0.99962682, "epoch": 0.9044340898842628, "flos": 67296418525440.0, "grad_norm": 0.79646583953312, "language_loss": 0.56897914, "learning_rate": 9.498376204786351e-08, "loss": 0.58659816, "num_input_tokens_seen": 324398740, "step": 15043, "time_per_iteration": 3.1866395473480225 }, { "auxiliary_loss_clip": 0.01086756, "auxiliary_loss_mlp": 0.01031593, "balance_loss_clip": 1.03518081, "balance_loss_mlp": 1.01791954, "epoch": 0.9044942131369307, "flos": 17713154390400.0, "grad_norm": 2.037927105640118, "language_loss": 0.69802731, "learning_rate": 9.486520194855274e-08, "loss": 0.71921074, "num_input_tokens_seen": 324417335, "step": 15044, "time_per_iteration": 2.6936917304992676 }, { "auxiliary_loss_clip": 0.01089873, "auxiliary_loss_mlp": 0.01040675, "balance_loss_clip": 1.03643358, "balance_loss_mlp": 1.02699018, "epoch": 0.9045543363895987, "flos": 17820563034240.0, "grad_norm": 2.361452153722679, "language_loss": 0.69954962, "learning_rate": 9.474671409214407e-08, "loss": 0.72085512, "num_input_tokens_seen": 324433240, "step": 15045, "time_per_iteration": 2.655958414077759 }, { "auxiliary_loss_clip": 0.01077221, "auxiliary_loss_mlp": 0.01037261, "balance_loss_clip": 1.0350486, "balance_loss_mlp": 1.02417183, "epoch": 0.9046144596422666, "flos": 21872723109120.0, "grad_norm": 1.816781294987572, "language_loss": 0.65513825, "learning_rate": 9.462829848313081e-08, "loss": 0.67628312, "num_input_tokens_seen": 324452675, "step": 15046, "time_per_iteration": 2.704993963241577 }, { "auxiliary_loss_clip": 0.01077406, "auxiliary_loss_mlp": 0.01039336, "balance_loss_clip": 1.03620148, "balance_loss_mlp": 1.02714109, "epoch": 0.9046745828949346, "flos": 17672646827520.0, "grad_norm": 2.027120160637291, "language_loss": 0.62039495, "learning_rate": 9.450995512600379e-08, "loss": 0.6415624, "num_input_tokens_seen": 324467865, "step": 15047, "time_per_iteration": 2.731316089630127 }, { "auxiliary_loss_clip": 0.01109878, "auxiliary_loss_mlp": 0.00770221, "balance_loss_clip": 1.03869438, "balance_loss_mlp": 1.00023651, "epoch": 0.9047347061476025, "flos": 25702559953920.0, "grad_norm": 1.5037316307134132, "language_loss": 0.71319842, "learning_rate": 9.439168402525032e-08, "loss": 0.73199946, "num_input_tokens_seen": 324490430, "step": 15048, "time_per_iteration": 5.092748403549194 }, { "auxiliary_loss_clip": 0.01098767, "auxiliary_loss_mlp": 0.0103572, "balance_loss_clip": 1.03479016, "balance_loss_mlp": 1.02233887, "epoch": 0.9047948294002706, "flos": 15158146118400.0, "grad_norm": 2.1618818731676637, "language_loss": 0.748658, "learning_rate": 9.427348518535483e-08, "loss": 0.7700029, "num_input_tokens_seen": 324506620, "step": 15049, "time_per_iteration": 4.3224146366119385 }, { "auxiliary_loss_clip": 0.01095393, "auxiliary_loss_mlp": 0.0103323, "balance_loss_clip": 1.0372622, "balance_loss_mlp": 1.02072453, "epoch": 0.9048549526529385, "flos": 21872292145920.0, "grad_norm": 2.5225470406592105, "language_loss": 0.75863099, "learning_rate": 9.415535861079993e-08, "loss": 0.77991724, "num_input_tokens_seen": 324525505, "step": 15050, "time_per_iteration": 4.230266094207764 }, { "auxiliary_loss_clip": 0.01109636, "auxiliary_loss_mlp": 0.00769663, "balance_loss_clip": 1.03721118, "balance_loss_mlp": 1.00019288, "epoch": 0.9049150759056065, "flos": 23546626761600.0, "grad_norm": 1.8328342559703832, "language_loss": 0.81820488, "learning_rate": 9.403730430606472e-08, "loss": 0.83699787, "num_input_tokens_seen": 324544415, "step": 15051, "time_per_iteration": 4.13810133934021 }, { "auxiliary_loss_clip": 0.0109796, "auxiliary_loss_mlp": 0.01030797, "balance_loss_clip": 1.03711987, "balance_loss_mlp": 1.01926315, "epoch": 0.9049751991582745, "flos": 19645902426240.0, "grad_norm": 2.063226238004681, "language_loss": 0.89144683, "learning_rate": 9.391932227562582e-08, "loss": 0.91273439, "num_input_tokens_seen": 324562555, "step": 15052, "time_per_iteration": 2.5994207859039307 }, { "auxiliary_loss_clip": 0.01101275, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.03616786, "balance_loss_mlp": 1.02020848, "epoch": 0.9050353224109424, "flos": 15596220389760.0, "grad_norm": 3.6081086448903616, "language_loss": 0.77183485, "learning_rate": 9.380141252395724e-08, "loss": 0.79317588, "num_input_tokens_seen": 324580865, "step": 15053, "time_per_iteration": 2.546614170074463 }, { "auxiliary_loss_clip": 0.01095283, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.03654027, "balance_loss_mlp": 1.02096224, "epoch": 0.9050954456636104, "flos": 28183592165760.0, "grad_norm": 2.4176866972554927, "language_loss": 0.73160625, "learning_rate": 9.368357505553049e-08, "loss": 0.75289166, "num_input_tokens_seen": 324600665, "step": 15054, "time_per_iteration": 2.658132553100586 }, { "auxiliary_loss_clip": 0.01054009, "auxiliary_loss_mlp": 0.01034334, "balance_loss_clip": 1.03092122, "balance_loss_mlp": 1.0217638, "epoch": 0.9051555689162784, "flos": 25731611078400.0, "grad_norm": 1.6662566471194642, "language_loss": 0.83386469, "learning_rate": 9.356580987481333e-08, "loss": 0.85474813, "num_input_tokens_seen": 324618145, "step": 15055, "time_per_iteration": 2.7756059169769287 }, { "auxiliary_loss_clip": 0.01094483, "auxiliary_loss_mlp": 0.01034211, "balance_loss_clip": 1.03571832, "balance_loss_mlp": 1.02193809, "epoch": 0.9052156921689464, "flos": 23257258796160.0, "grad_norm": 1.7590583279943084, "language_loss": 0.85093272, "learning_rate": 9.344811698627176e-08, "loss": 0.87221962, "num_input_tokens_seen": 324638165, "step": 15056, "time_per_iteration": 2.6432409286499023 }, { "auxiliary_loss_clip": 0.01079366, "auxiliary_loss_mlp": 0.01028685, "balance_loss_clip": 1.03504348, "balance_loss_mlp": 1.01706791, "epoch": 0.9052758154216143, "flos": 29564285097600.0, "grad_norm": 2.874678812458683, "language_loss": 0.72274697, "learning_rate": 9.333049639436863e-08, "loss": 0.74382746, "num_input_tokens_seen": 324658560, "step": 15057, "time_per_iteration": 2.729560613632202 }, { "auxiliary_loss_clip": 0.0109434, "auxiliary_loss_mlp": 0.01032293, "balance_loss_clip": 1.03419363, "balance_loss_mlp": 1.02033675, "epoch": 0.9053359386742823, "flos": 22127688823680.0, "grad_norm": 1.7504023555736803, "language_loss": 0.80625844, "learning_rate": 9.321294810356418e-08, "loss": 0.82752472, "num_input_tokens_seen": 324679185, "step": 15058, "time_per_iteration": 2.7866742610931396 }, { "auxiliary_loss_clip": 0.01016738, "auxiliary_loss_mlp": 0.01001155, "balance_loss_clip": 1.00546241, "balance_loss_mlp": 1.00027263, "epoch": 0.9053960619269502, "flos": 67090112760960.0, "grad_norm": 0.6742645002897684, "language_loss": 0.51343101, "learning_rate": 9.309547211831592e-08, "loss": 0.53360993, "num_input_tokens_seen": 324744830, "step": 15059, "time_per_iteration": 3.2885544300079346 }, { "auxiliary_loss_clip": 0.01072001, "auxiliary_loss_mlp": 0.01030876, "balance_loss_clip": 1.04110169, "balance_loss_mlp": 1.0184902, "epoch": 0.9054561851796182, "flos": 15815419136640.0, "grad_norm": 1.7140803408550112, "language_loss": 0.67263991, "learning_rate": 9.297806844307831e-08, "loss": 0.69366872, "num_input_tokens_seen": 324762905, "step": 15060, "time_per_iteration": 2.8112542629241943 }, { "auxiliary_loss_clip": 0.01089234, "auxiliary_loss_mlp": 0.01032459, "balance_loss_clip": 1.03664804, "balance_loss_mlp": 1.01979876, "epoch": 0.9055163084322861, "flos": 17566997950080.0, "grad_norm": 2.3975546753010915, "language_loss": 0.64229333, "learning_rate": 9.286073708230357e-08, "loss": 0.66351026, "num_input_tokens_seen": 324781905, "step": 15061, "time_per_iteration": 2.6348559856414795 }, { "auxiliary_loss_clip": 0.01083114, "auxiliary_loss_mlp": 0.01038728, "balance_loss_clip": 1.03490663, "balance_loss_mlp": 1.02568662, "epoch": 0.9055764316849542, "flos": 17639573379840.0, "grad_norm": 1.6952248050793448, "language_loss": 0.71770173, "learning_rate": 9.274347804044058e-08, "loss": 0.73892021, "num_input_tokens_seen": 324799260, "step": 15062, "time_per_iteration": 2.889420986175537 }, { "auxiliary_loss_clip": 0.01106793, "auxiliary_loss_mlp": 0.01031859, "balance_loss_clip": 1.03594065, "balance_loss_mlp": 1.01968181, "epoch": 0.9056365549376221, "flos": 20120856986880.0, "grad_norm": 2.4465454482745534, "language_loss": 0.71081591, "learning_rate": 9.2626291321936e-08, "loss": 0.73220247, "num_input_tokens_seen": 324817800, "step": 15063, "time_per_iteration": 2.5845255851745605 }, { "auxiliary_loss_clip": 0.01066505, "auxiliary_loss_mlp": 0.01033382, "balance_loss_clip": 1.03441405, "balance_loss_mlp": 1.02137733, "epoch": 0.9056966781902901, "flos": 27598786836480.0, "grad_norm": 1.6140764840552748, "language_loss": 0.72192168, "learning_rate": 9.250917693123406e-08, "loss": 0.74292052, "num_input_tokens_seen": 324838445, "step": 15064, "time_per_iteration": 2.711472511291504 }, { "auxiliary_loss_clip": 0.01099676, "auxiliary_loss_mlp": 0.01032131, "balance_loss_clip": 1.0358665, "balance_loss_mlp": 1.01976943, "epoch": 0.9057568014429581, "flos": 25920106675200.0, "grad_norm": 1.9283380616790378, "language_loss": 0.69733697, "learning_rate": 9.23921348727752e-08, "loss": 0.71865511, "num_input_tokens_seen": 324859895, "step": 15065, "time_per_iteration": 2.6254019737243652 }, { "auxiliary_loss_clip": 0.01076646, "auxiliary_loss_mlp": 0.01034346, "balance_loss_clip": 1.03431368, "balance_loss_mlp": 1.02240729, "epoch": 0.905816924695626, "flos": 22930364096640.0, "grad_norm": 1.5639103383265116, "language_loss": 0.62895906, "learning_rate": 9.227516515099743e-08, "loss": 0.65006894, "num_input_tokens_seen": 324879580, "step": 15066, "time_per_iteration": 2.7154438495635986 }, { "auxiliary_loss_clip": 0.01035849, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.02947974, "balance_loss_mlp": 1.01869655, "epoch": 0.905877047948294, "flos": 22157422306560.0, "grad_norm": 1.934180125308043, "language_loss": 0.80121052, "learning_rate": 9.215826777033675e-08, "loss": 0.82189941, "num_input_tokens_seen": 324898950, "step": 15067, "time_per_iteration": 2.7812981605529785 }, { "auxiliary_loss_clip": 0.0108924, "auxiliary_loss_mlp": 0.01033309, "balance_loss_clip": 1.0376116, "balance_loss_mlp": 1.02020836, "epoch": 0.905937171200962, "flos": 15304805349120.0, "grad_norm": 1.6228923811634084, "language_loss": 0.70006502, "learning_rate": 9.204144273522563e-08, "loss": 0.72129059, "num_input_tokens_seen": 324917455, "step": 15068, "time_per_iteration": 2.865957021713257 }, { "auxiliary_loss_clip": 0.01104355, "auxiliary_loss_mlp": 0.01028972, "balance_loss_clip": 1.03523481, "balance_loss_mlp": 1.01681864, "epoch": 0.90599729445363, "flos": 19462973437440.0, "grad_norm": 2.0548899338022064, "language_loss": 0.85366511, "learning_rate": 9.19246900500943e-08, "loss": 0.87499845, "num_input_tokens_seen": 324934495, "step": 15069, "time_per_iteration": 2.5832648277282715 }, { "auxiliary_loss_clip": 0.01100336, "auxiliary_loss_mlp": 0.01032536, "balance_loss_clip": 1.03515148, "balance_loss_mlp": 1.01913118, "epoch": 0.9060574177062979, "flos": 23732967542400.0, "grad_norm": 1.7734674553578826, "language_loss": 0.59089136, "learning_rate": 9.180800971936987e-08, "loss": 0.61222005, "num_input_tokens_seen": 324953230, "step": 15070, "time_per_iteration": 2.6578190326690674 }, { "auxiliary_loss_clip": 0.01073063, "auxiliary_loss_mlp": 0.01029432, "balance_loss_clip": 1.03542089, "balance_loss_mlp": 1.01644397, "epoch": 0.9061175409589659, "flos": 17311134395520.0, "grad_norm": 2.114967180727135, "language_loss": 0.81690538, "learning_rate": 9.169140174747724e-08, "loss": 0.83793026, "num_input_tokens_seen": 324969880, "step": 15071, "time_per_iteration": 2.677042245864868 }, { "auxiliary_loss_clip": 0.0111224, "auxiliary_loss_mlp": 0.01041359, "balance_loss_clip": 1.03753805, "balance_loss_mlp": 1.02798986, "epoch": 0.9061776642116338, "flos": 17778439359360.0, "grad_norm": 1.8991196777690924, "language_loss": 0.61947775, "learning_rate": 9.157486613883758e-08, "loss": 0.64101374, "num_input_tokens_seen": 324987005, "step": 15072, "time_per_iteration": 2.5581016540527344 }, { "auxiliary_loss_clip": 0.01088368, "auxiliary_loss_mlp": 0.01035581, "balance_loss_clip": 1.03575015, "balance_loss_mlp": 1.02321947, "epoch": 0.9062377874643018, "flos": 42777688037760.0, "grad_norm": 1.883547115522317, "language_loss": 0.73039377, "learning_rate": 9.145840289787021e-08, "loss": 0.75163323, "num_input_tokens_seen": 325010700, "step": 15073, "time_per_iteration": 2.933929681777954 }, { "auxiliary_loss_clip": 0.01094334, "auxiliary_loss_mlp": 0.01027115, "balance_loss_clip": 1.0359031, "balance_loss_mlp": 1.01591563, "epoch": 0.9062979107169697, "flos": 16361620323840.0, "grad_norm": 1.8214785876499617, "language_loss": 0.8087334, "learning_rate": 9.134201202899161e-08, "loss": 0.82994789, "num_input_tokens_seen": 325028760, "step": 15074, "time_per_iteration": 2.6201162338256836 }, { "auxiliary_loss_clip": 0.00984336, "auxiliary_loss_mlp": 0.00752175, "balance_loss_clip": 1.00953913, "balance_loss_mlp": 0.99961203, "epoch": 0.9063580339696378, "flos": 69313988528640.0, "grad_norm": 0.7424455220001136, "language_loss": 0.52306926, "learning_rate": 9.122569353661513e-08, "loss": 0.54043436, "num_input_tokens_seen": 325093545, "step": 15075, "time_per_iteration": 3.318652391433716 }, { "auxiliary_loss_clip": 0.00997512, "auxiliary_loss_mlp": 0.00998485, "balance_loss_clip": 1.0082109, "balance_loss_mlp": 0.99731654, "epoch": 0.9064181572223057, "flos": 58794747148800.0, "grad_norm": 0.7354115302623626, "language_loss": 0.62038195, "learning_rate": 9.11094474251517e-08, "loss": 0.640342, "num_input_tokens_seen": 325152295, "step": 15076, "time_per_iteration": 3.1302971839904785 }, { "auxiliary_loss_clip": 0.01095732, "auxiliary_loss_mlp": 0.01035747, "balance_loss_clip": 1.0357511, "balance_loss_mlp": 1.0237844, "epoch": 0.9064782804749737, "flos": 21762692772480.0, "grad_norm": 1.7300331938520934, "language_loss": 0.81917107, "learning_rate": 9.09932736990091e-08, "loss": 0.84048593, "num_input_tokens_seen": 325169705, "step": 15077, "time_per_iteration": 2.6315958499908447 }, { "auxiliary_loss_clip": 0.01081763, "auxiliary_loss_mlp": 0.00769991, "balance_loss_clip": 1.03210878, "balance_loss_mlp": 1.00007868, "epoch": 0.9065384037276417, "flos": 21397373498880.0, "grad_norm": 1.5468663255290942, "language_loss": 0.83872044, "learning_rate": 9.08771723625934e-08, "loss": 0.85723794, "num_input_tokens_seen": 325189175, "step": 15078, "time_per_iteration": 2.727109670639038 }, { "auxiliary_loss_clip": 0.01093852, "auxiliary_loss_mlp": 0.00770079, "balance_loss_clip": 1.03619432, "balance_loss_mlp": 1.00015736, "epoch": 0.9065985269803096, "flos": 38283646849920.0, "grad_norm": 1.6827515544701097, "language_loss": 0.65606648, "learning_rate": 9.076114342030617e-08, "loss": 0.67470574, "num_input_tokens_seen": 325211020, "step": 15079, "time_per_iteration": 2.771944999694824 }, { "auxiliary_loss_clip": 0.01028805, "auxiliary_loss_mlp": 0.01027589, "balance_loss_clip": 1.03047419, "balance_loss_mlp": 1.0151794, "epoch": 0.9066586502329776, "flos": 44818562989440.0, "grad_norm": 1.7893675619126004, "language_loss": 0.70638371, "learning_rate": 9.064518687654765e-08, "loss": 0.72694761, "num_input_tokens_seen": 325236970, "step": 15080, "time_per_iteration": 2.9839913845062256 }, { "auxiliary_loss_clip": 0.01096514, "auxiliary_loss_mlp": 0.01031379, "balance_loss_clip": 1.03848863, "balance_loss_mlp": 1.01837301, "epoch": 0.9067187734856456, "flos": 18623992492800.0, "grad_norm": 2.4819155827069452, "language_loss": 0.71019328, "learning_rate": 9.052930273571547e-08, "loss": 0.73147219, "num_input_tokens_seen": 325252670, "step": 15081, "time_per_iteration": 2.5639331340789795 }, { "auxiliary_loss_clip": 0.01082423, "auxiliary_loss_mlp": 0.01034663, "balance_loss_clip": 1.03733432, "balance_loss_mlp": 1.02240872, "epoch": 0.9067788967383136, "flos": 22747578762240.0, "grad_norm": 5.90815505153055, "language_loss": 0.7437706, "learning_rate": 9.04134910022032e-08, "loss": 0.76494145, "num_input_tokens_seen": 325273860, "step": 15082, "time_per_iteration": 2.6862359046936035 }, { "auxiliary_loss_clip": 0.01073569, "auxiliary_loss_mlp": 0.0103586, "balance_loss_clip": 1.03576851, "balance_loss_mlp": 1.02364099, "epoch": 0.9068390199909815, "flos": 27670787648640.0, "grad_norm": 2.0228960329106904, "language_loss": 0.78056735, "learning_rate": 9.029775168040266e-08, "loss": 0.80166161, "num_input_tokens_seen": 325294140, "step": 15083, "time_per_iteration": 2.7631537914276123 }, { "auxiliary_loss_clip": 0.01082943, "auxiliary_loss_mlp": 0.0076928, "balance_loss_clip": 1.03722239, "balance_loss_mlp": 1.00023723, "epoch": 0.9068991432436495, "flos": 24244012293120.0, "grad_norm": 1.5997317426680842, "language_loss": 0.68783748, "learning_rate": 9.01820847747028e-08, "loss": 0.70635974, "num_input_tokens_seen": 325313130, "step": 15084, "time_per_iteration": 2.720623731613159 }, { "auxiliary_loss_clip": 0.01108826, "auxiliary_loss_mlp": 0.01031366, "balance_loss_clip": 1.03775597, "balance_loss_mlp": 1.01930761, "epoch": 0.9069592664963174, "flos": 28033305661440.0, "grad_norm": 7.23400764704548, "language_loss": 0.67128915, "learning_rate": 9.006649028948965e-08, "loss": 0.69269109, "num_input_tokens_seen": 325334880, "step": 15085, "time_per_iteration": 2.6862213611602783 }, { "auxiliary_loss_clip": 0.00998184, "auxiliary_loss_mlp": 0.01017743, "balance_loss_clip": 1.00960755, "balance_loss_mlp": 1.01620471, "epoch": 0.9070193897489854, "flos": 68778414789120.0, "grad_norm": 0.7963063657697701, "language_loss": 0.61316264, "learning_rate": 8.995096822914638e-08, "loss": 0.63332188, "num_input_tokens_seen": 325394175, "step": 15086, "time_per_iteration": 3.2537643909454346 }, { "auxiliary_loss_clip": 0.01093775, "auxiliary_loss_mlp": 0.01038417, "balance_loss_clip": 1.03427684, "balance_loss_mlp": 1.02487493, "epoch": 0.9070795130016533, "flos": 23441624328960.0, "grad_norm": 1.436388517862248, "language_loss": 0.72142053, "learning_rate": 8.983551859805416e-08, "loss": 0.74274248, "num_input_tokens_seen": 325415020, "step": 15087, "time_per_iteration": 4.312045335769653 }, { "auxiliary_loss_clip": 0.01084735, "auxiliary_loss_mlp": 0.01027139, "balance_loss_clip": 1.03397894, "balance_loss_mlp": 1.01522434, "epoch": 0.9071396362543214, "flos": 18916413114240.0, "grad_norm": 1.949639239308053, "language_loss": 0.76511991, "learning_rate": 8.972014140059058e-08, "loss": 0.78623861, "num_input_tokens_seen": 325433595, "step": 15088, "time_per_iteration": 4.274383783340454 }, { "auxiliary_loss_clip": 0.01073577, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.03376746, "balance_loss_mlp": 1.02019525, "epoch": 0.9071997595069893, "flos": 25228646887680.0, "grad_norm": 1.7650984011067665, "language_loss": 0.73451883, "learning_rate": 8.960483664113038e-08, "loss": 0.75557625, "num_input_tokens_seen": 325451605, "step": 15089, "time_per_iteration": 4.142383575439453 }, { "auxiliary_loss_clip": 0.01103445, "auxiliary_loss_mlp": 0.01034573, "balance_loss_clip": 1.03631544, "balance_loss_mlp": 1.02313471, "epoch": 0.9072598827596573, "flos": 24346608514560.0, "grad_norm": 1.785554810845489, "language_loss": 0.75460756, "learning_rate": 8.948960432404628e-08, "loss": 0.77598774, "num_input_tokens_seen": 325470645, "step": 15090, "time_per_iteration": 4.125551462173462 }, { "auxiliary_loss_clip": 0.01081669, "auxiliary_loss_mlp": 0.01030269, "balance_loss_clip": 1.03531027, "balance_loss_mlp": 1.0168643, "epoch": 0.9073200060123253, "flos": 22674967418880.0, "grad_norm": 2.644042732321969, "language_loss": 0.7796579, "learning_rate": 8.93744444537079e-08, "loss": 0.8007772, "num_input_tokens_seen": 325488070, "step": 15091, "time_per_iteration": 2.611660957336426 }, { "auxiliary_loss_clip": 0.01080451, "auxiliary_loss_mlp": 0.01025973, "balance_loss_clip": 1.03320861, "balance_loss_mlp": 1.01513076, "epoch": 0.9073801292649932, "flos": 23695476721920.0, "grad_norm": 1.8611765559863347, "language_loss": 0.85915703, "learning_rate": 8.925935703448217e-08, "loss": 0.88022125, "num_input_tokens_seen": 325509285, "step": 15092, "time_per_iteration": 2.6740128993988037 }, { "auxiliary_loss_clip": 0.01084789, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.03747833, "balance_loss_mlp": 1.0196414, "epoch": 0.9074402525176612, "flos": 25375413859200.0, "grad_norm": 1.5044941360603252, "language_loss": 0.78849494, "learning_rate": 8.914434207073296e-08, "loss": 0.80965954, "num_input_tokens_seen": 325529360, "step": 15093, "time_per_iteration": 2.680701494216919 }, { "auxiliary_loss_clip": 0.01019381, "auxiliary_loss_mlp": 0.01002565, "balance_loss_clip": 1.00606823, "balance_loss_mlp": 1.00151622, "epoch": 0.9075003757703292, "flos": 67649024384640.0, "grad_norm": 0.7360353686888242, "language_loss": 0.56958818, "learning_rate": 8.902939956682188e-08, "loss": 0.58980775, "num_input_tokens_seen": 325583565, "step": 15094, "time_per_iteration": 3.086918592453003 }, { "auxiliary_loss_clip": 0.01099075, "auxiliary_loss_mlp": 0.01034812, "balance_loss_clip": 1.03545427, "balance_loss_mlp": 1.02190804, "epoch": 0.9075604990229972, "flos": 22453649769600.0, "grad_norm": 1.9406797492354237, "language_loss": 0.71160638, "learning_rate": 8.891452952710742e-08, "loss": 0.73294526, "num_input_tokens_seen": 325603690, "step": 15095, "time_per_iteration": 2.6372621059417725 }, { "auxiliary_loss_clip": 0.01066408, "auxiliary_loss_mlp": 0.01034351, "balance_loss_clip": 1.0342015, "balance_loss_mlp": 1.02175641, "epoch": 0.9076206222756651, "flos": 19536662188800.0, "grad_norm": 2.201890556865997, "language_loss": 0.7416867, "learning_rate": 8.879973195594526e-08, "loss": 0.76269424, "num_input_tokens_seen": 325622255, "step": 15096, "time_per_iteration": 2.7420341968536377 }, { "auxiliary_loss_clip": 0.01109715, "auxiliary_loss_mlp": 0.01038367, "balance_loss_clip": 1.03712845, "balance_loss_mlp": 1.02484858, "epoch": 0.9076807455283331, "flos": 30116914819200.0, "grad_norm": 1.8892024302552053, "language_loss": 0.56777847, "learning_rate": 8.868500685768898e-08, "loss": 0.58925933, "num_input_tokens_seen": 325640165, "step": 15097, "time_per_iteration": 2.66786527633667 }, { "auxiliary_loss_clip": 0.01085602, "auxiliary_loss_mlp": 0.01024669, "balance_loss_clip": 1.03317809, "balance_loss_mlp": 1.01340389, "epoch": 0.907740868781001, "flos": 18697537589760.0, "grad_norm": 1.7446964488150043, "language_loss": 0.79539967, "learning_rate": 8.857035423668935e-08, "loss": 0.81650233, "num_input_tokens_seen": 325659455, "step": 15098, "time_per_iteration": 2.6101489067077637 }, { "auxiliary_loss_clip": 0.010671, "auxiliary_loss_mlp": 0.00771611, "balance_loss_clip": 1.03485239, "balance_loss_mlp": 1.00026011, "epoch": 0.907800992033669, "flos": 22638805401600.0, "grad_norm": 18.550819833404994, "language_loss": 0.66001773, "learning_rate": 8.845577409729266e-08, "loss": 0.67840481, "num_input_tokens_seen": 325678095, "step": 15099, "time_per_iteration": 2.782886266708374 }, { "auxiliary_loss_clip": 0.01089093, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.03569531, "balance_loss_mlp": 1.02341413, "epoch": 0.907861115286337, "flos": 21287666384640.0, "grad_norm": 2.095035959000706, "language_loss": 0.70761675, "learning_rate": 8.834126644384477e-08, "loss": 0.72887093, "num_input_tokens_seen": 325695825, "step": 15100, "time_per_iteration": 2.718719482421875 }, { "auxiliary_loss_clip": 0.01018547, "auxiliary_loss_mlp": 0.01002357, "balance_loss_clip": 1.00599432, "balance_loss_mlp": 1.00136185, "epoch": 0.907921238539005, "flos": 69739493040000.0, "grad_norm": 0.6221166311541254, "language_loss": 0.5336588, "learning_rate": 8.822683128068775e-08, "loss": 0.55386788, "num_input_tokens_seen": 325764515, "step": 15101, "time_per_iteration": 3.2601447105407715 }, { "auxiliary_loss_clip": 0.01074173, "auxiliary_loss_mlp": 0.0103007, "balance_loss_clip": 1.03405142, "balance_loss_mlp": 1.01715326, "epoch": 0.9079813617916729, "flos": 23477391296640.0, "grad_norm": 1.6841110565912183, "language_loss": 0.68209207, "learning_rate": 8.811246861216081e-08, "loss": 0.70313448, "num_input_tokens_seen": 325783235, "step": 15102, "time_per_iteration": 2.6863279342651367 }, { "auxiliary_loss_clip": 0.01094848, "auxiliary_loss_mlp": 0.0103185, "balance_loss_clip": 1.03587008, "balance_loss_mlp": 1.01915479, "epoch": 0.9080414850443409, "flos": 22929933133440.0, "grad_norm": 1.7674184353723423, "language_loss": 0.79133558, "learning_rate": 8.799817844260049e-08, "loss": 0.81260264, "num_input_tokens_seen": 325800195, "step": 15103, "time_per_iteration": 2.672898054122925 }, { "auxiliary_loss_clip": 0.0108183, "auxiliary_loss_mlp": 0.01033096, "balance_loss_clip": 1.03430343, "balance_loss_mlp": 1.02016127, "epoch": 0.9081016082970089, "flos": 26177083551360.0, "grad_norm": 1.7434121737063208, "language_loss": 0.71796912, "learning_rate": 8.78839607763413e-08, "loss": 0.73911834, "num_input_tokens_seen": 325820215, "step": 15104, "time_per_iteration": 2.6979503631591797 }, { "auxiliary_loss_clip": 0.01083633, "auxiliary_loss_mlp": 0.01026392, "balance_loss_clip": 1.03431463, "balance_loss_mlp": 1.01508558, "epoch": 0.9081617315496768, "flos": 24462169545600.0, "grad_norm": 1.7433195469593918, "language_loss": 0.77697951, "learning_rate": 8.77698156177138e-08, "loss": 0.79807979, "num_input_tokens_seen": 325838415, "step": 15105, "time_per_iteration": 2.693650722503662 }, { "auxiliary_loss_clip": 0.01106144, "auxiliary_loss_mlp": 0.00770719, "balance_loss_clip": 1.03435302, "balance_loss_mlp": 1.00018311, "epoch": 0.9082218548023449, "flos": 24746868743040.0, "grad_norm": 2.4921159969268625, "language_loss": 0.73882461, "learning_rate": 8.765574297104628e-08, "loss": 0.75759327, "num_input_tokens_seen": 325855580, "step": 15106, "time_per_iteration": 2.6928508281707764 }, { "auxiliary_loss_clip": 0.01059785, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.02983892, "balance_loss_mlp": 1.02212226, "epoch": 0.9082819780550128, "flos": 24421302846720.0, "grad_norm": 1.956694255658617, "language_loss": 0.80682945, "learning_rate": 8.754174284066462e-08, "loss": 0.82778394, "num_input_tokens_seen": 325874890, "step": 15107, "time_per_iteration": 2.8211913108825684 }, { "auxiliary_loss_clip": 0.01003818, "auxiliary_loss_mlp": 0.01000224, "balance_loss_clip": 1.00530005, "balance_loss_mlp": 0.99906158, "epoch": 0.9083421013076808, "flos": 59609704872960.0, "grad_norm": 0.8163194562351376, "language_loss": 0.59763622, "learning_rate": 8.742781523089205e-08, "loss": 0.61767673, "num_input_tokens_seen": 325935835, "step": 15108, "time_per_iteration": 3.176673173904419 }, { "auxiliary_loss_clip": 0.01085744, "auxiliary_loss_mlp": 0.0102493, "balance_loss_clip": 1.03396034, "balance_loss_mlp": 1.01259756, "epoch": 0.9084022245603487, "flos": 33620216100480.0, "grad_norm": 1.5754460951726812, "language_loss": 0.73228884, "learning_rate": 8.73139601460482e-08, "loss": 0.75339556, "num_input_tokens_seen": 325958035, "step": 15109, "time_per_iteration": 2.744368314743042 }, { "auxiliary_loss_clip": 0.01072978, "auxiliary_loss_mlp": 0.01028525, "balance_loss_clip": 1.03370285, "balance_loss_mlp": 1.01687264, "epoch": 0.9084623478130167, "flos": 24971705925120.0, "grad_norm": 4.290837775967832, "language_loss": 0.71557301, "learning_rate": 8.720017759045073e-08, "loss": 0.736588, "num_input_tokens_seen": 325979870, "step": 15110, "time_per_iteration": 2.7875888347625732 }, { "auxiliary_loss_clip": 0.0107739, "auxiliary_loss_mlp": 0.01035324, "balance_loss_clip": 1.03073955, "balance_loss_mlp": 1.0219785, "epoch": 0.9085224710656846, "flos": 31461804869760.0, "grad_norm": 1.8448389320189542, "language_loss": 0.69122839, "learning_rate": 8.708646756841421e-08, "loss": 0.71235561, "num_input_tokens_seen": 325998245, "step": 15111, "time_per_iteration": 2.7633275985717773 }, { "auxiliary_loss_clip": 0.00998747, "auxiliary_loss_mlp": 0.01004801, "balance_loss_clip": 1.00629544, "balance_loss_mlp": 1.00380516, "epoch": 0.9085825943183526, "flos": 64917012867840.0, "grad_norm": 0.6888629438196041, "language_loss": 0.51703209, "learning_rate": 8.697283008425026e-08, "loss": 0.53706759, "num_input_tokens_seen": 326061770, "step": 15112, "time_per_iteration": 3.2464187145233154 }, { "auxiliary_loss_clip": 0.0109824, "auxiliary_loss_mlp": 0.01033068, "balance_loss_clip": 1.03503668, "balance_loss_mlp": 1.02022314, "epoch": 0.9086427175710206, "flos": 18953221576320.0, "grad_norm": 1.723855201970508, "language_loss": 0.7027775, "learning_rate": 8.685926514226837e-08, "loss": 0.72409058, "num_input_tokens_seen": 326080945, "step": 15113, "time_per_iteration": 2.615265130996704 }, { "auxiliary_loss_clip": 0.01098496, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.03785408, "balance_loss_mlp": 1.0189271, "epoch": 0.9087028408236886, "flos": 34014873807360.0, "grad_norm": 2.0973757596387004, "language_loss": 0.78994763, "learning_rate": 8.674577274677508e-08, "loss": 0.81124145, "num_input_tokens_seen": 326100630, "step": 15114, "time_per_iteration": 2.7337305545806885 }, { "auxiliary_loss_clip": 0.01070616, "auxiliary_loss_mlp": 0.01033895, "balance_loss_clip": 1.03684914, "balance_loss_mlp": 1.02053201, "epoch": 0.9087629640763565, "flos": 21944580266880.0, "grad_norm": 3.929458307617432, "language_loss": 0.70178634, "learning_rate": 8.663235290207405e-08, "loss": 0.72283143, "num_input_tokens_seen": 326120145, "step": 15115, "time_per_iteration": 2.751361131668091 }, { "auxiliary_loss_clip": 0.01086218, "auxiliary_loss_mlp": 0.01032577, "balance_loss_clip": 1.03923655, "balance_loss_mlp": 1.01895118, "epoch": 0.9088230873290245, "flos": 21762908254080.0, "grad_norm": 2.3483964506042603, "language_loss": 0.65777099, "learning_rate": 8.651900561246561e-08, "loss": 0.67895895, "num_input_tokens_seen": 326140715, "step": 15116, "time_per_iteration": 2.715759754180908 }, { "auxiliary_loss_clip": 0.01106542, "auxiliary_loss_mlp": 0.01034021, "balance_loss_clip": 1.0372858, "balance_loss_mlp": 1.02119398, "epoch": 0.9088832105816925, "flos": 21541267382400.0, "grad_norm": 6.342744698991267, "language_loss": 0.69591606, "learning_rate": 8.640573088224812e-08, "loss": 0.71732175, "num_input_tokens_seen": 326159130, "step": 15117, "time_per_iteration": 2.582552433013916 }, { "auxiliary_loss_clip": 0.01066284, "auxiliary_loss_mlp": 0.01026525, "balance_loss_clip": 1.03425217, "balance_loss_mlp": 1.01489568, "epoch": 0.9089433338343604, "flos": 25996704428160.0, "grad_norm": 3.6808698691711856, "language_loss": 0.74660701, "learning_rate": 8.629252871571745e-08, "loss": 0.76753509, "num_input_tokens_seen": 326181375, "step": 15118, "time_per_iteration": 2.751481056213379 }, { "auxiliary_loss_clip": 0.01083211, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.03344107, "balance_loss_mlp": 1.02128291, "epoch": 0.9090034570870285, "flos": 21178426147200.0, "grad_norm": 2.13733826102676, "language_loss": 0.73172134, "learning_rate": 8.617939911716554e-08, "loss": 0.75290304, "num_input_tokens_seen": 326199740, "step": 15119, "time_per_iteration": 2.7050302028656006 }, { "auxiliary_loss_clip": 0.01073499, "auxiliary_loss_mlp": 0.0103193, "balance_loss_clip": 1.03588152, "balance_loss_mlp": 1.01727891, "epoch": 0.9090635803396964, "flos": 16141811045760.0, "grad_norm": 2.3309233232368376, "language_loss": 0.71525586, "learning_rate": 8.60663420908827e-08, "loss": 0.73631012, "num_input_tokens_seen": 326214350, "step": 15120, "time_per_iteration": 2.748596429824829 }, { "auxiliary_loss_clip": 0.01109717, "auxiliary_loss_mlp": 0.00770513, "balance_loss_clip": 1.03689528, "balance_loss_mlp": 1.0002079, "epoch": 0.9091237035923644, "flos": 20591537829120.0, "grad_norm": 2.1685106805534002, "language_loss": 0.65576839, "learning_rate": 8.595335764115596e-08, "loss": 0.67457068, "num_input_tokens_seen": 326234580, "step": 15121, "time_per_iteration": 2.6541824340820312 }, { "auxiliary_loss_clip": 0.01098528, "auxiliary_loss_mlp": 0.01035932, "balance_loss_clip": 1.03610933, "balance_loss_mlp": 1.02343321, "epoch": 0.9091838268450323, "flos": 52227760164480.0, "grad_norm": 2.467654081114951, "language_loss": 0.70642388, "learning_rate": 8.58404457722699e-08, "loss": 0.72776842, "num_input_tokens_seen": 326259080, "step": 15122, "time_per_iteration": 2.925644636154175 }, { "auxiliary_loss_clip": 0.01052109, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.03168774, "balance_loss_mlp": 1.01879561, "epoch": 0.9092439500977003, "flos": 20559613616640.0, "grad_norm": 1.4208742035415944, "language_loss": 0.74525023, "learning_rate": 8.572760648850575e-08, "loss": 0.76608044, "num_input_tokens_seen": 326280175, "step": 15123, "time_per_iteration": 2.734441041946411 }, { "auxiliary_loss_clip": 0.0109521, "auxiliary_loss_mlp": 0.01033097, "balance_loss_clip": 1.03593588, "balance_loss_mlp": 1.02159882, "epoch": 0.9093040733503682, "flos": 28617859595520.0, "grad_norm": 1.8896970450570774, "language_loss": 0.7576673, "learning_rate": 8.561483979414253e-08, "loss": 0.77895033, "num_input_tokens_seen": 326297990, "step": 15124, "time_per_iteration": 2.6362528800964355 }, { "auxiliary_loss_clip": 0.01090802, "auxiliary_loss_mlp": 0.01032412, "balance_loss_clip": 1.03465593, "balance_loss_mlp": 1.02002668, "epoch": 0.9093641966030362, "flos": 23440187784960.0, "grad_norm": 1.8805276614968602, "language_loss": 0.71919298, "learning_rate": 8.55021456934566e-08, "loss": 0.74042511, "num_input_tokens_seen": 326316735, "step": 15125, "time_per_iteration": 2.5915915966033936 }, { "auxiliary_loss_clip": 0.01068085, "auxiliary_loss_mlp": 0.01035917, "balance_loss_clip": 1.03625441, "balance_loss_mlp": 1.02334034, "epoch": 0.9094243198557042, "flos": 16800197385600.0, "grad_norm": 1.6292066230001099, "language_loss": 0.79466188, "learning_rate": 8.538952419072143e-08, "loss": 0.8157019, "num_input_tokens_seen": 326334370, "step": 15126, "time_per_iteration": 4.219731569290161 }, { "auxiliary_loss_clip": 0.01065083, "auxiliary_loss_mlp": 0.01034805, "balance_loss_clip": 1.03633046, "balance_loss_mlp": 1.02255654, "epoch": 0.9094844431083722, "flos": 24273278899200.0, "grad_norm": 1.707765126078796, "language_loss": 0.75641441, "learning_rate": 8.527697529020694e-08, "loss": 0.77741325, "num_input_tokens_seen": 326353435, "step": 15127, "time_per_iteration": 2.7128138542175293 }, { "auxiliary_loss_clip": 0.01027678, "auxiliary_loss_mlp": 0.01033923, "balance_loss_clip": 1.03002882, "balance_loss_mlp": 1.02145934, "epoch": 0.9095445663610401, "flos": 21944652094080.0, "grad_norm": 1.8998405875281965, "language_loss": 0.62571168, "learning_rate": 8.516449899618173e-08, "loss": 0.64632773, "num_input_tokens_seen": 326371810, "step": 15128, "time_per_iteration": 4.432798385620117 }, { "auxiliary_loss_clip": 0.01075251, "auxiliary_loss_mlp": 0.01024187, "balance_loss_clip": 1.03530467, "balance_loss_mlp": 1.01223636, "epoch": 0.9096046896137081, "flos": 19792848965760.0, "grad_norm": 1.7664774928724292, "language_loss": 0.76836801, "learning_rate": 8.505209531291013e-08, "loss": 0.78936237, "num_input_tokens_seen": 326391380, "step": 15129, "time_per_iteration": 4.206790447235107 }, { "auxiliary_loss_clip": 0.01096669, "auxiliary_loss_mlp": 0.01027809, "balance_loss_clip": 1.03541172, "balance_loss_mlp": 1.01559019, "epoch": 0.909664812866376, "flos": 22638087129600.0, "grad_norm": 1.9024505356481058, "language_loss": 0.83078182, "learning_rate": 8.49397642446552e-08, "loss": 0.85202664, "num_input_tokens_seen": 326408800, "step": 15130, "time_per_iteration": 2.6001152992248535 }, { "auxiliary_loss_clip": 0.0108696, "auxiliary_loss_mlp": 0.01032978, "balance_loss_clip": 1.0359422, "balance_loss_mlp": 1.01988339, "epoch": 0.909724936119044, "flos": 39852153020160.0, "grad_norm": 1.6192884326083825, "language_loss": 0.75177467, "learning_rate": 8.482750579567644e-08, "loss": 0.77297407, "num_input_tokens_seen": 326431565, "step": 15131, "time_per_iteration": 2.848465919494629 }, { "auxiliary_loss_clip": 0.01083451, "auxiliary_loss_mlp": 0.01035354, "balance_loss_clip": 1.03611147, "balance_loss_mlp": 1.02193737, "epoch": 0.9097850593717121, "flos": 35071616954880.0, "grad_norm": 1.8781997333884533, "language_loss": 0.599832, "learning_rate": 8.471531997023085e-08, "loss": 0.62102008, "num_input_tokens_seen": 326451715, "step": 15132, "time_per_iteration": 2.7317306995391846 }, { "auxiliary_loss_clip": 0.01068526, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.0371139, "balance_loss_mlp": 1.02007413, "epoch": 0.90984518262438, "flos": 23367468700800.0, "grad_norm": 1.7969799110161846, "language_loss": 0.82646108, "learning_rate": 8.460320677257193e-08, "loss": 0.84746432, "num_input_tokens_seen": 326470855, "step": 15133, "time_per_iteration": 2.666724920272827 }, { "auxiliary_loss_clip": 0.01084851, "auxiliary_loss_mlp": 0.01033684, "balance_loss_clip": 1.03276467, "balance_loss_mlp": 1.0209645, "epoch": 0.909905305877048, "flos": 27523302405120.0, "grad_norm": 1.9696626904627623, "language_loss": 0.74180704, "learning_rate": 8.449116620695118e-08, "loss": 0.76299238, "num_input_tokens_seen": 326490480, "step": 15134, "time_per_iteration": 2.7024521827697754 }, { "auxiliary_loss_clip": 0.01081442, "auxiliary_loss_mlp": 0.01032002, "balance_loss_clip": 1.03796458, "balance_loss_mlp": 1.01934206, "epoch": 0.9099654291297159, "flos": 24347865490560.0, "grad_norm": 1.5144614886506496, "language_loss": 0.72592616, "learning_rate": 8.437919827761786e-08, "loss": 0.74706054, "num_input_tokens_seen": 326509445, "step": 15135, "time_per_iteration": 2.7127246856689453 }, { "auxiliary_loss_clip": 0.01096766, "auxiliary_loss_mlp": 0.01030937, "balance_loss_clip": 1.03764153, "balance_loss_mlp": 1.01891482, "epoch": 0.9100255523823839, "flos": 21215234609280.0, "grad_norm": 1.683349330463744, "language_loss": 0.70137173, "learning_rate": 8.426730298881702e-08, "loss": 0.72264874, "num_input_tokens_seen": 326528380, "step": 15136, "time_per_iteration": 2.6193113327026367 }, { "auxiliary_loss_clip": 0.00990412, "auxiliary_loss_mlp": 0.01005783, "balance_loss_clip": 1.00657475, "balance_loss_mlp": 1.00484753, "epoch": 0.9100856756350518, "flos": 46052276446080.0, "grad_norm": 0.825688175716241, "language_loss": 0.59235996, "learning_rate": 8.415548034479214e-08, "loss": 0.61232191, "num_input_tokens_seen": 326576940, "step": 15137, "time_per_iteration": 3.083552837371826 }, { "auxiliary_loss_clip": 0.01098465, "auxiliary_loss_mlp": 0.01035683, "balance_loss_clip": 1.03574395, "balance_loss_mlp": 1.02372026, "epoch": 0.9101457988877198, "flos": 20229917656320.0, "grad_norm": 2.33052803483979, "language_loss": 0.82487237, "learning_rate": 8.40437303497834e-08, "loss": 0.84621382, "num_input_tokens_seen": 326596100, "step": 15138, "time_per_iteration": 2.674602508544922 }, { "auxiliary_loss_clip": 0.01094423, "auxiliary_loss_mlp": 0.01026368, "balance_loss_clip": 1.037696, "balance_loss_mlp": 1.01526928, "epoch": 0.9102059221403878, "flos": 26615157822720.0, "grad_norm": 1.5741555664538536, "language_loss": 0.81272125, "learning_rate": 8.39320530080283e-08, "loss": 0.83392918, "num_input_tokens_seen": 326615700, "step": 15139, "time_per_iteration": 2.694201946258545 }, { "auxiliary_loss_clip": 0.01076496, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.03743947, "balance_loss_mlp": 1.02160764, "epoch": 0.9102660453930558, "flos": 21908561904000.0, "grad_norm": 2.050091798744291, "language_loss": 0.77814442, "learning_rate": 8.382044832376167e-08, "loss": 0.79924583, "num_input_tokens_seen": 326635905, "step": 15140, "time_per_iteration": 2.722778558731079 }, { "auxiliary_loss_clip": 0.01106393, "auxiliary_loss_mlp": 0.01031373, "balance_loss_clip": 1.0352447, "balance_loss_mlp": 1.01943445, "epoch": 0.9103261686457237, "flos": 36176660916480.0, "grad_norm": 1.7205881923201032, "language_loss": 0.66666603, "learning_rate": 8.370891630121569e-08, "loss": 0.68804365, "num_input_tokens_seen": 326661855, "step": 15141, "time_per_iteration": 2.7130444049835205 }, { "auxiliary_loss_clip": 0.01095941, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.03542638, "balance_loss_mlp": 1.02527499, "epoch": 0.9103862918983917, "flos": 23878549365120.0, "grad_norm": 1.8944850892633267, "language_loss": 0.75325441, "learning_rate": 8.359745694462005e-08, "loss": 0.77458721, "num_input_tokens_seen": 326679320, "step": 15142, "time_per_iteration": 2.6429429054260254 }, { "auxiliary_loss_clip": 0.01069268, "auxiliary_loss_mlp": 0.01042478, "balance_loss_clip": 1.03122544, "balance_loss_mlp": 1.02982378, "epoch": 0.9104464151510596, "flos": 14939521989120.0, "grad_norm": 1.6543746947107703, "language_loss": 0.64361405, "learning_rate": 8.348607025820076e-08, "loss": 0.6647315, "num_input_tokens_seen": 326698110, "step": 15143, "time_per_iteration": 2.669706344604492 }, { "auxiliary_loss_clip": 0.01110746, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 1.03664672, "balance_loss_mlp": 1.02197671, "epoch": 0.9105065384037276, "flos": 33655803500160.0, "grad_norm": 1.826138803106712, "language_loss": 0.61111665, "learning_rate": 8.337475624618152e-08, "loss": 0.63257754, "num_input_tokens_seen": 326718370, "step": 15144, "time_per_iteration": 2.659849166870117 }, { "auxiliary_loss_clip": 0.01065641, "auxiliary_loss_mlp": 0.01027587, "balance_loss_clip": 1.0301441, "balance_loss_mlp": 1.01508248, "epoch": 0.9105666616563957, "flos": 24316695463680.0, "grad_norm": 1.5990370313133018, "language_loss": 0.70864612, "learning_rate": 8.326351491278382e-08, "loss": 0.72957838, "num_input_tokens_seen": 326738445, "step": 15145, "time_per_iteration": 2.685203790664673 }, { "auxiliary_loss_clip": 0.01047743, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.03243327, "balance_loss_mlp": 1.02036476, "epoch": 0.9106267849090636, "flos": 29971692132480.0, "grad_norm": 1.5060644265455205, "language_loss": 0.70642048, "learning_rate": 8.315234626222545e-08, "loss": 0.72722512, "num_input_tokens_seen": 326758855, "step": 15146, "time_per_iteration": 2.7676496505737305 }, { "auxiliary_loss_clip": 0.01085776, "auxiliary_loss_mlp": 0.01032662, "balance_loss_clip": 1.03418636, "balance_loss_mlp": 1.02068782, "epoch": 0.9106869081617316, "flos": 25337743470720.0, "grad_norm": 1.8260905410066164, "language_loss": 0.72899806, "learning_rate": 8.304125029872233e-08, "loss": 0.75018245, "num_input_tokens_seen": 326777140, "step": 15147, "time_per_iteration": 2.6421234607696533 }, { "auxiliary_loss_clip": 0.01081187, "auxiliary_loss_mlp": 0.01031004, "balance_loss_clip": 1.0361135, "balance_loss_mlp": 1.01835012, "epoch": 0.9107470314143995, "flos": 18187031543040.0, "grad_norm": 1.914291203586608, "language_loss": 0.80780458, "learning_rate": 8.293022702648711e-08, "loss": 0.82892644, "num_input_tokens_seen": 326794070, "step": 15148, "time_per_iteration": 2.6653599739074707 }, { "auxiliary_loss_clip": 0.01076044, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.03479314, "balance_loss_mlp": 1.02636874, "epoch": 0.9108071546670675, "flos": 23550828652800.0, "grad_norm": 2.087055328388918, "language_loss": 0.67585528, "learning_rate": 8.281927644972996e-08, "loss": 0.69700611, "num_input_tokens_seen": 326814695, "step": 15149, "time_per_iteration": 2.758857011795044 }, { "auxiliary_loss_clip": 0.01108552, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.03687429, "balance_loss_mlp": 1.01744866, "epoch": 0.9108672779197354, "flos": 25630307746560.0, "grad_norm": 1.9181044295268432, "language_loss": 0.63203096, "learning_rate": 8.270839857265776e-08, "loss": 0.65342128, "num_input_tokens_seen": 326835295, "step": 15150, "time_per_iteration": 2.650240898132324 }, { "auxiliary_loss_clip": 0.01066309, "auxiliary_loss_mlp": 0.01031448, "balance_loss_clip": 1.03402328, "balance_loss_mlp": 1.01881194, "epoch": 0.9109274011724035, "flos": 22339094319360.0, "grad_norm": 2.2733833539943333, "language_loss": 0.72643161, "learning_rate": 8.259759339947514e-08, "loss": 0.74740922, "num_input_tokens_seen": 326853350, "step": 15151, "time_per_iteration": 2.706934690475464 }, { "auxiliary_loss_clip": 0.01095436, "auxiliary_loss_mlp": 0.01029482, "balance_loss_clip": 1.03496432, "balance_loss_mlp": 1.01727509, "epoch": 0.9109875244250714, "flos": 26688200129280.0, "grad_norm": 1.648582433866266, "language_loss": 0.64558387, "learning_rate": 8.248686093438429e-08, "loss": 0.66683304, "num_input_tokens_seen": 326873425, "step": 15152, "time_per_iteration": 2.699647903442383 }, { "auxiliary_loss_clip": 0.0108822, "auxiliary_loss_mlp": 0.00770055, "balance_loss_clip": 1.03658628, "balance_loss_mlp": 1.00032091, "epoch": 0.9110476476777394, "flos": 22930112701440.0, "grad_norm": 1.8488661615298092, "language_loss": 0.73683035, "learning_rate": 8.23762011815834e-08, "loss": 0.75541312, "num_input_tokens_seen": 326893455, "step": 15153, "time_per_iteration": 2.6998884677886963 }, { "auxiliary_loss_clip": 0.01067073, "auxiliary_loss_mlp": 0.01050891, "balance_loss_clip": 1.03213048, "balance_loss_mlp": 1.03591788, "epoch": 0.9111077709304073, "flos": 13472857854720.0, "grad_norm": 2.3413318457237775, "language_loss": 0.72122753, "learning_rate": 8.226561414526956e-08, "loss": 0.74240714, "num_input_tokens_seen": 326910210, "step": 15154, "time_per_iteration": 2.683474540710449 }, { "auxiliary_loss_clip": 0.01088157, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.03857923, "balance_loss_mlp": 1.02037024, "epoch": 0.9111678941830753, "flos": 20850561780480.0, "grad_norm": 1.7345027920232028, "language_loss": 0.82108957, "learning_rate": 8.215509982963564e-08, "loss": 0.84229481, "num_input_tokens_seen": 326929350, "step": 15155, "time_per_iteration": 2.7106335163116455 }, { "auxiliary_loss_clip": 0.01096529, "auxiliary_loss_mlp": 0.01031415, "balance_loss_clip": 1.03773642, "balance_loss_mlp": 1.01885629, "epoch": 0.9112280174357432, "flos": 19682244011520.0, "grad_norm": 1.8052926393059447, "language_loss": 0.5958488, "learning_rate": 8.204465823887252e-08, "loss": 0.61712825, "num_input_tokens_seen": 326949060, "step": 15156, "time_per_iteration": 2.6679844856262207 }, { "auxiliary_loss_clip": 0.01099444, "auxiliary_loss_mlp": 0.01028039, "balance_loss_clip": 1.03477848, "balance_loss_mlp": 1.01498008, "epoch": 0.9112881406884112, "flos": 25447163276160.0, "grad_norm": 2.321869813201265, "language_loss": 0.74290884, "learning_rate": 8.193428937716796e-08, "loss": 0.76418364, "num_input_tokens_seen": 326968950, "step": 15157, "time_per_iteration": 2.6687350273132324 }, { "auxiliary_loss_clip": 0.01063031, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.03153825, "balance_loss_mlp": 1.02228022, "epoch": 0.9113482639410793, "flos": 33066975847680.0, "grad_norm": 1.6132914581945528, "language_loss": 0.59553444, "learning_rate": 8.182399324870747e-08, "loss": 0.61650229, "num_input_tokens_seen": 326989455, "step": 15158, "time_per_iteration": 2.8011231422424316 }, { "auxiliary_loss_clip": 0.01050049, "auxiliary_loss_mlp": 0.01031146, "balance_loss_clip": 1.03574824, "balance_loss_mlp": 1.01942158, "epoch": 0.9114083871937472, "flos": 21835591424640.0, "grad_norm": 2.2386737595671047, "language_loss": 0.68004364, "learning_rate": 8.171376985767375e-08, "loss": 0.70085549, "num_input_tokens_seen": 327009640, "step": 15159, "time_per_iteration": 2.772341251373291 }, { "auxiliary_loss_clip": 0.01087373, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.03617239, "balance_loss_mlp": 1.0176506, "epoch": 0.9114685104464152, "flos": 27088999061760.0, "grad_norm": 2.7938055787234015, "language_loss": 0.78473425, "learning_rate": 8.160361920824588e-08, "loss": 0.8059029, "num_input_tokens_seen": 327027690, "step": 15160, "time_per_iteration": 2.7388458251953125 }, { "auxiliary_loss_clip": 0.01111531, "auxiliary_loss_mlp": 0.01029053, "balance_loss_clip": 1.03913951, "balance_loss_mlp": 1.01570201, "epoch": 0.9115286336990831, "flos": 17967042696960.0, "grad_norm": 1.6224624723660812, "language_loss": 0.69028407, "learning_rate": 8.149354130460073e-08, "loss": 0.71168995, "num_input_tokens_seen": 327045915, "step": 15161, "time_per_iteration": 2.6148221492767334 }, { "auxiliary_loss_clip": 0.01060884, "auxiliary_loss_mlp": 0.01040252, "balance_loss_clip": 1.03252292, "balance_loss_mlp": 1.02619767, "epoch": 0.9115887569517511, "flos": 22929861306240.0, "grad_norm": 1.7334002472530148, "language_loss": 0.7660948, "learning_rate": 8.138353615091321e-08, "loss": 0.78710622, "num_input_tokens_seen": 327066355, "step": 15162, "time_per_iteration": 2.938532590866089 }, { "auxiliary_loss_clip": 0.01082027, "auxiliary_loss_mlp": 0.01032242, "balance_loss_clip": 1.03714919, "balance_loss_mlp": 1.01954055, "epoch": 0.911648880204419, "flos": 23988436047360.0, "grad_norm": 1.8353414047586432, "language_loss": 0.66910523, "learning_rate": 8.127360375135395e-08, "loss": 0.69024795, "num_input_tokens_seen": 327086735, "step": 15163, "time_per_iteration": 2.6603245735168457 }, { "auxiliary_loss_clip": 0.01066197, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.03512335, "balance_loss_mlp": 1.0209856, "epoch": 0.911709003457087, "flos": 17055306754560.0, "grad_norm": 7.686069864980859, "language_loss": 0.70642608, "learning_rate": 8.116374411009186e-08, "loss": 0.72742647, "num_input_tokens_seen": 327104035, "step": 15164, "time_per_iteration": 2.7450454235076904 }, { "auxiliary_loss_clip": 0.01108615, "auxiliary_loss_mlp": 0.01031494, "balance_loss_clip": 1.03994727, "balance_loss_mlp": 1.01950121, "epoch": 0.911769126709755, "flos": 21653344794240.0, "grad_norm": 1.5696959903057297, "language_loss": 0.76052606, "learning_rate": 8.105395723129315e-08, "loss": 0.78192717, "num_input_tokens_seen": 327124370, "step": 15165, "time_per_iteration": 2.588705062866211 }, { "auxiliary_loss_clip": 0.01093363, "auxiliary_loss_mlp": 0.01033622, "balance_loss_clip": 1.03510165, "balance_loss_mlp": 1.02148008, "epoch": 0.911829249962423, "flos": 24790321221120.0, "grad_norm": 2.0749050237393423, "language_loss": 0.72525322, "learning_rate": 8.094424311912074e-08, "loss": 0.74652308, "num_input_tokens_seen": 327140915, "step": 15166, "time_per_iteration": 4.245245933532715 }, { "auxiliary_loss_clip": 0.01060198, "auxiliary_loss_mlp": 0.01038217, "balance_loss_clip": 1.03464365, "balance_loss_mlp": 1.02491355, "epoch": 0.9118893732150909, "flos": 20959406968320.0, "grad_norm": 1.8141703562808917, "language_loss": 0.73241115, "learning_rate": 8.083460177773482e-08, "loss": 0.75339532, "num_input_tokens_seen": 327158940, "step": 15167, "time_per_iteration": 5.897623062133789 }, { "auxiliary_loss_clip": 0.0101816, "auxiliary_loss_mlp": 0.00998888, "balance_loss_clip": 1.01390624, "balance_loss_mlp": 0.99787515, "epoch": 0.9119494964677589, "flos": 67917385872000.0, "grad_norm": 0.7753194150086553, "language_loss": 0.65546739, "learning_rate": 8.072503321129298e-08, "loss": 0.67563796, "num_input_tokens_seen": 327217450, "step": 15168, "time_per_iteration": 3.210770845413208 }, { "auxiliary_loss_clip": 0.01078881, "auxiliary_loss_mlp": 0.01031567, "balance_loss_clip": 1.03501606, "balance_loss_mlp": 1.01959848, "epoch": 0.9120096197204268, "flos": 18551524803840.0, "grad_norm": 1.9364628157336585, "language_loss": 0.78129464, "learning_rate": 8.061553742395033e-08, "loss": 0.80239916, "num_input_tokens_seen": 327233905, "step": 15169, "time_per_iteration": 4.273360729217529 }, { "auxiliary_loss_clip": 0.01097706, "auxiliary_loss_mlp": 0.0103039, "balance_loss_clip": 1.03744388, "balance_loss_mlp": 1.01839125, "epoch": 0.9120697429730948, "flos": 19025725178880.0, "grad_norm": 1.8060821353354455, "language_loss": 0.81748688, "learning_rate": 8.05061144198591e-08, "loss": 0.83876789, "num_input_tokens_seen": 327252430, "step": 15170, "time_per_iteration": 2.6498122215270996 }, { "auxiliary_loss_clip": 0.01100439, "auxiliary_loss_mlp": 0.01030541, "balance_loss_clip": 1.03837538, "balance_loss_mlp": 1.01746333, "epoch": 0.9121298662257629, "flos": 17163685065600.0, "grad_norm": 2.097374036278885, "language_loss": 0.76902175, "learning_rate": 8.039676420316799e-08, "loss": 0.79033154, "num_input_tokens_seen": 327269215, "step": 15171, "time_per_iteration": 2.6777992248535156 }, { "auxiliary_loss_clip": 0.01025503, "auxiliary_loss_mlp": 0.01038378, "balance_loss_clip": 1.03109252, "balance_loss_mlp": 1.02510428, "epoch": 0.9121899894784308, "flos": 19682710888320.0, "grad_norm": 1.2924384179927062, "language_loss": 0.66694897, "learning_rate": 8.02874867780241e-08, "loss": 0.68758774, "num_input_tokens_seen": 327290320, "step": 15172, "time_per_iteration": 2.851702928543091 }, { "auxiliary_loss_clip": 0.01079756, "auxiliary_loss_mlp": 0.01033459, "balance_loss_clip": 1.03634048, "balance_loss_mlp": 1.02087665, "epoch": 0.9122501127310988, "flos": 22235743912320.0, "grad_norm": 1.6696295487638473, "language_loss": 0.74975204, "learning_rate": 8.017828214857103e-08, "loss": 0.77088416, "num_input_tokens_seen": 327310150, "step": 15173, "time_per_iteration": 2.6567437648773193 }, { "auxiliary_loss_clip": 0.01093131, "auxiliary_loss_mlp": 0.01034759, "balance_loss_clip": 1.03830385, "balance_loss_mlp": 1.02032316, "epoch": 0.9123102359837667, "flos": 15957122290560.0, "grad_norm": 5.127558879454518, "language_loss": 0.6578263, "learning_rate": 8.00691503189499e-08, "loss": 0.67910528, "num_input_tokens_seen": 327326660, "step": 15174, "time_per_iteration": 2.6690120697021484 }, { "auxiliary_loss_clip": 0.01096653, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.03507042, "balance_loss_mlp": 1.01747251, "epoch": 0.9123703592364347, "flos": 25155784149120.0, "grad_norm": 1.9591497521426535, "language_loss": 0.74826527, "learning_rate": 7.996009129329894e-08, "loss": 0.76954854, "num_input_tokens_seen": 327346700, "step": 15175, "time_per_iteration": 2.6358284950256348 }, { "auxiliary_loss_clip": 0.01017603, "auxiliary_loss_mlp": 0.01002357, "balance_loss_clip": 1.00503564, "balance_loss_mlp": 1.00146246, "epoch": 0.9124304824891026, "flos": 60801650812800.0, "grad_norm": 0.9602139847905503, "language_loss": 0.58486784, "learning_rate": 7.985110507575421e-08, "loss": 0.60506743, "num_input_tokens_seen": 327403050, "step": 15176, "time_per_iteration": 3.1978743076324463 }, { "auxiliary_loss_clip": 0.01083812, "auxiliary_loss_mlp": 0.01036933, "balance_loss_clip": 1.03273082, "balance_loss_mlp": 1.02405846, "epoch": 0.9124906057417707, "flos": 18150941352960.0, "grad_norm": 1.6481113085508423, "language_loss": 0.65639609, "learning_rate": 7.97421916704475e-08, "loss": 0.67760354, "num_input_tokens_seen": 327422225, "step": 15177, "time_per_iteration": 2.6916801929473877 }, { "auxiliary_loss_clip": 0.0107591, "auxiliary_loss_mlp": 0.01028967, "balance_loss_clip": 1.03282464, "balance_loss_mlp": 1.01652193, "epoch": 0.9125507289944386, "flos": 11686769049600.0, "grad_norm": 2.237261929253729, "language_loss": 0.81215572, "learning_rate": 7.963335108150926e-08, "loss": 0.83320451, "num_input_tokens_seen": 327437025, "step": 15178, "time_per_iteration": 2.6279830932617188 }, { "auxiliary_loss_clip": 0.01049012, "auxiliary_loss_mlp": 0.01039158, "balance_loss_clip": 1.03083158, "balance_loss_mlp": 1.02516901, "epoch": 0.9126108522471066, "flos": 17748813617280.0, "grad_norm": 2.000734331425356, "language_loss": 0.79079652, "learning_rate": 7.952458331306711e-08, "loss": 0.81167829, "num_input_tokens_seen": 327453915, "step": 15179, "time_per_iteration": 2.675297737121582 }, { "auxiliary_loss_clip": 0.01084629, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.0357244, "balance_loss_mlp": 1.02000451, "epoch": 0.9126709754997745, "flos": 27635738952960.0, "grad_norm": 1.5039394152550116, "language_loss": 0.67973173, "learning_rate": 7.941588836924507e-08, "loss": 0.70089382, "num_input_tokens_seen": 327474415, "step": 15180, "time_per_iteration": 2.697028875350952 }, { "auxiliary_loss_clip": 0.0109496, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 1.03393316, "balance_loss_mlp": 1.01655757, "epoch": 0.9127310987524425, "flos": 15924982596480.0, "grad_norm": 1.6922587349839364, "language_loss": 0.75127202, "learning_rate": 7.930726625416495e-08, "loss": 0.77250075, "num_input_tokens_seen": 327492750, "step": 15181, "time_per_iteration": 2.6039087772369385 }, { "auxiliary_loss_clip": 0.01113895, "auxiliary_loss_mlp": 0.01031043, "balance_loss_clip": 1.03893065, "balance_loss_mlp": 1.01871705, "epoch": 0.9127912220051104, "flos": 21536885923200.0, "grad_norm": 4.263529248122138, "language_loss": 0.74789053, "learning_rate": 7.919871697194614e-08, "loss": 0.76933992, "num_input_tokens_seen": 327509470, "step": 15182, "time_per_iteration": 2.5808985233306885 }, { "auxiliary_loss_clip": 0.01109967, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.036412, "balance_loss_mlp": 1.01767075, "epoch": 0.9128513452577784, "flos": 24063561342720.0, "grad_norm": 1.4783992665801426, "language_loss": 0.7637254, "learning_rate": 7.909024052670421e-08, "loss": 0.78512818, "num_input_tokens_seen": 327530520, "step": 15183, "time_per_iteration": 2.690436601638794 }, { "auxiliary_loss_clip": 0.0109821, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.03847337, "balance_loss_mlp": 1.01838112, "epoch": 0.9129114685104465, "flos": 16216469464320.0, "grad_norm": 2.432279077679038, "language_loss": 0.76472139, "learning_rate": 7.898183692255256e-08, "loss": 0.78601527, "num_input_tokens_seen": 327546960, "step": 15184, "time_per_iteration": 2.643298864364624 }, { "auxiliary_loss_clip": 0.01093284, "auxiliary_loss_mlp": 0.01035355, "balance_loss_clip": 1.03755832, "balance_loss_mlp": 1.02360058, "epoch": 0.9129715917631144, "flos": 19384364522880.0, "grad_norm": 1.6196695380751174, "language_loss": 0.74525392, "learning_rate": 7.887350616360233e-08, "loss": 0.76654035, "num_input_tokens_seen": 327564830, "step": 15185, "time_per_iteration": 2.5846035480499268 }, { "auxiliary_loss_clip": 0.0108538, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.03683412, "balance_loss_mlp": 1.01925135, "epoch": 0.9130317150157824, "flos": 20590460421120.0, "grad_norm": 2.0406191594638257, "language_loss": 0.68331826, "learning_rate": 7.876524825396158e-08, "loss": 0.70449007, "num_input_tokens_seen": 327583675, "step": 15186, "time_per_iteration": 2.6857335567474365 }, { "auxiliary_loss_clip": 0.01089556, "auxiliary_loss_mlp": 0.01041285, "balance_loss_clip": 1.03548872, "balance_loss_mlp": 1.02558517, "epoch": 0.9130918382684503, "flos": 20189230525440.0, "grad_norm": 2.094200267173926, "language_loss": 0.77826124, "learning_rate": 7.865706319773502e-08, "loss": 0.79956973, "num_input_tokens_seen": 327602280, "step": 15187, "time_per_iteration": 2.707458972930908 }, { "auxiliary_loss_clip": 0.01108019, "auxiliary_loss_mlp": 0.007702, "balance_loss_clip": 1.03599858, "balance_loss_mlp": 1.00022209, "epoch": 0.9131519615211183, "flos": 25556870390400.0, "grad_norm": 6.79519157361436, "language_loss": 0.65794706, "learning_rate": 7.854895099902515e-08, "loss": 0.6767292, "num_input_tokens_seen": 327623515, "step": 15188, "time_per_iteration": 2.6106925010681152 }, { "auxiliary_loss_clip": 0.0103354, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.02865291, "balance_loss_mlp": 1.02201962, "epoch": 0.9132120847737862, "flos": 17931563038080.0, "grad_norm": 1.7656682346209025, "language_loss": 0.76258671, "learning_rate": 7.844091166193157e-08, "loss": 0.78327841, "num_input_tokens_seen": 327642875, "step": 15189, "time_per_iteration": 2.8081729412078857 }, { "auxiliary_loss_clip": 0.0109744, "auxiliary_loss_mlp": 0.01029243, "balance_loss_clip": 1.03559053, "balance_loss_mlp": 1.0180254, "epoch": 0.9132722080264543, "flos": 20047635112320.0, "grad_norm": 1.7520638649774822, "language_loss": 0.75371557, "learning_rate": 7.8332945190551e-08, "loss": 0.77498239, "num_input_tokens_seen": 327662450, "step": 15190, "time_per_iteration": 2.6704981327056885 }, { "auxiliary_loss_clip": 0.01019225, "auxiliary_loss_mlp": 0.01003714, "balance_loss_clip": 1.00641704, "balance_loss_mlp": 1.00264728, "epoch": 0.9133323312791222, "flos": 70439967141120.0, "grad_norm": 0.7014520418780014, "language_loss": 0.57308424, "learning_rate": 7.822505158897797e-08, "loss": 0.59331357, "num_input_tokens_seen": 327723845, "step": 15191, "time_per_iteration": 3.21588134765625 }, { "auxiliary_loss_clip": 0.01113051, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.03901196, "balance_loss_mlp": 1.02014041, "epoch": 0.9133924545317902, "flos": 25483792170240.0, "grad_norm": 1.7022640616397489, "language_loss": 0.74351078, "learning_rate": 7.81172308613034e-08, "loss": 0.76497352, "num_input_tokens_seen": 327742590, "step": 15192, "time_per_iteration": 2.615525245666504 }, { "auxiliary_loss_clip": 0.01096745, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.03728342, "balance_loss_mlp": 1.01645255, "epoch": 0.9134525777844581, "flos": 39930690107520.0, "grad_norm": 1.536018225691407, "language_loss": 0.69412756, "learning_rate": 7.800948301161647e-08, "loss": 0.71538246, "num_input_tokens_seen": 327764350, "step": 15193, "time_per_iteration": 2.774912118911743 }, { "auxiliary_loss_clip": 0.01095342, "auxiliary_loss_mlp": 0.0103875, "balance_loss_clip": 1.03767395, "balance_loss_mlp": 1.02737117, "epoch": 0.9135127010371261, "flos": 20886723797760.0, "grad_norm": 1.712567345292954, "language_loss": 0.73434842, "learning_rate": 7.790180804400215e-08, "loss": 0.75568932, "num_input_tokens_seen": 327783120, "step": 15194, "time_per_iteration": 2.581974983215332 }, { "auxiliary_loss_clip": 0.01063051, "auxiliary_loss_mlp": 0.01041182, "balance_loss_clip": 1.03309762, "balance_loss_mlp": 1.02517855, "epoch": 0.913572824289794, "flos": 20813250528000.0, "grad_norm": 1.8550488642948777, "language_loss": 0.61682135, "learning_rate": 7.779420596254383e-08, "loss": 0.63786364, "num_input_tokens_seen": 327801960, "step": 15195, "time_per_iteration": 2.881197929382324 }, { "auxiliary_loss_clip": 0.01098691, "auxiliary_loss_mlp": 0.01034267, "balance_loss_clip": 1.03617358, "balance_loss_mlp": 1.02182126, "epoch": 0.913632947542462, "flos": 25703278225920.0, "grad_norm": 1.4758121064048373, "language_loss": 0.71160495, "learning_rate": 7.768667677132201e-08, "loss": 0.73293453, "num_input_tokens_seen": 327823795, "step": 15196, "time_per_iteration": 2.6203744411468506 }, { "auxiliary_loss_clip": 0.01084959, "auxiliary_loss_mlp": 0.01035793, "balance_loss_clip": 1.034657, "balance_loss_mlp": 1.02372885, "epoch": 0.9136930707951301, "flos": 26286216048000.0, "grad_norm": 1.471790705908436, "language_loss": 0.71344984, "learning_rate": 7.757922047441411e-08, "loss": 0.73465735, "num_input_tokens_seen": 327845175, "step": 15197, "time_per_iteration": 2.6849207878112793 }, { "auxiliary_loss_clip": 0.01088436, "auxiliary_loss_mlp": 0.01027135, "balance_loss_clip": 1.03387213, "balance_loss_mlp": 1.01404572, "epoch": 0.913753194047798, "flos": 22091885942400.0, "grad_norm": 1.7806883440096042, "language_loss": 0.7787807, "learning_rate": 7.747183707589489e-08, "loss": 0.79993641, "num_input_tokens_seen": 327863150, "step": 15198, "time_per_iteration": 2.629854202270508 }, { "auxiliary_loss_clip": 0.01089748, "auxiliary_loss_mlp": 0.01030529, "balance_loss_clip": 1.03545046, "balance_loss_mlp": 1.01816726, "epoch": 0.913813317300466, "flos": 23587206151680.0, "grad_norm": 1.509412256528269, "language_loss": 0.67781103, "learning_rate": 7.736452657983616e-08, "loss": 0.69901383, "num_input_tokens_seen": 327883445, "step": 15199, "time_per_iteration": 2.6181437969207764 }, { "auxiliary_loss_clip": 0.01097631, "auxiliary_loss_mlp": 0.00769993, "balance_loss_clip": 1.03525543, "balance_loss_mlp": 1.00025439, "epoch": 0.9138734405531339, "flos": 28876452583680.0, "grad_norm": 1.5467213284534869, "language_loss": 0.67587829, "learning_rate": 7.725728899030714e-08, "loss": 0.69455445, "num_input_tokens_seen": 327905745, "step": 15200, "time_per_iteration": 2.768298387527466 }, { "auxiliary_loss_clip": 0.0109491, "auxiliary_loss_mlp": 0.01031874, "balance_loss_clip": 1.03708506, "balance_loss_mlp": 1.020787, "epoch": 0.9139335638058019, "flos": 22821087945600.0, "grad_norm": 1.5631891180078048, "language_loss": 0.71305549, "learning_rate": 7.715012431137435e-08, "loss": 0.73432332, "num_input_tokens_seen": 327925435, "step": 15201, "time_per_iteration": 2.6898791790008545 }, { "auxiliary_loss_clip": 0.01096112, "auxiliary_loss_mlp": 0.01027534, "balance_loss_clip": 1.03487992, "balance_loss_mlp": 1.01640594, "epoch": 0.9139936870584698, "flos": 18004174381440.0, "grad_norm": 1.9050793527303824, "language_loss": 0.70880222, "learning_rate": 7.704303254710165e-08, "loss": 0.73003864, "num_input_tokens_seen": 327944145, "step": 15202, "time_per_iteration": 2.645087718963623 }, { "auxiliary_loss_clip": 0.01107696, "auxiliary_loss_mlp": 0.01031158, "balance_loss_clip": 1.03578544, "balance_loss_mlp": 1.01858711, "epoch": 0.9140538103111379, "flos": 15813767111040.0, "grad_norm": 5.183790549538575, "language_loss": 0.66272342, "learning_rate": 7.693601370155001e-08, "loss": 0.68411195, "num_input_tokens_seen": 327960565, "step": 15203, "time_per_iteration": 2.5569849014282227 }, { "auxiliary_loss_clip": 0.01099433, "auxiliary_loss_mlp": 0.0102949, "balance_loss_clip": 1.03735852, "balance_loss_mlp": 1.01664543, "epoch": 0.9141139335638058, "flos": 23987035416960.0, "grad_norm": 1.5350852350505626, "language_loss": 0.68632525, "learning_rate": 7.682906777877751e-08, "loss": 0.70761448, "num_input_tokens_seen": 327981180, "step": 15204, "time_per_iteration": 2.609595537185669 }, { "auxiliary_loss_clip": 0.01096665, "auxiliary_loss_mlp": 0.01024903, "balance_loss_clip": 1.03312159, "balance_loss_mlp": 1.01215935, "epoch": 0.9141740568164738, "flos": 24024418496640.0, "grad_norm": 1.940906740500505, "language_loss": 0.59392846, "learning_rate": 7.672219478283915e-08, "loss": 0.61514413, "num_input_tokens_seen": 328001500, "step": 15205, "time_per_iteration": 4.150220632553101 }, { "auxiliary_loss_clip": 0.01065472, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.03354537, "balance_loss_mlp": 1.01977837, "epoch": 0.9142341800691417, "flos": 27018291139200.0, "grad_norm": 1.7151218871860374, "language_loss": 0.81336343, "learning_rate": 7.661539471778811e-08, "loss": 0.83434355, "num_input_tokens_seen": 328023025, "step": 15206, "time_per_iteration": 4.417832612991333 }, { "auxiliary_loss_clip": 0.01062676, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.03224123, "balance_loss_mlp": 1.01588321, "epoch": 0.9142943033218097, "flos": 20412487509120.0, "grad_norm": 2.7859643949116695, "language_loss": 0.73940361, "learning_rate": 7.650866758767382e-08, "loss": 0.76031995, "num_input_tokens_seen": 328041410, "step": 15207, "time_per_iteration": 2.729606866836548 }, { "auxiliary_loss_clip": 0.01068037, "auxiliary_loss_mlp": 0.01037257, "balance_loss_clip": 1.04014826, "balance_loss_mlp": 1.02391171, "epoch": 0.9143544265744776, "flos": 19755322231680.0, "grad_norm": 1.6574585771542836, "language_loss": 0.7323935, "learning_rate": 7.640201339654373e-08, "loss": 0.75344646, "num_input_tokens_seen": 328060495, "step": 15208, "time_per_iteration": 4.227857351303101 }, { "auxiliary_loss_clip": 0.01091165, "auxiliary_loss_mlp": 0.01027772, "balance_loss_clip": 1.03750086, "balance_loss_mlp": 1.01647067, "epoch": 0.9144145498271457, "flos": 17165444832000.0, "grad_norm": 2.0923542291564545, "language_loss": 0.8601079, "learning_rate": 7.629543214844237e-08, "loss": 0.88129735, "num_input_tokens_seen": 328076905, "step": 15209, "time_per_iteration": 2.590949058532715 }, { "auxiliary_loss_clip": 0.01091262, "auxiliary_loss_mlp": 0.01034147, "balance_loss_clip": 1.04051423, "balance_loss_mlp": 1.0222261, "epoch": 0.9144746730798137, "flos": 23726072131200.0, "grad_norm": 1.9387336499719838, "language_loss": 0.75063741, "learning_rate": 7.618892384741093e-08, "loss": 0.77189153, "num_input_tokens_seen": 328096960, "step": 15210, "time_per_iteration": 2.6469690799713135 }, { "auxiliary_loss_clip": 0.01083487, "auxiliary_loss_mlp": 0.01032807, "balance_loss_clip": 1.03146422, "balance_loss_mlp": 1.02025414, "epoch": 0.9145347963324816, "flos": 25847854467840.0, "grad_norm": 2.0189583543818994, "language_loss": 0.78215957, "learning_rate": 7.6082488497488e-08, "loss": 0.80332255, "num_input_tokens_seen": 328115445, "step": 15211, "time_per_iteration": 2.6844332218170166 }, { "auxiliary_loss_clip": 0.01100808, "auxiliary_loss_mlp": 0.01026537, "balance_loss_clip": 1.03790462, "balance_loss_mlp": 1.01447928, "epoch": 0.9145949195851496, "flos": 19242769109760.0, "grad_norm": 1.6970166038949297, "language_loss": 0.82861638, "learning_rate": 7.597612610270986e-08, "loss": 0.84988987, "num_input_tokens_seen": 328133965, "step": 15212, "time_per_iteration": 2.670666217803955 }, { "auxiliary_loss_clip": 0.01095988, "auxiliary_loss_mlp": 0.01029094, "balance_loss_clip": 1.03628695, "balance_loss_mlp": 1.01744699, "epoch": 0.9146550428378175, "flos": 18296379521280.0, "grad_norm": 1.816708490158756, "language_loss": 0.83801937, "learning_rate": 7.586983666711022e-08, "loss": 0.85927022, "num_input_tokens_seen": 328151520, "step": 15213, "time_per_iteration": 2.5807952880859375 }, { "auxiliary_loss_clip": 0.01092484, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.03717518, "balance_loss_mlp": 1.01593268, "epoch": 0.9147151660904855, "flos": 20084264006400.0, "grad_norm": 1.7762329213074084, "language_loss": 0.70716697, "learning_rate": 7.576362019471894e-08, "loss": 0.72836697, "num_input_tokens_seen": 328171275, "step": 15214, "time_per_iteration": 2.606302499771118 }, { "auxiliary_loss_clip": 0.01100282, "auxiliary_loss_mlp": 0.0103609, "balance_loss_clip": 1.03756428, "balance_loss_mlp": 1.02288795, "epoch": 0.9147752893431534, "flos": 24389127239040.0, "grad_norm": 2.6763056235741876, "language_loss": 0.62738419, "learning_rate": 7.565747668956413e-08, "loss": 0.64874792, "num_input_tokens_seen": 328192115, "step": 15215, "time_per_iteration": 2.624128580093384 }, { "auxiliary_loss_clip": 0.01083257, "auxiliary_loss_mlp": 0.01031489, "balance_loss_clip": 1.04120791, "balance_loss_mlp": 1.0186621, "epoch": 0.9148354125958215, "flos": 18150402648960.0, "grad_norm": 2.856196608513459, "language_loss": 0.75838691, "learning_rate": 7.555140615567058e-08, "loss": 0.77953434, "num_input_tokens_seen": 328208990, "step": 15216, "time_per_iteration": 2.683112144470215 }, { "auxiliary_loss_clip": 0.01082061, "auxiliary_loss_mlp": 0.0104043, "balance_loss_clip": 1.0344038, "balance_loss_mlp": 1.02597594, "epoch": 0.9148955358484894, "flos": 23367540528000.0, "grad_norm": 2.1556116302861223, "language_loss": 0.679968, "learning_rate": 7.544540859706062e-08, "loss": 0.70119286, "num_input_tokens_seen": 328227840, "step": 15217, "time_per_iteration": 2.7583320140838623 }, { "auxiliary_loss_clip": 0.01096251, "auxiliary_loss_mlp": 0.01034398, "balance_loss_clip": 1.03755021, "balance_loss_mlp": 1.02222061, "epoch": 0.9149556591011574, "flos": 18076498416000.0, "grad_norm": 1.7866598830816114, "language_loss": 0.79880273, "learning_rate": 7.533948401775347e-08, "loss": 0.82010925, "num_input_tokens_seen": 328246250, "step": 15218, "time_per_iteration": 2.5897185802459717 }, { "auxiliary_loss_clip": 0.0099941, "auxiliary_loss_mlp": 0.00999986, "balance_loss_clip": 1.00879896, "balance_loss_mlp": 0.99891329, "epoch": 0.9150157823538253, "flos": 54586374825600.0, "grad_norm": 0.8465659506320653, "language_loss": 0.59200621, "learning_rate": 7.523363242176595e-08, "loss": 0.61200017, "num_input_tokens_seen": 328303625, "step": 15219, "time_per_iteration": 3.1801815032958984 }, { "auxiliary_loss_clip": 0.01096152, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.03535295, "balance_loss_mlp": 1.0223403, "epoch": 0.9150759056064933, "flos": 17893102550400.0, "grad_norm": 2.4543943314261063, "language_loss": 0.78340375, "learning_rate": 7.512785381311216e-08, "loss": 0.80470991, "num_input_tokens_seen": 328322135, "step": 15220, "time_per_iteration": 2.595521926879883 }, { "auxiliary_loss_clip": 0.01057387, "auxiliary_loss_mlp": 0.01042337, "balance_loss_clip": 1.03327441, "balance_loss_mlp": 1.02777517, "epoch": 0.9151360288591612, "flos": 18073517587200.0, "grad_norm": 2.0267534769754683, "language_loss": 0.66091788, "learning_rate": 7.50221481958031e-08, "loss": 0.68191504, "num_input_tokens_seen": 328340750, "step": 15221, "time_per_iteration": 2.7066280841827393 }, { "auxiliary_loss_clip": 0.01086188, "auxiliary_loss_mlp": 0.01031232, "balance_loss_clip": 1.0361774, "balance_loss_mlp": 1.01978827, "epoch": 0.9151961521118293, "flos": 19354523299200.0, "grad_norm": 1.6413784171664523, "language_loss": 0.84243524, "learning_rate": 7.491651557384692e-08, "loss": 0.86360949, "num_input_tokens_seen": 328359995, "step": 15222, "time_per_iteration": 2.6501386165618896 }, { "auxiliary_loss_clip": 0.01014171, "auxiliary_loss_mlp": 0.0100656, "balance_loss_clip": 1.01053584, "balance_loss_mlp": 1.00542736, "epoch": 0.9152562753644973, "flos": 72146621018880.0, "grad_norm": 0.7238738726338669, "language_loss": 0.49580848, "learning_rate": 7.481095595124953e-08, "loss": 0.51601577, "num_input_tokens_seen": 328426865, "step": 15223, "time_per_iteration": 3.214282751083374 }, { "auxiliary_loss_clip": 0.01078844, "auxiliary_loss_mlp": 0.01037006, "balance_loss_clip": 1.03739119, "balance_loss_mlp": 1.02367282, "epoch": 0.9153163986171652, "flos": 20777016683520.0, "grad_norm": 2.2306023467876175, "language_loss": 0.72199959, "learning_rate": 7.470546933201349e-08, "loss": 0.7431581, "num_input_tokens_seen": 328445970, "step": 15224, "time_per_iteration": 2.673509359359741 }, { "auxiliary_loss_clip": 0.01093298, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.03519857, "balance_loss_mlp": 1.01645935, "epoch": 0.9153765218698332, "flos": 23040107124480.0, "grad_norm": 1.873148880683522, "language_loss": 0.81030774, "learning_rate": 7.460005572013895e-08, "loss": 0.83153254, "num_input_tokens_seen": 328464585, "step": 15225, "time_per_iteration": 2.5755882263183594 }, { "auxiliary_loss_clip": 0.01105808, "auxiliary_loss_mlp": 0.01023692, "balance_loss_clip": 1.03513598, "balance_loss_mlp": 1.01225948, "epoch": 0.9154366451225011, "flos": 28990900293120.0, "grad_norm": 1.4093561745696859, "language_loss": 0.71350908, "learning_rate": 7.44947151196238e-08, "loss": 0.73480415, "num_input_tokens_seen": 328490155, "step": 15226, "time_per_iteration": 2.658024787902832 }, { "auxiliary_loss_clip": 0.01038791, "auxiliary_loss_mlp": 0.01029628, "balance_loss_clip": 1.03364909, "balance_loss_mlp": 1.01687872, "epoch": 0.9154967683751691, "flos": 22309504490880.0, "grad_norm": 2.8076014846483166, "language_loss": 0.74480593, "learning_rate": 7.43894475344613e-08, "loss": 0.76549006, "num_input_tokens_seen": 328508275, "step": 15227, "time_per_iteration": 2.8204689025878906 }, { "auxiliary_loss_clip": 0.01084535, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.03527713, "balance_loss_mlp": 1.01924694, "epoch": 0.915556891627837, "flos": 24571481610240.0, "grad_norm": 1.973795210729136, "language_loss": 0.74037504, "learning_rate": 7.428425296864404e-08, "loss": 0.7615304, "num_input_tokens_seen": 328529425, "step": 15228, "time_per_iteration": 2.745267152786255 }, { "auxiliary_loss_clip": 0.0106924, "auxiliary_loss_mlp": 0.01029082, "balance_loss_clip": 1.03406215, "balance_loss_mlp": 1.01733994, "epoch": 0.9156170148805051, "flos": 22164676853760.0, "grad_norm": 1.4512437253894719, "language_loss": 0.71928173, "learning_rate": 7.417913142616106e-08, "loss": 0.74026489, "num_input_tokens_seen": 328550200, "step": 15229, "time_per_iteration": 2.8107035160064697 }, { "auxiliary_loss_clip": 0.01111837, "auxiliary_loss_mlp": 0.01035356, "balance_loss_clip": 1.03959012, "balance_loss_mlp": 1.0219506, "epoch": 0.915677138133173, "flos": 20920659171840.0, "grad_norm": 1.9803845760849772, "language_loss": 0.83079779, "learning_rate": 7.407408291099848e-08, "loss": 0.85226971, "num_input_tokens_seen": 328568540, "step": 15230, "time_per_iteration": 2.5778980255126953 }, { "auxiliary_loss_clip": 0.01068692, "auxiliary_loss_mlp": 0.0102912, "balance_loss_clip": 1.03630972, "balance_loss_mlp": 1.01733065, "epoch": 0.915737261385841, "flos": 24345136056960.0, "grad_norm": 1.5638994000303916, "language_loss": 0.83665484, "learning_rate": 7.396910742713957e-08, "loss": 0.85763288, "num_input_tokens_seen": 328587300, "step": 15231, "time_per_iteration": 2.757667303085327 }, { "auxiliary_loss_clip": 0.0109037, "auxiliary_loss_mlp": 0.0102554, "balance_loss_clip": 1.03120708, "balance_loss_mlp": 1.01339293, "epoch": 0.9157973846385089, "flos": 26761386090240.0, "grad_norm": 1.4862156687145838, "language_loss": 0.72474539, "learning_rate": 7.386420497856516e-08, "loss": 0.74590445, "num_input_tokens_seen": 328610055, "step": 15232, "time_per_iteration": 2.65309739112854 }, { "auxiliary_loss_clip": 0.01110021, "auxiliary_loss_mlp": 0.01035652, "balance_loss_clip": 1.03648698, "balance_loss_mlp": 1.02338552, "epoch": 0.9158575078911769, "flos": 18478733892480.0, "grad_norm": 2.186963867327178, "language_loss": 0.67672479, "learning_rate": 7.375937556925338e-08, "loss": 0.69818151, "num_input_tokens_seen": 328626815, "step": 15233, "time_per_iteration": 2.5290985107421875 }, { "auxiliary_loss_clip": 0.01084574, "auxiliary_loss_mlp": 0.01037951, "balance_loss_clip": 1.03832459, "balance_loss_mlp": 1.02474308, "epoch": 0.9159176311438448, "flos": 21798926616960.0, "grad_norm": 1.9371619126619564, "language_loss": 0.69512558, "learning_rate": 7.365461920317861e-08, "loss": 0.71635091, "num_input_tokens_seen": 328643995, "step": 15234, "time_per_iteration": 2.6468849182128906 }, { "auxiliary_loss_clip": 0.01086822, "auxiliary_loss_mlp": 0.01034886, "balance_loss_clip": 1.0372566, "balance_loss_mlp": 1.02233958, "epoch": 0.9159777543965129, "flos": 24783749032320.0, "grad_norm": 1.9164787678121122, "language_loss": 0.88101876, "learning_rate": 7.354993588431391e-08, "loss": 0.90223587, "num_input_tokens_seen": 328659565, "step": 15235, "time_per_iteration": 2.681330919265747 }, { "auxiliary_loss_clip": 0.0104198, "auxiliary_loss_mlp": 0.01037221, "balance_loss_clip": 1.03242683, "balance_loss_mlp": 1.0227077, "epoch": 0.9160378776491809, "flos": 26868758820480.0, "grad_norm": 1.7189420130737911, "language_loss": 0.77287024, "learning_rate": 7.344532561662853e-08, "loss": 0.79366231, "num_input_tokens_seen": 328679045, "step": 15236, "time_per_iteration": 2.7985198497772217 }, { "auxiliary_loss_clip": 0.00988696, "auxiliary_loss_mlp": 0.01006396, "balance_loss_clip": 1.01333547, "balance_loss_mlp": 1.00522804, "epoch": 0.9160980009018488, "flos": 70578222589440.0, "grad_norm": 0.6745147326692066, "language_loss": 0.62227875, "learning_rate": 7.334078840409019e-08, "loss": 0.64222974, "num_input_tokens_seen": 328744565, "step": 15237, "time_per_iteration": 3.2159206867218018 }, { "auxiliary_loss_clip": 0.0111032, "auxiliary_loss_mlp": 0.00770462, "balance_loss_clip": 1.03761566, "balance_loss_mlp": 1.00039566, "epoch": 0.9161581241545168, "flos": 16289332202880.0, "grad_norm": 2.2962314429529638, "language_loss": 0.75145757, "learning_rate": 7.323632425066151e-08, "loss": 0.77026534, "num_input_tokens_seen": 328762455, "step": 15238, "time_per_iteration": 2.5952906608581543 }, { "auxiliary_loss_clip": 0.01108796, "auxiliary_loss_mlp": 0.01025665, "balance_loss_clip": 1.03680956, "balance_loss_mlp": 1.01369047, "epoch": 0.9162182474071847, "flos": 18438154502400.0, "grad_norm": 2.6834766833849693, "language_loss": 0.7463975, "learning_rate": 7.313193316030464e-08, "loss": 0.76774204, "num_input_tokens_seen": 328780320, "step": 15239, "time_per_iteration": 2.5366570949554443 }, { "auxiliary_loss_clip": 0.01078699, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.03494883, "balance_loss_mlp": 1.02270627, "epoch": 0.9162783706598527, "flos": 19167248764800.0, "grad_norm": 3.1115685298181797, "language_loss": 0.63496542, "learning_rate": 7.302761513697819e-08, "loss": 0.65610296, "num_input_tokens_seen": 328797570, "step": 15240, "time_per_iteration": 2.654343366622925 }, { "auxiliary_loss_clip": 0.01084597, "auxiliary_loss_mlp": 0.00769911, "balance_loss_clip": 1.0354557, "balance_loss_mlp": 1.00024796, "epoch": 0.9163384939125206, "flos": 20412990299520.0, "grad_norm": 1.818210522630089, "language_loss": 0.7633701, "learning_rate": 7.292337018463746e-08, "loss": 0.78191519, "num_input_tokens_seen": 328814075, "step": 15241, "time_per_iteration": 2.681783676147461 }, { "auxiliary_loss_clip": 0.01103855, "auxiliary_loss_mlp": 0.01030564, "balance_loss_clip": 1.03727055, "balance_loss_mlp": 1.01654494, "epoch": 0.9163986171651887, "flos": 19645902426240.0, "grad_norm": 2.120916469746568, "language_loss": 0.67877054, "learning_rate": 7.281919830723549e-08, "loss": 0.70011473, "num_input_tokens_seen": 328831990, "step": 15242, "time_per_iteration": 2.695181131362915 }, { "auxiliary_loss_clip": 0.01095195, "auxiliary_loss_mlp": 0.01035014, "balance_loss_clip": 1.03303313, "balance_loss_mlp": 1.02215111, "epoch": 0.9164587404178566, "flos": 12823054865280.0, "grad_norm": 2.0974216325015944, "language_loss": 0.80733311, "learning_rate": 7.271509950872334e-08, "loss": 0.8286351, "num_input_tokens_seen": 328849105, "step": 15243, "time_per_iteration": 2.634120464324951 }, { "auxiliary_loss_clip": 0.01082905, "auxiliary_loss_mlp": 0.0103102, "balance_loss_clip": 1.03140903, "balance_loss_mlp": 1.01816344, "epoch": 0.9165188636705246, "flos": 22309396750080.0, "grad_norm": 1.8693748825507899, "language_loss": 0.82145083, "learning_rate": 7.261107379304721e-08, "loss": 0.84259009, "num_input_tokens_seen": 328866810, "step": 15244, "time_per_iteration": 4.170153617858887 }, { "auxiliary_loss_clip": 0.01113607, "auxiliary_loss_mlp": 0.01035382, "balance_loss_clip": 1.0378207, "balance_loss_mlp": 1.02204251, "epoch": 0.9165789869231925, "flos": 18223337214720.0, "grad_norm": 3.40014047465237, "language_loss": 0.71937442, "learning_rate": 7.250712116415214e-08, "loss": 0.74086428, "num_input_tokens_seen": 328885325, "step": 15245, "time_per_iteration": 4.17969822883606 }, { "auxiliary_loss_clip": 0.01083804, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.03430676, "balance_loss_mlp": 1.01741219, "epoch": 0.9166391101758605, "flos": 13691553811200.0, "grad_norm": 1.6435574883208541, "language_loss": 0.74527669, "learning_rate": 7.240324162598033e-08, "loss": 0.76640707, "num_input_tokens_seen": 328902655, "step": 15246, "time_per_iteration": 4.363448858261108 }, { "auxiliary_loss_clip": 0.01080629, "auxiliary_loss_mlp": 0.01034328, "balance_loss_clip": 1.03388071, "balance_loss_mlp": 1.02122653, "epoch": 0.9166992334285284, "flos": 17346793622400.0, "grad_norm": 1.9577914656696735, "language_loss": 0.75327551, "learning_rate": 7.229943518247106e-08, "loss": 0.77442503, "num_input_tokens_seen": 328918440, "step": 15247, "time_per_iteration": 2.664409637451172 }, { "auxiliary_loss_clip": 0.01101374, "auxiliary_loss_mlp": 0.01027057, "balance_loss_clip": 1.03908849, "balance_loss_mlp": 1.01476669, "epoch": 0.9167593566811965, "flos": 23731135948800.0, "grad_norm": 1.669323682742113, "language_loss": 0.76257682, "learning_rate": 7.219570183756052e-08, "loss": 0.7838611, "num_input_tokens_seen": 328938055, "step": 15248, "time_per_iteration": 4.128343820571899 }, { "auxiliary_loss_clip": 0.01097593, "auxiliary_loss_mlp": 0.01037777, "balance_loss_clip": 1.03494728, "balance_loss_mlp": 1.02446711, "epoch": 0.9168194799338644, "flos": 27818201064960.0, "grad_norm": 2.2661509072382424, "language_loss": 0.72809201, "learning_rate": 7.209204159518178e-08, "loss": 0.74944574, "num_input_tokens_seen": 328957895, "step": 15249, "time_per_iteration": 2.67682147026062 }, { "auxiliary_loss_clip": 0.01060539, "auxiliary_loss_mlp": 0.01029442, "balance_loss_clip": 1.03332496, "balance_loss_mlp": 1.01615024, "epoch": 0.9168796031865324, "flos": 21717552355200.0, "grad_norm": 3.1939184411772406, "language_loss": 0.75809246, "learning_rate": 7.198845445926616e-08, "loss": 0.7789923, "num_input_tokens_seen": 328971365, "step": 15250, "time_per_iteration": 2.738577365875244 }, { "auxiliary_loss_clip": 0.01066866, "auxiliary_loss_mlp": 0.01026181, "balance_loss_clip": 1.03519356, "balance_loss_mlp": 1.01423001, "epoch": 0.9169397264392004, "flos": 23404420817280.0, "grad_norm": 1.6784135757036345, "language_loss": 0.75771379, "learning_rate": 7.188494043374138e-08, "loss": 0.77864426, "num_input_tokens_seen": 328990830, "step": 15251, "time_per_iteration": 2.7864675521850586 }, { "auxiliary_loss_clip": 0.01084617, "auxiliary_loss_mlp": 0.01033127, "balance_loss_clip": 1.03682351, "balance_loss_mlp": 1.01889396, "epoch": 0.9169998496918683, "flos": 23950981140480.0, "grad_norm": 3.1809452254911896, "language_loss": 0.79785126, "learning_rate": 7.178149952253298e-08, "loss": 0.81902874, "num_input_tokens_seen": 329008345, "step": 15252, "time_per_iteration": 2.67496395111084 }, { "auxiliary_loss_clip": 0.01108344, "auxiliary_loss_mlp": 0.01034651, "balance_loss_clip": 1.03633821, "balance_loss_mlp": 1.02253342, "epoch": 0.9170599729445363, "flos": 18332469711360.0, "grad_norm": 1.6979211858236058, "language_loss": 0.77028179, "learning_rate": 7.167813172956316e-08, "loss": 0.79171169, "num_input_tokens_seen": 329027440, "step": 15253, "time_per_iteration": 2.5820562839508057 }, { "auxiliary_loss_clip": 0.01099567, "auxiliary_loss_mlp": 0.01026712, "balance_loss_clip": 1.03753924, "balance_loss_mlp": 1.0148387, "epoch": 0.9171200961972042, "flos": 22674859678080.0, "grad_norm": 1.9944636420338524, "language_loss": 0.73225999, "learning_rate": 7.157483705875256e-08, "loss": 0.75352275, "num_input_tokens_seen": 329046445, "step": 15254, "time_per_iteration": 2.66645884513855 }, { "auxiliary_loss_clip": 0.01069043, "auxiliary_loss_mlp": 0.01024866, "balance_loss_clip": 1.03459096, "balance_loss_mlp": 1.01324344, "epoch": 0.9171802194498723, "flos": 26719298328960.0, "grad_norm": 1.757918865833482, "language_loss": 0.79068267, "learning_rate": 7.14716155140167e-08, "loss": 0.81162179, "num_input_tokens_seen": 329065555, "step": 15255, "time_per_iteration": 2.791233539581299 }, { "auxiliary_loss_clip": 0.01099583, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.0360918, "balance_loss_mlp": 1.01973057, "epoch": 0.9172403427025402, "flos": 37889240538240.0, "grad_norm": 2.1163696122590228, "language_loss": 0.68610239, "learning_rate": 7.136846709927047e-08, "loss": 0.70742488, "num_input_tokens_seen": 329087515, "step": 15256, "time_per_iteration": 2.8768861293792725 }, { "auxiliary_loss_clip": 0.0109198, "auxiliary_loss_mlp": 0.01039298, "balance_loss_clip": 1.03456831, "balance_loss_mlp": 1.02614951, "epoch": 0.9173004659552082, "flos": 17055163100160.0, "grad_norm": 1.585817342288342, "language_loss": 0.83782554, "learning_rate": 7.126539181842561e-08, "loss": 0.85913831, "num_input_tokens_seen": 329106820, "step": 15257, "time_per_iteration": 2.65502667427063 }, { "auxiliary_loss_clip": 0.01082945, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.03255379, "balance_loss_mlp": 1.0220809, "epoch": 0.9173605892078761, "flos": 22201593056640.0, "grad_norm": 1.60833944396701, "language_loss": 0.7756505, "learning_rate": 7.116238967539012e-08, "loss": 0.79681796, "num_input_tokens_seen": 329126515, "step": 15258, "time_per_iteration": 2.6512203216552734 }, { "auxiliary_loss_clip": 0.01093895, "auxiliary_loss_mlp": 0.01030397, "balance_loss_clip": 1.03959584, "balance_loss_mlp": 1.01836896, "epoch": 0.9174207124605441, "flos": 16507776764160.0, "grad_norm": 2.0334925748000794, "language_loss": 0.78772163, "learning_rate": 7.105946067406999e-08, "loss": 0.80896461, "num_input_tokens_seen": 329142660, "step": 15259, "time_per_iteration": 2.5838210582733154 }, { "auxiliary_loss_clip": 0.01059246, "auxiliary_loss_mlp": 0.01035054, "balance_loss_clip": 1.03190184, "balance_loss_mlp": 1.02319837, "epoch": 0.917480835713212, "flos": 24535606901760.0, "grad_norm": 1.6551719766080486, "language_loss": 0.76302671, "learning_rate": 7.095660481836895e-08, "loss": 0.7839697, "num_input_tokens_seen": 329162575, "step": 15260, "time_per_iteration": 2.682069778442383 }, { "auxiliary_loss_clip": 0.01066153, "auxiliary_loss_mlp": 0.01028553, "balance_loss_clip": 1.03227329, "balance_loss_mlp": 1.0160774, "epoch": 0.9175409589658801, "flos": 20880726226560.0, "grad_norm": 1.5511805000911754, "language_loss": 0.61173445, "learning_rate": 7.085382211218637e-08, "loss": 0.63268149, "num_input_tokens_seen": 329182090, "step": 15261, "time_per_iteration": 2.681443929672241 }, { "auxiliary_loss_clip": 0.01080586, "auxiliary_loss_mlp": 0.01029547, "balance_loss_clip": 1.03192782, "balance_loss_mlp": 1.01745868, "epoch": 0.917601082218548, "flos": 14276035918080.0, "grad_norm": 1.8788230361145468, "language_loss": 0.73716688, "learning_rate": 7.075111255942002e-08, "loss": 0.75826824, "num_input_tokens_seen": 329196535, "step": 15262, "time_per_iteration": 2.6560230255126953 }, { "auxiliary_loss_clip": 0.01110257, "auxiliary_loss_mlp": 0.01038053, "balance_loss_clip": 1.03490496, "balance_loss_mlp": 1.0255841, "epoch": 0.917661205471216, "flos": 19099234362240.0, "grad_norm": 1.8199175016949676, "language_loss": 0.77784705, "learning_rate": 7.064847616396496e-08, "loss": 0.79933017, "num_input_tokens_seen": 329215135, "step": 15263, "time_per_iteration": 2.5552515983581543 }, { "auxiliary_loss_clip": 0.01110998, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.03634441, "balance_loss_mlp": 1.017097, "epoch": 0.917721328723884, "flos": 21106568989440.0, "grad_norm": 2.03433288811874, "language_loss": 0.75501031, "learning_rate": 7.054591292971324e-08, "loss": 0.776416, "num_input_tokens_seen": 329235150, "step": 15264, "time_per_iteration": 2.5273077487945557 }, { "auxiliary_loss_clip": 0.01085288, "auxiliary_loss_mlp": 0.0103495, "balance_loss_clip": 1.03628254, "balance_loss_mlp": 1.02340412, "epoch": 0.9177814519765519, "flos": 21943215550080.0, "grad_norm": 1.6751862272881284, "language_loss": 0.83633941, "learning_rate": 7.044342286055394e-08, "loss": 0.8575418, "num_input_tokens_seen": 329254365, "step": 15265, "time_per_iteration": 2.6066534519195557 }, { "auxiliary_loss_clip": 0.01114086, "auxiliary_loss_mlp": 0.01040959, "balance_loss_clip": 1.03847134, "balance_loss_mlp": 1.02778673, "epoch": 0.9178415752292199, "flos": 24205982768640.0, "grad_norm": 1.6645706894370145, "language_loss": 0.7328164, "learning_rate": 7.034100596037306e-08, "loss": 0.75436687, "num_input_tokens_seen": 329274385, "step": 15266, "time_per_iteration": 2.5833418369293213 }, { "auxiliary_loss_clip": 0.01108539, "auxiliary_loss_mlp": 0.01030083, "balance_loss_clip": 1.03649783, "balance_loss_mlp": 1.01844811, "epoch": 0.9179016984818879, "flos": 20042068504320.0, "grad_norm": 1.59558924604592, "language_loss": 0.77707624, "learning_rate": 7.023866223305486e-08, "loss": 0.79846251, "num_input_tokens_seen": 329292160, "step": 15267, "time_per_iteration": 2.551771879196167 }, { "auxiliary_loss_clip": 0.01017255, "auxiliary_loss_mlp": 0.00751276, "balance_loss_clip": 1.00686395, "balance_loss_mlp": 0.99959415, "epoch": 0.9179618217345559, "flos": 65555901100800.0, "grad_norm": 0.7374753112947235, "language_loss": 0.56223977, "learning_rate": 7.013639168247975e-08, "loss": 0.57992506, "num_input_tokens_seen": 329351870, "step": 15268, "time_per_iteration": 3.2256064414978027 }, { "auxiliary_loss_clip": 0.01110226, "auxiliary_loss_mlp": 0.00770103, "balance_loss_clip": 1.03661978, "balance_loss_mlp": 1.00023341, "epoch": 0.9180219449872238, "flos": 21324618501120.0, "grad_norm": 1.9824828423996201, "language_loss": 0.76052523, "learning_rate": 7.0034194312526e-08, "loss": 0.77932847, "num_input_tokens_seen": 329370930, "step": 15269, "time_per_iteration": 2.571711540222168 }, { "auxiliary_loss_clip": 0.01074295, "auxiliary_loss_mlp": 0.0103616, "balance_loss_clip": 1.03202271, "balance_loss_mlp": 1.02265382, "epoch": 0.9180820682398918, "flos": 41060008684800.0, "grad_norm": 1.7451151800168656, "language_loss": 0.72839332, "learning_rate": 6.993207012706936e-08, "loss": 0.74949783, "num_input_tokens_seen": 329391275, "step": 15270, "time_per_iteration": 2.877145290374756 }, { "auxiliary_loss_clip": 0.01105632, "auxiliary_loss_mlp": 0.01032058, "balance_loss_clip": 1.03500867, "balance_loss_mlp": 1.0196898, "epoch": 0.9181421914925597, "flos": 28072915384320.0, "grad_norm": 1.5262987533233972, "language_loss": 0.80171967, "learning_rate": 6.98300191299821e-08, "loss": 0.82309657, "num_input_tokens_seen": 329412775, "step": 15271, "time_per_iteration": 2.696314573287964 }, { "auxiliary_loss_clip": 0.0106623, "auxiliary_loss_mlp": 0.01035218, "balance_loss_clip": 1.03281534, "balance_loss_mlp": 1.02193236, "epoch": 0.9182023147452277, "flos": 29169411909120.0, "grad_norm": 2.0203873157492387, "language_loss": 0.72958052, "learning_rate": 6.972804132513355e-08, "loss": 0.75059497, "num_input_tokens_seen": 329432440, "step": 15272, "time_per_iteration": 2.7418758869171143 }, { "auxiliary_loss_clip": 0.01080541, "auxiliary_loss_mlp": 0.01034356, "balance_loss_clip": 1.03587949, "balance_loss_mlp": 1.0225302, "epoch": 0.9182624379978956, "flos": 24060831909120.0, "grad_norm": 1.8761263587608576, "language_loss": 0.72443533, "learning_rate": 6.962613671639105e-08, "loss": 0.74558425, "num_input_tokens_seen": 329450605, "step": 15273, "time_per_iteration": 2.5915794372558594 }, { "auxiliary_loss_clip": 0.01068999, "auxiliary_loss_mlp": 0.01026814, "balance_loss_clip": 1.033952, "balance_loss_mlp": 1.01544738, "epoch": 0.9183225612505637, "flos": 23293528554240.0, "grad_norm": 1.6815527096411953, "language_loss": 0.74486136, "learning_rate": 6.952430530761933e-08, "loss": 0.76581949, "num_input_tokens_seen": 329470550, "step": 15274, "time_per_iteration": 2.757570266723633 }, { "auxiliary_loss_clip": 0.01095676, "auxiliary_loss_mlp": 0.01038928, "balance_loss_clip": 1.03320456, "balance_loss_mlp": 1.02651846, "epoch": 0.9183826845032316, "flos": 19609237618560.0, "grad_norm": 1.4749833825049345, "language_loss": 0.68892634, "learning_rate": 6.942254710267902e-08, "loss": 0.71027237, "num_input_tokens_seen": 329489765, "step": 15275, "time_per_iteration": 2.5961973667144775 }, { "auxiliary_loss_clip": 0.01094254, "auxiliary_loss_mlp": 0.0103149, "balance_loss_clip": 1.03530109, "balance_loss_mlp": 1.01921117, "epoch": 0.9184428077558996, "flos": 18479057114880.0, "grad_norm": 1.9188925482656494, "language_loss": 0.72735369, "learning_rate": 6.932086210542953e-08, "loss": 0.74861109, "num_input_tokens_seen": 329507040, "step": 15276, "time_per_iteration": 2.557286024093628 }, { "auxiliary_loss_clip": 0.01086791, "auxiliary_loss_mlp": 0.01032139, "balance_loss_clip": 1.03626883, "balance_loss_mlp": 1.02049232, "epoch": 0.9185029310085676, "flos": 20741034234240.0, "grad_norm": 1.5932066455164264, "language_loss": 0.73415935, "learning_rate": 6.921925031972642e-08, "loss": 0.75534868, "num_input_tokens_seen": 329525540, "step": 15277, "time_per_iteration": 2.6720054149627686 }, { "auxiliary_loss_clip": 0.01000655, "auxiliary_loss_mlp": 0.00999523, "balance_loss_clip": 1.00764501, "balance_loss_mlp": 0.99853915, "epoch": 0.9185630542612355, "flos": 68209231875840.0, "grad_norm": 0.7136545127762665, "language_loss": 0.59176219, "learning_rate": 6.91177117494226e-08, "loss": 0.61176395, "num_input_tokens_seen": 329592905, "step": 15278, "time_per_iteration": 3.3310906887054443 }, { "auxiliary_loss_clip": 0.01068097, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.03167319, "balance_loss_mlp": 1.01953995, "epoch": 0.9186231775139035, "flos": 12239470598400.0, "grad_norm": 1.6780534662903475, "language_loss": 0.63930976, "learning_rate": 6.901624639836879e-08, "loss": 0.66029525, "num_input_tokens_seen": 329610150, "step": 15279, "time_per_iteration": 2.6621286869049072 }, { "auxiliary_loss_clip": 0.0102767, "auxiliary_loss_mlp": 0.00751159, "balance_loss_clip": 1.0052371, "balance_loss_mlp": 0.99961108, "epoch": 0.9186833007665715, "flos": 63939237770880.0, "grad_norm": 0.8547221489704414, "language_loss": 0.60236037, "learning_rate": 6.891485427041211e-08, "loss": 0.62014866, "num_input_tokens_seen": 329673650, "step": 15280, "time_per_iteration": 3.122877836227417 }, { "auxiliary_loss_clip": 0.01090206, "auxiliary_loss_mlp": 0.01032261, "balance_loss_clip": 1.03693056, "balance_loss_mlp": 1.01968455, "epoch": 0.9187434240192395, "flos": 19974700546560.0, "grad_norm": 2.122988890708145, "language_loss": 0.69674432, "learning_rate": 6.881353536939815e-08, "loss": 0.717969, "num_input_tokens_seen": 329692520, "step": 15281, "time_per_iteration": 2.6311352252960205 }, { "auxiliary_loss_clip": 0.01086175, "auxiliary_loss_mlp": 0.01029211, "balance_loss_clip": 1.03539133, "balance_loss_mlp": 1.01567471, "epoch": 0.9188035472719074, "flos": 25227820874880.0, "grad_norm": 1.7776435853136854, "language_loss": 0.84506124, "learning_rate": 6.871228969916831e-08, "loss": 0.86621511, "num_input_tokens_seen": 329713750, "step": 15282, "time_per_iteration": 2.6757116317749023 }, { "auxiliary_loss_clip": 0.01082882, "auxiliary_loss_mlp": 0.01031746, "balance_loss_clip": 1.03398228, "balance_loss_mlp": 1.01928234, "epoch": 0.9188636705245754, "flos": 18405547931520.0, "grad_norm": 1.9199915461067039, "language_loss": 0.60464978, "learning_rate": 6.861111726356194e-08, "loss": 0.62579608, "num_input_tokens_seen": 329730960, "step": 15283, "time_per_iteration": 2.666703224182129 }, { "auxiliary_loss_clip": 0.01100933, "auxiliary_loss_mlp": 0.00770887, "balance_loss_clip": 1.03721941, "balance_loss_mlp": 1.00024927, "epoch": 0.9189237937772433, "flos": 23769129559680.0, "grad_norm": 1.5468834987808995, "language_loss": 0.65656137, "learning_rate": 6.851001806641554e-08, "loss": 0.67527962, "num_input_tokens_seen": 329750975, "step": 15284, "time_per_iteration": 5.761394023895264 }, { "auxiliary_loss_clip": 0.01106112, "auxiliary_loss_mlp": 0.01032597, "balance_loss_clip": 1.03494716, "balance_loss_mlp": 1.02003229, "epoch": 0.9189839170299113, "flos": 21214624078080.0, "grad_norm": 2.0145798278900164, "language_loss": 0.73759109, "learning_rate": 6.840899211156292e-08, "loss": 0.75897819, "num_input_tokens_seen": 329769645, "step": 15285, "time_per_iteration": 2.5861620903015137 }, { "auxiliary_loss_clip": 0.0110641, "auxiliary_loss_mlp": 0.01035061, "balance_loss_clip": 1.03556252, "balance_loss_mlp": 1.02236581, "epoch": 0.9190440402825792, "flos": 16727370560640.0, "grad_norm": 1.8392842036391315, "language_loss": 0.71751177, "learning_rate": 6.830803940283458e-08, "loss": 0.73892653, "num_input_tokens_seen": 329788185, "step": 15286, "time_per_iteration": 4.326793193817139 }, { "auxiliary_loss_clip": 0.01109819, "auxiliary_loss_mlp": 0.01033357, "balance_loss_clip": 1.03742743, "balance_loss_mlp": 1.02026165, "epoch": 0.9191041635352473, "flos": 23441193365760.0, "grad_norm": 1.8870763932590424, "language_loss": 0.73988366, "learning_rate": 6.820715994405945e-08, "loss": 0.76131546, "num_input_tokens_seen": 329806780, "step": 15287, "time_per_iteration": 2.582787275314331 }, { "auxiliary_loss_clip": 0.01110857, "auxiliary_loss_mlp": 0.01029796, "balance_loss_clip": 1.03881836, "balance_loss_mlp": 1.01651573, "epoch": 0.9191642867879152, "flos": 18807532012800.0, "grad_norm": 2.0747808934421883, "language_loss": 0.65521705, "learning_rate": 6.810635373906226e-08, "loss": 0.67662358, "num_input_tokens_seen": 329826350, "step": 15288, "time_per_iteration": 4.104849338531494 }, { "auxiliary_loss_clip": 0.01112827, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.04050171, "balance_loss_mlp": 1.02340722, "epoch": 0.9192244100405832, "flos": 32160950167680.0, "grad_norm": 1.8366679912878503, "language_loss": 0.71489662, "learning_rate": 6.800562079166549e-08, "loss": 0.73637891, "num_input_tokens_seen": 329846160, "step": 15289, "time_per_iteration": 2.628432273864746 }, { "auxiliary_loss_clip": 0.01067852, "auxiliary_loss_mlp": 0.01037164, "balance_loss_clip": 1.03277981, "balance_loss_mlp": 1.02398539, "epoch": 0.9192845332932512, "flos": 16357669827840.0, "grad_norm": 2.022083421674923, "language_loss": 0.7447117, "learning_rate": 6.790496110568921e-08, "loss": 0.76576185, "num_input_tokens_seen": 329862020, "step": 15290, "time_per_iteration": 2.6732118129730225 }, { "auxiliary_loss_clip": 0.01067483, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.03620386, "balance_loss_mlp": 1.01607156, "epoch": 0.9193446565459191, "flos": 26614475464320.0, "grad_norm": 1.914747567902696, "language_loss": 0.72083873, "learning_rate": 6.78043746849506e-08, "loss": 0.74179053, "num_input_tokens_seen": 329880185, "step": 15291, "time_per_iteration": 2.72456431388855 }, { "auxiliary_loss_clip": 0.01083225, "auxiliary_loss_mlp": 0.01026967, "balance_loss_clip": 1.03504729, "balance_loss_mlp": 1.0149684, "epoch": 0.9194047797985871, "flos": 22492182084480.0, "grad_norm": 1.6500392247637397, "language_loss": 0.71124983, "learning_rate": 6.770386153326346e-08, "loss": 0.73235166, "num_input_tokens_seen": 329900255, "step": 15292, "time_per_iteration": 2.6152868270874023 }, { "auxiliary_loss_clip": 0.01087602, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.03518999, "balance_loss_mlp": 1.01654267, "epoch": 0.9194649030512551, "flos": 25078791346560.0, "grad_norm": 2.1012892949543454, "language_loss": 0.72765195, "learning_rate": 6.760342165443988e-08, "loss": 0.74882174, "num_input_tokens_seen": 329919095, "step": 15293, "time_per_iteration": 2.7014577388763428 }, { "auxiliary_loss_clip": 0.01106702, "auxiliary_loss_mlp": 0.01026876, "balance_loss_clip": 1.03621578, "balance_loss_mlp": 1.01458549, "epoch": 0.9195250263039231, "flos": 11911139354880.0, "grad_norm": 1.8656934281191482, "language_loss": 0.78315026, "learning_rate": 6.750305505228837e-08, "loss": 0.80448604, "num_input_tokens_seen": 329936505, "step": 15294, "time_per_iteration": 2.547825813293457 }, { "auxiliary_loss_clip": 0.01088089, "auxiliary_loss_mlp": 0.01036683, "balance_loss_clip": 1.0347265, "balance_loss_mlp": 1.02261615, "epoch": 0.919585149556591, "flos": 21834154880640.0, "grad_norm": 1.8102816220705245, "language_loss": 0.77170849, "learning_rate": 6.74027617306141e-08, "loss": 0.79295617, "num_input_tokens_seen": 329956795, "step": 15295, "time_per_iteration": 2.7039098739624023 }, { "auxiliary_loss_clip": 0.01106989, "auxiliary_loss_mlp": 0.01029958, "balance_loss_clip": 1.03723979, "balance_loss_mlp": 1.01890755, "epoch": 0.919645272809259, "flos": 28184059042560.0, "grad_norm": 2.3118295307682066, "language_loss": 0.7140969, "learning_rate": 6.730254169322114e-08, "loss": 0.73546642, "num_input_tokens_seen": 329977195, "step": 15296, "time_per_iteration": 2.6299383640289307 }, { "auxiliary_loss_clip": 0.01109705, "auxiliary_loss_mlp": 0.01039854, "balance_loss_clip": 1.03783691, "balance_loss_mlp": 1.02766538, "epoch": 0.9197053960619269, "flos": 18332828847360.0, "grad_norm": 2.0214476637003567, "language_loss": 0.75176775, "learning_rate": 6.720239494390912e-08, "loss": 0.77326334, "num_input_tokens_seen": 329992095, "step": 15297, "time_per_iteration": 2.5208096504211426 }, { "auxiliary_loss_clip": 0.01093577, "auxiliary_loss_mlp": 0.00770462, "balance_loss_clip": 1.03651249, "balance_loss_mlp": 1.00015736, "epoch": 0.9197655193145949, "flos": 28183448511360.0, "grad_norm": 1.600708869843347, "language_loss": 0.73453987, "learning_rate": 6.710232148647676e-08, "loss": 0.75318027, "num_input_tokens_seen": 330011490, "step": 15298, "time_per_iteration": 2.5899410247802734 }, { "auxiliary_loss_clip": 0.01084548, "auxiliary_loss_mlp": 0.01035002, "balance_loss_clip": 1.03919554, "balance_loss_mlp": 1.02254462, "epoch": 0.9198256425672628, "flos": 17306321973120.0, "grad_norm": 1.9381032663604973, "language_loss": 0.79355192, "learning_rate": 6.70023213247175e-08, "loss": 0.81474739, "num_input_tokens_seen": 330027885, "step": 15299, "time_per_iteration": 2.618654251098633 }, { "auxiliary_loss_clip": 0.01078356, "auxiliary_loss_mlp": 0.01023938, "balance_loss_clip": 1.03582788, "balance_loss_mlp": 1.01230943, "epoch": 0.9198857658199309, "flos": 17858520731520.0, "grad_norm": 2.334922484548837, "language_loss": 0.63701689, "learning_rate": 6.690239446242385e-08, "loss": 0.65803981, "num_input_tokens_seen": 330046230, "step": 15300, "time_per_iteration": 2.6653809547424316 }, { "auxiliary_loss_clip": 0.01079487, "auxiliary_loss_mlp": 0.00768011, "balance_loss_clip": 1.03474522, "balance_loss_mlp": 1.00012684, "epoch": 0.9199458890725988, "flos": 22127545169280.0, "grad_norm": 1.7470881044851607, "language_loss": 0.69722879, "learning_rate": 6.680254090338545e-08, "loss": 0.71570385, "num_input_tokens_seen": 330065535, "step": 15301, "time_per_iteration": 2.6812119483947754 }, { "auxiliary_loss_clip": 0.01096515, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.0358305, "balance_loss_mlp": 1.02490699, "epoch": 0.9200060123252668, "flos": 16034043265920.0, "grad_norm": 1.711835493107777, "language_loss": 0.71127915, "learning_rate": 6.670276065138814e-08, "loss": 0.73264498, "num_input_tokens_seen": 330082920, "step": 15302, "time_per_iteration": 2.5945441722869873 }, { "auxiliary_loss_clip": 0.01110029, "auxiliary_loss_mlp": 0.01030513, "balance_loss_clip": 1.03716493, "balance_loss_mlp": 1.0183115, "epoch": 0.9200661355779348, "flos": 26864521015680.0, "grad_norm": 2.8681928190187556, "language_loss": 0.76527154, "learning_rate": 6.660305371021579e-08, "loss": 0.78667694, "num_input_tokens_seen": 330101165, "step": 15303, "time_per_iteration": 2.641113519668579 }, { "auxiliary_loss_clip": 0.01088214, "auxiliary_loss_mlp": 0.01031045, "balance_loss_clip": 1.03663945, "balance_loss_mlp": 1.01886749, "epoch": 0.9201262588306027, "flos": 12786749193600.0, "grad_norm": 3.150146783563773, "language_loss": 0.88236862, "learning_rate": 6.650342008365006e-08, "loss": 0.90356123, "num_input_tokens_seen": 330118775, "step": 15304, "time_per_iteration": 2.6956560611724854 }, { "auxiliary_loss_clip": 0.0104635, "auxiliary_loss_mlp": 0.01037753, "balance_loss_clip": 1.03275561, "balance_loss_mlp": 1.02204168, "epoch": 0.9201863820832707, "flos": 20631614428800.0, "grad_norm": 2.036135949691738, "language_loss": 0.77178156, "learning_rate": 6.64038597754677e-08, "loss": 0.79262257, "num_input_tokens_seen": 330135570, "step": 15305, "time_per_iteration": 2.817863941192627 }, { "auxiliary_loss_clip": 0.01091635, "auxiliary_loss_mlp": 0.01036183, "balance_loss_clip": 1.03597045, "balance_loss_mlp": 1.02348161, "epoch": 0.9202465053359387, "flos": 26395815421440.0, "grad_norm": 6.064835951868583, "language_loss": 0.8149547, "learning_rate": 6.630437278944501e-08, "loss": 0.8362329, "num_input_tokens_seen": 330152840, "step": 15306, "time_per_iteration": 2.6748034954071045 }, { "auxiliary_loss_clip": 0.01067915, "auxiliary_loss_mlp": 0.01030425, "balance_loss_clip": 1.03378415, "balance_loss_mlp": 1.01910639, "epoch": 0.9203066285886067, "flos": 10488179093760.0, "grad_norm": 1.9090343566843708, "language_loss": 0.72313774, "learning_rate": 6.62049591293541e-08, "loss": 0.74412113, "num_input_tokens_seen": 330168605, "step": 15307, "time_per_iteration": 2.707096815109253 }, { "auxiliary_loss_clip": 0.01100301, "auxiliary_loss_mlp": 0.01030096, "balance_loss_clip": 1.03705478, "balance_loss_mlp": 1.01726282, "epoch": 0.9203667518412746, "flos": 19390721230080.0, "grad_norm": 2.092849830568689, "language_loss": 0.78399515, "learning_rate": 6.610561879896526e-08, "loss": 0.80529916, "num_input_tokens_seen": 330186160, "step": 15308, "time_per_iteration": 2.606255531311035 }, { "auxiliary_loss_clip": 0.01084659, "auxiliary_loss_mlp": 0.01032923, "balance_loss_clip": 1.03293347, "balance_loss_mlp": 1.01967311, "epoch": 0.9204268750939426, "flos": 15924982596480.0, "grad_norm": 2.276895603959481, "language_loss": 0.77885747, "learning_rate": 6.600635180204484e-08, "loss": 0.80003333, "num_input_tokens_seen": 330201780, "step": 15309, "time_per_iteration": 2.637420654296875 }, { "auxiliary_loss_clip": 0.01054204, "auxiliary_loss_mlp": 0.01030272, "balance_loss_clip": 1.03081393, "balance_loss_mlp": 1.01686096, "epoch": 0.9204869983466105, "flos": 16471758401280.0, "grad_norm": 1.8296999045819267, "language_loss": 0.66413641, "learning_rate": 6.590715814235781e-08, "loss": 0.68498123, "num_input_tokens_seen": 330219165, "step": 15310, "time_per_iteration": 2.7335994243621826 }, { "auxiliary_loss_clip": 0.01044089, "auxiliary_loss_mlp": 0.01032237, "balance_loss_clip": 1.03122044, "balance_loss_mlp": 1.01953483, "epoch": 0.9205471215992785, "flos": 21539220307200.0, "grad_norm": 1.6521926444868564, "language_loss": 0.66375726, "learning_rate": 6.580803782366495e-08, "loss": 0.6845206, "num_input_tokens_seen": 330238975, "step": 15311, "time_per_iteration": 2.8604588508605957 }, { "auxiliary_loss_clip": 0.01097174, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.03502798, "balance_loss_mlp": 1.02240396, "epoch": 0.9206072448519464, "flos": 25005892694400.0, "grad_norm": 1.6158209988301302, "language_loss": 0.7622931, "learning_rate": 6.570899084972503e-08, "loss": 0.78361315, "num_input_tokens_seen": 330259755, "step": 15312, "time_per_iteration": 2.664778232574463 }, { "auxiliary_loss_clip": 0.01095599, "auxiliary_loss_mlp": 0.01038065, "balance_loss_clip": 1.03726745, "balance_loss_mlp": 1.02628684, "epoch": 0.9206673681046145, "flos": 20522661500160.0, "grad_norm": 1.6943388606397072, "language_loss": 0.79487884, "learning_rate": 6.561001722429394e-08, "loss": 0.81621552, "num_input_tokens_seen": 330277660, "step": 15313, "time_per_iteration": 2.5808446407318115 }, { "auxiliary_loss_clip": 0.01100191, "auxiliary_loss_mlp": 0.01030615, "balance_loss_clip": 1.03598011, "balance_loss_mlp": 1.01823509, "epoch": 0.9207274913572824, "flos": 20883455660160.0, "grad_norm": 2.6484489133321976, "language_loss": 0.78395313, "learning_rate": 6.55111169511251e-08, "loss": 0.80526119, "num_input_tokens_seen": 330295455, "step": 15314, "time_per_iteration": 2.6530680656433105 }, { "auxiliary_loss_clip": 0.01093159, "auxiliary_loss_mlp": 0.01034883, "balance_loss_clip": 1.0372566, "balance_loss_mlp": 1.02071548, "epoch": 0.9207876146099504, "flos": 22708256348160.0, "grad_norm": 1.9276699965768014, "language_loss": 0.79122138, "learning_rate": 6.541229003396864e-08, "loss": 0.81250179, "num_input_tokens_seen": 330315310, "step": 15315, "time_per_iteration": 2.6690027713775635 }, { "auxiliary_loss_clip": 0.01089675, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.03612041, "balance_loss_mlp": 1.01993239, "epoch": 0.9208477378626184, "flos": 18507354053760.0, "grad_norm": 1.761446107604308, "language_loss": 0.75961876, "learning_rate": 6.531353647657156e-08, "loss": 0.78084099, "num_input_tokens_seen": 330333260, "step": 15316, "time_per_iteration": 2.5912938117980957 }, { "auxiliary_loss_clip": 0.01108895, "auxiliary_loss_mlp": 0.01034715, "balance_loss_clip": 1.03550375, "balance_loss_mlp": 1.02175713, "epoch": 0.9209078611152863, "flos": 22999635475200.0, "grad_norm": 1.6108706295980322, "language_loss": 0.69277954, "learning_rate": 6.521485628267931e-08, "loss": 0.71421564, "num_input_tokens_seen": 330352465, "step": 15317, "time_per_iteration": 2.5787100791931152 }, { "auxiliary_loss_clip": 0.01098793, "auxiliary_loss_mlp": 0.01031182, "balance_loss_clip": 1.03747189, "balance_loss_mlp": 1.01845622, "epoch": 0.9209679843679544, "flos": 24061514267520.0, "grad_norm": 1.6422186031600345, "language_loss": 0.8337481, "learning_rate": 6.511624945603378e-08, "loss": 0.85504782, "num_input_tokens_seen": 330372685, "step": 15318, "time_per_iteration": 2.655625820159912 }, { "auxiliary_loss_clip": 0.01087423, "auxiliary_loss_mlp": 0.01030908, "balance_loss_clip": 1.03772366, "balance_loss_mlp": 1.01855183, "epoch": 0.9210281076206223, "flos": 13553370190080.0, "grad_norm": 1.8520706427370603, "language_loss": 0.85584986, "learning_rate": 6.501771600037354e-08, "loss": 0.87703317, "num_input_tokens_seen": 330388860, "step": 15319, "time_per_iteration": 2.62506103515625 }, { "auxiliary_loss_clip": 0.01027307, "auxiliary_loss_mlp": 0.01001328, "balance_loss_clip": 1.00478411, "balance_loss_mlp": 1.0003742, "epoch": 0.9210882308732903, "flos": 71426289674880.0, "grad_norm": 0.7696536988394306, "language_loss": 0.56245381, "learning_rate": 6.491925591943559e-08, "loss": 0.58274013, "num_input_tokens_seen": 330448735, "step": 15320, "time_per_iteration": 3.1641623973846436 }, { "auxiliary_loss_clip": 0.01060714, "auxiliary_loss_mlp": 0.01048122, "balance_loss_clip": 1.03641021, "balance_loss_mlp": 1.03252339, "epoch": 0.9211483541259582, "flos": 18509113820160.0, "grad_norm": 3.2582738862572156, "language_loss": 0.63959485, "learning_rate": 6.482086921695384e-08, "loss": 0.66068316, "num_input_tokens_seen": 330465600, "step": 15321, "time_per_iteration": 2.677826404571533 }, { "auxiliary_loss_clip": 0.01068249, "auxiliary_loss_mlp": 0.01028212, "balance_loss_clip": 1.03475666, "balance_loss_mlp": 1.01626706, "epoch": 0.9212084773786262, "flos": 23258228463360.0, "grad_norm": 1.6685688646331795, "language_loss": 0.71651804, "learning_rate": 6.47225558966582e-08, "loss": 0.73748261, "num_input_tokens_seen": 330485770, "step": 15322, "time_per_iteration": 2.740342855453491 }, { "auxiliary_loss_clip": 0.01058964, "auxiliary_loss_mlp": 0.01032827, "balance_loss_clip": 1.03519404, "balance_loss_mlp": 1.02108479, "epoch": 0.9212686006312941, "flos": 16289511770880.0, "grad_norm": 1.8646866235028916, "language_loss": 0.69607079, "learning_rate": 6.462431596227725e-08, "loss": 0.71698868, "num_input_tokens_seen": 330504255, "step": 15323, "time_per_iteration": 4.281275987625122 }, { "auxiliary_loss_clip": 0.010823, "auxiliary_loss_mlp": 0.01039009, "balance_loss_clip": 1.03287673, "balance_loss_mlp": 1.02479923, "epoch": 0.9213287238839621, "flos": 19785773986560.0, "grad_norm": 1.8454161764247499, "language_loss": 0.74490941, "learning_rate": 6.452614941753597e-08, "loss": 0.76612252, "num_input_tokens_seen": 330520705, "step": 15324, "time_per_iteration": 4.212737798690796 }, { "auxiliary_loss_clip": 0.01099326, "auxiliary_loss_mlp": 0.01041809, "balance_loss_clip": 1.0375061, "balance_loss_mlp": 1.02970934, "epoch": 0.92138884713663, "flos": 21030402199680.0, "grad_norm": 1.8170423555389452, "language_loss": 0.71340334, "learning_rate": 6.442805626615744e-08, "loss": 0.73481476, "num_input_tokens_seen": 330539245, "step": 15325, "time_per_iteration": 4.3058435916900635 }, { "auxiliary_loss_clip": 0.01081418, "auxiliary_loss_mlp": 0.01031747, "balance_loss_clip": 1.03530788, "balance_loss_mlp": 1.0195992, "epoch": 0.9214489703892981, "flos": 28587264186240.0, "grad_norm": 1.763186738417038, "language_loss": 0.78558946, "learning_rate": 6.433003651186109e-08, "loss": 0.80672109, "num_input_tokens_seen": 330561815, "step": 15326, "time_per_iteration": 2.703559160232544 }, { "auxiliary_loss_clip": 0.01101844, "auxiliary_loss_mlp": 0.01033367, "balance_loss_clip": 1.03805542, "balance_loss_mlp": 1.02046824, "epoch": 0.921509093641966, "flos": 16361476669440.0, "grad_norm": 2.8495287751902754, "language_loss": 0.71737856, "learning_rate": 6.42320901583635e-08, "loss": 0.73873067, "num_input_tokens_seen": 330579760, "step": 15327, "time_per_iteration": 4.265162706375122 }, { "auxiliary_loss_clip": 0.01101192, "auxiliary_loss_mlp": 0.0104188, "balance_loss_clip": 1.03807735, "balance_loss_mlp": 1.02861834, "epoch": 0.921569216894634, "flos": 26830837036800.0, "grad_norm": 1.806043843779226, "language_loss": 0.77786517, "learning_rate": 6.413421720937906e-08, "loss": 0.79929584, "num_input_tokens_seen": 330598545, "step": 15328, "time_per_iteration": 2.7142398357391357 }, { "auxiliary_loss_clip": 0.01088664, "auxiliary_loss_mlp": 0.01032833, "balance_loss_clip": 1.03698349, "balance_loss_mlp": 1.02065539, "epoch": 0.921629340147302, "flos": 24645134448000.0, "grad_norm": 3.3382204523213455, "language_loss": 0.71625078, "learning_rate": 6.4036417668619e-08, "loss": 0.73746574, "num_input_tokens_seen": 330616700, "step": 15329, "time_per_iteration": 2.8545138835906982 }, { "auxiliary_loss_clip": 0.01095503, "auxiliary_loss_mlp": 0.01028111, "balance_loss_clip": 1.03497839, "balance_loss_mlp": 1.01688147, "epoch": 0.9216894633999699, "flos": 15086504442240.0, "grad_norm": 2.2459771654219067, "language_loss": 0.86542726, "learning_rate": 6.393869153979192e-08, "loss": 0.88666344, "num_input_tokens_seen": 330633355, "step": 15330, "time_per_iteration": 2.5924322605133057 }, { "auxiliary_loss_clip": 0.01074582, "auxiliary_loss_mlp": 0.01031838, "balance_loss_clip": 1.03277349, "balance_loss_mlp": 1.0190115, "epoch": 0.921749586652638, "flos": 19204524103680.0, "grad_norm": 2.041503026001501, "language_loss": 0.75815696, "learning_rate": 6.384103882660397e-08, "loss": 0.77922112, "num_input_tokens_seen": 330651470, "step": 15331, "time_per_iteration": 2.6607861518859863 }, { "auxiliary_loss_clip": 0.01096924, "auxiliary_loss_mlp": 0.01029082, "balance_loss_clip": 1.03500032, "balance_loss_mlp": 1.01668429, "epoch": 0.9218097099053059, "flos": 20522446018560.0, "grad_norm": 1.901322595674086, "language_loss": 0.75386262, "learning_rate": 6.374345953275794e-08, "loss": 0.7751227, "num_input_tokens_seen": 330669170, "step": 15332, "time_per_iteration": 2.682168483734131 }, { "auxiliary_loss_clip": 0.01055682, "auxiliary_loss_mlp": 0.01030851, "balance_loss_clip": 1.03246427, "balance_loss_mlp": 1.0191865, "epoch": 0.9218698331579739, "flos": 17348625216000.0, "grad_norm": 1.7775108010679808, "language_loss": 0.74603796, "learning_rate": 6.364595366195358e-08, "loss": 0.76690328, "num_input_tokens_seen": 330686635, "step": 15333, "time_per_iteration": 2.7291512489318848 }, { "auxiliary_loss_clip": 0.01017268, "auxiliary_loss_mlp": 0.01001133, "balance_loss_clip": 1.00694776, "balance_loss_mlp": 1.00006628, "epoch": 0.9219299564106418, "flos": 61958332575360.0, "grad_norm": 0.8092949717587729, "language_loss": 0.52865499, "learning_rate": 6.354852121788879e-08, "loss": 0.54883903, "num_input_tokens_seen": 330749160, "step": 15334, "time_per_iteration": 3.11421275138855 }, { "auxiliary_loss_clip": 0.01080248, "auxiliary_loss_mlp": 0.01032803, "balance_loss_clip": 1.03543484, "balance_loss_mlp": 1.02087057, "epoch": 0.9219900796633098, "flos": 15701761526400.0, "grad_norm": 1.9555030178553923, "language_loss": 0.62425917, "learning_rate": 6.345116220425839e-08, "loss": 0.64538974, "num_input_tokens_seen": 330766840, "step": 15335, "time_per_iteration": 2.64497971534729 }, { "auxiliary_loss_clip": 0.01055617, "auxiliary_loss_mlp": 0.0103028, "balance_loss_clip": 1.03126609, "balance_loss_mlp": 1.01756644, "epoch": 0.9220502029159777, "flos": 24932670819840.0, "grad_norm": 1.6552756447627857, "language_loss": 0.71621144, "learning_rate": 6.335387662475366e-08, "loss": 0.73707038, "num_input_tokens_seen": 330785585, "step": 15336, "time_per_iteration": 2.7646801471710205 }, { "auxiliary_loss_clip": 0.01083887, "auxiliary_loss_mlp": 0.01032421, "balance_loss_clip": 1.03509367, "balance_loss_mlp": 1.02121532, "epoch": 0.9221103261686457, "flos": 15667215621120.0, "grad_norm": 1.8250492219467316, "language_loss": 0.71701425, "learning_rate": 6.325666448306433e-08, "loss": 0.7381773, "num_input_tokens_seen": 330800750, "step": 15337, "time_per_iteration": 2.6583242416381836 }, { "auxiliary_loss_clip": 0.01020516, "auxiliary_loss_mlp": 0.01000329, "balance_loss_clip": 1.00723362, "balance_loss_mlp": 0.99938756, "epoch": 0.9221704494213137, "flos": 67516299630720.0, "grad_norm": 0.8846440580369678, "language_loss": 0.65341711, "learning_rate": 6.31595257828763e-08, "loss": 0.67362559, "num_input_tokens_seen": 330863640, "step": 15338, "time_per_iteration": 3.1719980239868164 }, { "auxiliary_loss_clip": 0.01101462, "auxiliary_loss_mlp": 0.01033959, "balance_loss_clip": 1.0384047, "balance_loss_mlp": 1.02131093, "epoch": 0.9222305726739817, "flos": 30226945155840.0, "grad_norm": 1.9711725803511775, "language_loss": 0.66986012, "learning_rate": 6.306246052787289e-08, "loss": 0.69121432, "num_input_tokens_seen": 330884675, "step": 15339, "time_per_iteration": 2.7261481285095215 }, { "auxiliary_loss_clip": 0.01109081, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.03689742, "balance_loss_mlp": 1.01729918, "epoch": 0.9222906959266496, "flos": 25337204766720.0, "grad_norm": 2.2637051134502015, "language_loss": 0.71722078, "learning_rate": 6.296546872173513e-08, "loss": 0.73860735, "num_input_tokens_seen": 330904125, "step": 15340, "time_per_iteration": 2.571516275405884 }, { "auxiliary_loss_clip": 0.01074794, "auxiliary_loss_mlp": 0.01031494, "balance_loss_clip": 1.03479934, "balance_loss_mlp": 1.01920938, "epoch": 0.9223508191793176, "flos": 27599864244480.0, "grad_norm": 1.6254811741615818, "language_loss": 0.70379698, "learning_rate": 6.286855036814098e-08, "loss": 0.72485995, "num_input_tokens_seen": 330925140, "step": 15341, "time_per_iteration": 2.8622758388519287 }, { "auxiliary_loss_clip": 0.01056229, "auxiliary_loss_mlp": 0.01028103, "balance_loss_clip": 1.03552556, "balance_loss_mlp": 1.01709414, "epoch": 0.9224109424319856, "flos": 27307587277440.0, "grad_norm": 1.6316629656961243, "language_loss": 0.67473853, "learning_rate": 6.277170547076571e-08, "loss": 0.69558185, "num_input_tokens_seen": 330946625, "step": 15342, "time_per_iteration": 2.9130048751831055 }, { "auxiliary_loss_clip": 0.01059826, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.03590834, "balance_loss_mlp": 1.01951027, "epoch": 0.9224710656846535, "flos": 48208314401280.0, "grad_norm": 2.4312862834547175, "language_loss": 0.6953969, "learning_rate": 6.26749340332815e-08, "loss": 0.71630651, "num_input_tokens_seen": 330967795, "step": 15343, "time_per_iteration": 3.0083987712860107 }, { "auxiliary_loss_clip": 0.01011696, "auxiliary_loss_mlp": 0.0100494, "balance_loss_clip": 1.008178, "balance_loss_mlp": 1.00394428, "epoch": 0.9225311889373216, "flos": 66722171794560.0, "grad_norm": 0.7265525500100153, "language_loss": 0.51983988, "learning_rate": 6.257823605935786e-08, "loss": 0.54000616, "num_input_tokens_seen": 331040850, "step": 15344, "time_per_iteration": 3.4099650382995605 }, { "auxiliary_loss_clip": 0.01104022, "auxiliary_loss_mlp": 0.01032501, "balance_loss_clip": 1.03630853, "balance_loss_mlp": 1.0211345, "epoch": 0.9225913121899895, "flos": 22271295398400.0, "grad_norm": 1.703377825859211, "language_loss": 0.70327353, "learning_rate": 6.248161155266162e-08, "loss": 0.7246387, "num_input_tokens_seen": 331060595, "step": 15345, "time_per_iteration": 2.576371431350708 }, { "auxiliary_loss_clip": 0.01087623, "auxiliary_loss_mlp": 0.01037657, "balance_loss_clip": 1.03598809, "balance_loss_mlp": 1.02505088, "epoch": 0.9226514354426575, "flos": 20082719721600.0, "grad_norm": 2.157686246893833, "language_loss": 0.77242136, "learning_rate": 6.238506051685677e-08, "loss": 0.79367411, "num_input_tokens_seen": 331080195, "step": 15346, "time_per_iteration": 2.6608493328094482 }, { "auxiliary_loss_clip": 0.01089778, "auxiliary_loss_mlp": 0.01037854, "balance_loss_clip": 1.03755546, "balance_loss_mlp": 1.02469873, "epoch": 0.9227115586953254, "flos": 16070851728000.0, "grad_norm": 1.7988632334787429, "language_loss": 0.76320672, "learning_rate": 6.228858295560457e-08, "loss": 0.78448308, "num_input_tokens_seen": 331097645, "step": 15347, "time_per_iteration": 2.6887784004211426 }, { "auxiliary_loss_clip": 0.01095866, "auxiliary_loss_mlp": 0.01030722, "balance_loss_clip": 1.03849506, "balance_loss_mlp": 1.01933718, "epoch": 0.9227716819479934, "flos": 20446027833600.0, "grad_norm": 1.7976243281525446, "language_loss": 0.76849055, "learning_rate": 6.219217887256367e-08, "loss": 0.78975642, "num_input_tokens_seen": 331116830, "step": 15348, "time_per_iteration": 2.6028568744659424 }, { "auxiliary_loss_clip": 0.01087325, "auxiliary_loss_mlp": 0.01033594, "balance_loss_clip": 1.03495049, "balance_loss_mlp": 1.02063584, "epoch": 0.9228318052006613, "flos": 25007401065600.0, "grad_norm": 1.9643980003377204, "language_loss": 0.67811698, "learning_rate": 6.209584827138959e-08, "loss": 0.69932616, "num_input_tokens_seen": 331137235, "step": 15349, "time_per_iteration": 2.6671433448791504 }, { "auxiliary_loss_clip": 0.01067448, "auxiliary_loss_mlp": 0.01030854, "balance_loss_clip": 1.03284109, "balance_loss_mlp": 1.01793194, "epoch": 0.9228919284533293, "flos": 12677257560960.0, "grad_norm": 5.153703084259653, "language_loss": 0.86846638, "learning_rate": 6.199959115573495e-08, "loss": 0.88944942, "num_input_tokens_seen": 331153155, "step": 15350, "time_per_iteration": 2.703225612640381 }, { "auxiliary_loss_clip": 0.01009812, "auxiliary_loss_mlp": 0.01000808, "balance_loss_clip": 1.00661051, "balance_loss_mlp": 0.9998011, "epoch": 0.9229520517059973, "flos": 69986162712960.0, "grad_norm": 0.7762360061430656, "language_loss": 0.60365206, "learning_rate": 6.190340752924994e-08, "loss": 0.62375826, "num_input_tokens_seen": 331214895, "step": 15351, "time_per_iteration": 3.158869504928589 }, { "auxiliary_loss_clip": 0.01083781, "auxiliary_loss_mlp": 0.01026683, "balance_loss_clip": 1.03323722, "balance_loss_mlp": 1.01475024, "epoch": 0.9230121749586653, "flos": 14793832425600.0, "grad_norm": 1.832472265730707, "language_loss": 0.77387846, "learning_rate": 6.180729739558233e-08, "loss": 0.79498303, "num_input_tokens_seen": 331232185, "step": 15352, "time_per_iteration": 2.627760171890259 }, { "auxiliary_loss_clip": 0.0107378, "auxiliary_loss_mlp": 0.01042996, "balance_loss_clip": 1.03285944, "balance_loss_mlp": 1.0284164, "epoch": 0.9230722982113332, "flos": 22967208472320.0, "grad_norm": 1.8679415758316251, "language_loss": 0.59430194, "learning_rate": 6.171126075837585e-08, "loss": 0.61546969, "num_input_tokens_seen": 331251065, "step": 15353, "time_per_iteration": 2.7041702270507812 }, { "auxiliary_loss_clip": 0.01083679, "auxiliary_loss_mlp": 0.01026903, "balance_loss_clip": 1.034688, "balance_loss_mlp": 1.01505327, "epoch": 0.9231324214640012, "flos": 18551452976640.0, "grad_norm": 1.711390419205093, "language_loss": 0.7429471, "learning_rate": 6.161529762127293e-08, "loss": 0.76405293, "num_input_tokens_seen": 331269110, "step": 15354, "time_per_iteration": 2.6137607097625732 }, { "auxiliary_loss_clip": 0.01112951, "auxiliary_loss_mlp": 0.01036765, "balance_loss_clip": 1.03797793, "balance_loss_mlp": 1.02363443, "epoch": 0.9231925447166691, "flos": 22082727974400.0, "grad_norm": 2.2506024709408345, "language_loss": 0.64660299, "learning_rate": 6.1519407987912e-08, "loss": 0.66810012, "num_input_tokens_seen": 331286555, "step": 15355, "time_per_iteration": 2.562422275543213 }, { "auxiliary_loss_clip": 0.0108125, "auxiliary_loss_mlp": 0.01041248, "balance_loss_clip": 1.03451049, "balance_loss_mlp": 1.02839768, "epoch": 0.9232526679693371, "flos": 26541145848960.0, "grad_norm": 1.5394884585282018, "language_loss": 0.7420373, "learning_rate": 6.142359186192947e-08, "loss": 0.76326227, "num_input_tokens_seen": 331307660, "step": 15356, "time_per_iteration": 2.6385319232940674 }, { "auxiliary_loss_clip": 0.01084284, "auxiliary_loss_mlp": 0.01035835, "balance_loss_clip": 1.03417146, "balance_loss_mlp": 1.02270436, "epoch": 0.9233127912220052, "flos": 14756664827520.0, "grad_norm": 1.6931372093662804, "language_loss": 0.60944784, "learning_rate": 6.132784924695844e-08, "loss": 0.63064903, "num_input_tokens_seen": 331324885, "step": 15357, "time_per_iteration": 2.6290340423583984 }, { "auxiliary_loss_clip": 0.01082317, "auxiliary_loss_mlp": 0.01033403, "balance_loss_clip": 1.03603005, "balance_loss_mlp": 1.01992083, "epoch": 0.9233729144746731, "flos": 25261792162560.0, "grad_norm": 1.453584491070713, "language_loss": 0.70108932, "learning_rate": 6.123218014662956e-08, "loss": 0.72224653, "num_input_tokens_seen": 331345885, "step": 15358, "time_per_iteration": 2.752317190170288 }, { "auxiliary_loss_clip": 0.01108354, "auxiliary_loss_mlp": 0.01033314, "balance_loss_clip": 1.03619421, "balance_loss_mlp": 1.02111292, "epoch": 0.9234330377273411, "flos": 27849837968640.0, "grad_norm": 2.186505896470512, "language_loss": 0.73299533, "learning_rate": 6.113658456457104e-08, "loss": 0.754412, "num_input_tokens_seen": 331364320, "step": 15359, "time_per_iteration": 2.597811460494995 }, { "auxiliary_loss_clip": 0.01047199, "auxiliary_loss_mlp": 0.01033574, "balance_loss_clip": 1.03515124, "balance_loss_mlp": 1.02113438, "epoch": 0.923493160980009, "flos": 24608361899520.0, "grad_norm": 1.822606379106128, "language_loss": 0.64573818, "learning_rate": 6.104106250440732e-08, "loss": 0.66654599, "num_input_tokens_seen": 331384135, "step": 15360, "time_per_iteration": 2.8328487873077393 }, { "auxiliary_loss_clip": 0.01017958, "auxiliary_loss_mlp": 0.00751388, "balance_loss_clip": 1.00556254, "balance_loss_mlp": 0.99968225, "epoch": 0.923553284232677, "flos": 67700916558720.0, "grad_norm": 0.7601562180978135, "language_loss": 0.5516786, "learning_rate": 6.094561396976083e-08, "loss": 0.56937212, "num_input_tokens_seen": 331440645, "step": 15361, "time_per_iteration": 3.0788414478302 }, { "auxiliary_loss_clip": 0.01075936, "auxiliary_loss_mlp": 0.01031297, "balance_loss_clip": 1.03308797, "balance_loss_mlp": 1.01755273, "epoch": 0.9236134074853449, "flos": 18807244704000.0, "grad_norm": 1.8816264544050445, "language_loss": 0.69994414, "learning_rate": 6.085023896425112e-08, "loss": 0.72101647, "num_input_tokens_seen": 331459580, "step": 15362, "time_per_iteration": 4.193416118621826 }, { "auxiliary_loss_clip": 0.01094932, "auxiliary_loss_mlp": 0.0103223, "balance_loss_clip": 1.0347358, "balance_loss_mlp": 1.01748371, "epoch": 0.923673530738013, "flos": 27782362270080.0, "grad_norm": 1.488556622948845, "language_loss": 0.75529814, "learning_rate": 6.075493749149463e-08, "loss": 0.77656972, "num_input_tokens_seen": 331481560, "step": 15363, "time_per_iteration": 2.6561429500579834 }, { "auxiliary_loss_clip": 0.01109631, "auxiliary_loss_mlp": 0.01029622, "balance_loss_clip": 1.03737402, "balance_loss_mlp": 1.01739717, "epoch": 0.9237336539906809, "flos": 26797117144320.0, "grad_norm": 1.930031039204044, "language_loss": 0.82993495, "learning_rate": 6.065970955510514e-08, "loss": 0.85132754, "num_input_tokens_seen": 331499090, "step": 15364, "time_per_iteration": 5.842444181442261 }, { "auxiliary_loss_clip": 0.01074668, "auxiliary_loss_mlp": 0.0102538, "balance_loss_clip": 1.03544402, "balance_loss_mlp": 1.01388252, "epoch": 0.9237937772433489, "flos": 23587708942080.0, "grad_norm": 1.4281985355444542, "language_loss": 0.67964804, "learning_rate": 6.056455515869419e-08, "loss": 0.70064855, "num_input_tokens_seen": 331519420, "step": 15365, "time_per_iteration": 2.7319743633270264 }, { "auxiliary_loss_clip": 0.01109561, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.03753567, "balance_loss_mlp": 1.01805329, "epoch": 0.9238539004960168, "flos": 26140562398080.0, "grad_norm": 2.080228129033925, "language_loss": 0.62466252, "learning_rate": 6.046947430586913e-08, "loss": 0.64606285, "num_input_tokens_seen": 331538720, "step": 15366, "time_per_iteration": 4.141804456710815 }, { "auxiliary_loss_clip": 0.01076799, "auxiliary_loss_mlp": 0.01028677, "balance_loss_clip": 1.03669524, "balance_loss_mlp": 1.01587987, "epoch": 0.9239140237486848, "flos": 21068000760960.0, "grad_norm": 1.4710054055259818, "language_loss": 0.74650168, "learning_rate": 6.037446700023619e-08, "loss": 0.76755643, "num_input_tokens_seen": 331558505, "step": 15367, "time_per_iteration": 2.6937508583068848 }, { "auxiliary_loss_clip": 0.01083975, "auxiliary_loss_mlp": 0.00768707, "balance_loss_clip": 1.03666592, "balance_loss_mlp": 1.0002172, "epoch": 0.9239741470013527, "flos": 24607930936320.0, "grad_norm": 2.0965464238234857, "language_loss": 0.65042406, "learning_rate": 6.027953324539759e-08, "loss": 0.66895086, "num_input_tokens_seen": 331578440, "step": 15368, "time_per_iteration": 2.7437262535095215 }, { "auxiliary_loss_clip": 0.01101382, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.03610659, "balance_loss_mlp": 1.02171481, "epoch": 0.9240342702540207, "flos": 24718248581760.0, "grad_norm": 1.7086931963123835, "language_loss": 0.74773824, "learning_rate": 6.018467304495401e-08, "loss": 0.76909912, "num_input_tokens_seen": 331598945, "step": 15369, "time_per_iteration": 2.6743035316467285 }, { "auxiliary_loss_clip": 0.01104923, "auxiliary_loss_mlp": 0.01037418, "balance_loss_clip": 1.04013598, "balance_loss_mlp": 1.02334499, "epoch": 0.9240943935066888, "flos": 20849987162880.0, "grad_norm": 2.038986001331123, "language_loss": 0.76338404, "learning_rate": 6.008988640250145e-08, "loss": 0.78480744, "num_input_tokens_seen": 331616700, "step": 15370, "time_per_iteration": 2.607760429382324 }, { "auxiliary_loss_clip": 0.01109143, "auxiliary_loss_mlp": 0.01031875, "balance_loss_clip": 1.03663468, "balance_loss_mlp": 1.0196085, "epoch": 0.9241545167593567, "flos": 24462313200000.0, "grad_norm": 2.4044373841458495, "language_loss": 0.66958445, "learning_rate": 5.999517332163528e-08, "loss": 0.69099462, "num_input_tokens_seen": 331635625, "step": 15371, "time_per_iteration": 2.6164920330047607 }, { "auxiliary_loss_clip": 0.01011382, "auxiliary_loss_mlp": 0.01002865, "balance_loss_clip": 1.00752091, "balance_loss_mlp": 1.00176203, "epoch": 0.9242146400120247, "flos": 61827259847040.0, "grad_norm": 0.7346104185663305, "language_loss": 0.57681966, "learning_rate": 5.99005338059464e-08, "loss": 0.59696221, "num_input_tokens_seen": 331698595, "step": 15372, "time_per_iteration": 3.1782453060150146 }, { "auxiliary_loss_clip": 0.01108932, "auxiliary_loss_mlp": 0.01030295, "balance_loss_clip": 1.03912938, "balance_loss_mlp": 1.01884508, "epoch": 0.9242747632646926, "flos": 22048397550720.0, "grad_norm": 2.204946411354503, "language_loss": 0.70037317, "learning_rate": 5.98059678590237e-08, "loss": 0.72176552, "num_input_tokens_seen": 331717975, "step": 15373, "time_per_iteration": 2.5716300010681152 }, { "auxiliary_loss_clip": 0.01093668, "auxiliary_loss_mlp": 0.01037655, "balance_loss_clip": 1.03443408, "balance_loss_mlp": 1.02547812, "epoch": 0.9243348865173606, "flos": 18478338842880.0, "grad_norm": 2.311286050873559, "language_loss": 0.75668836, "learning_rate": 5.971147548445299e-08, "loss": 0.77800161, "num_input_tokens_seen": 331737220, "step": 15374, "time_per_iteration": 2.6773972511291504 }, { "auxiliary_loss_clip": 0.01071113, "auxiliary_loss_mlp": 0.01034847, "balance_loss_clip": 1.03411102, "balance_loss_mlp": 1.02240705, "epoch": 0.9243950097700285, "flos": 23258767167360.0, "grad_norm": 1.61997040297718, "language_loss": 0.64933169, "learning_rate": 5.961705668581784e-08, "loss": 0.67039132, "num_input_tokens_seen": 331757300, "step": 15375, "time_per_iteration": 2.724712371826172 }, { "auxiliary_loss_clip": 0.01080494, "auxiliary_loss_mlp": 0.01034023, "balance_loss_clip": 1.03776979, "balance_loss_mlp": 1.02213168, "epoch": 0.9244551330226966, "flos": 29749081593600.0, "grad_norm": 1.73726236759537, "language_loss": 0.66592222, "learning_rate": 5.952271146669829e-08, "loss": 0.68706739, "num_input_tokens_seen": 331776995, "step": 15376, "time_per_iteration": 2.7318432331085205 }, { "auxiliary_loss_clip": 0.01027325, "auxiliary_loss_mlp": 0.01000699, "balance_loss_clip": 1.004807, "balance_loss_mlp": 0.99974555, "epoch": 0.9245152562753645, "flos": 68864960609280.0, "grad_norm": 0.6503040166791668, "language_loss": 0.61148441, "learning_rate": 5.94284398306717e-08, "loss": 0.63176465, "num_input_tokens_seen": 331845015, "step": 15377, "time_per_iteration": 3.3028318881988525 }, { "auxiliary_loss_clip": 0.01066627, "auxiliary_loss_mlp": 0.01036109, "balance_loss_clip": 1.03230667, "balance_loss_mlp": 1.02378881, "epoch": 0.9245753795280325, "flos": 21579260993280.0, "grad_norm": 1.8441759941724984, "language_loss": 0.74442959, "learning_rate": 5.933424178131341e-08, "loss": 0.76545691, "num_input_tokens_seen": 331862795, "step": 15378, "time_per_iteration": 2.7058117389678955 }, { "auxiliary_loss_clip": 0.01111214, "auxiliary_loss_mlp": 0.0103357, "balance_loss_clip": 1.03782296, "balance_loss_mlp": 1.02046919, "epoch": 0.9246355027807004, "flos": 34496077334400.0, "grad_norm": 2.4430826807179122, "language_loss": 0.62603706, "learning_rate": 5.924011732219503e-08, "loss": 0.6474849, "num_input_tokens_seen": 331882535, "step": 15379, "time_per_iteration": 2.672102928161621 }, { "auxiliary_loss_clip": 0.01027241, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 1.03090858, "balance_loss_mlp": 1.02008975, "epoch": 0.9246956260333684, "flos": 15953854152960.0, "grad_norm": 2.0533190004869133, "language_loss": 0.83975178, "learning_rate": 5.914606645688591e-08, "loss": 0.86036825, "num_input_tokens_seen": 331899335, "step": 15380, "time_per_iteration": 2.8909592628479004 }, { "auxiliary_loss_clip": 0.01110328, "auxiliary_loss_mlp": 0.01034939, "balance_loss_clip": 1.03606331, "balance_loss_mlp": 1.02165866, "epoch": 0.9247557492860363, "flos": 23368366540800.0, "grad_norm": 1.485445137788739, "language_loss": 0.73372233, "learning_rate": 5.905208918895233e-08, "loss": 0.75517505, "num_input_tokens_seen": 331919030, "step": 15381, "time_per_iteration": 2.6360280513763428 }, { "auxiliary_loss_clip": 0.01093808, "auxiliary_loss_mlp": 0.01032177, "balance_loss_clip": 1.03822911, "balance_loss_mlp": 1.01991057, "epoch": 0.9248158725387043, "flos": 23039855729280.0, "grad_norm": 1.7916241982506211, "language_loss": 0.78368294, "learning_rate": 5.8958185521958524e-08, "loss": 0.80494279, "num_input_tokens_seen": 331936465, "step": 15382, "time_per_iteration": 2.6322009563446045 }, { "auxiliary_loss_clip": 0.01085867, "auxiliary_loss_mlp": 0.01035748, "balance_loss_clip": 1.03582263, "balance_loss_mlp": 1.02334988, "epoch": 0.9248759957913724, "flos": 22522418357760.0, "grad_norm": 1.7508606986515263, "language_loss": 0.75239515, "learning_rate": 5.886435545946455e-08, "loss": 0.77361131, "num_input_tokens_seen": 331954625, "step": 15383, "time_per_iteration": 2.6861977577209473 }, { "auxiliary_loss_clip": 0.01084507, "auxiliary_loss_mlp": 0.01026683, "balance_loss_clip": 1.03370142, "balance_loss_mlp": 1.0149585, "epoch": 0.9249361190440403, "flos": 25447271016960.0, "grad_norm": 1.6935679354612814, "language_loss": 0.75976408, "learning_rate": 5.8770599005028456e-08, "loss": 0.78087592, "num_input_tokens_seen": 331975865, "step": 15384, "time_per_iteration": 2.7150700092315674 }, { "auxiliary_loss_clip": 0.01075864, "auxiliary_loss_mlp": 0.01031764, "balance_loss_clip": 1.03653836, "balance_loss_mlp": 1.01944935, "epoch": 0.9249962422967083, "flos": 12378623886720.0, "grad_norm": 3.7142425878175223, "language_loss": 0.66128278, "learning_rate": 5.8676916162206045e-08, "loss": 0.68235904, "num_input_tokens_seen": 331992760, "step": 15385, "time_per_iteration": 2.7027785778045654 }, { "auxiliary_loss_clip": 0.01106721, "auxiliary_loss_mlp": 0.01033952, "balance_loss_clip": 1.03535783, "balance_loss_mlp": 1.02156663, "epoch": 0.9250563655493762, "flos": 22929430343040.0, "grad_norm": 1.9713792057532418, "language_loss": 0.81076729, "learning_rate": 5.85833069345496e-08, "loss": 0.83217394, "num_input_tokens_seen": 332011890, "step": 15386, "time_per_iteration": 2.5687849521636963 }, { "auxiliary_loss_clip": 0.01094924, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.03508687, "balance_loss_mlp": 1.02440369, "epoch": 0.9251164888020442, "flos": 18478662065280.0, "grad_norm": 1.617640933108123, "language_loss": 0.75817406, "learning_rate": 5.8489771325608504e-08, "loss": 0.77949309, "num_input_tokens_seen": 332029485, "step": 15387, "time_per_iteration": 2.6368582248687744 }, { "auxiliary_loss_clip": 0.0109213, "auxiliary_loss_mlp": 0.01034821, "balance_loss_clip": 1.03527617, "balance_loss_mlp": 1.02329326, "epoch": 0.9251766120547121, "flos": 33037062796800.0, "grad_norm": 1.4014991534530432, "language_loss": 0.700683, "learning_rate": 5.839630933893014e-08, "loss": 0.72195256, "num_input_tokens_seen": 332052970, "step": 15388, "time_per_iteration": 2.754608392715454 }, { "auxiliary_loss_clip": 0.01097095, "auxiliary_loss_mlp": 0.01030709, "balance_loss_clip": 1.03522015, "balance_loss_mlp": 1.01837683, "epoch": 0.9252367353073802, "flos": 24387906176640.0, "grad_norm": 2.101057031092642, "language_loss": 0.82379329, "learning_rate": 5.8302920978058115e-08, "loss": 0.84507132, "num_input_tokens_seen": 332070395, "step": 15389, "time_per_iteration": 2.6602766513824463 }, { "auxiliary_loss_clip": 0.01104924, "auxiliary_loss_mlp": 0.01031779, "balance_loss_clip": 1.03799844, "balance_loss_mlp": 1.01822543, "epoch": 0.9252968585600481, "flos": 18916844077440.0, "grad_norm": 1.6739329388921937, "language_loss": 0.79294932, "learning_rate": 5.820960624653381e-08, "loss": 0.81431639, "num_input_tokens_seen": 332090185, "step": 15390, "time_per_iteration": 2.623624563217163 }, { "auxiliary_loss_clip": 0.01076005, "auxiliary_loss_mlp": 0.01039788, "balance_loss_clip": 1.03474212, "balance_loss_mlp": 1.02668691, "epoch": 0.9253569818127161, "flos": 21725345606400.0, "grad_norm": 1.640565709280766, "language_loss": 0.75278354, "learning_rate": 5.811636514789597e-08, "loss": 0.77394152, "num_input_tokens_seen": 332109050, "step": 15391, "time_per_iteration": 2.6627962589263916 }, { "auxiliary_loss_clip": 0.0108717, "auxiliary_loss_mlp": 0.01033982, "balance_loss_clip": 1.03444612, "balance_loss_mlp": 1.02011776, "epoch": 0.925417105065384, "flos": 34240357434240.0, "grad_norm": 2.199534306893137, "language_loss": 0.52898717, "learning_rate": 5.80231976856802e-08, "loss": 0.55019867, "num_input_tokens_seen": 332131180, "step": 15392, "time_per_iteration": 2.780339479446411 }, { "auxiliary_loss_clip": 0.01106895, "auxiliary_loss_mlp": 0.01032462, "balance_loss_clip": 1.03467309, "balance_loss_mlp": 1.02046967, "epoch": 0.925477228318052, "flos": 25959536830080.0, "grad_norm": 2.305545825428345, "language_loss": 0.77058631, "learning_rate": 5.7930103863419454e-08, "loss": 0.79197991, "num_input_tokens_seen": 332149555, "step": 15393, "time_per_iteration": 2.602755069732666 }, { "auxiliary_loss_clip": 0.01078205, "auxiliary_loss_mlp": 0.01032904, "balance_loss_clip": 1.03438604, "balance_loss_mlp": 1.02044106, "epoch": 0.9255373515707199, "flos": 11838240702720.0, "grad_norm": 3.40082651052626, "language_loss": 0.69679272, "learning_rate": 5.783708368464357e-08, "loss": 0.71790373, "num_input_tokens_seen": 332165830, "step": 15394, "time_per_iteration": 2.6185452938079834 }, { "auxiliary_loss_clip": 0.0110989, "auxiliary_loss_mlp": 0.01030861, "balance_loss_clip": 1.0379678, "balance_loss_mlp": 1.01829052, "epoch": 0.925597474823388, "flos": 21434325615360.0, "grad_norm": 1.7473180632451621, "language_loss": 0.72795445, "learning_rate": 5.7744137152879956e-08, "loss": 0.74936193, "num_input_tokens_seen": 332185130, "step": 15395, "time_per_iteration": 2.6088504791259766 }, { "auxiliary_loss_clip": 0.01057286, "auxiliary_loss_mlp": 0.01032502, "balance_loss_clip": 1.03256583, "balance_loss_mlp": 1.02103376, "epoch": 0.925657598076056, "flos": 22857573185280.0, "grad_norm": 1.8404815888609334, "language_loss": 0.71465933, "learning_rate": 5.7651264271653785e-08, "loss": 0.7355572, "num_input_tokens_seen": 332203695, "step": 15396, "time_per_iteration": 2.7053260803222656 }, { "auxiliary_loss_clip": 0.01106531, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.03571641, "balance_loss_mlp": 1.01807809, "epoch": 0.9257177213287239, "flos": 25704032411520.0, "grad_norm": 1.6857437416132761, "language_loss": 0.87266874, "learning_rate": 5.755846504448603e-08, "loss": 0.8940419, "num_input_tokens_seen": 332224850, "step": 15397, "time_per_iteration": 2.5987706184387207 }, { "auxiliary_loss_clip": 0.01027242, "auxiliary_loss_mlp": 0.00998861, "balance_loss_clip": 1.00477362, "balance_loss_mlp": 0.9978596, "epoch": 0.9257778445813919, "flos": 59592933221760.0, "grad_norm": 0.80472899949222, "language_loss": 0.55124933, "learning_rate": 5.746573947489586e-08, "loss": 0.57151037, "num_input_tokens_seen": 332278085, "step": 15398, "time_per_iteration": 3.022796869277954 }, { "auxiliary_loss_clip": 0.01088846, "auxiliary_loss_mlp": 0.01032308, "balance_loss_clip": 1.03451252, "balance_loss_mlp": 1.01805639, "epoch": 0.9258379678340598, "flos": 27709427704320.0, "grad_norm": 1.9788973951742164, "language_loss": 0.76231545, "learning_rate": 5.7373087566400025e-08, "loss": 0.78352696, "num_input_tokens_seen": 332297875, "step": 15399, "time_per_iteration": 2.6782071590423584 }, { "auxiliary_loss_clip": 0.01077436, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.03120267, "balance_loss_mlp": 1.01828766, "epoch": 0.9258980910867278, "flos": 24863543095680.0, "grad_norm": 1.6019049009837816, "language_loss": 0.78070617, "learning_rate": 5.7280509322510826e-08, "loss": 0.8017754, "num_input_tokens_seen": 332318500, "step": 15400, "time_per_iteration": 2.6918084621429443 }, { "auxiliary_loss_clip": 0.01019125, "auxiliary_loss_mlp": 0.01002511, "balance_loss_clip": 1.00581372, "balance_loss_mlp": 1.00144386, "epoch": 0.9259582143393957, "flos": 63134587249920.0, "grad_norm": 0.7229223052902047, "language_loss": 0.51348114, "learning_rate": 5.718800474673946e-08, "loss": 0.53369749, "num_input_tokens_seen": 332381980, "step": 15401, "time_per_iteration": 4.655211448669434 }, { "auxiliary_loss_clip": 0.01095608, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.03721333, "balance_loss_mlp": 1.02316141, "epoch": 0.9260183375920638, "flos": 24127122458880.0, "grad_norm": 1.7293202030573633, "language_loss": 0.8252185, "learning_rate": 5.709557384259378e-08, "loss": 0.84652448, "num_input_tokens_seen": 332399510, "step": 15402, "time_per_iteration": 2.7125723361968994 }, { "auxiliary_loss_clip": 0.01027546, "auxiliary_loss_mlp": 0.01001395, "balance_loss_clip": 1.00508666, "balance_loss_mlp": 1.0004828, "epoch": 0.9260784608447317, "flos": 63042872849280.0, "grad_norm": 0.7337152983858821, "language_loss": 0.51039803, "learning_rate": 5.700321661357876e-08, "loss": 0.53068745, "num_input_tokens_seen": 332459130, "step": 15403, "time_per_iteration": 4.670861005783081 }, { "auxiliary_loss_clip": 0.01007604, "auxiliary_loss_mlp": 0.01001265, "balance_loss_clip": 1.00503612, "balance_loss_mlp": 1.0004487, "epoch": 0.9261385840973997, "flos": 70585979927040.0, "grad_norm": 0.6850090598665447, "language_loss": 0.58746749, "learning_rate": 5.69109330631965e-08, "loss": 0.60755622, "num_input_tokens_seen": 332526555, "step": 15404, "time_per_iteration": 4.747823238372803 }, { "auxiliary_loss_clip": 0.01083395, "auxiliary_loss_mlp": 0.01035934, "balance_loss_clip": 1.03603053, "balance_loss_mlp": 1.02242208, "epoch": 0.9261987073500676, "flos": 20229917656320.0, "grad_norm": 2.2086830252227903, "language_loss": 0.71290517, "learning_rate": 5.681872319494596e-08, "loss": 0.73409843, "num_input_tokens_seen": 332544005, "step": 15405, "time_per_iteration": 2.6791491508483887 }, { "auxiliary_loss_clip": 0.01061911, "auxiliary_loss_mlp": 0.01037759, "balance_loss_clip": 1.03471172, "balance_loss_mlp": 1.02468121, "epoch": 0.9262588306027356, "flos": 20954163582720.0, "grad_norm": 1.7027063262346462, "language_loss": 0.68240035, "learning_rate": 5.672658701232458e-08, "loss": 0.70339704, "num_input_tokens_seen": 332563070, "step": 15406, "time_per_iteration": 4.413249731063843 }, { "auxiliary_loss_clip": 0.01056836, "auxiliary_loss_mlp": 0.01046779, "balance_loss_clip": 1.03164291, "balance_loss_mlp": 1.03166914, "epoch": 0.9263189538554035, "flos": 22158679282560.0, "grad_norm": 3.2449194361819234, "language_loss": 0.76197219, "learning_rate": 5.663452451882555e-08, "loss": 0.78300834, "num_input_tokens_seen": 332579620, "step": 15407, "time_per_iteration": 2.7800557613372803 }, { "auxiliary_loss_clip": 0.01076765, "auxiliary_loss_mlp": 0.01037766, "balance_loss_clip": 1.03282285, "balance_loss_mlp": 1.02446783, "epoch": 0.9263790771080715, "flos": 18187211111040.0, "grad_norm": 1.9744947993410575, "language_loss": 0.72376311, "learning_rate": 5.6542535717940096e-08, "loss": 0.74490839, "num_input_tokens_seen": 332597795, "step": 15408, "time_per_iteration": 2.654872179031372 }, { "auxiliary_loss_clip": 0.01077908, "auxiliary_loss_mlp": 0.01028464, "balance_loss_clip": 1.03418171, "balance_loss_mlp": 1.01781821, "epoch": 0.9264392003607396, "flos": 48178545004800.0, "grad_norm": 1.6454364766131493, "language_loss": 0.68587399, "learning_rate": 5.645062061315675e-08, "loss": 0.70693767, "num_input_tokens_seen": 332620375, "step": 15409, "time_per_iteration": 2.850269317626953 }, { "auxiliary_loss_clip": 0.01074672, "auxiliary_loss_mlp": 0.01030839, "balance_loss_clip": 1.03626847, "balance_loss_mlp": 1.01791096, "epoch": 0.9264993236134075, "flos": 26389458714240.0, "grad_norm": 2.1360446991021416, "language_loss": 0.75711519, "learning_rate": 5.6358779207960506e-08, "loss": 0.77817023, "num_input_tokens_seen": 332639510, "step": 15410, "time_per_iteration": 2.7220871448516846 }, { "auxiliary_loss_clip": 0.01057013, "auxiliary_loss_mlp": 0.01030911, "balance_loss_clip": 1.03571475, "balance_loss_mlp": 1.01858473, "epoch": 0.9265594468660755, "flos": 20920084554240.0, "grad_norm": 1.538032014217413, "language_loss": 0.82166702, "learning_rate": 5.6267011505833905e-08, "loss": 0.84254622, "num_input_tokens_seen": 332658350, "step": 15411, "time_per_iteration": 2.7539865970611572 }, { "auxiliary_loss_clip": 0.01085605, "auxiliary_loss_mlp": 0.01037973, "balance_loss_clip": 1.03824568, "balance_loss_mlp": 1.02592707, "epoch": 0.9266195701187434, "flos": 17525017929600.0, "grad_norm": 1.9373145629003894, "language_loss": 0.75171757, "learning_rate": 5.617531751025728e-08, "loss": 0.77295339, "num_input_tokens_seen": 332676715, "step": 15412, "time_per_iteration": 2.6214661598205566 }, { "auxiliary_loss_clip": 0.0110647, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 1.03467417, "balance_loss_mlp": 1.01769996, "epoch": 0.9266796933714114, "flos": 33688733293440.0, "grad_norm": 1.8589935579070962, "language_loss": 0.66795665, "learning_rate": 5.6083697224707406e-08, "loss": 0.68931985, "num_input_tokens_seen": 332701470, "step": 15413, "time_per_iteration": 2.690272808074951 }, { "auxiliary_loss_clip": 0.0105034, "auxiliary_loss_mlp": 0.0103768, "balance_loss_clip": 1.0341413, "balance_loss_mlp": 1.0243938, "epoch": 0.9267398166240793, "flos": 18916520855040.0, "grad_norm": 2.0080072670412794, "language_loss": 0.76213551, "learning_rate": 5.5992150652658167e-08, "loss": 0.78301573, "num_input_tokens_seen": 332719060, "step": 15414, "time_per_iteration": 2.858206033706665 }, { "auxiliary_loss_clip": 0.01094062, "auxiliary_loss_mlp": 0.01028504, "balance_loss_clip": 1.03724313, "balance_loss_mlp": 1.01658273, "epoch": 0.9267999398767474, "flos": 20478957626880.0, "grad_norm": 3.161190355469832, "language_loss": 0.81600469, "learning_rate": 5.59006777975819e-08, "loss": 0.83723032, "num_input_tokens_seen": 332736345, "step": 15415, "time_per_iteration": 2.6205687522888184 }, { "auxiliary_loss_clip": 0.01086858, "auxiliary_loss_mlp": 0.01034472, "balance_loss_clip": 1.03274202, "balance_loss_mlp": 1.02163887, "epoch": 0.9268600631294153, "flos": 24789351553920.0, "grad_norm": 1.3966462489262188, "language_loss": 0.54340827, "learning_rate": 5.580927866294671e-08, "loss": 0.56462157, "num_input_tokens_seen": 332756270, "step": 15416, "time_per_iteration": 2.73563814163208 }, { "auxiliary_loss_clip": 0.01067608, "auxiliary_loss_mlp": 0.01035609, "balance_loss_clip": 1.03344822, "balance_loss_mlp": 1.02311552, "epoch": 0.9269201863820833, "flos": 18697178453760.0, "grad_norm": 1.5074603678728897, "language_loss": 0.72186983, "learning_rate": 5.571795325221807e-08, "loss": 0.74290192, "num_input_tokens_seen": 332775185, "step": 15417, "time_per_iteration": 2.715012788772583 }, { "auxiliary_loss_clip": 0.01094578, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.03809214, "balance_loss_mlp": 1.02029991, "epoch": 0.9269803096347512, "flos": 20923999136640.0, "grad_norm": 4.4376132167371365, "language_loss": 0.7579149, "learning_rate": 5.5626701568859624e-08, "loss": 0.77919102, "num_input_tokens_seen": 332794320, "step": 15418, "time_per_iteration": 2.6377668380737305 }, { "auxiliary_loss_clip": 0.01095083, "auxiliary_loss_mlp": 0.01030369, "balance_loss_clip": 1.03478622, "balance_loss_mlp": 1.01755381, "epoch": 0.9270404328874192, "flos": 28002710252160.0, "grad_norm": 1.4381586641081634, "language_loss": 0.76076263, "learning_rate": 5.553552361633174e-08, "loss": 0.78201711, "num_input_tokens_seen": 332818095, "step": 15419, "time_per_iteration": 2.7292606830596924 }, { "auxiliary_loss_clip": 0.01104418, "auxiliary_loss_mlp": 0.0103305, "balance_loss_clip": 1.03501427, "balance_loss_mlp": 1.02151632, "epoch": 0.9271005561400871, "flos": 25889870401920.0, "grad_norm": 1.6372901887386972, "language_loss": 0.75610423, "learning_rate": 5.5444419398091636e-08, "loss": 0.77747887, "num_input_tokens_seen": 332839860, "step": 15420, "time_per_iteration": 2.6438181400299072 }, { "auxiliary_loss_clip": 0.01099967, "auxiliary_loss_mlp": 0.01032703, "balance_loss_clip": 1.03629184, "balance_loss_mlp": 1.01973283, "epoch": 0.9271606793927551, "flos": 27053914452480.0, "grad_norm": 1.6035461883801339, "language_loss": 0.77056849, "learning_rate": 5.535338891759389e-08, "loss": 0.79189527, "num_input_tokens_seen": 332861155, "step": 15421, "time_per_iteration": 2.6203770637512207 }, { "auxiliary_loss_clip": 0.0108251, "auxiliary_loss_mlp": 0.01032083, "balance_loss_clip": 1.03615677, "balance_loss_mlp": 1.0196619, "epoch": 0.9272208026454232, "flos": 26209869690240.0, "grad_norm": 2.078534179324168, "language_loss": 0.72883129, "learning_rate": 5.526243217829041e-08, "loss": 0.74997723, "num_input_tokens_seen": 332881110, "step": 15422, "time_per_iteration": 2.700307607650757 }, { "auxiliary_loss_clip": 0.01099345, "auxiliary_loss_mlp": 0.01039891, "balance_loss_clip": 1.03540778, "balance_loss_mlp": 1.0265038, "epoch": 0.9272809258980911, "flos": 12458453863680.0, "grad_norm": 1.8814873155718879, "language_loss": 0.77395117, "learning_rate": 5.517154918363065e-08, "loss": 0.79534352, "num_input_tokens_seen": 332899350, "step": 15423, "time_per_iteration": 2.7268893718719482 }, { "auxiliary_loss_clip": 0.01099209, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.03573775, "balance_loss_mlp": 1.01999736, "epoch": 0.9273410491507591, "flos": 22856890826880.0, "grad_norm": 2.3478977381899364, "language_loss": 0.75240654, "learning_rate": 5.508073993706053e-08, "loss": 0.77372921, "num_input_tokens_seen": 332918105, "step": 15424, "time_per_iteration": 2.6554524898529053 }, { "auxiliary_loss_clip": 0.01019493, "auxiliary_loss_mlp": 0.01002831, "balance_loss_clip": 1.006253, "balance_loss_mlp": 1.00180626, "epoch": 0.927401172403427, "flos": 47665384329600.0, "grad_norm": 0.7785886890233412, "language_loss": 0.60644341, "learning_rate": 5.499000444202351e-08, "loss": 0.62666667, "num_input_tokens_seen": 332969490, "step": 15425, "time_per_iteration": 2.9746127128601074 }, { "auxiliary_loss_clip": 0.01086691, "auxiliary_loss_mlp": 0.00770701, "balance_loss_clip": 1.03668869, "balance_loss_mlp": 1.00019503, "epoch": 0.927461295656095, "flos": 29972374490880.0, "grad_norm": 1.4170561273174695, "language_loss": 0.70912516, "learning_rate": 5.489934270196106e-08, "loss": 0.72769904, "num_input_tokens_seen": 332988805, "step": 15426, "time_per_iteration": 2.7353477478027344 }, { "auxiliary_loss_clip": 0.01083876, "auxiliary_loss_mlp": 0.01027564, "balance_loss_clip": 1.03567636, "balance_loss_mlp": 1.01585722, "epoch": 0.9275214189087629, "flos": 20375427651840.0, "grad_norm": 1.8095946188152212, "language_loss": 0.82924026, "learning_rate": 5.480875472030977e-08, "loss": 0.85035467, "num_input_tokens_seen": 333007960, "step": 15427, "time_per_iteration": 2.6290063858032227 }, { "auxiliary_loss_clip": 0.01074923, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.03522468, "balance_loss_mlp": 1.02114439, "epoch": 0.927581542161431, "flos": 22383193242240.0, "grad_norm": 1.5641364814856114, "language_loss": 0.77063322, "learning_rate": 5.471824050050555e-08, "loss": 0.79172319, "num_input_tokens_seen": 333026035, "step": 15428, "time_per_iteration": 2.7724509239196777 }, { "auxiliary_loss_clip": 0.01068711, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.03291845, "balance_loss_mlp": 1.02194142, "epoch": 0.9276416654140989, "flos": 23952453598080.0, "grad_norm": 1.8224763848392078, "language_loss": 0.74805522, "learning_rate": 5.4627800045980555e-08, "loss": 0.76908326, "num_input_tokens_seen": 333045590, "step": 15429, "time_per_iteration": 2.859591245651245 }, { "auxiliary_loss_clip": 0.01070146, "auxiliary_loss_mlp": 0.01033725, "balance_loss_clip": 1.03224564, "balance_loss_mlp": 1.02171516, "epoch": 0.9277017886667669, "flos": 13917719796480.0, "grad_norm": 1.7936478622179974, "language_loss": 0.74859536, "learning_rate": 5.45374333601647e-08, "loss": 0.76963401, "num_input_tokens_seen": 333063355, "step": 15430, "time_per_iteration": 2.7022671699523926 }, { "auxiliary_loss_clip": 0.01097528, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.03492427, "balance_loss_mlp": 1.02224135, "epoch": 0.9277619119194348, "flos": 35666478092160.0, "grad_norm": 1.3597069220305837, "language_loss": 0.76239693, "learning_rate": 5.444714044648391e-08, "loss": 0.78372276, "num_input_tokens_seen": 333088045, "step": 15431, "time_per_iteration": 2.7746591567993164 }, { "auxiliary_loss_clip": 0.01095653, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.03667474, "balance_loss_mlp": 1.01806593, "epoch": 0.9278220351721028, "flos": 23841238112640.0, "grad_norm": 1.8615596661189457, "language_loss": 0.70812196, "learning_rate": 5.4356921308363e-08, "loss": 0.72938174, "num_input_tokens_seen": 333108005, "step": 15432, "time_per_iteration": 2.6617555618286133 }, { "auxiliary_loss_clip": 0.01063577, "auxiliary_loss_mlp": 0.01032538, "balance_loss_clip": 1.03703666, "balance_loss_mlp": 1.02040219, "epoch": 0.9278821584247707, "flos": 15228135768960.0, "grad_norm": 2.1745322103620044, "language_loss": 0.81965214, "learning_rate": 5.4266775949222354e-08, "loss": 0.84061331, "num_input_tokens_seen": 333124335, "step": 15433, "time_per_iteration": 2.669423818588257 }, { "auxiliary_loss_clip": 0.01104445, "auxiliary_loss_mlp": 0.01028126, "balance_loss_clip": 1.03630841, "balance_loss_mlp": 1.01734364, "epoch": 0.9279422816774388, "flos": 24681404206080.0, "grad_norm": 1.9129971221663375, "language_loss": 0.66100991, "learning_rate": 5.417670437248056e-08, "loss": 0.68233562, "num_input_tokens_seen": 333143995, "step": 15434, "time_per_iteration": 2.5970053672790527 }, { "auxiliary_loss_clip": 0.01077405, "auxiliary_loss_mlp": 0.01030104, "balance_loss_clip": 1.03205669, "balance_loss_mlp": 1.0184691, "epoch": 0.9280024049301068, "flos": 19169188099200.0, "grad_norm": 1.6749341071635755, "language_loss": 0.68276256, "learning_rate": 5.40867065815529e-08, "loss": 0.70383763, "num_input_tokens_seen": 333162805, "step": 15435, "time_per_iteration": 2.6586270332336426 }, { "auxiliary_loss_clip": 0.0110958, "auxiliary_loss_mlp": 0.01031502, "balance_loss_clip": 1.03709114, "balance_loss_mlp": 1.01851392, "epoch": 0.9280625281827747, "flos": 11393701983360.0, "grad_norm": 1.9176254237254773, "language_loss": 0.72329485, "learning_rate": 5.399678257985263e-08, "loss": 0.74470568, "num_input_tokens_seen": 333175770, "step": 15436, "time_per_iteration": 2.5913889408111572 }, { "auxiliary_loss_clip": 0.01083394, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.03532553, "balance_loss_mlp": 1.01925242, "epoch": 0.9281226514354427, "flos": 24785616539520.0, "grad_norm": 1.984161101089214, "language_loss": 0.66967964, "learning_rate": 5.390693237078925e-08, "loss": 0.69082779, "num_input_tokens_seen": 333194775, "step": 15437, "time_per_iteration": 2.681321144104004 }, { "auxiliary_loss_clip": 0.01098237, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.03667545, "balance_loss_mlp": 1.02040851, "epoch": 0.9281827746881106, "flos": 15083128563840.0, "grad_norm": 1.9610204855405788, "language_loss": 0.71449178, "learning_rate": 5.3817155957770254e-08, "loss": 0.73581135, "num_input_tokens_seen": 333208920, "step": 15438, "time_per_iteration": 2.6930477619171143 }, { "auxiliary_loss_clip": 0.01108794, "auxiliary_loss_mlp": 0.01029773, "balance_loss_clip": 1.03654385, "balance_loss_mlp": 1.01757753, "epoch": 0.9282428979407786, "flos": 24135059364480.0, "grad_norm": 1.5819700006479365, "language_loss": 0.64896679, "learning_rate": 5.3727453344199366e-08, "loss": 0.67035246, "num_input_tokens_seen": 333229350, "step": 15439, "time_per_iteration": 2.6033389568328857 }, { "auxiliary_loss_clip": 0.01085049, "auxiliary_loss_mlp": 0.01030625, "balance_loss_clip": 1.0345124, "balance_loss_mlp": 1.01829338, "epoch": 0.9283030211934465, "flos": 24823215100800.0, "grad_norm": 1.6494013213160663, "language_loss": 0.70283854, "learning_rate": 5.363782453347876e-08, "loss": 0.72399533, "num_input_tokens_seen": 333246125, "step": 15440, "time_per_iteration": 4.2000510692596436 }, { "auxiliary_loss_clip": 0.01072935, "auxiliary_loss_mlp": 0.00771755, "balance_loss_clip": 1.03401041, "balance_loss_mlp": 1.00015736, "epoch": 0.9283631444461146, "flos": 23981037845760.0, "grad_norm": 1.6732819714869454, "language_loss": 0.76933122, "learning_rate": 5.354826952900682e-08, "loss": 0.78777814, "num_input_tokens_seen": 333263685, "step": 15441, "time_per_iteration": 2.6747872829437256 }, { "auxiliary_loss_clip": 0.01091447, "auxiliary_loss_mlp": 0.01029564, "balance_loss_clip": 1.03517056, "balance_loss_mlp": 1.01912725, "epoch": 0.9284232676987825, "flos": 22784530878720.0, "grad_norm": 2.5725907020773073, "language_loss": 0.64269817, "learning_rate": 5.345878833417949e-08, "loss": 0.6639083, "num_input_tokens_seen": 333282435, "step": 15442, "time_per_iteration": 4.172094106674194 }, { "auxiliary_loss_clip": 0.01064932, "auxiliary_loss_mlp": 0.01047387, "balance_loss_clip": 1.03302109, "balance_loss_mlp": 1.03327239, "epoch": 0.9284833909514505, "flos": 19500500171520.0, "grad_norm": 1.9244730176476685, "language_loss": 0.80415273, "learning_rate": 5.3369380952390295e-08, "loss": 0.8252759, "num_input_tokens_seen": 333300400, "step": 15443, "time_per_iteration": 4.384172201156616 }, { "auxiliary_loss_clip": 0.01098927, "auxiliary_loss_mlp": 0.00770098, "balance_loss_clip": 1.03640699, "balance_loss_mlp": 1.00019741, "epoch": 0.9285435142041184, "flos": 23185976256000.0, "grad_norm": 2.3934730189580278, "language_loss": 0.6569308, "learning_rate": 5.328004738702896e-08, "loss": 0.67562109, "num_input_tokens_seen": 333318980, "step": 15444, "time_per_iteration": 2.6536576747894287 }, { "auxiliary_loss_clip": 0.01066958, "auxiliary_loss_mlp": 0.01030149, "balance_loss_clip": 1.03394997, "balance_loss_mlp": 1.01776958, "epoch": 0.9286036374567864, "flos": 17675519915520.0, "grad_norm": 1.9812585053043275, "language_loss": 0.73909259, "learning_rate": 5.3190787641483215e-08, "loss": 0.76006365, "num_input_tokens_seen": 333334135, "step": 15445, "time_per_iteration": 4.150733947753906 }, { "auxiliary_loss_clip": 0.0109372, "auxiliary_loss_mlp": 0.01039239, "balance_loss_clip": 1.03644156, "balance_loss_mlp": 1.02563095, "epoch": 0.9286637607094543, "flos": 20886687884160.0, "grad_norm": 1.737150069171567, "language_loss": 0.71495092, "learning_rate": 5.3101601719138135e-08, "loss": 0.7362805, "num_input_tokens_seen": 333353325, "step": 15446, "time_per_iteration": 2.6059980392456055 }, { "auxiliary_loss_clip": 0.0105076, "auxiliary_loss_mlp": 0.01033839, "balance_loss_clip": 1.0349431, "balance_loss_mlp": 1.02085745, "epoch": 0.9287238839621224, "flos": 19026012487680.0, "grad_norm": 1.7396993406776888, "language_loss": 0.69318455, "learning_rate": 5.301248962337523e-08, "loss": 0.71403056, "num_input_tokens_seen": 333371110, "step": 15447, "time_per_iteration": 2.785006046295166 }, { "auxiliary_loss_clip": 0.01101911, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.03475642, "balance_loss_mlp": 1.01898217, "epoch": 0.9287840072147904, "flos": 20557027837440.0, "grad_norm": 1.5415290454981314, "language_loss": 0.72406214, "learning_rate": 5.292345135757403e-08, "loss": 0.74538368, "num_input_tokens_seen": 333391420, "step": 15448, "time_per_iteration": 2.5804696083068848 }, { "auxiliary_loss_clip": 0.01108235, "auxiliary_loss_mlp": 0.01028889, "balance_loss_clip": 1.03618634, "balance_loss_mlp": 1.01485801, "epoch": 0.9288441304674583, "flos": 21250822008960.0, "grad_norm": 1.5546568234640936, "language_loss": 0.74195588, "learning_rate": 5.283448692511072e-08, "loss": 0.76332712, "num_input_tokens_seen": 333410365, "step": 15449, "time_per_iteration": 2.5949409008026123 }, { "auxiliary_loss_clip": 0.01108056, "auxiliary_loss_mlp": 0.00770321, "balance_loss_clip": 1.03558385, "balance_loss_mlp": 1.00032389, "epoch": 0.9289042537201263, "flos": 27669853895040.0, "grad_norm": 1.737092405076848, "language_loss": 0.67934465, "learning_rate": 5.27455963293586e-08, "loss": 0.69812846, "num_input_tokens_seen": 333430000, "step": 15450, "time_per_iteration": 2.666686773300171 }, { "auxiliary_loss_clip": 0.01076756, "auxiliary_loss_mlp": 0.01028132, "balance_loss_clip": 1.03465009, "balance_loss_mlp": 1.01593733, "epoch": 0.9289643769727942, "flos": 19317750750720.0, "grad_norm": 2.158761151214465, "language_loss": 0.71885049, "learning_rate": 5.265677957368875e-08, "loss": 0.73989934, "num_input_tokens_seen": 333445800, "step": 15451, "time_per_iteration": 2.7205255031585693 }, { "auxiliary_loss_clip": 0.01083407, "auxiliary_loss_mlp": 0.010432, "balance_loss_clip": 1.033409, "balance_loss_mlp": 1.03013515, "epoch": 0.9290245002254622, "flos": 14058058233600.0, "grad_norm": 1.931438285562406, "language_loss": 0.732077, "learning_rate": 5.25680366614687e-08, "loss": 0.75334305, "num_input_tokens_seen": 333461550, "step": 15452, "time_per_iteration": 2.7509524822235107 }, { "auxiliary_loss_clip": 0.01089897, "auxiliary_loss_mlp": 0.01029462, "balance_loss_clip": 1.03840566, "balance_loss_mlp": 1.01675391, "epoch": 0.9290846234781301, "flos": 20047132321920.0, "grad_norm": 2.107575184826188, "language_loss": 0.74144852, "learning_rate": 5.2479367596064196e-08, "loss": 0.76264215, "num_input_tokens_seen": 333478835, "step": 15453, "time_per_iteration": 2.7099177837371826 }, { "auxiliary_loss_clip": 0.00992131, "auxiliary_loss_mlp": 0.0100121, "balance_loss_clip": 1.00884318, "balance_loss_mlp": 1.00022078, "epoch": 0.9291447467307982, "flos": 61227514460160.0, "grad_norm": 0.8159194207787102, "language_loss": 0.60648072, "learning_rate": 5.2390772380837226e-08, "loss": 0.62641418, "num_input_tokens_seen": 333535250, "step": 15454, "time_per_iteration": 3.143502950668335 }, { "auxiliary_loss_clip": 0.01082676, "auxiliary_loss_mlp": 0.01042707, "balance_loss_clip": 1.03245759, "balance_loss_mlp": 1.02956378, "epoch": 0.9292048699834661, "flos": 20553328736640.0, "grad_norm": 1.6995805540725026, "language_loss": 0.68820882, "learning_rate": 5.230225101914709e-08, "loss": 0.70946264, "num_input_tokens_seen": 333553805, "step": 15455, "time_per_iteration": 2.6724045276641846 }, { "auxiliary_loss_clip": 0.01063528, "auxiliary_loss_mlp": 0.01034362, "balance_loss_clip": 1.03471339, "balance_loss_mlp": 1.02136862, "epoch": 0.9292649932361341, "flos": 23623655477760.0, "grad_norm": 1.7980946522964238, "language_loss": 0.64908248, "learning_rate": 5.22138035143509e-08, "loss": 0.67006135, "num_input_tokens_seen": 333572800, "step": 15456, "time_per_iteration": 2.6736927032470703 }, { "auxiliary_loss_clip": 0.01061601, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.03250432, "balance_loss_mlp": 1.01845431, "epoch": 0.929325116488802, "flos": 15009942602880.0, "grad_norm": 2.326561847657641, "language_loss": 0.68176067, "learning_rate": 5.2125429869802615e-08, "loss": 0.70268989, "num_input_tokens_seen": 333588520, "step": 15457, "time_per_iteration": 2.722505807876587 }, { "auxiliary_loss_clip": 0.01086966, "auxiliary_loss_mlp": 0.01029179, "balance_loss_clip": 1.03466225, "balance_loss_mlp": 1.01685286, "epoch": 0.92938523974147, "flos": 17967365919360.0, "grad_norm": 3.4374144419000388, "language_loss": 0.80816442, "learning_rate": 5.203713008885291e-08, "loss": 0.82932585, "num_input_tokens_seen": 333603435, "step": 15458, "time_per_iteration": 2.7122104167938232 }, { "auxiliary_loss_clip": 0.01100699, "auxiliary_loss_mlp": 0.01034056, "balance_loss_clip": 1.03775263, "balance_loss_mlp": 1.02139056, "epoch": 0.9294453629941379, "flos": 23003047267200.0, "grad_norm": 1.5848358931339006, "language_loss": 0.72326326, "learning_rate": 5.194890417485065e-08, "loss": 0.74461079, "num_input_tokens_seen": 333623305, "step": 15459, "time_per_iteration": 2.6949429512023926 }, { "auxiliary_loss_clip": 0.01070452, "auxiliary_loss_mlp": 0.01035576, "balance_loss_clip": 1.03244114, "balance_loss_mlp": 1.02314806, "epoch": 0.929505486246806, "flos": 17055234927360.0, "grad_norm": 2.206929984070201, "language_loss": 0.58746719, "learning_rate": 5.1860752131141384e-08, "loss": 0.60852748, "num_input_tokens_seen": 333641205, "step": 15460, "time_per_iteration": 2.7503440380096436 }, { "auxiliary_loss_clip": 0.01057483, "auxiliary_loss_mlp": 0.01033599, "balance_loss_clip": 1.03144884, "balance_loss_mlp": 1.02011657, "epoch": 0.9295656094994739, "flos": 27340409329920.0, "grad_norm": 1.8867552786807409, "language_loss": 0.80609381, "learning_rate": 5.177267396106733e-08, "loss": 0.82700461, "num_input_tokens_seen": 333659615, "step": 15461, "time_per_iteration": 2.773244857788086 }, { "auxiliary_loss_clip": 0.01083444, "auxiliary_loss_mlp": 0.01025336, "balance_loss_clip": 1.03467631, "balance_loss_mlp": 1.01330769, "epoch": 0.9296257327521419, "flos": 21470954509440.0, "grad_norm": 2.177903881361931, "language_loss": 0.78115839, "learning_rate": 5.168466966796869e-08, "loss": 0.80224615, "num_input_tokens_seen": 333678985, "step": 15462, "time_per_iteration": 2.6601200103759766 }, { "auxiliary_loss_clip": 0.01065181, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.02944088, "balance_loss_mlp": 1.01573753, "epoch": 0.9296858560048099, "flos": 16362661818240.0, "grad_norm": 1.838020182070701, "language_loss": 0.62704271, "learning_rate": 5.159673925518282e-08, "loss": 0.64798021, "num_input_tokens_seen": 333696410, "step": 15463, "time_per_iteration": 2.65974760055542 }, { "auxiliary_loss_clip": 0.01082053, "auxiliary_loss_mlp": 0.0103221, "balance_loss_clip": 1.03119493, "balance_loss_mlp": 1.020522, "epoch": 0.9297459792574778, "flos": 29858609139840.0, "grad_norm": 1.45056589452118, "language_loss": 0.70977533, "learning_rate": 5.15088827260437e-08, "loss": 0.73091793, "num_input_tokens_seen": 333716615, "step": 15464, "time_per_iteration": 2.7430808544158936 }, { "auxiliary_loss_clip": 0.01082227, "auxiliary_loss_mlp": 0.01031071, "balance_loss_clip": 1.0332849, "balance_loss_mlp": 1.01884627, "epoch": 0.9298061025101458, "flos": 15924838942080.0, "grad_norm": 1.7994510129894437, "language_loss": 0.77051908, "learning_rate": 5.1421100083883115e-08, "loss": 0.79165208, "num_input_tokens_seen": 333732800, "step": 15465, "time_per_iteration": 2.645766019821167 }, { "auxiliary_loss_clip": 0.0098002, "auxiliary_loss_mlp": 0.01003378, "balance_loss_clip": 1.01005721, "balance_loss_mlp": 1.00234056, "epoch": 0.9298662257628137, "flos": 64096994304000.0, "grad_norm": 0.6965140655325198, "language_loss": 0.56410694, "learning_rate": 5.133339133202952e-08, "loss": 0.58394086, "num_input_tokens_seen": 333799300, "step": 15466, "time_per_iteration": 3.565849781036377 }, { "auxiliary_loss_clip": 0.01085072, "auxiliary_loss_mlp": 0.01039957, "balance_loss_clip": 1.03284919, "balance_loss_mlp": 1.02618241, "epoch": 0.9299263490154818, "flos": 24280210224000.0, "grad_norm": 1.7308849012204341, "language_loss": 0.72874355, "learning_rate": 5.1245756473809355e-08, "loss": 0.7499938, "num_input_tokens_seen": 333820360, "step": 15467, "time_per_iteration": 3.236931800842285 }, { "auxiliary_loss_clip": 0.01080183, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.03504908, "balance_loss_mlp": 1.02171993, "epoch": 0.9299864722681497, "flos": 23294354567040.0, "grad_norm": 1.5735167762659585, "language_loss": 0.7158711, "learning_rate": 5.1158195512545076e-08, "loss": 0.73701859, "num_input_tokens_seen": 333840415, "step": 15468, "time_per_iteration": 2.7365386486053467 }, { "auxiliary_loss_clip": 0.01094813, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.0341078, "balance_loss_mlp": 1.02179384, "epoch": 0.9300465955208177, "flos": 21395972868480.0, "grad_norm": 1.7470237335941396, "language_loss": 0.75426078, "learning_rate": 5.107070845155737e-08, "loss": 0.77557051, "num_input_tokens_seen": 333859910, "step": 15469, "time_per_iteration": 2.7027781009674072 }, { "auxiliary_loss_clip": 0.01082382, "auxiliary_loss_mlp": 0.01034783, "balance_loss_clip": 1.03710604, "balance_loss_mlp": 1.02252209, "epoch": 0.9301067187734856, "flos": 24571445696640.0, "grad_norm": 2.784639154051725, "language_loss": 0.75578332, "learning_rate": 5.098329529416379e-08, "loss": 0.77695501, "num_input_tokens_seen": 333880495, "step": 15470, "time_per_iteration": 2.7347373962402344 }, { "auxiliary_loss_clip": 0.01067813, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.03560543, "balance_loss_mlp": 1.02088356, "epoch": 0.9301668420261536, "flos": 22196960202240.0, "grad_norm": 2.387489989885237, "language_loss": 0.74822462, "learning_rate": 5.089595604367902e-08, "loss": 0.76922727, "num_input_tokens_seen": 333897640, "step": 15471, "time_per_iteration": 2.756758213043213 }, { "auxiliary_loss_clip": 0.01093505, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.03590918, "balance_loss_mlp": 1.01813686, "epoch": 0.9302269652788215, "flos": 17747628468480.0, "grad_norm": 3.1400649028733345, "language_loss": 0.68857515, "learning_rate": 5.080869070341487e-08, "loss": 0.70981896, "num_input_tokens_seen": 333913670, "step": 15472, "time_per_iteration": 2.6190030574798584 }, { "auxiliary_loss_clip": 0.01078893, "auxiliary_loss_mlp": 0.01028134, "balance_loss_clip": 1.03282297, "balance_loss_mlp": 1.01614726, "epoch": 0.9302870885314896, "flos": 19390793057280.0, "grad_norm": 1.7830315106807422, "language_loss": 0.88541853, "learning_rate": 5.0721499276680233e-08, "loss": 0.90648878, "num_input_tokens_seen": 333934105, "step": 15473, "time_per_iteration": 2.5981593132019043 }, { "auxiliary_loss_clip": 0.01087498, "auxiliary_loss_mlp": 0.01036678, "balance_loss_clip": 1.03732419, "balance_loss_mlp": 1.0225395, "epoch": 0.9303472117841575, "flos": 21760286561280.0, "grad_norm": 1.8849274973342631, "language_loss": 0.64160311, "learning_rate": 5.063438176678203e-08, "loss": 0.6628449, "num_input_tokens_seen": 333953635, "step": 15474, "time_per_iteration": 2.6480371952056885 }, { "auxiliary_loss_clip": 0.01109387, "auxiliary_loss_mlp": 0.01034944, "balance_loss_clip": 1.03766966, "balance_loss_mlp": 1.0225817, "epoch": 0.9304073350368255, "flos": 19609740408960.0, "grad_norm": 1.7431064439867472, "language_loss": 0.74580079, "learning_rate": 5.054733817702339e-08, "loss": 0.7672441, "num_input_tokens_seen": 333971825, "step": 15475, "time_per_iteration": 2.9433109760284424 }, { "auxiliary_loss_clip": 0.01094883, "auxiliary_loss_mlp": 0.01029771, "balance_loss_clip": 1.03424644, "balance_loss_mlp": 1.01804042, "epoch": 0.9304674582894935, "flos": 30441582875520.0, "grad_norm": 1.8594741529837064, "language_loss": 0.66352129, "learning_rate": 5.0460368510704786e-08, "loss": 0.68476784, "num_input_tokens_seen": 333990120, "step": 15476, "time_per_iteration": 2.669774293899536 }, { "auxiliary_loss_clip": 0.01066383, "auxiliary_loss_mlp": 0.01033281, "balance_loss_clip": 1.03680027, "balance_loss_mlp": 1.02047777, "epoch": 0.9305275815421614, "flos": 17785693906560.0, "grad_norm": 1.928617647812536, "language_loss": 0.68966222, "learning_rate": 5.0373472771124914e-08, "loss": 0.71065891, "num_input_tokens_seen": 334007970, "step": 15477, "time_per_iteration": 2.7191553115844727 }, { "auxiliary_loss_clip": 0.01087769, "auxiliary_loss_mlp": 0.01030039, "balance_loss_clip": 1.03722644, "balance_loss_mlp": 1.01820755, "epoch": 0.9305877047948294, "flos": 25298456970240.0, "grad_norm": 3.581472725351638, "language_loss": 0.58644545, "learning_rate": 5.0286650961578027e-08, "loss": 0.60762358, "num_input_tokens_seen": 334027120, "step": 15478, "time_per_iteration": 2.6942026615142822 }, { "auxiliary_loss_clip": 0.01089048, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.03869212, "balance_loss_mlp": 1.0165447, "epoch": 0.9306478280474973, "flos": 16977236544000.0, "grad_norm": 2.150126266839501, "language_loss": 0.78858852, "learning_rate": 5.01999030853566e-08, "loss": 0.80978525, "num_input_tokens_seen": 334042785, "step": 15479, "time_per_iteration": 4.2738165855407715 }, { "auxiliary_loss_clip": 0.01109061, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.03678465, "balance_loss_mlp": 1.0195874, "epoch": 0.9307079513001654, "flos": 35663353608960.0, "grad_norm": 1.63393685516193, "language_loss": 0.6846534, "learning_rate": 5.0113229145750445e-08, "loss": 0.70605814, "num_input_tokens_seen": 334063480, "step": 15480, "time_per_iteration": 2.7149746417999268 }, { "auxiliary_loss_clip": 0.01109905, "auxiliary_loss_mlp": 0.01031927, "balance_loss_clip": 1.0378406, "balance_loss_mlp": 1.01929665, "epoch": 0.9307680745528333, "flos": 19208151377280.0, "grad_norm": 1.6956958650454903, "language_loss": 0.67578673, "learning_rate": 5.002662914604583e-08, "loss": 0.69720507, "num_input_tokens_seen": 334082005, "step": 15481, "time_per_iteration": 4.039956331253052 }, { "auxiliary_loss_clip": 0.01080993, "auxiliary_loss_mlp": 0.01032175, "balance_loss_clip": 1.033005, "balance_loss_mlp": 1.0192585, "epoch": 0.9308281978055013, "flos": 19062641381760.0, "grad_norm": 1.8467240460111785, "language_loss": 0.74883473, "learning_rate": 4.994010308952701e-08, "loss": 0.76996636, "num_input_tokens_seen": 334101375, "step": 15482, "time_per_iteration": 2.6518447399139404 }, { "auxiliary_loss_clip": 0.01094658, "auxiliary_loss_mlp": 0.01028267, "balance_loss_clip": 1.03394866, "balance_loss_mlp": 1.01649547, "epoch": 0.9308883210581692, "flos": 20521548178560.0, "grad_norm": 1.9180378851164899, "language_loss": 0.80164105, "learning_rate": 4.985365097947469e-08, "loss": 0.82287037, "num_input_tokens_seen": 334119460, "step": 15483, "time_per_iteration": 4.348911285400391 }, { "auxiliary_loss_clip": 0.01082688, "auxiliary_loss_mlp": 0.01033349, "balance_loss_clip": 1.03657448, "balance_loss_mlp": 1.02083826, "epoch": 0.9309484443108372, "flos": 13001422826880.0, "grad_norm": 1.8698210896088554, "language_loss": 0.74462926, "learning_rate": 4.976727281916782e-08, "loss": 0.76578963, "num_input_tokens_seen": 334136065, "step": 15484, "time_per_iteration": 2.664431095123291 }, { "auxiliary_loss_clip": 0.01086381, "auxiliary_loss_mlp": 0.01032674, "balance_loss_clip": 1.03691006, "balance_loss_mlp": 1.01994252, "epoch": 0.9310085675635051, "flos": 12567765928320.0, "grad_norm": 2.155944343427052, "language_loss": 0.76539695, "learning_rate": 4.968096861188087e-08, "loss": 0.78658748, "num_input_tokens_seen": 334153690, "step": 15485, "time_per_iteration": 4.154724597930908 }, { "auxiliary_loss_clip": 0.01063188, "auxiliary_loss_mlp": 0.01035653, "balance_loss_clip": 1.03246868, "balance_loss_mlp": 1.02108002, "epoch": 0.9310686908161732, "flos": 23477570864640.0, "grad_norm": 2.136554797063734, "language_loss": 0.78668422, "learning_rate": 4.959473836088723e-08, "loss": 0.80767262, "num_input_tokens_seen": 334171880, "step": 15486, "time_per_iteration": 2.7599616050720215 }, { "auxiliary_loss_clip": 0.01079287, "auxiliary_loss_mlp": 0.01030569, "balance_loss_clip": 1.03827739, "balance_loss_mlp": 1.01740253, "epoch": 0.9311288140688411, "flos": 24170287628160.0, "grad_norm": 1.753625770007885, "language_loss": 0.7688942, "learning_rate": 4.950858206945674e-08, "loss": 0.78999281, "num_input_tokens_seen": 334190005, "step": 15487, "time_per_iteration": 2.6973049640655518 }, { "auxiliary_loss_clip": 0.01080553, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.0376873, "balance_loss_mlp": 1.01600909, "epoch": 0.9311889373215091, "flos": 35590203561600.0, "grad_norm": 2.2700820281078213, "language_loss": 0.67102247, "learning_rate": 4.942249974085633e-08, "loss": 0.69212085, "num_input_tokens_seen": 334209545, "step": 15488, "time_per_iteration": 2.7322824001312256 }, { "auxiliary_loss_clip": 0.01083742, "auxiliary_loss_mlp": 0.0103061, "balance_loss_clip": 1.03624427, "balance_loss_mlp": 1.01787865, "epoch": 0.9312490605741771, "flos": 20230528187520.0, "grad_norm": 1.926714614555793, "language_loss": 0.75009143, "learning_rate": 4.933649137834983e-08, "loss": 0.77123499, "num_input_tokens_seen": 334228900, "step": 15489, "time_per_iteration": 2.5786027908325195 }, { "auxiliary_loss_clip": 0.01111206, "auxiliary_loss_mlp": 0.01032778, "balance_loss_clip": 1.03734827, "balance_loss_mlp": 1.01991534, "epoch": 0.931309183826845, "flos": 13950577762560.0, "grad_norm": 3.9683320091837406, "language_loss": 0.80892265, "learning_rate": 4.925055698519931e-08, "loss": 0.83036256, "num_input_tokens_seen": 334245500, "step": 15490, "time_per_iteration": 2.5186619758605957 }, { "auxiliary_loss_clip": 0.01061842, "auxiliary_loss_mlp": 0.0103459, "balance_loss_clip": 1.03452861, "balance_loss_mlp": 1.02170372, "epoch": 0.931369307079513, "flos": 20156731695360.0, "grad_norm": 1.6571168923434456, "language_loss": 0.72082543, "learning_rate": 4.9164696564663264e-08, "loss": 0.74178976, "num_input_tokens_seen": 334264370, "step": 15491, "time_per_iteration": 2.6573195457458496 }, { "auxiliary_loss_clip": 0.0108057, "auxiliary_loss_mlp": 0.0076884, "balance_loss_clip": 1.03255057, "balance_loss_mlp": 1.00006044, "epoch": 0.931429430332181, "flos": 25338569483520.0, "grad_norm": 1.8544048731654292, "language_loss": 0.74552429, "learning_rate": 4.9078910119997096e-08, "loss": 0.7640183, "num_input_tokens_seen": 334283905, "step": 15492, "time_per_iteration": 2.5493483543395996 }, { "auxiliary_loss_clip": 0.01019334, "auxiliary_loss_mlp": 0.01002201, "balance_loss_clip": 1.0056994, "balance_loss_mlp": 1.00118196, "epoch": 0.931489553584849, "flos": 71226193985280.0, "grad_norm": 0.707578892585582, "language_loss": 0.53487962, "learning_rate": 4.899319765445442e-08, "loss": 0.55509502, "num_input_tokens_seen": 334339925, "step": 15493, "time_per_iteration": 2.947209358215332 }, { "auxiliary_loss_clip": 0.01097309, "auxiliary_loss_mlp": 0.01033315, "balance_loss_clip": 1.03523576, "balance_loss_mlp": 1.02147126, "epoch": 0.9315496768375169, "flos": 14643653662080.0, "grad_norm": 1.8350932820938002, "language_loss": 0.70629972, "learning_rate": 4.890755917128531e-08, "loss": 0.72760594, "num_input_tokens_seen": 334357225, "step": 15494, "time_per_iteration": 2.721458673477173 }, { "auxiliary_loss_clip": 0.01093067, "auxiliary_loss_mlp": 0.01029285, "balance_loss_clip": 1.03598893, "balance_loss_mlp": 1.01683998, "epoch": 0.9316098000901849, "flos": 28329928174080.0, "grad_norm": 2.829925163289834, "language_loss": 0.68261808, "learning_rate": 4.882199467373671e-08, "loss": 0.70384157, "num_input_tokens_seen": 334375945, "step": 15495, "time_per_iteration": 2.6126840114593506 }, { "auxiliary_loss_clip": 0.01104588, "auxiliary_loss_mlp": 0.01034305, "balance_loss_clip": 1.03474832, "balance_loss_mlp": 1.02263463, "epoch": 0.9316699233428528, "flos": 28512677594880.0, "grad_norm": 2.116367071751547, "language_loss": 0.61655867, "learning_rate": 4.8736504165053815e-08, "loss": 0.63794762, "num_input_tokens_seen": 334395310, "step": 15496, "time_per_iteration": 2.5984606742858887 }, { "auxiliary_loss_clip": 0.01099753, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.03712618, "balance_loss_mlp": 1.0196712, "epoch": 0.9317300465955208, "flos": 33693402061440.0, "grad_norm": 1.4743797033954789, "language_loss": 0.76852232, "learning_rate": 4.865108764847825e-08, "loss": 0.78984821, "num_input_tokens_seen": 334416965, "step": 15497, "time_per_iteration": 2.694024085998535 }, { "auxiliary_loss_clip": 0.01102298, "auxiliary_loss_mlp": 0.00771221, "balance_loss_clip": 1.0387435, "balance_loss_mlp": 1.00019717, "epoch": 0.9317901698481887, "flos": 23658237296640.0, "grad_norm": 1.6171461718999425, "language_loss": 0.66427922, "learning_rate": 4.856574512724898e-08, "loss": 0.68301445, "num_input_tokens_seen": 334435620, "step": 15498, "time_per_iteration": 2.617232084274292 }, { "auxiliary_loss_clip": 0.01087695, "auxiliary_loss_mlp": 0.01035624, "balance_loss_clip": 1.03662407, "balance_loss_mlp": 1.02294576, "epoch": 0.9318502931008568, "flos": 20960017499520.0, "grad_norm": 1.6376617462499037, "language_loss": 0.79631472, "learning_rate": 4.8480476604602305e-08, "loss": 0.81754798, "num_input_tokens_seen": 334456210, "step": 15499, "time_per_iteration": 2.663939952850342 }, { "auxiliary_loss_clip": 0.01065124, "auxiliary_loss_mlp": 0.01033992, "balance_loss_clip": 1.03444028, "balance_loss_mlp": 1.02104545, "epoch": 0.9319104163535247, "flos": 23441049711360.0, "grad_norm": 1.9165014592612015, "language_loss": 0.76588839, "learning_rate": 4.8395282083771196e-08, "loss": 0.78687954, "num_input_tokens_seen": 334475485, "step": 15500, "time_per_iteration": 2.8950650691986084 }, { "auxiliary_loss_clip": 0.01075294, "auxiliary_loss_mlp": 0.01026196, "balance_loss_clip": 1.03517914, "balance_loss_mlp": 1.01429939, "epoch": 0.9319705396061927, "flos": 22347426274560.0, "grad_norm": 1.8007520102301227, "language_loss": 0.72160745, "learning_rate": 4.8310161567987064e-08, "loss": 0.74262238, "num_input_tokens_seen": 334494740, "step": 15501, "time_per_iteration": 2.671736240386963 }, { "auxiliary_loss_clip": 0.0111059, "auxiliary_loss_mlp": 0.01035234, "balance_loss_clip": 1.0367043, "balance_loss_mlp": 1.02227604, "epoch": 0.9320306628588607, "flos": 20993557824000.0, "grad_norm": 1.6678005947570964, "language_loss": 0.66245615, "learning_rate": 4.822511506047666e-08, "loss": 0.68391442, "num_input_tokens_seen": 334511910, "step": 15502, "time_per_iteration": 2.640803098678589 }, { "auxiliary_loss_clip": 0.01100429, "auxiliary_loss_mlp": 0.00770206, "balance_loss_clip": 1.03718948, "balance_loss_mlp": 1.00017929, "epoch": 0.9320907861115286, "flos": 24538300421760.0, "grad_norm": 1.5006777821326498, "language_loss": 0.65834957, "learning_rate": 4.814014256446586e-08, "loss": 0.67705584, "num_input_tokens_seen": 334533150, "step": 15503, "time_per_iteration": 2.6871988773345947 }, { "auxiliary_loss_clip": 0.01070897, "auxiliary_loss_mlp": 0.01037527, "balance_loss_clip": 1.03008294, "balance_loss_mlp": 1.02359104, "epoch": 0.9321509093641966, "flos": 19785414850560.0, "grad_norm": 1.5576125560522005, "language_loss": 0.75215459, "learning_rate": 4.805524408317652e-08, "loss": 0.77323884, "num_input_tokens_seen": 334550940, "step": 15504, "time_per_iteration": 2.7060647010803223 }, { "auxiliary_loss_clip": 0.01099259, "auxiliary_loss_mlp": 0.00770405, "balance_loss_clip": 1.03809631, "balance_loss_mlp": 1.00028038, "epoch": 0.9322110326168646, "flos": 24972675592320.0, "grad_norm": 2.4285521975478472, "language_loss": 0.70985043, "learning_rate": 4.797041961982762e-08, "loss": 0.7285471, "num_input_tokens_seen": 334570935, "step": 15505, "time_per_iteration": 2.632615089416504 }, { "auxiliary_loss_clip": 0.01089757, "auxiliary_loss_mlp": 0.01032109, "balance_loss_clip": 1.03672528, "balance_loss_mlp": 1.01909173, "epoch": 0.9322711558695326, "flos": 16143642639360.0, "grad_norm": 2.685326437023756, "language_loss": 0.75406563, "learning_rate": 4.788566917763614e-08, "loss": 0.77528429, "num_input_tokens_seen": 334589315, "step": 15506, "time_per_iteration": 2.6244046688079834 }, { "auxiliary_loss_clip": 0.01069315, "auxiliary_loss_mlp": 0.01031235, "balance_loss_clip": 1.03417552, "balance_loss_mlp": 1.01864064, "epoch": 0.9323312791222005, "flos": 23732428838400.0, "grad_norm": 1.7967505184283636, "language_loss": 0.830755, "learning_rate": 4.780099275981597e-08, "loss": 0.85176057, "num_input_tokens_seen": 334608990, "step": 15507, "time_per_iteration": 2.7944211959838867 }, { "auxiliary_loss_clip": 0.01110233, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.03728533, "balance_loss_mlp": 1.02054238, "epoch": 0.9323914023748685, "flos": 20777914523520.0, "grad_norm": 1.7255820052461415, "language_loss": 0.68139851, "learning_rate": 4.771639036957742e-08, "loss": 0.70283341, "num_input_tokens_seen": 334628655, "step": 15508, "time_per_iteration": 2.6024084091186523 }, { "auxiliary_loss_clip": 0.01074834, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.03604794, "balance_loss_mlp": 1.01885068, "epoch": 0.9324515256275364, "flos": 23915178259200.0, "grad_norm": 1.6572638063256202, "language_loss": 0.72395205, "learning_rate": 4.7631862010129033e-08, "loss": 0.74501491, "num_input_tokens_seen": 334648295, "step": 15509, "time_per_iteration": 2.7021539211273193 }, { "auxiliary_loss_clip": 0.01097551, "auxiliary_loss_mlp": 0.01032405, "balance_loss_clip": 1.03582215, "balance_loss_mlp": 1.02004337, "epoch": 0.9325116488802044, "flos": 18005215875840.0, "grad_norm": 1.9028125229589103, "language_loss": 0.73969936, "learning_rate": 4.754740768467624e-08, "loss": 0.7609989, "num_input_tokens_seen": 334666280, "step": 15510, "time_per_iteration": 2.5746052265167236 }, { "auxiliary_loss_clip": 0.0109828, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.03353786, "balance_loss_mlp": 1.01676393, "epoch": 0.9325717721328723, "flos": 29021603443200.0, "grad_norm": 2.4238247125248304, "language_loss": 0.70348555, "learning_rate": 4.746302739642161e-08, "loss": 0.72475946, "num_input_tokens_seen": 334688830, "step": 15511, "time_per_iteration": 2.6687567234039307 }, { "auxiliary_loss_clip": 0.01080656, "auxiliary_loss_mlp": 0.01040972, "balance_loss_clip": 1.03440976, "balance_loss_mlp": 1.02819276, "epoch": 0.9326318953855404, "flos": 21646341642240.0, "grad_norm": 1.8803327029828805, "language_loss": 0.78425443, "learning_rate": 4.737872114856412e-08, "loss": 0.80547071, "num_input_tokens_seen": 334705205, "step": 15512, "time_per_iteration": 2.614408016204834 }, { "auxiliary_loss_clip": 0.01107297, "auxiliary_loss_mlp": 0.01028882, "balance_loss_clip": 1.03561831, "balance_loss_mlp": 1.01595938, "epoch": 0.9326920186382083, "flos": 26065724411520.0, "grad_norm": 2.094641259738662, "language_loss": 0.80607069, "learning_rate": 4.7294488944301436e-08, "loss": 0.82743245, "num_input_tokens_seen": 334723830, "step": 15513, "time_per_iteration": 2.6240689754486084 }, { "auxiliary_loss_clip": 0.01086027, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.03791964, "balance_loss_mlp": 1.01878834, "epoch": 0.9327521418908763, "flos": 12057116227200.0, "grad_norm": 1.8470318509254438, "language_loss": 0.80033004, "learning_rate": 4.721033078682768e-08, "loss": 0.82151413, "num_input_tokens_seen": 334740825, "step": 15514, "time_per_iteration": 2.6301167011260986 }, { "auxiliary_loss_clip": 0.01074823, "auxiliary_loss_mlp": 0.01037395, "balance_loss_clip": 1.0366869, "balance_loss_mlp": 1.02556312, "epoch": 0.9328122651435443, "flos": 43834395271680.0, "grad_norm": 1.7635259328932469, "language_loss": 0.71414572, "learning_rate": 4.7126246679333626e-08, "loss": 0.73526788, "num_input_tokens_seen": 334765825, "step": 15515, "time_per_iteration": 2.9563915729522705 }, { "auxiliary_loss_clip": 0.01093417, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.03784752, "balance_loss_mlp": 1.02009666, "epoch": 0.9328723883962122, "flos": 15194954580480.0, "grad_norm": 3.7303999678347153, "language_loss": 0.80836952, "learning_rate": 4.704223662500806e-08, "loss": 0.82963496, "num_input_tokens_seen": 334782680, "step": 15516, "time_per_iteration": 2.6183342933654785 }, { "auxiliary_loss_clip": 0.01070452, "auxiliary_loss_mlp": 0.01039697, "balance_loss_clip": 1.03149724, "balance_loss_mlp": 1.02574384, "epoch": 0.9329325116488802, "flos": 20261770041600.0, "grad_norm": 1.6405205567812482, "language_loss": 0.80559999, "learning_rate": 4.695830062703643e-08, "loss": 0.82670152, "num_input_tokens_seen": 334800160, "step": 15517, "time_per_iteration": 2.6957180500030518 }, { "auxiliary_loss_clip": 0.0108811, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.03524601, "balance_loss_mlp": 1.01821351, "epoch": 0.9329926349015482, "flos": 13115008609920.0, "grad_norm": 2.5144620557591364, "language_loss": 0.74485952, "learning_rate": 4.687443868860219e-08, "loss": 0.76605237, "num_input_tokens_seen": 334815840, "step": 15518, "time_per_iteration": 2.6164944171905518 }, { "auxiliary_loss_clip": 0.01083865, "auxiliary_loss_mlp": 0.01042053, "balance_loss_clip": 1.03354347, "balance_loss_mlp": 1.02916634, "epoch": 0.9330527581542162, "flos": 23040250778880.0, "grad_norm": 2.0399988917234904, "language_loss": 0.76014853, "learning_rate": 4.679065081288458e-08, "loss": 0.78140771, "num_input_tokens_seen": 334834735, "step": 15519, "time_per_iteration": 4.253001689910889 }, { "auxiliary_loss_clip": 0.01053866, "auxiliary_loss_mlp": 0.01036997, "balance_loss_clip": 1.0320313, "balance_loss_mlp": 1.02326381, "epoch": 0.9331128814068841, "flos": 15559627409280.0, "grad_norm": 2.031373785693728, "language_loss": 0.83167887, "learning_rate": 4.6706937003061275e-08, "loss": 0.85258746, "num_input_tokens_seen": 334853490, "step": 15520, "time_per_iteration": 4.2622270584106445 }, { "auxiliary_loss_clip": 0.01096231, "auxiliary_loss_mlp": 0.01030529, "balance_loss_clip": 1.03505969, "balance_loss_mlp": 1.01851249, "epoch": 0.9331730046595521, "flos": 22271762275200.0, "grad_norm": 1.641224021105838, "language_loss": 0.76203525, "learning_rate": 4.6623297262306846e-08, "loss": 0.78330284, "num_input_tokens_seen": 334873675, "step": 15521, "time_per_iteration": 2.6855099201202393 }, { "auxiliary_loss_clip": 0.01098694, "auxiliary_loss_mlp": 0.01030692, "balance_loss_clip": 1.03746796, "balance_loss_mlp": 1.01878297, "epoch": 0.93323312791222, "flos": 15777641007360.0, "grad_norm": 1.8982902543298203, "language_loss": 0.77620465, "learning_rate": 4.6539731593792545e-08, "loss": 0.79749846, "num_input_tokens_seen": 334890970, "step": 15522, "time_per_iteration": 4.228564977645874 }, { "auxiliary_loss_clip": 0.01075483, "auxiliary_loss_mlp": 0.00770903, "balance_loss_clip": 1.03559947, "balance_loss_mlp": 1.00036263, "epoch": 0.933293251164888, "flos": 22010978557440.0, "grad_norm": 2.1529045189336076, "language_loss": 0.62858284, "learning_rate": 4.6456240000687373e-08, "loss": 0.64704674, "num_input_tokens_seen": 334906635, "step": 15523, "time_per_iteration": 2.720323324203491 }, { "auxiliary_loss_clip": 0.01085105, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.0346992, "balance_loss_mlp": 1.0190022, "epoch": 0.933353374417556, "flos": 26031358074240.0, "grad_norm": 2.0555270470512035, "language_loss": 0.6804812, "learning_rate": 4.63728224861577e-08, "loss": 0.70164317, "num_input_tokens_seen": 334926230, "step": 15524, "time_per_iteration": 4.186232805252075 }, { "auxiliary_loss_clip": 0.01065918, "auxiliary_loss_mlp": 0.01036023, "balance_loss_clip": 1.03418577, "balance_loss_mlp": 1.02345872, "epoch": 0.933413497670224, "flos": 24900100162560.0, "grad_norm": 1.5473346576932632, "language_loss": 0.73752666, "learning_rate": 4.628947905336589e-08, "loss": 0.75854605, "num_input_tokens_seen": 334946680, "step": 15525, "time_per_iteration": 2.740737199783325 }, { "auxiliary_loss_clip": 0.01054757, "auxiliary_loss_mlp": 0.01041762, "balance_loss_clip": 1.03350389, "balance_loss_mlp": 1.02915573, "epoch": 0.9334736209228919, "flos": 23688689051520.0, "grad_norm": 1.733935635799957, "language_loss": 0.83531857, "learning_rate": 4.6206209705473175e-08, "loss": 0.85628378, "num_input_tokens_seen": 334964785, "step": 15526, "time_per_iteration": 2.6978201866149902 }, { "auxiliary_loss_clip": 0.01062457, "auxiliary_loss_mlp": 0.01033558, "balance_loss_clip": 1.03334141, "balance_loss_mlp": 1.02088642, "epoch": 0.9335337441755599, "flos": 15377344865280.0, "grad_norm": 1.8748787634103812, "language_loss": 0.69386899, "learning_rate": 4.61230144456366e-08, "loss": 0.71482921, "num_input_tokens_seen": 334982400, "step": 15527, "time_per_iteration": 2.7441039085388184 }, { "auxiliary_loss_clip": 0.01110964, "auxiliary_loss_mlp": 0.01030308, "balance_loss_clip": 1.03706026, "balance_loss_mlp": 1.01640248, "epoch": 0.9335938674282279, "flos": 16106726436480.0, "grad_norm": 2.0901783734577535, "language_loss": 0.65065324, "learning_rate": 4.603989327701141e-08, "loss": 0.67206597, "num_input_tokens_seen": 334999685, "step": 15528, "time_per_iteration": 2.643949270248413 }, { "auxiliary_loss_clip": 0.01110618, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.03654647, "balance_loss_mlp": 1.01733375, "epoch": 0.9336539906808958, "flos": 18952898353920.0, "grad_norm": 1.8767297797851592, "language_loss": 0.74917662, "learning_rate": 4.5956846202748867e-08, "loss": 0.77058274, "num_input_tokens_seen": 335019160, "step": 15529, "time_per_iteration": 2.634995698928833 }, { "auxiliary_loss_clip": 0.01062705, "auxiliary_loss_mlp": 0.01032394, "balance_loss_clip": 1.03274131, "balance_loss_mlp": 1.0203954, "epoch": 0.9337141139335638, "flos": 18109104986880.0, "grad_norm": 1.748892845656801, "language_loss": 0.62968796, "learning_rate": 4.5873873225998674e-08, "loss": 0.65063894, "num_input_tokens_seen": 335037350, "step": 15530, "time_per_iteration": 2.7044005393981934 }, { "auxiliary_loss_clip": 0.01088546, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.03820205, "balance_loss_mlp": 1.01650035, "epoch": 0.9337742371862318, "flos": 17345716214400.0, "grad_norm": 1.6832056860999263, "language_loss": 0.72579157, "learning_rate": 4.5790974349907194e-08, "loss": 0.74696231, "num_input_tokens_seen": 335056060, "step": 15531, "time_per_iteration": 2.6301660537719727 }, { "auxiliary_loss_clip": 0.01085265, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.0341419, "balance_loss_mlp": 1.01802897, "epoch": 0.9338343604388998, "flos": 29058986522880.0, "grad_norm": 1.6070200509837302, "language_loss": 0.7085079, "learning_rate": 4.5708149577617925e-08, "loss": 0.72966528, "num_input_tokens_seen": 335075410, "step": 15532, "time_per_iteration": 2.6982882022857666 }, { "auxiliary_loss_clip": 0.01110813, "auxiliary_loss_mlp": 0.00770577, "balance_loss_clip": 1.03735983, "balance_loss_mlp": 1.00021112, "epoch": 0.9338944836915677, "flos": 18660908695680.0, "grad_norm": 1.6762698781307237, "language_loss": 0.73232746, "learning_rate": 4.5625398912271016e-08, "loss": 0.75114131, "num_input_tokens_seen": 335095190, "step": 15533, "time_per_iteration": 2.570868730545044 }, { "auxiliary_loss_clip": 0.0107118, "auxiliary_loss_mlp": 0.01029018, "balance_loss_clip": 1.03274035, "balance_loss_mlp": 1.01729369, "epoch": 0.9339546069442357, "flos": 16617735273600.0, "grad_norm": 1.7512837189908117, "language_loss": 0.7965132, "learning_rate": 4.554272235700507e-08, "loss": 0.81751519, "num_input_tokens_seen": 335113825, "step": 15534, "time_per_iteration": 2.659533977508545 }, { "auxiliary_loss_clip": 0.01104268, "auxiliary_loss_mlp": 0.0102812, "balance_loss_clip": 1.03758454, "balance_loss_mlp": 1.01707494, "epoch": 0.9340147301969036, "flos": 23693106424320.0, "grad_norm": 1.785516509672164, "language_loss": 0.74561787, "learning_rate": 4.546011991495513e-08, "loss": 0.76694173, "num_input_tokens_seen": 335136425, "step": 15535, "time_per_iteration": 2.615487575531006 }, { "auxiliary_loss_clip": 0.01094475, "auxiliary_loss_mlp": 0.0102863, "balance_loss_clip": 1.03818846, "balance_loss_mlp": 1.0162499, "epoch": 0.9340748534495716, "flos": 28654452576000.0, "grad_norm": 2.554180895365387, "language_loss": 0.77858245, "learning_rate": 4.537759158925292e-08, "loss": 0.79981351, "num_input_tokens_seen": 335157925, "step": 15536, "time_per_iteration": 2.6514716148376465 }, { "auxiliary_loss_clip": 0.01078909, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.0358901, "balance_loss_mlp": 1.01566911, "epoch": 0.9341349767022396, "flos": 24899633285760.0, "grad_norm": 1.5285441088297342, "language_loss": 0.80702901, "learning_rate": 4.5295137383028593e-08, "loss": 0.82809466, "num_input_tokens_seen": 335177840, "step": 15537, "time_per_iteration": 2.71079683303833 }, { "auxiliary_loss_clip": 0.01089177, "auxiliary_loss_mlp": 0.01033529, "balance_loss_clip": 1.03725171, "balance_loss_mlp": 1.02132761, "epoch": 0.9341950999549076, "flos": 29059525226880.0, "grad_norm": 2.04950932055524, "language_loss": 0.77909076, "learning_rate": 4.5212757299408764e-08, "loss": 0.80031782, "num_input_tokens_seen": 335199470, "step": 15538, "time_per_iteration": 2.7233970165252686 }, { "auxiliary_loss_clip": 0.01080561, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.03509426, "balance_loss_mlp": 1.01653659, "epoch": 0.9342552232075755, "flos": 23587062497280.0, "grad_norm": 1.7693540282121059, "language_loss": 0.73224825, "learning_rate": 4.513045134151672e-08, "loss": 0.75334066, "num_input_tokens_seen": 335218885, "step": 15539, "time_per_iteration": 2.7510504722595215 }, { "auxiliary_loss_clip": 0.01063064, "auxiliary_loss_mlp": 0.01030385, "balance_loss_clip": 1.03815532, "balance_loss_mlp": 1.01935768, "epoch": 0.9343153464602435, "flos": 36721389646080.0, "grad_norm": 1.501458356905732, "language_loss": 0.64681369, "learning_rate": 4.504821951247373e-08, "loss": 0.66774815, "num_input_tokens_seen": 335239485, "step": 15540, "time_per_iteration": 2.8745980262756348 }, { "auxiliary_loss_clip": 0.01095708, "auxiliary_loss_mlp": 0.0103328, "balance_loss_clip": 1.03505111, "balance_loss_mlp": 1.02149022, "epoch": 0.9343754697129115, "flos": 22236498097920.0, "grad_norm": 1.8557589560760206, "language_loss": 0.76556802, "learning_rate": 4.496606181539864e-08, "loss": 0.78685796, "num_input_tokens_seen": 335258355, "step": 15541, "time_per_iteration": 2.651571035385132 }, { "auxiliary_loss_clip": 0.01096825, "auxiliary_loss_mlp": 0.01033897, "balance_loss_clip": 1.03985929, "balance_loss_mlp": 1.02145147, "epoch": 0.9344355929655794, "flos": 29710333797120.0, "grad_norm": 1.9744538682632873, "language_loss": 0.66810614, "learning_rate": 4.4883978253406066e-08, "loss": 0.68941331, "num_input_tokens_seen": 335276835, "step": 15542, "time_per_iteration": 2.760667085647583 }, { "auxiliary_loss_clip": 0.01065848, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.03444171, "balance_loss_mlp": 1.01672554, "epoch": 0.9344957162182475, "flos": 18880394751360.0, "grad_norm": 1.8654482805757453, "language_loss": 0.69444913, "learning_rate": 4.480196882960907e-08, "loss": 0.71540076, "num_input_tokens_seen": 335296220, "step": 15543, "time_per_iteration": 2.7620866298675537 }, { "auxiliary_loss_clip": 0.0109899, "auxiliary_loss_mlp": 0.01029553, "balance_loss_clip": 1.03395653, "balance_loss_mlp": 1.01592147, "epoch": 0.9345558394709154, "flos": 27417761268480.0, "grad_norm": 2.0257017035114493, "language_loss": 0.69519067, "learning_rate": 4.4720033547117394e-08, "loss": 0.71647608, "num_input_tokens_seen": 335316335, "step": 15544, "time_per_iteration": 2.7088634967803955 }, { "auxiliary_loss_clip": 0.01094451, "auxiliary_loss_mlp": 0.01046236, "balance_loss_clip": 1.03500128, "balance_loss_mlp": 1.03233039, "epoch": 0.9346159627235834, "flos": 20741285629440.0, "grad_norm": 1.7227662917872677, "language_loss": 0.77327919, "learning_rate": 4.463817240903789e-08, "loss": 0.79468608, "num_input_tokens_seen": 335335545, "step": 15545, "time_per_iteration": 2.630438804626465 }, { "auxiliary_loss_clip": 0.0109898, "auxiliary_loss_mlp": 0.01026698, "balance_loss_clip": 1.03614378, "balance_loss_mlp": 1.01519418, "epoch": 0.9346760859762513, "flos": 21069221823360.0, "grad_norm": 1.7337615176350853, "language_loss": 0.68626702, "learning_rate": 4.455638541847495e-08, "loss": 0.70752382, "num_input_tokens_seen": 335355350, "step": 15546, "time_per_iteration": 2.5841619968414307 }, { "auxiliary_loss_clip": 0.01066558, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.03200841, "balance_loss_mlp": 1.01754951, "epoch": 0.9347362092289193, "flos": 29204927481600.0, "grad_norm": 2.0917837457460466, "language_loss": 0.82409191, "learning_rate": 4.447467257852966e-08, "loss": 0.84505343, "num_input_tokens_seen": 335375160, "step": 15547, "time_per_iteration": 2.737194538116455 }, { "auxiliary_loss_clip": 0.01089071, "auxiliary_loss_mlp": 0.01038533, "balance_loss_clip": 1.03189087, "balance_loss_mlp": 1.02542627, "epoch": 0.9347963324815872, "flos": 19427350124160.0, "grad_norm": 1.9497482945485352, "language_loss": 0.83475363, "learning_rate": 4.439303389230087e-08, "loss": 0.85602963, "num_input_tokens_seen": 335394080, "step": 15548, "time_per_iteration": 2.550107479095459 }, { "auxiliary_loss_clip": 0.01101099, "auxiliary_loss_mlp": 0.01037896, "balance_loss_clip": 1.03632116, "balance_loss_mlp": 1.0238775, "epoch": 0.9348564557342552, "flos": 36901840596480.0, "grad_norm": 1.5421911327365105, "language_loss": 0.65587002, "learning_rate": 4.4311469362884326e-08, "loss": 0.67725998, "num_input_tokens_seen": 335414230, "step": 15549, "time_per_iteration": 2.7219295501708984 }, { "auxiliary_loss_clip": 0.01101825, "auxiliary_loss_mlp": 0.01037163, "balance_loss_clip": 1.03933716, "balance_loss_mlp": 1.02354288, "epoch": 0.9349165789869232, "flos": 21690117342720.0, "grad_norm": 2.010328548001079, "language_loss": 0.80039644, "learning_rate": 4.4229978993372665e-08, "loss": 0.82178628, "num_input_tokens_seen": 335432890, "step": 15550, "time_per_iteration": 2.640012741088867 }, { "auxiliary_loss_clip": 0.01096493, "auxiliary_loss_mlp": 0.01032466, "balance_loss_clip": 1.0383265, "balance_loss_mlp": 1.02041388, "epoch": 0.9349767022395912, "flos": 18844053166080.0, "grad_norm": 1.681452605729496, "language_loss": 0.75687659, "learning_rate": 4.4148562786856524e-08, "loss": 0.77816617, "num_input_tokens_seen": 335452085, "step": 15551, "time_per_iteration": 2.584329843521118 }, { "auxiliary_loss_clip": 0.01051893, "auxiliary_loss_mlp": 0.01030912, "balance_loss_clip": 1.03308678, "balance_loss_mlp": 1.02025425, "epoch": 0.9350368254922591, "flos": 24973429777920.0, "grad_norm": 1.499035355144879, "language_loss": 0.73651052, "learning_rate": 4.406722074642255e-08, "loss": 0.75733852, "num_input_tokens_seen": 335472130, "step": 15552, "time_per_iteration": 2.7739923000335693 }, { "auxiliary_loss_clip": 0.01059946, "auxiliary_loss_mlp": 0.01040283, "balance_loss_clip": 1.03191781, "balance_loss_mlp": 1.02666366, "epoch": 0.9350969487449271, "flos": 23070594792960.0, "grad_norm": 1.5949406765998282, "language_loss": 0.77295089, "learning_rate": 4.3985952875155386e-08, "loss": 0.79395318, "num_input_tokens_seen": 335489970, "step": 15553, "time_per_iteration": 2.7346534729003906 }, { "auxiliary_loss_clip": 0.01074123, "auxiliary_loss_mlp": 0.01033891, "balance_loss_clip": 1.03367734, "balance_loss_mlp": 1.02047396, "epoch": 0.9351570719975951, "flos": 18625177641600.0, "grad_norm": 1.630847703889005, "language_loss": 0.78214866, "learning_rate": 4.390475917613723e-08, "loss": 0.8032288, "num_input_tokens_seen": 335509125, "step": 15554, "time_per_iteration": 2.6710941791534424 }, { "auxiliary_loss_clip": 0.01077218, "auxiliary_loss_mlp": 0.01037117, "balance_loss_clip": 1.03197753, "balance_loss_mlp": 1.02502322, "epoch": 0.935217195250263, "flos": 15888353702400.0, "grad_norm": 2.49632757150129, "language_loss": 0.69451249, "learning_rate": 4.382363965244695e-08, "loss": 0.7156558, "num_input_tokens_seen": 335525620, "step": 15555, "time_per_iteration": 2.6385841369628906 }, { "auxiliary_loss_clip": 0.01014929, "auxiliary_loss_mlp": 0.01045967, "balance_loss_clip": 1.02853274, "balance_loss_mlp": 1.0316503, "epoch": 0.935277318502931, "flos": 24390312387840.0, "grad_norm": 1.5017504373533854, "language_loss": 0.75400025, "learning_rate": 4.374259430715965e-08, "loss": 0.77460921, "num_input_tokens_seen": 335547565, "step": 15556, "time_per_iteration": 3.059551477432251 }, { "auxiliary_loss_clip": 0.01085152, "auxiliary_loss_mlp": 0.01031847, "balance_loss_clip": 1.03423309, "balance_loss_mlp": 1.02010441, "epoch": 0.935337441755599, "flos": 27600259294080.0, "grad_norm": 1.4976349419869153, "language_loss": 0.72337437, "learning_rate": 4.366162314334953e-08, "loss": 0.74454439, "num_input_tokens_seen": 335570285, "step": 15557, "time_per_iteration": 4.6448400020599365 }, { "auxiliary_loss_clip": 0.01108474, "auxiliary_loss_mlp": 0.01033173, "balance_loss_clip": 1.0365355, "balance_loss_mlp": 1.01982188, "epoch": 0.935397565008267, "flos": 20482872209280.0, "grad_norm": 1.660550178775489, "language_loss": 0.63404226, "learning_rate": 4.358072616408681e-08, "loss": 0.65545875, "num_input_tokens_seen": 335588600, "step": 15558, "time_per_iteration": 2.6054418087005615 }, { "auxiliary_loss_clip": 0.01087055, "auxiliary_loss_mlp": 0.01030987, "balance_loss_clip": 1.03696275, "balance_loss_mlp": 1.01723039, "epoch": 0.9354576882609349, "flos": 23654394541440.0, "grad_norm": 1.8757208660532867, "language_loss": 0.72988653, "learning_rate": 4.34999033724388e-08, "loss": 0.75106692, "num_input_tokens_seen": 335606235, "step": 15559, "time_per_iteration": 2.6042425632476807 }, { "auxiliary_loss_clip": 0.01053197, "auxiliary_loss_mlp": 0.00769565, "balance_loss_clip": 1.03214157, "balance_loss_mlp": 1.00029421, "epoch": 0.9355178115136029, "flos": 36684904406400.0, "grad_norm": 2.2075476861746526, "language_loss": 0.63396823, "learning_rate": 4.341915477147062e-08, "loss": 0.65219581, "num_input_tokens_seen": 335628240, "step": 15560, "time_per_iteration": 4.612861633300781 }, { "auxiliary_loss_clip": 0.01049187, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.03704762, "balance_loss_mlp": 1.02041054, "epoch": 0.9355779347662708, "flos": 14460401450880.0, "grad_norm": 2.3052566052568193, "language_loss": 0.64168519, "learning_rate": 4.3338480364244034e-08, "loss": 0.66252398, "num_input_tokens_seen": 335643755, "step": 15561, "time_per_iteration": 4.437899827957153 }, { "auxiliary_loss_clip": 0.0110932, "auxiliary_loss_mlp": 0.01036594, "balance_loss_clip": 1.03827786, "balance_loss_mlp": 1.02375484, "epoch": 0.9356380580189388, "flos": 23185976256000.0, "grad_norm": 1.6937389446463813, "language_loss": 0.75591785, "learning_rate": 4.325788015381859e-08, "loss": 0.77737701, "num_input_tokens_seen": 335665160, "step": 15562, "time_per_iteration": 2.7620413303375244 }, { "auxiliary_loss_clip": 0.01016437, "auxiliary_loss_mlp": 0.01002066, "balance_loss_clip": 1.00517988, "balance_loss_mlp": 1.0011481, "epoch": 0.9356981812716068, "flos": 67471626090240.0, "grad_norm": 0.9484711819717426, "language_loss": 0.62294793, "learning_rate": 4.31773541432503e-08, "loss": 0.64313298, "num_input_tokens_seen": 335715240, "step": 15563, "time_per_iteration": 4.560046672821045 }, { "auxiliary_loss_clip": 0.01059821, "auxiliary_loss_mlp": 0.01032447, "balance_loss_clip": 1.0360657, "balance_loss_mlp": 1.02043045, "epoch": 0.9357583045242748, "flos": 24681619687680.0, "grad_norm": 1.6275464297875282, "language_loss": 0.78383303, "learning_rate": 4.3096902335592714e-08, "loss": 0.80475569, "num_input_tokens_seen": 335734970, "step": 15564, "time_per_iteration": 2.7581684589385986 }, { "auxiliary_loss_clip": 0.01111071, "auxiliary_loss_mlp": 0.0103029, "balance_loss_clip": 1.0369916, "balance_loss_mlp": 1.0166707, "epoch": 0.9358184277769427, "flos": 19463727623040.0, "grad_norm": 2.0197933120923164, "language_loss": 0.78051835, "learning_rate": 4.301652473389694e-08, "loss": 0.80193192, "num_input_tokens_seen": 335753435, "step": 15565, "time_per_iteration": 2.6100919246673584 }, { "auxiliary_loss_clip": 0.01094214, "auxiliary_loss_mlp": 0.01031155, "balance_loss_clip": 1.03455317, "balance_loss_mlp": 1.01927018, "epoch": 0.9358785510296107, "flos": 18916987731840.0, "grad_norm": 3.310186857599268, "language_loss": 0.72122169, "learning_rate": 4.2936221341210774e-08, "loss": 0.74247533, "num_input_tokens_seen": 335772105, "step": 15566, "time_per_iteration": 2.5962870121002197 }, { "auxiliary_loss_clip": 0.0106957, "auxiliary_loss_mlp": 0.00771396, "balance_loss_clip": 1.03290153, "balance_loss_mlp": 1.00026023, "epoch": 0.9359386742822787, "flos": 23441265192960.0, "grad_norm": 1.9305985811175064, "language_loss": 0.67621976, "learning_rate": 4.285599216057889e-08, "loss": 0.69462943, "num_input_tokens_seen": 335789125, "step": 15567, "time_per_iteration": 2.6770172119140625 }, { "auxiliary_loss_clip": 0.01078108, "auxiliary_loss_mlp": 0.0103394, "balance_loss_clip": 1.03551221, "balance_loss_mlp": 1.02124989, "epoch": 0.9359987975349466, "flos": 32744067557760.0, "grad_norm": 3.1950642417815778, "language_loss": 0.62192923, "learning_rate": 4.277583719504418e-08, "loss": 0.64304972, "num_input_tokens_seen": 335810995, "step": 15568, "time_per_iteration": 2.7253639698028564 }, { "auxiliary_loss_clip": 0.01082433, "auxiliary_loss_mlp": 0.01037861, "balance_loss_clip": 1.03121352, "balance_loss_mlp": 1.02551699, "epoch": 0.9360589207876147, "flos": 22819651401600.0, "grad_norm": 1.8305652991188055, "language_loss": 0.7874766, "learning_rate": 4.269575644764556e-08, "loss": 0.80867952, "num_input_tokens_seen": 335830580, "step": 15569, "time_per_iteration": 2.648876905441284 }, { "auxiliary_loss_clip": 0.01090445, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.03811383, "balance_loss_mlp": 1.02041864, "epoch": 0.9361190440402826, "flos": 20885251340160.0, "grad_norm": 3.2418615597680263, "language_loss": 0.697613, "learning_rate": 4.261574992142014e-08, "loss": 0.71884787, "num_input_tokens_seen": 335846515, "step": 15570, "time_per_iteration": 2.695789337158203 }, { "auxiliary_loss_clip": 0.0109347, "auxiliary_loss_mlp": 0.01030501, "balance_loss_clip": 1.03646827, "balance_loss_mlp": 1.0180912, "epoch": 0.9361791672929506, "flos": 19317822577920.0, "grad_norm": 3.942506001346151, "language_loss": 0.78369403, "learning_rate": 4.2535817619401726e-08, "loss": 0.80493373, "num_input_tokens_seen": 335863350, "step": 15571, "time_per_iteration": 2.613274335861206 }, { "auxiliary_loss_clip": 0.01076748, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.03460646, "balance_loss_mlp": 1.01874197, "epoch": 0.9362392905456185, "flos": 15158182032000.0, "grad_norm": 2.841657798727435, "language_loss": 0.77677691, "learning_rate": 4.2455959544621224e-08, "loss": 0.79785693, "num_input_tokens_seen": 335880510, "step": 15572, "time_per_iteration": 2.803063154220581 }, { "auxiliary_loss_clip": 0.01082647, "auxiliary_loss_mlp": 0.01041561, "balance_loss_clip": 1.03344643, "balance_loss_mlp": 1.0294075, "epoch": 0.9362994137982865, "flos": 22085888371200.0, "grad_norm": 1.8952922672820693, "language_loss": 0.78173578, "learning_rate": 4.237617570010688e-08, "loss": 0.80297786, "num_input_tokens_seen": 335899440, "step": 15573, "time_per_iteration": 2.64709734916687 }, { "auxiliary_loss_clip": 0.01072731, "auxiliary_loss_mlp": 0.01028381, "balance_loss_clip": 1.03296494, "balance_loss_mlp": 1.01635885, "epoch": 0.9363595370509544, "flos": 23512260424320.0, "grad_norm": 2.4715293938233316, "language_loss": 0.74473417, "learning_rate": 4.2296466088884044e-08, "loss": 0.76574528, "num_input_tokens_seen": 335919540, "step": 15574, "time_per_iteration": 2.8169214725494385 }, { "auxiliary_loss_clip": 0.01050172, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.03272486, "balance_loss_mlp": 1.02266467, "epoch": 0.9364196603036224, "flos": 27123473139840.0, "grad_norm": 1.920556373302248, "language_loss": 0.68192244, "learning_rate": 4.221683071397564e-08, "loss": 0.70278013, "num_input_tokens_seen": 335939665, "step": 15575, "time_per_iteration": 2.798386573791504 }, { "auxiliary_loss_clip": 0.01078254, "auxiliary_loss_mlp": 0.01033935, "balance_loss_clip": 1.03272521, "balance_loss_mlp": 1.02136481, "epoch": 0.9364797835562904, "flos": 18479057114880.0, "grad_norm": 1.7600184524514564, "language_loss": 0.65193367, "learning_rate": 4.2137269578401026e-08, "loss": 0.67305553, "num_input_tokens_seen": 335958580, "step": 15576, "time_per_iteration": 2.6554365158081055 }, { "auxiliary_loss_clip": 0.01093147, "auxiliary_loss_mlp": 0.01030045, "balance_loss_clip": 1.03174019, "balance_loss_mlp": 1.0161159, "epoch": 0.9365399068089584, "flos": 13005552890880.0, "grad_norm": 2.420160931511476, "language_loss": 0.76176679, "learning_rate": 4.2057782685177566e-08, "loss": 0.78299868, "num_input_tokens_seen": 335974965, "step": 15577, "time_per_iteration": 2.5658376216888428 }, { "auxiliary_loss_clip": 0.01062399, "auxiliary_loss_mlp": 0.01030216, "balance_loss_clip": 1.03228045, "balance_loss_mlp": 1.01722205, "epoch": 0.9366000300616263, "flos": 25666433850240.0, "grad_norm": 3.270982347260187, "language_loss": 0.52259952, "learning_rate": 4.1978370037318855e-08, "loss": 0.5435257, "num_input_tokens_seen": 335996575, "step": 15578, "time_per_iteration": 2.753800392150879 }, { "auxiliary_loss_clip": 0.01044474, "auxiliary_loss_mlp": 0.01035654, "balance_loss_clip": 1.03016138, "balance_loss_mlp": 1.02336335, "epoch": 0.9366601533142943, "flos": 21433355948160.0, "grad_norm": 1.5769540357516516, "language_loss": 0.70730215, "learning_rate": 4.189903163783692e-08, "loss": 0.7281034, "num_input_tokens_seen": 336017265, "step": 15579, "time_per_iteration": 2.776789903640747 }, { "auxiliary_loss_clip": 0.01081419, "auxiliary_loss_mlp": 0.01027227, "balance_loss_clip": 1.03318858, "balance_loss_mlp": 1.01544309, "epoch": 0.9367202765669622, "flos": 24093222998400.0, "grad_norm": 1.8470459873947023, "language_loss": 0.76132309, "learning_rate": 4.181976748973959e-08, "loss": 0.78240955, "num_input_tokens_seen": 336035905, "step": 15580, "time_per_iteration": 2.685457468032837 }, { "auxiliary_loss_clip": 0.01097941, "auxiliary_loss_mlp": 0.01031636, "balance_loss_clip": 1.03599906, "balance_loss_mlp": 1.01848698, "epoch": 0.9367803998196302, "flos": 20888842700160.0, "grad_norm": 1.6988000782542536, "language_loss": 0.66216934, "learning_rate": 4.1740577596033114e-08, "loss": 0.68346512, "num_input_tokens_seen": 336055585, "step": 15581, "time_per_iteration": 2.642705202102661 }, { "auxiliary_loss_clip": 0.01099156, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.03769445, "balance_loss_mlp": 1.01575327, "epoch": 0.9368405230722983, "flos": 22564362464640.0, "grad_norm": 1.6283591696925621, "language_loss": 0.76962942, "learning_rate": 4.166146195972042e-08, "loss": 0.79090333, "num_input_tokens_seen": 336076695, "step": 15582, "time_per_iteration": 2.6836650371551514 }, { "auxiliary_loss_clip": 0.01033952, "auxiliary_loss_mlp": 0.01033194, "balance_loss_clip": 1.03131258, "balance_loss_mlp": 1.02007508, "epoch": 0.9369006463249662, "flos": 18880215183360.0, "grad_norm": 1.9959612516768654, "language_loss": 0.73610139, "learning_rate": 4.1582420583800905e-08, "loss": 0.75677288, "num_input_tokens_seen": 336094740, "step": 15583, "time_per_iteration": 2.9247751235961914 }, { "auxiliary_loss_clip": 0.01113025, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.03807962, "balance_loss_mlp": 1.01861954, "epoch": 0.9369607695776342, "flos": 26432516142720.0, "grad_norm": 2.0019759787362417, "language_loss": 0.84050167, "learning_rate": 4.1503453471272376e-08, "loss": 0.86195087, "num_input_tokens_seen": 336113985, "step": 15584, "time_per_iteration": 2.7832884788513184 }, { "auxiliary_loss_clip": 0.01098693, "auxiliary_loss_mlp": 0.00771025, "balance_loss_clip": 1.03800797, "balance_loss_mlp": 1.00032699, "epoch": 0.9370208928303021, "flos": 39567346081920.0, "grad_norm": 1.4532418436154226, "language_loss": 0.72163695, "learning_rate": 4.1424560625129334e-08, "loss": 0.74033409, "num_input_tokens_seen": 336136395, "step": 15585, "time_per_iteration": 2.81025767326355 }, { "auxiliary_loss_clip": 0.01073011, "auxiliary_loss_mlp": 0.01021827, "balance_loss_clip": 1.03393424, "balance_loss_mlp": 1.01078236, "epoch": 0.9370810160829701, "flos": 22963114321920.0, "grad_norm": 1.7172742978336988, "language_loss": 0.8027873, "learning_rate": 4.134574204836316e-08, "loss": 0.82373559, "num_input_tokens_seen": 336156345, "step": 15586, "time_per_iteration": 2.66705322265625 }, { "auxiliary_loss_clip": 0.01068881, "auxiliary_loss_mlp": 0.01036223, "balance_loss_clip": 1.03491676, "balance_loss_mlp": 1.0236938, "epoch": 0.937141139335638, "flos": 23075048079360.0, "grad_norm": 1.5900972808595355, "language_loss": 0.76568019, "learning_rate": 4.126699774396258e-08, "loss": 0.78673124, "num_input_tokens_seen": 336176760, "step": 15587, "time_per_iteration": 2.696638345718384 }, { "auxiliary_loss_clip": 0.01089529, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.03515196, "balance_loss_mlp": 1.02427721, "epoch": 0.937201262588306, "flos": 16356664247040.0, "grad_norm": 1.8569642874741914, "language_loss": 0.87623429, "learning_rate": 4.118832771491387e-08, "loss": 0.89750302, "num_input_tokens_seen": 336193285, "step": 15588, "time_per_iteration": 2.6571919918060303 }, { "auxiliary_loss_clip": 0.01106178, "auxiliary_loss_mlp": 0.00770286, "balance_loss_clip": 1.03689957, "balance_loss_mlp": 1.0001812, "epoch": 0.937261385840974, "flos": 20194078861440.0, "grad_norm": 1.9442823727757126, "language_loss": 0.78136659, "learning_rate": 4.11097319642002e-08, "loss": 0.80013114, "num_input_tokens_seen": 336211425, "step": 15589, "time_per_iteration": 2.5364420413970947 }, { "auxiliary_loss_clip": 0.01106688, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.03706598, "balance_loss_mlp": 1.02196836, "epoch": 0.937321509093642, "flos": 18295948558080.0, "grad_norm": 1.7833559240011974, "language_loss": 0.77980852, "learning_rate": 4.103121049480163e-08, "loss": 0.80121559, "num_input_tokens_seen": 336230205, "step": 15590, "time_per_iteration": 2.5236690044403076 }, { "auxiliary_loss_clip": 0.01079152, "auxiliary_loss_mlp": 0.01038917, "balance_loss_clip": 1.03359151, "balance_loss_mlp": 1.02445698, "epoch": 0.9373816323463099, "flos": 25884662929920.0, "grad_norm": 1.8039863283037736, "language_loss": 0.71324873, "learning_rate": 4.095276330969577e-08, "loss": 0.73442948, "num_input_tokens_seen": 336252440, "step": 15591, "time_per_iteration": 2.675104856491089 }, { "auxiliary_loss_clip": 0.01097841, "auxiliary_loss_mlp": 0.00771749, "balance_loss_clip": 1.03754783, "balance_loss_mlp": 1.00026131, "epoch": 0.9374417555989779, "flos": 27198849830400.0, "grad_norm": 2.2483638992357844, "language_loss": 0.53910917, "learning_rate": 4.0874390411857804e-08, "loss": 0.55780506, "num_input_tokens_seen": 336273845, "step": 15592, "time_per_iteration": 2.620513439178467 }, { "auxiliary_loss_clip": 0.01092328, "auxiliary_loss_mlp": 0.01027667, "balance_loss_clip": 1.03775334, "balance_loss_mlp": 1.01602066, "epoch": 0.9375018788516458, "flos": 23621249266560.0, "grad_norm": 1.5680593734812756, "language_loss": 0.67480534, "learning_rate": 4.0796091804259136e-08, "loss": 0.69600528, "num_input_tokens_seen": 336292790, "step": 15593, "time_per_iteration": 2.606893301010132 }, { "auxiliary_loss_clip": 0.01086764, "auxiliary_loss_mlp": 0.01028451, "balance_loss_clip": 1.03426361, "balance_loss_mlp": 1.01641703, "epoch": 0.9375620021043138, "flos": 22678774260480.0, "grad_norm": 1.5375149732930165, "language_loss": 0.74182671, "learning_rate": 4.0717867489868715e-08, "loss": 0.76297885, "num_input_tokens_seen": 336312600, "step": 15594, "time_per_iteration": 2.6576709747314453 }, { "auxiliary_loss_clip": 0.01093114, "auxiliary_loss_mlp": 0.01027231, "balance_loss_clip": 1.03431714, "balance_loss_mlp": 1.01590586, "epoch": 0.9376221253569819, "flos": 27560254521600.0, "grad_norm": 1.6954995365401158, "language_loss": 0.74231362, "learning_rate": 4.063971747165351e-08, "loss": 0.76351708, "num_input_tokens_seen": 336332770, "step": 15595, "time_per_iteration": 2.6582190990448 }, { "auxiliary_loss_clip": 0.01080536, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.03524542, "balance_loss_mlp": 1.01823688, "epoch": 0.9376822486096498, "flos": 24129887806080.0, "grad_norm": 1.8418600900837818, "language_loss": 0.75974333, "learning_rate": 4.056164175257626e-08, "loss": 0.78084862, "num_input_tokens_seen": 336351445, "step": 15596, "time_per_iteration": 2.6803321838378906 }, { "auxiliary_loss_clip": 0.01079836, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.03544092, "balance_loss_mlp": 1.01825309, "epoch": 0.9377423718623178, "flos": 22784028088320.0, "grad_norm": 1.7137269862110038, "language_loss": 0.78881788, "learning_rate": 4.0483640335597926e-08, "loss": 0.80992472, "num_input_tokens_seen": 336368690, "step": 15597, "time_per_iteration": 4.308673143386841 }, { "auxiliary_loss_clip": 0.01113389, "auxiliary_loss_mlp": 0.01033675, "balance_loss_clip": 1.03775406, "balance_loss_mlp": 1.02094936, "epoch": 0.9378024951149857, "flos": 19168900790400.0, "grad_norm": 1.564327070136616, "language_loss": 0.81037343, "learning_rate": 4.0405713223676363e-08, "loss": 0.83184403, "num_input_tokens_seen": 336388165, "step": 15598, "time_per_iteration": 2.5458343029022217 }, { "auxiliary_loss_clip": 0.01077427, "auxiliary_loss_mlp": 0.01031736, "balance_loss_clip": 1.03376913, "balance_loss_mlp": 1.01846755, "epoch": 0.9378626183676537, "flos": 23505508667520.0, "grad_norm": 2.005294265343008, "language_loss": 0.62860727, "learning_rate": 4.0327860419766994e-08, "loss": 0.64969885, "num_input_tokens_seen": 336406475, "step": 15599, "time_per_iteration": 2.638820171356201 }, { "auxiliary_loss_clip": 0.01068952, "auxiliary_loss_mlp": 0.01033888, "balance_loss_clip": 1.03511238, "balance_loss_mlp": 1.0210557, "epoch": 0.9379227416203216, "flos": 18405655672320.0, "grad_norm": 1.8480598201397724, "language_loss": 0.73232383, "learning_rate": 4.0250081926821e-08, "loss": 0.75335222, "num_input_tokens_seen": 336424690, "step": 15600, "time_per_iteration": 6.016250848770142 }, { "auxiliary_loss_clip": 0.01083039, "auxiliary_loss_mlp": 0.01031732, "balance_loss_clip": 1.03592873, "balance_loss_mlp": 1.02013838, "epoch": 0.9379828648729897, "flos": 17821855923840.0, "grad_norm": 1.7892851032269996, "language_loss": 0.69339931, "learning_rate": 4.0172377747788474e-08, "loss": 0.71454704, "num_input_tokens_seen": 336443055, "step": 15601, "time_per_iteration": 2.6296818256378174 }, { "auxiliary_loss_clip": 0.01019215, "auxiliary_loss_mlp": 0.01003727, "balance_loss_clip": 1.00596642, "balance_loss_mlp": 1.00267816, "epoch": 0.9380429881256576, "flos": 68024399466240.0, "grad_norm": 0.7524579876237703, "language_loss": 0.58074123, "learning_rate": 4.009474788561573e-08, "loss": 0.60097063, "num_input_tokens_seen": 336510190, "step": 15602, "time_per_iteration": 3.3650712966918945 }, { "auxiliary_loss_clip": 0.01035142, "auxiliary_loss_mlp": 0.01039086, "balance_loss_clip": 1.03298295, "balance_loss_mlp": 1.02651513, "epoch": 0.9381031113783256, "flos": 20776980769920.0, "grad_norm": 2.016134606608171, "language_loss": 0.71942216, "learning_rate": 4.001719234324663e-08, "loss": 0.7401644, "num_input_tokens_seen": 336529250, "step": 15603, "time_per_iteration": 4.292678356170654 }, { "auxiliary_loss_clip": 0.01100161, "auxiliary_loss_mlp": 0.01029251, "balance_loss_clip": 1.03342152, "balance_loss_mlp": 1.01796222, "epoch": 0.9381632346309935, "flos": 19025078734080.0, "grad_norm": 1.630988444834905, "language_loss": 0.76084709, "learning_rate": 3.993971112362171e-08, "loss": 0.78214121, "num_input_tokens_seen": 336548530, "step": 15604, "time_per_iteration": 2.5863354206085205 }, { "auxiliary_loss_clip": 0.01083382, "auxiliary_loss_mlp": 0.01039836, "balance_loss_clip": 1.03308749, "balance_loss_mlp": 1.02494097, "epoch": 0.9382233578836615, "flos": 23513840622720.0, "grad_norm": 2.0761522756468005, "language_loss": 0.65524292, "learning_rate": 3.9862304229679734e-08, "loss": 0.67647505, "num_input_tokens_seen": 336568510, "step": 15605, "time_per_iteration": 2.7903220653533936 }, { "auxiliary_loss_clip": 0.01075306, "auxiliary_loss_mlp": 0.00770626, "balance_loss_clip": 1.03514504, "balance_loss_mlp": 1.00017333, "epoch": 0.9382834811363294, "flos": 43067882016000.0, "grad_norm": 2.098655820983203, "language_loss": 0.67783493, "learning_rate": 3.9784971664355683e-08, "loss": 0.69629425, "num_input_tokens_seen": 336592020, "step": 15606, "time_per_iteration": 2.8691816329956055 }, { "auxiliary_loss_clip": 0.01091361, "auxiliary_loss_mlp": 0.01027503, "balance_loss_clip": 1.03324687, "balance_loss_mlp": 1.01593983, "epoch": 0.9383436043889974, "flos": 16436242828800.0, "grad_norm": 1.7643369071420325, "language_loss": 0.77210492, "learning_rate": 3.970771343058166e-08, "loss": 0.7932936, "num_input_tokens_seen": 336610010, "step": 15607, "time_per_iteration": 2.670970916748047 }, { "auxiliary_loss_clip": 0.01098186, "auxiliary_loss_mlp": 0.01027702, "balance_loss_clip": 1.03540564, "balance_loss_mlp": 1.01609111, "epoch": 0.9384037276416655, "flos": 20740603271040.0, "grad_norm": 2.3923436535832927, "language_loss": 0.82524753, "learning_rate": 3.963052953128776e-08, "loss": 0.84650642, "num_input_tokens_seen": 336628520, "step": 15608, "time_per_iteration": 2.6184029579162598 }, { "auxiliary_loss_clip": 0.01099685, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.0386703, "balance_loss_mlp": 1.02291393, "epoch": 0.9384638508943334, "flos": 19062677295360.0, "grad_norm": 1.6462700950548765, "language_loss": 0.68830276, "learning_rate": 3.9553419969400536e-08, "loss": 0.7096563, "num_input_tokens_seen": 336647365, "step": 15609, "time_per_iteration": 2.5987517833709717 }, { "auxiliary_loss_clip": 0.01080403, "auxiliary_loss_mlp": 0.01031835, "balance_loss_clip": 1.03563523, "balance_loss_mlp": 1.01835871, "epoch": 0.9385239741470014, "flos": 23404887694080.0, "grad_norm": 2.499558460038554, "language_loss": 0.75453949, "learning_rate": 3.9476384747844316e-08, "loss": 0.77566183, "num_input_tokens_seen": 336667165, "step": 15610, "time_per_iteration": 2.7642691135406494 }, { "auxiliary_loss_clip": 0.01044401, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.03432107, "balance_loss_mlp": 1.0161804, "epoch": 0.9385840973996693, "flos": 12824742804480.0, "grad_norm": 2.318341323536946, "language_loss": 0.75083077, "learning_rate": 3.939942386953987e-08, "loss": 0.77155006, "num_input_tokens_seen": 336684130, "step": 15611, "time_per_iteration": 2.753612518310547 }, { "auxiliary_loss_clip": 0.01069021, "auxiliary_loss_mlp": 0.01029199, "balance_loss_clip": 1.03686237, "balance_loss_mlp": 1.01732564, "epoch": 0.9386442206523373, "flos": 15486980152320.0, "grad_norm": 1.8818734447956798, "language_loss": 0.6593554, "learning_rate": 3.9322537337405756e-08, "loss": 0.68033767, "num_input_tokens_seen": 336701520, "step": 15612, "time_per_iteration": 2.637763738632202 }, { "auxiliary_loss_clip": 0.01095795, "auxiliary_loss_mlp": 0.01028771, "balance_loss_clip": 1.03593373, "balance_loss_mlp": 1.01703501, "epoch": 0.9387043439050052, "flos": 21178821196800.0, "grad_norm": 2.000722743721445, "language_loss": 0.57039118, "learning_rate": 3.924572515435742e-08, "loss": 0.59163684, "num_input_tokens_seen": 336720675, "step": 15613, "time_per_iteration": 2.733313798904419 }, { "auxiliary_loss_clip": 0.01084485, "auxiliary_loss_mlp": 0.01036056, "balance_loss_clip": 1.03319824, "balance_loss_mlp": 1.02405143, "epoch": 0.9387644671576733, "flos": 27668273696640.0, "grad_norm": 2.367003168945266, "language_loss": 0.70944715, "learning_rate": 3.916898732330764e-08, "loss": 0.73065257, "num_input_tokens_seen": 336741005, "step": 15614, "time_per_iteration": 2.706362009048462 }, { "auxiliary_loss_clip": 0.01101068, "auxiliary_loss_mlp": 0.01031027, "balance_loss_clip": 1.03795266, "balance_loss_mlp": 1.018224, "epoch": 0.9388245904103412, "flos": 18836331742080.0, "grad_norm": 1.9586081993753126, "language_loss": 0.81213439, "learning_rate": 3.9092323847166544e-08, "loss": 0.83345532, "num_input_tokens_seen": 336757990, "step": 15615, "time_per_iteration": 2.5509698390960693 }, { "auxiliary_loss_clip": 0.01078844, "auxiliary_loss_mlp": 0.01030958, "balance_loss_clip": 1.03181601, "balance_loss_mlp": 1.01881695, "epoch": 0.9388847136630092, "flos": 25483828083840.0, "grad_norm": 1.8203668140159897, "language_loss": 0.71924144, "learning_rate": 3.901573472884134e-08, "loss": 0.7403394, "num_input_tokens_seen": 336777705, "step": 15616, "time_per_iteration": 2.6393303871154785 }, { "auxiliary_loss_clip": 0.01108573, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.03755164, "balance_loss_mlp": 1.01691222, "epoch": 0.9389448369156771, "flos": 18734992496640.0, "grad_norm": 2.3142633085226536, "language_loss": 0.66507453, "learning_rate": 3.89392199712355e-08, "loss": 0.68645382, "num_input_tokens_seen": 336798275, "step": 15617, "time_per_iteration": 2.5801546573638916 }, { "auxiliary_loss_clip": 0.01100466, "auxiliary_loss_mlp": 0.01036181, "balance_loss_clip": 1.03689265, "balance_loss_mlp": 1.02243066, "epoch": 0.9390049601683451, "flos": 21717839664000.0, "grad_norm": 2.370004086672154, "language_loss": 0.73481232, "learning_rate": 3.886277957725092e-08, "loss": 0.7561788, "num_input_tokens_seen": 336813835, "step": 15618, "time_per_iteration": 2.6102712154388428 }, { "auxiliary_loss_clip": 0.01114877, "auxiliary_loss_mlp": 0.01031951, "balance_loss_clip": 1.03841376, "balance_loss_mlp": 1.01817656, "epoch": 0.939065083421013, "flos": 19391224020480.0, "grad_norm": 1.8942748596777075, "language_loss": 0.70133412, "learning_rate": 3.878641354978662e-08, "loss": 0.7228024, "num_input_tokens_seen": 336832210, "step": 15619, "time_per_iteration": 2.5149004459381104 }, { "auxiliary_loss_clip": 0.01083274, "auxiliary_loss_mlp": 0.01031368, "balance_loss_clip": 1.03280878, "balance_loss_mlp": 1.01836836, "epoch": 0.939125206673681, "flos": 24681511946880.0, "grad_norm": 1.6109808579498737, "language_loss": 0.7760632, "learning_rate": 3.8710121891737834e-08, "loss": 0.79720962, "num_input_tokens_seen": 336851380, "step": 15620, "time_per_iteration": 2.6531193256378174 }, { "auxiliary_loss_clip": 0.01092968, "auxiliary_loss_mlp": 0.01027438, "balance_loss_clip": 1.03448093, "balance_loss_mlp": 1.01568961, "epoch": 0.9391853299263491, "flos": 16325961096960.0, "grad_norm": 3.857357976396781, "language_loss": 0.73641354, "learning_rate": 3.8633904605998025e-08, "loss": 0.75761759, "num_input_tokens_seen": 336868525, "step": 15621, "time_per_iteration": 2.5519356727600098 }, { "auxiliary_loss_clip": 0.01077862, "auxiliary_loss_mlp": 0.01033031, "balance_loss_clip": 1.03406405, "balance_loss_mlp": 1.01961446, "epoch": 0.939245453179017, "flos": 11655778590720.0, "grad_norm": 2.005738336602588, "language_loss": 0.66011965, "learning_rate": 3.855776169545688e-08, "loss": 0.68122858, "num_input_tokens_seen": 336886200, "step": 15622, "time_per_iteration": 2.649592876434326 }, { "auxiliary_loss_clip": 0.01080227, "auxiliary_loss_mlp": 0.01039822, "balance_loss_clip": 1.03199553, "balance_loss_mlp": 1.02594018, "epoch": 0.939305576431685, "flos": 23148700917120.0, "grad_norm": 1.5853407957277033, "language_loss": 0.71721888, "learning_rate": 3.848169316300209e-08, "loss": 0.73841941, "num_input_tokens_seen": 336905815, "step": 15623, "time_per_iteration": 2.6309492588043213 }, { "auxiliary_loss_clip": 0.01101847, "auxiliary_loss_mlp": 0.01031834, "balance_loss_clip": 1.03930306, "balance_loss_mlp": 1.01934707, "epoch": 0.9393656996843529, "flos": 33287790706560.0, "grad_norm": 1.923924688159949, "language_loss": 0.72363102, "learning_rate": 3.84056990115178e-08, "loss": 0.74496788, "num_input_tokens_seen": 336928460, "step": 15624, "time_per_iteration": 2.7837047576904297 }, { "auxiliary_loss_clip": 0.01071928, "auxiliary_loss_mlp": 0.01033178, "balance_loss_clip": 1.03422618, "balance_loss_mlp": 1.02049983, "epoch": 0.9394258229370209, "flos": 21689434984320.0, "grad_norm": 2.3403915430461333, "language_loss": 0.89429879, "learning_rate": 3.832977924388614e-08, "loss": 0.91534984, "num_input_tokens_seen": 336948320, "step": 15625, "time_per_iteration": 2.7144711017608643 }, { "auxiliary_loss_clip": 0.01096935, "auxiliary_loss_mlp": 0.01031047, "balance_loss_clip": 1.03645694, "balance_loss_mlp": 1.01787996, "epoch": 0.9394859461896888, "flos": 23874203819520.0, "grad_norm": 2.0332287450304074, "language_loss": 0.83621097, "learning_rate": 3.825393386298592e-08, "loss": 0.85749084, "num_input_tokens_seen": 336967670, "step": 15626, "time_per_iteration": 2.71279239654541 }, { "auxiliary_loss_clip": 0.01012548, "auxiliary_loss_mlp": 0.01006796, "balance_loss_clip": 1.00825083, "balance_loss_mlp": 1.00575864, "epoch": 0.9395460694423569, "flos": 61566116993280.0, "grad_norm": 0.7779274792928904, "language_loss": 0.56076801, "learning_rate": 3.8178162871693284e-08, "loss": 0.58096135, "num_input_tokens_seen": 337028395, "step": 15627, "time_per_iteration": 3.1956591606140137 }, { "auxiliary_loss_clip": 0.01058297, "auxiliary_loss_mlp": 0.01041812, "balance_loss_clip": 1.0335449, "balance_loss_mlp": 1.02838874, "epoch": 0.9396061926950248, "flos": 20995712640000.0, "grad_norm": 1.8076515347951383, "language_loss": 0.70110631, "learning_rate": 3.810246627288105e-08, "loss": 0.72210741, "num_input_tokens_seen": 337048150, "step": 15628, "time_per_iteration": 2.6945135593414307 }, { "auxiliary_loss_clip": 0.01096653, "auxiliary_loss_mlp": 0.01028484, "balance_loss_clip": 1.03629029, "balance_loss_mlp": 1.01632452, "epoch": 0.9396663159476928, "flos": 27487786832640.0, "grad_norm": 1.4683164605088868, "language_loss": 0.75408161, "learning_rate": 3.8026844069420025e-08, "loss": 0.77533293, "num_input_tokens_seen": 337069315, "step": 15629, "time_per_iteration": 2.697967052459717 }, { "auxiliary_loss_clip": 0.01044306, "auxiliary_loss_mlp": 0.01039724, "balance_loss_clip": 1.030352, "balance_loss_mlp": 1.02693844, "epoch": 0.9397264392003607, "flos": 19427457864960.0, "grad_norm": 1.8515111751581672, "language_loss": 0.74173099, "learning_rate": 3.795129626417748e-08, "loss": 0.76257128, "num_input_tokens_seen": 337087765, "step": 15630, "time_per_iteration": 2.7693710327148438 }, { "auxiliary_loss_clip": 0.01073693, "auxiliary_loss_mlp": 0.01035386, "balance_loss_clip": 1.03482318, "balance_loss_mlp": 1.02306604, "epoch": 0.9397865624530287, "flos": 18004820826240.0, "grad_norm": 2.3868141185330485, "language_loss": 0.69397956, "learning_rate": 3.787582286001845e-08, "loss": 0.71507031, "num_input_tokens_seen": 337106265, "step": 15631, "time_per_iteration": 2.7210657596588135 }, { "auxiliary_loss_clip": 0.01057041, "auxiliary_loss_mlp": 0.01038236, "balance_loss_clip": 1.0333792, "balance_loss_mlp": 1.02626777, "epoch": 0.9398466857056966, "flos": 22564613859840.0, "grad_norm": 1.5129301375877884, "language_loss": 0.75246739, "learning_rate": 3.7800423859805086e-08, "loss": 0.77342016, "num_input_tokens_seen": 337126090, "step": 15632, "time_per_iteration": 2.7409205436706543 }, { "auxiliary_loss_clip": 0.01103425, "auxiliary_loss_mlp": 0.01036148, "balance_loss_clip": 1.03828955, "balance_loss_mlp": 1.02260005, "epoch": 0.9399068089583646, "flos": 24535678728960.0, "grad_norm": 1.6016514710570828, "language_loss": 0.74265265, "learning_rate": 3.772509926639622e-08, "loss": 0.76404846, "num_input_tokens_seen": 337145655, "step": 15633, "time_per_iteration": 2.5950539112091064 }, { "auxiliary_loss_clip": 0.01110088, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.03653955, "balance_loss_mlp": 1.0211246, "epoch": 0.9399669322110327, "flos": 25630343660160.0, "grad_norm": 1.9203445491908095, "language_loss": 0.72707498, "learning_rate": 3.764984908264823e-08, "loss": 0.74852049, "num_input_tokens_seen": 337164805, "step": 15634, "time_per_iteration": 2.5872409343719482 }, { "auxiliary_loss_clip": 0.01098967, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.03486001, "balance_loss_mlp": 1.01823497, "epoch": 0.9400270554637006, "flos": 17089385783040.0, "grad_norm": 2.514594285895435, "language_loss": 0.68870479, "learning_rate": 3.75746733114144e-08, "loss": 0.71000671, "num_input_tokens_seen": 337182280, "step": 15635, "time_per_iteration": 2.600447654724121 }, { "auxiliary_loss_clip": 0.01056848, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 1.03640127, "balance_loss_mlp": 1.01715422, "epoch": 0.9400871787163686, "flos": 22055113393920.0, "grad_norm": 1.5676691824914186, "language_loss": 0.74045342, "learning_rate": 3.7499571955545985e-08, "loss": 0.76131296, "num_input_tokens_seen": 337203495, "step": 15636, "time_per_iteration": 2.6919074058532715 }, { "auxiliary_loss_clip": 0.01099321, "auxiliary_loss_mlp": 0.01032818, "balance_loss_clip": 1.03794205, "balance_loss_mlp": 1.02044368, "epoch": 0.9401473019690365, "flos": 16982767238400.0, "grad_norm": 2.177328379740788, "language_loss": 0.82646513, "learning_rate": 3.7424545017890054e-08, "loss": 0.84778643, "num_input_tokens_seen": 337220435, "step": 15637, "time_per_iteration": 4.119058132171631 }, { "auxiliary_loss_clip": 0.01065361, "auxiliary_loss_mlp": 0.01033669, "balance_loss_clip": 1.03724432, "balance_loss_mlp": 1.02082443, "epoch": 0.9402074252217045, "flos": 19681956702720.0, "grad_norm": 2.151104061404543, "language_loss": 0.6892854, "learning_rate": 3.7349592501292325e-08, "loss": 0.71027565, "num_input_tokens_seen": 337238095, "step": 15638, "time_per_iteration": 2.720820426940918 }, { "auxiliary_loss_clip": 0.01093316, "auxiliary_loss_mlp": 0.01038281, "balance_loss_clip": 1.03545761, "balance_loss_mlp": 1.02702212, "epoch": 0.9402675484743724, "flos": 24754302858240.0, "grad_norm": 1.6914013320453911, "language_loss": 0.84974968, "learning_rate": 3.727471440859498e-08, "loss": 0.87106568, "num_input_tokens_seen": 337256645, "step": 15639, "time_per_iteration": 5.851804733276367 }, { "auxiliary_loss_clip": 0.01083189, "auxiliary_loss_mlp": 0.00770067, "balance_loss_clip": 1.03247952, "balance_loss_mlp": 1.00016117, "epoch": 0.9403276717270405, "flos": 25558630156800.0, "grad_norm": 1.7768265457850463, "language_loss": 0.78339088, "learning_rate": 3.719991074263662e-08, "loss": 0.80192345, "num_input_tokens_seen": 337278360, "step": 15640, "time_per_iteration": 2.7363038063049316 }, { "auxiliary_loss_clip": 0.01100045, "auxiliary_loss_mlp": 0.0103256, "balance_loss_clip": 1.03647268, "balance_loss_mlp": 1.0201323, "epoch": 0.9403877949797084, "flos": 26689852154880.0, "grad_norm": 1.8593795246940288, "language_loss": 0.74102533, "learning_rate": 3.7125181506254544e-08, "loss": 0.76235145, "num_input_tokens_seen": 337302480, "step": 15641, "time_per_iteration": 2.7518787384033203 }, { "auxiliary_loss_clip": 0.0110061, "auxiliary_loss_mlp": 0.01034479, "balance_loss_clip": 1.03686595, "balance_loss_mlp": 1.01987553, "epoch": 0.9404479182323764, "flos": 15011666455680.0, "grad_norm": 2.217224537042475, "language_loss": 0.8267206, "learning_rate": 3.7050526702282256e-08, "loss": 0.84807152, "num_input_tokens_seen": 337316600, "step": 15642, "time_per_iteration": 4.0844972133636475 }, { "auxiliary_loss_clip": 0.01090346, "auxiliary_loss_mlp": 0.01030611, "balance_loss_clip": 1.03500628, "balance_loss_mlp": 1.01894593, "epoch": 0.9405080414850443, "flos": 24973573432320.0, "grad_norm": 1.8062008321344256, "language_loss": 0.68693364, "learning_rate": 3.697594633355084e-08, "loss": 0.70814323, "num_input_tokens_seen": 337336895, "step": 15643, "time_per_iteration": 2.57680344581604 }, { "auxiliary_loss_clip": 0.01098869, "auxiliary_loss_mlp": 0.01036038, "balance_loss_clip": 1.03659177, "balance_loss_mlp": 1.02258563, "epoch": 0.9405681647377123, "flos": 20844743777280.0, "grad_norm": 1.933266647542814, "language_loss": 0.76611924, "learning_rate": 3.6901440402888226e-08, "loss": 0.78746843, "num_input_tokens_seen": 337355105, "step": 15644, "time_per_iteration": 2.573357343673706 }, { "auxiliary_loss_clip": 0.01090012, "auxiliary_loss_mlp": 0.0103297, "balance_loss_clip": 1.03489494, "balance_loss_mlp": 1.02147841, "epoch": 0.9406282879903802, "flos": 23805578885760.0, "grad_norm": 1.6178233820471488, "language_loss": 0.67622656, "learning_rate": 3.682700891311974e-08, "loss": 0.69745636, "num_input_tokens_seen": 337374905, "step": 15645, "time_per_iteration": 2.615952730178833 }, { "auxiliary_loss_clip": 0.01077394, "auxiliary_loss_mlp": 0.00769887, "balance_loss_clip": 1.03552616, "balance_loss_mlp": 1.00019598, "epoch": 0.9406884112430483, "flos": 27674953626240.0, "grad_norm": 1.3954115728125809, "language_loss": 0.70446187, "learning_rate": 3.6752651867067774e-08, "loss": 0.72293472, "num_input_tokens_seen": 337397130, "step": 15646, "time_per_iteration": 2.6904642581939697 }, { "auxiliary_loss_clip": 0.01090467, "auxiliary_loss_mlp": 0.01031509, "balance_loss_clip": 1.03259134, "balance_loss_mlp": 1.0194633, "epoch": 0.9407485344957163, "flos": 23075048079360.0, "grad_norm": 1.4996623163528855, "language_loss": 0.74028134, "learning_rate": 3.667836926755208e-08, "loss": 0.76150107, "num_input_tokens_seen": 337418660, "step": 15647, "time_per_iteration": 2.6018729209899902 }, { "auxiliary_loss_clip": 0.01010109, "auxiliary_loss_mlp": 0.01000406, "balance_loss_clip": 1.00723958, "balance_loss_mlp": 0.99945861, "epoch": 0.9408086577483842, "flos": 71014034304000.0, "grad_norm": 0.8881598455052471, "language_loss": 0.63527632, "learning_rate": 3.660416111738907e-08, "loss": 0.65538144, "num_input_tokens_seen": 337478055, "step": 15648, "time_per_iteration": 3.2934350967407227 }, { "auxiliary_loss_clip": 0.01104482, "auxiliary_loss_mlp": 0.01034208, "balance_loss_clip": 1.03579104, "balance_loss_mlp": 1.02340806, "epoch": 0.9408687810010522, "flos": 23730956380800.0, "grad_norm": 1.5536213392749576, "language_loss": 0.66520309, "learning_rate": 3.653002741939337e-08, "loss": 0.68659002, "num_input_tokens_seen": 337499405, "step": 15649, "time_per_iteration": 2.553529739379883 }, { "auxiliary_loss_clip": 0.01075375, "auxiliary_loss_mlp": 0.01026505, "balance_loss_clip": 1.0331924, "balance_loss_mlp": 1.01497114, "epoch": 0.9409289042537201, "flos": 18369314087040.0, "grad_norm": 4.521362372265656, "language_loss": 0.77431417, "learning_rate": 3.645596817637586e-08, "loss": 0.79533303, "num_input_tokens_seen": 337517195, "step": 15650, "time_per_iteration": 2.665523052215576 }, { "auxiliary_loss_clip": 0.01064771, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.03697872, "balance_loss_mlp": 1.0203402, "epoch": 0.9409890275063881, "flos": 23878333883520.0, "grad_norm": 2.2066131550931942, "language_loss": 0.74314982, "learning_rate": 3.638198339114451e-08, "loss": 0.76412213, "num_input_tokens_seen": 337535245, "step": 15651, "time_per_iteration": 2.790637969970703 }, { "auxiliary_loss_clip": 0.0110668, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.03554559, "balance_loss_mlp": 1.01934731, "epoch": 0.941049150759056, "flos": 16545088016640.0, "grad_norm": 1.7324074128675258, "language_loss": 0.72721291, "learning_rate": 3.630807306650507e-08, "loss": 0.74860054, "num_input_tokens_seen": 337553040, "step": 15652, "time_per_iteration": 2.5541346073150635 }, { "auxiliary_loss_clip": 0.01073797, "auxiliary_loss_mlp": 0.01037686, "balance_loss_clip": 1.03517735, "balance_loss_mlp": 1.02407205, "epoch": 0.9411092740117241, "flos": 25118401069440.0, "grad_norm": 1.8143834266468624, "language_loss": 0.66641271, "learning_rate": 3.6234237205260645e-08, "loss": 0.68752754, "num_input_tokens_seen": 337574580, "step": 15653, "time_per_iteration": 2.7330899238586426 }, { "auxiliary_loss_clip": 0.01109084, "auxiliary_loss_mlp": 0.01035177, "balance_loss_clip": 1.03644657, "balance_loss_mlp": 1.02239227, "epoch": 0.941169397264392, "flos": 21142264129920.0, "grad_norm": 1.9183010885495058, "language_loss": 0.77979028, "learning_rate": 3.6160475810210536e-08, "loss": 0.80123287, "num_input_tokens_seen": 337593010, "step": 15654, "time_per_iteration": 2.5508615970611572 }, { "auxiliary_loss_clip": 0.01104499, "auxiliary_loss_mlp": 0.01029373, "balance_loss_clip": 1.03763437, "balance_loss_mlp": 1.01693344, "epoch": 0.94122952051706, "flos": 38508914995200.0, "grad_norm": 1.5713777366197268, "language_loss": 0.69984704, "learning_rate": 3.6086788884152065e-08, "loss": 0.7211858, "num_input_tokens_seen": 337616170, "step": 15655, "time_per_iteration": 2.7416152954101562 }, { "auxiliary_loss_clip": 0.01107647, "auxiliary_loss_mlp": 0.01036833, "balance_loss_clip": 1.03607106, "balance_loss_mlp": 1.02323759, "epoch": 0.9412896437697279, "flos": 18369206346240.0, "grad_norm": 5.7482907771024045, "language_loss": 0.72394556, "learning_rate": 3.601317642987944e-08, "loss": 0.74539036, "num_input_tokens_seen": 337635215, "step": 15656, "time_per_iteration": 2.569613456726074 }, { "auxiliary_loss_clip": 0.01074485, "auxiliary_loss_mlp": 0.0102962, "balance_loss_clip": 1.03419089, "balance_loss_mlp": 1.01772296, "epoch": 0.9413497670223959, "flos": 25884950238720.0, "grad_norm": 1.8612272314279366, "language_loss": 0.78241754, "learning_rate": 3.593963845018377e-08, "loss": 0.80345851, "num_input_tokens_seen": 337654195, "step": 15657, "time_per_iteration": 2.6432650089263916 }, { "auxiliary_loss_clip": 0.01072209, "auxiliary_loss_mlp": 0.01029018, "balance_loss_clip": 1.03471482, "balance_loss_mlp": 1.01653671, "epoch": 0.9414098902750638, "flos": 16618309891200.0, "grad_norm": 2.5566622926725193, "language_loss": 0.84468395, "learning_rate": 3.586617494785371e-08, "loss": 0.86569619, "num_input_tokens_seen": 337671810, "step": 15658, "time_per_iteration": 2.6943564414978027 }, { "auxiliary_loss_clip": 0.01112714, "auxiliary_loss_mlp": 0.01033078, "balance_loss_clip": 1.03760839, "balance_loss_mlp": 1.01849866, "epoch": 0.9414700135277319, "flos": 18625033987200.0, "grad_norm": 2.5090872722582627, "language_loss": 0.70395422, "learning_rate": 3.5792785925675254e-08, "loss": 0.72541213, "num_input_tokens_seen": 337689410, "step": 15659, "time_per_iteration": 2.537353038787842 }, { "auxiliary_loss_clip": 0.01079214, "auxiliary_loss_mlp": 0.01039877, "balance_loss_clip": 1.03404224, "balance_loss_mlp": 1.02849793, "epoch": 0.9415301367803999, "flos": 26280146649600.0, "grad_norm": 1.7562040343891887, "language_loss": 0.79511106, "learning_rate": 3.571947138643172e-08, "loss": 0.81630188, "num_input_tokens_seen": 337709950, "step": 15660, "time_per_iteration": 2.7002146244049072 }, { "auxiliary_loss_clip": 0.01071861, "auxiliary_loss_mlp": 0.01028763, "balance_loss_clip": 1.03252554, "balance_loss_mlp": 1.0167948, "epoch": 0.9415902600330678, "flos": 23261388860160.0, "grad_norm": 1.4022153462876712, "language_loss": 0.67921788, "learning_rate": 3.564623133290201e-08, "loss": 0.70022404, "num_input_tokens_seen": 337731320, "step": 15661, "time_per_iteration": 2.755877733230591 }, { "auxiliary_loss_clip": 0.01092284, "auxiliary_loss_mlp": 0.01031361, "balance_loss_clip": 1.03276324, "balance_loss_mlp": 1.01883173, "epoch": 0.9416503832857358, "flos": 14719138093440.0, "grad_norm": 2.2599485934603725, "language_loss": 0.66300029, "learning_rate": 3.557306576786434e-08, "loss": 0.68423676, "num_input_tokens_seen": 337747720, "step": 15662, "time_per_iteration": 2.5741324424743652 }, { "auxiliary_loss_clip": 0.01009662, "auxiliary_loss_mlp": 0.01000042, "balance_loss_clip": 1.00710607, "balance_loss_mlp": 0.99910659, "epoch": 0.9417105065384037, "flos": 70312698276480.0, "grad_norm": 0.7619674820211261, "language_loss": 0.59235966, "learning_rate": 3.5499974694092935e-08, "loss": 0.61245668, "num_input_tokens_seen": 337806930, "step": 15663, "time_per_iteration": 3.3059024810791016 }, { "auxiliary_loss_clip": 0.01103713, "auxiliary_loss_mlp": 0.01035518, "balance_loss_clip": 1.03747571, "balance_loss_mlp": 1.02217829, "epoch": 0.9417706297910717, "flos": 34057895322240.0, "grad_norm": 4.1621354717950885, "language_loss": 0.66886747, "learning_rate": 3.542695811435914e-08, "loss": 0.69025975, "num_input_tokens_seen": 337828100, "step": 15664, "time_per_iteration": 2.7219324111938477 }, { "auxiliary_loss_clip": 0.01083442, "auxiliary_loss_mlp": 0.01030443, "balance_loss_clip": 1.03674304, "balance_loss_mlp": 1.01874244, "epoch": 0.9418307530437396, "flos": 16471614746880.0, "grad_norm": 2.258809019140803, "language_loss": 0.73858142, "learning_rate": 3.535401603143207e-08, "loss": 0.75972033, "num_input_tokens_seen": 337844805, "step": 15665, "time_per_iteration": 2.6257636547088623 }, { "auxiliary_loss_clip": 0.01105775, "auxiliary_loss_mlp": 0.01032956, "balance_loss_clip": 1.03694832, "balance_loss_mlp": 1.02096939, "epoch": 0.9418908762964077, "flos": 11253543114240.0, "grad_norm": 3.0049644569052907, "language_loss": 0.63581872, "learning_rate": 3.528114844807773e-08, "loss": 0.65720612, "num_input_tokens_seen": 337860490, "step": 15666, "time_per_iteration": 2.5537686347961426 }, { "auxiliary_loss_clip": 0.01072039, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.0352211, "balance_loss_mlp": 1.01712298, "epoch": 0.9419509995490756, "flos": 18438836860800.0, "grad_norm": 1.6687010232077268, "language_loss": 0.78841943, "learning_rate": 3.520835536705902e-08, "loss": 0.80943358, "num_input_tokens_seen": 337878360, "step": 15667, "time_per_iteration": 2.66939377784729 }, { "auxiliary_loss_clip": 0.01105116, "auxiliary_loss_mlp": 0.01027413, "balance_loss_clip": 1.03544164, "balance_loss_mlp": 1.01629639, "epoch": 0.9420111228017436, "flos": 20737945664640.0, "grad_norm": 1.8566898819332656, "language_loss": 0.75282031, "learning_rate": 3.5135636791136404e-08, "loss": 0.7741456, "num_input_tokens_seen": 337895635, "step": 15668, "time_per_iteration": 2.5508882999420166 }, { "auxiliary_loss_clip": 0.0105425, "auxiliary_loss_mlp": 0.01029631, "balance_loss_clip": 1.03423977, "balance_loss_mlp": 1.01744199, "epoch": 0.9420712460544115, "flos": 21141940907520.0, "grad_norm": 2.159724886055292, "language_loss": 0.59023595, "learning_rate": 3.506299272306723e-08, "loss": 0.61107475, "num_input_tokens_seen": 337913940, "step": 15669, "time_per_iteration": 2.73180890083313 }, { "auxiliary_loss_clip": 0.01067029, "auxiliary_loss_mlp": 0.01027243, "balance_loss_clip": 1.03234708, "balance_loss_mlp": 1.01523852, "epoch": 0.9421313693070795, "flos": 15851760721920.0, "grad_norm": 1.528198079062627, "language_loss": 0.77025855, "learning_rate": 3.4990423165606406e-08, "loss": 0.79120123, "num_input_tokens_seen": 337932015, "step": 15670, "time_per_iteration": 2.69807767868042 }, { "auxiliary_loss_clip": 0.01109553, "auxiliary_loss_mlp": 0.01036081, "balance_loss_clip": 1.03725696, "balance_loss_mlp": 1.02321219, "epoch": 0.9421914925597474, "flos": 32415915882240.0, "grad_norm": 1.8154793470935222, "language_loss": 0.65174937, "learning_rate": 3.491792812150574e-08, "loss": 0.67320567, "num_input_tokens_seen": 337953345, "step": 15671, "time_per_iteration": 2.7444138526916504 }, { "auxiliary_loss_clip": 0.01082811, "auxiliary_loss_mlp": 0.01033924, "balance_loss_clip": 1.03383374, "balance_loss_mlp": 1.02096009, "epoch": 0.9422516158124155, "flos": 19718513769600.0, "grad_norm": 1.558684648583432, "language_loss": 0.79916745, "learning_rate": 3.48455075935139e-08, "loss": 0.82033479, "num_input_tokens_seen": 337973685, "step": 15672, "time_per_iteration": 2.803809881210327 }, { "auxiliary_loss_clip": 0.01075344, "auxiliary_loss_mlp": 0.01036959, "balance_loss_clip": 1.03470707, "balance_loss_mlp": 1.02285063, "epoch": 0.9423117390650835, "flos": 16253277926400.0, "grad_norm": 1.9824694157705243, "language_loss": 0.73236197, "learning_rate": 3.47731615843776e-08, "loss": 0.75348502, "num_input_tokens_seen": 337989175, "step": 15673, "time_per_iteration": 2.755509614944458 }, { "auxiliary_loss_clip": 0.01091118, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.03414345, "balance_loss_mlp": 1.01794672, "epoch": 0.9423718623177514, "flos": 31796564647680.0, "grad_norm": 1.4558092155999423, "language_loss": 0.70178533, "learning_rate": 3.470089009683974e-08, "loss": 0.72300291, "num_input_tokens_seen": 338011800, "step": 15674, "time_per_iteration": 2.695003032684326 }, { "auxiliary_loss_clip": 0.01107385, "auxiliary_loss_mlp": 0.01025942, "balance_loss_clip": 1.03574955, "balance_loss_mlp": 1.01402664, "epoch": 0.9424319855704194, "flos": 23331809473920.0, "grad_norm": 1.9582607226770616, "language_loss": 0.81163412, "learning_rate": 3.462869313364125e-08, "loss": 0.8329674, "num_input_tokens_seen": 338032120, "step": 15675, "time_per_iteration": 2.6292521953582764 }, { "auxiliary_loss_clip": 0.01081718, "auxiliary_loss_mlp": 0.01031642, "balance_loss_clip": 1.03442502, "balance_loss_mlp": 1.01966715, "epoch": 0.9424921088230873, "flos": 20777627214720.0, "grad_norm": 1.7260765463945456, "language_loss": 0.62643492, "learning_rate": 3.4556570697519494e-08, "loss": 0.64756858, "num_input_tokens_seen": 338051880, "step": 15676, "time_per_iteration": 4.179499387741089 }, { "auxiliary_loss_clip": 0.01092941, "auxiliary_loss_mlp": 0.01038222, "balance_loss_clip": 1.03998232, "balance_loss_mlp": 1.02615166, "epoch": 0.9425522320757553, "flos": 19026658932480.0, "grad_norm": 1.777162834544334, "language_loss": 0.67122662, "learning_rate": 3.448452279120984e-08, "loss": 0.69253826, "num_input_tokens_seen": 338069665, "step": 15677, "time_per_iteration": 2.6239006519317627 }, { "auxiliary_loss_clip": 0.01072255, "auxiliary_loss_mlp": 0.01035548, "balance_loss_clip": 1.03186798, "balance_loss_mlp": 1.02190459, "epoch": 0.9426123553284232, "flos": 25155353185920.0, "grad_norm": 2.176683290780186, "language_loss": 0.641137, "learning_rate": 3.441254941744387e-08, "loss": 0.66221505, "num_input_tokens_seen": 338090490, "step": 15678, "time_per_iteration": 4.263075113296509 }, { "auxiliary_loss_clip": 0.01082508, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.04040313, "balance_loss_mlp": 1.01706934, "epoch": 0.9426724785810913, "flos": 21179359900800.0, "grad_norm": 1.4630832933179898, "language_loss": 0.74250793, "learning_rate": 3.434065057895097e-08, "loss": 0.76362252, "num_input_tokens_seen": 338109825, "step": 15679, "time_per_iteration": 4.329301357269287 }, { "auxiliary_loss_clip": 0.01089711, "auxiliary_loss_mlp": 0.01034379, "balance_loss_clip": 1.0365119, "balance_loss_mlp": 1.02231526, "epoch": 0.9427326018337592, "flos": 14756916222720.0, "grad_norm": 3.717209623940925, "language_loss": 0.77565658, "learning_rate": 3.426882627845762e-08, "loss": 0.79689747, "num_input_tokens_seen": 338125790, "step": 15680, "time_per_iteration": 2.6704599857330322 }, { "auxiliary_loss_clip": 0.01097961, "auxiliary_loss_mlp": 0.01033168, "balance_loss_clip": 1.0371449, "balance_loss_mlp": 1.02055609, "epoch": 0.9427927250864272, "flos": 20923640000640.0, "grad_norm": 2.190384071517057, "language_loss": 0.75626266, "learning_rate": 3.419707651868742e-08, "loss": 0.77757394, "num_input_tokens_seen": 338145610, "step": 15681, "time_per_iteration": 2.6899359226226807 }, { "auxiliary_loss_clip": 0.01082824, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.03667855, "balance_loss_mlp": 1.02248073, "epoch": 0.9428528483390951, "flos": 19752520970880.0, "grad_norm": 2.3199120236961144, "language_loss": 0.65754902, "learning_rate": 3.412540130236086e-08, "loss": 0.6787324, "num_input_tokens_seen": 338165960, "step": 15682, "time_per_iteration": 4.124305963516235 }, { "auxiliary_loss_clip": 0.01071222, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.03226089, "balance_loss_mlp": 1.01655436, "epoch": 0.9429129715917631, "flos": 24534996370560.0, "grad_norm": 3.097159468574502, "language_loss": 0.76566684, "learning_rate": 3.405380063219665e-08, "loss": 0.78666705, "num_input_tokens_seen": 338187215, "step": 15683, "time_per_iteration": 2.71305775642395 }, { "auxiliary_loss_clip": 0.01100547, "auxiliary_loss_mlp": 0.01040046, "balance_loss_clip": 1.03645873, "balance_loss_mlp": 1.02684927, "epoch": 0.942973094844431, "flos": 17959824063360.0, "grad_norm": 2.6265213886695826, "language_loss": 0.75407404, "learning_rate": 3.398227451090885e-08, "loss": 0.77547991, "num_input_tokens_seen": 338201825, "step": 15684, "time_per_iteration": 2.6331522464752197 }, { "auxiliary_loss_clip": 0.01104685, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.03484631, "balance_loss_mlp": 1.01599407, "epoch": 0.9430332180970991, "flos": 26137689310080.0, "grad_norm": 1.6361176573488942, "language_loss": 0.77129638, "learning_rate": 3.391082294121017e-08, "loss": 0.79261756, "num_input_tokens_seen": 338220865, "step": 15685, "time_per_iteration": 2.7566094398498535 }, { "auxiliary_loss_clip": 0.01092602, "auxiliary_loss_mlp": 0.01030988, "balance_loss_clip": 1.03414559, "balance_loss_mlp": 1.01951969, "epoch": 0.943093341349767, "flos": 23951376190080.0, "grad_norm": 2.1272798688132775, "language_loss": 0.75766367, "learning_rate": 3.383944592581023e-08, "loss": 0.77889955, "num_input_tokens_seen": 338240160, "step": 15686, "time_per_iteration": 2.6965436935424805 }, { "auxiliary_loss_clip": 0.01097717, "auxiliary_loss_mlp": 0.0103222, "balance_loss_clip": 1.03482318, "balance_loss_mlp": 1.01981652, "epoch": 0.943153464602435, "flos": 17968407413760.0, "grad_norm": 1.7020578922272096, "language_loss": 0.80628002, "learning_rate": 3.376814346741575e-08, "loss": 0.82757938, "num_input_tokens_seen": 338259305, "step": 15687, "time_per_iteration": 2.5866737365722656 }, { "auxiliary_loss_clip": 0.01089927, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.03616667, "balance_loss_mlp": 1.02021337, "epoch": 0.943213587855103, "flos": 14501519544960.0, "grad_norm": 2.167672264682041, "language_loss": 0.75638962, "learning_rate": 3.369691556873011e-08, "loss": 0.77763325, "num_input_tokens_seen": 338274950, "step": 15688, "time_per_iteration": 2.6230926513671875 }, { "auxiliary_loss_clip": 0.01078255, "auxiliary_loss_mlp": 0.01026704, "balance_loss_clip": 1.03318596, "balance_loss_mlp": 1.01392508, "epoch": 0.9432737111077709, "flos": 28986411093120.0, "grad_norm": 1.671527451823547, "language_loss": 0.68622327, "learning_rate": 3.3625762232454504e-08, "loss": 0.70727283, "num_input_tokens_seen": 338295585, "step": 15689, "time_per_iteration": 2.7073707580566406 }, { "auxiliary_loss_clip": 0.01094693, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.03498852, "balance_loss_mlp": 1.0225811, "epoch": 0.9433338343604389, "flos": 21609066303360.0, "grad_norm": 1.766005404007046, "language_loss": 0.80373913, "learning_rate": 3.35546834612872e-08, "loss": 0.82502228, "num_input_tokens_seen": 338314555, "step": 15690, "time_per_iteration": 2.5873029232025146 }, { "auxiliary_loss_clip": 0.0109645, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.03644657, "balance_loss_mlp": 1.0193367, "epoch": 0.9433939576131068, "flos": 33182285483520.0, "grad_norm": 2.148376299443603, "language_loss": 0.59993267, "learning_rate": 3.348367925792317e-08, "loss": 0.62121278, "num_input_tokens_seen": 338336260, "step": 15691, "time_per_iteration": 2.7108116149902344 }, { "auxiliary_loss_clip": 0.01070974, "auxiliary_loss_mlp": 0.01032195, "balance_loss_clip": 1.03521907, "balance_loss_mlp": 1.01911151, "epoch": 0.9434540808657749, "flos": 20486391742080.0, "grad_norm": 2.2447371927481545, "language_loss": 0.66576785, "learning_rate": 3.341274962505514e-08, "loss": 0.68679953, "num_input_tokens_seen": 338354680, "step": 15692, "time_per_iteration": 2.6925716400146484 }, { "auxiliary_loss_clip": 0.01093305, "auxiliary_loss_mlp": 0.01032018, "balance_loss_clip": 1.03605986, "balance_loss_mlp": 1.01980531, "epoch": 0.9435142041184428, "flos": 21542955321600.0, "grad_norm": 2.467286667437946, "language_loss": 0.74455351, "learning_rate": 3.334189456537251e-08, "loss": 0.76580673, "num_input_tokens_seen": 338372490, "step": 15693, "time_per_iteration": 2.6023404598236084 }, { "auxiliary_loss_clip": 0.01074066, "auxiliary_loss_mlp": 0.01035078, "balance_loss_clip": 1.03401875, "balance_loss_mlp": 1.0216012, "epoch": 0.9435743273711108, "flos": 25009089004800.0, "grad_norm": 3.213380675885908, "language_loss": 0.73401213, "learning_rate": 3.327111408156291e-08, "loss": 0.75510359, "num_input_tokens_seen": 338390870, "step": 15694, "time_per_iteration": 2.695995569229126 }, { "auxiliary_loss_clip": 0.00992652, "auxiliary_loss_mlp": 0.01001259, "balance_loss_clip": 1.00752378, "balance_loss_mlp": 1.00031126, "epoch": 0.9436344506237787, "flos": 60158707320960.0, "grad_norm": 0.6938858298712827, "language_loss": 0.50570488, "learning_rate": 3.3200408176309316e-08, "loss": 0.52564394, "num_input_tokens_seen": 338453075, "step": 15695, "time_per_iteration": 3.2824831008911133 }, { "auxiliary_loss_clip": 0.01078605, "auxiliary_loss_mlp": 0.01034023, "balance_loss_clip": 1.03206229, "balance_loss_mlp": 1.02225113, "epoch": 0.9436945738764467, "flos": 22237252283520.0, "grad_norm": 1.8024183486894638, "language_loss": 0.65296769, "learning_rate": 3.312977685229335e-08, "loss": 0.67409396, "num_input_tokens_seen": 338471770, "step": 15696, "time_per_iteration": 2.7027387619018555 }, { "auxiliary_loss_clip": 0.01097587, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.03637338, "balance_loss_mlp": 1.01574719, "epoch": 0.9437546971291146, "flos": 25045179194880.0, "grad_norm": 1.5868519209040974, "language_loss": 0.65894949, "learning_rate": 3.305922011219353e-08, "loss": 0.68020189, "num_input_tokens_seen": 338492190, "step": 15697, "time_per_iteration": 2.696575880050659 }, { "auxiliary_loss_clip": 0.00999497, "auxiliary_loss_mlp": 0.01001147, "balance_loss_clip": 1.00481725, "balance_loss_mlp": 1.0002768, "epoch": 0.9438148203817827, "flos": 56790788400000.0, "grad_norm": 0.8460520685296222, "language_loss": 0.63194656, "learning_rate": 3.298873795868506e-08, "loss": 0.6519531, "num_input_tokens_seen": 338552560, "step": 15698, "time_per_iteration": 3.1992437839508057 }, { "auxiliary_loss_clip": 0.01088557, "auxiliary_loss_mlp": 0.01040163, "balance_loss_clip": 1.03655159, "balance_loss_mlp": 1.02691269, "epoch": 0.9438749436344506, "flos": 22346384780160.0, "grad_norm": 1.744031032402157, "language_loss": 0.69575948, "learning_rate": 3.291833039444092e-08, "loss": 0.71704668, "num_input_tokens_seen": 338571770, "step": 15699, "time_per_iteration": 2.71105694770813 }, { "auxiliary_loss_clip": 0.01069184, "auxiliary_loss_mlp": 0.01031098, "balance_loss_clip": 1.03235722, "balance_loss_mlp": 1.01913548, "epoch": 0.9439350668871186, "flos": 13370800337280.0, "grad_norm": 2.0518256803371444, "language_loss": 0.74715513, "learning_rate": 3.2847997422130734e-08, "loss": 0.76815796, "num_input_tokens_seen": 338587310, "step": 15700, "time_per_iteration": 2.7928857803344727 }, { "auxiliary_loss_clip": 0.01031212, "auxiliary_loss_mlp": 0.01031676, "balance_loss_clip": 1.02990246, "balance_loss_mlp": 1.02022016, "epoch": 0.9439951901397866, "flos": 17785334770560.0, "grad_norm": 1.7345550747234506, "language_loss": 0.70444047, "learning_rate": 3.2777739044421495e-08, "loss": 0.7250694, "num_input_tokens_seen": 338606235, "step": 15701, "time_per_iteration": 2.956749200820923 }, { "auxiliary_loss_clip": 0.01067175, "auxiliary_loss_mlp": 0.01028974, "balance_loss_clip": 1.03305924, "balance_loss_mlp": 1.01637959, "epoch": 0.9440553133924545, "flos": 18879568738560.0, "grad_norm": 6.919178162697029, "language_loss": 0.77767622, "learning_rate": 3.2707555263977505e-08, "loss": 0.79863775, "num_input_tokens_seen": 338624090, "step": 15702, "time_per_iteration": 3.149764060974121 }, { "auxiliary_loss_clip": 0.01093668, "auxiliary_loss_mlp": 0.01043554, "balance_loss_clip": 1.03391433, "balance_loss_mlp": 1.03058994, "epoch": 0.9441154366451225, "flos": 19572967860480.0, "grad_norm": 2.1610357777231397, "language_loss": 0.66376126, "learning_rate": 3.2637446083460194e-08, "loss": 0.68513346, "num_input_tokens_seen": 338643695, "step": 15703, "time_per_iteration": 2.5990066528320312 }, { "auxiliary_loss_clip": 0.01099113, "auxiliary_loss_mlp": 0.01029886, "balance_loss_clip": 1.03849339, "balance_loss_mlp": 1.01657009, "epoch": 0.9441755598977905, "flos": 30294995472000.0, "grad_norm": 6.247002123537392, "language_loss": 0.73099834, "learning_rate": 3.256741150552833e-08, "loss": 0.75228834, "num_input_tokens_seen": 338664725, "step": 15704, "time_per_iteration": 2.649864673614502 }, { "auxiliary_loss_clip": 0.01094284, "auxiliary_loss_mlp": 0.0103236, "balance_loss_clip": 1.03578568, "balance_loss_mlp": 1.01978898, "epoch": 0.9442356831504585, "flos": 20667884186880.0, "grad_norm": 1.839574518559296, "language_loss": 0.74311668, "learning_rate": 3.2497451532837336e-08, "loss": 0.76438308, "num_input_tokens_seen": 338683990, "step": 15705, "time_per_iteration": 2.611238956451416 }, { "auxiliary_loss_clip": 0.01087617, "auxiliary_loss_mlp": 0.01034408, "balance_loss_clip": 1.03792405, "balance_loss_mlp": 1.02303529, "epoch": 0.9442958064031264, "flos": 16107265140480.0, "grad_norm": 1.8860922128318132, "language_loss": 0.76915097, "learning_rate": 3.2427566168039986e-08, "loss": 0.79037118, "num_input_tokens_seen": 338702025, "step": 15706, "time_per_iteration": 2.651951313018799 }, { "auxiliary_loss_clip": 0.01091977, "auxiliary_loss_mlp": 0.01029566, "balance_loss_clip": 1.03399932, "balance_loss_mlp": 1.01796126, "epoch": 0.9443559296557944, "flos": 20447392550400.0, "grad_norm": 1.4620649428574009, "language_loss": 0.69324106, "learning_rate": 3.23577554137866e-08, "loss": 0.7144565, "num_input_tokens_seen": 338720920, "step": 15707, "time_per_iteration": 2.674379825592041 }, { "auxiliary_loss_clip": 0.0110044, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.0323143, "balance_loss_mlp": 1.01660287, "epoch": 0.9444160529084623, "flos": 21610897896960.0, "grad_norm": 1.6031633884107506, "language_loss": 0.69253683, "learning_rate": 3.22880192727244e-08, "loss": 0.71381414, "num_input_tokens_seen": 338739590, "step": 15708, "time_per_iteration": 2.6171586513519287 }, { "auxiliary_loss_clip": 0.01096213, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.03588486, "balance_loss_mlp": 1.01868427, "epoch": 0.9444761761611303, "flos": 18441781776000.0, "grad_norm": 2.3501834209242305, "language_loss": 0.70614785, "learning_rate": 3.221835774749748e-08, "loss": 0.72741389, "num_input_tokens_seen": 338757240, "step": 15709, "time_per_iteration": 2.5730903148651123 }, { "auxiliary_loss_clip": 0.01067094, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.03753853, "balance_loss_mlp": 1.01969028, "epoch": 0.9445362994137982, "flos": 20957144411520.0, "grad_norm": 2.0328452779578208, "language_loss": 0.84886342, "learning_rate": 3.214877084074774e-08, "loss": 0.86985362, "num_input_tokens_seen": 338773750, "step": 15710, "time_per_iteration": 2.803764581680298 }, { "auxiliary_loss_clip": 0.01086062, "auxiliary_loss_mlp": 0.01033102, "balance_loss_clip": 1.03906393, "balance_loss_mlp": 1.02019763, "epoch": 0.9445964226664663, "flos": 20303283185280.0, "grad_norm": 1.710819363130834, "language_loss": 0.71519732, "learning_rate": 3.2079258555113956e-08, "loss": 0.73638898, "num_input_tokens_seen": 338792115, "step": 15711, "time_per_iteration": 2.786268711090088 }, { "auxiliary_loss_clip": 0.01097144, "auxiliary_loss_mlp": 0.01032234, "balance_loss_clip": 1.03653646, "balance_loss_mlp": 1.01912701, "epoch": 0.9446565459191342, "flos": 26396030903040.0, "grad_norm": 1.6870746080253851, "language_loss": 0.69105422, "learning_rate": 3.200982089323179e-08, "loss": 0.71234798, "num_input_tokens_seen": 338812480, "step": 15712, "time_per_iteration": 2.7278430461883545 }, { "auxiliary_loss_clip": 0.01102036, "auxiliary_loss_mlp": 0.01036765, "balance_loss_clip": 1.03873301, "balance_loss_mlp": 1.02347302, "epoch": 0.9447166691718022, "flos": 16544764794240.0, "grad_norm": 2.4121732835994405, "language_loss": 0.70365906, "learning_rate": 3.1940457857734246e-08, "loss": 0.72504705, "num_input_tokens_seen": 338829105, "step": 15713, "time_per_iteration": 2.6644036769866943 }, { "auxiliary_loss_clip": 0.01083151, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.03377235, "balance_loss_mlp": 1.01964462, "epoch": 0.9447767924244702, "flos": 29164635400320.0, "grad_norm": 1.5448448829872168, "language_loss": 0.7672528, "learning_rate": 3.187116945125212e-08, "loss": 0.7884115, "num_input_tokens_seen": 338850670, "step": 15714, "time_per_iteration": 2.713848114013672 }, { "auxiliary_loss_clip": 0.01083406, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.03877974, "balance_loss_mlp": 1.01808619, "epoch": 0.9448369156771381, "flos": 19274908803840.0, "grad_norm": 3.6710113259545456, "language_loss": 0.67744088, "learning_rate": 3.1801955676412194e-08, "loss": 0.69858289, "num_input_tokens_seen": 338867795, "step": 15715, "time_per_iteration": 4.1955413818359375 }, { "auxiliary_loss_clip": 0.01076435, "auxiliary_loss_mlp": 0.01033314, "balance_loss_clip": 1.03516388, "balance_loss_mlp": 1.02042139, "epoch": 0.9448970389298061, "flos": 23841166285440.0, "grad_norm": 1.6653982996808796, "language_loss": 0.74771553, "learning_rate": 3.173281653583948e-08, "loss": 0.76881307, "num_input_tokens_seen": 338887205, "step": 15716, "time_per_iteration": 2.7072696685791016 }, { "auxiliary_loss_clip": 0.01092174, "auxiliary_loss_mlp": 0.01031218, "balance_loss_clip": 1.03965962, "balance_loss_mlp": 1.01850486, "epoch": 0.944957162182474, "flos": 22382259488640.0, "grad_norm": 2.4365311852184797, "language_loss": 0.62516659, "learning_rate": 3.166375203215565e-08, "loss": 0.64640057, "num_input_tokens_seen": 338906130, "step": 15717, "time_per_iteration": 4.276852369308472 }, { "auxiliary_loss_clip": 0.01094123, "auxiliary_loss_mlp": 0.0103479, "balance_loss_clip": 1.03850865, "balance_loss_mlp": 1.02269626, "epoch": 0.9450172854351421, "flos": 17383889393280.0, "grad_norm": 1.77862512223437, "language_loss": 0.79134482, "learning_rate": 3.1594762167979514e-08, "loss": 0.81263399, "num_input_tokens_seen": 338923045, "step": 15718, "time_per_iteration": 4.204078674316406 }, { "auxiliary_loss_clip": 0.01018497, "auxiliary_loss_mlp": 0.00999465, "balance_loss_clip": 1.00589895, "balance_loss_mlp": 0.99857122, "epoch": 0.94507740868781, "flos": 68466352406400.0, "grad_norm": 0.6985194200865079, "language_loss": 0.57825208, "learning_rate": 3.152584694592719e-08, "loss": 0.59843159, "num_input_tokens_seen": 338987545, "step": 15719, "time_per_iteration": 3.1477670669555664 }, { "auxiliary_loss_clip": 0.0106753, "auxiliary_loss_mlp": 0.00770827, "balance_loss_clip": 1.03413999, "balance_loss_mlp": 1.0002296, "epoch": 0.945137531940478, "flos": 21142479611520.0, "grad_norm": 1.6417560155484736, "language_loss": 0.75850344, "learning_rate": 3.145700636861193e-08, "loss": 0.77688694, "num_input_tokens_seen": 339007830, "step": 15720, "time_per_iteration": 2.7489445209503174 }, { "auxiliary_loss_clip": 0.01092778, "auxiliary_loss_mlp": 0.01027293, "balance_loss_clip": 1.0348984, "balance_loss_mlp": 1.01603997, "epoch": 0.9451976551931459, "flos": 24533918962560.0, "grad_norm": 1.6214864220397953, "language_loss": 0.72730792, "learning_rate": 3.138824043864452e-08, "loss": 0.74850857, "num_input_tokens_seen": 339028980, "step": 15721, "time_per_iteration": 4.25614595413208 }, { "auxiliary_loss_clip": 0.01062633, "auxiliary_loss_mlp": 0.0103276, "balance_loss_clip": 1.03165364, "balance_loss_mlp": 1.01968312, "epoch": 0.9452577784458139, "flos": 23440582834560.0, "grad_norm": 2.1250289522384933, "language_loss": 0.85435033, "learning_rate": 3.131954915863244e-08, "loss": 0.87530422, "num_input_tokens_seen": 339047950, "step": 15722, "time_per_iteration": 2.7739651203155518 }, { "auxiliary_loss_clip": 0.01008256, "auxiliary_loss_mlp": 0.00999124, "balance_loss_clip": 1.00595665, "balance_loss_mlp": 0.99822962, "epoch": 0.9453179016984818, "flos": 52017686449920.0, "grad_norm": 0.8877194495304748, "language_loss": 0.64485419, "learning_rate": 3.125093253118005e-08, "loss": 0.66492796, "num_input_tokens_seen": 339104535, "step": 15723, "time_per_iteration": 3.120633363723755 }, { "auxiliary_loss_clip": 0.01069344, "auxiliary_loss_mlp": 0.01031555, "balance_loss_clip": 1.03639483, "balance_loss_mlp": 1.01878786, "epoch": 0.9453780249511499, "flos": 13473001509120.0, "grad_norm": 2.137858204283182, "language_loss": 0.73015231, "learning_rate": 3.1182390558889715e-08, "loss": 0.75116134, "num_input_tokens_seen": 339122050, "step": 15724, "time_per_iteration": 2.7390730381011963 }, { "auxiliary_loss_clip": 0.01075665, "auxiliary_loss_mlp": 0.01027448, "balance_loss_clip": 1.03523171, "balance_loss_mlp": 1.01556301, "epoch": 0.9454381482038178, "flos": 23258515772160.0, "grad_norm": 2.0036441460727676, "language_loss": 0.84524632, "learning_rate": 3.111392324436024e-08, "loss": 0.8662774, "num_input_tokens_seen": 339138940, "step": 15725, "time_per_iteration": 2.7032201290130615 }, { "auxiliary_loss_clip": 0.01092034, "auxiliary_loss_mlp": 0.01027805, "balance_loss_clip": 1.03934574, "balance_loss_mlp": 1.01518655, "epoch": 0.9454982714564858, "flos": 19496621502720.0, "grad_norm": 1.7112621255845237, "language_loss": 0.71131301, "learning_rate": 3.104553059018822e-08, "loss": 0.7325114, "num_input_tokens_seen": 339158245, "step": 15726, "time_per_iteration": 2.633211135864258 }, { "auxiliary_loss_clip": 0.01083425, "auxiliary_loss_mlp": 0.0103136, "balance_loss_clip": 1.03504848, "balance_loss_mlp": 1.01800275, "epoch": 0.9455583947091538, "flos": 23258120722560.0, "grad_norm": 1.770426483467669, "language_loss": 0.60957873, "learning_rate": 3.097721259896735e-08, "loss": 0.63072664, "num_input_tokens_seen": 339178200, "step": 15727, "time_per_iteration": 2.66964054107666 }, { "auxiliary_loss_clip": 0.01093477, "auxiliary_loss_mlp": 0.01033773, "balance_loss_clip": 1.03381276, "balance_loss_mlp": 1.02250743, "epoch": 0.9456185179618217, "flos": 17673041877120.0, "grad_norm": 1.6947492443167869, "language_loss": 0.81717706, "learning_rate": 3.0908969273287566e-08, "loss": 0.83844954, "num_input_tokens_seen": 339193950, "step": 15728, "time_per_iteration": 2.6493005752563477 }, { "auxiliary_loss_clip": 0.00982318, "auxiliary_loss_mlp": 0.01006045, "balance_loss_clip": 1.00669122, "balance_loss_mlp": 1.00475144, "epoch": 0.9456786412144897, "flos": 61415040389760.0, "grad_norm": 0.7309632127975088, "language_loss": 0.59005105, "learning_rate": 3.08408006157368e-08, "loss": 0.60993469, "num_input_tokens_seen": 339252330, "step": 15729, "time_per_iteration": 3.3251638412475586 }, { "auxiliary_loss_clip": 0.01106055, "auxiliary_loss_mlp": 0.01026459, "balance_loss_clip": 1.03561211, "balance_loss_mlp": 1.01384068, "epoch": 0.9457387644671577, "flos": 18588369179520.0, "grad_norm": 1.8376283241487172, "language_loss": 0.76239592, "learning_rate": 3.077270662890052e-08, "loss": 0.78372103, "num_input_tokens_seen": 339270325, "step": 15730, "time_per_iteration": 2.822908401489258 }, { "auxiliary_loss_clip": 0.01086637, "auxiliary_loss_mlp": 0.01031322, "balance_loss_clip": 1.03977942, "balance_loss_mlp": 1.01842904, "epoch": 0.9457988877198257, "flos": 21108544237440.0, "grad_norm": 1.4741875108902043, "language_loss": 0.6241951, "learning_rate": 3.070468731536047e-08, "loss": 0.64537472, "num_input_tokens_seen": 339291980, "step": 15731, "time_per_iteration": 2.780259370803833 }, { "auxiliary_loss_clip": 0.01098616, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.03492093, "balance_loss_mlp": 1.01464427, "epoch": 0.9458590109724936, "flos": 26688379697280.0, "grad_norm": 1.8389995497060174, "language_loss": 0.63829595, "learning_rate": 3.063674267769589e-08, "loss": 0.65955698, "num_input_tokens_seen": 339311795, "step": 15732, "time_per_iteration": 2.6928884983062744 }, { "auxiliary_loss_clip": 0.01097602, "auxiliary_loss_mlp": 0.01030285, "balance_loss_clip": 1.03819144, "balance_loss_mlp": 1.01691008, "epoch": 0.9459191342251616, "flos": 18661591054080.0, "grad_norm": 2.312355275604837, "language_loss": 0.83734918, "learning_rate": 3.056887271848363e-08, "loss": 0.85862809, "num_input_tokens_seen": 339327745, "step": 15733, "time_per_iteration": 2.573761463165283 }, { "auxiliary_loss_clip": 0.01093698, "auxiliary_loss_mlp": 0.01029334, "balance_loss_clip": 1.03431845, "balance_loss_mlp": 1.01787257, "epoch": 0.9459792574778295, "flos": 23398459159680.0, "grad_norm": 2.0402128352906135, "language_loss": 0.7230435, "learning_rate": 3.0501077440297173e-08, "loss": 0.74427378, "num_input_tokens_seen": 339346445, "step": 15734, "time_per_iteration": 2.6132256984710693 }, { "auxiliary_loss_clip": 0.01092017, "auxiliary_loss_mlp": 0.01030532, "balance_loss_clip": 1.03411341, "balance_loss_mlp": 1.01973712, "epoch": 0.9460393807304975, "flos": 24392969994240.0, "grad_norm": 1.706482735493318, "language_loss": 0.86788249, "learning_rate": 3.043335684570692e-08, "loss": 0.88910794, "num_input_tokens_seen": 339367945, "step": 15735, "time_per_iteration": 2.6257829666137695 }, { "auxiliary_loss_clip": 0.01088315, "auxiliary_loss_mlp": 0.01028057, "balance_loss_clip": 1.0354389, "balance_loss_mlp": 1.0162971, "epoch": 0.9460995039831654, "flos": 21939408708480.0, "grad_norm": 2.0029981426507026, "language_loss": 0.6727972, "learning_rate": 3.036571093728102e-08, "loss": 0.69396096, "num_input_tokens_seen": 339386060, "step": 15736, "time_per_iteration": 2.6414105892181396 }, { "auxiliary_loss_clip": 0.00990794, "auxiliary_loss_mlp": 0.01001581, "balance_loss_clip": 1.01297307, "balance_loss_mlp": 1.00051391, "epoch": 0.9461596272358335, "flos": 70322466775680.0, "grad_norm": 0.8681687595231652, "language_loss": 0.65302682, "learning_rate": 3.029813971758499e-08, "loss": 0.67295063, "num_input_tokens_seen": 339446695, "step": 15737, "time_per_iteration": 3.2643556594848633 }, { "auxiliary_loss_clip": 0.01016522, "auxiliary_loss_mlp": 0.01001018, "balance_loss_clip": 1.00645328, "balance_loss_mlp": 0.99996263, "epoch": 0.9462197504885014, "flos": 58591242645120.0, "grad_norm": 0.8027511027658571, "language_loss": 0.58797008, "learning_rate": 3.0230643189181225e-08, "loss": 0.60814548, "num_input_tokens_seen": 339510080, "step": 15738, "time_per_iteration": 3.1644718647003174 }, { "auxiliary_loss_clip": 0.01093604, "auxiliary_loss_mlp": 0.0103372, "balance_loss_clip": 1.03396416, "balance_loss_mlp": 1.02237749, "epoch": 0.9462798737411694, "flos": 23433759250560.0, "grad_norm": 1.7606066776130818, "language_loss": 0.71841681, "learning_rate": 3.016322135462834e-08, "loss": 0.73969007, "num_input_tokens_seen": 339529335, "step": 15739, "time_per_iteration": 2.6679999828338623 }, { "auxiliary_loss_clip": 0.01093944, "auxiliary_loss_mlp": 0.01027852, "balance_loss_clip": 1.03307033, "balance_loss_mlp": 1.01536524, "epoch": 0.9463399969938374, "flos": 25046077034880.0, "grad_norm": 2.442002689709471, "language_loss": 0.65025353, "learning_rate": 3.009587421648363e-08, "loss": 0.67147148, "num_input_tokens_seen": 339548820, "step": 15740, "time_per_iteration": 2.703686237335205 }, { "auxiliary_loss_clip": 0.01082274, "auxiliary_loss_mlp": 0.01029406, "balance_loss_clip": 1.0356245, "balance_loss_mlp": 1.01749706, "epoch": 0.9464001202465053, "flos": 24352606085760.0, "grad_norm": 1.7330701520859766, "language_loss": 0.66210133, "learning_rate": 3.0028601777301045e-08, "loss": 0.68321818, "num_input_tokens_seen": 339566775, "step": 15741, "time_per_iteration": 2.664438009262085 }, { "auxiliary_loss_clip": 0.01097651, "auxiliary_loss_mlp": 0.01026578, "balance_loss_clip": 1.03633511, "balance_loss_mlp": 1.01461589, "epoch": 0.9464602434991733, "flos": 17165444832000.0, "grad_norm": 1.8465493774504513, "language_loss": 0.76130718, "learning_rate": 2.9961404039630987e-08, "loss": 0.7825495, "num_input_tokens_seen": 339581905, "step": 15742, "time_per_iteration": 2.5938029289245605 }, { "auxiliary_loss_clip": 0.01092873, "auxiliary_loss_mlp": 0.01030901, "balance_loss_clip": 1.0342133, "balance_loss_mlp": 1.01929605, "epoch": 0.9465203667518413, "flos": 19938107566080.0, "grad_norm": 1.7769383818229454, "language_loss": 0.72399694, "learning_rate": 2.989428100602187e-08, "loss": 0.74523461, "num_input_tokens_seen": 339599870, "step": 15743, "time_per_iteration": 2.678401470184326 }, { "auxiliary_loss_clip": 0.01073999, "auxiliary_loss_mlp": 0.01031411, "balance_loss_clip": 1.03740954, "balance_loss_mlp": 1.01843548, "epoch": 0.9465804900045093, "flos": 20120318282880.0, "grad_norm": 1.6693013333008395, "language_loss": 0.79701877, "learning_rate": 2.982723267901943e-08, "loss": 0.81807292, "num_input_tokens_seen": 339620250, "step": 15744, "time_per_iteration": 2.7061126232147217 }, { "auxiliary_loss_clip": 0.01086196, "auxiliary_loss_mlp": 0.01038507, "balance_loss_clip": 1.03539801, "balance_loss_mlp": 1.02565646, "epoch": 0.9466406132571772, "flos": 23911622812800.0, "grad_norm": 1.6306815093715024, "language_loss": 0.77796626, "learning_rate": 2.9760259061165417e-08, "loss": 0.79921329, "num_input_tokens_seen": 339639900, "step": 15745, "time_per_iteration": 2.667794704437256 }, { "auxiliary_loss_clip": 0.01082416, "auxiliary_loss_mlp": 0.01035804, "balance_loss_clip": 1.03260911, "balance_loss_mlp": 1.02258968, "epoch": 0.9467007365098452, "flos": 19933223316480.0, "grad_norm": 1.9540472383542953, "language_loss": 0.70444429, "learning_rate": 2.9693360155000014e-08, "loss": 0.72562647, "num_input_tokens_seen": 339658970, "step": 15746, "time_per_iteration": 2.671787738800049 }, { "auxiliary_loss_clip": 0.01083981, "auxiliary_loss_mlp": 0.01028625, "balance_loss_clip": 1.03787088, "balance_loss_mlp": 1.01583958, "epoch": 0.9467608597625131, "flos": 19310496203520.0, "grad_norm": 2.086321056520881, "language_loss": 0.55439335, "learning_rate": 2.962653596305964e-08, "loss": 0.57551944, "num_input_tokens_seen": 339675600, "step": 15747, "time_per_iteration": 2.6125731468200684 }, { "auxiliary_loss_clip": 0.00971726, "auxiliary_loss_mlp": 0.0100543, "balance_loss_clip": 1.00657761, "balance_loss_mlp": 1.00431538, "epoch": 0.9468209830151811, "flos": 69630252802560.0, "grad_norm": 0.6607022799380584, "language_loss": 0.53227079, "learning_rate": 2.955978648787871e-08, "loss": 0.55204231, "num_input_tokens_seen": 339744505, "step": 15748, "time_per_iteration": 3.6713624000549316 }, { "auxiliary_loss_clip": 0.01087901, "auxiliary_loss_mlp": 0.01036855, "balance_loss_clip": 1.03579473, "balance_loss_mlp": 1.02451098, "epoch": 0.946881106267849, "flos": 27016639113600.0, "grad_norm": 1.6437037457333494, "language_loss": 0.6632542, "learning_rate": 2.9493111731988096e-08, "loss": 0.68450171, "num_input_tokens_seen": 339765810, "step": 15749, "time_per_iteration": 4.178863286972046 }, { "auxiliary_loss_clip": 0.0107672, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.03375602, "balance_loss_mlp": 1.01670504, "epoch": 0.9469412295205171, "flos": 20190092451840.0, "grad_norm": 3.088565110530085, "language_loss": 0.75976688, "learning_rate": 2.942651169791621e-08, "loss": 0.78083825, "num_input_tokens_seen": 339784125, "step": 15750, "time_per_iteration": 2.7121167182922363 }, { "auxiliary_loss_clip": 0.01096615, "auxiliary_loss_mlp": 0.01028166, "balance_loss_clip": 1.0368098, "balance_loss_mlp": 1.01631653, "epoch": 0.947001352773185, "flos": 21324905809920.0, "grad_norm": 1.5838311482694045, "language_loss": 0.67727458, "learning_rate": 2.9359986388188372e-08, "loss": 0.69852245, "num_input_tokens_seen": 339803450, "step": 15751, "time_per_iteration": 2.709989070892334 }, { "auxiliary_loss_clip": 0.01076359, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.03434587, "balance_loss_mlp": 1.01857424, "epoch": 0.947061476025853, "flos": 21944041562880.0, "grad_norm": 1.6112027169393213, "language_loss": 0.65785074, "learning_rate": 2.929353580532723e-08, "loss": 0.6789189, "num_input_tokens_seen": 339823215, "step": 15752, "time_per_iteration": 2.731290102005005 }, { "auxiliary_loss_clip": 0.01092841, "auxiliary_loss_mlp": 0.01035403, "balance_loss_clip": 1.03419137, "balance_loss_mlp": 1.0214498, "epoch": 0.947121599278521, "flos": 21394715892480.0, "grad_norm": 1.9387183547098805, "language_loss": 0.71516705, "learning_rate": 2.9227159951852764e-08, "loss": 0.73644954, "num_input_tokens_seen": 339842230, "step": 15753, "time_per_iteration": 2.6081583499908447 }, { "auxiliary_loss_clip": 0.01109554, "auxiliary_loss_mlp": 0.01032964, "balance_loss_clip": 1.03532398, "balance_loss_mlp": 1.01883757, "epoch": 0.9471817225311889, "flos": 23075730437760.0, "grad_norm": 1.76483043172341, "language_loss": 0.70370275, "learning_rate": 2.9160858830281855e-08, "loss": 0.72512788, "num_input_tokens_seen": 339861640, "step": 15754, "time_per_iteration": 2.580967426300049 }, { "auxiliary_loss_clip": 0.0110967, "auxiliary_loss_mlp": 0.01032839, "balance_loss_clip": 1.03552377, "balance_loss_mlp": 1.02043486, "epoch": 0.947241845783857, "flos": 11910744305280.0, "grad_norm": 3.2827981258328207, "language_loss": 0.78840715, "learning_rate": 2.9094632443129153e-08, "loss": 0.80983222, "num_input_tokens_seen": 339878210, "step": 15755, "time_per_iteration": 4.16628360748291 }, { "auxiliary_loss_clip": 0.01070124, "auxiliary_loss_mlp": 0.0103368, "balance_loss_clip": 1.03388035, "balance_loss_mlp": 1.01844525, "epoch": 0.9473019690365249, "flos": 20740675098240.0, "grad_norm": 2.275356608148397, "language_loss": 0.75449395, "learning_rate": 2.9028480792904876e-08, "loss": 0.77553201, "num_input_tokens_seen": 339894255, "step": 15756, "time_per_iteration": 4.229847431182861 }, { "auxiliary_loss_clip": 0.01083084, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.03162217, "balance_loss_mlp": 1.01807952, "epoch": 0.9473620922891929, "flos": 17639896602240.0, "grad_norm": 2.263017805746966, "language_loss": 0.74833083, "learning_rate": 2.8962403882118347e-08, "loss": 0.76946425, "num_input_tokens_seen": 339912425, "step": 15757, "time_per_iteration": 2.64909291267395 }, { "auxiliary_loss_clip": 0.01089898, "auxiliary_loss_mlp": 0.0103155, "balance_loss_clip": 1.03554904, "balance_loss_mlp": 1.01819229, "epoch": 0.9474222155418608, "flos": 23550002640000.0, "grad_norm": 1.9625522630071812, "language_loss": 0.79462659, "learning_rate": 2.889640171327512e-08, "loss": 0.81584108, "num_input_tokens_seen": 339929635, "step": 15758, "time_per_iteration": 4.308099031448364 }, { "auxiliary_loss_clip": 0.01077085, "auxiliary_loss_mlp": 0.00769854, "balance_loss_clip": 1.03425276, "balance_loss_mlp": 1.00017619, "epoch": 0.9474823387945288, "flos": 27089753247360.0, "grad_norm": 1.3762409201655417, "language_loss": 0.71830899, "learning_rate": 2.8830474288877638e-08, "loss": 0.73677838, "num_input_tokens_seen": 339951200, "step": 15759, "time_per_iteration": 2.7510428428649902 }, { "auxiliary_loss_clip": 0.01091647, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 1.0367434, "balance_loss_mlp": 1.01805592, "epoch": 0.9475424620471967, "flos": 22966526113920.0, "grad_norm": 1.5226325808492376, "language_loss": 0.75499642, "learning_rate": 2.8764621611426344e-08, "loss": 0.77619755, "num_input_tokens_seen": 339971820, "step": 15760, "time_per_iteration": 4.22639799118042 }, { "auxiliary_loss_clip": 0.01107661, "auxiliary_loss_mlp": 0.00769288, "balance_loss_clip": 1.0366137, "balance_loss_mlp": 1.00024486, "epoch": 0.9476025852998647, "flos": 20047671025920.0, "grad_norm": 1.877401984510813, "language_loss": 0.7275269, "learning_rate": 2.8698843683418128e-08, "loss": 0.74629641, "num_input_tokens_seen": 339989420, "step": 15761, "time_per_iteration": 2.6196117401123047 }, { "auxiliary_loss_clip": 0.01086146, "auxiliary_loss_mlp": 0.01036281, "balance_loss_clip": 1.03789508, "balance_loss_mlp": 1.02441955, "epoch": 0.9476627085525327, "flos": 14975468524800.0, "grad_norm": 2.729446835084705, "language_loss": 0.71608138, "learning_rate": 2.863314050734722e-08, "loss": 0.73730564, "num_input_tokens_seen": 340006690, "step": 15762, "time_per_iteration": 2.579223155975342 }, { "auxiliary_loss_clip": 0.01111512, "auxiliary_loss_mlp": 0.01036338, "balance_loss_clip": 1.03547406, "balance_loss_mlp": 1.02280796, "epoch": 0.9477228318052007, "flos": 18697788984960.0, "grad_norm": 2.07051809457296, "language_loss": 0.66850615, "learning_rate": 2.856751208570518e-08, "loss": 0.68998462, "num_input_tokens_seen": 340025480, "step": 15763, "time_per_iteration": 2.5970752239227295 }, { "auxiliary_loss_clip": 0.01107023, "auxiliary_loss_mlp": 0.01036664, "balance_loss_clip": 1.03498352, "balance_loss_mlp": 1.02424252, "epoch": 0.9477829550578686, "flos": 23875065745920.0, "grad_norm": 1.7866390242550823, "language_loss": 0.69783157, "learning_rate": 2.8501958420980466e-08, "loss": 0.71926844, "num_input_tokens_seen": 340043785, "step": 15764, "time_per_iteration": 2.5758285522460938 }, { "auxiliary_loss_clip": 0.01095569, "auxiliary_loss_mlp": 0.00768781, "balance_loss_clip": 1.03836465, "balance_loss_mlp": 1.00017273, "epoch": 0.9478430783105366, "flos": 22562890007040.0, "grad_norm": 1.6268430916699592, "language_loss": 0.71237898, "learning_rate": 2.8436479515659306e-08, "loss": 0.73102248, "num_input_tokens_seen": 340064360, "step": 15765, "time_per_iteration": 2.7962822914123535 }, { "auxiliary_loss_clip": 0.01008188, "auxiliary_loss_mlp": 0.01003115, "balance_loss_clip": 1.00526595, "balance_loss_mlp": 1.00220859, "epoch": 0.9479032015632046, "flos": 60857885554560.0, "grad_norm": 0.805215239265766, "language_loss": 0.59051013, "learning_rate": 2.8371075372224384e-08, "loss": 0.61062312, "num_input_tokens_seen": 340114425, "step": 15766, "time_per_iteration": 2.9193778038024902 }, { "auxiliary_loss_clip": 0.01055212, "auxiliary_loss_mlp": 0.0104033, "balance_loss_clip": 1.03303111, "balance_loss_mlp": 1.02758038, "epoch": 0.9479633248158725, "flos": 14683873916160.0, "grad_norm": 1.7099233652082526, "language_loss": 0.74133235, "learning_rate": 2.8305745993155938e-08, "loss": 0.7622878, "num_input_tokens_seen": 340132200, "step": 15767, "time_per_iteration": 2.805891990661621 }, { "auxiliary_loss_clip": 0.0108313, "auxiliary_loss_mlp": 0.01032242, "balance_loss_clip": 1.03779268, "balance_loss_mlp": 1.0191226, "epoch": 0.9480234480685406, "flos": 20333878594560.0, "grad_norm": 2.313278201082517, "language_loss": 0.73025, "learning_rate": 2.8240491380931096e-08, "loss": 0.75140369, "num_input_tokens_seen": 340149175, "step": 15768, "time_per_iteration": 2.6399149894714355 }, { "auxiliary_loss_clip": 0.00990186, "auxiliary_loss_mlp": 0.01003636, "balance_loss_clip": 1.00721884, "balance_loss_mlp": 1.0025754, "epoch": 0.9480835713212085, "flos": 70293092428800.0, "grad_norm": 0.7345736556606803, "language_loss": 0.55274725, "learning_rate": 2.8175311538024326e-08, "loss": 0.57268548, "num_input_tokens_seen": 340208155, "step": 15769, "time_per_iteration": 3.346592664718628 }, { "auxiliary_loss_clip": 0.01060494, "auxiliary_loss_mlp": 0.01031034, "balance_loss_clip": 1.03297341, "balance_loss_mlp": 1.01898789, "epoch": 0.9481436945738765, "flos": 25449749055360.0, "grad_norm": 1.3356689895855771, "language_loss": 0.77657175, "learning_rate": 2.8110206466907428e-08, "loss": 0.79748702, "num_input_tokens_seen": 340229275, "step": 15770, "time_per_iteration": 2.967400074005127 }, { "auxiliary_loss_clip": 0.01090847, "auxiliary_loss_mlp": 0.0103804, "balance_loss_clip": 1.0389564, "balance_loss_mlp": 1.02452743, "epoch": 0.9482038178265444, "flos": 26979902478720.0, "grad_norm": 3.3844723552272304, "language_loss": 0.79788053, "learning_rate": 2.8045176170049313e-08, "loss": 0.81916934, "num_input_tokens_seen": 340248920, "step": 15771, "time_per_iteration": 2.6720709800720215 }, { "auxiliary_loss_clip": 0.0107385, "auxiliary_loss_mlp": 0.01029719, "balance_loss_clip": 1.03290439, "balance_loss_mlp": 1.01731515, "epoch": 0.9482639410792124, "flos": 17785442511360.0, "grad_norm": 1.7305869022137186, "language_loss": 0.69742543, "learning_rate": 2.7980220649915566e-08, "loss": 0.71846116, "num_input_tokens_seen": 340266775, "step": 15772, "time_per_iteration": 2.7055277824401855 }, { "auxiliary_loss_clip": 0.01091743, "auxiliary_loss_mlp": 0.0103086, "balance_loss_clip": 1.03463781, "balance_loss_mlp": 1.01827109, "epoch": 0.9483240643318803, "flos": 20996682307200.0, "grad_norm": 1.5174213608604383, "language_loss": 0.73862821, "learning_rate": 2.7915339908969327e-08, "loss": 0.7598542, "num_input_tokens_seen": 340285295, "step": 15773, "time_per_iteration": 2.594517469406128 }, { "auxiliary_loss_clip": 0.01075154, "auxiliary_loss_mlp": 0.01037032, "balance_loss_clip": 1.03320599, "balance_loss_mlp": 1.02424073, "epoch": 0.9483841875845483, "flos": 20083294339200.0, "grad_norm": 2.198949028085397, "language_loss": 0.62984806, "learning_rate": 2.7850533949671072e-08, "loss": 0.65096992, "num_input_tokens_seen": 340304265, "step": 15774, "time_per_iteration": 2.6656346321105957 }, { "auxiliary_loss_clip": 0.01108855, "auxiliary_loss_mlp": 0.01032808, "balance_loss_clip": 1.03615785, "balance_loss_mlp": 1.01968336, "epoch": 0.9484443108372163, "flos": 20813645577600.0, "grad_norm": 1.8448156686123751, "language_loss": 0.59319341, "learning_rate": 2.7785802774478396e-08, "loss": 0.61461002, "num_input_tokens_seen": 340323690, "step": 15775, "time_per_iteration": 2.6134490966796875 }, { "auxiliary_loss_clip": 0.0108818, "auxiliary_loss_mlp": 0.01028388, "balance_loss_clip": 1.03665876, "balance_loss_mlp": 1.01553738, "epoch": 0.9485044340898843, "flos": 36429184506240.0, "grad_norm": 1.5672743954307715, "language_loss": 0.61733031, "learning_rate": 2.772114638584555e-08, "loss": 0.63849604, "num_input_tokens_seen": 340345830, "step": 15776, "time_per_iteration": 2.759727954864502 }, { "auxiliary_loss_clip": 0.01079507, "auxiliary_loss_mlp": 0.01031725, "balance_loss_clip": 1.03297567, "balance_loss_mlp": 1.01894581, "epoch": 0.9485645573425522, "flos": 22602535643520.0, "grad_norm": 1.5939795755888917, "language_loss": 0.73614502, "learning_rate": 2.765656478622458e-08, "loss": 0.75725728, "num_input_tokens_seen": 340365910, "step": 15777, "time_per_iteration": 2.6045753955841064 }, { "auxiliary_loss_clip": 0.01108311, "auxiliary_loss_mlp": 0.01035184, "balance_loss_clip": 1.03904653, "balance_loss_mlp": 1.0216893, "epoch": 0.9486246805952202, "flos": 22017766227840.0, "grad_norm": 2.932173295404769, "language_loss": 0.7171486, "learning_rate": 2.759205797806441e-08, "loss": 0.73858356, "num_input_tokens_seen": 340383935, "step": 15778, "time_per_iteration": 2.5818030834198 }, { "auxiliary_loss_clip": 0.0109326, "auxiliary_loss_mlp": 0.00769105, "balance_loss_clip": 1.03678966, "balance_loss_mlp": 1.00016212, "epoch": 0.9486848038478882, "flos": 16508674604160.0, "grad_norm": 1.785656818158453, "language_loss": 0.70001411, "learning_rate": 2.7527625963810865e-08, "loss": 0.7186377, "num_input_tokens_seen": 340402760, "step": 15779, "time_per_iteration": 2.5735414028167725 }, { "auxiliary_loss_clip": 0.01109892, "auxiliary_loss_mlp": 0.01032242, "balance_loss_clip": 1.03769064, "balance_loss_mlp": 1.01942062, "epoch": 0.9487449271005561, "flos": 19244385221760.0, "grad_norm": 2.1485694494900045, "language_loss": 0.78390372, "learning_rate": 2.7463268745907542e-08, "loss": 0.80532503, "num_input_tokens_seen": 340422105, "step": 15780, "time_per_iteration": 2.571122169494629 }, { "auxiliary_loss_clip": 0.0108342, "auxiliary_loss_mlp": 0.00770056, "balance_loss_clip": 1.03853536, "balance_loss_mlp": 1.00020254, "epoch": 0.9488050503532242, "flos": 21762692772480.0, "grad_norm": 1.7364814662461427, "language_loss": 0.66234344, "learning_rate": 2.7398986326794494e-08, "loss": 0.68087816, "num_input_tokens_seen": 340441160, "step": 15781, "time_per_iteration": 2.6827192306518555 }, { "auxiliary_loss_clip": 0.01107411, "auxiliary_loss_mlp": 0.01034117, "balance_loss_clip": 1.03690338, "balance_loss_mlp": 1.02159953, "epoch": 0.9488651736058921, "flos": 18368919037440.0, "grad_norm": 9.76335675616754, "language_loss": 0.79928899, "learning_rate": 2.733477870890999e-08, "loss": 0.82070434, "num_input_tokens_seen": 340458200, "step": 15782, "time_per_iteration": 2.567207098007202 }, { "auxiliary_loss_clip": 0.010185, "auxiliary_loss_mlp": 0.01003019, "balance_loss_clip": 1.00588965, "balance_loss_mlp": 1.00194001, "epoch": 0.9489252968585601, "flos": 70084057230720.0, "grad_norm": 0.7221824593756564, "language_loss": 0.59740299, "learning_rate": 2.7270645894688082e-08, "loss": 0.6176182, "num_input_tokens_seen": 340526420, "step": 15783, "time_per_iteration": 3.296163558959961 }, { "auxiliary_loss_clip": 0.01096688, "auxiliary_loss_mlp": 0.01035623, "balance_loss_clip": 1.0347774, "balance_loss_mlp": 1.02289176, "epoch": 0.948985420111228, "flos": 27855440490240.0, "grad_norm": 1.6602222433603364, "language_loss": 0.73771024, "learning_rate": 2.720658788656105e-08, "loss": 0.75903332, "num_input_tokens_seen": 340546325, "step": 15784, "time_per_iteration": 2.671168804168701 }, { "auxiliary_loss_clip": 0.01060019, "auxiliary_loss_mlp": 0.01031532, "balance_loss_clip": 1.03550255, "balance_loss_mlp": 1.01762056, "epoch": 0.949045543363896, "flos": 24316049018880.0, "grad_norm": 1.7690180821758892, "language_loss": 0.69829547, "learning_rate": 2.714260468695806e-08, "loss": 0.71921104, "num_input_tokens_seen": 340565145, "step": 15785, "time_per_iteration": 2.718092203140259 }, { "auxiliary_loss_clip": 0.01108856, "auxiliary_loss_mlp": 0.01028867, "balance_loss_clip": 1.03556883, "balance_loss_mlp": 1.01650548, "epoch": 0.9491056666165639, "flos": 24241677909120.0, "grad_norm": 1.499623149824644, "language_loss": 0.75997609, "learning_rate": 2.707869629830495e-08, "loss": 0.78135335, "num_input_tokens_seen": 340585465, "step": 15786, "time_per_iteration": 2.5866501331329346 }, { "auxiliary_loss_clip": 0.01075928, "auxiliary_loss_mlp": 0.01032218, "balance_loss_clip": 1.03659058, "balance_loss_mlp": 1.02088714, "epoch": 0.949165789869232, "flos": 24531261356160.0, "grad_norm": 1.9121797564334724, "language_loss": 0.78743112, "learning_rate": 2.7014862723025335e-08, "loss": 0.80851257, "num_input_tokens_seen": 340606010, "step": 15787, "time_per_iteration": 2.6785271167755127 }, { "auxiliary_loss_clip": 0.0109935, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.03999209, "balance_loss_mlp": 1.01711869, "epoch": 0.9492259131218999, "flos": 22235348862720.0, "grad_norm": 1.5253176882486765, "language_loss": 0.76644206, "learning_rate": 2.6951103963540388e-08, "loss": 0.78772604, "num_input_tokens_seen": 340626135, "step": 15788, "time_per_iteration": 2.7900092601776123 }, { "auxiliary_loss_clip": 0.01098885, "auxiliary_loss_mlp": 0.01032975, "balance_loss_clip": 1.03593767, "balance_loss_mlp": 1.019647, "epoch": 0.9492860363745679, "flos": 22966310632320.0, "grad_norm": 1.7589364420140376, "language_loss": 0.71141213, "learning_rate": 2.6887420022266848e-08, "loss": 0.73273069, "num_input_tokens_seen": 340644870, "step": 15789, "time_per_iteration": 2.6160874366760254 }, { "auxiliary_loss_clip": 0.01059097, "auxiliary_loss_mlp": 0.01031485, "balance_loss_clip": 1.03295982, "balance_loss_mlp": 1.01794267, "epoch": 0.9493461596272358, "flos": 18370283754240.0, "grad_norm": 2.83542221151725, "language_loss": 0.73137754, "learning_rate": 2.682381090161989e-08, "loss": 0.75228333, "num_input_tokens_seen": 340663695, "step": 15790, "time_per_iteration": 2.6108055114746094 }, { "auxiliary_loss_clip": 0.01073497, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.03374732, "balance_loss_mlp": 1.02253366, "epoch": 0.9494062828799038, "flos": 20011724490240.0, "grad_norm": 1.9002383614444849, "language_loss": 0.77333057, "learning_rate": 2.6760276604012033e-08, "loss": 0.79442513, "num_input_tokens_seen": 340682970, "step": 15791, "time_per_iteration": 2.688148260116577 }, { "auxiliary_loss_clip": 0.01102402, "auxiliary_loss_mlp": 0.01034148, "balance_loss_clip": 1.03735161, "balance_loss_mlp": 1.02070129, "epoch": 0.9494664061325718, "flos": 27228583313280.0, "grad_norm": 1.7874682888186109, "language_loss": 0.73599547, "learning_rate": 2.6696817131852234e-08, "loss": 0.75736099, "num_input_tokens_seen": 340702275, "step": 15792, "time_per_iteration": 2.643265962600708 }, { "auxiliary_loss_clip": 0.01095336, "auxiliary_loss_mlp": 0.01034889, "balance_loss_clip": 1.03347254, "balance_loss_mlp": 1.02266979, "epoch": 0.9495265293852397, "flos": 18369816877440.0, "grad_norm": 1.8451462038230002, "language_loss": 0.78138769, "learning_rate": 2.663343248754679e-08, "loss": 0.80268991, "num_input_tokens_seen": 340719060, "step": 15793, "time_per_iteration": 2.5426347255706787 }, { "auxiliary_loss_clip": 0.01081824, "auxiliary_loss_mlp": 0.01030161, "balance_loss_clip": 1.03453922, "balance_loss_mlp": 1.0182879, "epoch": 0.9495866526379078, "flos": 23075766351360.0, "grad_norm": 1.686462964828876, "language_loss": 0.77439916, "learning_rate": 2.6570122673499562e-08, "loss": 0.79551899, "num_input_tokens_seen": 340737815, "step": 15794, "time_per_iteration": 4.211062669754028 }, { "auxiliary_loss_clip": 0.01078065, "auxiliary_loss_mlp": 0.00770639, "balance_loss_clip": 1.03476417, "balance_loss_mlp": 1.00026226, "epoch": 0.9496467758905757, "flos": 17529902179200.0, "grad_norm": 1.8326530487226782, "language_loss": 0.61200684, "learning_rate": 2.650688769211107e-08, "loss": 0.63049388, "num_input_tokens_seen": 340756035, "step": 15795, "time_per_iteration": 4.150991201400757 }, { "auxiliary_loss_clip": 0.01096105, "auxiliary_loss_mlp": 0.01034842, "balance_loss_clip": 1.03731775, "balance_loss_mlp": 1.02214646, "epoch": 0.9497068991432437, "flos": 24133910129280.0, "grad_norm": 1.6119216372134806, "language_loss": 0.79217291, "learning_rate": 2.644372754577895e-08, "loss": 0.81348234, "num_input_tokens_seen": 340775620, "step": 15796, "time_per_iteration": 2.6128690242767334 }, { "auxiliary_loss_clip": 0.01097993, "auxiliary_loss_mlp": 0.01029872, "balance_loss_clip": 1.03628421, "balance_loss_mlp": 1.01681852, "epoch": 0.9497670223959116, "flos": 20303319098880.0, "grad_norm": 1.8328846097658669, "language_loss": 0.75668991, "learning_rate": 2.6380642236898398e-08, "loss": 0.77796859, "num_input_tokens_seen": 340794510, "step": 15797, "time_per_iteration": 4.209908723831177 }, { "auxiliary_loss_clip": 0.01076014, "auxiliary_loss_mlp": 0.00770873, "balance_loss_clip": 1.03560376, "balance_loss_mlp": 1.00026107, "epoch": 0.9498271456485796, "flos": 13698916099200.0, "grad_norm": 2.550624917313258, "language_loss": 0.6578297, "learning_rate": 2.6317631767861727e-08, "loss": 0.67629862, "num_input_tokens_seen": 340812955, "step": 15798, "time_per_iteration": 2.6348631381988525 }, { "auxiliary_loss_clip": 0.011003, "auxiliary_loss_mlp": 0.0103345, "balance_loss_clip": 1.03773224, "balance_loss_mlp": 1.02130818, "epoch": 0.9498872689012475, "flos": 20814004713600.0, "grad_norm": 1.909884412198324, "language_loss": 0.77439278, "learning_rate": 2.6254696141058575e-08, "loss": 0.79573023, "num_input_tokens_seen": 340829200, "step": 15799, "time_per_iteration": 2.6085915565490723 }, { "auxiliary_loss_clip": 0.01091765, "auxiliary_loss_mlp": 0.01034579, "balance_loss_clip": 1.03405094, "balance_loss_mlp": 1.02236009, "epoch": 0.9499473921539155, "flos": 21032700670080.0, "grad_norm": 1.7422846362169004, "language_loss": 0.71096122, "learning_rate": 2.6191835358874814e-08, "loss": 0.7322247, "num_input_tokens_seen": 340848035, "step": 15800, "time_per_iteration": 4.11196756362915 }, { "auxiliary_loss_clip": 0.01081003, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.03178167, "balance_loss_mlp": 1.01504791, "epoch": 0.9500075154065835, "flos": 20998693468800.0, "grad_norm": 1.6797265038283544, "language_loss": 0.7196418, "learning_rate": 2.6129049423694315e-08, "loss": 0.74072969, "num_input_tokens_seen": 340870025, "step": 15801, "time_per_iteration": 2.7228105068206787 }, { "auxiliary_loss_clip": 0.01098003, "auxiliary_loss_mlp": 0.01032345, "balance_loss_clip": 1.03618026, "balance_loss_mlp": 1.02023363, "epoch": 0.9500676386592515, "flos": 25121956515840.0, "grad_norm": 1.5618247598543729, "language_loss": 0.80991805, "learning_rate": 2.6066338337898508e-08, "loss": 0.83122152, "num_input_tokens_seen": 340892290, "step": 15802, "time_per_iteration": 2.6597704887390137 }, { "auxiliary_loss_clip": 0.01111718, "auxiliary_loss_mlp": 0.01031196, "balance_loss_clip": 1.03881001, "balance_loss_mlp": 1.01899517, "epoch": 0.9501277619119194, "flos": 27523625627520.0, "grad_norm": 1.6749081287524619, "language_loss": 0.67810452, "learning_rate": 2.60037021038646e-08, "loss": 0.69953358, "num_input_tokens_seen": 340912260, "step": 15803, "time_per_iteration": 2.6744706630706787 }, { "auxiliary_loss_clip": 0.01082837, "auxiliary_loss_mlp": 0.01036561, "balance_loss_clip": 1.03429604, "balance_loss_mlp": 1.02377534, "epoch": 0.9501878851645874, "flos": 20813968800000.0, "grad_norm": 6.246974750170738, "language_loss": 0.76370931, "learning_rate": 2.5941140723968247e-08, "loss": 0.78490329, "num_input_tokens_seen": 340928930, "step": 15804, "time_per_iteration": 2.721076726913452 }, { "auxiliary_loss_clip": 0.01096211, "auxiliary_loss_mlp": 0.01035081, "balance_loss_clip": 1.03763199, "balance_loss_mlp": 1.0223074, "epoch": 0.9502480084172553, "flos": 18369385914240.0, "grad_norm": 1.716451063779602, "language_loss": 0.73370028, "learning_rate": 2.5878654200581775e-08, "loss": 0.75501317, "num_input_tokens_seen": 340946615, "step": 15805, "time_per_iteration": 2.573842763900757 }, { "auxiliary_loss_clip": 0.01084759, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.03832221, "balance_loss_mlp": 1.02066755, "epoch": 0.9503081316699233, "flos": 23549607590400.0, "grad_norm": 1.446963145068923, "language_loss": 0.80407286, "learning_rate": 2.5816242536074618e-08, "loss": 0.82525682, "num_input_tokens_seen": 340967545, "step": 15806, "time_per_iteration": 2.7522966861724854 }, { "auxiliary_loss_clip": 0.0107262, "auxiliary_loss_mlp": 0.010333, "balance_loss_clip": 1.03583097, "balance_loss_mlp": 1.0209856, "epoch": 0.9503682549225914, "flos": 18040444139520.0, "grad_norm": 2.3524275166414688, "language_loss": 0.82226515, "learning_rate": 2.5753905732813108e-08, "loss": 0.8433243, "num_input_tokens_seen": 340984955, "step": 15807, "time_per_iteration": 2.6490519046783447 }, { "auxiliary_loss_clip": 0.01089448, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.03269625, "balance_loss_mlp": 1.01936102, "epoch": 0.9504283781752593, "flos": 25886135387520.0, "grad_norm": 9.284971191525596, "language_loss": 0.71851462, "learning_rate": 2.5691643793161355e-08, "loss": 0.73972535, "num_input_tokens_seen": 341007300, "step": 15808, "time_per_iteration": 2.6571197509765625 }, { "auxiliary_loss_clip": 0.01097791, "auxiliary_loss_mlp": 0.01030365, "balance_loss_clip": 1.03632784, "balance_loss_mlp": 1.01830709, "epoch": 0.9504885014279273, "flos": 22124025636480.0, "grad_norm": 1.4241274902229573, "language_loss": 0.69725883, "learning_rate": 2.562945671948058e-08, "loss": 0.71854043, "num_input_tokens_seen": 341026695, "step": 15809, "time_per_iteration": 2.602086067199707 }, { "auxiliary_loss_clip": 0.0108373, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.03374672, "balance_loss_mlp": 1.01552939, "epoch": 0.9505486246805952, "flos": 21615961714560.0, "grad_norm": 1.5381287986116137, "language_loss": 0.75574476, "learning_rate": 2.5567344514128452e-08, "loss": 0.77686119, "num_input_tokens_seen": 341047080, "step": 15810, "time_per_iteration": 2.7851271629333496 }, { "auxiliary_loss_clip": 0.01074163, "auxiliary_loss_mlp": 0.01043017, "balance_loss_clip": 1.03387725, "balance_loss_mlp": 1.03033352, "epoch": 0.9506087479332632, "flos": 22528236360960.0, "grad_norm": 1.4680738031168652, "language_loss": 0.79985034, "learning_rate": 2.5505307179460643e-08, "loss": 0.82102215, "num_input_tokens_seen": 341067310, "step": 15811, "time_per_iteration": 2.716155767440796 }, { "auxiliary_loss_clip": 0.01082329, "auxiliary_loss_mlp": 0.01038784, "balance_loss_clip": 1.03409791, "balance_loss_mlp": 1.02606487, "epoch": 0.9506688711859311, "flos": 27527360641920.0, "grad_norm": 2.1864110496701823, "language_loss": 0.69794703, "learning_rate": 2.5443344717829495e-08, "loss": 0.71915817, "num_input_tokens_seen": 341085110, "step": 15812, "time_per_iteration": 2.7080633640289307 }, { "auxiliary_loss_clip": 0.01071236, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.03449655, "balance_loss_mlp": 1.0230056, "epoch": 0.9507289944385992, "flos": 19865783531520.0, "grad_norm": 1.621391502442825, "language_loss": 0.65664506, "learning_rate": 2.538145713158446e-08, "loss": 0.67772174, "num_input_tokens_seen": 341103190, "step": 15813, "time_per_iteration": 2.6422770023345947 }, { "auxiliary_loss_clip": 0.01099547, "auxiliary_loss_mlp": 0.01037026, "balance_loss_clip": 1.03611267, "balance_loss_mlp": 1.02409816, "epoch": 0.9507891176912671, "flos": 25193274969600.0, "grad_norm": 1.4581482978793308, "language_loss": 0.70320028, "learning_rate": 2.5319644423072327e-08, "loss": 0.72456604, "num_input_tokens_seen": 341125695, "step": 15814, "time_per_iteration": 2.658942699432373 }, { "auxiliary_loss_clip": 0.01097344, "auxiliary_loss_mlp": 0.01028268, "balance_loss_clip": 1.03695726, "balance_loss_mlp": 1.01665115, "epoch": 0.9508492409439351, "flos": 24899561458560.0, "grad_norm": 1.8950917263769373, "language_loss": 0.63310945, "learning_rate": 2.5257906594637445e-08, "loss": 0.65436554, "num_input_tokens_seen": 341143930, "step": 15815, "time_per_iteration": 2.633420944213867 }, { "auxiliary_loss_clip": 0.01084007, "auxiliary_loss_mlp": 0.01027739, "balance_loss_clip": 1.03480506, "balance_loss_mlp": 1.01581824, "epoch": 0.950909364196603, "flos": 29784094375680.0, "grad_norm": 1.8730237235822342, "language_loss": 0.58833039, "learning_rate": 2.519624364862061e-08, "loss": 0.60944784, "num_input_tokens_seen": 341164280, "step": 15816, "time_per_iteration": 2.7500061988830566 }, { "auxiliary_loss_clip": 0.0110715, "auxiliary_loss_mlp": 0.01039761, "balance_loss_clip": 1.03585255, "balance_loss_mlp": 1.02707124, "epoch": 0.950969487449271, "flos": 24717781704960.0, "grad_norm": 1.491116548169098, "language_loss": 0.73515993, "learning_rate": 2.513465558735994e-08, "loss": 0.75662911, "num_input_tokens_seen": 341183670, "step": 15817, "time_per_iteration": 2.6232523918151855 }, { "auxiliary_loss_clip": 0.01089005, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.03726554, "balance_loss_mlp": 1.02046156, "epoch": 0.9510296107019389, "flos": 13699167494400.0, "grad_norm": 1.5602178845218895, "language_loss": 0.60236609, "learning_rate": 2.5073142413190918e-08, "loss": 0.62360317, "num_input_tokens_seen": 341201900, "step": 15818, "time_per_iteration": 2.6116764545440674 }, { "auxiliary_loss_clip": 0.01109081, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.0376842, "balance_loss_mlp": 1.02124643, "epoch": 0.9510897339546069, "flos": 17311852667520.0, "grad_norm": 2.0566483218675438, "language_loss": 0.6942215, "learning_rate": 2.5011704128446552e-08, "loss": 0.71565247, "num_input_tokens_seen": 341218340, "step": 15819, "time_per_iteration": 2.560081958770752 }, { "auxiliary_loss_clip": 0.0107016, "auxiliary_loss_mlp": 0.01028125, "balance_loss_clip": 1.0372014, "balance_loss_mlp": 1.0156858, "epoch": 0.951149857207275, "flos": 14793940166400.0, "grad_norm": 1.7393168966248527, "language_loss": 0.73959541, "learning_rate": 2.49503407354561e-08, "loss": 0.76057822, "num_input_tokens_seen": 341235885, "step": 15820, "time_per_iteration": 2.797940969467163 }, { "auxiliary_loss_clip": 0.01089647, "auxiliary_loss_mlp": 0.01033566, "balance_loss_clip": 1.03681791, "balance_loss_mlp": 1.02076864, "epoch": 0.9512099804599429, "flos": 19391152193280.0, "grad_norm": 1.9037531735176971, "language_loss": 0.78643155, "learning_rate": 2.4889052236546804e-08, "loss": 0.80766368, "num_input_tokens_seen": 341255280, "step": 15821, "time_per_iteration": 2.6202476024627686 }, { "auxiliary_loss_clip": 0.01068626, "auxiliary_loss_mlp": 0.0102901, "balance_loss_clip": 1.03432822, "balance_loss_mlp": 1.01609325, "epoch": 0.9512701037126109, "flos": 36757874885760.0, "grad_norm": 1.5233600677568924, "language_loss": 0.71154618, "learning_rate": 2.4827838634042586e-08, "loss": 0.73252249, "num_input_tokens_seen": 341279055, "step": 15822, "time_per_iteration": 2.8137216567993164 }, { "auxiliary_loss_clip": 0.01094806, "auxiliary_loss_mlp": 0.01037452, "balance_loss_clip": 1.03667974, "balance_loss_mlp": 1.02534676, "epoch": 0.9513302269652788, "flos": 22638266697600.0, "grad_norm": 1.6180069901826792, "language_loss": 0.65828168, "learning_rate": 2.47666999302647e-08, "loss": 0.67960423, "num_input_tokens_seen": 341298560, "step": 15823, "time_per_iteration": 2.616811513900757 }, { "auxiliary_loss_clip": 0.01090848, "auxiliary_loss_mlp": 0.01031876, "balance_loss_clip": 1.03517807, "balance_loss_mlp": 1.01999104, "epoch": 0.9513903502179468, "flos": 22893232412160.0, "grad_norm": 1.8863282557920107, "language_loss": 0.77391565, "learning_rate": 2.4705636127531292e-08, "loss": 0.79514301, "num_input_tokens_seen": 341316650, "step": 15824, "time_per_iteration": 2.5897138118743896 }, { "auxiliary_loss_clip": 0.01110536, "auxiliary_loss_mlp": 0.01031394, "balance_loss_clip": 1.0360415, "balance_loss_mlp": 1.01814985, "epoch": 0.9514504734706147, "flos": 27928626451200.0, "grad_norm": 1.8804632984111238, "language_loss": 0.73739725, "learning_rate": 2.4644647228158065e-08, "loss": 0.75881654, "num_input_tokens_seen": 341336185, "step": 15825, "time_per_iteration": 2.59452223777771 }, { "auxiliary_loss_clip": 0.0101482, "auxiliary_loss_mlp": 0.00999967, "balance_loss_clip": 1.00606704, "balance_loss_mlp": 0.99895328, "epoch": 0.9515105967232828, "flos": 67366767312000.0, "grad_norm": 0.8525119381835639, "language_loss": 0.53459394, "learning_rate": 2.458373323445806e-08, "loss": 0.55474186, "num_input_tokens_seen": 341395795, "step": 15826, "time_per_iteration": 3.0530049800872803 }, { "auxiliary_loss_clip": 0.01084306, "auxiliary_loss_mlp": 0.01035155, "balance_loss_clip": 1.03550363, "balance_loss_mlp": 1.02248907, "epoch": 0.9515707199759507, "flos": 25846525664640.0, "grad_norm": 2.1311197223836458, "language_loss": 0.72489649, "learning_rate": 2.452289414874076e-08, "loss": 0.74609113, "num_input_tokens_seen": 341415675, "step": 15827, "time_per_iteration": 2.67301869392395 }, { "auxiliary_loss_clip": 0.01086796, "auxiliary_loss_mlp": 0.01030812, "balance_loss_clip": 1.03601933, "balance_loss_mlp": 1.01807427, "epoch": 0.9516308432286187, "flos": 21828983322240.0, "grad_norm": 1.785994656352798, "language_loss": 0.7409234, "learning_rate": 2.4462129973313207e-08, "loss": 0.7620995, "num_input_tokens_seen": 341432990, "step": 15828, "time_per_iteration": 2.6235291957855225 }, { "auxiliary_loss_clip": 0.0106734, "auxiliary_loss_mlp": 0.01033444, "balance_loss_clip": 1.03639388, "balance_loss_mlp": 1.02239347, "epoch": 0.9516909664812866, "flos": 27269593666560.0, "grad_norm": 1.6745966727407804, "language_loss": 0.72937709, "learning_rate": 2.440144071047978e-08, "loss": 0.75038493, "num_input_tokens_seen": 341454100, "step": 15829, "time_per_iteration": 2.831969738006592 }, { "auxiliary_loss_clip": 0.01093583, "auxiliary_loss_mlp": 0.01033735, "balance_loss_clip": 1.03440034, "balance_loss_mlp": 1.02001393, "epoch": 0.9517510897339546, "flos": 21215342350080.0, "grad_norm": 2.2807166636074863, "language_loss": 0.61247396, "learning_rate": 2.4340826362541533e-08, "loss": 0.6337471, "num_input_tokens_seen": 341472955, "step": 15830, "time_per_iteration": 2.57916522026062 }, { "auxiliary_loss_clip": 0.01095854, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.03783762, "balance_loss_mlp": 1.01720452, "epoch": 0.9518112129866225, "flos": 18733986915840.0, "grad_norm": 2.414229225065315, "language_loss": 0.72597665, "learning_rate": 2.428028693179729e-08, "loss": 0.74724913, "num_input_tokens_seen": 341490165, "step": 15831, "time_per_iteration": 2.590857982635498 }, { "auxiliary_loss_clip": 0.01054785, "auxiliary_loss_mlp": 0.01024066, "balance_loss_clip": 1.03245831, "balance_loss_mlp": 1.01274085, "epoch": 0.9518713362392905, "flos": 16763676232320.0, "grad_norm": 1.6809065599907524, "language_loss": 0.65303266, "learning_rate": 2.4219822420542545e-08, "loss": 0.67382115, "num_input_tokens_seen": 341508055, "step": 15832, "time_per_iteration": 2.7475693225860596 }, { "auxiliary_loss_clip": 0.01093001, "auxiliary_loss_mlp": 0.01036123, "balance_loss_clip": 1.03763044, "balance_loss_mlp": 1.02308214, "epoch": 0.9519314594919586, "flos": 15230649720960.0, "grad_norm": 1.7187781750552136, "language_loss": 0.77851391, "learning_rate": 2.4159432831070135e-08, "loss": 0.79980505, "num_input_tokens_seen": 341526155, "step": 15833, "time_per_iteration": 2.5683181285858154 }, { "auxiliary_loss_clip": 0.01069974, "auxiliary_loss_mlp": 0.01031366, "balance_loss_clip": 1.03459656, "balance_loss_mlp": 1.01919472, "epoch": 0.9519915827446265, "flos": 19352943100800.0, "grad_norm": 1.9155025813330617, "language_loss": 0.75245464, "learning_rate": 2.4099118165670007e-08, "loss": 0.77346802, "num_input_tokens_seen": 341540450, "step": 15834, "time_per_iteration": 5.729520559310913 }, { "auxiliary_loss_clip": 0.01098407, "auxiliary_loss_mlp": 0.01035182, "balance_loss_clip": 1.038692, "balance_loss_mlp": 1.02169371, "epoch": 0.9520517059972945, "flos": 22266303408000.0, "grad_norm": 2.297648558034633, "language_loss": 0.7629987, "learning_rate": 2.4038878426629216e-08, "loss": 0.78433454, "num_input_tokens_seen": 341557865, "step": 15835, "time_per_iteration": 2.570033073425293 }, { "auxiliary_loss_clip": 0.01086302, "auxiliary_loss_mlp": 0.01031867, "balance_loss_clip": 1.03379786, "balance_loss_mlp": 1.01873004, "epoch": 0.9521118292499624, "flos": 14862313704960.0, "grad_norm": 1.9369044520517964, "language_loss": 0.6651296, "learning_rate": 2.397871361623238e-08, "loss": 0.68631124, "num_input_tokens_seen": 341573890, "step": 15836, "time_per_iteration": 4.36873197555542 }, { "auxiliary_loss_clip": 0.01072203, "auxiliary_loss_mlp": 0.01027392, "balance_loss_clip": 1.03464746, "balance_loss_mlp": 1.01531649, "epoch": 0.9521719525026304, "flos": 23508812718720.0, "grad_norm": 1.945512889089952, "language_loss": 0.70333862, "learning_rate": 2.391862373676057e-08, "loss": 0.72433454, "num_input_tokens_seen": 341593770, "step": 15837, "time_per_iteration": 2.705793619155884 }, { "auxiliary_loss_clip": 0.01110794, "auxiliary_loss_mlp": 0.01033236, "balance_loss_clip": 1.03770208, "balance_loss_mlp": 1.01952064, "epoch": 0.9522320757552983, "flos": 19714922409600.0, "grad_norm": 2.0871537703897673, "language_loss": 0.73548734, "learning_rate": 2.3858608790492617e-08, "loss": 0.75692767, "num_input_tokens_seen": 341612065, "step": 15838, "time_per_iteration": 2.626145362854004 }, { "auxiliary_loss_clip": 0.01076517, "auxiliary_loss_mlp": 0.01027932, "balance_loss_clip": 1.03396976, "balance_loss_mlp": 1.0152297, "epoch": 0.9522921990079664, "flos": 25921291824000.0, "grad_norm": 4.0429942363631275, "language_loss": 0.78156877, "learning_rate": 2.379866877970449e-08, "loss": 0.80261326, "num_input_tokens_seen": 341631365, "step": 15839, "time_per_iteration": 4.274654865264893 }, { "auxiliary_loss_clip": 0.01085718, "auxiliary_loss_mlp": 0.01032381, "balance_loss_clip": 1.04032528, "balance_loss_mlp": 1.02013206, "epoch": 0.9523523222606343, "flos": 19208115463680.0, "grad_norm": 1.5000947489157939, "language_loss": 0.80272675, "learning_rate": 2.3738803706668585e-08, "loss": 0.82390767, "num_input_tokens_seen": 341650300, "step": 15840, "time_per_iteration": 2.7204654216766357 }, { "auxiliary_loss_clip": 0.01078473, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.0350554, "balance_loss_mlp": 1.0179565, "epoch": 0.9524124455133023, "flos": 20921269703040.0, "grad_norm": 2.113087759766638, "language_loss": 0.73338723, "learning_rate": 2.3679013573655314e-08, "loss": 0.75445914, "num_input_tokens_seen": 341667680, "step": 15841, "time_per_iteration": 2.6518993377685547 }, { "auxiliary_loss_clip": 0.01080022, "auxiliary_loss_mlp": 0.01026928, "balance_loss_clip": 1.03612816, "balance_loss_mlp": 1.01526952, "epoch": 0.9524725687659702, "flos": 18843550375680.0, "grad_norm": 1.7318468009780055, "language_loss": 0.79018557, "learning_rate": 2.3619298382931972e-08, "loss": 0.8112551, "num_input_tokens_seen": 341685760, "step": 15842, "time_per_iteration": 2.620762825012207 }, { "auxiliary_loss_clip": 0.01085992, "auxiliary_loss_mlp": 0.01032273, "balance_loss_clip": 1.03697205, "balance_loss_mlp": 1.01970291, "epoch": 0.9525326920186382, "flos": 22674680110080.0, "grad_norm": 2.120857377915384, "language_loss": 0.72623742, "learning_rate": 2.3559658136762973e-08, "loss": 0.74742007, "num_input_tokens_seen": 341705300, "step": 15843, "time_per_iteration": 2.643082618713379 }, { "auxiliary_loss_clip": 0.01080268, "auxiliary_loss_mlp": 0.00770279, "balance_loss_clip": 1.03644204, "balance_loss_mlp": 1.00023687, "epoch": 0.9525928152713061, "flos": 22086642556800.0, "grad_norm": 1.7610421238919713, "language_loss": 0.78494173, "learning_rate": 2.3500092837409612e-08, "loss": 0.80344719, "num_input_tokens_seen": 341724565, "step": 15844, "time_per_iteration": 2.672140121459961 }, { "auxiliary_loss_clip": 0.0107313, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.03377759, "balance_loss_mlp": 1.0192126, "epoch": 0.9526529385239741, "flos": 20704728562560.0, "grad_norm": 2.3272849000884133, "language_loss": 0.70132804, "learning_rate": 2.3440602487130977e-08, "loss": 0.7224012, "num_input_tokens_seen": 341743605, "step": 15845, "time_per_iteration": 2.6669421195983887 }, { "auxiliary_loss_clip": 0.01073757, "auxiliary_loss_mlp": 0.01035061, "balance_loss_clip": 1.03685403, "balance_loss_mlp": 1.02289605, "epoch": 0.9527130617766422, "flos": 23368043318400.0, "grad_norm": 1.600943165785114, "language_loss": 0.75702989, "learning_rate": 2.338118708818282e-08, "loss": 0.77811807, "num_input_tokens_seen": 341763475, "step": 15846, "time_per_iteration": 2.7024073600769043 }, { "auxiliary_loss_clip": 0.01078418, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.03588128, "balance_loss_mlp": 1.01366425, "epoch": 0.9527731850293101, "flos": 18985935888000.0, "grad_norm": 1.8711646240490332, "language_loss": 0.78105325, "learning_rate": 2.3321846642817998e-08, "loss": 0.80209702, "num_input_tokens_seen": 341781265, "step": 15847, "time_per_iteration": 2.780184507369995 }, { "auxiliary_loss_clip": 0.01066366, "auxiliary_loss_mlp": 0.01035518, "balance_loss_clip": 1.03419328, "balance_loss_mlp": 1.0241214, "epoch": 0.9528333082819781, "flos": 19318038059520.0, "grad_norm": 1.9530188537907924, "language_loss": 0.7798357, "learning_rate": 2.326258115328672e-08, "loss": 0.80085456, "num_input_tokens_seen": 341798825, "step": 15848, "time_per_iteration": 2.7238736152648926 }, { "auxiliary_loss_clip": 0.01089796, "auxiliary_loss_mlp": 0.01042438, "balance_loss_clip": 1.03580141, "balance_loss_mlp": 1.02845478, "epoch": 0.952893431534646, "flos": 23951340276480.0, "grad_norm": 1.8605077163556365, "language_loss": 0.72040188, "learning_rate": 2.320339062183674e-08, "loss": 0.74172425, "num_input_tokens_seen": 341819480, "step": 15849, "time_per_iteration": 2.682178258895874 }, { "auxiliary_loss_clip": 0.01105363, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.04046464, "balance_loss_mlp": 1.02527022, "epoch": 0.952953554787314, "flos": 21030545854080.0, "grad_norm": 1.819487619719596, "language_loss": 0.75498259, "learning_rate": 2.314427505071226e-08, "loss": 0.77642077, "num_input_tokens_seen": 341838035, "step": 15850, "time_per_iteration": 2.6890413761138916 }, { "auxiliary_loss_clip": 0.01080509, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.03441119, "balance_loss_mlp": 1.02248454, "epoch": 0.9530136780399819, "flos": 22382870019840.0, "grad_norm": 2.121651511514479, "language_loss": 0.72852147, "learning_rate": 2.308523444215482e-08, "loss": 0.74967206, "num_input_tokens_seen": 341855895, "step": 15851, "time_per_iteration": 2.681929111480713 }, { "auxiliary_loss_clip": 0.01082039, "auxiliary_loss_mlp": 0.01026961, "balance_loss_clip": 1.03587413, "balance_loss_mlp": 1.01521945, "epoch": 0.95307380129265, "flos": 22159613036160.0, "grad_norm": 1.7583423782489798, "language_loss": 0.79609531, "learning_rate": 2.3026268798403525e-08, "loss": 0.8171854, "num_input_tokens_seen": 341875240, "step": 15852, "time_per_iteration": 2.6543726921081543 }, { "auxiliary_loss_clip": 0.01097888, "auxiliary_loss_mlp": 0.01036154, "balance_loss_clip": 1.03511071, "balance_loss_mlp": 1.02323198, "epoch": 0.9531339245453179, "flos": 44022747214080.0, "grad_norm": 1.5582381447981437, "language_loss": 0.59615147, "learning_rate": 2.2967378121694138e-08, "loss": 0.61749196, "num_input_tokens_seen": 341901020, "step": 15853, "time_per_iteration": 2.7729127407073975 }, { "auxiliary_loss_clip": 0.01084188, "auxiliary_loss_mlp": 0.01032237, "balance_loss_clip": 1.0343461, "balance_loss_mlp": 1.02051926, "epoch": 0.9531940477979859, "flos": 20266690204800.0, "grad_norm": 1.8458954546465922, "language_loss": 0.72333086, "learning_rate": 2.290856241425998e-08, "loss": 0.74449503, "num_input_tokens_seen": 341919365, "step": 15854, "time_per_iteration": 2.667217254638672 }, { "auxiliary_loss_clip": 0.01081866, "auxiliary_loss_mlp": 0.01031192, "balance_loss_clip": 1.03433609, "balance_loss_mlp": 1.01909232, "epoch": 0.9532541710506538, "flos": 25335732309120.0, "grad_norm": 2.1969630613589057, "language_loss": 0.67196018, "learning_rate": 2.284982167833127e-08, "loss": 0.69309074, "num_input_tokens_seen": 341939985, "step": 15855, "time_per_iteration": 2.6534695625305176 }, { "auxiliary_loss_clip": 0.01109271, "auxiliary_loss_mlp": 0.01030983, "balance_loss_clip": 1.0367763, "balance_loss_mlp": 1.01885402, "epoch": 0.9533142943033218, "flos": 26469288691200.0, "grad_norm": 1.5274275727026758, "language_loss": 0.76655555, "learning_rate": 2.279115591613556e-08, "loss": 0.78795809, "num_input_tokens_seen": 341959255, "step": 15856, "time_per_iteration": 2.6008455753326416 }, { "auxiliary_loss_clip": 0.01080944, "auxiliary_loss_mlp": 0.0103369, "balance_loss_clip": 1.03132057, "balance_loss_mlp": 1.02190578, "epoch": 0.9533744175559897, "flos": 23656944407040.0, "grad_norm": 1.7148081146844736, "language_loss": 0.77968013, "learning_rate": 2.2732565129897075e-08, "loss": 0.80082643, "num_input_tokens_seen": 341977205, "step": 15857, "time_per_iteration": 2.6481335163116455 }, { "auxiliary_loss_clip": 0.01019391, "auxiliary_loss_mlp": 0.01003272, "balance_loss_clip": 1.00663853, "balance_loss_mlp": 1.00225866, "epoch": 0.9534345408086577, "flos": 61052055500160.0, "grad_norm": 0.7153193040155459, "language_loss": 0.6259079, "learning_rate": 2.267404932183803e-08, "loss": 0.6461345, "num_input_tokens_seen": 342038545, "step": 15858, "time_per_iteration": 3.112011671066284 }, { "auxiliary_loss_clip": 0.01057029, "auxiliary_loss_mlp": 0.01028806, "balance_loss_clip": 1.03275323, "balance_loss_mlp": 1.01722491, "epoch": 0.9534946640613258, "flos": 18951677291520.0, "grad_norm": 1.5293384678689539, "language_loss": 0.56808496, "learning_rate": 2.2615608494177097e-08, "loss": 0.58894336, "num_input_tokens_seen": 342058195, "step": 15859, "time_per_iteration": 2.699678897857666 }, { "auxiliary_loss_clip": 0.01104207, "auxiliary_loss_mlp": 0.01030176, "balance_loss_clip": 1.03593767, "balance_loss_mlp": 1.01922047, "epoch": 0.9535547873139937, "flos": 16654292340480.0, "grad_norm": 2.076157934676356, "language_loss": 0.81695747, "learning_rate": 2.2557242649130504e-08, "loss": 0.8383013, "num_input_tokens_seen": 342075025, "step": 15860, "time_per_iteration": 2.5248684883117676 }, { "auxiliary_loss_clip": 0.01057722, "auxiliary_loss_mlp": 0.0076914, "balance_loss_clip": 1.03329587, "balance_loss_mlp": 1.00022173, "epoch": 0.9536149105666617, "flos": 20667776446080.0, "grad_norm": 2.0339237108527195, "language_loss": 0.66784334, "learning_rate": 2.249895178891159e-08, "loss": 0.68611193, "num_input_tokens_seen": 342094595, "step": 15861, "time_per_iteration": 2.764711856842041 }, { "auxiliary_loss_clip": 0.01097732, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.03534853, "balance_loss_mlp": 1.02341676, "epoch": 0.9536750338193296, "flos": 30700499086080.0, "grad_norm": 1.722759616430161, "language_loss": 0.65783358, "learning_rate": 2.244073591573037e-08, "loss": 0.67917728, "num_input_tokens_seen": 342115970, "step": 15862, "time_per_iteration": 2.8370909690856934 }, { "auxiliary_loss_clip": 0.01067937, "auxiliary_loss_mlp": 0.01033375, "balance_loss_clip": 1.03313565, "balance_loss_mlp": 1.02116823, "epoch": 0.9537351570719976, "flos": 20405484357120.0, "grad_norm": 1.5180821577389316, "language_loss": 0.67942423, "learning_rate": 2.238259503179485e-08, "loss": 0.70043731, "num_input_tokens_seen": 342134080, "step": 15863, "time_per_iteration": 2.85260272026062 }, { "auxiliary_loss_clip": 0.01087422, "auxiliary_loss_mlp": 0.01028951, "balance_loss_clip": 1.03365183, "balance_loss_mlp": 1.01648188, "epoch": 0.9537952803246655, "flos": 29929245235200.0, "grad_norm": 2.093402061794127, "language_loss": 0.78434008, "learning_rate": 2.2324529139309267e-08, "loss": 0.80550379, "num_input_tokens_seen": 342154725, "step": 15864, "time_per_iteration": 2.7751903533935547 }, { "auxiliary_loss_clip": 0.01077785, "auxiliary_loss_mlp": 0.01026692, "balance_loss_clip": 1.03687298, "balance_loss_mlp": 1.01424658, "epoch": 0.9538554035773336, "flos": 20521404524160.0, "grad_norm": 2.5427902857740463, "language_loss": 0.60073441, "learning_rate": 2.226653824047586e-08, "loss": 0.6217792, "num_input_tokens_seen": 342172275, "step": 15865, "time_per_iteration": 2.668893337249756 }, { "auxiliary_loss_clip": 0.01066094, "auxiliary_loss_mlp": 0.01038068, "balance_loss_clip": 1.03419495, "balance_loss_mlp": 1.02391815, "epoch": 0.9539155268300015, "flos": 18406517598720.0, "grad_norm": 1.825358390609407, "language_loss": 0.70074368, "learning_rate": 2.2208622337493765e-08, "loss": 0.72178537, "num_input_tokens_seen": 342190880, "step": 15866, "time_per_iteration": 2.6656248569488525 }, { "auxiliary_loss_clip": 0.01083648, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.03381348, "balance_loss_mlp": 1.02469635, "epoch": 0.9539756500826695, "flos": 26213281482240.0, "grad_norm": 7.66097760902825, "language_loss": 0.84885997, "learning_rate": 2.215078143255855e-08, "loss": 0.87008202, "num_input_tokens_seen": 342208165, "step": 15867, "time_per_iteration": 2.7268359661102295 }, { "auxiliary_loss_clip": 0.01016664, "auxiliary_loss_mlp": 0.0100223, "balance_loss_clip": 1.0065484, "balance_loss_mlp": 1.00118721, "epoch": 0.9540357733353374, "flos": 68289097766400.0, "grad_norm": 0.8394455572132413, "language_loss": 0.61809933, "learning_rate": 2.2093015527864024e-08, "loss": 0.63828826, "num_input_tokens_seen": 342277110, "step": 15868, "time_per_iteration": 3.1767897605895996 }, { "auxiliary_loss_clip": 0.01070741, "auxiliary_loss_mlp": 0.0102867, "balance_loss_clip": 1.03546822, "balance_loss_mlp": 1.01636136, "epoch": 0.9540958965880054, "flos": 21288276915840.0, "grad_norm": 1.9166883744985537, "language_loss": 0.60024238, "learning_rate": 2.2035324625600425e-08, "loss": 0.62123656, "num_input_tokens_seen": 342294695, "step": 15869, "time_per_iteration": 2.825597047805786 }, { "auxiliary_loss_clip": 0.01069204, "auxiliary_loss_mlp": 0.00772179, "balance_loss_clip": 1.032269, "balance_loss_mlp": 1.00027442, "epoch": 0.9541560198406733, "flos": 19751407649280.0, "grad_norm": 1.8610543982135193, "language_loss": 0.71071583, "learning_rate": 2.197770872795579e-08, "loss": 0.72912961, "num_input_tokens_seen": 342314970, "step": 15870, "time_per_iteration": 2.7531421184539795 }, { "auxiliary_loss_clip": 0.01067012, "auxiliary_loss_mlp": 0.01028174, "balance_loss_clip": 1.03300095, "balance_loss_mlp": 1.01587772, "epoch": 0.9542161430933414, "flos": 24715626888960.0, "grad_norm": 1.7781564124647944, "language_loss": 0.76756346, "learning_rate": 2.1920167837114368e-08, "loss": 0.78851533, "num_input_tokens_seen": 342334255, "step": 15871, "time_per_iteration": 2.724163770675659 }, { "auxiliary_loss_clip": 0.01096753, "auxiliary_loss_mlp": 0.01035234, "balance_loss_clip": 1.0351069, "balance_loss_mlp": 1.02213871, "epoch": 0.9542762663460094, "flos": 31065818359680.0, "grad_norm": 1.9388679259393415, "language_loss": 0.58526534, "learning_rate": 2.1862701955258634e-08, "loss": 0.60658514, "num_input_tokens_seen": 342354730, "step": 15872, "time_per_iteration": 2.7208635807037354 }, { "auxiliary_loss_clip": 0.01085098, "auxiliary_loss_mlp": 0.01034184, "balance_loss_clip": 1.03341579, "balance_loss_mlp": 1.02033806, "epoch": 0.9543363895986773, "flos": 20776729374720.0, "grad_norm": 2.397894266058994, "language_loss": 0.74827802, "learning_rate": 2.1805311084567514e-08, "loss": 0.76947081, "num_input_tokens_seen": 342374565, "step": 15873, "time_per_iteration": 4.379558086395264 }, { "auxiliary_loss_clip": 0.01111454, "auxiliary_loss_mlp": 0.01032817, "balance_loss_clip": 1.0387311, "balance_loss_mlp": 1.01963258, "epoch": 0.9543965128513453, "flos": 24462744163200.0, "grad_norm": 1.9355802772017296, "language_loss": 0.62851435, "learning_rate": 2.1747995227217265e-08, "loss": 0.64995706, "num_input_tokens_seen": 342394590, "step": 15874, "time_per_iteration": 2.5884764194488525 }, { "auxiliary_loss_clip": 0.01084158, "auxiliary_loss_mlp": 0.01034983, "balance_loss_clip": 1.03476036, "balance_loss_mlp": 1.02254963, "epoch": 0.9544566361040132, "flos": 15261532439040.0, "grad_norm": 1.953622113172995, "language_loss": 0.89690936, "learning_rate": 2.169075438538104e-08, "loss": 0.91810071, "num_input_tokens_seen": 342410445, "step": 15875, "time_per_iteration": 4.317510604858398 }, { "auxiliary_loss_clip": 0.01112734, "auxiliary_loss_mlp": 0.01033431, "balance_loss_clip": 1.03820455, "balance_loss_mlp": 1.02027059, "epoch": 0.9545167593566812, "flos": 25918777872000.0, "grad_norm": 1.5906794520251055, "language_loss": 0.67873561, "learning_rate": 2.1633588561229765e-08, "loss": 0.70019734, "num_input_tokens_seen": 342430970, "step": 15876, "time_per_iteration": 2.599390983581543 }, { "auxiliary_loss_clip": 0.01097415, "auxiliary_loss_mlp": 0.01036316, "balance_loss_clip": 1.03623271, "balance_loss_mlp": 1.02300572, "epoch": 0.9545768826093491, "flos": 25628188844160.0, "grad_norm": 1.8099744313437123, "language_loss": 0.69018167, "learning_rate": 2.1576497756931267e-08, "loss": 0.711519, "num_input_tokens_seen": 342449505, "step": 15877, "time_per_iteration": 2.621135711669922 }, { "auxiliary_loss_clip": 0.01068154, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.03443968, "balance_loss_mlp": 1.02281344, "epoch": 0.9546370058620172, "flos": 22491499726080.0, "grad_norm": 1.864175160647524, "language_loss": 0.71021724, "learning_rate": 2.1519481974650035e-08, "loss": 0.73126197, "num_input_tokens_seen": 342470390, "step": 15878, "time_per_iteration": 4.388718843460083 }, { "auxiliary_loss_clip": 0.01104891, "auxiliary_loss_mlp": 0.01030745, "balance_loss_clip": 1.03498161, "balance_loss_mlp": 1.01849008, "epoch": 0.9546971291146851, "flos": 24609582961920.0, "grad_norm": 1.3169717238469103, "language_loss": 0.67999732, "learning_rate": 2.1462541216548335e-08, "loss": 0.70135367, "num_input_tokens_seen": 342492560, "step": 15879, "time_per_iteration": 2.634164571762085 }, { "auxiliary_loss_clip": 0.01071861, "auxiliary_loss_mlp": 0.0076975, "balance_loss_clip": 1.03325868, "balance_loss_mlp": 1.00017679, "epoch": 0.9547572523673531, "flos": 28657756627200.0, "grad_norm": 1.892359769973216, "language_loss": 0.84921825, "learning_rate": 2.1405675484785334e-08, "loss": 0.86763442, "num_input_tokens_seen": 342512315, "step": 15880, "time_per_iteration": 2.7207343578338623 }, { "auxiliary_loss_clip": 0.01043217, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.03212571, "balance_loss_mlp": 1.02152514, "epoch": 0.954817375620021, "flos": 33802606385280.0, "grad_norm": 1.832275853665566, "language_loss": 0.7208662, "learning_rate": 2.134888478151753e-08, "loss": 0.74164283, "num_input_tokens_seen": 342533060, "step": 15881, "time_per_iteration": 3.034219980239868 }, { "auxiliary_loss_clip": 0.01097589, "auxiliary_loss_mlp": 0.01035091, "balance_loss_clip": 1.03802884, "balance_loss_mlp": 1.02235389, "epoch": 0.954877498872689, "flos": 14428225843200.0, "grad_norm": 1.8582437523117474, "language_loss": 0.71399862, "learning_rate": 2.1292169108898083e-08, "loss": 0.73532546, "num_input_tokens_seen": 342550830, "step": 15882, "time_per_iteration": 2.5682435035705566 }, { "auxiliary_loss_clip": 0.0108781, "auxiliary_loss_mlp": 0.0103464, "balance_loss_clip": 1.0364579, "balance_loss_mlp": 1.02317178, "epoch": 0.9549376221253569, "flos": 59269447336320.0, "grad_norm": 1.5893740552045255, "language_loss": 0.65766758, "learning_rate": 2.1235528469078168e-08, "loss": 0.67889214, "num_input_tokens_seen": 342575070, "step": 15883, "time_per_iteration": 3.0329291820526123 }, { "auxiliary_loss_clip": 0.01099334, "auxiliary_loss_mlp": 0.0103149, "balance_loss_clip": 1.03810024, "balance_loss_mlp": 1.01847863, "epoch": 0.954997745378025, "flos": 17274397760640.0, "grad_norm": 2.161620424639411, "language_loss": 0.78009343, "learning_rate": 2.1178962864205175e-08, "loss": 0.80140173, "num_input_tokens_seen": 342592215, "step": 15884, "time_per_iteration": 2.62176513671875 }, { "auxiliary_loss_clip": 0.01109312, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.03558671, "balance_loss_mlp": 1.01618683, "epoch": 0.955057868630693, "flos": 13006378903680.0, "grad_norm": 1.803007960356649, "language_loss": 0.77870518, "learning_rate": 2.1122472296424054e-08, "loss": 0.80009007, "num_input_tokens_seen": 342610030, "step": 15885, "time_per_iteration": 2.5647974014282227 }, { "auxiliary_loss_clip": 0.01108326, "auxiliary_loss_mlp": 0.01033941, "balance_loss_clip": 1.03567576, "balance_loss_mlp": 1.02171004, "epoch": 0.9551179918833609, "flos": 22637692080000.0, "grad_norm": 1.6846495820783678, "language_loss": 0.69959128, "learning_rate": 2.1066056767877317e-08, "loss": 0.7210139, "num_input_tokens_seen": 342626475, "step": 15886, "time_per_iteration": 2.6008517742156982 }, { "auxiliary_loss_clip": 0.01074503, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.03510761, "balance_loss_mlp": 1.02006149, "epoch": 0.9551781151360289, "flos": 21542811667200.0, "grad_norm": 1.590980896681407, "language_loss": 0.72832477, "learning_rate": 2.1009716280703916e-08, "loss": 0.74940896, "num_input_tokens_seen": 342646645, "step": 15887, "time_per_iteration": 2.6831438541412354 }, { "auxiliary_loss_clip": 0.01084236, "auxiliary_loss_mlp": 0.01031385, "balance_loss_clip": 1.03418398, "balance_loss_mlp": 1.01973784, "epoch": 0.9552382383886968, "flos": 20702250524160.0, "grad_norm": 1.933274372299018, "language_loss": 0.5720163, "learning_rate": 2.0953450837040364e-08, "loss": 0.59317255, "num_input_tokens_seen": 342663615, "step": 15888, "time_per_iteration": 2.630725860595703 }, { "auxiliary_loss_clip": 0.01019029, "auxiliary_loss_mlp": 0.01004801, "balance_loss_clip": 1.00631261, "balance_loss_mlp": 1.0038352, "epoch": 0.9552983616413648, "flos": 67769792887680.0, "grad_norm": 0.7050649141864813, "language_loss": 0.57804728, "learning_rate": 2.0897260439020514e-08, "loss": 0.59828568, "num_input_tokens_seen": 342728275, "step": 15889, "time_per_iteration": 3.214216470718384 }, { "auxiliary_loss_clip": 0.01108889, "auxiliary_loss_mlp": 0.01030731, "balance_loss_clip": 1.03501582, "balance_loss_mlp": 1.01774263, "epoch": 0.9553584848940327, "flos": 21579979265280.0, "grad_norm": 1.4933948635050138, "language_loss": 0.6719625, "learning_rate": 2.084114508877466e-08, "loss": 0.69335872, "num_input_tokens_seen": 342748860, "step": 15890, "time_per_iteration": 2.600853443145752 }, { "auxiliary_loss_clip": 0.01108529, "auxiliary_loss_mlp": 0.01026392, "balance_loss_clip": 1.03781927, "balance_loss_mlp": 1.01449537, "epoch": 0.9554186081467008, "flos": 24208173498240.0, "grad_norm": 1.434726031550495, "language_loss": 0.74308884, "learning_rate": 2.0785104788430874e-08, "loss": 0.76443803, "num_input_tokens_seen": 342769705, "step": 15891, "time_per_iteration": 2.604349374771118 }, { "auxiliary_loss_clip": 0.01069647, "auxiliary_loss_mlp": 0.01028677, "balance_loss_clip": 1.03273499, "balance_loss_mlp": 1.01785886, "epoch": 0.9554787313993687, "flos": 16251554073600.0, "grad_norm": 1.905721456172026, "language_loss": 0.77943361, "learning_rate": 2.072913954011435e-08, "loss": 0.80041689, "num_input_tokens_seen": 342787000, "step": 15892, "time_per_iteration": 2.6338727474212646 }, { "auxiliary_loss_clip": 0.0110724, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.03596187, "balance_loss_mlp": 1.01690435, "epoch": 0.9555388546520367, "flos": 23404133508480.0, "grad_norm": 1.3333935754800896, "language_loss": 0.6973961, "learning_rate": 2.0673249345947386e-08, "loss": 0.71876323, "num_input_tokens_seen": 342807795, "step": 15893, "time_per_iteration": 2.64900803565979 }, { "auxiliary_loss_clip": 0.01089703, "auxiliary_loss_mlp": 0.00770181, "balance_loss_clip": 1.03906655, "balance_loss_mlp": 1.00022793, "epoch": 0.9555989779047046, "flos": 14794047907200.0, "grad_norm": 1.8780898151887826, "language_loss": 0.65497565, "learning_rate": 2.0617434208048955e-08, "loss": 0.67357445, "num_input_tokens_seen": 342825490, "step": 15894, "time_per_iteration": 2.640239953994751 }, { "auxiliary_loss_clip": 0.01098184, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.03628969, "balance_loss_mlp": 1.01997232, "epoch": 0.9556591011573726, "flos": 22236749493120.0, "grad_norm": 1.917235716935355, "language_loss": 0.82155561, "learning_rate": 2.056169412853581e-08, "loss": 0.84286571, "num_input_tokens_seen": 342844965, "step": 15895, "time_per_iteration": 2.605703592300415 }, { "auxiliary_loss_clip": 0.01083186, "auxiliary_loss_mlp": 0.01029801, "balance_loss_clip": 1.0364809, "balance_loss_mlp": 1.01701021, "epoch": 0.9557192244100405, "flos": 27855296835840.0, "grad_norm": 1.5092096829888868, "language_loss": 0.72777927, "learning_rate": 2.0506029109521593e-08, "loss": 0.74890918, "num_input_tokens_seen": 342865915, "step": 15896, "time_per_iteration": 2.800420045852661 }, { "auxiliary_loss_clip": 0.01105404, "auxiliary_loss_mlp": 0.01031303, "balance_loss_clip": 1.03542614, "balance_loss_mlp": 1.01926875, "epoch": 0.9557793476627086, "flos": 17602800831360.0, "grad_norm": 1.8673042529516892, "language_loss": 0.79697645, "learning_rate": 2.045043915311706e-08, "loss": 0.81834352, "num_input_tokens_seen": 342884000, "step": 15897, "time_per_iteration": 2.58010196685791 }, { "auxiliary_loss_clip": 0.01081754, "auxiliary_loss_mlp": 0.0103489, "balance_loss_clip": 1.03217518, "balance_loss_mlp": 1.02133036, "epoch": 0.9558394709153766, "flos": 23875496709120.0, "grad_norm": 1.7392744855605553, "language_loss": 0.7268827, "learning_rate": 2.03949242614303e-08, "loss": 0.74804914, "num_input_tokens_seen": 342903095, "step": 15898, "time_per_iteration": 2.675769567489624 }, { "auxiliary_loss_clip": 0.010026, "auxiliary_loss_mlp": 0.01003805, "balance_loss_clip": 1.0089612, "balance_loss_mlp": 1.00289333, "epoch": 0.9558995941680445, "flos": 53682001171200.0, "grad_norm": 0.8518438349506685, "language_loss": 0.52328175, "learning_rate": 2.033948443656652e-08, "loss": 0.54334575, "num_input_tokens_seen": 342958155, "step": 15899, "time_per_iteration": 3.1892175674438477 }, { "auxiliary_loss_clip": 0.01101857, "auxiliary_loss_mlp": 0.010327, "balance_loss_clip": 1.03783405, "balance_loss_mlp": 1.01899076, "epoch": 0.9559597174207125, "flos": 13764488376960.0, "grad_norm": 2.0360903333402183, "language_loss": 0.68228984, "learning_rate": 2.028411968062782e-08, "loss": 0.70363533, "num_input_tokens_seen": 342972500, "step": 15900, "time_per_iteration": 2.5987586975097656 }, { "auxiliary_loss_clip": 0.0109791, "auxiliary_loss_mlp": 0.0077014, "balance_loss_clip": 1.03479004, "balance_loss_mlp": 1.00019574, "epoch": 0.9560198406733804, "flos": 19936347799680.0, "grad_norm": 2.313544780745396, "language_loss": 0.83186281, "learning_rate": 2.0228829995713627e-08, "loss": 0.85054326, "num_input_tokens_seen": 342989035, "step": 15901, "time_per_iteration": 2.593118667602539 }, { "auxiliary_loss_clip": 0.00997227, "auxiliary_loss_mlp": 0.01005499, "balance_loss_clip": 1.00669014, "balance_loss_mlp": 1.00429535, "epoch": 0.9560799639260484, "flos": 57289550699520.0, "grad_norm": 0.70780862037499, "language_loss": 0.54323339, "learning_rate": 2.0173615383920485e-08, "loss": 0.56326067, "num_input_tokens_seen": 343051675, "step": 15902, "time_per_iteration": 3.3085649013519287 }, { "auxiliary_loss_clip": 0.01086623, "auxiliary_loss_mlp": 0.01032307, "balance_loss_clip": 1.03854203, "balance_loss_mlp": 1.02167904, "epoch": 0.9561400871787163, "flos": 18917167299840.0, "grad_norm": 1.7381730365709078, "language_loss": 0.8538748, "learning_rate": 2.01184758473425e-08, "loss": 0.87506413, "num_input_tokens_seen": 343068895, "step": 15903, "time_per_iteration": 2.6358137130737305 }, { "auxiliary_loss_clip": 0.01082056, "auxiliary_loss_mlp": 0.00772044, "balance_loss_clip": 1.03525889, "balance_loss_mlp": 1.00020206, "epoch": 0.9562002104313844, "flos": 18038576632320.0, "grad_norm": 1.7984010377221487, "language_loss": 0.80295885, "learning_rate": 2.0063411388070217e-08, "loss": 0.82149988, "num_input_tokens_seen": 343087115, "step": 15904, "time_per_iteration": 2.7303686141967773 }, { "auxiliary_loss_clip": 0.01098663, "auxiliary_loss_mlp": 0.01031681, "balance_loss_clip": 1.03495884, "balance_loss_mlp": 1.01843715, "epoch": 0.9562603336840523, "flos": 24717673964160.0, "grad_norm": 4.96352662326913, "language_loss": 0.60007298, "learning_rate": 2.0008422008191972e-08, "loss": 0.6213764, "num_input_tokens_seen": 343105575, "step": 15905, "time_per_iteration": 2.655217170715332 }, { "auxiliary_loss_clip": 0.01096188, "auxiliary_loss_mlp": 0.01028565, "balance_loss_clip": 1.03525162, "balance_loss_mlp": 1.01632822, "epoch": 0.9563204569367203, "flos": 21177205084800.0, "grad_norm": 1.9515891856264378, "language_loss": 0.70387208, "learning_rate": 1.995350770979254e-08, "loss": 0.72511959, "num_input_tokens_seen": 343123025, "step": 15906, "time_per_iteration": 2.6145029067993164 }, { "auxiliary_loss_clip": 0.01055579, "auxiliary_loss_mlp": 0.01030633, "balance_loss_clip": 1.03260493, "balance_loss_mlp": 1.01775253, "epoch": 0.9563805801893882, "flos": 20229738088320.0, "grad_norm": 1.7038523385332285, "language_loss": 0.70973694, "learning_rate": 1.9898668494954473e-08, "loss": 0.73059911, "num_input_tokens_seen": 343141625, "step": 15907, "time_per_iteration": 2.831192970275879 }, { "auxiliary_loss_clip": 0.0106678, "auxiliary_loss_mlp": 0.01031766, "balance_loss_clip": 1.03346992, "balance_loss_mlp": 1.01988053, "epoch": 0.9564407034420562, "flos": 25411001258880.0, "grad_norm": 1.938524110619909, "language_loss": 0.70548427, "learning_rate": 1.9843904365757447e-08, "loss": 0.72646976, "num_input_tokens_seen": 343161300, "step": 15908, "time_per_iteration": 2.704686164855957 }, { "auxiliary_loss_clip": 0.01085855, "auxiliary_loss_mlp": 0.00770126, "balance_loss_clip": 1.03650773, "balance_loss_mlp": 1.00021219, "epoch": 0.9565008266947241, "flos": 18623884752000.0, "grad_norm": 1.9260740881631984, "language_loss": 0.83019876, "learning_rate": 1.978921532427802e-08, "loss": 0.84875852, "num_input_tokens_seen": 343177815, "step": 15909, "time_per_iteration": 2.6200265884399414 }, { "auxiliary_loss_clip": 0.01096482, "auxiliary_loss_mlp": 0.01033356, "balance_loss_clip": 1.03509748, "balance_loss_mlp": 1.02116704, "epoch": 0.9565609499473922, "flos": 24862142465280.0, "grad_norm": 2.1314572111323717, "language_loss": 0.67602086, "learning_rate": 1.9734601372590086e-08, "loss": 0.69731927, "num_input_tokens_seen": 343198140, "step": 15910, "time_per_iteration": 2.6983892917633057 }, { "auxiliary_loss_clip": 0.01101245, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.03767276, "balance_loss_mlp": 1.02156329, "epoch": 0.9566210732000601, "flos": 21798459740160.0, "grad_norm": 1.6976880535824044, "language_loss": 0.74343169, "learning_rate": 1.968006251276444e-08, "loss": 0.76478493, "num_input_tokens_seen": 343218280, "step": 15911, "time_per_iteration": 2.6060009002685547 }, { "auxiliary_loss_clip": 0.01096979, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.03550327, "balance_loss_mlp": 1.01809359, "epoch": 0.9566811964527281, "flos": 18697609416960.0, "grad_norm": 2.080677167597926, "language_loss": 0.69605064, "learning_rate": 1.9625598746869198e-08, "loss": 0.71732366, "num_input_tokens_seen": 343236850, "step": 15912, "time_per_iteration": 4.122835874557495 }, { "auxiliary_loss_clip": 0.01086077, "auxiliary_loss_mlp": 0.01036299, "balance_loss_clip": 1.03359342, "balance_loss_mlp": 1.02379942, "epoch": 0.9567413197053961, "flos": 13000632727680.0, "grad_norm": 2.5288406213063466, "language_loss": 0.72268087, "learning_rate": 1.95712100769696e-08, "loss": 0.74390459, "num_input_tokens_seen": 343253065, "step": 15913, "time_per_iteration": 4.12858247756958 }, { "auxiliary_loss_clip": 0.01026666, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.03391707, "balance_loss_mlp": 1.02069664, "epoch": 0.956801442958064, "flos": 19719267955200.0, "grad_norm": 20.323205931222148, "language_loss": 0.73863947, "learning_rate": 1.9516896505128444e-08, "loss": 0.75923109, "num_input_tokens_seen": 343270330, "step": 15914, "time_per_iteration": 2.7809512615203857 }, { "auxiliary_loss_clip": 0.01107977, "auxiliary_loss_mlp": 0.01030147, "balance_loss_clip": 1.0365274, "balance_loss_mlp": 1.01769543, "epoch": 0.956861566210732, "flos": 18222834424320.0, "grad_norm": 1.3806320366510194, "language_loss": 0.67305696, "learning_rate": 1.9462658033404965e-08, "loss": 0.69443822, "num_input_tokens_seen": 343289625, "step": 15915, "time_per_iteration": 4.22941780090332 }, { "auxiliary_loss_clip": 0.01092649, "auxiliary_loss_mlp": 0.01028226, "balance_loss_clip": 1.03482556, "balance_loss_mlp": 1.01616824, "epoch": 0.9569216894634, "flos": 22196960202240.0, "grad_norm": 1.7476602306443554, "language_loss": 0.64463937, "learning_rate": 1.9408494663855967e-08, "loss": 0.66584814, "num_input_tokens_seen": 343309200, "step": 15916, "time_per_iteration": 2.5847983360290527 }, { "auxiliary_loss_clip": 0.0110232, "auxiliary_loss_mlp": 0.01028805, "balance_loss_clip": 1.03600883, "balance_loss_mlp": 1.01722336, "epoch": 0.956981812716068, "flos": 21689291329920.0, "grad_norm": 1.8359549722537702, "language_loss": 0.80332065, "learning_rate": 1.935440639853536e-08, "loss": 0.82463187, "num_input_tokens_seen": 343326270, "step": 15917, "time_per_iteration": 2.5821292400360107 }, { "auxiliary_loss_clip": 0.01077457, "auxiliary_loss_mlp": 0.01034662, "balance_loss_clip": 1.0340178, "balance_loss_mlp": 1.02204955, "epoch": 0.9570419359687359, "flos": 13990905757440.0, "grad_norm": 1.923592863089018, "language_loss": 0.73075807, "learning_rate": 1.9300393239494172e-08, "loss": 0.75187922, "num_input_tokens_seen": 343344430, "step": 15918, "time_per_iteration": 4.2131946086883545 }, { "auxiliary_loss_clip": 0.01002537, "auxiliary_loss_mlp": 0.0100177, "balance_loss_clip": 1.0084734, "balance_loss_mlp": 1.00083399, "epoch": 0.9571020592214039, "flos": 65196938534400.0, "grad_norm": 0.6358192020761055, "language_loss": 0.53063756, "learning_rate": 1.924645518878032e-08, "loss": 0.5506807, "num_input_tokens_seen": 343416155, "step": 15919, "time_per_iteration": 3.3149333000183105 }, { "auxiliary_loss_clip": 0.01106277, "auxiliary_loss_mlp": 0.01037331, "balance_loss_clip": 1.04041994, "balance_loss_mlp": 1.02374721, "epoch": 0.9571621824740718, "flos": 17384068961280.0, "grad_norm": 50.750331888616735, "language_loss": 0.74972582, "learning_rate": 1.919259224843972e-08, "loss": 0.77116191, "num_input_tokens_seen": 343431715, "step": 15920, "time_per_iteration": 2.6216814517974854 }, { "auxiliary_loss_clip": 0.01074302, "auxiliary_loss_mlp": 0.01033346, "balance_loss_clip": 1.03634095, "balance_loss_mlp": 1.02012527, "epoch": 0.9572223057267398, "flos": 14538184352640.0, "grad_norm": 1.6187560061674033, "language_loss": 0.7876358, "learning_rate": 1.9138804420514298e-08, "loss": 0.80871224, "num_input_tokens_seen": 343450425, "step": 15921, "time_per_iteration": 2.6776888370513916 }, { "auxiliary_loss_clip": 0.01102004, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.03531837, "balance_loss_mlp": 1.01865196, "epoch": 0.9572824289794077, "flos": 33947793158400.0, "grad_norm": 2.2537365266474776, "language_loss": 0.51078975, "learning_rate": 1.9085091707044197e-08, "loss": 0.53213173, "num_input_tokens_seen": 343470445, "step": 15922, "time_per_iteration": 2.7087700366973877 }, { "auxiliary_loss_clip": 0.01055425, "auxiliary_loss_mlp": 0.01043646, "balance_loss_clip": 1.0309701, "balance_loss_mlp": 1.02935278, "epoch": 0.9573425522320758, "flos": 18694915896960.0, "grad_norm": 1.958282285271952, "language_loss": 0.84238583, "learning_rate": 1.903145411006557e-08, "loss": 0.86337662, "num_input_tokens_seen": 343485200, "step": 15923, "time_per_iteration": 2.6815152168273926 }, { "auxiliary_loss_clip": 0.010812, "auxiliary_loss_mlp": 0.01036534, "balance_loss_clip": 1.03293872, "balance_loss_mlp": 1.02475667, "epoch": 0.9574026754847437, "flos": 28510307297280.0, "grad_norm": 1.538843441694693, "language_loss": 0.75172049, "learning_rate": 1.8977891631613008e-08, "loss": 0.77289784, "num_input_tokens_seen": 343505080, "step": 15924, "time_per_iteration": 2.7213785648345947 }, { "auxiliary_loss_clip": 0.01087824, "auxiliary_loss_mlp": 0.01031969, "balance_loss_clip": 1.03699768, "balance_loss_mlp": 1.01958311, "epoch": 0.9574627987374117, "flos": 24352390604160.0, "grad_norm": 2.227622693008034, "language_loss": 0.86090326, "learning_rate": 1.892440427371711e-08, "loss": 0.88210118, "num_input_tokens_seen": 343523995, "step": 15925, "time_per_iteration": 2.8542959690093994 }, { "auxiliary_loss_clip": 0.01079041, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.03561556, "balance_loss_mlp": 1.02103734, "epoch": 0.9575229219900797, "flos": 23510680225920.0, "grad_norm": 2.011468601980382, "language_loss": 0.75676179, "learning_rate": 1.8870992038406474e-08, "loss": 0.77789414, "num_input_tokens_seen": 343542015, "step": 15926, "time_per_iteration": 2.7330782413482666 }, { "auxiliary_loss_clip": 0.0108326, "auxiliary_loss_mlp": 0.01031405, "balance_loss_clip": 1.03742075, "balance_loss_mlp": 1.0197587, "epoch": 0.9575830452427476, "flos": 22674823764480.0, "grad_norm": 2.0888079382528333, "language_loss": 0.77707171, "learning_rate": 1.8817654927706373e-08, "loss": 0.79821837, "num_input_tokens_seen": 343561680, "step": 15927, "time_per_iteration": 2.704115390777588 }, { "auxiliary_loss_clip": 0.01063185, "auxiliary_loss_mlp": 0.01031462, "balance_loss_clip": 1.03502405, "balance_loss_mlp": 1.0171268, "epoch": 0.9576431684954156, "flos": 30485250835200.0, "grad_norm": 1.8055478063953943, "language_loss": 0.68572605, "learning_rate": 1.8764392943639183e-08, "loss": 0.70667255, "num_input_tokens_seen": 343585290, "step": 15928, "time_per_iteration": 2.8810582160949707 }, { "auxiliary_loss_clip": 0.01089186, "auxiliary_loss_mlp": 0.01031202, "balance_loss_clip": 1.03828859, "balance_loss_mlp": 1.018381, "epoch": 0.9577032917480836, "flos": 21687387909120.0, "grad_norm": 1.5717055472294992, "language_loss": 0.822155, "learning_rate": 1.871120608822485e-08, "loss": 0.84335887, "num_input_tokens_seen": 343604045, "step": 15929, "time_per_iteration": 2.6657960414886475 }, { "auxiliary_loss_clip": 0.01077088, "auxiliary_loss_mlp": 0.01046119, "balance_loss_clip": 1.03563619, "balance_loss_mlp": 1.03236794, "epoch": 0.9577634150007516, "flos": 29023147728000.0, "grad_norm": 1.8215200487797032, "language_loss": 0.72274351, "learning_rate": 1.8658094363480202e-08, "loss": 0.74397558, "num_input_tokens_seen": 343626595, "step": 15930, "time_per_iteration": 2.795675277709961 }, { "auxiliary_loss_clip": 0.0103609, "auxiliary_loss_mlp": 0.01032785, "balance_loss_clip": 1.03149176, "balance_loss_mlp": 1.01960611, "epoch": 0.9578235382534195, "flos": 19282235178240.0, "grad_norm": 1.4066251693487615, "language_loss": 0.62494546, "learning_rate": 1.8605057771419185e-08, "loss": 0.64563417, "num_input_tokens_seen": 343646195, "step": 15931, "time_per_iteration": 2.7418274879455566 }, { "auxiliary_loss_clip": 0.01106716, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.03693795, "balance_loss_mlp": 1.01945996, "epoch": 0.9578836615060875, "flos": 13699275235200.0, "grad_norm": 2.1628834321357746, "language_loss": 0.69288397, "learning_rate": 1.8552096314052633e-08, "loss": 0.71426117, "num_input_tokens_seen": 343663665, "step": 15932, "time_per_iteration": 2.6367006301879883 }, { "auxiliary_loss_clip": 0.01080267, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 1.0359807, "balance_loss_mlp": 1.02269387, "epoch": 0.9579437847587554, "flos": 17054516655360.0, "grad_norm": 1.883637531567824, "language_loss": 0.75359249, "learning_rate": 1.849920999338961e-08, "loss": 0.77476406, "num_input_tokens_seen": 343682145, "step": 15933, "time_per_iteration": 2.692196846008301 }, { "auxiliary_loss_clip": 0.00998155, "auxiliary_loss_mlp": 0.00999946, "balance_loss_clip": 1.01311505, "balance_loss_mlp": 0.99865836, "epoch": 0.9580039080114234, "flos": 60570887886720.0, "grad_norm": 0.7032232478309851, "language_loss": 0.57280135, "learning_rate": 1.8446398811434948e-08, "loss": 0.59278238, "num_input_tokens_seen": 343744685, "step": 15934, "time_per_iteration": 3.389955997467041 }, { "auxiliary_loss_clip": 0.01027383, "auxiliary_loss_mlp": 0.00751072, "balance_loss_clip": 1.00506508, "balance_loss_mlp": 0.99959391, "epoch": 0.9580640312640913, "flos": 66235365745920.0, "grad_norm": 0.9122482390110158, "language_loss": 0.65885007, "learning_rate": 1.8393662770191277e-08, "loss": 0.67663455, "num_input_tokens_seen": 343801835, "step": 15935, "time_per_iteration": 3.0524590015411377 }, { "auxiliary_loss_clip": 0.01007227, "auxiliary_loss_mlp": 0.01002986, "balance_loss_clip": 1.00997615, "balance_loss_mlp": 1.00185907, "epoch": 0.9581241545167594, "flos": 62218002971520.0, "grad_norm": 0.7845197827637053, "language_loss": 0.57026505, "learning_rate": 1.8341001871658546e-08, "loss": 0.5903672, "num_input_tokens_seen": 343861515, "step": 15936, "time_per_iteration": 3.161888837814331 }, { "auxiliary_loss_clip": 0.01048485, "auxiliary_loss_mlp": 0.0103042, "balance_loss_clip": 1.03310895, "balance_loss_mlp": 1.01798666, "epoch": 0.9581842777694273, "flos": 23768088065280.0, "grad_norm": 1.4930030330503186, "language_loss": 0.78472948, "learning_rate": 1.8288416117833825e-08, "loss": 0.80551857, "num_input_tokens_seen": 343881240, "step": 15937, "time_per_iteration": 2.777000665664673 }, { "auxiliary_loss_clip": 0.01096104, "auxiliary_loss_mlp": 0.01032818, "balance_loss_clip": 1.03548694, "balance_loss_mlp": 1.01956844, "epoch": 0.9582444010220953, "flos": 21213079793280.0, "grad_norm": 1.6538903091453836, "language_loss": 0.6840139, "learning_rate": 1.8235905510710636e-08, "loss": 0.70530319, "num_input_tokens_seen": 343900885, "step": 15938, "time_per_iteration": 2.638640880584717 }, { "auxiliary_loss_clip": 0.01076145, "auxiliary_loss_mlp": 0.01029859, "balance_loss_clip": 1.0352782, "balance_loss_mlp": 1.01772964, "epoch": 0.9583045242747633, "flos": 23805147922560.0, "grad_norm": 3.037057978485483, "language_loss": 0.6558556, "learning_rate": 1.8183470052280712e-08, "loss": 0.67691565, "num_input_tokens_seen": 343918460, "step": 15939, "time_per_iteration": 2.8998749256134033 }, { "auxiliary_loss_clip": 0.01079284, "auxiliary_loss_mlp": 0.01037887, "balance_loss_clip": 1.03366089, "balance_loss_mlp": 1.02551913, "epoch": 0.9583646475274312, "flos": 24131468004480.0, "grad_norm": 1.828143592735246, "language_loss": 0.73795086, "learning_rate": 1.8131109744532025e-08, "loss": 0.75912249, "num_input_tokens_seen": 343938030, "step": 15940, "time_per_iteration": 2.8199172019958496 }, { "auxiliary_loss_clip": 0.01109084, "auxiliary_loss_mlp": 0.01033619, "balance_loss_clip": 1.03673601, "balance_loss_mlp": 1.02029765, "epoch": 0.9584247707800992, "flos": 20886651970560.0, "grad_norm": 2.719526095639371, "language_loss": 0.72758561, "learning_rate": 1.8078824589450535e-08, "loss": 0.74901259, "num_input_tokens_seen": 343956635, "step": 15941, "time_per_iteration": 2.580655097961426 }, { "auxiliary_loss_clip": 0.01087013, "auxiliary_loss_mlp": 0.01036207, "balance_loss_clip": 1.03728151, "balance_loss_mlp": 1.02411318, "epoch": 0.9584848940327672, "flos": 26067591918720.0, "grad_norm": 2.197358569248491, "language_loss": 0.7112202, "learning_rate": 1.8026614589018442e-08, "loss": 0.73245239, "num_input_tokens_seen": 343976625, "step": 15942, "time_per_iteration": 2.6756019592285156 }, { "auxiliary_loss_clip": 0.0110919, "auxiliary_loss_mlp": 0.01033733, "balance_loss_clip": 1.03630304, "balance_loss_mlp": 1.02057219, "epoch": 0.9585450172854352, "flos": 34492988764800.0, "grad_norm": 1.5723180530156076, "language_loss": 0.72362167, "learning_rate": 1.797447974521571e-08, "loss": 0.74505079, "num_input_tokens_seen": 343997790, "step": 15943, "time_per_iteration": 2.6411077976226807 }, { "auxiliary_loss_clip": 0.0110037, "auxiliary_loss_mlp": 0.01037182, "balance_loss_clip": 1.03692496, "balance_loss_mlp": 1.02418852, "epoch": 0.9586051405381031, "flos": 23110743219840.0, "grad_norm": 2.3152366176868036, "language_loss": 0.68444526, "learning_rate": 1.792242006001965e-08, "loss": 0.7058208, "num_input_tokens_seen": 344016935, "step": 15944, "time_per_iteration": 2.608394145965576 }, { "auxiliary_loss_clip": 0.0110797, "auxiliary_loss_mlp": 0.01034536, "balance_loss_clip": 1.03546226, "balance_loss_mlp": 1.02167356, "epoch": 0.9586652637907711, "flos": 19603994232960.0, "grad_norm": 1.6133021726163184, "language_loss": 0.66145849, "learning_rate": 1.7870435535403795e-08, "loss": 0.6828835, "num_input_tokens_seen": 344035590, "step": 15945, "time_per_iteration": 2.590332508087158 }, { "auxiliary_loss_clip": 0.00971603, "auxiliary_loss_mlp": 0.01001306, "balance_loss_clip": 1.01690745, "balance_loss_mlp": 1.00031078, "epoch": 0.958725387043439, "flos": 72073327317120.0, "grad_norm": 0.7794478770145054, "language_loss": 0.61829185, "learning_rate": 1.7818526173339678e-08, "loss": 0.63802093, "num_input_tokens_seen": 344100845, "step": 15946, "time_per_iteration": 3.602818489074707 }, { "auxiliary_loss_clip": 0.0110601, "auxiliary_loss_mlp": 0.01029414, "balance_loss_clip": 1.03621078, "balance_loss_mlp": 1.01780248, "epoch": 0.958785510296107, "flos": 28911932242560.0, "grad_norm": 2.157451201161463, "language_loss": 0.7515372, "learning_rate": 1.7766691975795723e-08, "loss": 0.7728914, "num_input_tokens_seen": 344121780, "step": 15947, "time_per_iteration": 2.7516565322875977 }, { "auxiliary_loss_clip": 0.01080438, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.03239012, "balance_loss_mlp": 1.01584816, "epoch": 0.958845633548775, "flos": 18477189607680.0, "grad_norm": 2.209516368331818, "language_loss": 0.69477844, "learning_rate": 1.771493294473747e-08, "loss": 0.71585929, "num_input_tokens_seen": 344140150, "step": 15948, "time_per_iteration": 2.6244988441467285 }, { "auxiliary_loss_clip": 0.01057363, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.03523755, "balance_loss_mlp": 1.02053475, "epoch": 0.958905756801443, "flos": 24206916522240.0, "grad_norm": 2.7387902232592944, "language_loss": 0.78748626, "learning_rate": 1.7663249082127574e-08, "loss": 0.80838478, "num_input_tokens_seen": 344158200, "step": 15949, "time_per_iteration": 2.7260756492614746 }, { "auxiliary_loss_clip": 0.01111297, "auxiliary_loss_mlp": 0.01033396, "balance_loss_clip": 1.03865027, "balance_loss_mlp": 1.0205152, "epoch": 0.9589658800541109, "flos": 25007939769600.0, "grad_norm": 2.2665397116809634, "language_loss": 0.68637884, "learning_rate": 1.761164038992602e-08, "loss": 0.70782578, "num_input_tokens_seen": 344174720, "step": 15950, "time_per_iteration": 2.5775585174560547 }, { "auxiliary_loss_clip": 0.01089548, "auxiliary_loss_mlp": 0.0103189, "balance_loss_clip": 1.03689277, "balance_loss_mlp": 1.02061236, "epoch": 0.9590260033067789, "flos": 23514558894720.0, "grad_norm": 1.7626457055742824, "language_loss": 0.8612389, "learning_rate": 1.7560106870089687e-08, "loss": 0.88245326, "num_input_tokens_seen": 344192580, "step": 15951, "time_per_iteration": 2.691873550415039 }, { "auxiliary_loss_clip": 0.01085942, "auxiliary_loss_mlp": 0.01037676, "balance_loss_clip": 1.03608108, "balance_loss_mlp": 1.02455699, "epoch": 0.9590861265594469, "flos": 25520349237120.0, "grad_norm": 2.7020454087453434, "language_loss": 0.79673147, "learning_rate": 1.7508648524572568e-08, "loss": 0.81796771, "num_input_tokens_seen": 344210345, "step": 15952, "time_per_iteration": 5.9034318923950195 }, { "auxiliary_loss_clip": 0.01098084, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.03800857, "balance_loss_mlp": 1.01903152, "epoch": 0.9591462498121148, "flos": 21179323987200.0, "grad_norm": 1.6067487690898035, "language_loss": 0.69543386, "learning_rate": 1.7457265355326434e-08, "loss": 0.71673763, "num_input_tokens_seen": 344229540, "step": 15953, "time_per_iteration": 2.684041976928711 }, { "auxiliary_loss_clip": 0.0104366, "auxiliary_loss_mlp": 0.01036069, "balance_loss_clip": 1.03161478, "balance_loss_mlp": 1.02161503, "epoch": 0.9592063730647828, "flos": 21723047136000.0, "grad_norm": 2.5164695030249096, "language_loss": 0.58295131, "learning_rate": 1.7405957364299285e-08, "loss": 0.60374862, "num_input_tokens_seen": 344247830, "step": 15954, "time_per_iteration": 4.495413064956665 }, { "auxiliary_loss_clip": 0.01098901, "auxiliary_loss_mlp": 0.01035688, "balance_loss_clip": 1.03649294, "balance_loss_mlp": 1.02196693, "epoch": 0.9592664963174508, "flos": 29891395278720.0, "grad_norm": 2.3199338562306027, "language_loss": 0.74007273, "learning_rate": 1.7354724553437117e-08, "loss": 0.76141858, "num_input_tokens_seen": 344267760, "step": 15955, "time_per_iteration": 2.659421443939209 }, { "auxiliary_loss_clip": 0.0108768, "auxiliary_loss_mlp": 0.0103626, "balance_loss_clip": 1.03656662, "balance_loss_mlp": 1.02313542, "epoch": 0.9593266195701188, "flos": 17999613354240.0, "grad_norm": 1.8573128231358735, "language_loss": 0.62227011, "learning_rate": 1.7303566924682378e-08, "loss": 0.64350951, "num_input_tokens_seen": 344284905, "step": 15956, "time_per_iteration": 2.6006531715393066 }, { "auxiliary_loss_clip": 0.01071121, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.03647816, "balance_loss_mlp": 1.01758385, "epoch": 0.9593867428227867, "flos": 18838271076480.0, "grad_norm": 1.7774918774932997, "language_loss": 0.59834391, "learning_rate": 1.725248447997507e-08, "loss": 0.61935902, "num_input_tokens_seen": 344302025, "step": 15957, "time_per_iteration": 4.193215847015381 }, { "auxiliary_loss_clip": 0.0107309, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.03605032, "balance_loss_mlp": 1.02551365, "epoch": 0.9594468660754547, "flos": 29567050444800.0, "grad_norm": 1.9567714575730066, "language_loss": 0.74019581, "learning_rate": 1.7201477221252314e-08, "loss": 0.76131284, "num_input_tokens_seen": 344321935, "step": 15958, "time_per_iteration": 2.7699391841888428 }, { "auxiliary_loss_clip": 0.01084783, "auxiliary_loss_mlp": 0.00770183, "balance_loss_clip": 1.03334033, "balance_loss_mlp": 1.00015187, "epoch": 0.9595069893281226, "flos": 20703256104960.0, "grad_norm": 1.5869581385449567, "language_loss": 0.74366057, "learning_rate": 1.7150545150448116e-08, "loss": 0.76221019, "num_input_tokens_seen": 344340405, "step": 15959, "time_per_iteration": 2.6944100856781006 }, { "auxiliary_loss_clip": 0.01095064, "auxiliary_loss_mlp": 0.01030122, "balance_loss_clip": 1.03618193, "balance_loss_mlp": 1.01764679, "epoch": 0.9595671125807906, "flos": 22453613856000.0, "grad_norm": 2.137039778819024, "language_loss": 0.65102017, "learning_rate": 1.7099688269493816e-08, "loss": 0.67227197, "num_input_tokens_seen": 344359925, "step": 15960, "time_per_iteration": 2.6418590545654297 }, { "auxiliary_loss_clip": 0.01105547, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.03602636, "balance_loss_mlp": 1.02172852, "epoch": 0.9596272358334585, "flos": 23915214172800.0, "grad_norm": 1.6573536017589419, "language_loss": 0.78154403, "learning_rate": 1.7048906580318544e-08, "loss": 0.80294573, "num_input_tokens_seen": 344379100, "step": 15961, "time_per_iteration": 2.5798726081848145 }, { "auxiliary_loss_clip": 0.01064092, "auxiliary_loss_mlp": 0.01028572, "balance_loss_clip": 1.03795755, "balance_loss_mlp": 1.0165441, "epoch": 0.9596873590861266, "flos": 17672539086720.0, "grad_norm": 1.9079571548212244, "language_loss": 0.75957453, "learning_rate": 1.699820008484698e-08, "loss": 0.78050113, "num_input_tokens_seen": 344396895, "step": 15962, "time_per_iteration": 2.6588690280914307 }, { "auxiliary_loss_clip": 0.01089965, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.03721845, "balance_loss_mlp": 1.02023554, "epoch": 0.9597474823387945, "flos": 25808532053760.0, "grad_norm": 2.2256779220037965, "language_loss": 0.71570283, "learning_rate": 1.6947568785002698e-08, "loss": 0.73693591, "num_input_tokens_seen": 344415115, "step": 15963, "time_per_iteration": 2.6878271102905273 }, { "auxiliary_loss_clip": 0.01079235, "auxiliary_loss_mlp": 0.01033914, "balance_loss_clip": 1.03706479, "balance_loss_mlp": 1.02217138, "epoch": 0.9598076055914625, "flos": 23768519028480.0, "grad_norm": 1.5220973192302523, "language_loss": 0.74199623, "learning_rate": 1.689701268270527e-08, "loss": 0.76312768, "num_input_tokens_seen": 344435185, "step": 15964, "time_per_iteration": 2.6809606552124023 }, { "auxiliary_loss_clip": 0.00990624, "auxiliary_loss_mlp": 0.01004877, "balance_loss_clip": 1.00604916, "balance_loss_mlp": 1.00392365, "epoch": 0.9598677288441305, "flos": 56515962464640.0, "grad_norm": 0.8796440193210406, "language_loss": 0.57517397, "learning_rate": 1.684653177987161e-08, "loss": 0.59512901, "num_input_tokens_seen": 344488950, "step": 15965, "time_per_iteration": 3.202644109725952 }, { "auxiliary_loss_clip": 0.0110834, "auxiliary_loss_mlp": 0.01031239, "balance_loss_clip": 1.03589642, "balance_loss_mlp": 1.01969969, "epoch": 0.9599278520967984, "flos": 22997480659200.0, "grad_norm": 1.6089991513926745, "language_loss": 0.79091173, "learning_rate": 1.6796126078416627e-08, "loss": 0.81230754, "num_input_tokens_seen": 344506740, "step": 15966, "time_per_iteration": 2.545722723007202 }, { "auxiliary_loss_clip": 0.01080372, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.03162789, "balance_loss_mlp": 1.01910806, "epoch": 0.9599879753494664, "flos": 23039676161280.0, "grad_norm": 1.6133407382349438, "language_loss": 0.79225981, "learning_rate": 1.674579558025102e-08, "loss": 0.81337535, "num_input_tokens_seen": 344526670, "step": 15967, "time_per_iteration": 2.7037363052368164 }, { "auxiliary_loss_clip": 0.01052446, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.03173876, "balance_loss_mlp": 1.01654053, "epoch": 0.9600480986021344, "flos": 16392287560320.0, "grad_norm": 4.094831568098365, "language_loss": 0.80607283, "learning_rate": 1.669554028728348e-08, "loss": 0.82690465, "num_input_tokens_seen": 344541995, "step": 15968, "time_per_iteration": 2.6827492713928223 }, { "auxiliary_loss_clip": 0.01061685, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.0351243, "balance_loss_mlp": 1.02595484, "epoch": 0.9601082218548024, "flos": 24276439296000.0, "grad_norm": 2.307236682413655, "language_loss": 0.6711151, "learning_rate": 1.6645360201420044e-08, "loss": 0.69214165, "num_input_tokens_seen": 344559980, "step": 15969, "time_per_iteration": 2.709578037261963 }, { "auxiliary_loss_clip": 0.01097154, "auxiliary_loss_mlp": 0.01041579, "balance_loss_clip": 1.03613186, "balance_loss_mlp": 1.02947915, "epoch": 0.9601683451074703, "flos": 19609991804160.0, "grad_norm": 4.544697653030829, "language_loss": 0.79086411, "learning_rate": 1.6595255324563186e-08, "loss": 0.81225151, "num_input_tokens_seen": 344577765, "step": 15970, "time_per_iteration": 2.7411954402923584 }, { "auxiliary_loss_clip": 0.01094881, "auxiliary_loss_mlp": 0.01030457, "balance_loss_clip": 1.03728533, "balance_loss_mlp": 1.01805329, "epoch": 0.9602284683601383, "flos": 26651104358400.0, "grad_norm": 1.536259054605733, "language_loss": 0.7747072, "learning_rate": 1.654522565861316e-08, "loss": 0.79596055, "num_input_tokens_seen": 344597650, "step": 15971, "time_per_iteration": 2.7451272010803223 }, { "auxiliary_loss_clip": 0.01091946, "auxiliary_loss_mlp": 0.01027817, "balance_loss_clip": 1.0364567, "balance_loss_mlp": 1.01459122, "epoch": 0.9602885916128062, "flos": 15554096714880.0, "grad_norm": 1.7996222753948563, "language_loss": 0.67346907, "learning_rate": 1.64952712054669e-08, "loss": 0.69466674, "num_input_tokens_seen": 344613580, "step": 15972, "time_per_iteration": 2.6623332500457764 }, { "auxiliary_loss_clip": 0.01094982, "auxiliary_loss_mlp": 0.00769511, "balance_loss_clip": 1.03505695, "balance_loss_mlp": 1.00020421, "epoch": 0.9603487148654742, "flos": 16502353810560.0, "grad_norm": 2.1539804803600555, "language_loss": 0.76114738, "learning_rate": 1.644539196701844e-08, "loss": 0.77979231, "num_input_tokens_seen": 344626910, "step": 15973, "time_per_iteration": 2.6319777965545654 }, { "auxiliary_loss_clip": 0.01068013, "auxiliary_loss_mlp": 0.01045452, "balance_loss_clip": 1.03971171, "balance_loss_mlp": 1.03173113, "epoch": 0.9604088381181421, "flos": 20845354308480.0, "grad_norm": 1.5935467286542793, "language_loss": 0.68907356, "learning_rate": 1.639558794515983e-08, "loss": 0.71020818, "num_input_tokens_seen": 344644330, "step": 15974, "time_per_iteration": 2.722294569015503 }, { "auxiliary_loss_clip": 0.01097463, "auxiliary_loss_mlp": 0.01029415, "balance_loss_clip": 1.03470838, "balance_loss_mlp": 1.01666546, "epoch": 0.9604689613708102, "flos": 19683105937920.0, "grad_norm": 1.5822354896144846, "language_loss": 0.67808646, "learning_rate": 1.6345859141779105e-08, "loss": 0.69935524, "num_input_tokens_seen": 344663910, "step": 15975, "time_per_iteration": 2.5872485637664795 }, { "auxiliary_loss_clip": 0.01105768, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.03735399, "balance_loss_mlp": 1.01801634, "epoch": 0.9605290846234781, "flos": 24097568544000.0, "grad_norm": 2.0352421643496554, "language_loss": 0.55362296, "learning_rate": 1.6296205558762322e-08, "loss": 0.57497835, "num_input_tokens_seen": 344682320, "step": 15976, "time_per_iteration": 2.5711615085601807 }, { "auxiliary_loss_clip": 0.01079409, "auxiliary_loss_mlp": 0.0102813, "balance_loss_clip": 1.03170323, "balance_loss_mlp": 1.01589346, "epoch": 0.9605892078761461, "flos": 27122575299840.0, "grad_norm": 1.7385849927083583, "language_loss": 0.68164247, "learning_rate": 1.624662719799219e-08, "loss": 0.7027179, "num_input_tokens_seen": 344701355, "step": 15977, "time_per_iteration": 2.671110153198242 }, { "auxiliary_loss_clip": 0.01096711, "auxiliary_loss_mlp": 0.01039725, "balance_loss_clip": 1.0339942, "balance_loss_mlp": 1.02705932, "epoch": 0.9606493311288141, "flos": 14136918543360.0, "grad_norm": 1.9552633904927965, "language_loss": 0.81768823, "learning_rate": 1.6197124061348766e-08, "loss": 0.83905256, "num_input_tokens_seen": 344717980, "step": 15978, "time_per_iteration": 2.555152177810669 }, { "auxiliary_loss_clip": 0.01100379, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.03526115, "balance_loss_mlp": 1.02026129, "epoch": 0.960709454381482, "flos": 15813336147840.0, "grad_norm": 2.5958973786310664, "language_loss": 0.83387029, "learning_rate": 1.614769615070921e-08, "loss": 0.85520506, "num_input_tokens_seen": 344733480, "step": 15979, "time_per_iteration": 2.5497281551361084 }, { "auxiliary_loss_clip": 0.0110855, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.03590512, "balance_loss_mlp": 1.02634561, "epoch": 0.96076957763415, "flos": 22565403959040.0, "grad_norm": 1.5387344792405315, "language_loss": 0.79981411, "learning_rate": 1.6098343467947805e-08, "loss": 0.82128036, "num_input_tokens_seen": 344752130, "step": 15980, "time_per_iteration": 2.5905473232269287 }, { "auxiliary_loss_clip": 0.0109877, "auxiliary_loss_mlp": 0.01028852, "balance_loss_clip": 1.03493583, "balance_loss_mlp": 1.01669884, "epoch": 0.960829700886818, "flos": 24681260551680.0, "grad_norm": 2.1133956076478664, "language_loss": 0.68550336, "learning_rate": 1.6049066014935942e-08, "loss": 0.7067796, "num_input_tokens_seen": 344771195, "step": 15981, "time_per_iteration": 2.612859010696411 }, { "auxiliary_loss_clip": 0.01093593, "auxiliary_loss_mlp": 0.00769185, "balance_loss_clip": 1.03381348, "balance_loss_mlp": 1.00022793, "epoch": 0.960889824139486, "flos": 26542223256960.0, "grad_norm": 1.7427488082202907, "language_loss": 0.69655585, "learning_rate": 1.5999863793542344e-08, "loss": 0.71518368, "num_input_tokens_seen": 344793150, "step": 15982, "time_per_iteration": 2.5976712703704834 }, { "auxiliary_loss_clip": 0.00999386, "auxiliary_loss_mlp": 0.00999842, "balance_loss_clip": 1.00873065, "balance_loss_mlp": 0.99883503, "epoch": 0.9609499473921539, "flos": 71114942586240.0, "grad_norm": 0.6662874466097782, "language_loss": 0.53221011, "learning_rate": 1.595073680563286e-08, "loss": 0.5522024, "num_input_tokens_seen": 344852855, "step": 15983, "time_per_iteration": 3.3874897956848145 }, { "auxiliary_loss_clip": 0.01107834, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.03694439, "balance_loss_mlp": 1.02233326, "epoch": 0.9610100706448219, "flos": 20552466810240.0, "grad_norm": 2.121316600078336, "language_loss": 0.67938662, "learning_rate": 1.5901685053070212e-08, "loss": 0.70081216, "num_input_tokens_seen": 344869830, "step": 15984, "time_per_iteration": 2.5932650566101074 }, { "auxiliary_loss_clip": 0.01074236, "auxiliary_loss_mlp": 0.01033486, "balance_loss_clip": 1.03595209, "balance_loss_mlp": 1.02153563, "epoch": 0.9610701938974898, "flos": 14064199459200.0, "grad_norm": 1.5705983940163633, "language_loss": 0.67496943, "learning_rate": 1.5852708537714477e-08, "loss": 0.69604665, "num_input_tokens_seen": 344888905, "step": 15985, "time_per_iteration": 2.726486921310425 }, { "auxiliary_loss_clip": 0.01108849, "auxiliary_loss_mlp": 0.01032566, "balance_loss_clip": 1.03674269, "balance_loss_mlp": 1.02043021, "epoch": 0.9611303171501578, "flos": 20229989483520.0, "grad_norm": 2.8268896406333237, "language_loss": 0.78626662, "learning_rate": 1.580380726142283e-08, "loss": 0.80768073, "num_input_tokens_seen": 344907160, "step": 15986, "time_per_iteration": 2.585028886795044 }, { "auxiliary_loss_clip": 0.01059902, "auxiliary_loss_mlp": 0.01031304, "balance_loss_clip": 1.03704977, "balance_loss_mlp": 1.01792264, "epoch": 0.9611904404028258, "flos": 20951075013120.0, "grad_norm": 3.983829786169989, "language_loss": 0.64043385, "learning_rate": 1.5754981226049792e-08, "loss": 0.6613459, "num_input_tokens_seen": 344922400, "step": 15987, "time_per_iteration": 2.6663405895233154 }, { "auxiliary_loss_clip": 0.01105457, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.03672767, "balance_loss_mlp": 1.01882839, "epoch": 0.9612505636554938, "flos": 24827740214400.0, "grad_norm": 1.6472459038973077, "language_loss": 0.66917932, "learning_rate": 1.5706230433446544e-08, "loss": 0.69053823, "num_input_tokens_seen": 344941910, "step": 15988, "time_per_iteration": 2.6424877643585205 }, { "auxiliary_loss_clip": 0.01096712, "auxiliary_loss_mlp": 0.01043698, "balance_loss_clip": 1.03608358, "balance_loss_mlp": 1.03205132, "epoch": 0.9613106869081617, "flos": 17164977955200.0, "grad_norm": 2.1053842108044876, "language_loss": 0.74786007, "learning_rate": 1.5657554885462055e-08, "loss": 0.76926422, "num_input_tokens_seen": 344960020, "step": 15989, "time_per_iteration": 2.5956602096557617 }, { "auxiliary_loss_clip": 0.01009811, "auxiliary_loss_mlp": 0.01009546, "balance_loss_clip": 1.00601673, "balance_loss_mlp": 1.00818145, "epoch": 0.9613708101608297, "flos": 61563818522880.0, "grad_norm": 0.8478358014550572, "language_loss": 0.63107759, "learning_rate": 1.5608954583941737e-08, "loss": 0.65127116, "num_input_tokens_seen": 345018290, "step": 15990, "time_per_iteration": 3.152273178100586 }, { "auxiliary_loss_clip": 0.01096035, "auxiliary_loss_mlp": 0.01034605, "balance_loss_clip": 1.03537118, "balance_loss_mlp": 1.02268958, "epoch": 0.9614309334134977, "flos": 27417904922880.0, "grad_norm": 1.9574002604644196, "language_loss": 0.77676558, "learning_rate": 1.5560429530729003e-08, "loss": 0.79807198, "num_input_tokens_seen": 345040235, "step": 15991, "time_per_iteration": 4.212686538696289 }, { "auxiliary_loss_clip": 0.01114207, "auxiliary_loss_mlp": 0.01034295, "balance_loss_clip": 1.0376842, "balance_loss_mlp": 1.02099133, "epoch": 0.9614910566661656, "flos": 22819148611200.0, "grad_norm": 2.7719678193079247, "language_loss": 0.84980291, "learning_rate": 1.5511979727663493e-08, "loss": 0.87128794, "num_input_tokens_seen": 345054540, "step": 15992, "time_per_iteration": 2.5331294536590576 }, { "auxiliary_loss_clip": 0.01084656, "auxiliary_loss_mlp": 0.01030807, "balance_loss_clip": 1.0333364, "balance_loss_mlp": 1.01737833, "epoch": 0.9615511799188337, "flos": 20667812359680.0, "grad_norm": 5.393855831063401, "language_loss": 0.7277714, "learning_rate": 1.5463605176582406e-08, "loss": 0.74892598, "num_input_tokens_seen": 345074035, "step": 15993, "time_per_iteration": 4.279495000839233 }, { "auxiliary_loss_clip": 0.01071095, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.03617215, "balance_loss_mlp": 1.02064323, "epoch": 0.9616113031715016, "flos": 33149212035840.0, "grad_norm": 1.4893605821515894, "language_loss": 0.68342292, "learning_rate": 1.5415305879320716e-08, "loss": 0.70446742, "num_input_tokens_seen": 345099270, "step": 15994, "time_per_iteration": 2.7772884368896484 }, { "auxiliary_loss_clip": 0.01072149, "auxiliary_loss_mlp": 0.01034975, "balance_loss_clip": 1.03700161, "balance_loss_mlp": 1.02212477, "epoch": 0.9616714264241696, "flos": 25009807276800.0, "grad_norm": 1.9283329127731719, "language_loss": 0.84783322, "learning_rate": 1.5367081837709183e-08, "loss": 0.86890447, "num_input_tokens_seen": 345116975, "step": 15995, "time_per_iteration": 2.7321677207946777 }, { "auxiliary_loss_clip": 0.0110129, "auxiliary_loss_mlp": 0.01035588, "balance_loss_clip": 1.03743207, "balance_loss_mlp": 1.02226043, "epoch": 0.9617315496768375, "flos": 13547480359680.0, "grad_norm": 2.3727684194410865, "language_loss": 0.75815755, "learning_rate": 1.5318933053576788e-08, "loss": 0.77952629, "num_input_tokens_seen": 345133645, "step": 15996, "time_per_iteration": 2.5802130699157715 }, { "auxiliary_loss_clip": 0.01082505, "auxiliary_loss_mlp": 0.0103401, "balance_loss_clip": 1.03420186, "balance_loss_mlp": 1.02108765, "epoch": 0.9617916729295055, "flos": 11254512781440.0, "grad_norm": 3.2502659425610156, "language_loss": 0.76369971, "learning_rate": 1.52708595287494e-08, "loss": 0.7848649, "num_input_tokens_seen": 345150740, "step": 15997, "time_per_iteration": 4.12961745262146 }, { "auxiliary_loss_clip": 0.01103332, "auxiliary_loss_mlp": 0.00769549, "balance_loss_clip": 1.03523898, "balance_loss_mlp": 1.0002147, "epoch": 0.9618517961821734, "flos": 22819723228800.0, "grad_norm": 1.6933641489070883, "language_loss": 0.67267382, "learning_rate": 1.522286126505001e-08, "loss": 0.69140267, "num_input_tokens_seen": 345170365, "step": 15998, "time_per_iteration": 2.5632731914520264 }, { "auxiliary_loss_clip": 0.01079044, "auxiliary_loss_mlp": 0.01030668, "balance_loss_clip": 1.03057599, "balance_loss_mlp": 1.01695287, "epoch": 0.9619119194348414, "flos": 16617340224000.0, "grad_norm": 1.6277881889337782, "language_loss": 0.7250607, "learning_rate": 1.5174938264298498e-08, "loss": 0.74615777, "num_input_tokens_seen": 345188930, "step": 15999, "time_per_iteration": 2.5826117992401123 }, { "auxiliary_loss_clip": 0.01079594, "auxiliary_loss_mlp": 0.01023964, "balance_loss_clip": 1.03278232, "balance_loss_mlp": 1.01237655, "epoch": 0.9619720426875094, "flos": 24535140024960.0, "grad_norm": 1.9376682350372685, "language_loss": 0.65341753, "learning_rate": 1.5127090528312514e-08, "loss": 0.67445314, "num_input_tokens_seen": 345209615, "step": 16000, "time_per_iteration": 2.6649346351623535 }, { "auxiliary_loss_clip": 0.01074444, "auxiliary_loss_mlp": 0.01027888, "balance_loss_clip": 1.034127, "balance_loss_mlp": 1.0147984, "epoch": 0.9620321659401774, "flos": 20632224960000.0, "grad_norm": 1.8942735733189127, "language_loss": 0.75229144, "learning_rate": 1.5079318058905723e-08, "loss": 0.77331471, "num_input_tokens_seen": 345229175, "step": 16001, "time_per_iteration": 2.690169095993042 }, { "auxiliary_loss_clip": 0.01093786, "auxiliary_loss_mlp": 0.0103193, "balance_loss_clip": 1.03392005, "balance_loss_mlp": 1.01907945, "epoch": 0.9620922891928453, "flos": 18515290959360.0, "grad_norm": 1.9242649128413576, "language_loss": 0.68372071, "learning_rate": 1.5031620857890447e-08, "loss": 0.70497787, "num_input_tokens_seen": 345247815, "step": 16002, "time_per_iteration": 2.609285831451416 }, { "auxiliary_loss_clip": 0.01096986, "auxiliary_loss_mlp": 0.01032704, "balance_loss_clip": 1.03659725, "balance_loss_mlp": 1.0204016, "epoch": 0.9621524124455133, "flos": 28767391914240.0, "grad_norm": 1.3322402005995133, "language_loss": 0.64338034, "learning_rate": 1.4983998927074804e-08, "loss": 0.66467726, "num_input_tokens_seen": 345269935, "step": 16003, "time_per_iteration": 2.64509654045105 }, { "auxiliary_loss_clip": 0.01056283, "auxiliary_loss_mlp": 0.0103757, "balance_loss_clip": 1.03516269, "balance_loss_mlp": 1.02617371, "epoch": 0.9622125356981813, "flos": 19098875226240.0, "grad_norm": 1.8799726685356375, "language_loss": 0.75980008, "learning_rate": 1.493645226826512e-08, "loss": 0.78073859, "num_input_tokens_seen": 345288310, "step": 16004, "time_per_iteration": 2.746777057647705 }, { "auxiliary_loss_clip": 0.01096501, "auxiliary_loss_mlp": 0.01030981, "balance_loss_clip": 1.03659928, "balance_loss_mlp": 1.01776099, "epoch": 0.9622726589508492, "flos": 20302816308480.0, "grad_norm": 1.8988665709450379, "language_loss": 0.79441619, "learning_rate": 1.4888980883263958e-08, "loss": 0.81569099, "num_input_tokens_seen": 345306615, "step": 16005, "time_per_iteration": 2.6173338890075684 }, { "auxiliary_loss_clip": 0.01093237, "auxiliary_loss_mlp": 0.01030093, "balance_loss_clip": 1.0344584, "balance_loss_mlp": 1.01876855, "epoch": 0.9623327822035173, "flos": 54929750889600.0, "grad_norm": 30.35501867161595, "language_loss": 0.67897928, "learning_rate": 1.4841584773871652e-08, "loss": 0.7002126, "num_input_tokens_seen": 345331935, "step": 16006, "time_per_iteration": 2.912827730178833 }, { "auxiliary_loss_clip": 0.0107661, "auxiliary_loss_mlp": 0.01037957, "balance_loss_clip": 1.03514838, "balance_loss_mlp": 1.02623272, "epoch": 0.9623929054561852, "flos": 21759029585280.0, "grad_norm": 1.8340702205023383, "language_loss": 0.77994108, "learning_rate": 1.479426394188521e-08, "loss": 0.80108666, "num_input_tokens_seen": 345351510, "step": 16007, "time_per_iteration": 2.6248257160186768 }, { "auxiliary_loss_clip": 0.0111027, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.03747129, "balance_loss_mlp": 1.01994443, "epoch": 0.9624530287088532, "flos": 17931563038080.0, "grad_norm": 2.1097968556783244, "language_loss": 0.67964327, "learning_rate": 1.4747018389099198e-08, "loss": 0.701069, "num_input_tokens_seen": 345367750, "step": 16008, "time_per_iteration": 2.537191867828369 }, { "auxiliary_loss_clip": 0.01085992, "auxiliary_loss_mlp": 0.01032638, "balance_loss_clip": 1.03743672, "balance_loss_mlp": 1.01911998, "epoch": 0.9625131519615211, "flos": 23253739263360.0, "grad_norm": 2.1552022323644846, "language_loss": 0.72934628, "learning_rate": 1.469984811730529e-08, "loss": 0.75053251, "num_input_tokens_seen": 345384790, "step": 16009, "time_per_iteration": 2.6170432567596436 }, { "auxiliary_loss_clip": 0.01094692, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.03521502, "balance_loss_mlp": 1.02035999, "epoch": 0.9625732752141891, "flos": 18916628595840.0, "grad_norm": 2.236210012080847, "language_loss": 0.75740463, "learning_rate": 1.4652753128292061e-08, "loss": 0.77867424, "num_input_tokens_seen": 345403390, "step": 16010, "time_per_iteration": 2.6094565391540527 }, { "auxiliary_loss_clip": 0.0110126, "auxiliary_loss_mlp": 0.01033383, "balance_loss_clip": 1.03804505, "balance_loss_mlp": 1.01812458, "epoch": 0.962633398466857, "flos": 16252918790400.0, "grad_norm": 1.8499312675955801, "language_loss": 0.69607782, "learning_rate": 1.4605733423845635e-08, "loss": 0.71742427, "num_input_tokens_seen": 345418685, "step": 16011, "time_per_iteration": 2.5665814876556396 }, { "auxiliary_loss_clip": 0.01096422, "auxiliary_loss_mlp": 0.01034569, "balance_loss_clip": 1.03724504, "balance_loss_mlp": 1.02317858, "epoch": 0.962693521719525, "flos": 54197424403200.0, "grad_norm": 2.090107239169434, "language_loss": 0.68528754, "learning_rate": 1.4558789005748585e-08, "loss": 0.70659745, "num_input_tokens_seen": 345442380, "step": 16012, "time_per_iteration": 2.8673768043518066 }, { "auxiliary_loss_clip": 0.0109098, "auxiliary_loss_mlp": 0.01034537, "balance_loss_clip": 1.03467774, "balance_loss_mlp": 1.02032721, "epoch": 0.962753644972193, "flos": 33105795471360.0, "grad_norm": 1.7699818597155268, "language_loss": 0.72427005, "learning_rate": 1.4511919875781264e-08, "loss": 0.74552524, "num_input_tokens_seen": 345463815, "step": 16013, "time_per_iteration": 2.75661301612854 }, { "auxiliary_loss_clip": 0.01075741, "auxiliary_loss_mlp": 0.01033286, "balance_loss_clip": 1.03560877, "balance_loss_mlp": 1.02013755, "epoch": 0.962813768224861, "flos": 42230660837760.0, "grad_norm": 2.2049191715413996, "language_loss": 0.63640058, "learning_rate": 1.4465126035720698e-08, "loss": 0.65749085, "num_input_tokens_seen": 345484525, "step": 16014, "time_per_iteration": 2.801541328430176 }, { "auxiliary_loss_clip": 0.01084087, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.03718603, "balance_loss_mlp": 1.02020597, "epoch": 0.9628738914775289, "flos": 43944677003520.0, "grad_norm": 1.6444594137562585, "language_loss": 0.71679461, "learning_rate": 1.4418407487341688e-08, "loss": 0.73794574, "num_input_tokens_seen": 345508295, "step": 16015, "time_per_iteration": 2.8245065212249756 }, { "auxiliary_loss_clip": 0.01070924, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.03087783, "balance_loss_mlp": 1.01914668, "epoch": 0.9629340147301969, "flos": 15596184476160.0, "grad_norm": 1.8324403843710784, "language_loss": 0.77434921, "learning_rate": 1.4371764232415707e-08, "loss": 0.79537642, "num_input_tokens_seen": 345525155, "step": 16016, "time_per_iteration": 2.7069830894470215 }, { "auxiliary_loss_clip": 0.01027071, "auxiliary_loss_mlp": 0.01000442, "balance_loss_clip": 1.0047729, "balance_loss_mlp": 0.99953043, "epoch": 0.9629941379828649, "flos": 62951011816320.0, "grad_norm": 0.808956080436883, "language_loss": 0.63018364, "learning_rate": 1.4325196272711337e-08, "loss": 0.65045875, "num_input_tokens_seen": 345578905, "step": 16017, "time_per_iteration": 3.0989813804626465 }, { "auxiliary_loss_clip": 0.01093389, "auxiliary_loss_mlp": 0.0102702, "balance_loss_clip": 1.03960085, "balance_loss_mlp": 1.01511717, "epoch": 0.9630542612355328, "flos": 29899116702720.0, "grad_norm": 1.8256798404845316, "language_loss": 0.66259742, "learning_rate": 1.4278703609994502e-08, "loss": 0.68380153, "num_input_tokens_seen": 345598965, "step": 16018, "time_per_iteration": 2.7493810653686523 }, { "auxiliary_loss_clip": 0.01059951, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.03763199, "balance_loss_mlp": 1.02381968, "epoch": 0.9631143844882009, "flos": 17894575008000.0, "grad_norm": 1.944806621091631, "language_loss": 0.79563761, "learning_rate": 1.4232286246028457e-08, "loss": 0.81659889, "num_input_tokens_seen": 345617945, "step": 16019, "time_per_iteration": 2.6809628009796143 }, { "auxiliary_loss_clip": 0.01070109, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.03343022, "balance_loss_mlp": 1.01866817, "epoch": 0.9631745077408688, "flos": 26139161767680.0, "grad_norm": 1.782612322393727, "language_loss": 0.71960497, "learning_rate": 1.4185944182572907e-08, "loss": 0.74060309, "num_input_tokens_seen": 345637920, "step": 16020, "time_per_iteration": 2.724942684173584 }, { "auxiliary_loss_clip": 0.01084456, "auxiliary_loss_mlp": 0.01026989, "balance_loss_clip": 1.03556895, "balance_loss_mlp": 1.01536036, "epoch": 0.9632346309935368, "flos": 24973645259520.0, "grad_norm": 2.276745158926346, "language_loss": 0.77092677, "learning_rate": 1.4139677421385331e-08, "loss": 0.79204124, "num_input_tokens_seen": 345656195, "step": 16021, "time_per_iteration": 2.6800484657287598 }, { "auxiliary_loss_clip": 0.01074317, "auxiliary_loss_mlp": 0.01030211, "balance_loss_clip": 1.03503883, "balance_loss_mlp": 1.0156492, "epoch": 0.9632947542462047, "flos": 23617226943360.0, "grad_norm": 2.2141346213360498, "language_loss": 0.6477133, "learning_rate": 1.4093485964220331e-08, "loss": 0.66875851, "num_input_tokens_seen": 345676700, "step": 16022, "time_per_iteration": 2.6913392543792725 }, { "auxiliary_loss_clip": 0.01079957, "auxiliary_loss_mlp": 0.01037399, "balance_loss_clip": 1.03176844, "balance_loss_mlp": 1.02543032, "epoch": 0.9633548774988727, "flos": 26395599939840.0, "grad_norm": 1.8655459266575873, "language_loss": 0.73232532, "learning_rate": 1.4047369812829168e-08, "loss": 0.75349891, "num_input_tokens_seen": 345696725, "step": 16023, "time_per_iteration": 2.8063480854034424 }, { "auxiliary_loss_clip": 0.01092328, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.03424549, "balance_loss_mlp": 1.02042127, "epoch": 0.9634150007515406, "flos": 23767728929280.0, "grad_norm": 1.4474458645948844, "language_loss": 0.81416321, "learning_rate": 1.4001328968960891e-08, "loss": 0.8354218, "num_input_tokens_seen": 345716245, "step": 16024, "time_per_iteration": 2.6448142528533936 }, { "auxiliary_loss_clip": 0.01102103, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.03745365, "balance_loss_mlp": 1.01969528, "epoch": 0.9634751240042086, "flos": 24135346673280.0, "grad_norm": 1.5219305560935168, "language_loss": 0.81457579, "learning_rate": 1.3955363434361212e-08, "loss": 0.83592141, "num_input_tokens_seen": 345739060, "step": 16025, "time_per_iteration": 2.6108663082122803 }, { "auxiliary_loss_clip": 0.0110069, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.03579926, "balance_loss_mlp": 1.017694, "epoch": 0.9635352472568766, "flos": 24349086552960.0, "grad_norm": 2.10441973449587, "language_loss": 0.75937688, "learning_rate": 1.3909473210773181e-08, "loss": 0.78068733, "num_input_tokens_seen": 345758325, "step": 16026, "time_per_iteration": 2.6266496181488037 }, { "auxiliary_loss_clip": 0.01073067, "auxiliary_loss_mlp": 0.00772375, "balance_loss_clip": 1.03285146, "balance_loss_mlp": 1.00015044, "epoch": 0.9635953705095446, "flos": 23984772860160.0, "grad_norm": 1.7472949763500514, "language_loss": 0.632388, "learning_rate": 1.3863658299936965e-08, "loss": 0.65084237, "num_input_tokens_seen": 345778530, "step": 16027, "time_per_iteration": 2.7170257568359375 }, { "auxiliary_loss_clip": 0.01099141, "auxiliary_loss_mlp": 0.01031793, "balance_loss_clip": 1.0367496, "balance_loss_mlp": 1.01860225, "epoch": 0.9636554937622125, "flos": 19828436365440.0, "grad_norm": 2.4648071032004997, "language_loss": 0.87019849, "learning_rate": 1.3817918703589837e-08, "loss": 0.89150786, "num_input_tokens_seen": 345796535, "step": 16028, "time_per_iteration": 2.6614620685577393 }, { "auxiliary_loss_clip": 0.00989988, "auxiliary_loss_mlp": 0.009984, "balance_loss_clip": 1.01412296, "balance_loss_mlp": 0.99733889, "epoch": 0.9637156170148805, "flos": 67435499986560.0, "grad_norm": 0.7245646375690661, "language_loss": 0.53189749, "learning_rate": 1.3772254423466412e-08, "loss": 0.55178136, "num_input_tokens_seen": 345859700, "step": 16029, "time_per_iteration": 3.479651927947998 }, { "auxiliary_loss_clip": 0.01110359, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.0374155, "balance_loss_mlp": 1.01844335, "epoch": 0.9637757402675484, "flos": 20300912887680.0, "grad_norm": 1.5210490896959066, "language_loss": 0.7357558, "learning_rate": 1.372666546129797e-08, "loss": 0.75716853, "num_input_tokens_seen": 345878760, "step": 16030, "time_per_iteration": 4.589270353317261 }, { "auxiliary_loss_clip": 0.01082803, "auxiliary_loss_mlp": 0.01030981, "balance_loss_clip": 1.03516376, "balance_loss_mlp": 1.01882792, "epoch": 0.9638358635202164, "flos": 27234544970880.0, "grad_norm": 2.0480859370229485, "language_loss": 0.66053402, "learning_rate": 1.3681151818813575e-08, "loss": 0.68167186, "num_input_tokens_seen": 345900445, "step": 16031, "time_per_iteration": 4.3295276165008545 }, { "auxiliary_loss_clip": 0.0101801, "auxiliary_loss_mlp": 0.00751055, "balance_loss_clip": 1.00562906, "balance_loss_mlp": 0.99969733, "epoch": 0.9638959867728845, "flos": 70288998278400.0, "grad_norm": 0.8510769072154526, "language_loss": 0.60678655, "learning_rate": 1.3635713497738955e-08, "loss": 0.62447721, "num_input_tokens_seen": 345961020, "step": 16032, "time_per_iteration": 4.807501554489136 }, { "auxiliary_loss_clip": 0.01087947, "auxiliary_loss_mlp": 0.01029285, "balance_loss_clip": 1.03354275, "balance_loss_mlp": 1.01818657, "epoch": 0.9639561100255524, "flos": 25407517639680.0, "grad_norm": 1.683266113413322, "language_loss": 0.66466224, "learning_rate": 1.3590350499796954e-08, "loss": 0.68583459, "num_input_tokens_seen": 345980210, "step": 16033, "time_per_iteration": 2.6166305541992188 }, { "auxiliary_loss_clip": 0.01049582, "auxiliary_loss_mlp": 0.01033058, "balance_loss_clip": 1.03215432, "balance_loss_mlp": 1.02048707, "epoch": 0.9640162332782204, "flos": 18113881495680.0, "grad_norm": 1.6681343659776384, "language_loss": 0.65576452, "learning_rate": 1.3545062826707976e-08, "loss": 0.67659092, "num_input_tokens_seen": 345998280, "step": 16034, "time_per_iteration": 2.727808713912964 }, { "auxiliary_loss_clip": 0.01064646, "auxiliary_loss_mlp": 0.01034078, "balance_loss_clip": 1.03320181, "balance_loss_mlp": 1.02124476, "epoch": 0.9640763565308883, "flos": 23440295525760.0, "grad_norm": 4.072427623407378, "language_loss": 0.74320328, "learning_rate": 1.3499850480189313e-08, "loss": 0.7641905, "num_input_tokens_seen": 346015545, "step": 16035, "time_per_iteration": 2.690566301345825 }, { "auxiliary_loss_clip": 0.01111339, "auxiliary_loss_mlp": 0.0102927, "balance_loss_clip": 1.03984404, "balance_loss_mlp": 1.01689649, "epoch": 0.9641364797835563, "flos": 22419355259520.0, "grad_norm": 1.9463375206505085, "language_loss": 0.81678671, "learning_rate": 1.3454713461955591e-08, "loss": 0.83819282, "num_input_tokens_seen": 346034055, "step": 16036, "time_per_iteration": 4.158876180648804 }, { "auxiliary_loss_clip": 0.0107928, "auxiliary_loss_mlp": 0.0103524, "balance_loss_clip": 1.03454709, "balance_loss_mlp": 1.0221982, "epoch": 0.9641966030362242, "flos": 30622357048320.0, "grad_norm": 1.959482947327249, "language_loss": 0.69556695, "learning_rate": 1.340965177371789e-08, "loss": 0.71671212, "num_input_tokens_seen": 346054130, "step": 16037, "time_per_iteration": 2.7688260078430176 }, { "auxiliary_loss_clip": 0.01107935, "auxiliary_loss_mlp": 0.01027132, "balance_loss_clip": 1.03539455, "balance_loss_mlp": 1.014907, "epoch": 0.9642567262888923, "flos": 20953122088320.0, "grad_norm": 1.9894338477603324, "language_loss": 0.63357198, "learning_rate": 1.3364665417185506e-08, "loss": 0.65492266, "num_input_tokens_seen": 346072990, "step": 16038, "time_per_iteration": 2.5850584506988525 }, { "auxiliary_loss_clip": 0.01074768, "auxiliary_loss_mlp": 0.00773215, "balance_loss_clip": 1.03389633, "balance_loss_mlp": 1.00020552, "epoch": 0.9643168495415602, "flos": 22639415932800.0, "grad_norm": 1.7788742009808307, "language_loss": 0.71187615, "learning_rate": 1.3319754394064187e-08, "loss": 0.73035598, "num_input_tokens_seen": 346093745, "step": 16039, "time_per_iteration": 2.845629930496216 }, { "auxiliary_loss_clip": 0.01065131, "auxiliary_loss_mlp": 0.0103299, "balance_loss_clip": 1.03418183, "balance_loss_mlp": 1.02005541, "epoch": 0.9643769727942282, "flos": 20266259241600.0, "grad_norm": 2.1186364424609376, "language_loss": 0.73193431, "learning_rate": 1.327491870605657e-08, "loss": 0.7529155, "num_input_tokens_seen": 346110115, "step": 16040, "time_per_iteration": 2.786925792694092 }, { "auxiliary_loss_clip": 0.01098258, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.03442872, "balance_loss_mlp": 1.0174036, "epoch": 0.9644370960468961, "flos": 13881845088000.0, "grad_norm": 2.252747259214268, "language_loss": 0.72871804, "learning_rate": 1.3230158354863296e-08, "loss": 0.75000393, "num_input_tokens_seen": 346127165, "step": 16041, "time_per_iteration": 2.6087379455566406 }, { "auxiliary_loss_clip": 0.01079942, "auxiliary_loss_mlp": 0.01032192, "balance_loss_clip": 1.03319049, "balance_loss_mlp": 1.0204258, "epoch": 0.9644972192995641, "flos": 17238199829760.0, "grad_norm": 2.3450259817434675, "language_loss": 0.7170828, "learning_rate": 1.3185473342181674e-08, "loss": 0.73820412, "num_input_tokens_seen": 346145950, "step": 16042, "time_per_iteration": 2.630866765975952 }, { "auxiliary_loss_clip": 0.01071379, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.03418255, "balance_loss_mlp": 1.02246428, "epoch": 0.964557342552232, "flos": 23840340272640.0, "grad_norm": 2.7842681771990954, "language_loss": 0.80969441, "learning_rate": 1.3140863669705683e-08, "loss": 0.83075678, "num_input_tokens_seen": 346165005, "step": 16043, "time_per_iteration": 2.7390518188476562 }, { "auxiliary_loss_clip": 0.01080445, "auxiliary_loss_mlp": 0.01033983, "balance_loss_clip": 1.03533363, "balance_loss_mlp": 1.0219785, "epoch": 0.9646174658049, "flos": 21653129312640.0, "grad_norm": 1.6601766857412785, "language_loss": 0.71968645, "learning_rate": 1.3096329339127522e-08, "loss": 0.74083078, "num_input_tokens_seen": 346185095, "step": 16044, "time_per_iteration": 2.7201802730560303 }, { "auxiliary_loss_clip": 0.01082368, "auxiliary_loss_mlp": 0.01030048, "balance_loss_clip": 1.03337395, "balance_loss_mlp": 1.01734579, "epoch": 0.9646775890575681, "flos": 17129570123520.0, "grad_norm": 1.8680563800690775, "language_loss": 0.70015121, "learning_rate": 1.3051870352135397e-08, "loss": 0.72127533, "num_input_tokens_seen": 346202580, "step": 16045, "time_per_iteration": 2.612548589706421 }, { "auxiliary_loss_clip": 0.01038509, "auxiliary_loss_mlp": 0.01033861, "balance_loss_clip": 1.03134286, "balance_loss_mlp": 1.02050328, "epoch": 0.964737712310236, "flos": 13005732458880.0, "grad_norm": 1.8349369977772942, "language_loss": 0.74999833, "learning_rate": 1.3007486710415737e-08, "loss": 0.77072203, "num_input_tokens_seen": 346219395, "step": 16046, "time_per_iteration": 2.7376601696014404 }, { "auxiliary_loss_clip": 0.0110139, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.03724825, "balance_loss_mlp": 1.0229938, "epoch": 0.964797835562904, "flos": 24279240556800.0, "grad_norm": 2.6746075842728643, "language_loss": 0.62799901, "learning_rate": 1.2963178415651199e-08, "loss": 0.64937705, "num_input_tokens_seen": 346239715, "step": 16047, "time_per_iteration": 2.6332709789276123 }, { "auxiliary_loss_clip": 0.01088739, "auxiliary_loss_mlp": 0.01036737, "balance_loss_clip": 1.0394733, "balance_loss_mlp": 1.02437496, "epoch": 0.9648579588155719, "flos": 20522697413760.0, "grad_norm": 3.1779887131346722, "language_loss": 0.68779409, "learning_rate": 1.2918945469521992e-08, "loss": 0.70904881, "num_input_tokens_seen": 346258500, "step": 16048, "time_per_iteration": 2.6534385681152344 }, { "auxiliary_loss_clip": 0.01099634, "auxiliary_loss_mlp": 0.01033136, "balance_loss_clip": 1.03766835, "balance_loss_mlp": 1.02040398, "epoch": 0.9649180820682399, "flos": 32154844855680.0, "grad_norm": 1.6641327759979738, "language_loss": 0.63842821, "learning_rate": 1.2874787873705662e-08, "loss": 0.65975595, "num_input_tokens_seen": 346279110, "step": 16049, "time_per_iteration": 2.7865707874298096 }, { "auxiliary_loss_clip": 0.0110081, "auxiliary_loss_mlp": 0.01031714, "balance_loss_clip": 1.03886986, "balance_loss_mlp": 1.01909614, "epoch": 0.9649782053209078, "flos": 20522589672960.0, "grad_norm": 1.7000541737371648, "language_loss": 0.70881176, "learning_rate": 1.2830705629876427e-08, "loss": 0.73013705, "num_input_tokens_seen": 346297860, "step": 16050, "time_per_iteration": 2.6416265964508057 }, { "auxiliary_loss_clip": 0.01097319, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.03291678, "balance_loss_mlp": 1.02254736, "epoch": 0.9650383285735759, "flos": 43067953843200.0, "grad_norm": 1.8954759239301664, "language_loss": 0.70080233, "learning_rate": 1.278669873970606e-08, "loss": 0.72214133, "num_input_tokens_seen": 346319860, "step": 16051, "time_per_iteration": 2.8770833015441895 }, { "auxiliary_loss_clip": 0.0101809, "auxiliary_loss_mlp": 0.01006389, "balance_loss_clip": 1.0055362, "balance_loss_mlp": 1.00536346, "epoch": 0.9650984518262438, "flos": 61748255882880.0, "grad_norm": 0.8414397743745523, "language_loss": 0.59155834, "learning_rate": 1.2742767204863004e-08, "loss": 0.61180305, "num_input_tokens_seen": 346379025, "step": 16052, "time_per_iteration": 3.190720796585083 }, { "auxiliary_loss_clip": 0.01103599, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.03430974, "balance_loss_mlp": 1.01511848, "epoch": 0.9651585750789118, "flos": 29789337761280.0, "grad_norm": 1.6456088019089208, "language_loss": 0.74250531, "learning_rate": 1.2698911027013482e-08, "loss": 0.76381516, "num_input_tokens_seen": 346402250, "step": 16053, "time_per_iteration": 2.707024335861206 }, { "auxiliary_loss_clip": 0.01083745, "auxiliary_loss_mlp": 0.01030902, "balance_loss_clip": 1.03504825, "balance_loss_mlp": 1.01819479, "epoch": 0.9652186983315797, "flos": 16873060124160.0, "grad_norm": 2.539273604923119, "language_loss": 0.68519378, "learning_rate": 1.2655130207820386e-08, "loss": 0.70634031, "num_input_tokens_seen": 346419555, "step": 16054, "time_per_iteration": 2.650216817855835 }, { "auxiliary_loss_clip": 0.01091665, "auxiliary_loss_mlp": 0.00769869, "balance_loss_clip": 1.03783798, "balance_loss_mlp": 1.00018322, "epoch": 0.9652788215842477, "flos": 31649761762560.0, "grad_norm": 1.504057282029753, "language_loss": 0.6170547, "learning_rate": 1.2611424748943944e-08, "loss": 0.63567007, "num_input_tokens_seen": 346441245, "step": 16055, "time_per_iteration": 2.708653450012207 }, { "auxiliary_loss_clip": 0.01069001, "auxiliary_loss_mlp": 0.01032832, "balance_loss_clip": 1.03551924, "balance_loss_mlp": 1.02045822, "epoch": 0.9653389448369156, "flos": 24754266944640.0, "grad_norm": 1.8401015391403723, "language_loss": 0.77219534, "learning_rate": 1.2567794652041719e-08, "loss": 0.79321373, "num_input_tokens_seen": 346460065, "step": 16056, "time_per_iteration": 2.860055446624756 }, { "auxiliary_loss_clip": 0.01081129, "auxiliary_loss_mlp": 0.01031879, "balance_loss_clip": 1.03277361, "balance_loss_mlp": 1.01884317, "epoch": 0.9653990680895836, "flos": 20297249700480.0, "grad_norm": 1.5426450454186225, "language_loss": 0.71504593, "learning_rate": 1.2524239918767498e-08, "loss": 0.73617601, "num_input_tokens_seen": 346478005, "step": 16057, "time_per_iteration": 2.6402721405029297 }, { "auxiliary_loss_clip": 0.01104126, "auxiliary_loss_mlp": 0.01033201, "balance_loss_clip": 1.03448784, "balance_loss_mlp": 1.02154279, "epoch": 0.9654591913422517, "flos": 22528775064960.0, "grad_norm": 2.1611646514701786, "language_loss": 0.71808469, "learning_rate": 1.2480760550773295e-08, "loss": 0.73945796, "num_input_tokens_seen": 346497575, "step": 16058, "time_per_iteration": 2.5751798152923584 }, { "auxiliary_loss_clip": 0.01095378, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.0353595, "balance_loss_mlp": 1.02075791, "epoch": 0.9655193145949196, "flos": 26763002202240.0, "grad_norm": 1.6714085825517457, "language_loss": 0.74098462, "learning_rate": 1.2437356549708011e-08, "loss": 0.76226771, "num_input_tokens_seen": 346520000, "step": 16059, "time_per_iteration": 2.7003426551818848 }, { "auxiliary_loss_clip": 0.01090004, "auxiliary_loss_mlp": 0.01034435, "balance_loss_clip": 1.03552127, "balance_loss_mlp": 1.02234113, "epoch": 0.9655794378475876, "flos": 41970703132800.0, "grad_norm": 1.9389350053805974, "language_loss": 0.73612213, "learning_rate": 1.239402791721722e-08, "loss": 0.75736654, "num_input_tokens_seen": 346541605, "step": 16060, "time_per_iteration": 2.784961462020874 }, { "auxiliary_loss_clip": 0.01084764, "auxiliary_loss_mlp": 0.01031011, "balance_loss_clip": 1.03691041, "balance_loss_mlp": 1.019889, "epoch": 0.9656395611002555, "flos": 27709427704320.0, "grad_norm": 2.3988386788091502, "language_loss": 0.76481092, "learning_rate": 1.2350774654944273e-08, "loss": 0.78596866, "num_input_tokens_seen": 346560955, "step": 16061, "time_per_iteration": 2.7270572185516357 }, { "auxiliary_loss_clip": 0.01012338, "auxiliary_loss_mlp": 0.01000976, "balance_loss_clip": 1.00929773, "balance_loss_mlp": 1.00001049, "epoch": 0.9656996843529235, "flos": 68968562411520.0, "grad_norm": 0.7235401443187384, "language_loss": 0.64154565, "learning_rate": 1.2307596764528749e-08, "loss": 0.66167879, "num_input_tokens_seen": 346621615, "step": 16062, "time_per_iteration": 3.263425827026367 }, { "auxiliary_loss_clip": 0.01055166, "auxiliary_loss_mlp": 0.01028261, "balance_loss_clip": 1.02995956, "balance_loss_mlp": 1.01672757, "epoch": 0.9657598076055914, "flos": 20631327120000.0, "grad_norm": 1.9907973555494325, "language_loss": 0.92924762, "learning_rate": 1.226449424760867e-08, "loss": 0.95008188, "num_input_tokens_seen": 346637460, "step": 16063, "time_per_iteration": 2.728024959564209 }, { "auxiliary_loss_clip": 0.01099068, "auxiliary_loss_mlp": 0.01033776, "balance_loss_clip": 1.03742814, "balance_loss_mlp": 1.02153897, "epoch": 0.9658199308582595, "flos": 20448577699200.0, "grad_norm": 1.7715018792062844, "language_loss": 0.82029349, "learning_rate": 1.2221467105818062e-08, "loss": 0.84162194, "num_input_tokens_seen": 346655625, "step": 16064, "time_per_iteration": 2.633328914642334 }, { "auxiliary_loss_clip": 0.01095042, "auxiliary_loss_mlp": 0.00770428, "balance_loss_clip": 1.03698933, "balance_loss_mlp": 1.0001657, "epoch": 0.9658800541109274, "flos": 24718033100160.0, "grad_norm": 1.5465951740979789, "language_loss": 0.84208536, "learning_rate": 1.2178515340788731e-08, "loss": 0.86074007, "num_input_tokens_seen": 346675220, "step": 16065, "time_per_iteration": 2.6656553745269775 }, { "auxiliary_loss_clip": 0.01083456, "auxiliary_loss_mlp": 0.01029904, "balance_loss_clip": 1.03509152, "balance_loss_mlp": 1.01748872, "epoch": 0.9659401773635954, "flos": 21610035970560.0, "grad_norm": 1.7587516083331964, "language_loss": 0.67517728, "learning_rate": 1.2135638954149151e-08, "loss": 0.69631088, "num_input_tokens_seen": 346694710, "step": 16066, "time_per_iteration": 2.6471195220947266 }, { "auxiliary_loss_clip": 0.01107434, "auxiliary_loss_mlp": 0.0102656, "balance_loss_clip": 1.03636479, "balance_loss_mlp": 1.01466918, "epoch": 0.9660003006162633, "flos": 20301200196480.0, "grad_norm": 1.8111019231916714, "language_loss": 0.82353568, "learning_rate": 1.209283794752558e-08, "loss": 0.84487563, "num_input_tokens_seen": 346712645, "step": 16067, "time_per_iteration": 2.605968952178955 }, { "auxiliary_loss_clip": 0.01087949, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.03769147, "balance_loss_mlp": 1.01721048, "epoch": 0.9660604238689313, "flos": 24461954064000.0, "grad_norm": 2.0465290050813496, "language_loss": 0.69553685, "learning_rate": 1.2050112322540496e-08, "loss": 0.71671438, "num_input_tokens_seen": 346732375, "step": 16068, "time_per_iteration": 2.7153985500335693 }, { "auxiliary_loss_clip": 0.01085915, "auxiliary_loss_mlp": 0.01031292, "balance_loss_clip": 1.0330863, "balance_loss_mlp": 1.02038455, "epoch": 0.9661205471215992, "flos": 19864023765120.0, "grad_norm": 1.6826807111904172, "language_loss": 0.68126762, "learning_rate": 1.20074620808146e-08, "loss": 0.70243973, "num_input_tokens_seen": 346750430, "step": 16069, "time_per_iteration": 2.576427936553955 }, { "auxiliary_loss_clip": 0.01089339, "auxiliary_loss_mlp": 0.01028006, "balance_loss_clip": 1.03860068, "balance_loss_mlp": 1.01594257, "epoch": 0.9661806703742672, "flos": 20557889763840.0, "grad_norm": 1.979804846920071, "language_loss": 0.8906877, "learning_rate": 1.1964887223964826e-08, "loss": 0.91186118, "num_input_tokens_seen": 346768455, "step": 16070, "time_per_iteration": 5.773402214050293 }, { "auxiliary_loss_clip": 0.01111791, "auxiliary_loss_mlp": 0.01038495, "balance_loss_clip": 1.03955567, "balance_loss_mlp": 1.02573359, "epoch": 0.9662407936269353, "flos": 21430949736960.0, "grad_norm": 2.2069490271978327, "language_loss": 0.77111554, "learning_rate": 1.1922387753605878e-08, "loss": 0.79261839, "num_input_tokens_seen": 346786530, "step": 16071, "time_per_iteration": 4.432383060455322 }, { "auxiliary_loss_clip": 0.01083604, "auxiliary_loss_mlp": 0.01031428, "balance_loss_clip": 1.03396428, "balance_loss_mlp": 1.01729596, "epoch": 0.9663009168796032, "flos": 14902893095040.0, "grad_norm": 1.7077316996855652, "language_loss": 0.65930271, "learning_rate": 1.1879963671349137e-08, "loss": 0.680453, "num_input_tokens_seen": 346804635, "step": 16072, "time_per_iteration": 2.6231675148010254 }, { "auxiliary_loss_clip": 0.01101171, "auxiliary_loss_mlp": 0.01031913, "balance_loss_clip": 1.03714108, "balance_loss_mlp": 1.02001643, "epoch": 0.9663610401322712, "flos": 24310877460480.0, "grad_norm": 1.7479386785661417, "language_loss": 0.77363575, "learning_rate": 1.1837614978803534e-08, "loss": 0.7949667, "num_input_tokens_seen": 346823070, "step": 16073, "time_per_iteration": 2.6588406562805176 }, { "auxiliary_loss_clip": 0.01113364, "auxiliary_loss_mlp": 0.01036179, "balance_loss_clip": 1.03833628, "balance_loss_mlp": 1.02317297, "epoch": 0.9664211633849391, "flos": 17637849527040.0, "grad_norm": 4.0139714248409515, "language_loss": 0.7596699, "learning_rate": 1.1795341677574677e-08, "loss": 0.78116536, "num_input_tokens_seen": 346841180, "step": 16074, "time_per_iteration": 2.5176475048065186 }, { "auxiliary_loss_clip": 0.01085316, "auxiliary_loss_mlp": 0.01031408, "balance_loss_clip": 1.03595638, "balance_loss_mlp": 1.01841474, "epoch": 0.9664812866376071, "flos": 29789409588480.0, "grad_norm": 1.5863798083052476, "language_loss": 0.75684714, "learning_rate": 1.1753143769265728e-08, "loss": 0.77801442, "num_input_tokens_seen": 346864250, "step": 16075, "time_per_iteration": 2.740597724914551 }, { "auxiliary_loss_clip": 0.01078205, "auxiliary_loss_mlp": 0.01035752, "balance_loss_clip": 1.03695774, "balance_loss_mlp": 1.02323484, "epoch": 0.966541409890275, "flos": 14282320798080.0, "grad_norm": 1.8962598568271254, "language_loss": 0.78820133, "learning_rate": 1.171102125547696e-08, "loss": 0.80934089, "num_input_tokens_seen": 346881955, "step": 16076, "time_per_iteration": 4.21985650062561 }, { "auxiliary_loss_clip": 0.01089256, "auxiliary_loss_mlp": 0.01043191, "balance_loss_clip": 1.03779173, "balance_loss_mlp": 1.02938676, "epoch": 0.9666015331429431, "flos": 19860432405120.0, "grad_norm": 1.7349669135653192, "language_loss": 0.7190969, "learning_rate": 1.166897413780532e-08, "loss": 0.74042135, "num_input_tokens_seen": 346900445, "step": 16077, "time_per_iteration": 2.626159191131592 }, { "auxiliary_loss_clip": 0.01093266, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.03399146, "balance_loss_mlp": 1.01980758, "epoch": 0.966661656395611, "flos": 27125951178240.0, "grad_norm": 1.9266659878552612, "language_loss": 0.593472, "learning_rate": 1.1627002417845533e-08, "loss": 0.61473054, "num_input_tokens_seen": 346920135, "step": 16078, "time_per_iteration": 2.6967270374298096 }, { "auxiliary_loss_clip": 0.01101009, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.03683424, "balance_loss_mlp": 1.02165985, "epoch": 0.966721779648279, "flos": 21508229848320.0, "grad_norm": 1.883589824979691, "language_loss": 0.72105432, "learning_rate": 1.158510609718899e-08, "loss": 0.74241656, "num_input_tokens_seen": 346940450, "step": 16079, "time_per_iteration": 2.63110089302063 }, { "auxiliary_loss_clip": 0.01094454, "auxiliary_loss_mlp": 0.01027425, "balance_loss_clip": 1.03552699, "balance_loss_mlp": 1.01592135, "epoch": 0.9667819029009469, "flos": 23878118401920.0, "grad_norm": 1.528864037931963, "language_loss": 0.71972895, "learning_rate": 1.1543285177424644e-08, "loss": 0.74094772, "num_input_tokens_seen": 346960935, "step": 16080, "time_per_iteration": 2.6290252208709717 }, { "auxiliary_loss_clip": 0.01075746, "auxiliary_loss_mlp": 0.0103416, "balance_loss_clip": 1.03217447, "balance_loss_mlp": 1.02045643, "epoch": 0.9668420261536149, "flos": 21507224267520.0, "grad_norm": 1.6987481016197885, "language_loss": 0.73362374, "learning_rate": 1.1501539660138115e-08, "loss": 0.75472283, "num_input_tokens_seen": 346980100, "step": 16081, "time_per_iteration": 2.6839892864227295 }, { "auxiliary_loss_clip": 0.01080983, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.03344238, "balance_loss_mlp": 1.01646531, "epoch": 0.9669021494062828, "flos": 26687266375680.0, "grad_norm": 1.771131179159937, "language_loss": 0.67452699, "learning_rate": 1.145986954691236e-08, "loss": 0.69563329, "num_input_tokens_seen": 347001250, "step": 16082, "time_per_iteration": 2.7003889083862305 }, { "auxiliary_loss_clip": 0.01065498, "auxiliary_loss_mlp": 0.01042485, "balance_loss_clip": 1.0319593, "balance_loss_mlp": 1.02886534, "epoch": 0.9669622726589508, "flos": 29825032901760.0, "grad_norm": 1.870561288505191, "language_loss": 0.76813722, "learning_rate": 1.141827483932789e-08, "loss": 0.78921711, "num_input_tokens_seen": 347022975, "step": 16083, "time_per_iteration": 2.736612558364868 }, { "auxiliary_loss_clip": 0.01061787, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.03434837, "balance_loss_mlp": 1.0202508, "epoch": 0.9670223959116189, "flos": 22922499018240.0, "grad_norm": 2.1852037151642203, "language_loss": 0.79155672, "learning_rate": 1.1376755538961669e-08, "loss": 0.81250471, "num_input_tokens_seen": 347038780, "step": 16084, "time_per_iteration": 2.7562255859375 }, { "auxiliary_loss_clip": 0.01101094, "auxiliary_loss_mlp": 0.01029867, "balance_loss_clip": 1.03601408, "balance_loss_mlp": 1.01673627, "epoch": 0.9670825191642868, "flos": 18624495283200.0, "grad_norm": 2.1888422828676655, "language_loss": 0.6779865, "learning_rate": 1.1335311647387991e-08, "loss": 0.69929618, "num_input_tokens_seen": 347056705, "step": 16085, "time_per_iteration": 2.5915327072143555 }, { "auxiliary_loss_clip": 0.01089717, "auxiliary_loss_mlp": 0.01031896, "balance_loss_clip": 1.03720474, "balance_loss_mlp": 1.01825213, "epoch": 0.9671426424169548, "flos": 24497936513280.0, "grad_norm": 2.1661579345086097, "language_loss": 0.69027126, "learning_rate": 1.1293943166178709e-08, "loss": 0.71148735, "num_input_tokens_seen": 347075710, "step": 16086, "time_per_iteration": 2.6948018074035645 }, { "auxiliary_loss_clip": 0.01095229, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.03497195, "balance_loss_mlp": 1.02086806, "epoch": 0.9672027656696227, "flos": 20371189847040.0, "grad_norm": 1.6309967969700185, "language_loss": 0.78254652, "learning_rate": 1.125265009690235e-08, "loss": 0.80383849, "num_input_tokens_seen": 347092325, "step": 16087, "time_per_iteration": 2.638317346572876 }, { "auxiliary_loss_clip": 0.0107816, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.0323391, "balance_loss_mlp": 1.0173974, "epoch": 0.9672628889222907, "flos": 18880179269760.0, "grad_norm": 1.8693958793677588, "language_loss": 0.71220851, "learning_rate": 1.1211432441124769e-08, "loss": 0.73328388, "num_input_tokens_seen": 347110595, "step": 16088, "time_per_iteration": 2.7119359970092773 }, { "auxiliary_loss_clip": 0.01105883, "auxiliary_loss_mlp": 0.00770003, "balance_loss_clip": 1.03694296, "balance_loss_mlp": 1.00009024, "epoch": 0.9673230121749586, "flos": 28695247447680.0, "grad_norm": 1.6915080049875915, "language_loss": 0.70655894, "learning_rate": 1.117029020040916e-08, "loss": 0.72531772, "num_input_tokens_seen": 347131625, "step": 16089, "time_per_iteration": 2.5807154178619385 }, { "auxiliary_loss_clip": 0.01110035, "auxiliary_loss_mlp": 0.01032325, "balance_loss_clip": 1.03722262, "balance_loss_mlp": 1.02004623, "epoch": 0.9673831354276267, "flos": 20484452407680.0, "grad_norm": 2.217046221899868, "language_loss": 0.7484473, "learning_rate": 1.1129223376315167e-08, "loss": 0.76987088, "num_input_tokens_seen": 347147910, "step": 16090, "time_per_iteration": 2.5390427112579346 }, { "auxiliary_loss_clip": 0.01087487, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.03507531, "balance_loss_mlp": 1.01677692, "epoch": 0.9674432586802946, "flos": 26797548107520.0, "grad_norm": 2.2040190235185158, "language_loss": 0.69111538, "learning_rate": 1.1088231970400653e-08, "loss": 0.71228355, "num_input_tokens_seen": 347168805, "step": 16091, "time_per_iteration": 2.672116279602051 }, { "auxiliary_loss_clip": 0.01106741, "auxiliary_loss_mlp": 0.01031458, "balance_loss_clip": 1.03664362, "balance_loss_mlp": 1.01816666, "epoch": 0.9675033819329626, "flos": 22310941034880.0, "grad_norm": 1.7246952798155581, "language_loss": 0.76974177, "learning_rate": 1.1047315984219484e-08, "loss": 0.79112375, "num_input_tokens_seen": 347189455, "step": 16092, "time_per_iteration": 2.562080144882202 }, { "auxiliary_loss_clip": 0.01107911, "auxiliary_loss_mlp": 0.01030137, "balance_loss_clip": 1.0373435, "balance_loss_mlp": 1.01862192, "epoch": 0.9675635051856305, "flos": 12675713276160.0, "grad_norm": 1.9074028879734577, "language_loss": 0.76118815, "learning_rate": 1.1006475419323313e-08, "loss": 0.78256863, "num_input_tokens_seen": 347206030, "step": 16093, "time_per_iteration": 2.5782711505889893 }, { "auxiliary_loss_clip": 0.01083204, "auxiliary_loss_mlp": 0.01028136, "balance_loss_clip": 1.03609204, "balance_loss_mlp": 1.01477861, "epoch": 0.9676236284382985, "flos": 24608469640320.0, "grad_norm": 1.8185482437095273, "language_loss": 0.68996257, "learning_rate": 1.096571027726112e-08, "loss": 0.71107602, "num_input_tokens_seen": 347226250, "step": 16094, "time_per_iteration": 2.642312526702881 }, { "auxiliary_loss_clip": 0.01099843, "auxiliary_loss_mlp": 0.01031519, "balance_loss_clip": 1.03670728, "balance_loss_mlp": 1.01940703, "epoch": 0.9676837516909664, "flos": 23367145478400.0, "grad_norm": 3.1538628444307912, "language_loss": 0.7587145, "learning_rate": 1.0925020559578557e-08, "loss": 0.7800281, "num_input_tokens_seen": 347247350, "step": 16095, "time_per_iteration": 2.6397533416748047 }, { "auxiliary_loss_clip": 0.01114159, "auxiliary_loss_mlp": 0.01035616, "balance_loss_clip": 1.03943849, "balance_loss_mlp": 1.0225327, "epoch": 0.9677438749436345, "flos": 20486894532480.0, "grad_norm": 3.4142773987990513, "language_loss": 0.70483637, "learning_rate": 1.0884406267818392e-08, "loss": 0.7263341, "num_input_tokens_seen": 347266870, "step": 16096, "time_per_iteration": 2.571568727493286 }, { "auxiliary_loss_clip": 0.01086881, "auxiliary_loss_mlp": 0.01026382, "balance_loss_clip": 1.03495574, "balance_loss_mlp": 1.01391292, "epoch": 0.9678039981963025, "flos": 47555889719040.0, "grad_norm": 1.7358126863243992, "language_loss": 0.7179426, "learning_rate": 1.0843867403520946e-08, "loss": 0.73907518, "num_input_tokens_seen": 347290120, "step": 16097, "time_per_iteration": 2.8643288612365723 }, { "auxiliary_loss_clip": 0.01107467, "auxiliary_loss_mlp": 0.01035068, "balance_loss_clip": 1.03668487, "balance_loss_mlp": 1.02286124, "epoch": 0.9678641214489704, "flos": 25040474513280.0, "grad_norm": 1.9803265483631816, "language_loss": 0.78437316, "learning_rate": 1.0803403968223434e-08, "loss": 0.80579853, "num_input_tokens_seen": 347308785, "step": 16098, "time_per_iteration": 2.5864875316619873 }, { "auxiliary_loss_clip": 0.0107379, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.03619742, "balance_loss_mlp": 1.01629841, "epoch": 0.9679242447016384, "flos": 19240937516160.0, "grad_norm": 2.361712995723687, "language_loss": 0.90639651, "learning_rate": 1.0763015963459965e-08, "loss": 0.92741358, "num_input_tokens_seen": 347326375, "step": 16099, "time_per_iteration": 2.786999464035034 }, { "auxiliary_loss_clip": 0.01100177, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.03700566, "balance_loss_mlp": 1.02131963, "epoch": 0.9679843679543063, "flos": 33254681345280.0, "grad_norm": 1.5241755755242299, "language_loss": 0.66061008, "learning_rate": 1.0722703390762643e-08, "loss": 0.68195367, "num_input_tokens_seen": 347348250, "step": 16100, "time_per_iteration": 2.6941099166870117 }, { "auxiliary_loss_clip": 0.01069319, "auxiliary_loss_mlp": 0.01035758, "balance_loss_clip": 1.03754771, "balance_loss_mlp": 1.02276969, "epoch": 0.9680444912069743, "flos": 22783633038720.0, "grad_norm": 1.6628994278317477, "language_loss": 0.73592603, "learning_rate": 1.0682466251659584e-08, "loss": 0.75697684, "num_input_tokens_seen": 347367400, "step": 16101, "time_per_iteration": 2.6911606788635254 }, { "auxiliary_loss_clip": 0.01085079, "auxiliary_loss_mlp": 0.01031615, "balance_loss_clip": 1.03593111, "balance_loss_mlp": 1.01842427, "epoch": 0.9681046144596422, "flos": 24024095274240.0, "grad_norm": 1.6107351715516067, "language_loss": 0.73375201, "learning_rate": 1.0642304547676672e-08, "loss": 0.75491893, "num_input_tokens_seen": 347387600, "step": 16102, "time_per_iteration": 2.6399521827697754 }, { "auxiliary_loss_clip": 0.01076768, "auxiliary_loss_mlp": 0.01035627, "balance_loss_clip": 1.04000163, "balance_loss_mlp": 1.02195942, "epoch": 0.9681647377123103, "flos": 23441013797760.0, "grad_norm": 1.9528459851096875, "language_loss": 0.77444363, "learning_rate": 1.0602218280337139e-08, "loss": 0.79556757, "num_input_tokens_seen": 347406915, "step": 16103, "time_per_iteration": 2.7056915760040283 }, { "auxiliary_loss_clip": 0.01086653, "auxiliary_loss_mlp": 0.0103195, "balance_loss_clip": 1.03456104, "balance_loss_mlp": 1.0200057, "epoch": 0.9682248609649782, "flos": 22675075159680.0, "grad_norm": 1.639893337105475, "language_loss": 0.80586064, "learning_rate": 1.0562207451160655e-08, "loss": 0.82704663, "num_input_tokens_seen": 347425140, "step": 16104, "time_per_iteration": 2.648461103439331 }, { "auxiliary_loss_clip": 0.01088229, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.03242385, "balance_loss_mlp": 1.02403986, "epoch": 0.9682849842176462, "flos": 24428413739520.0, "grad_norm": 1.4906398802277745, "language_loss": 0.77576089, "learning_rate": 1.0522272061664672e-08, "loss": 0.79699373, "num_input_tokens_seen": 347446350, "step": 16105, "time_per_iteration": 2.6988043785095215 }, { "auxiliary_loss_clip": 0.01000224, "auxiliary_loss_mlp": 0.01003466, "balance_loss_clip": 1.00602651, "balance_loss_mlp": 1.00240505, "epoch": 0.9683451074703141, "flos": 59995132784640.0, "grad_norm": 0.8146584091458852, "language_loss": 0.56716478, "learning_rate": 1.0482412113363536e-08, "loss": 0.58720171, "num_input_tokens_seen": 347510135, "step": 16106, "time_per_iteration": 3.2270421981811523 }, { "auxiliary_loss_clip": 0.01008919, "auxiliary_loss_mlp": 0.0100775, "balance_loss_clip": 1.01353073, "balance_loss_mlp": 1.00654626, "epoch": 0.9684052307229821, "flos": 52696145514240.0, "grad_norm": 0.9121301732264848, "language_loss": 0.61534059, "learning_rate": 1.0442627607768707e-08, "loss": 0.63550723, "num_input_tokens_seen": 347562505, "step": 16107, "time_per_iteration": 3.1101765632629395 }, { "auxiliary_loss_clip": 0.01098789, "auxiliary_loss_mlp": 0.01035497, "balance_loss_clip": 1.03623629, "balance_loss_mlp": 1.02143073, "epoch": 0.96846535397565, "flos": 22783848520320.0, "grad_norm": 2.462094722606181, "language_loss": 0.74264908, "learning_rate": 1.040291854638875e-08, "loss": 0.76399195, "num_input_tokens_seen": 347579150, "step": 16108, "time_per_iteration": 2.743326187133789 }, { "auxiliary_loss_clip": 0.01093024, "auxiliary_loss_mlp": 0.01027661, "balance_loss_clip": 1.03480208, "balance_loss_mlp": 1.01471508, "epoch": 0.968525477228318, "flos": 23323980309120.0, "grad_norm": 2.296731755168933, "language_loss": 0.56901729, "learning_rate": 1.0363284930729576e-08, "loss": 0.59022415, "num_input_tokens_seen": 347596705, "step": 16109, "time_per_iteration": 5.880841255187988 }, { "auxiliary_loss_clip": 0.01018006, "auxiliary_loss_mlp": 0.01003432, "balance_loss_clip": 1.0045774, "balance_loss_mlp": 1.00251389, "epoch": 0.9685856004809861, "flos": 67882947707520.0, "grad_norm": 0.6721206293032698, "language_loss": 0.54183471, "learning_rate": 1.0323726762294205e-08, "loss": 0.56204915, "num_input_tokens_seen": 347661870, "step": 16110, "time_per_iteration": 3.1392929553985596 }, { "auxiliary_loss_clip": 0.01040803, "auxiliary_loss_mlp": 0.01042377, "balance_loss_clip": 1.03240716, "balance_loss_mlp": 1.02792311, "epoch": 0.968645723733654, "flos": 33947900899200.0, "grad_norm": 1.336975675669519, "language_loss": 0.62198687, "learning_rate": 1.0284244042582325e-08, "loss": 0.64281869, "num_input_tokens_seen": 347684295, "step": 16111, "time_per_iteration": 4.477367401123047 }, { "auxiliary_loss_clip": 0.01084355, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.0346024, "balance_loss_mlp": 1.01831102, "epoch": 0.968705846986322, "flos": 18551488890240.0, "grad_norm": 1.8865919253091008, "language_loss": 0.74626237, "learning_rate": 1.024483677309118e-08, "loss": 0.76740086, "num_input_tokens_seen": 347702585, "step": 16112, "time_per_iteration": 2.6802995204925537 }, { "auxiliary_loss_clip": 0.01096094, "auxiliary_loss_mlp": 0.01029113, "balance_loss_clip": 1.03605258, "balance_loss_mlp": 1.01711464, "epoch": 0.9687659702389899, "flos": 17420913336960.0, "grad_norm": 3.1829431624893, "language_loss": 0.66342431, "learning_rate": 1.020550495531558e-08, "loss": 0.68467641, "num_input_tokens_seen": 347721810, "step": 16113, "time_per_iteration": 2.6158058643341064 }, { "auxiliary_loss_clip": 0.01016205, "auxiliary_loss_mlp": 0.01002001, "balance_loss_clip": 1.00489581, "balance_loss_mlp": 1.00113058, "epoch": 0.9688260934916579, "flos": 62047176865920.0, "grad_norm": 0.7821411345284778, "language_loss": 0.56506634, "learning_rate": 1.0166248590746329e-08, "loss": 0.58524841, "num_input_tokens_seen": 347782330, "step": 16114, "time_per_iteration": 3.127088785171509 }, { "auxiliary_loss_clip": 0.01081645, "auxiliary_loss_mlp": 0.01038103, "balance_loss_clip": 1.03432035, "balance_loss_mlp": 1.02492452, "epoch": 0.9688862167443258, "flos": 15076520461440.0, "grad_norm": 1.9569215626202732, "language_loss": 0.82965726, "learning_rate": 1.0127067680872458e-08, "loss": 0.85085475, "num_input_tokens_seen": 347794835, "step": 16115, "time_per_iteration": 4.220075607299805 }, { "auxiliary_loss_clip": 0.01092985, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.03631961, "balance_loss_mlp": 1.01743448, "epoch": 0.9689463399969939, "flos": 19938215306880.0, "grad_norm": 1.7648967305379544, "language_loss": 0.72280598, "learning_rate": 1.0087962227179448e-08, "loss": 0.74402535, "num_input_tokens_seen": 347814320, "step": 16116, "time_per_iteration": 2.603519916534424 }, { "auxiliary_loss_clip": 0.01068294, "auxiliary_loss_mlp": 0.01034519, "balance_loss_clip": 1.03542447, "balance_loss_mlp": 1.02141237, "epoch": 0.9690064632496618, "flos": 19573039687680.0, "grad_norm": 2.086312122853078, "language_loss": 0.75657129, "learning_rate": 1.0048932231150553e-08, "loss": 0.77759945, "num_input_tokens_seen": 347832125, "step": 16117, "time_per_iteration": 2.6519157886505127 }, { "auxiliary_loss_clip": 0.01109753, "auxiliary_loss_mlp": 0.01030569, "balance_loss_clip": 1.03619337, "balance_loss_mlp": 1.01758695, "epoch": 0.9690665865023298, "flos": 21872292145920.0, "grad_norm": 2.3448073541677275, "language_loss": 0.77482766, "learning_rate": 1.000997769426548e-08, "loss": 0.79623091, "num_input_tokens_seen": 347850765, "step": 16118, "time_per_iteration": 2.5268216133117676 }, { "auxiliary_loss_clip": 0.0108528, "auxiliary_loss_mlp": 0.00771043, "balance_loss_clip": 1.03405607, "balance_loss_mlp": 1.00030315, "epoch": 0.9691267097549977, "flos": 20994491577600.0, "grad_norm": 1.8097325369712165, "language_loss": 0.78219616, "learning_rate": 9.971098618001272e-09, "loss": 0.80075938, "num_input_tokens_seen": 347870125, "step": 16119, "time_per_iteration": 2.629453659057617 }, { "auxiliary_loss_clip": 0.01056904, "auxiliary_loss_mlp": 0.01034502, "balance_loss_clip": 1.03209758, "balance_loss_mlp": 1.0223546, "epoch": 0.9691868330076657, "flos": 24279132816000.0, "grad_norm": 1.885470946971698, "language_loss": 0.75497305, "learning_rate": 9.932295003832747e-09, "loss": 0.77588713, "num_input_tokens_seen": 347890615, "step": 16120, "time_per_iteration": 2.746344566345215 }, { "auxiliary_loss_clip": 0.01097943, "auxiliary_loss_mlp": 0.0103265, "balance_loss_clip": 1.03581011, "balance_loss_mlp": 1.02084804, "epoch": 0.9692469562603336, "flos": 17675699483520.0, "grad_norm": 1.8693618447103497, "language_loss": 0.70098805, "learning_rate": 9.89356685323095e-09, "loss": 0.72229403, "num_input_tokens_seen": 347908685, "step": 16121, "time_per_iteration": 2.5618736743927 }, { "auxiliary_loss_clip": 0.01094421, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.03476155, "balance_loss_mlp": 1.02091098, "epoch": 0.9693070795130017, "flos": 26834392483200.0, "grad_norm": 1.8372756092604514, "language_loss": 0.69241065, "learning_rate": 9.854914167664486e-09, "loss": 0.71368432, "num_input_tokens_seen": 347926385, "step": 16122, "time_per_iteration": 2.56386661529541 }, { "auxiliary_loss_clip": 0.01066781, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.0308547, "balance_loss_mlp": 1.02011967, "epoch": 0.9693672027656697, "flos": 18077288515200.0, "grad_norm": 2.0146395561935058, "language_loss": 0.7544961, "learning_rate": 9.81633694859907e-09, "loss": 0.77549112, "num_input_tokens_seen": 347945290, "step": 16123, "time_per_iteration": 2.6407599449157715 }, { "auxiliary_loss_clip": 0.01072153, "auxiliary_loss_mlp": 0.01038605, "balance_loss_clip": 1.03460908, "balance_loss_mlp": 1.02459204, "epoch": 0.9694273260183376, "flos": 21763015994880.0, "grad_norm": 1.5149029001764542, "language_loss": 0.74644059, "learning_rate": 9.777835197497753e-09, "loss": 0.7675482, "num_input_tokens_seen": 347966330, "step": 16124, "time_per_iteration": 2.671185255050659 }, { "auxiliary_loss_clip": 0.01098188, "auxiliary_loss_mlp": 0.01035267, "balance_loss_clip": 1.0364728, "balance_loss_mlp": 1.02335227, "epoch": 0.9694874492710056, "flos": 24426115269120.0, "grad_norm": 2.520792760553443, "language_loss": 0.74161977, "learning_rate": 9.739408915820258e-09, "loss": 0.76295435, "num_input_tokens_seen": 347982590, "step": 16125, "time_per_iteration": 2.6353843212127686 }, { "auxiliary_loss_clip": 0.01019443, "auxiliary_loss_mlp": 0.01000194, "balance_loss_clip": 1.00674295, "balance_loss_mlp": 0.99920446, "epoch": 0.9695475725236735, "flos": 67650748237440.0, "grad_norm": 0.8991349506597905, "language_loss": 0.61446786, "learning_rate": 9.70105810502364e-09, "loss": 0.63466424, "num_input_tokens_seen": 348043310, "step": 16126, "time_per_iteration": 3.190199851989746 }, { "auxiliary_loss_clip": 0.01097272, "auxiliary_loss_mlp": 0.01035813, "balance_loss_clip": 1.03880358, "balance_loss_mlp": 1.02390397, "epoch": 0.9696076957763415, "flos": 19129326981120.0, "grad_norm": 1.964418438296789, "language_loss": 0.75083786, "learning_rate": 9.662782766562738e-09, "loss": 0.77216876, "num_input_tokens_seen": 348062200, "step": 16127, "time_per_iteration": 2.6186792850494385 }, { "auxiliary_loss_clip": 0.01063108, "auxiliary_loss_mlp": 0.01033166, "balance_loss_clip": 1.03249013, "balance_loss_mlp": 1.02036893, "epoch": 0.9696678190290094, "flos": 15486836497920.0, "grad_norm": 1.6000021312142574, "language_loss": 0.69262868, "learning_rate": 9.62458290188839e-09, "loss": 0.71359146, "num_input_tokens_seen": 348080685, "step": 16128, "time_per_iteration": 2.6917450428009033 }, { "auxiliary_loss_clip": 0.01076173, "auxiliary_loss_mlp": 0.0103545, "balance_loss_clip": 1.03701282, "balance_loss_mlp": 1.02326083, "epoch": 0.9697279422816775, "flos": 36208692869760.0, "grad_norm": 1.6481386717416904, "language_loss": 0.65212297, "learning_rate": 9.586458512449213e-09, "loss": 0.67323917, "num_input_tokens_seen": 348102500, "step": 16129, "time_per_iteration": 2.761218309402466 }, { "auxiliary_loss_clip": 0.01076577, "auxiliary_loss_mlp": 0.01032504, "balance_loss_clip": 1.03635514, "balance_loss_mlp": 1.01933169, "epoch": 0.9697880655343454, "flos": 25484007651840.0, "grad_norm": 2.2154494130728852, "language_loss": 0.6313777, "learning_rate": 9.548409599691166e-09, "loss": 0.6524685, "num_input_tokens_seen": 348122515, "step": 16130, "time_per_iteration": 2.6841318607330322 }, { "auxiliary_loss_clip": 0.01098965, "auxiliary_loss_mlp": 0.01031057, "balance_loss_clip": 1.0350318, "balance_loss_mlp": 1.01859963, "epoch": 0.9698481887870134, "flos": 15333533251200.0, "grad_norm": 2.2812543570754005, "language_loss": 0.69271004, "learning_rate": 9.510436165056867e-09, "loss": 0.7140103, "num_input_tokens_seen": 348138775, "step": 16131, "time_per_iteration": 2.5763492584228516 }, { "auxiliary_loss_clip": 0.0110919, "auxiliary_loss_mlp": 0.00770076, "balance_loss_clip": 1.03628075, "balance_loss_mlp": 1.00023901, "epoch": 0.9699083120396813, "flos": 21982250655360.0, "grad_norm": 1.8419150080244562, "language_loss": 0.76590043, "learning_rate": 9.472538209986058e-09, "loss": 0.78469312, "num_input_tokens_seen": 348157115, "step": 16132, "time_per_iteration": 2.563215732574463 }, { "auxiliary_loss_clip": 0.01075956, "auxiliary_loss_mlp": 0.01038228, "balance_loss_clip": 1.03480387, "balance_loss_mlp": 1.02540684, "epoch": 0.9699684352923493, "flos": 15664055224320.0, "grad_norm": 2.851724008499009, "language_loss": 0.79010421, "learning_rate": 9.434715735916477e-09, "loss": 0.81124604, "num_input_tokens_seen": 348173035, "step": 16133, "time_per_iteration": 2.623619794845581 }, { "auxiliary_loss_clip": 0.01078402, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.03522897, "balance_loss_mlp": 1.01685965, "epoch": 0.9700285585450172, "flos": 21908382336000.0, "grad_norm": 2.3483627644840444, "language_loss": 0.64470112, "learning_rate": 9.396968744281863e-09, "loss": 0.66576606, "num_input_tokens_seen": 348192960, "step": 16134, "time_per_iteration": 2.6657004356384277 }, { "auxiliary_loss_clip": 0.01083734, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.03266311, "balance_loss_mlp": 1.01973999, "epoch": 0.9700886817976853, "flos": 23914890950400.0, "grad_norm": 1.8798527935954052, "language_loss": 0.80912268, "learning_rate": 9.359297236513519e-09, "loss": 0.83028972, "num_input_tokens_seen": 348212805, "step": 16135, "time_per_iteration": 2.744619131088257 }, { "auxiliary_loss_clip": 0.01099551, "auxiliary_loss_mlp": 0.01032267, "balance_loss_clip": 1.03586113, "balance_loss_mlp": 1.01880264, "epoch": 0.9701488050503532, "flos": 25447845634560.0, "grad_norm": 1.8400261223826226, "language_loss": 0.7311669, "learning_rate": 9.321701214040079e-09, "loss": 0.7524851, "num_input_tokens_seen": 348232900, "step": 16136, "time_per_iteration": 2.6270158290863037 }, { "auxiliary_loss_clip": 0.01106517, "auxiliary_loss_mlp": 0.0103259, "balance_loss_clip": 1.03631723, "balance_loss_mlp": 1.02158737, "epoch": 0.9702089283030212, "flos": 20590855470720.0, "grad_norm": 1.723008357652219, "language_loss": 0.7604568, "learning_rate": 9.28418067828729e-09, "loss": 0.78184789, "num_input_tokens_seen": 348253065, "step": 16137, "time_per_iteration": 2.611590623855591 }, { "auxiliary_loss_clip": 0.0099169, "auxiliary_loss_mlp": 0.01002259, "balance_loss_clip": 1.01290679, "balance_loss_mlp": 1.00113201, "epoch": 0.9702690515556892, "flos": 70651516291200.0, "grad_norm": 0.7712451352581947, "language_loss": 0.54897171, "learning_rate": 9.246735630678015e-09, "loss": 0.56891119, "num_input_tokens_seen": 348316075, "step": 16138, "time_per_iteration": 3.3687798976898193 }, { "auxiliary_loss_clip": 0.01087536, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.03544235, "balance_loss_mlp": 1.02002001, "epoch": 0.9703291748083571, "flos": 35881439034240.0, "grad_norm": 1.941978950715942, "language_loss": 0.7094661, "learning_rate": 9.209366072632007e-09, "loss": 0.73065829, "num_input_tokens_seen": 348337605, "step": 16139, "time_per_iteration": 2.725593328475952 }, { "auxiliary_loss_clip": 0.01100195, "auxiliary_loss_mlp": 0.01032609, "balance_loss_clip": 1.03781474, "balance_loss_mlp": 1.01973999, "epoch": 0.9703892980610251, "flos": 24316479982080.0, "grad_norm": 1.5269759850750149, "language_loss": 0.72774076, "learning_rate": 9.172072005566134e-09, "loss": 0.7490688, "num_input_tokens_seen": 348359430, "step": 16140, "time_per_iteration": 2.6335747241973877 }, { "auxiliary_loss_clip": 0.01102225, "auxiliary_loss_mlp": 0.00771179, "balance_loss_clip": 1.03837323, "balance_loss_mlp": 1.00030136, "epoch": 0.970449421313693, "flos": 18003743418240.0, "grad_norm": 2.2771543266487586, "language_loss": 0.67710316, "learning_rate": 9.13485343089504e-09, "loss": 0.6958372, "num_input_tokens_seen": 348377890, "step": 16141, "time_per_iteration": 2.588693141937256 }, { "auxiliary_loss_clip": 0.01093094, "auxiliary_loss_mlp": 0.01033621, "balance_loss_clip": 1.03493285, "balance_loss_mlp": 1.02134275, "epoch": 0.9705095445663611, "flos": 25337994865920.0, "grad_norm": 2.0049530138805856, "language_loss": 0.69002879, "learning_rate": 9.097710350029597e-09, "loss": 0.71129596, "num_input_tokens_seen": 348396550, "step": 16142, "time_per_iteration": 2.727897882461548 }, { "auxiliary_loss_clip": 0.01052884, "auxiliary_loss_mlp": 0.01032081, "balance_loss_clip": 1.03308058, "balance_loss_mlp": 1.01940298, "epoch": 0.970569667819029, "flos": 26833602384000.0, "grad_norm": 1.764320667349442, "language_loss": 0.55796802, "learning_rate": 9.060642764378457e-09, "loss": 0.57881761, "num_input_tokens_seen": 348417120, "step": 16143, "time_per_iteration": 2.7790820598602295 }, { "auxiliary_loss_clip": 0.01097025, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.0362649, "balance_loss_mlp": 1.02201712, "epoch": 0.970629791071697, "flos": 25848644567040.0, "grad_norm": 2.1311740509263157, "language_loss": 0.67920631, "learning_rate": 9.023650675347382e-09, "loss": 0.70051003, "num_input_tokens_seen": 348437750, "step": 16144, "time_per_iteration": 2.6120004653930664 }, { "auxiliary_loss_clip": 0.01096108, "auxiliary_loss_mlp": 0.0103709, "balance_loss_clip": 1.03683603, "balance_loss_mlp": 1.0254854, "epoch": 0.9706899143243649, "flos": 36540184510080.0, "grad_norm": 1.6337482713348195, "language_loss": 0.71880758, "learning_rate": 8.986734084339253e-09, "loss": 0.74013954, "num_input_tokens_seen": 348460935, "step": 16145, "time_per_iteration": 2.7305266857147217 }, { "auxiliary_loss_clip": 0.0108585, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.03421783, "balance_loss_mlp": 1.01635957, "epoch": 0.9707500375770329, "flos": 12268234414080.0, "grad_norm": 5.028438763995754, "language_loss": 0.80458283, "learning_rate": 8.949892992753395e-09, "loss": 0.82573903, "num_input_tokens_seen": 348474480, "step": 16146, "time_per_iteration": 2.6035280227661133 }, { "auxiliary_loss_clip": 0.00997894, "auxiliary_loss_mlp": 0.0100175, "balance_loss_clip": 1.00757813, "balance_loss_mlp": 1.00062394, "epoch": 0.9708101608297008, "flos": 60853040196480.0, "grad_norm": 0.7531682380125572, "language_loss": 0.54495502, "learning_rate": 8.91312740198713e-09, "loss": 0.56495154, "num_input_tokens_seen": 348541220, "step": 16147, "time_per_iteration": 3.225588798522949 }, { "auxiliary_loss_clip": 0.01073097, "auxiliary_loss_mlp": 0.00771677, "balance_loss_clip": 1.03335106, "balance_loss_mlp": 1.00021338, "epoch": 0.9708702840823689, "flos": 27124766029440.0, "grad_norm": 3.20403684561858, "language_loss": 0.61148691, "learning_rate": 8.876437313434682e-09, "loss": 0.62993467, "num_input_tokens_seen": 348559230, "step": 16148, "time_per_iteration": 4.195791482925415 }, { "auxiliary_loss_clip": 0.01070921, "auxiliary_loss_mlp": 0.01038429, "balance_loss_clip": 1.03563893, "balance_loss_mlp": 1.02597761, "epoch": 0.9709304073350368, "flos": 20777699041920.0, "grad_norm": 1.6574498866467469, "language_loss": 0.73563087, "learning_rate": 8.839822728487155e-09, "loss": 0.75672436, "num_input_tokens_seen": 348577850, "step": 16149, "time_per_iteration": 4.327805519104004 }, { "auxiliary_loss_clip": 0.01096097, "auxiliary_loss_mlp": 0.01036533, "balance_loss_clip": 1.03510022, "balance_loss_mlp": 1.02391517, "epoch": 0.9709905305877048, "flos": 41934541115520.0, "grad_norm": 2.151336781292665, "language_loss": 0.75191128, "learning_rate": 8.803283648533222e-09, "loss": 0.77323759, "num_input_tokens_seen": 348598345, "step": 16150, "time_per_iteration": 4.396034479141235 }, { "auxiliary_loss_clip": 0.0109299, "auxiliary_loss_mlp": 0.01030493, "balance_loss_clip": 1.03820729, "balance_loss_mlp": 1.01590753, "epoch": 0.9710506538403728, "flos": 17165588486400.0, "grad_norm": 1.9672912428051808, "language_loss": 0.73628724, "learning_rate": 8.766820074958214e-09, "loss": 0.75752205, "num_input_tokens_seen": 348616300, "step": 16151, "time_per_iteration": 2.6692330837249756 }, { "auxiliary_loss_clip": 0.0109559, "auxiliary_loss_mlp": 0.01028623, "balance_loss_clip": 1.03646886, "balance_loss_mlp": 1.01655281, "epoch": 0.9711107770930407, "flos": 21173470070400.0, "grad_norm": 2.2868567232439787, "language_loss": 0.74524468, "learning_rate": 8.730432009145027e-09, "loss": 0.76648676, "num_input_tokens_seen": 348633845, "step": 16152, "time_per_iteration": 2.639920473098755 }, { "auxiliary_loss_clip": 0.0107224, "auxiliary_loss_mlp": 0.01033418, "balance_loss_clip": 1.03668654, "balance_loss_mlp": 1.02151465, "epoch": 0.9711709003457087, "flos": 22237072715520.0, "grad_norm": 1.850919590804903, "language_loss": 0.67173874, "learning_rate": 8.694119452473448e-09, "loss": 0.69279528, "num_input_tokens_seen": 348653070, "step": 16153, "time_per_iteration": 2.69380521774292 }, { "auxiliary_loss_clip": 0.01048504, "auxiliary_loss_mlp": 0.01029289, "balance_loss_clip": 1.03318441, "balance_loss_mlp": 1.01809549, "epoch": 0.9712310235983767, "flos": 26213856099840.0, "grad_norm": 13.061634148388642, "language_loss": 0.70930749, "learning_rate": 8.65788240632037e-09, "loss": 0.73008543, "num_input_tokens_seen": 348672145, "step": 16154, "time_per_iteration": 4.310068607330322 }, { "auxiliary_loss_clip": 0.01063978, "auxiliary_loss_mlp": 0.01032173, "balance_loss_clip": 1.04066324, "balance_loss_mlp": 1.01833844, "epoch": 0.9712911468510447, "flos": 20668171495680.0, "grad_norm": 1.6587681231692977, "language_loss": 0.80700165, "learning_rate": 8.621720872059812e-09, "loss": 0.82796311, "num_input_tokens_seen": 348690615, "step": 16155, "time_per_iteration": 2.7987523078918457 }, { "auxiliary_loss_clip": 0.01098298, "auxiliary_loss_mlp": 0.00771693, "balance_loss_clip": 1.03783345, "balance_loss_mlp": 1.00030363, "epoch": 0.9713512701037126, "flos": 13552903313280.0, "grad_norm": 1.945476752267927, "language_loss": 0.6769433, "learning_rate": 8.58563485106334e-09, "loss": 0.69564319, "num_input_tokens_seen": 348708665, "step": 16156, "time_per_iteration": 2.679084062576294 }, { "auxiliary_loss_clip": 0.01098233, "auxiliary_loss_mlp": 0.01031321, "balance_loss_clip": 1.03533268, "balance_loss_mlp": 1.01955533, "epoch": 0.9714113933563806, "flos": 25848752307840.0, "grad_norm": 2.586712346196416, "language_loss": 0.9075287, "learning_rate": 8.54962434469919e-09, "loss": 0.92882419, "num_input_tokens_seen": 348726105, "step": 16157, "time_per_iteration": 2.6537325382232666 }, { "auxiliary_loss_clip": 0.01071902, "auxiliary_loss_mlp": 0.00770052, "balance_loss_clip": 1.03688991, "balance_loss_mlp": 1.00026488, "epoch": 0.9714715166090485, "flos": 12743081233920.0, "grad_norm": 1.749407686127521, "language_loss": 0.72465503, "learning_rate": 8.513689354332721e-09, "loss": 0.74307454, "num_input_tokens_seen": 348743360, "step": 16158, "time_per_iteration": 2.7036380767822266 }, { "auxiliary_loss_clip": 0.01059022, "auxiliary_loss_mlp": 0.01037853, "balance_loss_clip": 1.03384304, "balance_loss_mlp": 1.02509737, "epoch": 0.9715316398617165, "flos": 18405547931520.0, "grad_norm": 2.013888583799996, "language_loss": 0.60360491, "learning_rate": 8.477829881326836e-09, "loss": 0.62457371, "num_input_tokens_seen": 348759045, "step": 16159, "time_per_iteration": 2.6209466457366943 }, { "auxiliary_loss_clip": 0.01103648, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.03575277, "balance_loss_mlp": 1.01424837, "epoch": 0.9715917631143844, "flos": 28913799749760.0, "grad_norm": 1.651339792325088, "language_loss": 0.78989285, "learning_rate": 8.44204592704112e-09, "loss": 0.81118041, "num_input_tokens_seen": 348779910, "step": 16160, "time_per_iteration": 2.5234336853027344 }, { "auxiliary_loss_clip": 0.01027371, "auxiliary_loss_mlp": 0.01000477, "balance_loss_clip": 1.00497746, "balance_loss_mlp": 0.99951786, "epoch": 0.9716518863670525, "flos": 65939712900480.0, "grad_norm": 0.7683763573739155, "language_loss": 0.54203629, "learning_rate": 8.406337492832704e-09, "loss": 0.56231475, "num_input_tokens_seen": 348838995, "step": 16161, "time_per_iteration": 3.0858347415924072 }, { "auxiliary_loss_clip": 0.01094745, "auxiliary_loss_mlp": 0.00769904, "balance_loss_clip": 1.03734314, "balance_loss_mlp": 1.00019956, "epoch": 0.9717120096197204, "flos": 17712759340800.0, "grad_norm": 1.8388776753499438, "language_loss": 0.72078347, "learning_rate": 8.3707045800554e-09, "loss": 0.73942995, "num_input_tokens_seen": 348858090, "step": 16162, "time_per_iteration": 2.4713857173919678 }, { "auxiliary_loss_clip": 0.01070522, "auxiliary_loss_mlp": 0.010289, "balance_loss_clip": 1.03172445, "balance_loss_mlp": 1.01611447, "epoch": 0.9717721328723884, "flos": 24463426521600.0, "grad_norm": 1.6638325085203318, "language_loss": 0.78620613, "learning_rate": 8.335147190060787e-09, "loss": 0.80720031, "num_input_tokens_seen": 348877885, "step": 16163, "time_per_iteration": 2.6069257259368896 }, { "auxiliary_loss_clip": 0.01083213, "auxiliary_loss_mlp": 0.01027578, "balance_loss_clip": 1.03707957, "balance_loss_mlp": 1.01624179, "epoch": 0.9718322561250564, "flos": 20776477979520.0, "grad_norm": 2.364704456697354, "language_loss": 0.72864258, "learning_rate": 8.299665324196903e-09, "loss": 0.74975049, "num_input_tokens_seen": 348897720, "step": 16164, "time_per_iteration": 2.6400234699249268 }, { "auxiliary_loss_clip": 0.01045604, "auxiliary_loss_mlp": 0.01044632, "balance_loss_clip": 1.03097391, "balance_loss_mlp": 1.029773, "epoch": 0.9718923793777243, "flos": 19025904746880.0, "grad_norm": 1.8541776614197814, "language_loss": 0.83818543, "learning_rate": 8.264258983809114e-09, "loss": 0.85908771, "num_input_tokens_seen": 348915410, "step": 16165, "time_per_iteration": 2.729191303253174 }, { "auxiliary_loss_clip": 0.01071333, "auxiliary_loss_mlp": 0.01027443, "balance_loss_clip": 1.03399253, "balance_loss_mlp": 1.01615393, "epoch": 0.9719525026303923, "flos": 21871717528320.0, "grad_norm": 2.4684136710713664, "language_loss": 0.79201269, "learning_rate": 8.228928170240345e-09, "loss": 0.81300044, "num_input_tokens_seen": 348934335, "step": 16166, "time_per_iteration": 2.6733477115631104 }, { "auxiliary_loss_clip": 0.01084172, "auxiliary_loss_mlp": 0.01027293, "balance_loss_clip": 1.03812957, "balance_loss_mlp": 1.01548481, "epoch": 0.9720126258830603, "flos": 14429303251200.0, "grad_norm": 1.7663595445124196, "language_loss": 0.70758253, "learning_rate": 8.193672884830195e-09, "loss": 0.72869724, "num_input_tokens_seen": 348952405, "step": 16167, "time_per_iteration": 2.7085564136505127 }, { "auxiliary_loss_clip": 0.01079731, "auxiliary_loss_mlp": 0.01035805, "balance_loss_clip": 1.03778422, "balance_loss_mlp": 1.02379441, "epoch": 0.9720727491357283, "flos": 26251167352320.0, "grad_norm": 1.8138771680519867, "language_loss": 0.75927782, "learning_rate": 8.158493128915812e-09, "loss": 0.78043312, "num_input_tokens_seen": 348973580, "step": 16168, "time_per_iteration": 2.67354154586792 }, { "auxiliary_loss_clip": 0.01049039, "auxiliary_loss_mlp": 0.01050689, "balance_loss_clip": 1.03055644, "balance_loss_mlp": 1.03582323, "epoch": 0.9721328723883962, "flos": 22674105492480.0, "grad_norm": 2.5093639466048896, "language_loss": 0.72537249, "learning_rate": 8.123388903830797e-09, "loss": 0.74636978, "num_input_tokens_seen": 348992035, "step": 16169, "time_per_iteration": 2.7542500495910645 }, { "auxiliary_loss_clip": 0.01073449, "auxiliary_loss_mlp": 0.01038056, "balance_loss_clip": 1.03180361, "balance_loss_mlp": 1.02368569, "epoch": 0.9721929956410642, "flos": 28074172360320.0, "grad_norm": 1.7172146559968202, "language_loss": 0.57560009, "learning_rate": 8.088360210906309e-09, "loss": 0.59671509, "num_input_tokens_seen": 349013160, "step": 16170, "time_per_iteration": 2.784191370010376 }, { "auxiliary_loss_clip": 0.01075999, "auxiliary_loss_mlp": 0.01032463, "balance_loss_clip": 1.03437006, "balance_loss_mlp": 1.01930237, "epoch": 0.9722531188937321, "flos": 20996251344000.0, "grad_norm": 1.991276787299532, "language_loss": 0.71702683, "learning_rate": 8.053407051471062e-09, "loss": 0.7381115, "num_input_tokens_seen": 349033485, "step": 16171, "time_per_iteration": 2.7290470600128174 }, { "auxiliary_loss_clip": 0.01074193, "auxiliary_loss_mlp": 0.01036428, "balance_loss_clip": 1.03374755, "balance_loss_mlp": 1.02373838, "epoch": 0.9723132421464001, "flos": 16070600332800.0, "grad_norm": 3.7050893371500973, "language_loss": 0.68799138, "learning_rate": 8.018529426850218e-09, "loss": 0.70909762, "num_input_tokens_seen": 349051705, "step": 16172, "time_per_iteration": 2.7984087467193604 }, { "auxiliary_loss_clip": 0.01092548, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.03369451, "balance_loss_mlp": 1.01790619, "epoch": 0.972373365399068, "flos": 27745769289600.0, "grad_norm": 2.273393003122684, "language_loss": 0.85909021, "learning_rate": 7.983727338366274e-09, "loss": 0.88031983, "num_input_tokens_seen": 349070825, "step": 16173, "time_per_iteration": 2.637646198272705 }, { "auxiliary_loss_clip": 0.01058492, "auxiliary_loss_mlp": 0.01037401, "balance_loss_clip": 1.03226995, "balance_loss_mlp": 1.02290511, "epoch": 0.9724334886517361, "flos": 23002939526400.0, "grad_norm": 2.532344740288213, "language_loss": 0.64345253, "learning_rate": 7.949000787339289e-09, "loss": 0.66441143, "num_input_tokens_seen": 349089730, "step": 16174, "time_per_iteration": 2.6890182495117188 }, { "auxiliary_loss_clip": 0.0109623, "auxiliary_loss_mlp": 0.01029844, "balance_loss_clip": 1.03573728, "balance_loss_mlp": 1.01808977, "epoch": 0.972493611904404, "flos": 25447055535360.0, "grad_norm": 1.5574440695217635, "language_loss": 0.78149283, "learning_rate": 7.914349775085538e-09, "loss": 0.80275363, "num_input_tokens_seen": 349111315, "step": 16175, "time_per_iteration": 2.65380597114563 }, { "auxiliary_loss_clip": 0.01098527, "auxiliary_loss_mlp": 0.0103633, "balance_loss_clip": 1.03696692, "balance_loss_mlp": 1.02305567, "epoch": 0.972553735157072, "flos": 16983054547200.0, "grad_norm": 2.4406961253744637, "language_loss": 0.56965649, "learning_rate": 7.879774302919307e-09, "loss": 0.59100509, "num_input_tokens_seen": 349129495, "step": 16176, "time_per_iteration": 2.564636707305908 }, { "auxiliary_loss_clip": 0.01088801, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.03812397, "balance_loss_mlp": 1.02081394, "epoch": 0.97261385840974, "flos": 26104651776000.0, "grad_norm": 2.4918025895156557, "language_loss": 0.72519267, "learning_rate": 7.845274372151545e-09, "loss": 0.74640346, "num_input_tokens_seen": 349148850, "step": 16177, "time_per_iteration": 2.677704334259033 }, { "auxiliary_loss_clip": 0.01087782, "auxiliary_loss_mlp": 0.01029248, "balance_loss_clip": 1.03436661, "balance_loss_mlp": 1.01660562, "epoch": 0.9726739816624079, "flos": 25447881548160.0, "grad_norm": 1.6303663037965777, "language_loss": 0.68360388, "learning_rate": 7.810849984090984e-09, "loss": 0.70477414, "num_input_tokens_seen": 349167620, "step": 16178, "time_per_iteration": 2.6498606204986572 }, { "auxiliary_loss_clip": 0.01054589, "auxiliary_loss_mlp": 0.01032086, "balance_loss_clip": 1.03061843, "balance_loss_mlp": 1.01890159, "epoch": 0.972734104915076, "flos": 29014923513600.0, "grad_norm": 1.7151954479923888, "language_loss": 0.66904575, "learning_rate": 7.776501140042358e-09, "loss": 0.68991244, "num_input_tokens_seen": 349185845, "step": 16179, "time_per_iteration": 2.9617762565612793 }, { "auxiliary_loss_clip": 0.01083826, "auxiliary_loss_mlp": 0.00768898, "balance_loss_clip": 1.03630555, "balance_loss_mlp": 1.0001514, "epoch": 0.9727942281677439, "flos": 23437637919360.0, "grad_norm": 2.630214780518977, "language_loss": 0.77113461, "learning_rate": 7.742227841308624e-09, "loss": 0.78966182, "num_input_tokens_seen": 349204525, "step": 16180, "time_per_iteration": 2.6464152336120605 }, { "auxiliary_loss_clip": 0.01098634, "auxiliary_loss_mlp": 0.01030814, "balance_loss_clip": 1.03539276, "balance_loss_mlp": 1.01826119, "epoch": 0.9728543514204119, "flos": 31724599749120.0, "grad_norm": 2.262434982216008, "language_loss": 0.76220429, "learning_rate": 7.708030089189188e-09, "loss": 0.78349876, "num_input_tokens_seen": 349228075, "step": 16181, "time_per_iteration": 2.677198648452759 }, { "auxiliary_loss_clip": 0.01106677, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.03586745, "balance_loss_mlp": 1.02323365, "epoch": 0.9729144746730798, "flos": 16289368116480.0, "grad_norm": 1.5196924475010567, "language_loss": 0.63252479, "learning_rate": 7.67390788498079e-09, "loss": 0.65394562, "num_input_tokens_seen": 349246990, "step": 16182, "time_per_iteration": 2.554809093475342 }, { "auxiliary_loss_clip": 0.01041817, "auxiliary_loss_mlp": 0.01042152, "balance_loss_clip": 1.04146159, "balance_loss_mlp": 1.0289433, "epoch": 0.9729745979257478, "flos": 25041408266880.0, "grad_norm": 1.789194263856678, "language_loss": 0.62447584, "learning_rate": 7.639861229977507e-09, "loss": 0.64531553, "num_input_tokens_seen": 349265890, "step": 16183, "time_per_iteration": 3.175109624862671 }, { "auxiliary_loss_clip": 0.01085962, "auxiliary_loss_mlp": 0.01037692, "balance_loss_clip": 1.03510141, "balance_loss_mlp": 1.02473438, "epoch": 0.9730347211784157, "flos": 22638733574400.0, "grad_norm": 1.6456930738589919, "language_loss": 0.78234679, "learning_rate": 7.605890125470527e-09, "loss": 0.80358338, "num_input_tokens_seen": 349285275, "step": 16184, "time_per_iteration": 2.9018943309783936 }, { "auxiliary_loss_clip": 0.01068538, "auxiliary_loss_mlp": 0.01033694, "balance_loss_clip": 1.03069115, "balance_loss_mlp": 1.02024758, "epoch": 0.9730948444310837, "flos": 10998613313280.0, "grad_norm": 2.161376757576218, "language_loss": 0.79345584, "learning_rate": 7.571994572747709e-09, "loss": 0.8144781, "num_input_tokens_seen": 349301515, "step": 16185, "time_per_iteration": 2.641317129135132 }, { "auxiliary_loss_clip": 0.01077077, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.03456235, "balance_loss_mlp": 1.01660085, "epoch": 0.9731549676837516, "flos": 16799479113600.0, "grad_norm": 2.015706111725158, "language_loss": 0.77789813, "learning_rate": 7.538174573094469e-09, "loss": 0.79895198, "num_input_tokens_seen": 349319590, "step": 16186, "time_per_iteration": 2.698368787765503 }, { "auxiliary_loss_clip": 0.01084734, "auxiliary_loss_mlp": 0.01029196, "balance_loss_clip": 1.0357089, "balance_loss_mlp": 1.01675642, "epoch": 0.9732150909364197, "flos": 21141761339520.0, "grad_norm": 1.7572799383983544, "language_loss": 0.65494901, "learning_rate": 7.504430127793337e-09, "loss": 0.67608833, "num_input_tokens_seen": 349339230, "step": 16187, "time_per_iteration": 4.130638122558594 }, { "auxiliary_loss_clip": 0.01079645, "auxiliary_loss_mlp": 0.01038619, "balance_loss_clip": 1.03164029, "balance_loss_mlp": 1.02523208, "epoch": 0.9732752141890876, "flos": 33727337435520.0, "grad_norm": 1.8431543667714356, "language_loss": 0.80137229, "learning_rate": 7.47076123812418e-09, "loss": 0.82255495, "num_input_tokens_seen": 349361155, "step": 16188, "time_per_iteration": 4.257014989852905 }, { "auxiliary_loss_clip": 0.01072207, "auxiliary_loss_mlp": 0.0103015, "balance_loss_clip": 1.03375018, "balance_loss_mlp": 1.01883137, "epoch": 0.9733353374417556, "flos": 23404384903680.0, "grad_norm": 1.7664281085479938, "language_loss": 0.78316271, "learning_rate": 7.437167905363084e-09, "loss": 0.80418628, "num_input_tokens_seen": 349379335, "step": 16189, "time_per_iteration": 2.675529718399048 }, { "auxiliary_loss_clip": 0.01092046, "auxiliary_loss_mlp": 0.0102785, "balance_loss_clip": 1.03294408, "balance_loss_mlp": 1.01514196, "epoch": 0.9733954606944236, "flos": 39165792963840.0, "grad_norm": 1.7197781596757225, "language_loss": 0.51230407, "learning_rate": 7.403650130784367e-09, "loss": 0.533503, "num_input_tokens_seen": 349401575, "step": 16190, "time_per_iteration": 4.908695459365845 }, { "auxiliary_loss_clip": 0.01098154, "auxiliary_loss_mlp": 0.01030754, "balance_loss_clip": 1.03689873, "balance_loss_mlp": 1.01865995, "epoch": 0.9734555839470915, "flos": 21981819692160.0, "grad_norm": 1.7152390443101855, "language_loss": 0.80948341, "learning_rate": 7.3702079156590105e-09, "loss": 0.83077252, "num_input_tokens_seen": 349420650, "step": 16191, "time_per_iteration": 2.6668500900268555 }, { "auxiliary_loss_clip": 0.01091143, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.03202808, "balance_loss_mlp": 1.01971912, "epoch": 0.9735157071997596, "flos": 16575539771520.0, "grad_norm": 1.6910464048805458, "language_loss": 0.8259176, "learning_rate": 7.336841261255111e-09, "loss": 0.84714502, "num_input_tokens_seen": 349436830, "step": 16192, "time_per_iteration": 2.569251537322998 }, { "auxiliary_loss_clip": 0.01046721, "auxiliary_loss_mlp": 0.01039813, "balance_loss_clip": 1.03544569, "balance_loss_mlp": 1.02665234, "epoch": 0.9735758304524275, "flos": 20223237726720.0, "grad_norm": 1.8106504266161225, "language_loss": 0.74773109, "learning_rate": 7.303550168837658e-09, "loss": 0.76859641, "num_input_tokens_seen": 349454325, "step": 16193, "time_per_iteration": 4.564434051513672 }, { "auxiliary_loss_clip": 0.01079567, "auxiliary_loss_mlp": 0.01031423, "balance_loss_clip": 1.03505838, "balance_loss_mlp": 1.020015, "epoch": 0.9736359537050955, "flos": 23653353047040.0, "grad_norm": 1.8191654334710798, "language_loss": 0.85254693, "learning_rate": 7.270334639669417e-09, "loss": 0.87365687, "num_input_tokens_seen": 349470230, "step": 16194, "time_per_iteration": 2.687668561935425 }, { "auxiliary_loss_clip": 0.01070428, "auxiliary_loss_mlp": 0.01037259, "balance_loss_clip": 1.03369021, "balance_loss_mlp": 1.02468801, "epoch": 0.9736960769577634, "flos": 15560202026880.0, "grad_norm": 1.6441349965800172, "language_loss": 0.75818932, "learning_rate": 7.237194675009828e-09, "loss": 0.77926624, "num_input_tokens_seen": 349486250, "step": 16195, "time_per_iteration": 2.6451404094696045 }, { "auxiliary_loss_clip": 0.01004847, "auxiliary_loss_mlp": 0.01000872, "balance_loss_clip": 1.00990903, "balance_loss_mlp": 0.99979365, "epoch": 0.9737562002104314, "flos": 65351783088000.0, "grad_norm": 0.708245154030494, "language_loss": 0.52467954, "learning_rate": 7.204130276115439e-09, "loss": 0.54473674, "num_input_tokens_seen": 349545865, "step": 16196, "time_per_iteration": 3.186091184616089 }, { "auxiliary_loss_clip": 0.01084909, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.03660226, "balance_loss_mlp": 1.01945114, "epoch": 0.9738163234630993, "flos": 27196730928000.0, "grad_norm": 2.030454883195646, "language_loss": 0.7627387, "learning_rate": 7.171141444240136e-09, "loss": 0.78390199, "num_input_tokens_seen": 349566080, "step": 16197, "time_per_iteration": 2.8780059814453125 }, { "auxiliary_loss_clip": 0.0111131, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.03635693, "balance_loss_mlp": 1.01535797, "epoch": 0.9738764467157673, "flos": 21069365477760.0, "grad_norm": 1.7142052132721648, "language_loss": 0.67503351, "learning_rate": 7.13822818063492e-09, "loss": 0.69642824, "num_input_tokens_seen": 349585665, "step": 16198, "time_per_iteration": 2.689474582672119 }, { "auxiliary_loss_clip": 0.01107297, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.03572273, "balance_loss_mlp": 1.01887083, "epoch": 0.9739365699684353, "flos": 21361211481600.0, "grad_norm": 1.916549844614904, "language_loss": 0.78117663, "learning_rate": 7.10539048654768e-09, "loss": 0.80256933, "num_input_tokens_seen": 349605125, "step": 16199, "time_per_iteration": 2.5536978244781494 }, { "auxiliary_loss_clip": 0.0108445, "auxiliary_loss_mlp": 0.01035036, "balance_loss_clip": 1.03713942, "balance_loss_mlp": 1.02260256, "epoch": 0.9739966932211033, "flos": 21902061542400.0, "grad_norm": 1.9705409422067974, "language_loss": 0.79409683, "learning_rate": 7.072628363223865e-09, "loss": 0.81529176, "num_input_tokens_seen": 349623360, "step": 16200, "time_per_iteration": 2.6256768703460693 }, { "auxiliary_loss_clip": 0.01058782, "auxiliary_loss_mlp": 0.01035774, "balance_loss_clip": 1.03694201, "balance_loss_mlp": 1.02227926, "epoch": 0.9740568164737712, "flos": 24827345164800.0, "grad_norm": 2.0331349042288878, "language_loss": 0.68434143, "learning_rate": 7.039941811905592e-09, "loss": 0.70528698, "num_input_tokens_seen": 349644390, "step": 16201, "time_per_iteration": 2.8037257194519043 }, { "auxiliary_loss_clip": 0.01075577, "auxiliary_loss_mlp": 0.01033677, "balance_loss_clip": 1.03323948, "balance_loss_mlp": 1.02163649, "epoch": 0.9741169397264392, "flos": 23623583650560.0, "grad_norm": 1.5025292618741064, "language_loss": 0.72862577, "learning_rate": 7.0073308338325364e-09, "loss": 0.74971825, "num_input_tokens_seen": 349663200, "step": 16202, "time_per_iteration": 2.662804126739502 }, { "auxiliary_loss_clip": 0.0108729, "auxiliary_loss_mlp": 0.01034456, "balance_loss_clip": 1.03576303, "balance_loss_mlp": 1.02150416, "epoch": 0.9741770629791072, "flos": 18841144164480.0, "grad_norm": 2.6824456959299052, "language_loss": 0.72871369, "learning_rate": 6.974795430241265e-09, "loss": 0.74993122, "num_input_tokens_seen": 349681975, "step": 16203, "time_per_iteration": 2.5910871028900146 }, { "auxiliary_loss_clip": 0.01109424, "auxiliary_loss_mlp": 0.01033432, "balance_loss_clip": 1.03729725, "balance_loss_mlp": 1.02117181, "epoch": 0.9742371862317751, "flos": 22346241125760.0, "grad_norm": 1.9882435140281416, "language_loss": 0.77292311, "learning_rate": 6.942335602365235e-09, "loss": 0.7943517, "num_input_tokens_seen": 349701185, "step": 16204, "time_per_iteration": 2.599534273147583 }, { "auxiliary_loss_clip": 0.01091233, "auxiliary_loss_mlp": 0.01034218, "balance_loss_clip": 1.03830349, "balance_loss_mlp": 1.02127194, "epoch": 0.9742973094844432, "flos": 21762764599680.0, "grad_norm": 2.04301514318933, "language_loss": 0.79557073, "learning_rate": 6.909951351435905e-09, "loss": 0.81682527, "num_input_tokens_seen": 349720360, "step": 16205, "time_per_iteration": 2.611509323120117 }, { "auxiliary_loss_clip": 0.01106984, "auxiliary_loss_mlp": 0.01033558, "balance_loss_clip": 1.03618968, "balance_loss_mlp": 1.02133942, "epoch": 0.9743574327371111, "flos": 26248725227520.0, "grad_norm": 1.7129263404714312, "language_loss": 0.74342418, "learning_rate": 6.87764267868074e-09, "loss": 0.76482964, "num_input_tokens_seen": 349741040, "step": 16206, "time_per_iteration": 2.5808560848236084 }, { "auxiliary_loss_clip": 0.01055158, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.03472948, "balance_loss_mlp": 1.019382, "epoch": 0.9744175559897791, "flos": 12349321367040.0, "grad_norm": 2.3020742472105375, "language_loss": 0.83948338, "learning_rate": 6.8454095853252015e-09, "loss": 0.86035663, "num_input_tokens_seen": 349758895, "step": 16207, "time_per_iteration": 3.118260622024536 }, { "auxiliary_loss_clip": 0.01096985, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.03668702, "balance_loss_mlp": 1.0217663, "epoch": 0.974477679242447, "flos": 28397834835840.0, "grad_norm": 1.7142608213779496, "language_loss": 0.71005446, "learning_rate": 6.813252072591425e-09, "loss": 0.73136348, "num_input_tokens_seen": 349779740, "step": 16208, "time_per_iteration": 2.631779909133911 }, { "auxiliary_loss_clip": 0.01068659, "auxiliary_loss_mlp": 0.01025995, "balance_loss_clip": 1.03373158, "balance_loss_mlp": 1.01523638, "epoch": 0.974537802495115, "flos": 17785370684160.0, "grad_norm": 1.6324180098602632, "language_loss": 0.77270913, "learning_rate": 6.781170141698878e-09, "loss": 0.79365563, "num_input_tokens_seen": 349796820, "step": 16209, "time_per_iteration": 2.648383617401123 }, { "auxiliary_loss_clip": 0.01070166, "auxiliary_loss_mlp": 0.0077274, "balance_loss_clip": 1.03177297, "balance_loss_mlp": 1.0000906, "epoch": 0.9745979257477829, "flos": 23842315520640.0, "grad_norm": 1.7959688952091581, "language_loss": 0.79134548, "learning_rate": 6.749163793864144e-09, "loss": 0.80977452, "num_input_tokens_seen": 349816550, "step": 16210, "time_per_iteration": 2.693124294281006 }, { "auxiliary_loss_clip": 0.01082394, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.03368235, "balance_loss_mlp": 1.02293837, "epoch": 0.9746580490004509, "flos": 27016172236800.0, "grad_norm": 2.111674380643122, "language_loss": 0.7811175, "learning_rate": 6.7172330303009176e-09, "loss": 0.80229396, "num_input_tokens_seen": 349834350, "step": 16211, "time_per_iteration": 2.7423813343048096 }, { "auxiliary_loss_clip": 0.01074533, "auxiliary_loss_mlp": 0.0103461, "balance_loss_clip": 1.03468013, "balance_loss_mlp": 1.02106786, "epoch": 0.9747181722531189, "flos": 19792022952960.0, "grad_norm": 2.2346090535911345, "language_loss": 0.78309953, "learning_rate": 6.685377852219787e-09, "loss": 0.80419093, "num_input_tokens_seen": 349853460, "step": 16212, "time_per_iteration": 2.7550909519195557 }, { "auxiliary_loss_clip": 0.01076477, "auxiliary_loss_mlp": 0.01032217, "balance_loss_clip": 1.03523839, "balance_loss_mlp": 1.02030253, "epoch": 0.9747782955057869, "flos": 31430598929280.0, "grad_norm": 1.4958012465208934, "language_loss": 0.79993176, "learning_rate": 6.653598260829118e-09, "loss": 0.8210187, "num_input_tokens_seen": 349874830, "step": 16213, "time_per_iteration": 2.8637707233428955 }, { "auxiliary_loss_clip": 0.01062528, "auxiliary_loss_mlp": 0.01026041, "balance_loss_clip": 1.03252554, "balance_loss_mlp": 1.01400709, "epoch": 0.9748384187584548, "flos": 15961288268160.0, "grad_norm": 1.9405763574770338, "language_loss": 0.66294038, "learning_rate": 6.6218942573335044e-09, "loss": 0.68382609, "num_input_tokens_seen": 349893690, "step": 16214, "time_per_iteration": 2.699460029602051 }, { "auxiliary_loss_clip": 0.01095715, "auxiliary_loss_mlp": 0.01030057, "balance_loss_clip": 1.04145873, "balance_loss_mlp": 1.01690817, "epoch": 0.9748985420111228, "flos": 20558715776640.0, "grad_norm": 1.7124616404956563, "language_loss": 0.73894978, "learning_rate": 6.5902658429355386e-09, "loss": 0.76020747, "num_input_tokens_seen": 349912480, "step": 16215, "time_per_iteration": 2.6812703609466553 }, { "auxiliary_loss_clip": 0.01057347, "auxiliary_loss_mlp": 0.01034106, "balance_loss_clip": 1.03506923, "balance_loss_mlp": 1.02194071, "epoch": 0.9749586652637908, "flos": 36721605127680.0, "grad_norm": 2.1409618688352583, "language_loss": 0.6697464, "learning_rate": 6.558713018834483e-09, "loss": 0.69066095, "num_input_tokens_seen": 349932470, "step": 16216, "time_per_iteration": 2.8369500637054443 }, { "auxiliary_loss_clip": 0.01053374, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 1.03023767, "balance_loss_mlp": 1.01911426, "epoch": 0.9750187885164587, "flos": 10999223844480.0, "grad_norm": 2.017970280416665, "language_loss": 0.71706629, "learning_rate": 6.527235786226937e-09, "loss": 0.73791993, "num_input_tokens_seen": 349949060, "step": 16217, "time_per_iteration": 2.7381694316864014 }, { "auxiliary_loss_clip": 0.01074463, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.03641343, "balance_loss_mlp": 1.01587594, "epoch": 0.9750789117691268, "flos": 25739512070400.0, "grad_norm": 1.6438610736190364, "language_loss": 0.78195894, "learning_rate": 6.495834146306167e-09, "loss": 0.80298543, "num_input_tokens_seen": 349968010, "step": 16218, "time_per_iteration": 2.7634236812591553 }, { "auxiliary_loss_clip": 0.01079204, "auxiliary_loss_mlp": 0.01029255, "balance_loss_clip": 1.03390265, "balance_loss_mlp": 1.01689315, "epoch": 0.9751390350217947, "flos": 13333955961600.0, "grad_norm": 2.591457126969395, "language_loss": 0.77337241, "learning_rate": 6.464508100263222e-09, "loss": 0.79445708, "num_input_tokens_seen": 349985270, "step": 16219, "time_per_iteration": 2.7380733489990234 }, { "auxiliary_loss_clip": 0.01087952, "auxiliary_loss_mlp": 0.01032908, "balance_loss_clip": 1.03535342, "balance_loss_mlp": 1.02096331, "epoch": 0.9751991582744627, "flos": 22820621068800.0, "grad_norm": 1.7048563405563817, "language_loss": 0.81480777, "learning_rate": 6.433257649285817e-09, "loss": 0.83601636, "num_input_tokens_seen": 350003935, "step": 16220, "time_per_iteration": 2.6495344638824463 }, { "auxiliary_loss_clip": 0.01106693, "auxiliary_loss_mlp": 0.01032081, "balance_loss_clip": 1.03613138, "balance_loss_mlp": 1.02025533, "epoch": 0.9752592815271306, "flos": 19646189735040.0, "grad_norm": 1.7107968412659516, "language_loss": 0.75237, "learning_rate": 6.402082794559227e-09, "loss": 0.77375782, "num_input_tokens_seen": 350023595, "step": 16221, "time_per_iteration": 2.5049870014190674 }, { "auxiliary_loss_clip": 0.01072645, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.0333364, "balance_loss_mlp": 1.01963329, "epoch": 0.9753194047797986, "flos": 26690462686080.0, "grad_norm": 1.478633421454376, "language_loss": 0.66371262, "learning_rate": 6.370983537265395e-09, "loss": 0.68475342, "num_input_tokens_seen": 350045920, "step": 16222, "time_per_iteration": 2.7511966228485107 }, { "auxiliary_loss_clip": 0.0109569, "auxiliary_loss_mlp": 0.01029193, "balance_loss_clip": 1.03627598, "balance_loss_mlp": 1.01753998, "epoch": 0.9753795280324665, "flos": 23221779137280.0, "grad_norm": 1.9485164555129428, "language_loss": 0.8856619, "learning_rate": 6.3399598785836004e-09, "loss": 0.90691066, "num_input_tokens_seen": 350063925, "step": 16223, "time_per_iteration": 2.864657163619995 }, { "auxiliary_loss_clip": 0.01045431, "auxiliary_loss_mlp": 0.01035052, "balance_loss_clip": 1.03191626, "balance_loss_mlp": 1.02308941, "epoch": 0.9754396512851345, "flos": 19463835363840.0, "grad_norm": 1.8028212337000589, "language_loss": 0.74985182, "learning_rate": 6.309011819690457e-09, "loss": 0.7706567, "num_input_tokens_seen": 350080900, "step": 16224, "time_per_iteration": 2.7734134197235107 }, { "auxiliary_loss_clip": 0.01010696, "auxiliary_loss_mlp": 0.0100273, "balance_loss_clip": 1.00753188, "balance_loss_mlp": 1.00170505, "epoch": 0.9754997745378025, "flos": 68459313340800.0, "grad_norm": 0.8348178291134782, "language_loss": 0.5909391, "learning_rate": 6.278139361759249e-09, "loss": 0.61107337, "num_input_tokens_seen": 350144550, "step": 16225, "time_per_iteration": 3.203700065612793 }, { "auxiliary_loss_clip": 0.01075593, "auxiliary_loss_mlp": 0.00770137, "balance_loss_clip": 1.03655672, "balance_loss_mlp": 1.00027668, "epoch": 0.9755598977904705, "flos": 26395168976640.0, "grad_norm": 2.1280321736121364, "language_loss": 0.68929291, "learning_rate": 6.247342505960818e-09, "loss": 0.7077502, "num_input_tokens_seen": 350164050, "step": 16226, "time_per_iteration": 2.7182259559631348 }, { "auxiliary_loss_clip": 0.01094266, "auxiliary_loss_mlp": 0.01042676, "balance_loss_clip": 1.03407538, "balance_loss_mlp": 1.02954507, "epoch": 0.9756200210431384, "flos": 16617663446400.0, "grad_norm": 1.92516234582211, "language_loss": 0.82812244, "learning_rate": 6.216621253462894e-09, "loss": 0.84949183, "num_input_tokens_seen": 350181350, "step": 16227, "time_per_iteration": 4.278809547424316 }, { "auxiliary_loss_clip": 0.01106745, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.03587723, "balance_loss_mlp": 1.01710916, "epoch": 0.9756801442958064, "flos": 23623044946560.0, "grad_norm": 1.6986847521997988, "language_loss": 0.77753866, "learning_rate": 6.185975605430549e-09, "loss": 0.79889619, "num_input_tokens_seen": 350199765, "step": 16228, "time_per_iteration": 4.098712205886841 }, { "auxiliary_loss_clip": 0.01018838, "auxiliary_loss_mlp": 0.01000083, "balance_loss_clip": 1.00571454, "balance_loss_mlp": 0.99909353, "epoch": 0.9757402675484744, "flos": 61625799440640.0, "grad_norm": 0.84298125837055, "language_loss": 0.55775201, "learning_rate": 6.155405563025962e-09, "loss": 0.57794118, "num_input_tokens_seen": 350256420, "step": 16229, "time_per_iteration": 4.671915292739868 }, { "auxiliary_loss_clip": 0.01097026, "auxiliary_loss_mlp": 0.01031698, "balance_loss_clip": 1.03510642, "balance_loss_mlp": 1.01906228, "epoch": 0.9758003908011423, "flos": 24058964401920.0, "grad_norm": 1.6630448372353723, "language_loss": 0.74857068, "learning_rate": 6.124911127407984e-09, "loss": 0.76985788, "num_input_tokens_seen": 350276270, "step": 16230, "time_per_iteration": 2.637298822402954 }, { "auxiliary_loss_clip": 0.01080882, "auxiliary_loss_mlp": 0.01029958, "balance_loss_clip": 1.03464866, "balance_loss_mlp": 1.01841259, "epoch": 0.9758605140538104, "flos": 17493093717120.0, "grad_norm": 2.3627285859767992, "language_loss": 0.72050405, "learning_rate": 6.094492299733245e-09, "loss": 0.74161243, "num_input_tokens_seen": 350295000, "step": 16231, "time_per_iteration": 2.606243133544922 }, { "auxiliary_loss_clip": 0.01087789, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.03723931, "balance_loss_mlp": 1.019225, "epoch": 0.9759206373064783, "flos": 24826950115200.0, "grad_norm": 1.897185559618263, "language_loss": 0.76273429, "learning_rate": 6.064149081155267e-09, "loss": 0.78393269, "num_input_tokens_seen": 350314980, "step": 16232, "time_per_iteration": 4.806816816329956 }, { "auxiliary_loss_clip": 0.01007054, "auxiliary_loss_mlp": 0.00999094, "balance_loss_clip": 1.0077014, "balance_loss_mlp": 0.99789584, "epoch": 0.9759807605591463, "flos": 68161182456960.0, "grad_norm": 0.7408233152349849, "language_loss": 0.53817546, "learning_rate": 6.033881472824465e-09, "loss": 0.55823696, "num_input_tokens_seen": 350371985, "step": 16233, "time_per_iteration": 3.143988847732544 }, { "auxiliary_loss_clip": 0.01108543, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.03642726, "balance_loss_mlp": 1.02313495, "epoch": 0.9760408838118142, "flos": 18989239939200.0, "grad_norm": 1.8846866025891749, "language_loss": 0.71843183, "learning_rate": 6.003689475888807e-09, "loss": 0.7398712, "num_input_tokens_seen": 350390590, "step": 16234, "time_per_iteration": 2.5556411743164062 }, { "auxiliary_loss_clip": 0.01098185, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 1.03558266, "balance_loss_mlp": 1.01847827, "epoch": 0.9761010070644822, "flos": 17125978763520.0, "grad_norm": 2.9772233129130825, "language_loss": 0.79668027, "learning_rate": 5.973573091493156e-09, "loss": 0.81797361, "num_input_tokens_seen": 350403770, "step": 16235, "time_per_iteration": 2.5155222415924072 }, { "auxiliary_loss_clip": 0.0109002, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.03545177, "balance_loss_mlp": 1.02070475, "epoch": 0.9761611303171501, "flos": 22052599441920.0, "grad_norm": 2.420578950017542, "language_loss": 0.7674818, "learning_rate": 5.943532320779265e-09, "loss": 0.7887249, "num_input_tokens_seen": 350421870, "step": 16236, "time_per_iteration": 2.641690731048584 }, { "auxiliary_loss_clip": 0.01096794, "auxiliary_loss_mlp": 0.01027707, "balance_loss_clip": 1.03507769, "balance_loss_mlp": 1.01571465, "epoch": 0.9762212535698181, "flos": 21757521214080.0, "grad_norm": 3.537228150180641, "language_loss": 0.75424302, "learning_rate": 5.913567164886446e-09, "loss": 0.77548802, "num_input_tokens_seen": 350440025, "step": 16237, "time_per_iteration": 2.5821526050567627 }, { "auxiliary_loss_clip": 0.01061626, "auxiliary_loss_mlp": 0.01037494, "balance_loss_clip": 1.03076112, "balance_loss_mlp": 1.02306354, "epoch": 0.9762813768224861, "flos": 25921615046400.0, "grad_norm": 1.5766064307721592, "language_loss": 0.72649348, "learning_rate": 5.8836776249509e-09, "loss": 0.74748468, "num_input_tokens_seen": 350459435, "step": 16238, "time_per_iteration": 2.716170072555542 }, { "auxiliary_loss_clip": 0.01090292, "auxiliary_loss_mlp": 0.00771217, "balance_loss_clip": 1.03843439, "balance_loss_mlp": 1.0002383, "epoch": 0.9763415000751541, "flos": 24051853509120.0, "grad_norm": 2.1577792438101646, "language_loss": 0.83911026, "learning_rate": 5.8538637021063875e-09, "loss": 0.85772538, "num_input_tokens_seen": 350472655, "step": 16239, "time_per_iteration": 2.7628393173217773 }, { "auxiliary_loss_clip": 0.01067831, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.03342855, "balance_loss_mlp": 1.02861595, "epoch": 0.976401623327822, "flos": 17018677860480.0, "grad_norm": 3.0421443760450266, "language_loss": 0.60336649, "learning_rate": 5.824125397483115e-09, "loss": 0.62447721, "num_input_tokens_seen": 350488160, "step": 16240, "time_per_iteration": 2.6417906284332275 }, { "auxiliary_loss_clip": 0.01069406, "auxiliary_loss_mlp": 0.01029004, "balance_loss_clip": 1.0350244, "balance_loss_mlp": 1.01704097, "epoch": 0.97646174658049, "flos": 16106941918080.0, "grad_norm": 1.952892588808636, "language_loss": 0.82362419, "learning_rate": 5.7944627122088474e-09, "loss": 0.84460825, "num_input_tokens_seen": 350506065, "step": 16241, "time_per_iteration": 2.6529223918914795 }, { "auxiliary_loss_clip": 0.01069965, "auxiliary_loss_mlp": 0.01037931, "balance_loss_clip": 1.03480554, "balance_loss_mlp": 1.02566481, "epoch": 0.9765218698331579, "flos": 21252725429760.0, "grad_norm": 1.9374011472646437, "language_loss": 0.83271652, "learning_rate": 5.764875647408463e-09, "loss": 0.85379553, "num_input_tokens_seen": 350524495, "step": 16242, "time_per_iteration": 2.7075135707855225 }, { "auxiliary_loss_clip": 0.01097999, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.03740764, "balance_loss_mlp": 1.01545691, "epoch": 0.9765819930858259, "flos": 18588045957120.0, "grad_norm": 1.5885372986539104, "language_loss": 0.75616562, "learning_rate": 5.7353642042037294e-09, "loss": 0.77742422, "num_input_tokens_seen": 350544185, "step": 16243, "time_per_iteration": 2.8476173877716064 }, { "auxiliary_loss_clip": 0.01096151, "auxiliary_loss_mlp": 0.0103737, "balance_loss_clip": 1.03450549, "balance_loss_mlp": 1.02472222, "epoch": 0.976642116338494, "flos": 20266833859200.0, "grad_norm": 1.629420195076715, "language_loss": 0.70183492, "learning_rate": 5.705928383713754e-09, "loss": 0.72317016, "num_input_tokens_seen": 350562675, "step": 16244, "time_per_iteration": 2.648705244064331 }, { "auxiliary_loss_clip": 0.01090661, "auxiliary_loss_mlp": 0.01030766, "balance_loss_clip": 1.03870106, "balance_loss_mlp": 1.01780796, "epoch": 0.9767022395911619, "flos": 25550477769600.0, "grad_norm": 1.816908720128117, "language_loss": 0.83598977, "learning_rate": 5.676568187055197e-09, "loss": 0.85720408, "num_input_tokens_seen": 350581535, "step": 16245, "time_per_iteration": 2.7069408893585205 }, { "auxiliary_loss_clip": 0.01056812, "auxiliary_loss_mlp": 0.01028217, "balance_loss_clip": 1.03245211, "balance_loss_mlp": 1.0164988, "epoch": 0.9767623628438299, "flos": 21762656858880.0, "grad_norm": 1.6507047411461764, "language_loss": 0.78559917, "learning_rate": 5.647283615340726e-09, "loss": 0.80644941, "num_input_tokens_seen": 350601615, "step": 16246, "time_per_iteration": 2.766493558883667 }, { "auxiliary_loss_clip": 0.01101377, "auxiliary_loss_mlp": 0.01033211, "balance_loss_clip": 1.03545284, "balance_loss_mlp": 1.02206457, "epoch": 0.9768224860964978, "flos": 15851114277120.0, "grad_norm": 1.4053965502785082, "language_loss": 0.74026012, "learning_rate": 5.6180746696812275e-09, "loss": 0.76160598, "num_input_tokens_seen": 350619580, "step": 16247, "time_per_iteration": 2.56381893157959 }, { "auxiliary_loss_clip": 0.01053333, "auxiliary_loss_mlp": 0.01033648, "balance_loss_clip": 1.03346825, "balance_loss_mlp": 1.02078581, "epoch": 0.9768826093491658, "flos": 25151151294720.0, "grad_norm": 4.399397738721356, "language_loss": 0.79704082, "learning_rate": 5.58894135118404e-09, "loss": 0.81791055, "num_input_tokens_seen": 350640015, "step": 16248, "time_per_iteration": 2.8011584281921387 }, { "auxiliary_loss_clip": 0.01049095, "auxiliary_loss_mlp": 0.01046497, "balance_loss_clip": 1.03563344, "balance_loss_mlp": 1.03080893, "epoch": 0.9769427326018337, "flos": 22967028904320.0, "grad_norm": 1.8090514517823602, "language_loss": 0.79385042, "learning_rate": 5.559883660954278e-09, "loss": 0.81480634, "num_input_tokens_seen": 350659155, "step": 16249, "time_per_iteration": 2.7398455142974854 }, { "auxiliary_loss_clip": 0.01092723, "auxiliary_loss_mlp": 0.01035301, "balance_loss_clip": 1.03559923, "balance_loss_mlp": 1.02318323, "epoch": 0.9770028558545018, "flos": 15264297786240.0, "grad_norm": 1.9233029914398667, "language_loss": 0.66280472, "learning_rate": 5.530901600093507e-09, "loss": 0.68408501, "num_input_tokens_seen": 350676615, "step": 16250, "time_per_iteration": 2.556757688522339 }, { "auxiliary_loss_clip": 0.01027067, "auxiliary_loss_mlp": 0.01001957, "balance_loss_clip": 1.00477159, "balance_loss_mlp": 1.0009917, "epoch": 0.9770629791071697, "flos": 71450348808960.0, "grad_norm": 0.7726336256949028, "language_loss": 0.59797513, "learning_rate": 5.501995169700846e-09, "loss": 0.61826539, "num_input_tokens_seen": 350736805, "step": 16251, "time_per_iteration": 3.1876869201660156 }, { "auxiliary_loss_clip": 0.01093869, "auxiliary_loss_mlp": 0.01031057, "balance_loss_clip": 1.03425992, "balance_loss_mlp": 1.01817012, "epoch": 0.9771231023598377, "flos": 22412854897920.0, "grad_norm": 1.7259349246741458, "language_loss": 0.78470027, "learning_rate": 5.473164370872307e-09, "loss": 0.80594945, "num_input_tokens_seen": 350753600, "step": 16252, "time_per_iteration": 2.606030225753784 }, { "auxiliary_loss_clip": 0.01090281, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.0339278, "balance_loss_mlp": 1.02142549, "epoch": 0.9771832256125056, "flos": 19025940660480.0, "grad_norm": 2.220084536547545, "language_loss": 0.64542538, "learning_rate": 5.444409204701461e-09, "loss": 0.66667211, "num_input_tokens_seen": 350771225, "step": 16253, "time_per_iteration": 2.5694305896759033 }, { "auxiliary_loss_clip": 0.01101639, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.03819561, "balance_loss_mlp": 1.01936197, "epoch": 0.9772433488651736, "flos": 17822143232640.0, "grad_norm": 2.129791137286544, "language_loss": 0.7626065, "learning_rate": 5.415729672278324e-09, "loss": 0.7839582, "num_input_tokens_seen": 350789100, "step": 16254, "time_per_iteration": 2.6212127208709717 }, { "auxiliary_loss_clip": 0.0110148, "auxiliary_loss_mlp": 0.01031853, "balance_loss_clip": 1.03698289, "balance_loss_mlp": 1.0193603, "epoch": 0.9773034721178415, "flos": 37629785623680.0, "grad_norm": 1.8907694352431208, "language_loss": 0.63917691, "learning_rate": 5.387125774690471e-09, "loss": 0.66051024, "num_input_tokens_seen": 350811085, "step": 16255, "time_per_iteration": 2.7545289993286133 }, { "auxiliary_loss_clip": 0.01080709, "auxiliary_loss_mlp": 0.00771506, "balance_loss_clip": 1.03611016, "balance_loss_mlp": 1.0002296, "epoch": 0.9773635953705095, "flos": 20302457172480.0, "grad_norm": 1.5349410458335684, "language_loss": 0.75715804, "learning_rate": 5.358597513023033e-09, "loss": 0.77568018, "num_input_tokens_seen": 350831065, "step": 16256, "time_per_iteration": 2.718520164489746 }, { "auxiliary_loss_clip": 0.01107482, "auxiliary_loss_mlp": 0.010355, "balance_loss_clip": 1.0382638, "balance_loss_mlp": 1.02249467, "epoch": 0.9774237186231776, "flos": 22309253095680.0, "grad_norm": 4.923302241947984, "language_loss": 0.77929807, "learning_rate": 5.330144888357369e-09, "loss": 0.80072796, "num_input_tokens_seen": 350849675, "step": 16257, "time_per_iteration": 2.578667163848877 }, { "auxiliary_loss_clip": 0.01092876, "auxiliary_loss_mlp": 0.01032433, "balance_loss_clip": 1.03653014, "balance_loss_mlp": 1.01965332, "epoch": 0.9774838418758455, "flos": 24204905360640.0, "grad_norm": 1.5736578002879344, "language_loss": 0.75143224, "learning_rate": 5.301767901772391e-09, "loss": 0.77268535, "num_input_tokens_seen": 350868955, "step": 16258, "time_per_iteration": 2.679143190383911 }, { "auxiliary_loss_clip": 0.01019519, "auxiliary_loss_mlp": 0.01001235, "balance_loss_clip": 1.00671029, "balance_loss_mlp": 1.00025165, "epoch": 0.9775439651285135, "flos": 66357139829760.0, "grad_norm": 0.6768337597392673, "language_loss": 0.59785736, "learning_rate": 5.273466554344353e-09, "loss": 0.61806488, "num_input_tokens_seen": 350935110, "step": 16259, "time_per_iteration": 3.1992921829223633 }, { "auxiliary_loss_clip": 0.01093161, "auxiliary_loss_mlp": 0.0103161, "balance_loss_clip": 1.03717732, "balance_loss_mlp": 1.01865828, "epoch": 0.9776040883811814, "flos": 22601565976320.0, "grad_norm": 1.55851171401808, "language_loss": 0.73553669, "learning_rate": 5.2452408471461705e-09, "loss": 0.75678444, "num_input_tokens_seen": 350953220, "step": 16260, "time_per_iteration": 2.639127731323242 }, { "auxiliary_loss_clip": 0.01098909, "auxiliary_loss_mlp": 0.01032118, "balance_loss_clip": 1.03642654, "balance_loss_mlp": 1.01898706, "epoch": 0.9776642116338494, "flos": 18442176825600.0, "grad_norm": 1.9328284113468908, "language_loss": 0.79923123, "learning_rate": 5.2170907812485456e-09, "loss": 0.8205415, "num_input_tokens_seen": 350971915, "step": 16261, "time_per_iteration": 2.5895767211914062 }, { "auxiliary_loss_clip": 0.01099762, "auxiliary_loss_mlp": 0.01026055, "balance_loss_clip": 1.03615069, "balance_loss_mlp": 1.01340127, "epoch": 0.9777243348865173, "flos": 22638446265600.0, "grad_norm": 2.216385126324637, "language_loss": 0.74283129, "learning_rate": 5.189016357718845e-09, "loss": 0.76408947, "num_input_tokens_seen": 350990470, "step": 16262, "time_per_iteration": 2.5935211181640625 }, { "auxiliary_loss_clip": 0.01098991, "auxiliary_loss_mlp": 0.01033569, "balance_loss_clip": 1.03628135, "balance_loss_mlp": 1.01945508, "epoch": 0.9777844581391854, "flos": 31321394605440.0, "grad_norm": 2.37269608442218, "language_loss": 0.70012951, "learning_rate": 5.16101757762133e-09, "loss": 0.7214551, "num_input_tokens_seen": 351010755, "step": 16263, "time_per_iteration": 2.8126862049102783 }, { "auxiliary_loss_clip": 0.01098892, "auxiliary_loss_mlp": 0.01029577, "balance_loss_clip": 1.03766048, "balance_loss_mlp": 1.01819265, "epoch": 0.9778445813918533, "flos": 23039101543680.0, "grad_norm": 2.465472735999823, "language_loss": 0.66363978, "learning_rate": 5.133094442018038e-09, "loss": 0.68492448, "num_input_tokens_seen": 351029965, "step": 16264, "time_per_iteration": 2.6721160411834717 }, { "auxiliary_loss_clip": 0.01063654, "auxiliary_loss_mlp": 0.01031661, "balance_loss_clip": 1.03675711, "balance_loss_mlp": 1.01770782, "epoch": 0.9779047046445213, "flos": 17566351505280.0, "grad_norm": 1.8950171968116294, "language_loss": 0.73092592, "learning_rate": 5.105246951967679e-09, "loss": 0.7518791, "num_input_tokens_seen": 351046205, "step": 16265, "time_per_iteration": 2.7303049564361572 }, { "auxiliary_loss_clip": 0.01095694, "auxiliary_loss_mlp": 0.01031211, "balance_loss_clip": 1.03564298, "balance_loss_mlp": 1.01908779, "epoch": 0.9779648278971892, "flos": 20741141975040.0, "grad_norm": 1.771422811009788, "language_loss": 0.68976378, "learning_rate": 5.077475108526297e-09, "loss": 0.71103287, "num_input_tokens_seen": 351065390, "step": 16266, "time_per_iteration": 4.168168306350708 }, { "auxiliary_loss_clip": 0.01058776, "auxiliary_loss_mlp": 0.01027743, "balance_loss_clip": 1.03172088, "balance_loss_mlp": 1.01640654, "epoch": 0.9780249511498572, "flos": 21026954494080.0, "grad_norm": 1.5799613960571957, "language_loss": 0.86905551, "learning_rate": 5.049778912747049e-09, "loss": 0.88992071, "num_input_tokens_seen": 351084355, "step": 16267, "time_per_iteration": 4.231276512145996 }, { "auxiliary_loss_clip": 0.01043069, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.03164184, "balance_loss_mlp": 1.01611769, "epoch": 0.9780850744025251, "flos": 30774223751040.0, "grad_norm": 1.9539809119147387, "language_loss": 0.70374393, "learning_rate": 5.022158365679985e-09, "loss": 0.72446853, "num_input_tokens_seen": 351105870, "step": 16268, "time_per_iteration": 4.722951412200928 }, { "auxiliary_loss_clip": 0.01087833, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.0350287, "balance_loss_mlp": 1.01256526, "epoch": 0.9781451976551931, "flos": 20302995876480.0, "grad_norm": 1.5865109872612446, "language_loss": 0.7393145, "learning_rate": 4.994613468372711e-09, "loss": 0.76043838, "num_input_tokens_seen": 351124760, "step": 16269, "time_per_iteration": 3.1291208267211914 }, { "auxiliary_loss_clip": 0.01085029, "auxiliary_loss_mlp": 0.01034789, "balance_loss_clip": 1.03650665, "balance_loss_mlp": 1.02071679, "epoch": 0.9782053209078612, "flos": 24316479982080.0, "grad_norm": 2.0431692702838613, "language_loss": 0.70405006, "learning_rate": 4.967144221869501e-09, "loss": 0.72524822, "num_input_tokens_seen": 351142820, "step": 16270, "time_per_iteration": 2.6683366298675537 }, { "auxiliary_loss_clip": 0.01110841, "auxiliary_loss_mlp": 0.01034856, "balance_loss_clip": 1.03801334, "balance_loss_mlp": 1.02240467, "epoch": 0.9782654441605291, "flos": 32489425065600.0, "grad_norm": 1.7714650926331987, "language_loss": 0.63896102, "learning_rate": 4.939750627212191e-09, "loss": 0.66041803, "num_input_tokens_seen": 351164805, "step": 16271, "time_per_iteration": 4.501726388931274 }, { "auxiliary_loss_clip": 0.01082074, "auxiliary_loss_mlp": 0.01033993, "balance_loss_clip": 1.03665876, "balance_loss_mlp": 1.02143383, "epoch": 0.9783255674131971, "flos": 26979076465920.0, "grad_norm": 1.43784561644357, "language_loss": 0.70358956, "learning_rate": 4.912432685439505e-09, "loss": 0.72475022, "num_input_tokens_seen": 351187005, "step": 16272, "time_per_iteration": 2.727437734603882 }, { "auxiliary_loss_clip": 0.0105355, "auxiliary_loss_mlp": 0.01034254, "balance_loss_clip": 1.03778529, "balance_loss_mlp": 1.02120066, "epoch": 0.978385690665865, "flos": 23112251591040.0, "grad_norm": 1.7381276500973775, "language_loss": 0.66595173, "learning_rate": 4.88519039758728e-09, "loss": 0.68682981, "num_input_tokens_seen": 351208450, "step": 16273, "time_per_iteration": 2.929959774017334 }, { "auxiliary_loss_clip": 0.01075306, "auxiliary_loss_mlp": 0.01023021, "balance_loss_clip": 1.03366828, "balance_loss_mlp": 1.01021206, "epoch": 0.978445813918533, "flos": 25409672455680.0, "grad_norm": 1.7200499959996831, "language_loss": 0.7406745, "learning_rate": 4.85802376468869e-09, "loss": 0.76165771, "num_input_tokens_seen": 351229585, "step": 16274, "time_per_iteration": 2.6932880878448486 }, { "auxiliary_loss_clip": 0.01084441, "auxiliary_loss_mlp": 0.01029809, "balance_loss_clip": 1.03532362, "balance_loss_mlp": 1.01775074, "epoch": 0.9785059371712009, "flos": 23550218121600.0, "grad_norm": 1.6821946772712648, "language_loss": 0.77833498, "learning_rate": 4.830932787773579e-09, "loss": 0.79947746, "num_input_tokens_seen": 351249525, "step": 16275, "time_per_iteration": 2.6410744190216064 }, { "auxiliary_loss_clip": 0.01037951, "auxiliary_loss_mlp": 0.0103008, "balance_loss_clip": 1.03559256, "balance_loss_mlp": 1.01765788, "epoch": 0.978566060423869, "flos": 34351177870080.0, "grad_norm": 2.314426015292287, "language_loss": 0.71095657, "learning_rate": 4.803917467869567e-09, "loss": 0.73163688, "num_input_tokens_seen": 351272530, "step": 16276, "time_per_iteration": 2.91654109954834 }, { "auxiliary_loss_clip": 0.01077494, "auxiliary_loss_mlp": 0.01032563, "balance_loss_clip": 1.033566, "balance_loss_mlp": 1.02052927, "epoch": 0.9786261836765369, "flos": 11618862387840.0, "grad_norm": 2.1039915765233674, "language_loss": 0.85744458, "learning_rate": 4.776977806000726e-09, "loss": 0.87854517, "num_input_tokens_seen": 351288530, "step": 16277, "time_per_iteration": 2.6657748222351074 }, { "auxiliary_loss_clip": 0.01090092, "auxiliary_loss_mlp": 0.01031629, "balance_loss_clip": 1.03429365, "balance_loss_mlp": 1.01809239, "epoch": 0.9786863069292049, "flos": 17420949250560.0, "grad_norm": 1.719720636117993, "language_loss": 0.70917892, "learning_rate": 4.7501138031891264e-09, "loss": 0.73039615, "num_input_tokens_seen": 351305890, "step": 16278, "time_per_iteration": 2.5898592472076416 }, { "auxiliary_loss_clip": 0.01087893, "auxiliary_loss_mlp": 0.01034892, "balance_loss_clip": 1.03455925, "balance_loss_mlp": 1.02192235, "epoch": 0.9787464301818728, "flos": 20844923345280.0, "grad_norm": 1.8454549463354188, "language_loss": 0.84530413, "learning_rate": 4.723325460453065e-09, "loss": 0.86653197, "num_input_tokens_seen": 351325010, "step": 16279, "time_per_iteration": 2.6659061908721924 }, { "auxiliary_loss_clip": 0.01096633, "auxiliary_loss_mlp": 0.01032409, "balance_loss_clip": 1.0337534, "balance_loss_mlp": 1.01924253, "epoch": 0.9788065534345408, "flos": 18222942165120.0, "grad_norm": 2.8453433920494753, "language_loss": 0.79117471, "learning_rate": 4.696612778808395e-09, "loss": 0.81246513, "num_input_tokens_seen": 351343060, "step": 16280, "time_per_iteration": 2.636876106262207 }, { "auxiliary_loss_clip": 0.01064547, "auxiliary_loss_mlp": 0.01034092, "balance_loss_clip": 1.03490829, "balance_loss_mlp": 1.02280319, "epoch": 0.9788666766872087, "flos": 21578219498880.0, "grad_norm": 2.843907217465553, "language_loss": 0.79550928, "learning_rate": 4.669975759268085e-09, "loss": 0.81649566, "num_input_tokens_seen": 351363260, "step": 16281, "time_per_iteration": 2.710759162902832 }, { "auxiliary_loss_clip": 0.01096946, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.03685427, "balance_loss_mlp": 1.01961863, "epoch": 0.9789267999398767, "flos": 24900495212160.0, "grad_norm": 1.5976604846892302, "language_loss": 0.80062044, "learning_rate": 4.643414402842216e-09, "loss": 0.82191795, "num_input_tokens_seen": 351382610, "step": 16282, "time_per_iteration": 2.6593406200408936 }, { "auxiliary_loss_clip": 0.0108946, "auxiliary_loss_mlp": 0.01043417, "balance_loss_clip": 1.03729296, "balance_loss_mlp": 1.03109717, "epoch": 0.9789869231925448, "flos": 19573111514880.0, "grad_norm": 1.9818178483973194, "language_loss": 0.82860035, "learning_rate": 4.616928710538204e-09, "loss": 0.84992909, "num_input_tokens_seen": 351401075, "step": 16283, "time_per_iteration": 2.696199655532837 }, { "auxiliary_loss_clip": 0.01092588, "auxiliary_loss_mlp": 0.01034978, "balance_loss_clip": 1.0365355, "balance_loss_mlp": 1.02216268, "epoch": 0.9790470464452127, "flos": 16796641939200.0, "grad_norm": 1.8242453499893954, "language_loss": 0.71959805, "learning_rate": 4.590518683360134e-09, "loss": 0.74087369, "num_input_tokens_seen": 351419275, "step": 16284, "time_per_iteration": 2.651407241821289 }, { "auxiliary_loss_clip": 0.01094663, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.03582513, "balance_loss_mlp": 1.02136493, "epoch": 0.9791071696978807, "flos": 18369350000640.0, "grad_norm": 1.8718782161226852, "language_loss": 0.64339489, "learning_rate": 4.56418432230965e-09, "loss": 0.66467333, "num_input_tokens_seen": 351437375, "step": 16285, "time_per_iteration": 2.651705026626587 }, { "auxiliary_loss_clip": 0.01084456, "auxiliary_loss_mlp": 0.01031691, "balance_loss_clip": 1.0361805, "balance_loss_mlp": 1.01931071, "epoch": 0.9791672929505486, "flos": 24170323541760.0, "grad_norm": 1.7019695394425336, "language_loss": 0.70606256, "learning_rate": 4.537925628385286e-09, "loss": 0.72722405, "num_input_tokens_seen": 351457810, "step": 16286, "time_per_iteration": 2.6652472019195557 }, { "auxiliary_loss_clip": 0.01091075, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.03533125, "balance_loss_mlp": 1.02069998, "epoch": 0.9792274162032166, "flos": 24354114456960.0, "grad_norm": 2.0320936196191526, "language_loss": 0.58058381, "learning_rate": 4.511742602582691e-09, "loss": 0.60182172, "num_input_tokens_seen": 351478825, "step": 16287, "time_per_iteration": 2.617825746536255 }, { "auxiliary_loss_clip": 0.01096267, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.0363822, "balance_loss_mlp": 1.02341986, "epoch": 0.9792875394558845, "flos": 26395779507840.0, "grad_norm": 1.7210061810048285, "language_loss": 0.81500298, "learning_rate": 4.485635245894626e-09, "loss": 0.83632499, "num_input_tokens_seen": 351498785, "step": 16288, "time_per_iteration": 2.657498359680176 }, { "auxiliary_loss_clip": 0.01082554, "auxiliary_loss_mlp": 0.00771248, "balance_loss_clip": 1.03415895, "balance_loss_mlp": 1.00014818, "epoch": 0.9793476627085526, "flos": 28148004766080.0, "grad_norm": 1.396084239179073, "language_loss": 0.71853596, "learning_rate": 4.459603559311631e-09, "loss": 0.73707396, "num_input_tokens_seen": 351520235, "step": 16289, "time_per_iteration": 2.8403937816619873 }, { "auxiliary_loss_clip": 0.01073083, "auxiliary_loss_mlp": 0.01036624, "balance_loss_clip": 1.03831482, "balance_loss_mlp": 1.02417815, "epoch": 0.9794077859612205, "flos": 16763927627520.0, "grad_norm": 2.8328336335773523, "language_loss": 0.75429696, "learning_rate": 4.43364754382003e-09, "loss": 0.77539402, "num_input_tokens_seen": 351538900, "step": 16290, "time_per_iteration": 2.6202123165130615 }, { "auxiliary_loss_clip": 0.01099176, "auxiliary_loss_mlp": 0.01032346, "balance_loss_clip": 1.03652453, "balance_loss_mlp": 1.01942921, "epoch": 0.9794679092138885, "flos": 19280834547840.0, "grad_norm": 1.5711405452036733, "language_loss": 0.6725769, "learning_rate": 4.4077672004048105e-09, "loss": 0.69389218, "num_input_tokens_seen": 351558715, "step": 16291, "time_per_iteration": 2.5787715911865234 }, { "auxiliary_loss_clip": 0.01111756, "auxiliary_loss_mlp": 0.00770961, "balance_loss_clip": 1.03711116, "balance_loss_mlp": 1.00023437, "epoch": 0.9795280324665564, "flos": 32156640535680.0, "grad_norm": 1.7868335154862072, "language_loss": 0.63048244, "learning_rate": 4.3819625300467456e-09, "loss": 0.64930964, "num_input_tokens_seen": 351578450, "step": 16292, "time_per_iteration": 2.6509621143341064 }, { "auxiliary_loss_clip": 0.01072425, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 1.03524005, "balance_loss_mlp": 1.01897478, "epoch": 0.9795881557192244, "flos": 19060953442560.0, "grad_norm": 1.8984825692794804, "language_loss": 0.73462898, "learning_rate": 4.356233533724829e-09, "loss": 0.75566101, "num_input_tokens_seen": 351597195, "step": 16293, "time_per_iteration": 2.64638614654541 }, { "auxiliary_loss_clip": 0.01100837, "auxiliary_loss_mlp": 0.01030263, "balance_loss_clip": 1.03659904, "balance_loss_mlp": 1.01774597, "epoch": 0.9796482789718923, "flos": 28329928174080.0, "grad_norm": 1.6810560431936798, "language_loss": 0.84062809, "learning_rate": 4.330580212414503e-09, "loss": 0.86193907, "num_input_tokens_seen": 351617460, "step": 16294, "time_per_iteration": 2.614396095275879 }, { "auxiliary_loss_clip": 0.01071284, "auxiliary_loss_mlp": 0.01032864, "balance_loss_clip": 1.03327644, "balance_loss_mlp": 1.02134895, "epoch": 0.9797084022245603, "flos": 17967976450560.0, "grad_norm": 2.3230891850787168, "language_loss": 0.71972656, "learning_rate": 4.305002567088767e-09, "loss": 0.74076802, "num_input_tokens_seen": 351635900, "step": 16295, "time_per_iteration": 2.6593565940856934 }, { "auxiliary_loss_clip": 0.01103524, "auxiliary_loss_mlp": 0.01035987, "balance_loss_clip": 1.0375762, "balance_loss_mlp": 1.02305257, "epoch": 0.9797685254772284, "flos": 20266726118400.0, "grad_norm": 1.590993337993389, "language_loss": 0.80806482, "learning_rate": 4.2795005987170674e-09, "loss": 0.82946002, "num_input_tokens_seen": 351655400, "step": 16296, "time_per_iteration": 2.571876287460327 }, { "auxiliary_loss_clip": 0.01079454, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.03264189, "balance_loss_mlp": 1.02170789, "epoch": 0.9798286487298963, "flos": 26907147480960.0, "grad_norm": 2.4272229698670986, "language_loss": 0.75518107, "learning_rate": 4.254074308266853e-09, "loss": 0.77631521, "num_input_tokens_seen": 351675505, "step": 16297, "time_per_iteration": 2.737135410308838 }, { "auxiliary_loss_clip": 0.01097573, "auxiliary_loss_mlp": 0.01036214, "balance_loss_clip": 1.03408372, "balance_loss_mlp": 1.02367926, "epoch": 0.9798887719825643, "flos": 27161071701120.0, "grad_norm": 2.8449878357962, "language_loss": 0.78244084, "learning_rate": 4.228723696702019e-09, "loss": 0.80377865, "num_input_tokens_seen": 351697920, "step": 16298, "time_per_iteration": 2.662205457687378 }, { "auxiliary_loss_clip": 0.01092637, "auxiliary_loss_mlp": 0.01026617, "balance_loss_clip": 1.03448844, "balance_loss_mlp": 1.01479197, "epoch": 0.9799488952352322, "flos": 20668422890880.0, "grad_norm": 1.5107423407180305, "language_loss": 0.72837794, "learning_rate": 4.203448764984019e-09, "loss": 0.74957049, "num_input_tokens_seen": 351717615, "step": 16299, "time_per_iteration": 2.6172263622283936 }, { "auxiliary_loss_clip": 0.01084816, "auxiliary_loss_mlp": 0.01028705, "balance_loss_clip": 1.0337168, "balance_loss_mlp": 1.01527619, "epoch": 0.9800090184879002, "flos": 21981209160960.0, "grad_norm": 2.196732565554413, "language_loss": 0.89433563, "learning_rate": 4.178249514071419e-09, "loss": 0.91547084, "num_input_tokens_seen": 351735260, "step": 16300, "time_per_iteration": 2.665531873703003 }, { "auxiliary_loss_clip": 0.01099488, "auxiliary_loss_mlp": 0.01029393, "balance_loss_clip": 1.03554893, "balance_loss_mlp": 1.01669717, "epoch": 0.9800691417405681, "flos": 21288420570240.0, "grad_norm": 3.318186041205299, "language_loss": 0.7811656, "learning_rate": 4.1531259449194555e-09, "loss": 0.80245435, "num_input_tokens_seen": 351755800, "step": 16301, "time_per_iteration": 2.6590991020202637 }, { "auxiliary_loss_clip": 0.01085984, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.03470898, "balance_loss_mlp": 1.022452, "epoch": 0.9801292649932362, "flos": 18439878355200.0, "grad_norm": 2.4753221911438525, "language_loss": 0.75696325, "learning_rate": 4.128078058480921e-09, "loss": 0.77817523, "num_input_tokens_seen": 351774790, "step": 16302, "time_per_iteration": 2.5974133014678955 }, { "auxiliary_loss_clip": 0.01080371, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.03640592, "balance_loss_mlp": 1.01979423, "epoch": 0.9801893882459041, "flos": 25046364343680.0, "grad_norm": 1.7046850739781914, "language_loss": 0.79628474, "learning_rate": 4.103105855705724e-09, "loss": 0.8174144, "num_input_tokens_seen": 351792855, "step": 16303, "time_per_iteration": 2.6679980754852295 }, { "auxiliary_loss_clip": 0.01066992, "auxiliary_loss_mlp": 0.0103858, "balance_loss_clip": 1.03263092, "balance_loss_mlp": 1.02466226, "epoch": 0.9802495114985721, "flos": 18511484117760.0, "grad_norm": 2.0636991947077696, "language_loss": 0.83625126, "learning_rate": 4.078209337540883e-09, "loss": 0.85730696, "num_input_tokens_seen": 351811450, "step": 16304, "time_per_iteration": 2.6905360221862793 }, { "auxiliary_loss_clip": 0.01070996, "auxiliary_loss_mlp": 0.01026297, "balance_loss_clip": 1.03549314, "balance_loss_mlp": 1.01519823, "epoch": 0.98030963475124, "flos": 21469841187840.0, "grad_norm": 1.8378321137403202, "language_loss": 0.70343494, "learning_rate": 4.053388504930089e-09, "loss": 0.72440791, "num_input_tokens_seen": 351831960, "step": 16305, "time_per_iteration": 2.745544910430908 }, { "auxiliary_loss_clip": 0.0107968, "auxiliary_loss_mlp": 0.01040728, "balance_loss_clip": 1.03601217, "balance_loss_mlp": 1.0259459, "epoch": 0.980369758003908, "flos": 20412272027520.0, "grad_norm": 2.7789217747629182, "language_loss": 0.71784663, "learning_rate": 4.028643358815032e-09, "loss": 0.73905075, "num_input_tokens_seen": 351851585, "step": 16306, "time_per_iteration": 4.391748905181885 }, { "auxiliary_loss_clip": 0.01080084, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.03247881, "balance_loss_mlp": 1.02032626, "epoch": 0.9804298812565759, "flos": 23399177431680.0, "grad_norm": 1.5354293339216485, "language_loss": 0.73557943, "learning_rate": 4.00397390013385e-09, "loss": 0.75670117, "num_input_tokens_seen": 351871085, "step": 16307, "time_per_iteration": 4.338375091552734 }, { "auxiliary_loss_clip": 0.01076228, "auxiliary_loss_mlp": 0.01030733, "balance_loss_clip": 1.03920865, "balance_loss_mlp": 1.01993847, "epoch": 0.980490004509244, "flos": 23292666627840.0, "grad_norm": 1.5555541089180664, "language_loss": 0.74765921, "learning_rate": 3.979380129822018e-09, "loss": 0.76872879, "num_input_tokens_seen": 351891775, "step": 16308, "time_per_iteration": 2.79581618309021 }, { "auxiliary_loss_clip": 0.01007996, "auxiliary_loss_mlp": 0.0100217, "balance_loss_clip": 1.0048188, "balance_loss_mlp": 1.00120437, "epoch": 0.980550127761912, "flos": 56051027798400.0, "grad_norm": 0.7557884098405707, "language_loss": 0.57835835, "learning_rate": 3.954862048811902e-09, "loss": 0.59845996, "num_input_tokens_seen": 351946770, "step": 16309, "time_per_iteration": 3.0556421279907227 }, { "auxiliary_loss_clip": 0.01065215, "auxiliary_loss_mlp": 0.0103267, "balance_loss_clip": 1.03367877, "balance_loss_mlp": 1.02015948, "epoch": 0.9806102510145799, "flos": 25333290184320.0, "grad_norm": 1.8451853001469216, "language_loss": 0.66008574, "learning_rate": 3.930419658033646e-09, "loss": 0.68106461, "num_input_tokens_seen": 351966155, "step": 16310, "time_per_iteration": 2.729114055633545 }, { "auxiliary_loss_clip": 0.01008303, "auxiliary_loss_mlp": 0.01000216, "balance_loss_clip": 1.00770998, "balance_loss_mlp": 0.99920315, "epoch": 0.9806703742672479, "flos": 67274837429760.0, "grad_norm": 1.0882913527970195, "language_loss": 0.54503131, "learning_rate": 3.906052958413841e-09, "loss": 0.56511647, "num_input_tokens_seen": 352031655, "step": 16311, "time_per_iteration": 4.943628311157227 }, { "auxiliary_loss_clip": 0.01095664, "auxiliary_loss_mlp": 0.01027322, "balance_loss_clip": 1.0345304, "balance_loss_mlp": 1.01559234, "epoch": 0.9807304975199158, "flos": 25228970110080.0, "grad_norm": 2.5422868238543836, "language_loss": 0.79856956, "learning_rate": 3.881761950876638e-09, "loss": 0.81979948, "num_input_tokens_seen": 352051920, "step": 16312, "time_per_iteration": 2.635751247406006 }, { "auxiliary_loss_clip": 0.0108546, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.03607917, "balance_loss_mlp": 1.01784945, "epoch": 0.9807906207725838, "flos": 17456392995840.0, "grad_norm": 1.855062658283189, "language_loss": 0.6311661, "learning_rate": 3.8575466363430785e-09, "loss": 0.65231735, "num_input_tokens_seen": 352069315, "step": 16313, "time_per_iteration": 2.71441650390625 }, { "auxiliary_loss_clip": 0.01098236, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.03771138, "balance_loss_mlp": 1.01765394, "epoch": 0.9808507440252517, "flos": 21032413361280.0, "grad_norm": 2.0090087344647496, "language_loss": 0.72602594, "learning_rate": 3.833407015731316e-09, "loss": 0.74731159, "num_input_tokens_seen": 352089480, "step": 16314, "time_per_iteration": 2.789362907409668 }, { "auxiliary_loss_clip": 0.01003668, "auxiliary_loss_mlp": 0.01002, "balance_loss_clip": 1.01054919, "balance_loss_mlp": 1.00098097, "epoch": 0.9809108672779198, "flos": 64044491598720.0, "grad_norm": 0.6894102027306396, "language_loss": 0.51673484, "learning_rate": 3.80934308995684e-09, "loss": 0.53679156, "num_input_tokens_seen": 352150000, "step": 16315, "time_per_iteration": 3.215070962905884 }, { "auxiliary_loss_clip": 0.01097221, "auxiliary_loss_mlp": 0.01032423, "balance_loss_clip": 1.03522766, "balance_loss_mlp": 1.02035928, "epoch": 0.9809709905305877, "flos": 22780616296320.0, "grad_norm": 1.4165501611522262, "language_loss": 0.69878519, "learning_rate": 3.785354859932033e-09, "loss": 0.72008169, "num_input_tokens_seen": 352170990, "step": 16316, "time_per_iteration": 2.677259683609009 }, { "auxiliary_loss_clip": 0.0111046, "auxiliary_loss_mlp": 0.01032289, "balance_loss_clip": 1.03727913, "balance_loss_mlp": 1.02019501, "epoch": 0.9810311137832557, "flos": 37013415217920.0, "grad_norm": 2.664112062764968, "language_loss": 0.55067998, "learning_rate": 3.76144232656661e-09, "loss": 0.57210749, "num_input_tokens_seen": 352195335, "step": 16317, "time_per_iteration": 2.7027530670166016 }, { "auxiliary_loss_clip": 0.01052915, "auxiliary_loss_mlp": 0.01035996, "balance_loss_clip": 1.02941895, "balance_loss_mlp": 1.02321708, "epoch": 0.9810912370359236, "flos": 18916305373440.0, "grad_norm": 1.7269155229815298, "language_loss": 0.73437709, "learning_rate": 3.737605490767404e-09, "loss": 0.75526619, "num_input_tokens_seen": 352214170, "step": 16318, "time_per_iteration": 2.7383875846862793 }, { "auxiliary_loss_clip": 0.01082811, "auxiliary_loss_mlp": 0.01027164, "balance_loss_clip": 1.0344367, "balance_loss_mlp": 1.01589835, "epoch": 0.9811513602885916, "flos": 18441602208000.0, "grad_norm": 2.1831479646107597, "language_loss": 0.82135093, "learning_rate": 3.7138443534383555e-09, "loss": 0.84245068, "num_input_tokens_seen": 352231470, "step": 16319, "time_per_iteration": 2.6357314586639404 }, { "auxiliary_loss_clip": 0.01018205, "auxiliary_loss_mlp": 0.01008734, "balance_loss_clip": 1.00481987, "balance_loss_mlp": 1.00751829, "epoch": 0.9812114835412595, "flos": 68058945371520.0, "grad_norm": 0.723170548219491, "language_loss": 0.5353533, "learning_rate": 3.6901589154803014e-09, "loss": 0.55562276, "num_input_tokens_seen": 352291770, "step": 16320, "time_per_iteration": 3.0364413261413574 }, { "auxiliary_loss_clip": 0.01057502, "auxiliary_loss_mlp": 0.01036848, "balance_loss_clip": 1.03194261, "balance_loss_mlp": 1.02422416, "epoch": 0.9812716067939276, "flos": 25373007648000.0, "grad_norm": 2.190666128056564, "language_loss": 0.73492098, "learning_rate": 3.6665491777914116e-09, "loss": 0.7558645, "num_input_tokens_seen": 352310735, "step": 16321, "time_per_iteration": 2.7965734004974365 }, { "auxiliary_loss_clip": 0.0108786, "auxiliary_loss_mlp": 0.0103188, "balance_loss_clip": 1.03798234, "balance_loss_mlp": 1.01972055, "epoch": 0.9813317300465956, "flos": 22856818999680.0, "grad_norm": 1.5299966395206919, "language_loss": 0.78483856, "learning_rate": 3.6430151412669698e-09, "loss": 0.806036, "num_input_tokens_seen": 352329545, "step": 16322, "time_per_iteration": 2.762363910675049 }, { "auxiliary_loss_clip": 0.0109714, "auxiliary_loss_mlp": 0.01034816, "balance_loss_clip": 1.03686166, "balance_loss_mlp": 1.02228689, "epoch": 0.9813918532992635, "flos": 23586954756480.0, "grad_norm": 1.7335029741380326, "language_loss": 0.81064153, "learning_rate": 3.619556806799595e-09, "loss": 0.8319611, "num_input_tokens_seen": 352352080, "step": 16323, "time_per_iteration": 2.674591541290283 }, { "auxiliary_loss_clip": 0.01110489, "auxiliary_loss_mlp": 0.01030491, "balance_loss_clip": 1.03752804, "balance_loss_mlp": 1.01852298, "epoch": 0.9814519765519315, "flos": 19606328616960.0, "grad_norm": 2.350364849870627, "language_loss": 0.84632325, "learning_rate": 3.596174175278799e-09, "loss": 0.86773306, "num_input_tokens_seen": 352366455, "step": 16324, "time_per_iteration": 2.5407159328460693 }, { "auxiliary_loss_clip": 0.01086747, "auxiliary_loss_mlp": 0.01033784, "balance_loss_clip": 1.03741539, "balance_loss_mlp": 1.02086782, "epoch": 0.9815120998045994, "flos": 33946284787200.0, "grad_norm": 1.4316902818633324, "language_loss": 0.74605346, "learning_rate": 3.5728672475909827e-09, "loss": 0.76725876, "num_input_tokens_seen": 352386090, "step": 16325, "time_per_iteration": 2.817761182785034 }, { "auxiliary_loss_clip": 0.01056448, "auxiliary_loss_mlp": 0.01032925, "balance_loss_clip": 1.03592491, "balance_loss_mlp": 1.02158785, "epoch": 0.9815722230572674, "flos": 20850023076480.0, "grad_norm": 1.5890667781038148, "language_loss": 0.7638427, "learning_rate": 3.5496360246201063e-09, "loss": 0.78473639, "num_input_tokens_seen": 352404000, "step": 16326, "time_per_iteration": 2.804213523864746 }, { "auxiliary_loss_clip": 0.01075422, "auxiliary_loss_mlp": 0.01032152, "balance_loss_clip": 1.03580999, "balance_loss_mlp": 1.01923585, "epoch": 0.9816323463099353, "flos": 22894525301760.0, "grad_norm": 2.465136585192098, "language_loss": 0.67442954, "learning_rate": 3.5264805072470205e-09, "loss": 0.69550526, "num_input_tokens_seen": 352423540, "step": 16327, "time_per_iteration": 2.725055694580078 }, { "auxiliary_loss_clip": 0.01102074, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.03595459, "balance_loss_mlp": 1.0239681, "epoch": 0.9816924695626034, "flos": 31539444117120.0, "grad_norm": 1.5972210745113198, "language_loss": 0.73710746, "learning_rate": 3.5034006963501337e-09, "loss": 0.75849789, "num_input_tokens_seen": 352445530, "step": 16328, "time_per_iteration": 2.739084243774414 }, { "auxiliary_loss_clip": 0.01091132, "auxiliary_loss_mlp": 0.01035924, "balance_loss_clip": 1.03452396, "balance_loss_mlp": 1.0219171, "epoch": 0.9817525928152713, "flos": 21506901045120.0, "grad_norm": 1.7593287132982667, "language_loss": 0.8105092, "learning_rate": 3.4803965928040802e-09, "loss": 0.83177972, "num_input_tokens_seen": 352466325, "step": 16329, "time_per_iteration": 2.6751110553741455 }, { "auxiliary_loss_clip": 0.0111119, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.03624225, "balance_loss_mlp": 1.01837909, "epoch": 0.9818127160679393, "flos": 25550513683200.0, "grad_norm": 3.221253947949931, "language_loss": 0.75986403, "learning_rate": 3.4574681974817168e-09, "loss": 0.78129113, "num_input_tokens_seen": 352485505, "step": 16330, "time_per_iteration": 2.6681814193725586 }, { "auxiliary_loss_clip": 0.01117551, "auxiliary_loss_mlp": 0.01032164, "balance_loss_clip": 1.03826082, "balance_loss_mlp": 1.01716757, "epoch": 0.9818728393206072, "flos": 28803661672320.0, "grad_norm": 2.4256142149996562, "language_loss": 0.66364849, "learning_rate": 3.434615511252126e-09, "loss": 0.68514568, "num_input_tokens_seen": 352505360, "step": 16331, "time_per_iteration": 2.703917980194092 }, { "auxiliary_loss_clip": 0.01095043, "auxiliary_loss_mlp": 0.0102903, "balance_loss_clip": 1.03584874, "balance_loss_mlp": 1.01704907, "epoch": 0.9819329625732752, "flos": 23222246014080.0, "grad_norm": 1.857287483122114, "language_loss": 0.73337162, "learning_rate": 3.411838534981948e-09, "loss": 0.75461233, "num_input_tokens_seen": 352524035, "step": 16332, "time_per_iteration": 2.650766611099243 }, { "auxiliary_loss_clip": 0.01097564, "auxiliary_loss_mlp": 0.01029848, "balance_loss_clip": 1.03841019, "balance_loss_mlp": 1.01876128, "epoch": 0.9819930858259431, "flos": 17530440883200.0, "grad_norm": 1.7088986460158127, "language_loss": 0.76663387, "learning_rate": 3.389137269534936e-09, "loss": 0.78790796, "num_input_tokens_seen": 352543210, "step": 16333, "time_per_iteration": 2.6083765029907227 }, { "auxiliary_loss_clip": 0.01091914, "auxiliary_loss_mlp": 0.00769838, "balance_loss_clip": 1.03712809, "balance_loss_mlp": 1.00018179, "epoch": 0.9820532090786112, "flos": 12529915971840.0, "grad_norm": 2.124926384042051, "language_loss": 0.72888857, "learning_rate": 3.366511715771958e-09, "loss": 0.74750608, "num_input_tokens_seen": 352559770, "step": 16334, "time_per_iteration": 2.641460657119751 }, { "auxiliary_loss_clip": 0.01059033, "auxiliary_loss_mlp": 0.01035338, "balance_loss_clip": 1.03467429, "balance_loss_mlp": 1.02285099, "epoch": 0.9821133323312792, "flos": 18840174497280.0, "grad_norm": 2.150602428971571, "language_loss": 0.78196549, "learning_rate": 3.3439618745509934e-09, "loss": 0.8029092, "num_input_tokens_seen": 352577690, "step": 16335, "time_per_iteration": 2.813981056213379 }, { "auxiliary_loss_clip": 0.01084888, "auxiliary_loss_mlp": 0.01042166, "balance_loss_clip": 1.03453565, "balance_loss_mlp": 1.02693129, "epoch": 0.9821734555839471, "flos": 34824013528320.0, "grad_norm": 1.9795504478924333, "language_loss": 0.63792658, "learning_rate": 3.3214877467271362e-09, "loss": 0.65919709, "num_input_tokens_seen": 352598850, "step": 16336, "time_per_iteration": 2.8951098918914795 }, { "auxiliary_loss_clip": 0.01077961, "auxiliary_loss_mlp": 0.0103778, "balance_loss_clip": 1.03655946, "balance_loss_mlp": 1.02337968, "epoch": 0.9822335788366151, "flos": 17128169493120.0, "grad_norm": 2.0134726146913517, "language_loss": 0.73876464, "learning_rate": 3.299089333152372e-09, "loss": 0.75992203, "num_input_tokens_seen": 352616130, "step": 16337, "time_per_iteration": 2.7202372550964355 }, { "auxiliary_loss_clip": 0.0109231, "auxiliary_loss_mlp": 0.01031669, "balance_loss_clip": 1.03548503, "balance_loss_mlp": 1.01803732, "epoch": 0.982293702089283, "flos": 20813250528000.0, "grad_norm": 1.6907700121861502, "language_loss": 0.72918296, "learning_rate": 3.2767666346764645e-09, "loss": 0.75042278, "num_input_tokens_seen": 352636885, "step": 16338, "time_per_iteration": 2.5943961143493652 }, { "auxiliary_loss_clip": 0.0104046, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.03025174, "balance_loss_mlp": 1.02005458, "epoch": 0.982353825341951, "flos": 24680829588480.0, "grad_norm": 1.7984966178479147, "language_loss": 0.81313229, "learning_rate": 3.2545196521454045e-09, "loss": 0.83386666, "num_input_tokens_seen": 352657905, "step": 16339, "time_per_iteration": 2.8950557708740234 }, { "auxiliary_loss_clip": 0.01054842, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.02929127, "balance_loss_mlp": 1.02653337, "epoch": 0.982413948594619, "flos": 20850489953280.0, "grad_norm": 1.809625302780829, "language_loss": 0.62418073, "learning_rate": 3.232348386403405e-09, "loss": 0.64512682, "num_input_tokens_seen": 352676320, "step": 16340, "time_per_iteration": 2.8046703338623047 }, { "auxiliary_loss_clip": 0.01112791, "auxiliary_loss_mlp": 0.01031705, "balance_loss_clip": 1.03891397, "balance_loss_mlp": 1.01859832, "epoch": 0.982474071847287, "flos": 15377380778880.0, "grad_norm": 2.356487189204491, "language_loss": 0.86053795, "learning_rate": 3.2102528382904613e-09, "loss": 0.88198292, "num_input_tokens_seen": 352692665, "step": 16341, "time_per_iteration": 2.60111403465271 }, { "auxiliary_loss_clip": 0.0108126, "auxiliary_loss_mlp": 0.01031684, "balance_loss_clip": 1.03337705, "balance_loss_mlp": 1.01934528, "epoch": 0.9825341950999549, "flos": 23774732081280.0, "grad_norm": 1.4139542605019915, "language_loss": 0.66917169, "learning_rate": 3.188233008645014e-09, "loss": 0.69030112, "num_input_tokens_seen": 352716130, "step": 16342, "time_per_iteration": 3.006946325302124 }, { "auxiliary_loss_clip": 0.01109167, "auxiliary_loss_mlp": 0.01027299, "balance_loss_clip": 1.03658962, "balance_loss_mlp": 1.0151639, "epoch": 0.9825943183526229, "flos": 22746285872640.0, "grad_norm": 1.5649008890047298, "language_loss": 0.77261454, "learning_rate": 3.16628889830195e-09, "loss": 0.79397917, "num_input_tokens_seen": 352734705, "step": 16343, "time_per_iteration": 2.623782157897949 }, { "auxiliary_loss_clip": 0.01074162, "auxiliary_loss_mlp": 0.01030309, "balance_loss_clip": 1.03596878, "balance_loss_mlp": 1.01949716, "epoch": 0.9826544416052908, "flos": 27709966408320.0, "grad_norm": 1.540067239801228, "language_loss": 0.75307328, "learning_rate": 3.1444205080932707e-09, "loss": 0.77411795, "num_input_tokens_seen": 352756225, "step": 16344, "time_per_iteration": 2.747864007949829 }, { "auxiliary_loss_clip": 0.0108221, "auxiliary_loss_mlp": 0.01029576, "balance_loss_clip": 1.03211427, "balance_loss_mlp": 1.01698792, "epoch": 0.9827145648579588, "flos": 26941657472640.0, "grad_norm": 3.214033329820644, "language_loss": 0.66152173, "learning_rate": 3.122627838848313e-09, "loss": 0.6826396, "num_input_tokens_seen": 352776210, "step": 16345, "time_per_iteration": 4.445494651794434 }, { "auxiliary_loss_clip": 0.01092474, "auxiliary_loss_mlp": 0.01026144, "balance_loss_clip": 1.03578293, "balance_loss_mlp": 1.0152061, "epoch": 0.9827746881106267, "flos": 21866545969920.0, "grad_norm": 1.4391085603801235, "language_loss": 0.79666579, "learning_rate": 3.1009108913933045e-09, "loss": 0.81785202, "num_input_tokens_seen": 352795455, "step": 16346, "time_per_iteration": 4.2288713455200195 }, { "auxiliary_loss_clip": 0.01098997, "auxiliary_loss_mlp": 0.01037578, "balance_loss_clip": 1.03740525, "balance_loss_mlp": 1.02411294, "epoch": 0.9828348113632948, "flos": 20850777262080.0, "grad_norm": 2.0938671424216024, "language_loss": 0.75089842, "learning_rate": 3.079269666552031e-09, "loss": 0.77226412, "num_input_tokens_seen": 352812895, "step": 16347, "time_per_iteration": 2.571201801300049 }, { "auxiliary_loss_clip": 0.01033873, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.02937937, "balance_loss_mlp": 1.02430809, "epoch": 0.9828949346159628, "flos": 34569227381760.0, "grad_norm": 1.7026010770508515, "language_loss": 0.66808671, "learning_rate": 3.0577041651449474e-09, "loss": 0.68878937, "num_input_tokens_seen": 352835470, "step": 16348, "time_per_iteration": 2.9019980430603027 }, { "auxiliary_loss_clip": 0.01087559, "auxiliary_loss_mlp": 0.01032268, "balance_loss_clip": 1.03562045, "balance_loss_mlp": 1.0198462, "epoch": 0.9829550578686307, "flos": 24457464864000.0, "grad_norm": 1.7338187903548066, "language_loss": 0.69069308, "learning_rate": 3.0362143879898437e-09, "loss": 0.71189135, "num_input_tokens_seen": 352854295, "step": 16349, "time_per_iteration": 2.680927038192749 }, { "auxiliary_loss_clip": 0.01075988, "auxiliary_loss_mlp": 0.01029319, "balance_loss_clip": 1.03591371, "balance_loss_mlp": 1.01804733, "epoch": 0.9830151811212987, "flos": 16910084067840.0, "grad_norm": 2.350613893884081, "language_loss": 0.75915736, "learning_rate": 3.0148003359014018e-09, "loss": 0.78021044, "num_input_tokens_seen": 352869695, "step": 16350, "time_per_iteration": 4.1306681632995605 }, { "auxiliary_loss_clip": 0.01078562, "auxiliary_loss_mlp": 0.01032593, "balance_loss_clip": 1.03499365, "balance_loss_mlp": 1.01986754, "epoch": 0.9830753043739666, "flos": 21288312829440.0, "grad_norm": 2.112400068998379, "language_loss": 0.84269607, "learning_rate": 2.9934620096920826e-09, "loss": 0.86380762, "num_input_tokens_seen": 352887430, "step": 16351, "time_per_iteration": 2.6960017681121826 }, { "auxiliary_loss_clip": 0.0107955, "auxiliary_loss_mlp": 0.0102559, "balance_loss_clip": 1.03737783, "balance_loss_mlp": 1.0136925, "epoch": 0.9831354276266346, "flos": 31723522341120.0, "grad_norm": 1.6146338638201096, "language_loss": 0.68907672, "learning_rate": 2.972199410170795e-09, "loss": 0.71012813, "num_input_tokens_seen": 352907555, "step": 16352, "time_per_iteration": 2.7532811164855957 }, { "auxiliary_loss_clip": 0.01088475, "auxiliary_loss_mlp": 0.00769371, "balance_loss_clip": 1.03576922, "balance_loss_mlp": 1.00027871, "epoch": 0.9831955508793025, "flos": 21619050284160.0, "grad_norm": 1.4138760656880254, "language_loss": 0.66266984, "learning_rate": 2.951012538143782e-09, "loss": 0.68124831, "num_input_tokens_seen": 352928670, "step": 16353, "time_per_iteration": 2.6439483165740967 }, { "auxiliary_loss_clip": 0.01082262, "auxiliary_loss_mlp": 0.01030151, "balance_loss_clip": 1.03453684, "balance_loss_mlp": 1.01872444, "epoch": 0.9832556741319706, "flos": 22968214053120.0, "grad_norm": 1.5813502969627034, "language_loss": 0.74711162, "learning_rate": 2.9299013944144025e-09, "loss": 0.76823574, "num_input_tokens_seen": 352948345, "step": 16354, "time_per_iteration": 2.6886255741119385 }, { "auxiliary_loss_clip": 0.01098034, "auxiliary_loss_mlp": 0.010272, "balance_loss_clip": 1.03713632, "balance_loss_mlp": 1.01496959, "epoch": 0.9833157973846385, "flos": 21323900229120.0, "grad_norm": 2.034749936082402, "language_loss": 0.77509081, "learning_rate": 2.9088659797835702e-09, "loss": 0.79634321, "num_input_tokens_seen": 352967250, "step": 16355, "time_per_iteration": 2.655395269393921 }, { "auxiliary_loss_clip": 0.01094864, "auxiliary_loss_mlp": 0.01028209, "balance_loss_clip": 1.03562486, "balance_loss_mlp": 1.01627064, "epoch": 0.9833759206373065, "flos": 21068719032960.0, "grad_norm": 2.2520856858074594, "language_loss": 0.73119497, "learning_rate": 2.8879062950484256e-09, "loss": 0.75242567, "num_input_tokens_seen": 352984725, "step": 16356, "time_per_iteration": 2.604156017303467 }, { "auxiliary_loss_clip": 0.01082002, "auxiliary_loss_mlp": 0.01032823, "balance_loss_clip": 1.03355122, "balance_loss_mlp": 1.02010965, "epoch": 0.9834360438899744, "flos": 18697322108160.0, "grad_norm": 1.536085672752046, "language_loss": 0.75979388, "learning_rate": 2.8670223410041104e-09, "loss": 0.7809422, "num_input_tokens_seen": 353003480, "step": 16357, "time_per_iteration": 2.685453176498413 }, { "auxiliary_loss_clip": 0.01086973, "auxiliary_loss_mlp": 0.01028039, "balance_loss_clip": 1.03633261, "balance_loss_mlp": 1.01561737, "epoch": 0.9834961671426424, "flos": 21105240186240.0, "grad_norm": 2.377018060234898, "language_loss": 0.80362308, "learning_rate": 2.846214118442436e-09, "loss": 0.82477319, "num_input_tokens_seen": 353021425, "step": 16358, "time_per_iteration": 2.672687292098999 }, { "auxiliary_loss_clip": 0.01095168, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.03404856, "balance_loss_mlp": 1.01396728, "epoch": 0.9835562903953103, "flos": 26687625511680.0, "grad_norm": 2.5788189442251848, "language_loss": 0.67699122, "learning_rate": 2.8254816281523263e-09, "loss": 0.69820142, "num_input_tokens_seen": 353039870, "step": 16359, "time_per_iteration": 2.603217601776123 }, { "auxiliary_loss_clip": 0.0110407, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.03442788, "balance_loss_mlp": 1.01891446, "epoch": 0.9836164136479784, "flos": 22090162089600.0, "grad_norm": 1.6643082336304196, "language_loss": 0.69579446, "learning_rate": 2.804824870920264e-09, "loss": 0.71713769, "num_input_tokens_seen": 353059750, "step": 16360, "time_per_iteration": 2.590282440185547 }, { "auxiliary_loss_clip": 0.01097129, "auxiliary_loss_mlp": 0.01035655, "balance_loss_clip": 1.03531575, "balance_loss_mlp": 1.02293587, "epoch": 0.9836765369006463, "flos": 23878405710720.0, "grad_norm": 1.804692326953609, "language_loss": 0.8430177, "learning_rate": 2.7842438475293996e-09, "loss": 0.86434555, "num_input_tokens_seen": 353079940, "step": 16361, "time_per_iteration": 2.667570114135742 }, { "auxiliary_loss_clip": 0.01107883, "auxiliary_loss_mlp": 0.01027313, "balance_loss_clip": 1.03631568, "balance_loss_mlp": 1.01540446, "epoch": 0.9837366601533143, "flos": 25845017293440.0, "grad_norm": 1.7879750486860067, "language_loss": 0.75830048, "learning_rate": 2.76373855876022e-09, "loss": 0.77965236, "num_input_tokens_seen": 353099990, "step": 16362, "time_per_iteration": 2.5723037719726562 }, { "auxiliary_loss_clip": 0.01109574, "auxiliary_loss_mlp": 0.01035763, "balance_loss_clip": 1.03702784, "balance_loss_mlp": 1.023103, "epoch": 0.9837967834059823, "flos": 21358015171200.0, "grad_norm": 1.8659793314659867, "language_loss": 0.71063733, "learning_rate": 2.7433090053901043e-09, "loss": 0.73209071, "num_input_tokens_seen": 353118710, "step": 16363, "time_per_iteration": 2.580556631088257 }, { "auxiliary_loss_clip": 0.01083391, "auxiliary_loss_mlp": 0.01030702, "balance_loss_clip": 1.03492188, "balance_loss_mlp": 1.01919198, "epoch": 0.9838569066586502, "flos": 18515793749760.0, "grad_norm": 2.149367136223202, "language_loss": 0.63062841, "learning_rate": 2.7229551881937653e-09, "loss": 0.65176934, "num_input_tokens_seen": 353136415, "step": 16364, "time_per_iteration": 2.6873748302459717 }, { "auxiliary_loss_clip": 0.01071986, "auxiliary_loss_mlp": 0.01031043, "balance_loss_clip": 1.04158378, "balance_loss_mlp": 1.01967084, "epoch": 0.9839170299113182, "flos": 22452392793600.0, "grad_norm": 1.5415718965225467, "language_loss": 0.75180268, "learning_rate": 2.702677107943252e-09, "loss": 0.77283293, "num_input_tokens_seen": 353154650, "step": 16365, "time_per_iteration": 2.7838945388793945 }, { "auxiliary_loss_clip": 0.0106364, "auxiliary_loss_mlp": 0.01028118, "balance_loss_clip": 1.03559554, "balance_loss_mlp": 1.01572597, "epoch": 0.9839771531639862, "flos": 27892320779520.0, "grad_norm": 2.0418627891356365, "language_loss": 0.76325071, "learning_rate": 2.6824747654072832e-09, "loss": 0.78416824, "num_input_tokens_seen": 353174065, "step": 16366, "time_per_iteration": 2.723862886428833 }, { "auxiliary_loss_clip": 0.01105139, "auxiliary_loss_mlp": 0.01026883, "balance_loss_clip": 1.03549993, "balance_loss_mlp": 1.01568365, "epoch": 0.9840372764166542, "flos": 28214510797440.0, "grad_norm": 1.5805895259506346, "language_loss": 0.77362347, "learning_rate": 2.662348161352357e-09, "loss": 0.79494369, "num_input_tokens_seen": 353193560, "step": 16367, "time_per_iteration": 2.6186344623565674 }, { "auxiliary_loss_clip": 0.01085162, "auxiliary_loss_mlp": 0.01036358, "balance_loss_clip": 1.0372721, "balance_loss_mlp": 1.02363229, "epoch": 0.9840973996693221, "flos": 23403989854080.0, "grad_norm": 1.6315530107439746, "language_loss": 0.6176089, "learning_rate": 2.642297296540974e-09, "loss": 0.63882411, "num_input_tokens_seen": 353213525, "step": 16368, "time_per_iteration": 2.7051217555999756 }, { "auxiliary_loss_clip": 0.01093129, "auxiliary_loss_mlp": 0.0103668, "balance_loss_clip": 1.03431225, "balance_loss_mlp": 1.02538538, "epoch": 0.9841575229219901, "flos": 21395865127680.0, "grad_norm": 1.4838055886631645, "language_loss": 0.65539753, "learning_rate": 2.6223221717340816e-09, "loss": 0.67669564, "num_input_tokens_seen": 353234000, "step": 16369, "time_per_iteration": 2.684190273284912 }, { "auxiliary_loss_clip": 0.01098619, "auxiliary_loss_mlp": 0.00771023, "balance_loss_clip": 1.03682351, "balance_loss_mlp": 1.00028467, "epoch": 0.984217646174658, "flos": 24464072966400.0, "grad_norm": 2.1848510788789053, "language_loss": 0.68529809, "learning_rate": 2.6024227876886295e-09, "loss": 0.70399457, "num_input_tokens_seen": 353254940, "step": 16370, "time_per_iteration": 2.690066337585449 }, { "auxiliary_loss_clip": 0.01109517, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 1.03602791, "balance_loss_mlp": 1.02231407, "epoch": 0.984277769427326, "flos": 16435057680000.0, "grad_norm": 1.7624959425131688, "language_loss": 0.73149407, "learning_rate": 2.582599145159792e-09, "loss": 0.75294757, "num_input_tokens_seen": 353272590, "step": 16371, "time_per_iteration": 2.647754669189453 }, { "auxiliary_loss_clip": 0.01019499, "auxiliary_loss_mlp": 0.01000721, "balance_loss_clip": 1.00614071, "balance_loss_mlp": 0.99977916, "epoch": 0.9843378926799939, "flos": 64530615288960.0, "grad_norm": 0.8676443160581451, "language_loss": 0.65173286, "learning_rate": 2.562851244898745e-09, "loss": 0.67193508, "num_input_tokens_seen": 353334380, "step": 16372, "time_per_iteration": 3.1656858921051025 }, { "auxiliary_loss_clip": 0.01095097, "auxiliary_loss_mlp": 0.01034077, "balance_loss_clip": 1.03569186, "balance_loss_mlp": 1.02207279, "epoch": 0.984398015932662, "flos": 17382811985280.0, "grad_norm": 1.8275470136153955, "language_loss": 0.70694923, "learning_rate": 2.5431790876544456e-09, "loss": 0.72824109, "num_input_tokens_seen": 353351640, "step": 16373, "time_per_iteration": 2.658825635910034 }, { "auxiliary_loss_clip": 0.0110683, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.03669751, "balance_loss_mlp": 1.01893306, "epoch": 0.9844581391853299, "flos": 23879088069120.0, "grad_norm": 1.8485344334805096, "language_loss": 0.81536216, "learning_rate": 2.523582674173186e-09, "loss": 0.83673871, "num_input_tokens_seen": 353372555, "step": 16374, "time_per_iteration": 2.6585822105407715 }, { "auxiliary_loss_clip": 0.01064423, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.03823948, "balance_loss_mlp": 1.02220547, "epoch": 0.9845182624379979, "flos": 19865352568320.0, "grad_norm": 1.693148116934704, "language_loss": 0.69581914, "learning_rate": 2.504062005197927e-09, "loss": 0.71680415, "num_input_tokens_seen": 353391385, "step": 16375, "time_per_iteration": 2.7366557121276855 }, { "auxiliary_loss_clip": 0.01083548, "auxiliary_loss_mlp": 0.01043522, "balance_loss_clip": 1.03258562, "balance_loss_mlp": 1.02908564, "epoch": 0.9845783856906659, "flos": 28254659224320.0, "grad_norm": 2.704312105533632, "language_loss": 0.81189835, "learning_rate": 2.484617081468521e-09, "loss": 0.83316898, "num_input_tokens_seen": 353411630, "step": 16376, "time_per_iteration": 2.695854663848877 }, { "auxiliary_loss_clip": 0.01105113, "auxiliary_loss_mlp": 0.01036471, "balance_loss_clip": 1.03517056, "balance_loss_mlp": 1.02441287, "epoch": 0.9846385089433338, "flos": 28328383889280.0, "grad_norm": 1.6882577755341805, "language_loss": 0.62188119, "learning_rate": 2.4652479037228224e-09, "loss": 0.64329708, "num_input_tokens_seen": 353432895, "step": 16377, "time_per_iteration": 2.655351161956787 }, { "auxiliary_loss_clip": 0.01079528, "auxiliary_loss_mlp": 0.01034135, "balance_loss_clip": 1.03616428, "balance_loss_mlp": 1.02145696, "epoch": 0.9846986321960018, "flos": 24316767290880.0, "grad_norm": 1.743655266311487, "language_loss": 0.72909814, "learning_rate": 2.445954472695133e-09, "loss": 0.75023472, "num_input_tokens_seen": 353454195, "step": 16378, "time_per_iteration": 2.7620902061462402 }, { "auxiliary_loss_clip": 0.01107968, "auxiliary_loss_mlp": 0.0103452, "balance_loss_clip": 1.0362848, "balance_loss_mlp": 1.02246761, "epoch": 0.9847587554486698, "flos": 27271999877760.0, "grad_norm": 1.9591429215255713, "language_loss": 0.71231186, "learning_rate": 2.426736789116868e-09, "loss": 0.73373675, "num_input_tokens_seen": 353475125, "step": 16379, "time_per_iteration": 2.6217269897460938 }, { "auxiliary_loss_clip": 0.01076838, "auxiliary_loss_mlp": 0.01033584, "balance_loss_clip": 1.03647435, "balance_loss_mlp": 1.02120376, "epoch": 0.9848188787013378, "flos": 16542717719040.0, "grad_norm": 1.9414180351359185, "language_loss": 0.68380785, "learning_rate": 2.407594853716999e-09, "loss": 0.70491207, "num_input_tokens_seen": 353493265, "step": 16380, "time_per_iteration": 2.6951489448547363 }, { "auxiliary_loss_clip": 0.01078007, "auxiliary_loss_mlp": 0.01037173, "balance_loss_clip": 1.0345068, "balance_loss_mlp": 1.02463818, "epoch": 0.9848790019540057, "flos": 20193647898240.0, "grad_norm": 2.8812935007679146, "language_loss": 0.78948879, "learning_rate": 2.38852866722139e-09, "loss": 0.81064057, "num_input_tokens_seen": 353511650, "step": 16381, "time_per_iteration": 2.733790159225464 }, { "auxiliary_loss_clip": 0.01095102, "auxiliary_loss_mlp": 0.01029949, "balance_loss_clip": 1.03512669, "balance_loss_mlp": 1.01729441, "epoch": 0.9849391252066737, "flos": 28259723041920.0, "grad_norm": 1.4147052567064669, "language_loss": 0.82457852, "learning_rate": 2.3695382303527965e-09, "loss": 0.84582901, "num_input_tokens_seen": 353534035, "step": 16382, "time_per_iteration": 2.738605499267578 }, { "auxiliary_loss_clip": 0.01081484, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.03230476, "balance_loss_mlp": 1.02016735, "epoch": 0.9849992484593416, "flos": 22454942659200.0, "grad_norm": 1.7473709928633554, "language_loss": 0.74585968, "learning_rate": 2.3506235438315316e-09, "loss": 0.76701248, "num_input_tokens_seen": 353549950, "step": 16383, "time_per_iteration": 2.754387378692627 }, { "auxiliary_loss_clip": 0.01064953, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.03860319, "balance_loss_mlp": 1.01868236, "epoch": 0.9850593717120096, "flos": 34497190656000.0, "grad_norm": 1.8075355031260896, "language_loss": 0.66479164, "learning_rate": 2.3317846083750203e-09, "loss": 0.68575138, "num_input_tokens_seen": 353573745, "step": 16384, "time_per_iteration": 2.9240455627441406 }, { "auxiliary_loss_clip": 0.01090885, "auxiliary_loss_mlp": 0.01035183, "balance_loss_clip": 1.03831267, "balance_loss_mlp": 1.02080083, "epoch": 0.9851194949646775, "flos": 38837282152320.0, "grad_norm": 1.832467391931212, "language_loss": 0.70495671, "learning_rate": 2.313021424697359e-09, "loss": 0.72621739, "num_input_tokens_seen": 353595335, "step": 16385, "time_per_iteration": 6.049696922302246 }, { "auxiliary_loss_clip": 0.01090368, "auxiliary_loss_mlp": 0.01031869, "balance_loss_clip": 1.03864157, "balance_loss_mlp": 1.01980531, "epoch": 0.9851796182173456, "flos": 17712436118400.0, "grad_norm": 2.4314123145549336, "language_loss": 0.81251216, "learning_rate": 2.294333993509978e-09, "loss": 0.83373451, "num_input_tokens_seen": 353614270, "step": 16386, "time_per_iteration": 2.663780689239502 }, { "auxiliary_loss_clip": 0.01079909, "auxiliary_loss_mlp": 0.01031709, "balance_loss_clip": 1.03440416, "balance_loss_mlp": 1.01883996, "epoch": 0.9852397414700135, "flos": 27454318335360.0, "grad_norm": 1.9863892677340445, "language_loss": 0.67923307, "learning_rate": 2.2757223155216442e-09, "loss": 0.70034921, "num_input_tokens_seen": 353634900, "step": 16387, "time_per_iteration": 2.7573816776275635 }, { "auxiliary_loss_clip": 0.01089839, "auxiliary_loss_mlp": 0.00769242, "balance_loss_clip": 1.03422558, "balance_loss_mlp": 1.00012159, "epoch": 0.9852998647226815, "flos": 18296702743680.0, "grad_norm": 1.7527242127962226, "language_loss": 0.74020231, "learning_rate": 2.257186391438237e-09, "loss": 0.75879306, "num_input_tokens_seen": 353652890, "step": 16388, "time_per_iteration": 2.6196138858795166 }, { "auxiliary_loss_clip": 0.01089517, "auxiliary_loss_mlp": 0.01032434, "balance_loss_clip": 1.03372729, "balance_loss_mlp": 1.02051258, "epoch": 0.9853599879753495, "flos": 19642562461440.0, "grad_norm": 1.885399673475778, "language_loss": 0.82288045, "learning_rate": 2.238726221962528e-09, "loss": 0.8441, "num_input_tokens_seen": 353671295, "step": 16389, "time_per_iteration": 4.203902959823608 }, { "auxiliary_loss_clip": 0.01086383, "auxiliary_loss_mlp": 0.00770398, "balance_loss_clip": 1.03422093, "balance_loss_mlp": 1.00023174, "epoch": 0.9854201112280174, "flos": 23841956384640.0, "grad_norm": 2.246145821478881, "language_loss": 0.67169315, "learning_rate": 2.2203418077946234e-09, "loss": 0.69026095, "num_input_tokens_seen": 353690560, "step": 16390, "time_per_iteration": 2.7021732330322266 }, { "auxiliary_loss_clip": 0.01070253, "auxiliary_loss_mlp": 0.01034626, "balance_loss_clip": 1.03694236, "balance_loss_mlp": 1.02117944, "epoch": 0.9854802344806854, "flos": 30080573233920.0, "grad_norm": 1.5706472092274895, "language_loss": 0.77193004, "learning_rate": 2.2020331496312994e-09, "loss": 0.79297888, "num_input_tokens_seen": 353710660, "step": 16391, "time_per_iteration": 2.763343572616577 }, { "auxiliary_loss_clip": 0.01066236, "auxiliary_loss_mlp": 0.00769461, "balance_loss_clip": 1.03303838, "balance_loss_mlp": 1.00014699, "epoch": 0.9855403577333534, "flos": 21907412668800.0, "grad_norm": 2.034127349616756, "language_loss": 0.6821295, "learning_rate": 2.1838002481673333e-09, "loss": 0.70048642, "num_input_tokens_seen": 353730440, "step": 16392, "time_per_iteration": 2.741312026977539 }, { "auxiliary_loss_clip": 0.01076854, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.0342617, "balance_loss_mlp": 1.0166111, "epoch": 0.9856004809860214, "flos": 15413794191360.0, "grad_norm": 2.041115164847186, "language_loss": 0.55706286, "learning_rate": 2.1656431040937286e-09, "loss": 0.57813406, "num_input_tokens_seen": 353748360, "step": 16393, "time_per_iteration": 2.6840660572052 }, { "auxiliary_loss_clip": 0.01074406, "auxiliary_loss_mlp": 0.01031603, "balance_loss_clip": 1.0325073, "balance_loss_mlp": 1.01706505, "epoch": 0.9856606042386893, "flos": 13653201064320.0, "grad_norm": 2.576410490354787, "language_loss": 0.79111844, "learning_rate": 2.1475617180990444e-09, "loss": 0.81217849, "num_input_tokens_seen": 353760880, "step": 16394, "time_per_iteration": 2.683983087539673 }, { "auxiliary_loss_clip": 0.0109509, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.03339911, "balance_loss_mlp": 1.02005494, "epoch": 0.9857207274913573, "flos": 23479151063040.0, "grad_norm": 1.5070932402692028, "language_loss": 0.76305884, "learning_rate": 2.129556090869178e-09, "loss": 0.78434312, "num_input_tokens_seen": 353782255, "step": 16395, "time_per_iteration": 2.694324254989624 }, { "auxiliary_loss_clip": 0.01094719, "auxiliary_loss_mlp": 0.01031611, "balance_loss_clip": 1.03447509, "balance_loss_mlp": 1.01911831, "epoch": 0.9857808507440252, "flos": 21065486808960.0, "grad_norm": 1.9132501064588425, "language_loss": 0.7550149, "learning_rate": 2.1116262230866933e-09, "loss": 0.77627826, "num_input_tokens_seen": 353803580, "step": 16396, "time_per_iteration": 2.6768436431884766 }, { "auxiliary_loss_clip": 0.01070405, "auxiliary_loss_mlp": 0.01026163, "balance_loss_clip": 1.03497076, "balance_loss_mlp": 1.01392639, "epoch": 0.9858409739966932, "flos": 25301365971840.0, "grad_norm": 1.5225711689164605, "language_loss": 0.7122134, "learning_rate": 2.0937721154317133e-09, "loss": 0.73317909, "num_input_tokens_seen": 353824200, "step": 16397, "time_per_iteration": 2.7246475219726562 }, { "auxiliary_loss_clip": 0.01081841, "auxiliary_loss_mlp": 0.01028049, "balance_loss_clip": 1.0351944, "balance_loss_mlp": 1.01624179, "epoch": 0.9859010972493611, "flos": 20558751690240.0, "grad_norm": 1.7750069543692049, "language_loss": 0.7137388, "learning_rate": 2.0759937685810304e-09, "loss": 0.73483771, "num_input_tokens_seen": 353843350, "step": 16398, "time_per_iteration": 2.6708950996398926 }, { "auxiliary_loss_clip": 0.0106975, "auxiliary_loss_mlp": 0.01026745, "balance_loss_clip": 1.03256011, "balance_loss_mlp": 1.01534224, "epoch": 0.9859612205020292, "flos": 24754985216640.0, "grad_norm": 2.8400971198256215, "language_loss": 0.73956269, "learning_rate": 2.058291183208771e-09, "loss": 0.76052767, "num_input_tokens_seen": 353864520, "step": 16399, "time_per_iteration": 2.7505059242248535 }, { "auxiliary_loss_clip": 0.01107815, "auxiliary_loss_mlp": 0.01030039, "balance_loss_clip": 1.03546059, "balance_loss_mlp": 1.01738548, "epoch": 0.9860213437546971, "flos": 21105850717440.0, "grad_norm": 2.280806532195227, "language_loss": 0.57755244, "learning_rate": 2.0406643599863993e-09, "loss": 0.59893095, "num_input_tokens_seen": 353882240, "step": 16400, "time_per_iteration": 2.5837459564208984 }, { "auxiliary_loss_clip": 0.01087543, "auxiliary_loss_mlp": 0.01029777, "balance_loss_clip": 1.03587925, "balance_loss_mlp": 1.01624036, "epoch": 0.9860814670073651, "flos": 19136078737920.0, "grad_norm": 7.9161501077476775, "language_loss": 0.80533803, "learning_rate": 2.023113299582491e-09, "loss": 0.82651126, "num_input_tokens_seen": 353901590, "step": 16401, "time_per_iteration": 2.676846742630005 }, { "auxiliary_loss_clip": 0.01095929, "auxiliary_loss_mlp": 0.01033178, "balance_loss_clip": 1.03656411, "balance_loss_mlp": 1.02002931, "epoch": 0.9861415902600331, "flos": 17237050594560.0, "grad_norm": 1.9620055100104796, "language_loss": 0.77909809, "learning_rate": 2.005638002662069e-09, "loss": 0.80038917, "num_input_tokens_seen": 353918785, "step": 16402, "time_per_iteration": 2.580324172973633 }, { "auxiliary_loss_clip": 0.01099134, "auxiliary_loss_mlp": 0.01035077, "balance_loss_clip": 1.03702092, "balance_loss_mlp": 1.02319241, "epoch": 0.986201713512701, "flos": 27782577751680.0, "grad_norm": 1.8385726831624305, "language_loss": 0.69819796, "learning_rate": 1.9882384698881596e-09, "loss": 0.71954, "num_input_tokens_seen": 353940390, "step": 16403, "time_per_iteration": 2.6051719188690186 }, { "auxiliary_loss_clip": 0.01092549, "auxiliary_loss_mlp": 0.0102806, "balance_loss_clip": 1.03301835, "balance_loss_mlp": 1.01602554, "epoch": 0.986261836765369, "flos": 28730403884160.0, "grad_norm": 2.0540712691142, "language_loss": 0.74826646, "learning_rate": 1.9709147019204566e-09, "loss": 0.76947248, "num_input_tokens_seen": 353962180, "step": 16404, "time_per_iteration": 2.6757051944732666 }, { "auxiliary_loss_clip": 0.01096235, "auxiliary_loss_mlp": 0.00769718, "balance_loss_clip": 1.03480124, "balance_loss_mlp": 1.00010228, "epoch": 0.986321960018037, "flos": 34313471568000.0, "grad_norm": 1.7889327818353045, "language_loss": 0.69631529, "learning_rate": 1.953666699415768e-09, "loss": 0.71497488, "num_input_tokens_seen": 353984305, "step": 16405, "time_per_iteration": 2.7109172344207764 }, { "auxiliary_loss_clip": 0.01085878, "auxiliary_loss_mlp": 0.01034898, "balance_loss_clip": 1.03745413, "balance_loss_mlp": 1.02344775, "epoch": 0.986382083270705, "flos": 25189755436800.0, "grad_norm": 1.6951529246514412, "language_loss": 0.69703031, "learning_rate": 1.93649446302846e-09, "loss": 0.718238, "num_input_tokens_seen": 354004495, "step": 16406, "time_per_iteration": 2.725384473800659 }, { "auxiliary_loss_clip": 0.01049811, "auxiliary_loss_mlp": 0.01032731, "balance_loss_clip": 1.03564012, "balance_loss_mlp": 1.02081645, "epoch": 0.9864422065233729, "flos": 11025904671360.0, "grad_norm": 3.370275127153346, "language_loss": 0.74895245, "learning_rate": 1.9193979934095663e-09, "loss": 0.76977789, "num_input_tokens_seen": 354015985, "step": 16407, "time_per_iteration": 2.711702585220337 }, { "auxiliary_loss_clip": 0.01083953, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.03477526, "balance_loss_mlp": 1.01853251, "epoch": 0.9865023297760409, "flos": 16545590807040.0, "grad_norm": 2.111055087475785, "language_loss": 0.77460712, "learning_rate": 1.9023772912072357e-09, "loss": 0.79575551, "num_input_tokens_seen": 354033260, "step": 16408, "time_per_iteration": 2.593550443649292 }, { "auxiliary_loss_clip": 0.01101693, "auxiliary_loss_mlp": 0.01032331, "balance_loss_clip": 1.0380075, "balance_loss_mlp": 1.01906323, "epoch": 0.9865624530287088, "flos": 18880179269760.0, "grad_norm": 1.9451476197970003, "language_loss": 0.68269604, "learning_rate": 1.8854323570669515e-09, "loss": 0.70403636, "num_input_tokens_seen": 354052825, "step": 16409, "time_per_iteration": 2.587090492248535 }, { "auxiliary_loss_clip": 0.01011915, "auxiliary_loss_mlp": 0.01002193, "balance_loss_clip": 1.00871253, "balance_loss_mlp": 1.00125718, "epoch": 0.9866225762813768, "flos": 68887798680960.0, "grad_norm": 0.806349802754998, "language_loss": 0.61002564, "learning_rate": 1.8685631916313118e-09, "loss": 0.63016677, "num_input_tokens_seen": 354113920, "step": 16410, "time_per_iteration": 3.278089761734009 }, { "auxiliary_loss_clip": 0.0109769, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.03702283, "balance_loss_mlp": 1.02077615, "epoch": 0.9866826995340447, "flos": 29023111814400.0, "grad_norm": 3.0120361411647005, "language_loss": 0.65963012, "learning_rate": 1.8517697955400258e-09, "loss": 0.68093634, "num_input_tokens_seen": 354134210, "step": 16411, "time_per_iteration": 2.632351875305176 }, { "auxiliary_loss_clip": 0.01027186, "auxiliary_loss_mlp": 0.0100133, "balance_loss_clip": 1.00486875, "balance_loss_mlp": 1.00040567, "epoch": 0.9867428227867128, "flos": 65376814867200.0, "grad_norm": 0.7224745052479478, "language_loss": 0.56279814, "learning_rate": 1.8350521694299182e-09, "loss": 0.58308327, "num_input_tokens_seen": 354198010, "step": 16412, "time_per_iteration": 3.1897354125976562 }, { "auxiliary_loss_clip": 0.01079312, "auxiliary_loss_mlp": 0.01032576, "balance_loss_clip": 1.0352391, "balance_loss_mlp": 1.01961815, "epoch": 0.9868029460393807, "flos": 26506312634880.0, "grad_norm": 2.0935942074241685, "language_loss": 0.72890359, "learning_rate": 1.818410313934926e-09, "loss": 0.75002241, "num_input_tokens_seen": 354220000, "step": 16413, "time_per_iteration": 2.710663080215454 }, { "auxiliary_loss_clip": 0.01060652, "auxiliary_loss_mlp": 0.01030308, "balance_loss_clip": 1.03323412, "balance_loss_mlp": 1.01750505, "epoch": 0.9868630692920487, "flos": 22967280299520.0, "grad_norm": 2.0312404595944664, "language_loss": 0.71431053, "learning_rate": 1.8018442296858782e-09, "loss": 0.73522013, "num_input_tokens_seen": 354240910, "step": 16414, "time_per_iteration": 2.7031588554382324 }, { "auxiliary_loss_clip": 0.01089485, "auxiliary_loss_mlp": 0.01036435, "balance_loss_clip": 1.03525519, "balance_loss_mlp": 1.02446055, "epoch": 0.9869231925447167, "flos": 19828687760640.0, "grad_norm": 1.5435516461575216, "language_loss": 0.7039904, "learning_rate": 1.7853539173111608e-09, "loss": 0.72524959, "num_input_tokens_seen": 354259430, "step": 16415, "time_per_iteration": 2.702089309692383 }, { "auxiliary_loss_clip": 0.01066346, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.0336014, "balance_loss_mlp": 1.02089763, "epoch": 0.9869833157973846, "flos": 20195228096640.0, "grad_norm": 2.9079123838637604, "language_loss": 0.75465488, "learning_rate": 1.7689393774362737e-09, "loss": 0.77563846, "num_input_tokens_seen": 354279490, "step": 16416, "time_per_iteration": 2.703504800796509 }, { "auxiliary_loss_clip": 0.0108217, "auxiliary_loss_mlp": 0.01030102, "balance_loss_clip": 1.03591037, "balance_loss_mlp": 1.01787734, "epoch": 0.9870434390500527, "flos": 16099507802880.0, "grad_norm": 2.1800846259576216, "language_loss": 0.70927489, "learning_rate": 1.7526006106833858e-09, "loss": 0.7303977, "num_input_tokens_seen": 354295080, "step": 16417, "time_per_iteration": 2.694063663482666 }, { "auxiliary_loss_clip": 0.01087544, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 1.03795171, "balance_loss_mlp": 1.02209926, "epoch": 0.9871035623027206, "flos": 21760753438080.0, "grad_norm": 1.6696226113868622, "language_loss": 0.70512295, "learning_rate": 1.7363376176720013e-09, "loss": 0.72634757, "num_input_tokens_seen": 354314610, "step": 16418, "time_per_iteration": 2.7078118324279785 }, { "auxiliary_loss_clip": 0.01027164, "auxiliary_loss_mlp": 0.01000807, "balance_loss_clip": 1.00479984, "balance_loss_mlp": 0.99982989, "epoch": 0.9871636855553886, "flos": 70219583245440.0, "grad_norm": 0.658515497705567, "language_loss": 0.53645599, "learning_rate": 1.7201503990189603e-09, "loss": 0.55673575, "num_input_tokens_seen": 354383115, "step": 16419, "time_per_iteration": 3.2428295612335205 }, { "auxiliary_loss_clip": 0.01087155, "auxiliary_loss_mlp": 0.0104011, "balance_loss_clip": 1.03351521, "balance_loss_mlp": 1.0263052, "epoch": 0.9872238088080565, "flos": 25045825639680.0, "grad_norm": 2.1175189547094457, "language_loss": 0.77917069, "learning_rate": 1.7040389553382162e-09, "loss": 0.80044335, "num_input_tokens_seen": 354403115, "step": 16420, "time_per_iteration": 2.6854612827301025 }, { "auxiliary_loss_clip": 0.01071773, "auxiliary_loss_mlp": 0.01029772, "balance_loss_clip": 1.03993893, "balance_loss_mlp": 1.01702881, "epoch": 0.9872839320607245, "flos": 19465846525440.0, "grad_norm": 2.3612194130787505, "language_loss": 0.70805871, "learning_rate": 1.6880032872403916e-09, "loss": 0.72907424, "num_input_tokens_seen": 354424520, "step": 16421, "time_per_iteration": 2.7082440853118896 }, { "auxiliary_loss_clip": 0.01100703, "auxiliary_loss_mlp": 0.01035927, "balance_loss_clip": 1.03684855, "balance_loss_mlp": 1.02248001, "epoch": 0.9873440553133924, "flos": 26942914448640.0, "grad_norm": 2.428011594135223, "language_loss": 0.82735991, "learning_rate": 1.6720433953338886e-09, "loss": 0.84872615, "num_input_tokens_seen": 354444800, "step": 16422, "time_per_iteration": 2.6437931060791016 }, { "auxiliary_loss_clip": 0.0107317, "auxiliary_loss_mlp": 0.01028669, "balance_loss_clip": 1.03409743, "balance_loss_mlp": 1.0163486, "epoch": 0.9874041785660604, "flos": 19062210418560.0, "grad_norm": 1.6808127811152613, "language_loss": 0.86108935, "learning_rate": 1.656159280223779e-09, "loss": 0.88210779, "num_input_tokens_seen": 354464590, "step": 16423, "time_per_iteration": 2.7554445266723633 }, { "auxiliary_loss_clip": 0.01100362, "auxiliary_loss_mlp": 0.01026203, "balance_loss_clip": 1.03655839, "balance_loss_mlp": 1.01384747, "epoch": 0.9874643018187284, "flos": 21105814803840.0, "grad_norm": 2.086841049232087, "language_loss": 0.70854056, "learning_rate": 1.6403509425122475e-09, "loss": 0.72980618, "num_input_tokens_seen": 354484145, "step": 16424, "time_per_iteration": 7.414201736450195 }, { "auxiliary_loss_clip": 0.01097827, "auxiliary_loss_mlp": 0.00769696, "balance_loss_clip": 1.03443944, "balance_loss_mlp": 1.00012803, "epoch": 0.9875244250713964, "flos": 24426043441920.0, "grad_norm": 2.386486368838744, "language_loss": 0.80787611, "learning_rate": 1.6246183827990366e-09, "loss": 0.82655132, "num_input_tokens_seen": 354502475, "step": 16425, "time_per_iteration": 2.6806702613830566 }, { "auxiliary_loss_clip": 0.0105599, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.03098166, "balance_loss_mlp": 1.01937222, "epoch": 0.9875845483240643, "flos": 25117610970240.0, "grad_norm": 1.8226222464901614, "language_loss": 0.79747486, "learning_rate": 1.6089616016803364e-09, "loss": 0.81836694, "num_input_tokens_seen": 354521855, "step": 16426, "time_per_iteration": 2.931814432144165 }, { "auxiliary_loss_clip": 0.01099233, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.03837609, "balance_loss_mlp": 1.02355909, "epoch": 0.9876446715767323, "flos": 16581788737920.0, "grad_norm": 1.762658511590331, "language_loss": 0.84837222, "learning_rate": 1.593380599750338e-09, "loss": 0.8697226, "num_input_tokens_seen": 354539535, "step": 16427, "time_per_iteration": 2.615586042404175 }, { "auxiliary_loss_clip": 0.01107577, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 1.03742325, "balance_loss_mlp": 1.01907742, "epoch": 0.9877047948294003, "flos": 21616141282560.0, "grad_norm": 1.7238113053204014, "language_loss": 0.70466417, "learning_rate": 1.577875377599458e-09, "loss": 0.72605133, "num_input_tokens_seen": 354557430, "step": 16428, "time_per_iteration": 2.5831527709960938 }, { "auxiliary_loss_clip": 0.01068786, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.03269923, "balance_loss_mlp": 1.02058625, "epoch": 0.9877649180820682, "flos": 21178497974400.0, "grad_norm": 3.804683550860071, "language_loss": 0.79990548, "learning_rate": 1.5624459358158926e-09, "loss": 0.82091671, "num_input_tokens_seen": 354574735, "step": 16429, "time_per_iteration": 4.215754270553589 }, { "auxiliary_loss_clip": 0.01106379, "auxiliary_loss_mlp": 0.01030944, "balance_loss_clip": 1.03527224, "balance_loss_mlp": 1.01933873, "epoch": 0.9878250413347363, "flos": 39749233576320.0, "grad_norm": 1.5905003981287011, "language_loss": 0.6204477, "learning_rate": 1.5470922749845073e-09, "loss": 0.64182091, "num_input_tokens_seen": 354597050, "step": 16430, "time_per_iteration": 2.7417891025543213 }, { "auxiliary_loss_clip": 0.01109651, "auxiliary_loss_mlp": 0.01032938, "balance_loss_clip": 1.03770876, "balance_loss_mlp": 1.02093995, "epoch": 0.9878851645874042, "flos": 29425634599680.0, "grad_norm": 2.4386034001427848, "language_loss": 0.73058724, "learning_rate": 1.531814395687725e-09, "loss": 0.75201309, "num_input_tokens_seen": 354619095, "step": 16431, "time_per_iteration": 2.6387763023376465 }, { "auxiliary_loss_clip": 0.01109492, "auxiliary_loss_mlp": 0.01033474, "balance_loss_clip": 1.03863847, "balance_loss_mlp": 1.02136791, "epoch": 0.9879452878400722, "flos": 15806261168640.0, "grad_norm": 2.1754704610847115, "language_loss": 0.804088, "learning_rate": 1.5166122985048602e-09, "loss": 0.82551765, "num_input_tokens_seen": 354633790, "step": 16432, "time_per_iteration": 2.59206485748291 }, { "auxiliary_loss_clip": 0.01092115, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.0342344, "balance_loss_mlp": 1.01833928, "epoch": 0.9880054110927401, "flos": 22233912318720.0, "grad_norm": 1.600766850259046, "language_loss": 0.80293298, "learning_rate": 1.5014859840123405e-09, "loss": 0.82414687, "num_input_tokens_seen": 354653180, "step": 16433, "time_per_iteration": 2.705249071121216 }, { "auxiliary_loss_clip": 0.01105179, "auxiliary_loss_mlp": 0.01033969, "balance_loss_clip": 1.03657746, "balance_loss_mlp": 1.02180386, "epoch": 0.9880655343454081, "flos": 28763836467840.0, "grad_norm": 2.2504734279341543, "language_loss": 0.6503619, "learning_rate": 1.4864354527837075e-09, "loss": 0.67175341, "num_input_tokens_seen": 354669900, "step": 16434, "time_per_iteration": 2.5459141731262207 }, { "auxiliary_loss_clip": 0.01097534, "auxiliary_loss_mlp": 0.01033458, "balance_loss_clip": 1.03373504, "balance_loss_mlp": 1.0204587, "epoch": 0.988125657598076, "flos": 32853379622400.0, "grad_norm": 1.6032981258064614, "language_loss": 0.69355786, "learning_rate": 1.4714607053896154e-09, "loss": 0.71486771, "num_input_tokens_seen": 354693165, "step": 16435, "time_per_iteration": 2.691948652267456 }, { "auxiliary_loss_clip": 0.01051732, "auxiliary_loss_mlp": 0.01037505, "balance_loss_clip": 1.03534651, "balance_loss_mlp": 1.02496445, "epoch": 0.988185780850744, "flos": 19390685316480.0, "grad_norm": 1.6101555042177864, "language_loss": 0.75285351, "learning_rate": 1.4565617423980548e-09, "loss": 0.77374589, "num_input_tokens_seen": 354711915, "step": 16436, "time_per_iteration": 2.687253475189209 }, { "auxiliary_loss_clip": 0.01078926, "auxiliary_loss_mlp": 0.01034051, "balance_loss_clip": 1.03449368, "balance_loss_mlp": 1.02073479, "epoch": 0.988245904103412, "flos": 22528415928960.0, "grad_norm": 2.1049247557685486, "language_loss": 0.7397666, "learning_rate": 1.4417385643741286e-09, "loss": 0.76089633, "num_input_tokens_seen": 354729135, "step": 16437, "time_per_iteration": 2.6133415699005127 }, { "auxiliary_loss_clip": 0.01070653, "auxiliary_loss_mlp": 0.01032545, "balance_loss_clip": 1.03327727, "balance_loss_mlp": 1.02036154, "epoch": 0.98830602735608, "flos": 28659193171200.0, "grad_norm": 1.7371031624510076, "language_loss": 0.60138786, "learning_rate": 1.4269911718796103e-09, "loss": 0.62241983, "num_input_tokens_seen": 354752530, "step": 16438, "time_per_iteration": 2.747478485107422 }, { "auxiliary_loss_clip": 0.01082521, "auxiliary_loss_mlp": 0.01033624, "balance_loss_clip": 1.03334987, "balance_loss_mlp": 1.02030826, "epoch": 0.9883661506087479, "flos": 20996035862400.0, "grad_norm": 1.7884546303638278, "language_loss": 0.71630102, "learning_rate": 1.4123195654738295e-09, "loss": 0.7374624, "num_input_tokens_seen": 354771135, "step": 16439, "time_per_iteration": 2.64829158782959 }, { "auxiliary_loss_clip": 0.01094806, "auxiliary_loss_mlp": 0.01032761, "balance_loss_clip": 1.03649998, "balance_loss_mlp": 1.02029228, "epoch": 0.9884262738614159, "flos": 32706109860480.0, "grad_norm": 1.9552284330659928, "language_loss": 0.60129845, "learning_rate": 1.3977237457134528e-09, "loss": 0.62257409, "num_input_tokens_seen": 354791800, "step": 16440, "time_per_iteration": 2.709625482559204 }, { "auxiliary_loss_clip": 0.01109217, "auxiliary_loss_mlp": 0.0103132, "balance_loss_clip": 1.03572154, "balance_loss_mlp": 1.01920807, "epoch": 0.9884863971140839, "flos": 17564699479680.0, "grad_norm": 2.3996756667882346, "language_loss": 0.76234657, "learning_rate": 1.3832037131513707e-09, "loss": 0.78375196, "num_input_tokens_seen": 354809200, "step": 16441, "time_per_iteration": 2.5174717903137207 }, { "auxiliary_loss_clip": 0.01084665, "auxiliary_loss_mlp": 0.0102841, "balance_loss_clip": 1.03476977, "balance_loss_mlp": 1.0158329, "epoch": 0.9885465203667518, "flos": 40552519380480.0, "grad_norm": 1.8936887176516917, "language_loss": 0.67978179, "learning_rate": 1.3687594683386982e-09, "loss": 0.70091248, "num_input_tokens_seen": 354829945, "step": 16442, "time_per_iteration": 2.780667781829834 }, { "auxiliary_loss_clip": 0.01094262, "auxiliary_loss_mlp": 0.01030255, "balance_loss_clip": 1.03508973, "balance_loss_mlp": 1.01828051, "epoch": 0.9886066436194199, "flos": 13807976768640.0, "grad_norm": 2.546287070023655, "language_loss": 0.74541289, "learning_rate": 1.3543910118227753e-09, "loss": 0.76665807, "num_input_tokens_seen": 354845055, "step": 16443, "time_per_iteration": 2.5256857872009277 }, { "auxiliary_loss_clip": 0.01085844, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 1.03409505, "balance_loss_mlp": 1.01652241, "epoch": 0.9886667668720878, "flos": 23325129544320.0, "grad_norm": 6.700934436882059, "language_loss": 0.73816478, "learning_rate": 1.3400983441487213e-09, "loss": 0.75931776, "num_input_tokens_seen": 354864680, "step": 16444, "time_per_iteration": 2.6347739696502686 }, { "auxiliary_loss_clip": 0.01058824, "auxiliary_loss_mlp": 0.0103504, "balance_loss_clip": 1.03483725, "balance_loss_mlp": 1.02182567, "epoch": 0.9887268901247558, "flos": 22706029704960.0, "grad_norm": 2.0399337200236483, "language_loss": 0.69289607, "learning_rate": 1.325881465858547e-09, "loss": 0.7138347, "num_input_tokens_seen": 354885685, "step": 16445, "time_per_iteration": 2.7339391708374023 }, { "auxiliary_loss_clip": 0.01101302, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.03817463, "balance_loss_mlp": 1.01369166, "epoch": 0.9887870133774237, "flos": 13041283944960.0, "grad_norm": 2.484106533550889, "language_loss": 0.60372651, "learning_rate": 1.311740377491155e-09, "loss": 0.625, "num_input_tokens_seen": 354901505, "step": 16446, "time_per_iteration": 2.571403980255127 }, { "auxiliary_loss_clip": 0.01080619, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.03539968, "balance_loss_mlp": 1.02072275, "epoch": 0.9888471366300917, "flos": 15158864390400.0, "grad_norm": 2.54961121966749, "language_loss": 0.71147966, "learning_rate": 1.297675079582783e-09, "loss": 0.73260915, "num_input_tokens_seen": 354920060, "step": 16447, "time_per_iteration": 2.6204898357391357 }, { "auxiliary_loss_clip": 0.01106743, "auxiliary_loss_mlp": 0.00769349, "balance_loss_clip": 1.03625035, "balance_loss_mlp": 1.00023174, "epoch": 0.9889072598827596, "flos": 25118796119040.0, "grad_norm": 2.22311895255621, "language_loss": 0.83816832, "learning_rate": 1.2836855726667818e-09, "loss": 0.85692918, "num_input_tokens_seen": 354938690, "step": 16448, "time_per_iteration": 2.615037679672241 }, { "auxiliary_loss_clip": 0.01093774, "auxiliary_loss_mlp": 0.01028295, "balance_loss_clip": 1.03621387, "balance_loss_mlp": 1.0171665, "epoch": 0.9889673831354276, "flos": 16728663450240.0, "grad_norm": 1.5811514661156387, "language_loss": 0.69698024, "learning_rate": 1.26977185727406e-09, "loss": 0.71820092, "num_input_tokens_seen": 354956955, "step": 16449, "time_per_iteration": 2.5541889667510986 }, { "auxiliary_loss_clip": 0.0109972, "auxiliary_loss_mlp": 0.0102696, "balance_loss_clip": 1.03743207, "balance_loss_mlp": 1.01456869, "epoch": 0.9890275063880956, "flos": 35585175657600.0, "grad_norm": 2.2330985869575106, "language_loss": 0.7364139, "learning_rate": 1.25593393393153e-09, "loss": 0.75768065, "num_input_tokens_seen": 354976800, "step": 16450, "time_per_iteration": 2.722463846206665 }, { "auxiliary_loss_clip": 0.01108427, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.0343194, "balance_loss_mlp": 1.01945782, "epoch": 0.9890876296407636, "flos": 18952359649920.0, "grad_norm": 1.7405814084688636, "language_loss": 0.79538721, "learning_rate": 1.242171803164549e-09, "loss": 0.81679177, "num_input_tokens_seen": 354996625, "step": 16451, "time_per_iteration": 2.5799307823181152 }, { "auxiliary_loss_clip": 0.01072025, "auxiliary_loss_mlp": 0.01037826, "balance_loss_clip": 1.03292084, "balance_loss_mlp": 1.02433717, "epoch": 0.9891477528934315, "flos": 23769309127680.0, "grad_norm": 2.076913559177625, "language_loss": 0.70177102, "learning_rate": 1.2284854654946996e-09, "loss": 0.72286958, "num_input_tokens_seen": 355014535, "step": 16452, "time_per_iteration": 2.6568350791931152 }, { "auxiliary_loss_clip": 0.01106285, "auxiliary_loss_mlp": 0.010265, "balance_loss_clip": 1.03735638, "balance_loss_mlp": 1.01531219, "epoch": 0.9892078761460995, "flos": 20772922533120.0, "grad_norm": 1.7039408259240933, "language_loss": 0.73759556, "learning_rate": 1.2148749214409004e-09, "loss": 0.75892341, "num_input_tokens_seen": 355033280, "step": 16453, "time_per_iteration": 2.526846170425415 }, { "auxiliary_loss_clip": 0.01068886, "auxiliary_loss_mlp": 0.0103825, "balance_loss_clip": 1.03598034, "balance_loss_mlp": 1.02607906, "epoch": 0.9892679993987675, "flos": 23367827836800.0, "grad_norm": 2.0358391498117765, "language_loss": 0.69925886, "learning_rate": 1.2013401715191828e-09, "loss": 0.72033024, "num_input_tokens_seen": 355053320, "step": 16454, "time_per_iteration": 2.7736165523529053 }, { "auxiliary_loss_clip": 0.01077684, "auxiliary_loss_mlp": 0.01031622, "balance_loss_clip": 1.03315997, "balance_loss_mlp": 1.01950455, "epoch": 0.9893281226514354, "flos": 22705419173760.0, "grad_norm": 1.754815441534383, "language_loss": 0.75814426, "learning_rate": 1.1878812162433583e-09, "loss": 0.77923727, "num_input_tokens_seen": 355070230, "step": 16455, "time_per_iteration": 2.626431941986084 }, { "auxiliary_loss_clip": 0.0107961, "auxiliary_loss_mlp": 0.01026151, "balance_loss_clip": 1.03627825, "balance_loss_mlp": 1.01435518, "epoch": 0.9893882459041035, "flos": 21796664060160.0, "grad_norm": 2.3755436774026037, "language_loss": 0.6552164, "learning_rate": 1.1744980561230188e-09, "loss": 0.676274, "num_input_tokens_seen": 355090125, "step": 16456, "time_per_iteration": 2.6569387912750244 }, { "auxiliary_loss_clip": 0.01100413, "auxiliary_loss_mlp": 0.01031079, "balance_loss_clip": 1.03849792, "balance_loss_mlp": 1.01922965, "epoch": 0.9894483691567714, "flos": 18113773754880.0, "grad_norm": 2.3001936476450484, "language_loss": 0.73839563, "learning_rate": 1.161190691666203e-09, "loss": 0.75971055, "num_input_tokens_seen": 355107890, "step": 16457, "time_per_iteration": 2.674736738204956 }, { "auxiliary_loss_clip": 0.01108737, "auxiliary_loss_mlp": 0.01029092, "balance_loss_clip": 1.03762496, "balance_loss_mlp": 1.01680112, "epoch": 0.9895084924094394, "flos": 31211615664000.0, "grad_norm": 2.2264264445995474, "language_loss": 0.6879859, "learning_rate": 1.1479591233773954e-09, "loss": 0.70936424, "num_input_tokens_seen": 355126340, "step": 16458, "time_per_iteration": 2.615215301513672 }, { "auxiliary_loss_clip": 0.01093615, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.0354172, "balance_loss_mlp": 1.01881158, "epoch": 0.9895686156621073, "flos": 19678042120320.0, "grad_norm": 1.6836703680245058, "language_loss": 0.79359543, "learning_rate": 1.1348033517581956e-09, "loss": 0.81483769, "num_input_tokens_seen": 355144025, "step": 16459, "time_per_iteration": 2.5571677684783936 }, { "auxiliary_loss_clip": 0.01083172, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.03401232, "balance_loss_mlp": 1.02273118, "epoch": 0.9896287389147753, "flos": 23581675457280.0, "grad_norm": 1.883745911652252, "language_loss": 0.7132234, "learning_rate": 1.1217233773075373e-09, "loss": 0.73440349, "num_input_tokens_seen": 355163125, "step": 16460, "time_per_iteration": 2.626668691635132 }, { "auxiliary_loss_clip": 0.01086508, "auxiliary_loss_mlp": 0.01026002, "balance_loss_clip": 1.03445435, "balance_loss_mlp": 1.01346099, "epoch": 0.9896888621674432, "flos": 29605331364480.0, "grad_norm": 1.5613662208047323, "language_loss": 0.87661237, "learning_rate": 1.1087192005214685e-09, "loss": 0.8977375, "num_input_tokens_seen": 355184060, "step": 16461, "time_per_iteration": 2.7060861587524414 }, { "auxiliary_loss_clip": 0.01095459, "auxiliary_loss_mlp": 0.01032824, "balance_loss_clip": 1.03561902, "balance_loss_mlp": 1.01949632, "epoch": 0.9897489854201112, "flos": 23695045758720.0, "grad_norm": 1.7346501147556415, "language_loss": 0.62446827, "learning_rate": 1.09579082189315e-09, "loss": 0.64575106, "num_input_tokens_seen": 355204505, "step": 16462, "time_per_iteration": 2.64906907081604 }, { "auxiliary_loss_clip": 0.01100978, "auxiliary_loss_mlp": 0.01031988, "balance_loss_clip": 1.03905725, "balance_loss_mlp": 1.02028179, "epoch": 0.9898091086727792, "flos": 13225146687360.0, "grad_norm": 1.8196712786211515, "language_loss": 0.72961009, "learning_rate": 1.0829382419126343e-09, "loss": 0.75093973, "num_input_tokens_seen": 355223055, "step": 16463, "time_per_iteration": 5.664719343185425 }, { "auxiliary_loss_clip": 0.01097369, "auxiliary_loss_mlp": 0.01031261, "balance_loss_clip": 1.03589058, "balance_loss_mlp": 1.01759946, "epoch": 0.9898692319254472, "flos": 22930400010240.0, "grad_norm": 1.8790074246381347, "language_loss": 0.69955069, "learning_rate": 1.0701614610675314e-09, "loss": 0.720837, "num_input_tokens_seen": 355242000, "step": 16464, "time_per_iteration": 4.500953197479248 }, { "auxiliary_loss_clip": 0.01079876, "auxiliary_loss_mlp": 0.0102935, "balance_loss_clip": 1.03554177, "balance_loss_mlp": 1.01688099, "epoch": 0.9899293551781151, "flos": 12458346122880.0, "grad_norm": 2.0237880256001635, "language_loss": 0.73348618, "learning_rate": 1.0574604798421204e-09, "loss": 0.75457835, "num_input_tokens_seen": 355260175, "step": 16465, "time_per_iteration": 2.6900930404663086 }, { "auxiliary_loss_clip": 0.01104028, "auxiliary_loss_mlp": 0.010347, "balance_loss_clip": 1.03416681, "balance_loss_mlp": 1.02323794, "epoch": 0.9899894784307831, "flos": 26871129118080.0, "grad_norm": 1.754568063294171, "language_loss": 0.86540592, "learning_rate": 1.0448352987182386e-09, "loss": 0.88679326, "num_input_tokens_seen": 355281930, "step": 16466, "time_per_iteration": 2.5950276851654053 }, { "auxiliary_loss_clip": 0.01071496, "auxiliary_loss_mlp": 0.01024722, "balance_loss_clip": 1.03584099, "balance_loss_mlp": 1.01242614, "epoch": 0.990049601683451, "flos": 21542093395200.0, "grad_norm": 1.7230422201542275, "language_loss": 0.71486777, "learning_rate": 1.0322859181743915e-09, "loss": 0.73583001, "num_input_tokens_seen": 355301555, "step": 16467, "time_per_iteration": 2.7708022594451904 }, { "auxiliary_loss_clip": 0.0108033, "auxiliary_loss_mlp": 0.0104001, "balance_loss_clip": 1.03324151, "balance_loss_mlp": 1.02659893, "epoch": 0.990109724936119, "flos": 28771809287040.0, "grad_norm": 1.3753584839252895, "language_loss": 0.65033233, "learning_rate": 1.019812338686643e-09, "loss": 0.67153573, "num_input_tokens_seen": 355324925, "step": 16468, "time_per_iteration": 4.24141263961792 }, { "auxiliary_loss_clip": 0.01079098, "auxiliary_loss_mlp": 0.0103126, "balance_loss_clip": 1.03625393, "balance_loss_mlp": 1.01935673, "epoch": 0.9901698481887871, "flos": 29274270687360.0, "grad_norm": 2.0452120517655943, "language_loss": 0.62340331, "learning_rate": 1.0074145607281704e-09, "loss": 0.64450687, "num_input_tokens_seen": 355343875, "step": 16469, "time_per_iteration": 2.7885043621063232 }, { "auxiliary_loss_clip": 0.01073759, "auxiliary_loss_mlp": 0.01031927, "balance_loss_clip": 1.03337479, "balance_loss_mlp": 1.01906407, "epoch": 0.990229971441455, "flos": 15959025711360.0, "grad_norm": 2.562370039861896, "language_loss": 0.70241368, "learning_rate": 9.950925847685976e-10, "loss": 0.72347051, "num_input_tokens_seen": 355358835, "step": 16470, "time_per_iteration": 2.6540679931640625 }, { "auxiliary_loss_clip": 0.01019159, "auxiliary_loss_mlp": 0.01000231, "balance_loss_clip": 1.00684953, "balance_loss_mlp": 0.99926519, "epoch": 0.990290094694123, "flos": 69780287911680.0, "grad_norm": 0.6776686516780072, "language_loss": 0.55451435, "learning_rate": 9.828464112755509e-10, "loss": 0.57470822, "num_input_tokens_seen": 355431225, "step": 16471, "time_per_iteration": 3.345576047897339 }, { "auxiliary_loss_clip": 0.01088522, "auxiliary_loss_mlp": 0.01034754, "balance_loss_clip": 1.03816175, "balance_loss_mlp": 1.02205849, "epoch": 0.9903502179467909, "flos": 16252451913600.0, "grad_norm": 2.029976016877621, "language_loss": 0.83828497, "learning_rate": 9.706760407131032e-10, "loss": 0.85951781, "num_input_tokens_seen": 355448250, "step": 16472, "time_per_iteration": 2.7130064964294434 }, { "auxiliary_loss_clip": 0.01095822, "auxiliary_loss_mlp": 0.01026203, "balance_loss_clip": 1.03632557, "balance_loss_mlp": 1.01452053, "epoch": 0.9904103411994589, "flos": 21688393489920.0, "grad_norm": 1.9314835507092933, "language_loss": 0.8592447, "learning_rate": 9.585814735431075e-10, "loss": 0.88046497, "num_input_tokens_seen": 355467040, "step": 16473, "time_per_iteration": 2.6023082733154297 }, { "auxiliary_loss_clip": 0.01105804, "auxiliary_loss_mlp": 0.01029675, "balance_loss_clip": 1.03511405, "balance_loss_mlp": 1.01830196, "epoch": 0.9904704644521268, "flos": 25739440243200.0, "grad_norm": 1.8812560615029836, "language_loss": 0.84657192, "learning_rate": 9.465627102240859e-10, "loss": 0.86792672, "num_input_tokens_seen": 355487825, "step": 16474, "time_per_iteration": 2.6265671253204346 }, { "auxiliary_loss_clip": 0.01079812, "auxiliary_loss_mlp": 0.01035461, "balance_loss_clip": 1.03155684, "balance_loss_mlp": 1.0240823, "epoch": 0.9905305877047949, "flos": 21908346422400.0, "grad_norm": 1.8096895142828726, "language_loss": 0.76610988, "learning_rate": 9.346197512116738e-10, "loss": 0.78726262, "num_input_tokens_seen": 355507445, "step": 16475, "time_per_iteration": 2.642179012298584 }, { "auxiliary_loss_clip": 0.0106673, "auxiliary_loss_mlp": 0.01035079, "balance_loss_clip": 1.03210354, "balance_loss_mlp": 1.02151895, "epoch": 0.9905907109574628, "flos": 21392417422080.0, "grad_norm": 1.7909726122896426, "language_loss": 0.76034641, "learning_rate": 9.227525969588423e-10, "loss": 0.78136444, "num_input_tokens_seen": 355527205, "step": 16476, "time_per_iteration": 2.6616551876068115 }, { "auxiliary_loss_clip": 0.01101675, "auxiliary_loss_mlp": 0.00771329, "balance_loss_clip": 1.03651643, "balance_loss_mlp": 1.00030255, "epoch": 0.9906508342101308, "flos": 20521620005760.0, "grad_norm": 2.1261117563309884, "language_loss": 0.6759547, "learning_rate": 9.109612479154538e-10, "loss": 0.69468474, "num_input_tokens_seen": 355544740, "step": 16477, "time_per_iteration": 2.5836856365203857 }, { "auxiliary_loss_clip": 0.0109369, "auxiliary_loss_mlp": 0.01034434, "balance_loss_clip": 1.03875303, "balance_loss_mlp": 1.02113652, "epoch": 0.9907109574627987, "flos": 21361211481600.0, "grad_norm": 2.3791528799950283, "language_loss": 0.71740925, "learning_rate": 8.992457045289282e-10, "loss": 0.7386905, "num_input_tokens_seen": 355564385, "step": 16478, "time_per_iteration": 2.6684231758117676 }, { "auxiliary_loss_clip": 0.0110905, "auxiliary_loss_mlp": 0.01040049, "balance_loss_clip": 1.03671718, "balance_loss_mlp": 1.02660859, "epoch": 0.9907710807154667, "flos": 17338605321600.0, "grad_norm": 2.44296615516407, "language_loss": 0.80982149, "learning_rate": 8.876059672433545e-10, "loss": 0.83131254, "num_input_tokens_seen": 355579260, "step": 16479, "time_per_iteration": 2.536628484725952 }, { "auxiliary_loss_clip": 0.01099491, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.03593194, "balance_loss_mlp": 1.02183723, "epoch": 0.9908312039681346, "flos": 28621881918720.0, "grad_norm": 1.8095138235680064, "language_loss": 0.66404873, "learning_rate": 8.760420364999355e-10, "loss": 0.68538064, "num_input_tokens_seen": 355599790, "step": 16480, "time_per_iteration": 2.675546884536743 }, { "auxiliary_loss_clip": 0.0109416, "auxiliary_loss_mlp": 0.01032967, "balance_loss_clip": 1.03466868, "balance_loss_mlp": 1.02073646, "epoch": 0.9908913272208026, "flos": 35770654512000.0, "grad_norm": 1.7378127875185636, "language_loss": 0.72355247, "learning_rate": 8.645539127374313e-10, "loss": 0.74482375, "num_input_tokens_seen": 355620925, "step": 16481, "time_per_iteration": 2.702287197113037 }, { "auxiliary_loss_clip": 0.01095367, "auxiliary_loss_mlp": 0.01023626, "balance_loss_clip": 1.03528941, "balance_loss_mlp": 1.01157379, "epoch": 0.9909514504734707, "flos": 19902196944000.0, "grad_norm": 1.99195789913312, "language_loss": 0.77529383, "learning_rate": 8.531415963912713e-10, "loss": 0.79648376, "num_input_tokens_seen": 355639165, "step": 16482, "time_per_iteration": 2.623577117919922 }, { "auxiliary_loss_clip": 0.01099605, "auxiliary_loss_mlp": 0.01030033, "balance_loss_clip": 1.03522539, "balance_loss_mlp": 1.01804006, "epoch": 0.9910115737261386, "flos": 20004793165440.0, "grad_norm": 1.7513452456570024, "language_loss": 0.75167656, "learning_rate": 8.418050878944427e-10, "loss": 0.772973, "num_input_tokens_seen": 355657320, "step": 16483, "time_per_iteration": 2.6707489490509033 }, { "auxiliary_loss_clip": 0.01018817, "auxiliary_loss_mlp": 0.01002356, "balance_loss_clip": 1.00542712, "balance_loss_mlp": 1.0013783, "epoch": 0.9910716969788066, "flos": 70688432494080.0, "grad_norm": 0.6739717924016945, "language_loss": 0.53652573, "learning_rate": 8.305443876768237e-10, "loss": 0.55673742, "num_input_tokens_seen": 355726370, "step": 16484, "time_per_iteration": 3.2860820293426514 }, { "auxiliary_loss_clip": 0.01103552, "auxiliary_loss_mlp": 0.0103248, "balance_loss_clip": 1.03575015, "balance_loss_mlp": 1.0208987, "epoch": 0.9911318202314745, "flos": 21434038306560.0, "grad_norm": 1.844066586555626, "language_loss": 0.82151747, "learning_rate": 8.19359496165184e-10, "loss": 0.84287775, "num_input_tokens_seen": 355745840, "step": 16485, "time_per_iteration": 2.572507619857788 }, { "auxiliary_loss_clip": 0.0106644, "auxiliary_loss_mlp": 0.01039998, "balance_loss_clip": 1.03260577, "balance_loss_mlp": 1.02652156, "epoch": 0.9911919434841425, "flos": 19826820253440.0, "grad_norm": 1.5753889689051752, "language_loss": 0.81565136, "learning_rate": 8.082504137836288e-10, "loss": 0.83671576, "num_input_tokens_seen": 355763385, "step": 16486, "time_per_iteration": 2.6582581996917725 }, { "auxiliary_loss_clip": 0.01099209, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.03707922, "balance_loss_mlp": 1.02275944, "epoch": 0.9912520667368104, "flos": 41719364691840.0, "grad_norm": 1.4744278341882329, "language_loss": 0.66241539, "learning_rate": 7.972171409538209e-10, "loss": 0.68375158, "num_input_tokens_seen": 355786075, "step": 16487, "time_per_iteration": 2.90215802192688 }, { "auxiliary_loss_clip": 0.01094686, "auxiliary_loss_mlp": 0.00769572, "balance_loss_clip": 1.03547466, "balance_loss_mlp": 1.00026965, "epoch": 0.9913121899894785, "flos": 23769668263680.0, "grad_norm": 1.5978817773669494, "language_loss": 0.76796007, "learning_rate": 7.862596780936481e-10, "loss": 0.78660262, "num_input_tokens_seen": 355806295, "step": 16488, "time_per_iteration": 2.771479368209839 }, { "auxiliary_loss_clip": 0.01080089, "auxiliary_loss_mlp": 0.01030538, "balance_loss_clip": 1.03599024, "balance_loss_mlp": 1.01780689, "epoch": 0.9913723132421464, "flos": 23769668263680.0, "grad_norm": 9.931415679730078, "language_loss": 0.68562698, "learning_rate": 7.753780256190001e-10, "loss": 0.70673329, "num_input_tokens_seen": 355825730, "step": 16489, "time_per_iteration": 2.8262085914611816 }, { "auxiliary_loss_clip": 0.00990057, "auxiliary_loss_mlp": 0.01006045, "balance_loss_clip": 1.00620961, "balance_loss_mlp": 1.00509155, "epoch": 0.9914324364948144, "flos": 71267419820160.0, "grad_norm": 0.6117813667004609, "language_loss": 0.52562964, "learning_rate": 7.645721839424357e-10, "loss": 0.54559064, "num_input_tokens_seen": 355891545, "step": 16490, "time_per_iteration": 3.339395523071289 }, { "auxiliary_loss_clip": 0.01081829, "auxiliary_loss_mlp": 0.01038351, "balance_loss_clip": 1.03499651, "balance_loss_mlp": 1.02465963, "epoch": 0.9914925597474823, "flos": 23695440808320.0, "grad_norm": 1.555410578963239, "language_loss": 0.75512695, "learning_rate": 7.538421534734052e-10, "loss": 0.7763288, "num_input_tokens_seen": 355909920, "step": 16491, "time_per_iteration": 2.7577908039093018 }, { "auxiliary_loss_clip": 0.01068183, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.03941417, "balance_loss_mlp": 1.02027285, "epoch": 0.9915526830001503, "flos": 13433822749440.0, "grad_norm": 2.4260837732983656, "language_loss": 0.70534217, "learning_rate": 7.431879346191383e-10, "loss": 0.72635806, "num_input_tokens_seen": 355923130, "step": 16492, "time_per_iteration": 2.717663288116455 }, { "auxiliary_loss_clip": 0.01072141, "auxiliary_loss_mlp": 0.01033142, "balance_loss_clip": 1.03324449, "balance_loss_mlp": 1.02005327, "epoch": 0.9916128062528182, "flos": 20740962407040.0, "grad_norm": 1.9482484238506383, "language_loss": 0.6859107, "learning_rate": 7.326095277837563e-10, "loss": 0.7069636, "num_input_tokens_seen": 355941960, "step": 16493, "time_per_iteration": 2.626917839050293 }, { "auxiliary_loss_clip": 0.01084989, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.03742933, "balance_loss_mlp": 1.02203131, "epoch": 0.9916729295054862, "flos": 22487082353280.0, "grad_norm": 1.8545300883783822, "language_loss": 0.7110008, "learning_rate": 7.221069333678276e-10, "loss": 0.73219669, "num_input_tokens_seen": 355961640, "step": 16494, "time_per_iteration": 2.6934683322906494 }, { "auxiliary_loss_clip": 0.01098932, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.0362227, "balance_loss_mlp": 1.01829231, "epoch": 0.9917330527581543, "flos": 14792467708800.0, "grad_norm": 3.5378309901622007, "language_loss": 0.68413657, "learning_rate": 7.116801517701443e-10, "loss": 0.70544124, "num_input_tokens_seen": 355977980, "step": 16495, "time_per_iteration": 2.6093251705169678 }, { "auxiliary_loss_clip": 0.0101026, "auxiliary_loss_mlp": 0.01002792, "balance_loss_clip": 1.00642037, "balance_loss_mlp": 1.00182664, "epoch": 0.9917931760108222, "flos": 59191595585280.0, "grad_norm": 0.7170668056568608, "language_loss": 0.53470147, "learning_rate": 7.013291833859458e-10, "loss": 0.55483198, "num_input_tokens_seen": 356042900, "step": 16496, "time_per_iteration": 3.3146417140960693 }, { "auxiliary_loss_clip": 0.01085309, "auxiliary_loss_mlp": 0.00774025, "balance_loss_clip": 1.03571773, "balance_loss_mlp": 1.0002538, "epoch": 0.9918532992634902, "flos": 26761637485440.0, "grad_norm": 1.4656425983930446, "language_loss": 0.71419513, "learning_rate": 6.91054028607585e-10, "loss": 0.73278844, "num_input_tokens_seen": 356063000, "step": 16497, "time_per_iteration": 2.7043471336364746 }, { "auxiliary_loss_clip": 0.01081862, "auxiliary_loss_mlp": 0.01033667, "balance_loss_clip": 1.03701067, "balance_loss_mlp": 1.02047634, "epoch": 0.9919134225161581, "flos": 14975719920000.0, "grad_norm": 2.009806913835038, "language_loss": 0.82173979, "learning_rate": 6.808546878249721e-10, "loss": 0.84289509, "num_input_tokens_seen": 356078130, "step": 16498, "time_per_iteration": 2.7074508666992188 }, { "auxiliary_loss_clip": 0.01075485, "auxiliary_loss_mlp": 0.01035966, "balance_loss_clip": 1.03759611, "balance_loss_mlp": 1.02313316, "epoch": 0.9919735457688261, "flos": 27818201064960.0, "grad_norm": 1.7332426459484291, "language_loss": 0.68403494, "learning_rate": 6.707311614246869e-10, "loss": 0.70514941, "num_input_tokens_seen": 356101655, "step": 16499, "time_per_iteration": 2.7545838356018066 }, { "auxiliary_loss_clip": 0.01111074, "auxiliary_loss_mlp": 0.01029883, "balance_loss_clip": 1.03827095, "balance_loss_mlp": 1.01769972, "epoch": 0.992033669021494, "flos": 22562782266240.0, "grad_norm": 1.7667057026294906, "language_loss": 0.8223446, "learning_rate": 6.606834497904223e-10, "loss": 0.84375417, "num_input_tokens_seen": 356121425, "step": 16500, "time_per_iteration": 2.587153911590576 }, { "auxiliary_loss_clip": 0.01080633, "auxiliary_loss_mlp": 0.01032066, "balance_loss_clip": 1.03477311, "balance_loss_mlp": 1.01933479, "epoch": 0.9920937922741621, "flos": 25374587846400.0, "grad_norm": 1.774651095771433, "language_loss": 0.81949353, "learning_rate": 6.507115533036511e-10, "loss": 0.84062058, "num_input_tokens_seen": 356140710, "step": 16501, "time_per_iteration": 2.769408702850342 }, { "auxiliary_loss_clip": 0.01098639, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.03593433, "balance_loss_mlp": 1.01757097, "epoch": 0.99215391552683, "flos": 22054466949120.0, "grad_norm": 2.025781816358009, "language_loss": 0.76823413, "learning_rate": 6.408154723420711e-10, "loss": 0.78952026, "num_input_tokens_seen": 356159835, "step": 16502, "time_per_iteration": 4.115024566650391 }, { "auxiliary_loss_clip": 0.01083856, "auxiliary_loss_mlp": 0.01031996, "balance_loss_clip": 1.03553808, "balance_loss_mlp": 1.01820326, "epoch": 0.992214038779498, "flos": 15413937845760.0, "grad_norm": 3.0841815262581127, "language_loss": 0.71393132, "learning_rate": 6.309952072811597e-10, "loss": 0.7350899, "num_input_tokens_seen": 356177555, "step": 16503, "time_per_iteration": 4.208997964859009 }, { "auxiliary_loss_clip": 0.0101848, "auxiliary_loss_mlp": 0.01003931, "balance_loss_clip": 1.00507569, "balance_loss_mlp": 1.00273323, "epoch": 0.9922741620321659, "flos": 62014498467840.0, "grad_norm": 0.631144225573371, "language_loss": 0.55076844, "learning_rate": 6.212507584932858e-10, "loss": 0.57099259, "num_input_tokens_seen": 356244975, "step": 16504, "time_per_iteration": 4.945772647857666 }, { "auxiliary_loss_clip": 0.01075926, "auxiliary_loss_mlp": 0.01024279, "balance_loss_clip": 1.0352273, "balance_loss_mlp": 1.01286459, "epoch": 0.9923342852848339, "flos": 17165480745600.0, "grad_norm": 1.970652781818717, "language_loss": 0.6979568, "learning_rate": 6.115821263481536e-10, "loss": 0.71895891, "num_input_tokens_seen": 356262605, "step": 16505, "time_per_iteration": 2.655355453491211 }, { "auxiliary_loss_clip": 0.01074237, "auxiliary_loss_mlp": 0.01032467, "balance_loss_clip": 1.0348562, "balance_loss_mlp": 1.01904368, "epoch": 0.9923944085375018, "flos": 23183210908800.0, "grad_norm": 1.9797286096997044, "language_loss": 0.65255636, "learning_rate": 6.019893112119146e-10, "loss": 0.67362338, "num_input_tokens_seen": 356278935, "step": 16506, "time_per_iteration": 2.8993325233459473 }, { "auxiliary_loss_clip": 0.01044661, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.03355384, "balance_loss_mlp": 1.01587033, "epoch": 0.9924545317901698, "flos": 20813861059200.0, "grad_norm": 3.222870025511436, "language_loss": 0.62715226, "learning_rate": 5.924723134487219e-10, "loss": 0.64788526, "num_input_tokens_seen": 356295675, "step": 16507, "time_per_iteration": 4.278958559036255 }, { "auxiliary_loss_clip": 0.01108709, "auxiliary_loss_mlp": 0.01033793, "balance_loss_clip": 1.03649449, "balance_loss_mlp": 1.02098358, "epoch": 0.9925146550428379, "flos": 20083437993600.0, "grad_norm": 2.2915394393265567, "language_loss": 0.73150027, "learning_rate": 5.830311334193983e-10, "loss": 0.75292528, "num_input_tokens_seen": 356312885, "step": 16508, "time_per_iteration": 2.5459229946136475 }, { "auxiliary_loss_clip": 0.01107576, "auxiliary_loss_mlp": 0.0102854, "balance_loss_clip": 1.03538799, "balance_loss_mlp": 1.01548636, "epoch": 0.9925747782955058, "flos": 24973717086720.0, "grad_norm": 1.713644660775738, "language_loss": 0.70212501, "learning_rate": 5.736657714818793e-10, "loss": 0.72348613, "num_input_tokens_seen": 356334070, "step": 16509, "time_per_iteration": 2.731260299682617 }, { "auxiliary_loss_clip": 0.01096747, "auxiliary_loss_mlp": 0.01036856, "balance_loss_clip": 1.03462338, "balance_loss_mlp": 1.02401757, "epoch": 0.9926349015481738, "flos": 60472526492160.0, "grad_norm": 1.6611558247345184, "language_loss": 0.68540096, "learning_rate": 5.643762279912146e-10, "loss": 0.70673692, "num_input_tokens_seen": 356359410, "step": 16510, "time_per_iteration": 3.000253438949585 }, { "auxiliary_loss_clip": 0.01074893, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.03464723, "balance_loss_mlp": 1.02426445, "epoch": 0.9926950248008417, "flos": 20741716592640.0, "grad_norm": 2.1741536524362544, "language_loss": 0.81332445, "learning_rate": 5.551625032997886e-10, "loss": 0.83444357, "num_input_tokens_seen": 356378345, "step": 16511, "time_per_iteration": 2.708442211151123 }, { "auxiliary_loss_clip": 0.01064556, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.03382301, "balance_loss_mlp": 1.02089787, "epoch": 0.9927551480535097, "flos": 24352965221760.0, "grad_norm": 1.9230315347792497, "language_loss": 0.91452694, "learning_rate": 5.460245977570998e-10, "loss": 0.93550122, "num_input_tokens_seen": 356397345, "step": 16512, "time_per_iteration": 2.6810131072998047 }, { "auxiliary_loss_clip": 0.00999495, "auxiliary_loss_mlp": 0.01002603, "balance_loss_clip": 1.00600088, "balance_loss_mlp": 1.00150681, "epoch": 0.9928152713061776, "flos": 71275572207360.0, "grad_norm": 0.7168790027045711, "language_loss": 0.55182147, "learning_rate": 5.369625117095378e-10, "loss": 0.57184243, "num_input_tokens_seen": 356459160, "step": 16513, "time_per_iteration": 3.3187079429626465 }, { "auxiliary_loss_clip": 0.01081239, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.03556442, "balance_loss_mlp": 1.01862729, "epoch": 0.9928753945588457, "flos": 57809499045120.0, "grad_norm": 1.3394977995740782, "language_loss": 0.65011883, "learning_rate": 5.279762455006054e-10, "loss": 0.67124498, "num_input_tokens_seen": 356486405, "step": 16514, "time_per_iteration": 2.9586453437805176 }, { "auxiliary_loss_clip": 0.01077404, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.03275108, "balance_loss_mlp": 1.01589894, "epoch": 0.9929355178115136, "flos": 19568981450880.0, "grad_norm": 1.9082841618912534, "language_loss": 0.73075938, "learning_rate": 5.190657994713632e-10, "loss": 0.75182408, "num_input_tokens_seen": 356502905, "step": 16515, "time_per_iteration": 2.7386841773986816 }, { "auxiliary_loss_clip": 0.01065642, "auxiliary_loss_mlp": 0.0104261, "balance_loss_clip": 1.03322613, "balance_loss_mlp": 1.02893686, "epoch": 0.9929956410641816, "flos": 22964658606720.0, "grad_norm": 1.635364336808878, "language_loss": 0.77238375, "learning_rate": 5.102311739593191e-10, "loss": 0.79346621, "num_input_tokens_seen": 356523830, "step": 16516, "time_per_iteration": 2.7601654529571533 }, { "auxiliary_loss_clip": 0.01077729, "auxiliary_loss_mlp": 0.01026442, "balance_loss_clip": 1.0354166, "balance_loss_mlp": 1.01530802, "epoch": 0.9930557643168495, "flos": 22566409539840.0, "grad_norm": 1.7024793755229197, "language_loss": 0.78187561, "learning_rate": 5.014723692997602e-10, "loss": 0.8029173, "num_input_tokens_seen": 356543965, "step": 16517, "time_per_iteration": 2.891570568084717 }, { "auxiliary_loss_clip": 0.01097555, "auxiliary_loss_mlp": 0.01037224, "balance_loss_clip": 1.03813481, "balance_loss_mlp": 1.02333033, "epoch": 0.9931158875695175, "flos": 17201032231680.0, "grad_norm": 2.4652288610488604, "language_loss": 0.67716908, "learning_rate": 4.927893858248655e-10, "loss": 0.69851696, "num_input_tokens_seen": 356561530, "step": 16518, "time_per_iteration": 2.646632432937622 }, { "auxiliary_loss_clip": 0.01008101, "auxiliary_loss_mlp": 0.01002102, "balance_loss_clip": 1.00879121, "balance_loss_mlp": 1.00086808, "epoch": 0.9931760108221854, "flos": 63711204278400.0, "grad_norm": 0.7468001018305941, "language_loss": 0.53340489, "learning_rate": 4.84182223863483e-10, "loss": 0.55350691, "num_input_tokens_seen": 356616845, "step": 16519, "time_per_iteration": 3.0809152126312256 }, { "auxiliary_loss_clip": 0.01065697, "auxiliary_loss_mlp": 0.01041583, "balance_loss_clip": 1.03317142, "balance_loss_mlp": 1.02780831, "epoch": 0.9932361340748534, "flos": 15304805349120.0, "grad_norm": 1.6932132720943704, "language_loss": 0.6033656, "learning_rate": 4.756508837426842e-10, "loss": 0.62443841, "num_input_tokens_seen": 356633560, "step": 16520, "time_per_iteration": 2.7310233116149902 }, { "auxiliary_loss_clip": 0.01078536, "auxiliary_loss_mlp": 0.01034414, "balance_loss_clip": 1.03465533, "balance_loss_mlp": 1.021927, "epoch": 0.9932962573275215, "flos": 36064906727040.0, "grad_norm": 1.7244534802172446, "language_loss": 0.62099916, "learning_rate": 4.671953657853223e-10, "loss": 0.64212871, "num_input_tokens_seen": 356657600, "step": 16521, "time_per_iteration": 2.883345603942871 }, { "auxiliary_loss_clip": 0.01087845, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.03922451, "balance_loss_mlp": 1.02330661, "epoch": 0.9933563805801894, "flos": 21470523546240.0, "grad_norm": 4.353373904102065, "language_loss": 0.74153936, "learning_rate": 4.5881567031225145e-10, "loss": 0.76278472, "num_input_tokens_seen": 356675880, "step": 16522, "time_per_iteration": 2.718522071838379 }, { "auxiliary_loss_clip": 0.0107243, "auxiliary_loss_mlp": 0.01030252, "balance_loss_clip": 1.03389478, "balance_loss_mlp": 1.01866508, "epoch": 0.9934165038328574, "flos": 23986532626560.0, "grad_norm": 1.5310659871247791, "language_loss": 0.73152745, "learning_rate": 4.5051179764143964e-10, "loss": 0.75255424, "num_input_tokens_seen": 356696000, "step": 16523, "time_per_iteration": 2.7667906284332275 }, { "auxiliary_loss_clip": 0.0108301, "auxiliary_loss_mlp": 0.00769257, "balance_loss_clip": 1.03243899, "balance_loss_mlp": 1.00031519, "epoch": 0.9934766270855253, "flos": 21907807718400.0, "grad_norm": 1.6911603974446854, "language_loss": 0.71271038, "learning_rate": 4.422837480875241e-10, "loss": 0.73123306, "num_input_tokens_seen": 356716845, "step": 16524, "time_per_iteration": 2.716357707977295 }, { "auxiliary_loss_clip": 0.0107654, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.0359261, "balance_loss_mlp": 1.01835251, "epoch": 0.9935367503381933, "flos": 17129139160320.0, "grad_norm": 2.2457086362374863, "language_loss": 0.79875743, "learning_rate": 4.341315219624775e-10, "loss": 0.81983173, "num_input_tokens_seen": 356732100, "step": 16525, "time_per_iteration": 2.7416329383850098 }, { "auxiliary_loss_clip": 0.0106301, "auxiliary_loss_mlp": 0.01026465, "balance_loss_clip": 1.03451014, "balance_loss_mlp": 1.01388836, "epoch": 0.9935968735908612, "flos": 22346241125760.0, "grad_norm": 2.0057410904081165, "language_loss": 0.75025058, "learning_rate": 4.2605511957582995e-10, "loss": 0.77114534, "num_input_tokens_seen": 356751480, "step": 16526, "time_per_iteration": 2.772752046585083 }, { "auxiliary_loss_clip": 0.01103657, "auxiliary_loss_mlp": 0.00769996, "balance_loss_clip": 1.03466129, "balance_loss_mlp": 1.0002234, "epoch": 0.9936569968435293, "flos": 29460539640960.0, "grad_norm": 2.362336998601464, "language_loss": 0.72371536, "learning_rate": 4.180545412333369e-10, "loss": 0.74245191, "num_input_tokens_seen": 356772650, "step": 16527, "time_per_iteration": 2.622760057449341 }, { "auxiliary_loss_clip": 0.01088795, "auxiliary_loss_mlp": 0.01031074, "balance_loss_clip": 1.03722143, "balance_loss_mlp": 1.01860452, "epoch": 0.9937171200961972, "flos": 16544046522240.0, "grad_norm": 2.2246858984054185, "language_loss": 0.75991976, "learning_rate": 4.1012978723875547e-10, "loss": 0.78111851, "num_input_tokens_seen": 356788510, "step": 16528, "time_per_iteration": 2.6447527408599854 }, { "auxiliary_loss_clip": 0.01089717, "auxiliary_loss_mlp": 0.01029048, "balance_loss_clip": 1.03432751, "balance_loss_mlp": 1.01511216, "epoch": 0.9937772433488652, "flos": 24390276474240.0, "grad_norm": 2.1876724581944504, "language_loss": 0.6753338, "learning_rate": 4.022808578922898e-10, "loss": 0.6965214, "num_input_tokens_seen": 356809115, "step": 16529, "time_per_iteration": 2.7714054584503174 }, { "auxiliary_loss_clip": 0.01103653, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.03892541, "balance_loss_mlp": 1.02186477, "epoch": 0.9938373666015331, "flos": 15669909141120.0, "grad_norm": 2.3036099926169116, "language_loss": 0.65350854, "learning_rate": 3.9450775349170186e-10, "loss": 0.67490655, "num_input_tokens_seen": 356826410, "step": 16530, "time_per_iteration": 2.6250078678131104 }, { "auxiliary_loss_clip": 0.01093807, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.03568208, "balance_loss_mlp": 1.02088773, "epoch": 0.9938974898542011, "flos": 19496190539520.0, "grad_norm": 3.0743920406722274, "language_loss": 0.71364164, "learning_rate": 3.8681047433186676e-10, "loss": 0.7349087, "num_input_tokens_seen": 356844990, "step": 16531, "time_per_iteration": 2.574047803878784 }, { "auxiliary_loss_clip": 0.01094022, "auxiliary_loss_mlp": 0.01034706, "balance_loss_clip": 1.03513575, "balance_loss_mlp": 1.02152801, "epoch": 0.993957613106869, "flos": 26906896085760.0, "grad_norm": 1.3526587285658505, "language_loss": 0.74083483, "learning_rate": 3.791890207045512e-10, "loss": 0.76212215, "num_input_tokens_seen": 356866530, "step": 16532, "time_per_iteration": 2.6634178161621094 }, { "auxiliary_loss_clip": 0.01051179, "auxiliary_loss_mlp": 0.01032159, "balance_loss_clip": 1.03274739, "balance_loss_mlp": 1.02109611, "epoch": 0.994017736359537, "flos": 14939593816320.0, "grad_norm": 1.6154702582280394, "language_loss": 0.70493329, "learning_rate": 3.7164339289885717e-10, "loss": 0.7257666, "num_input_tokens_seen": 356884660, "step": 16533, "time_per_iteration": 2.721863031387329 }, { "auxiliary_loss_clip": 0.01097407, "auxiliary_loss_mlp": 0.01029682, "balance_loss_clip": 1.03669178, "balance_loss_mlp": 1.01622939, "epoch": 0.9940778596122051, "flos": 15377883569280.0, "grad_norm": 3.6789979959756676, "language_loss": 0.84027219, "learning_rate": 3.641735912007782e-10, "loss": 0.86154306, "num_input_tokens_seen": 356900895, "step": 16534, "time_per_iteration": 2.619920492172241 }, { "auxiliary_loss_clip": 0.01067064, "auxiliary_loss_mlp": 0.01026895, "balance_loss_clip": 1.03194451, "balance_loss_mlp": 1.01531422, "epoch": 0.994137982864873, "flos": 25228108183680.0, "grad_norm": 1.8397563877980199, "language_loss": 0.65771168, "learning_rate": 3.567796158934211e-10, "loss": 0.67865133, "num_input_tokens_seen": 356920985, "step": 16535, "time_per_iteration": 2.744962692260742 }, { "auxiliary_loss_clip": 0.01070974, "auxiliary_loss_mlp": 0.01028223, "balance_loss_clip": 1.03729725, "balance_loss_mlp": 1.01723814, "epoch": 0.994198106117541, "flos": 18442140912000.0, "grad_norm": 2.0070166211015517, "language_loss": 0.64754289, "learning_rate": 3.4946146725767235e-10, "loss": 0.66853487, "num_input_tokens_seen": 356939800, "step": 16536, "time_per_iteration": 2.706944465637207 }, { "auxiliary_loss_clip": 0.01060285, "auxiliary_loss_mlp": 0.01035363, "balance_loss_clip": 1.03116417, "balance_loss_mlp": 1.02181518, "epoch": 0.9942582293702089, "flos": 16654112772480.0, "grad_norm": 1.840427715417003, "language_loss": 0.78430796, "learning_rate": 3.4221914557064357e-10, "loss": 0.80526441, "num_input_tokens_seen": 356957780, "step": 16537, "time_per_iteration": 2.7647006511688232 }, { "auxiliary_loss_clip": 0.01105131, "auxiliary_loss_mlp": 0.01034471, "balance_loss_clip": 1.03843594, "balance_loss_mlp": 1.02052915, "epoch": 0.9943183526228769, "flos": 21944580266880.0, "grad_norm": 1.5935823863373109, "language_loss": 0.68781149, "learning_rate": 3.35052651107004e-10, "loss": 0.70920742, "num_input_tokens_seen": 356979185, "step": 16538, "time_per_iteration": 2.7235569953918457 }, { "auxiliary_loss_clip": 0.01063974, "auxiliary_loss_mlp": 0.01035944, "balance_loss_clip": 1.02961493, "balance_loss_mlp": 1.02304578, "epoch": 0.9943784758755448, "flos": 23842566915840.0, "grad_norm": 1.8915805101103145, "language_loss": 0.75187773, "learning_rate": 3.2796198413853614e-10, "loss": 0.77287686, "num_input_tokens_seen": 356997735, "step": 16539, "time_per_iteration": 2.8062071800231934 }, { "auxiliary_loss_clip": 0.01060765, "auxiliary_loss_mlp": 0.01033233, "balance_loss_clip": 1.03562832, "balance_loss_mlp": 1.02050102, "epoch": 0.9944385991282129, "flos": 21469984842240.0, "grad_norm": 2.030639619740989, "language_loss": 0.70239884, "learning_rate": 3.209471449341361e-10, "loss": 0.72333884, "num_input_tokens_seen": 357015660, "step": 16540, "time_per_iteration": 2.8070261478424072 }, { "auxiliary_loss_clip": 0.01093159, "auxiliary_loss_mlp": 0.01027717, "balance_loss_clip": 1.03431797, "balance_loss_mlp": 1.01676154, "epoch": 0.9944987223808808, "flos": 22927024131840.0, "grad_norm": 1.9807950756120538, "language_loss": 0.75202429, "learning_rate": 3.140081337600353e-10, "loss": 0.77323306, "num_input_tokens_seen": 357034800, "step": 16541, "time_per_iteration": 5.754985570907593 }, { "auxiliary_loss_clip": 0.01080974, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.0349412, "balance_loss_mlp": 1.02397323, "epoch": 0.9945588456335488, "flos": 22383013674240.0, "grad_norm": 1.7422746830873381, "language_loss": 0.76555264, "learning_rate": 3.0714495087891255e-10, "loss": 0.78672391, "num_input_tokens_seen": 357053785, "step": 16542, "time_per_iteration": 2.708519458770752 }, { "auxiliary_loss_clip": 0.01099205, "auxiliary_loss_mlp": 0.01030947, "balance_loss_clip": 1.03627014, "balance_loss_mlp": 1.01776206, "epoch": 0.9946189688862167, "flos": 21397517153280.0, "grad_norm": 2.4054715264061435, "language_loss": 0.74274677, "learning_rate": 3.0035759655122615e-10, "loss": 0.76404828, "num_input_tokens_seen": 357072025, "step": 16543, "time_per_iteration": 5.371897459030151 }, { "auxiliary_loss_clip": 0.01094886, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.03557873, "balance_loss_mlp": 1.02270508, "epoch": 0.9946790921388847, "flos": 12416545670400.0, "grad_norm": 6.4447794910959235, "language_loss": 0.82093954, "learning_rate": 2.9364607103454785e-10, "loss": 0.84225017, "num_input_tokens_seen": 357086960, "step": 16544, "time_per_iteration": 2.648569107055664 }, { "auxiliary_loss_clip": 0.0110726, "auxiliary_loss_mlp": 0.01027758, "balance_loss_clip": 1.0360719, "balance_loss_mlp": 1.01605737, "epoch": 0.9947392153915526, "flos": 19058295836160.0, "grad_norm": 1.8960221821622298, "language_loss": 0.78761363, "learning_rate": 2.870103745831187e-10, "loss": 0.80896378, "num_input_tokens_seen": 357105095, "step": 16545, "time_per_iteration": 2.6322624683380127 }, { "auxiliary_loss_clip": 0.01078686, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.03411245, "balance_loss_mlp": 1.01840401, "epoch": 0.9947993386442207, "flos": 27308808339840.0, "grad_norm": 1.8256767545594197, "language_loss": 0.72650462, "learning_rate": 2.8045050744873733e-10, "loss": 0.74759817, "num_input_tokens_seen": 357125065, "step": 16546, "time_per_iteration": 4.327521562576294 }, { "auxiliary_loss_clip": 0.0109393, "auxiliary_loss_mlp": 0.01034353, "balance_loss_clip": 1.03408468, "balance_loss_mlp": 1.02212179, "epoch": 0.9948594618968887, "flos": 20806498771200.0, "grad_norm": 2.038631565735454, "language_loss": 0.77378041, "learning_rate": 2.739664698798716e-10, "loss": 0.79506326, "num_input_tokens_seen": 357141600, "step": 16547, "time_per_iteration": 2.6839520931243896 }, { "auxiliary_loss_clip": 0.01085655, "auxiliary_loss_mlp": 0.01030142, "balance_loss_clip": 1.03360105, "balance_loss_mlp": 1.01823926, "epoch": 0.9949195851495566, "flos": 23292953936640.0, "grad_norm": 2.7868363629317097, "language_loss": 0.70053595, "learning_rate": 2.67558262122769e-10, "loss": 0.72169393, "num_input_tokens_seen": 357157880, "step": 16548, "time_per_iteration": 2.6629064083099365 }, { "auxiliary_loss_clip": 0.0109367, "auxiliary_loss_mlp": 0.01034955, "balance_loss_clip": 1.03438258, "balance_loss_mlp": 1.02264059, "epoch": 0.9949797084022246, "flos": 18515470527360.0, "grad_norm": 1.8066834079511649, "language_loss": 0.75463164, "learning_rate": 2.6122588442012427e-10, "loss": 0.77591789, "num_input_tokens_seen": 357176705, "step": 16549, "time_per_iteration": 2.6749610900878906 }, { "auxiliary_loss_clip": 0.01080946, "auxiliary_loss_mlp": 0.01034852, "balance_loss_clip": 1.03683913, "balance_loss_mlp": 1.02162528, "epoch": 0.9950398316548925, "flos": 30407719328640.0, "grad_norm": 1.591136058001085, "language_loss": 0.74426466, "learning_rate": 2.5496933701241177e-10, "loss": 0.7654227, "num_input_tokens_seen": 357197630, "step": 16550, "time_per_iteration": 2.8009731769561768 }, { "auxiliary_loss_clip": 0.01058637, "auxiliary_loss_mlp": 0.00770717, "balance_loss_clip": 1.03213239, "balance_loss_mlp": 1.00024307, "epoch": 0.9950999549075605, "flos": 19900868140800.0, "grad_norm": 1.804349453887292, "language_loss": 0.78024846, "learning_rate": 2.4878862013655297e-10, "loss": 0.79854202, "num_input_tokens_seen": 357215445, "step": 16551, "time_per_iteration": 2.7871713638305664 }, { "auxiliary_loss_clip": 0.01090386, "auxiliary_loss_mlp": 0.01032903, "balance_loss_clip": 1.03510332, "balance_loss_mlp": 1.02215683, "epoch": 0.9951600781602284, "flos": 17603555016960.0, "grad_norm": 1.3788671688466283, "language_loss": 0.66577691, "learning_rate": 2.426837340270271e-10, "loss": 0.68700981, "num_input_tokens_seen": 357234285, "step": 16552, "time_per_iteration": 2.7981386184692383 }, { "auxiliary_loss_clip": 0.01108432, "auxiliary_loss_mlp": 0.010277, "balance_loss_clip": 1.03545749, "balance_loss_mlp": 1.01527882, "epoch": 0.9952202014128965, "flos": 28950715952640.0, "grad_norm": 1.414763440540152, "language_loss": 0.81504261, "learning_rate": 2.3665467891520465e-10, "loss": 0.83640391, "num_input_tokens_seen": 357257565, "step": 16553, "time_per_iteration": 2.7514050006866455 }, { "auxiliary_loss_clip": 0.01016193, "auxiliary_loss_mlp": 0.01001561, "balance_loss_clip": 1.00488806, "balance_loss_mlp": 1.00064945, "epoch": 0.9952803246655644, "flos": 70810386145920.0, "grad_norm": 0.7163424076202736, "language_loss": 0.57331538, "learning_rate": 2.3070145503001348e-10, "loss": 0.59349293, "num_input_tokens_seen": 357320205, "step": 16554, "time_per_iteration": 3.343486785888672 }, { "auxiliary_loss_clip": 0.01092483, "auxiliary_loss_mlp": 0.01037848, "balance_loss_clip": 1.03661346, "balance_loss_mlp": 1.02556348, "epoch": 0.9953404479182324, "flos": 21799070271360.0, "grad_norm": 1.572030875373758, "language_loss": 0.77075458, "learning_rate": 2.24824062597051e-10, "loss": 0.79205793, "num_input_tokens_seen": 357340695, "step": 16555, "time_per_iteration": 2.6856164932250977 }, { "auxiliary_loss_clip": 0.01077447, "auxiliary_loss_mlp": 0.01031566, "balance_loss_clip": 1.03370774, "balance_loss_mlp": 1.01910233, "epoch": 0.9954005711709003, "flos": 21937397546880.0, "grad_norm": 2.812435043549039, "language_loss": 0.86056131, "learning_rate": 2.1902250183902793e-10, "loss": 0.8816514, "num_input_tokens_seen": 357357505, "step": 16556, "time_per_iteration": 2.8494834899902344 }, { "auxiliary_loss_clip": 0.01062031, "auxiliary_loss_mlp": 0.01032977, "balance_loss_clip": 1.03475928, "balance_loss_mlp": 1.02018583, "epoch": 0.9954606944235683, "flos": 19354559212800.0, "grad_norm": 1.7341010844964493, "language_loss": 0.73350233, "learning_rate": 2.132967729762125e-10, "loss": 0.75445241, "num_input_tokens_seen": 357375395, "step": 16557, "time_per_iteration": 2.776954412460327 }, { "auxiliary_loss_clip": 0.01096785, "auxiliary_loss_mlp": 0.01035737, "balance_loss_clip": 1.03676844, "balance_loss_mlp": 1.02380407, "epoch": 0.9955208176762362, "flos": 30518611591680.0, "grad_norm": 1.8889126824734808, "language_loss": 0.76071554, "learning_rate": 2.0764687622554233e-10, "loss": 0.78204083, "num_input_tokens_seen": 357397375, "step": 16558, "time_per_iteration": 2.725471258163452 }, { "auxiliary_loss_clip": 0.01082875, "auxiliary_loss_mlp": 0.0103135, "balance_loss_clip": 1.0333569, "balance_loss_mlp": 1.01868999, "epoch": 0.9955809409289043, "flos": 30008249199360.0, "grad_norm": 1.8788857895854807, "language_loss": 0.6342541, "learning_rate": 2.0207281180129044e-10, "loss": 0.65539634, "num_input_tokens_seen": 357418880, "step": 16559, "time_per_iteration": 2.754697322845459 }, { "auxiliary_loss_clip": 0.01094664, "auxiliary_loss_mlp": 0.01027311, "balance_loss_clip": 1.03535438, "balance_loss_mlp": 1.01506233, "epoch": 0.9956410641815723, "flos": 21543278544000.0, "grad_norm": 2.2357683381447044, "language_loss": 0.74527764, "learning_rate": 1.965745799148433e-10, "loss": 0.76649737, "num_input_tokens_seen": 357438310, "step": 16560, "time_per_iteration": 2.675863265991211 }, { "auxiliary_loss_clip": 0.01050704, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.03353262, "balance_loss_mlp": 1.0149498, "epoch": 0.9957011874342402, "flos": 21689470897920.0, "grad_norm": 1.7541279105695071, "language_loss": 0.7902168, "learning_rate": 1.9115218077470073e-10, "loss": 0.81099355, "num_input_tokens_seen": 357457155, "step": 16561, "time_per_iteration": 2.800518751144409 }, { "auxiliary_loss_clip": 0.01105362, "auxiliary_loss_mlp": 0.01030097, "balance_loss_clip": 1.03662086, "balance_loss_mlp": 1.01839638, "epoch": 0.9957613106869082, "flos": 17702667619200.0, "grad_norm": 2.712205364252532, "language_loss": 0.65797567, "learning_rate": 1.8580561458647614e-10, "loss": 0.67933023, "num_input_tokens_seen": 357468060, "step": 16562, "time_per_iteration": 2.6822054386138916 }, { "auxiliary_loss_clip": 0.01086196, "auxiliary_loss_mlp": 0.00770624, "balance_loss_clip": 1.03828645, "balance_loss_mlp": 1.00018549, "epoch": 0.9958214339395761, "flos": 30555994671360.0, "grad_norm": 3.146501927176202, "language_loss": 0.6437341, "learning_rate": 1.805348815528962e-10, "loss": 0.66230226, "num_input_tokens_seen": 357489665, "step": 16563, "time_per_iteration": 2.7867605686187744 }, { "auxiliary_loss_clip": 0.01085894, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.03683317, "balance_loss_mlp": 1.01987505, "epoch": 0.9958815571922441, "flos": 24169174306560.0, "grad_norm": 2.104507608134006, "language_loss": 0.64749634, "learning_rate": 1.7533998187380105e-10, "loss": 0.66868186, "num_input_tokens_seen": 357511975, "step": 16564, "time_per_iteration": 2.7374000549316406 }, { "auxiliary_loss_clip": 0.010846, "auxiliary_loss_mlp": 0.00769579, "balance_loss_clip": 1.03644037, "balance_loss_mlp": 1.00024891, "epoch": 0.995941680444912, "flos": 15487016065920.0, "grad_norm": 2.0341967172049857, "language_loss": 0.7462337, "learning_rate": 1.7022091574636633e-10, "loss": 0.76477551, "num_input_tokens_seen": 357529345, "step": 16565, "time_per_iteration": 2.6312754154205322 }, { "auxiliary_loss_clip": 0.01087362, "auxiliary_loss_mlp": 0.01027615, "balance_loss_clip": 1.03376865, "balance_loss_mlp": 1.0157181, "epoch": 0.9960018036975801, "flos": 18621227145600.0, "grad_norm": 1.7027522514634321, "language_loss": 0.79018843, "learning_rate": 1.6517768336443694e-10, "loss": 0.81133819, "num_input_tokens_seen": 357547615, "step": 16566, "time_per_iteration": 2.6870059967041016 }, { "auxiliary_loss_clip": 0.01056958, "auxiliary_loss_mlp": 0.00769517, "balance_loss_clip": 1.03390598, "balance_loss_mlp": 1.0001384, "epoch": 0.996061926950248, "flos": 20084120352000.0, "grad_norm": 1.7390992367091276, "language_loss": 0.70729011, "learning_rate": 1.6021028491941535e-10, "loss": 0.72555488, "num_input_tokens_seen": 357567380, "step": 16567, "time_per_iteration": 2.7366580963134766 }, { "auxiliary_loss_clip": 0.01097619, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.03566027, "balance_loss_mlp": 1.01965046, "epoch": 0.996122050202916, "flos": 24347829576960.0, "grad_norm": 2.9373159802346076, "language_loss": 0.79025483, "learning_rate": 1.5531872059959538e-10, "loss": 0.81156075, "num_input_tokens_seen": 357586435, "step": 16568, "time_per_iteration": 2.6557395458221436 }, { "auxiliary_loss_clip": 0.01093714, "auxiliary_loss_mlp": 0.01028241, "balance_loss_clip": 1.03664947, "balance_loss_mlp": 1.0173099, "epoch": 0.9961821734555839, "flos": 24199302839040.0, "grad_norm": 1.7081910845577881, "language_loss": 0.81825495, "learning_rate": 1.5050299059060634e-10, "loss": 0.83947456, "num_input_tokens_seen": 357604720, "step": 16569, "time_per_iteration": 2.750368118286133 }, { "auxiliary_loss_clip": 0.0106979, "auxiliary_loss_mlp": 0.00770531, "balance_loss_clip": 1.03594494, "balance_loss_mlp": 1.00018477, "epoch": 0.9962422967082519, "flos": 22633741584000.0, "grad_norm": 1.812159782234162, "language_loss": 0.7033971, "learning_rate": 1.457630950747468e-10, "loss": 0.72180027, "num_input_tokens_seen": 357622345, "step": 16570, "time_per_iteration": 2.6845390796661377 }, { "auxiliary_loss_clip": 0.01079783, "auxiliary_loss_mlp": 0.01026272, "balance_loss_clip": 1.03678036, "balance_loss_mlp": 1.01413023, "epoch": 0.9963024199609198, "flos": 26396030903040.0, "grad_norm": 1.5778392939659474, "language_loss": 0.75031984, "learning_rate": 1.4109903423209502e-10, "loss": 0.77138042, "num_input_tokens_seen": 357642710, "step": 16571, "time_per_iteration": 2.6998531818389893 }, { "auxiliary_loss_clip": 0.01085876, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.03418159, "balance_loss_mlp": 1.01976252, "epoch": 0.9963625432135879, "flos": 16581537342720.0, "grad_norm": 2.056267788643989, "language_loss": 0.79312503, "learning_rate": 1.3651080823939843e-10, "loss": 0.81430686, "num_input_tokens_seen": 357659870, "step": 16572, "time_per_iteration": 2.6602699756622314 }, { "auxiliary_loss_clip": 0.01083413, "auxiliary_loss_mlp": 0.01031239, "balance_loss_clip": 1.0354054, "balance_loss_mlp": 1.01907969, "epoch": 0.9964226664662559, "flos": 26468534505600.0, "grad_norm": 1.9072175246303182, "language_loss": 0.7072866, "learning_rate": 1.3199841727074e-10, "loss": 0.72843313, "num_input_tokens_seen": 357677075, "step": 16573, "time_per_iteration": 2.7399983406066895 }, { "auxiliary_loss_clip": 0.01085736, "auxiliary_loss_mlp": 0.01031859, "balance_loss_clip": 1.03562653, "balance_loss_mlp": 1.01902056, "epoch": 0.9964827897189238, "flos": 27448320764160.0, "grad_norm": 17.706098733972073, "language_loss": 0.63426065, "learning_rate": 1.275618614968721e-10, "loss": 0.65543658, "num_input_tokens_seen": 357696715, "step": 16574, "time_per_iteration": 2.7760708332061768 }, { "auxiliary_loss_clip": 0.01079151, "auxiliary_loss_mlp": 0.01032112, "balance_loss_clip": 1.03830504, "balance_loss_mlp": 1.01859987, "epoch": 0.9965429129715918, "flos": 11721566350080.0, "grad_norm": 2.269343954820431, "language_loss": 0.76514804, "learning_rate": 1.2320114108654856e-10, "loss": 0.78626072, "num_input_tokens_seen": 357712345, "step": 16575, "time_per_iteration": 2.670433759689331 }, { "auxiliary_loss_clip": 0.01086412, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.03638375, "balance_loss_mlp": 1.01757431, "epoch": 0.9966030362242597, "flos": 19756004590080.0, "grad_norm": 1.890239065955032, "language_loss": 0.70341682, "learning_rate": 1.1891625620474855e-10, "loss": 0.72458476, "num_input_tokens_seen": 357731815, "step": 16576, "time_per_iteration": 2.7393879890441895 }, { "auxiliary_loss_clip": 0.0109524, "auxiliary_loss_mlp": 0.01024289, "balance_loss_clip": 1.03612185, "balance_loss_mlp": 1.01186752, "epoch": 0.9966631594769277, "flos": 23915178259200.0, "grad_norm": 1.5127574576312723, "language_loss": 0.71783984, "learning_rate": 1.1470720701400871e-10, "loss": 0.73903513, "num_input_tokens_seen": 357751640, "step": 16577, "time_per_iteration": 2.6747822761535645 }, { "auxiliary_loss_clip": 0.0108308, "auxiliary_loss_mlp": 0.01034487, "balance_loss_clip": 1.03563082, "balance_loss_mlp": 1.02241135, "epoch": 0.9967232827295956, "flos": 15559591495680.0, "grad_norm": 1.8839463168037793, "language_loss": 0.78829128, "learning_rate": 1.1057399367397912e-10, "loss": 0.80946696, "num_input_tokens_seen": 357769850, "step": 16578, "time_per_iteration": 2.6458945274353027 }, { "auxiliary_loss_clip": 0.01069592, "auxiliary_loss_mlp": 0.0076966, "balance_loss_clip": 1.0383426, "balance_loss_mlp": 1.00028622, "epoch": 0.9967834059822637, "flos": 20813035046400.0, "grad_norm": 1.6210978789721697, "language_loss": 0.76015878, "learning_rate": 1.0651661634142328e-10, "loss": 0.77855128, "num_input_tokens_seen": 357789550, "step": 16579, "time_per_iteration": 2.7179081439971924 }, { "auxiliary_loss_clip": 0.01085459, "auxiliary_loss_mlp": 0.01037581, "balance_loss_clip": 1.03625321, "balance_loss_mlp": 1.02271509, "epoch": 0.9968435292349316, "flos": 36719234830080.0, "grad_norm": 2.1621427705186513, "language_loss": 0.69284117, "learning_rate": 1.0253507516999604e-10, "loss": 0.71407157, "num_input_tokens_seen": 357809525, "step": 16580, "time_per_iteration": 4.3343565464019775 }, { "auxiliary_loss_clip": 0.0105428, "auxiliary_loss_mlp": 0.01032231, "balance_loss_clip": 1.03120196, "balance_loss_mlp": 1.02024424, "epoch": 0.9969036524875996, "flos": 26760919213440.0, "grad_norm": 1.9697975439158977, "language_loss": 0.79967076, "learning_rate": 9.862937031113184e-11, "loss": 0.8205359, "num_input_tokens_seen": 357829795, "step": 16581, "time_per_iteration": 4.272336483001709 }, { "auxiliary_loss_clip": 0.01078953, "auxiliary_loss_mlp": 0.01027322, "balance_loss_clip": 1.03649044, "balance_loss_mlp": 1.01607418, "epoch": 0.9969637757402675, "flos": 24827237424000.0, "grad_norm": 1.8090343516567968, "language_loss": 0.80200183, "learning_rate": 9.479950191249031e-11, "loss": 0.82306457, "num_input_tokens_seen": 357851655, "step": 16582, "time_per_iteration": 4.770942449569702 }, { "auxiliary_loss_clip": 0.01092857, "auxiliary_loss_mlp": 0.01033363, "balance_loss_clip": 1.03387117, "balance_loss_mlp": 1.02106702, "epoch": 0.9970238989929355, "flos": 23038742407680.0, "grad_norm": 1.6176460264436903, "language_loss": 0.60509884, "learning_rate": 9.104547011951069e-11, "loss": 0.62636101, "num_input_tokens_seen": 357871205, "step": 16583, "time_per_iteration": 2.670657157897949 }, { "auxiliary_loss_clip": 0.01088101, "auxiliary_loss_mlp": 0.01037237, "balance_loss_clip": 1.03633022, "balance_loss_mlp": 1.0250237, "epoch": 0.9970840222456034, "flos": 25298816106240.0, "grad_norm": 1.6127986377425965, "language_loss": 0.77779889, "learning_rate": 8.736727507452357e-11, "loss": 0.79905224, "num_input_tokens_seen": 357892145, "step": 16584, "time_per_iteration": 2.6968681812286377 }, { "auxiliary_loss_clip": 0.01081813, "auxiliary_loss_mlp": 0.01030965, "balance_loss_clip": 1.03400755, "balance_loss_mlp": 1.01991463, "epoch": 0.9971441454982715, "flos": 21615602578560.0, "grad_norm": 1.5491233705844139, "language_loss": 0.69406962, "learning_rate": 8.376491691697297e-11, "loss": 0.71519732, "num_input_tokens_seen": 357911205, "step": 16585, "time_per_iteration": 4.212535381317139 }, { "auxiliary_loss_clip": 0.0110602, "auxiliary_loss_mlp": 0.01033386, "balance_loss_clip": 1.03605747, "balance_loss_mlp": 1.02094698, "epoch": 0.9972042687509394, "flos": 14975612179200.0, "grad_norm": 2.688083222566017, "language_loss": 0.82222629, "learning_rate": 8.023839578363834e-11, "loss": 0.84362036, "num_input_tokens_seen": 357928190, "step": 16586, "time_per_iteration": 2.5343804359436035 }, { "auxiliary_loss_clip": 0.01084137, "auxiliary_loss_mlp": 0.01038457, "balance_loss_clip": 1.03290653, "balance_loss_mlp": 1.02660799, "epoch": 0.9972643920036074, "flos": 25806664546560.0, "grad_norm": 2.102200677561442, "language_loss": 0.7796334, "learning_rate": 7.678771180796851e-11, "loss": 0.80085933, "num_input_tokens_seen": 357946985, "step": 16587, "time_per_iteration": 2.653956174850464 }, { "auxiliary_loss_clip": 0.01083114, "auxiliary_loss_mlp": 0.01036983, "balance_loss_clip": 1.03732991, "balance_loss_mlp": 1.02448964, "epoch": 0.9973245152562754, "flos": 23326242865920.0, "grad_norm": 5.715123647273254, "language_loss": 0.73174369, "learning_rate": 7.341286512074773e-11, "loss": 0.75294471, "num_input_tokens_seen": 357966720, "step": 16588, "time_per_iteration": 2.5937352180480957 }, { "auxiliary_loss_clip": 0.01112154, "auxiliary_loss_mlp": 0.01029109, "balance_loss_clip": 1.03663898, "balance_loss_mlp": 1.01646113, "epoch": 0.9973846385089433, "flos": 12166212810240.0, "grad_norm": 5.411177211250548, "language_loss": 0.82386965, "learning_rate": 7.011385585031781e-11, "loss": 0.84528232, "num_input_tokens_seen": 357981375, "step": 16589, "time_per_iteration": 2.5262768268585205 }, { "auxiliary_loss_clip": 0.01100757, "auxiliary_loss_mlp": 0.0103796, "balance_loss_clip": 1.03564775, "balance_loss_mlp": 1.02382755, "epoch": 0.9974447617616113, "flos": 20045157073920.0, "grad_norm": 4.308142641596885, "language_loss": 0.70464408, "learning_rate": 6.689068412168986e-11, "loss": 0.72603118, "num_input_tokens_seen": 358000290, "step": 16590, "time_per_iteration": 2.5830941200256348 }, { "auxiliary_loss_clip": 0.01086738, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.03551257, "balance_loss_mlp": 1.01895654, "epoch": 0.9975048850142793, "flos": 32014614159360.0, "grad_norm": 4.864987201646496, "language_loss": 0.63802195, "learning_rate": 6.374335005676634e-11, "loss": 0.65920961, "num_input_tokens_seen": 358022075, "step": 16591, "time_per_iteration": 2.68571400642395 }, { "auxiliary_loss_clip": 0.0108584, "auxiliary_loss_mlp": 0.01030286, "balance_loss_clip": 1.03423333, "balance_loss_mlp": 1.01809728, "epoch": 0.9975650082669473, "flos": 36933728895360.0, "grad_norm": 2.6236190500257726, "language_loss": 0.73096275, "learning_rate": 6.067185377522933e-11, "loss": 0.75212401, "num_input_tokens_seen": 358043940, "step": 16592, "time_per_iteration": 2.7373883724212646 }, { "auxiliary_loss_clip": 0.01087724, "auxiliary_loss_mlp": 0.01032375, "balance_loss_clip": 1.03737628, "balance_loss_mlp": 1.01964951, "epoch": 0.9976251315196152, "flos": 16472117537280.0, "grad_norm": 1.4821546433362938, "language_loss": 0.85078406, "learning_rate": 5.767619539343016e-11, "loss": 0.87198508, "num_input_tokens_seen": 358062720, "step": 16593, "time_per_iteration": 2.662369966506958 }, { "auxiliary_loss_clip": 0.01104576, "auxiliary_loss_mlp": 0.0076981, "balance_loss_clip": 1.03564858, "balance_loss_mlp": 1.00020099, "epoch": 0.9976852547722832, "flos": 19646836179840.0, "grad_norm": 1.9769219864730705, "language_loss": 0.6983223, "learning_rate": 5.4756375024833656e-11, "loss": 0.71706617, "num_input_tokens_seen": 358081560, "step": 16594, "time_per_iteration": 2.5857043266296387 }, { "auxiliary_loss_clip": 0.01069224, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.03892672, "balance_loss_mlp": 1.01451635, "epoch": 0.9977453780249511, "flos": 20448434044800.0, "grad_norm": 2.022789522013575, "language_loss": 0.72606945, "learning_rate": 5.1912392780462113e-11, "loss": 0.74703097, "num_input_tokens_seen": 358099065, "step": 16595, "time_per_iteration": 2.7689433097839355 }, { "auxiliary_loss_clip": 0.01007096, "auxiliary_loss_mlp": 0.01003373, "balance_loss_clip": 1.00481117, "balance_loss_mlp": 1.00250244, "epoch": 0.9978055012776191, "flos": 65455097581440.0, "grad_norm": 0.7875629365454856, "language_loss": 0.60383916, "learning_rate": 4.9144248768007156e-11, "loss": 0.62394392, "num_input_tokens_seen": 358156095, "step": 16596, "time_per_iteration": 3.08450984954834 }, { "auxiliary_loss_clip": 0.01096892, "auxiliary_loss_mlp": 0.01029762, "balance_loss_clip": 1.03594232, "balance_loss_mlp": 1.01738787, "epoch": 0.997865624530287, "flos": 20631506688000.0, "grad_norm": 2.0986961825985087, "language_loss": 0.77297747, "learning_rate": 4.645194309227385e-11, "loss": 0.79424405, "num_input_tokens_seen": 358175230, "step": 16597, "time_per_iteration": 2.6868622303009033 }, { "auxiliary_loss_clip": 0.01097035, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.03486156, "balance_loss_mlp": 1.01730847, "epoch": 0.9979257477829551, "flos": 29387102284800.0, "grad_norm": 1.756861728101755, "language_loss": 0.82217014, "learning_rate": 4.383547585562475e-11, "loss": 0.84344238, "num_input_tokens_seen": 358197075, "step": 16598, "time_per_iteration": 2.7054567337036133 }, { "auxiliary_loss_clip": 0.01081519, "auxiliary_loss_mlp": 0.01044245, "balance_loss_clip": 1.03558803, "balance_loss_mlp": 1.03068531, "epoch": 0.997985871035623, "flos": 22635070387200.0, "grad_norm": 2.4545106380847335, "language_loss": 0.64762008, "learning_rate": 4.129484715709175e-11, "loss": 0.66887772, "num_input_tokens_seen": 358215925, "step": 16599, "time_per_iteration": 2.6614456176757812 }, { "auxiliary_loss_clip": 0.01010593, "auxiliary_loss_mlp": 0.01000422, "balance_loss_clip": 1.00784099, "balance_loss_mlp": 0.9994688, "epoch": 0.998045994288291, "flos": 61806968663040.0, "grad_norm": 0.9370474706148707, "language_loss": 0.62274885, "learning_rate": 3.8830057093264256e-11, "loss": 0.64285898, "num_input_tokens_seen": 358269035, "step": 16600, "time_per_iteration": 3.1614274978637695 }, { "auxiliary_loss_clip": 0.01085288, "auxiliary_loss_mlp": 0.01032015, "balance_loss_clip": 1.03679216, "balance_loss_mlp": 1.02095842, "epoch": 0.998106117540959, "flos": 19245534456960.0, "grad_norm": 1.511083911813729, "language_loss": 0.78393221, "learning_rate": 3.644110575717896e-11, "loss": 0.80510521, "num_input_tokens_seen": 358287680, "step": 16601, "time_per_iteration": 2.772331953048706 }, { "auxiliary_loss_clip": 0.01077732, "auxiliary_loss_mlp": 0.01031133, "balance_loss_clip": 1.03514004, "balance_loss_mlp": 1.01892638, "epoch": 0.9981662407936269, "flos": 21106209853440.0, "grad_norm": 2.513777021712519, "language_loss": 0.82537293, "learning_rate": 3.412799323987414e-11, "loss": 0.84646153, "num_input_tokens_seen": 358304080, "step": 16602, "time_per_iteration": 2.6796252727508545 }, { "auxiliary_loss_clip": 0.01068281, "auxiliary_loss_mlp": 0.01034598, "balance_loss_clip": 1.03651309, "balance_loss_mlp": 1.02249801, "epoch": 0.998226364046295, "flos": 24316839118080.0, "grad_norm": 2.030453284598539, "language_loss": 0.62777632, "learning_rate": 3.189071962883538e-11, "loss": 0.64880514, "num_input_tokens_seen": 358323670, "step": 16603, "time_per_iteration": 2.693939447402954 }, { "auxiliary_loss_clip": 0.01084524, "auxiliary_loss_mlp": 0.01027967, "balance_loss_clip": 1.03433418, "balance_loss_mlp": 1.01537895, "epoch": 0.9982864872989629, "flos": 23836389776640.0, "grad_norm": 1.7079397475017406, "language_loss": 0.70913982, "learning_rate": 2.972928500866168e-11, "loss": 0.73026478, "num_input_tokens_seen": 358341980, "step": 16604, "time_per_iteration": 2.8074941635131836 }, { "auxiliary_loss_clip": 0.0110762, "auxiliary_loss_mlp": 0.01027575, "balance_loss_clip": 1.03609681, "balance_loss_mlp": 1.01511717, "epoch": 0.9983466105516309, "flos": 18333116156160.0, "grad_norm": 1.8321992225796084, "language_loss": 0.64592469, "learning_rate": 2.7643689461953613e-11, "loss": 0.66727662, "num_input_tokens_seen": 358360400, "step": 16605, "time_per_iteration": 2.559711456298828 }, { "auxiliary_loss_clip": 0.01072745, "auxiliary_loss_mlp": 0.0103157, "balance_loss_clip": 1.03378582, "balance_loss_mlp": 1.01944005, "epoch": 0.9984067338042988, "flos": 17236763285760.0, "grad_norm": 1.7112583965701615, "language_loss": 0.7144081, "learning_rate": 2.5633933067092938e-11, "loss": 0.73545122, "num_input_tokens_seen": 358378990, "step": 16606, "time_per_iteration": 2.6522889137268066 }, { "auxiliary_loss_clip": 0.0109612, "auxiliary_loss_mlp": 0.00770001, "balance_loss_clip": 1.03534591, "balance_loss_mlp": 1.00014746, "epoch": 0.9984668570569668, "flos": 20667884186880.0, "grad_norm": 1.989921171025738, "language_loss": 0.82035434, "learning_rate": 2.370001590090709e-11, "loss": 0.8390156, "num_input_tokens_seen": 358395970, "step": 16607, "time_per_iteration": 2.6804637908935547 }, { "auxiliary_loss_clip": 0.0107541, "auxiliary_loss_mlp": 0.01033615, "balance_loss_clip": 1.03306758, "balance_loss_mlp": 1.02051961, "epoch": 0.9985269803096347, "flos": 30262532555520.0, "grad_norm": 1.6639542456977479, "language_loss": 0.67119384, "learning_rate": 2.184193803622669e-11, "loss": 0.69228399, "num_input_tokens_seen": 358417355, "step": 16608, "time_per_iteration": 2.906008005142212 }, { "auxiliary_loss_clip": 0.01063208, "auxiliary_loss_mlp": 0.0103334, "balance_loss_clip": 1.03657353, "balance_loss_mlp": 1.02062011, "epoch": 0.9985871035623027, "flos": 10560970005120.0, "grad_norm": 1.8164676216631945, "language_loss": 0.80704165, "learning_rate": 2.0059699543883978e-11, "loss": 0.82800716, "num_input_tokens_seen": 358434345, "step": 16609, "time_per_iteration": 2.7321889400482178 }, { "auxiliary_loss_clip": 0.01087746, "auxiliary_loss_mlp": 0.01034825, "balance_loss_clip": 1.03453326, "balance_loss_mlp": 1.02246904, "epoch": 0.9986472268149706, "flos": 16873455173760.0, "grad_norm": 1.4927424952025787, "language_loss": 0.62772417, "learning_rate": 1.8353300491158462e-11, "loss": 0.64894992, "num_input_tokens_seen": 358452870, "step": 16610, "time_per_iteration": 2.6517322063446045 }, { "auxiliary_loss_clip": 0.01089605, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.03502405, "balance_loss_mlp": 1.02091718, "epoch": 0.9987073500676387, "flos": 22054538776320.0, "grad_norm": 2.237128509248557, "language_loss": 0.67805243, "learning_rate": 1.672274094288717e-11, "loss": 0.69927835, "num_input_tokens_seen": 358472210, "step": 16611, "time_per_iteration": 2.634993553161621 }, { "auxiliary_loss_clip": 0.01066706, "auxiliary_loss_mlp": 0.01038076, "balance_loss_clip": 1.03627813, "balance_loss_mlp": 1.02527332, "epoch": 0.9987674733203066, "flos": 30482880537600.0, "grad_norm": 1.4582335875749615, "language_loss": 0.69769359, "learning_rate": 1.5168020961020544e-11, "loss": 0.71874142, "num_input_tokens_seen": 358493840, "step": 16612, "time_per_iteration": 2.8596408367156982 }, { "auxiliary_loss_clip": 0.01083064, "auxiliary_loss_mlp": 0.0103316, "balance_loss_clip": 1.03643417, "balance_loss_mlp": 1.02126336, "epoch": 0.9988275965729746, "flos": 27745230585600.0, "grad_norm": 1.6439272991561156, "language_loss": 0.73709273, "learning_rate": 1.3689140604400407e-11, "loss": 0.75825495, "num_input_tokens_seen": 358515060, "step": 16613, "time_per_iteration": 2.7902584075927734 }, { "auxiliary_loss_clip": 0.01071712, "auxiliary_loss_mlp": 0.0077277, "balance_loss_clip": 1.0345372, "balance_loss_mlp": 1.00019884, "epoch": 0.9988877198256426, "flos": 17524191916800.0, "grad_norm": 1.920035389313773, "language_loss": 0.73619223, "learning_rate": 1.2286099928981996e-11, "loss": 0.754637, "num_input_tokens_seen": 358528200, "step": 16614, "time_per_iteration": 2.6406190395355225 }, { "auxiliary_loss_clip": 0.01094466, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.03571415, "balance_loss_mlp": 1.02093267, "epoch": 0.9989478430783105, "flos": 20996502739200.0, "grad_norm": 1.5665679331066227, "language_loss": 0.722013, "learning_rate": 1.0958898988278065e-11, "loss": 0.74328637, "num_input_tokens_seen": 358548360, "step": 16615, "time_per_iteration": 2.639946222305298 }, { "auxiliary_loss_clip": 0.01112886, "auxiliary_loss_mlp": 0.00770149, "balance_loss_clip": 1.03912163, "balance_loss_mlp": 1.00027168, "epoch": 0.9990079663309785, "flos": 13370620769280.0, "grad_norm": 2.06456218997016, "language_loss": 0.77498305, "learning_rate": 9.70753783247069e-12, "loss": 0.79381335, "num_input_tokens_seen": 358566270, "step": 16616, "time_per_iteration": 2.703230619430542 }, { "auxiliary_loss_clip": 0.01081698, "auxiliary_loss_mlp": 0.01029148, "balance_loss_clip": 1.03569651, "balance_loss_mlp": 1.01701295, "epoch": 0.9990680895836465, "flos": 17310236555520.0, "grad_norm": 2.2671180152095696, "language_loss": 0.82647479, "learning_rate": 8.532016508855378e-12, "loss": 0.84758323, "num_input_tokens_seen": 358584710, "step": 16617, "time_per_iteration": 2.6513431072235107 }, { "auxiliary_loss_clip": 0.01086051, "auxiliary_loss_mlp": 0.01027312, "balance_loss_clip": 1.03479171, "balance_loss_mlp": 1.01599932, "epoch": 0.9991282128363145, "flos": 24207993930240.0, "grad_norm": 1.6210542380677302, "language_loss": 0.78575474, "learning_rate": 7.43233506206309e-12, "loss": 0.8068884, "num_input_tokens_seen": 358606750, "step": 16618, "time_per_iteration": 2.6931798458099365 }, { "auxiliary_loss_clip": 0.01105935, "auxiliary_loss_mlp": 0.01031572, "balance_loss_clip": 1.03507876, "balance_loss_mlp": 1.01963282, "epoch": 0.9991883360889824, "flos": 21175301664000.0, "grad_norm": 1.7008832792892883, "language_loss": 0.74742877, "learning_rate": 6.408493534060255e-12, "loss": 0.76880378, "num_input_tokens_seen": 358624675, "step": 16619, "time_per_iteration": 4.155118942260742 }, { "auxiliary_loss_clip": 0.01093229, "auxiliary_loss_mlp": 0.01028121, "balance_loss_clip": 1.03345323, "balance_loss_mlp": 1.01702261, "epoch": 0.9992484593416504, "flos": 19901155449600.0, "grad_norm": 2.899887344454389, "language_loss": 0.8699075, "learning_rate": 5.460491963260594e-12, "loss": 0.89112103, "num_input_tokens_seen": 358640715, "step": 16620, "time_per_iteration": 4.1041669845581055 }, { "auxiliary_loss_clip": 0.01066897, "auxiliary_loss_mlp": 0.01026513, "balance_loss_clip": 1.02997065, "balance_loss_mlp": 1.01463938, "epoch": 0.9993085825943183, "flos": 24857832833280.0, "grad_norm": 2.059624292912411, "language_loss": 0.72426653, "learning_rate": 4.58833038607942e-12, "loss": 0.74520063, "num_input_tokens_seen": 358659630, "step": 16621, "time_per_iteration": 4.831484794616699 }, { "auxiliary_loss_clip": 0.01000795, "auxiliary_loss_mlp": 0.00999466, "balance_loss_clip": 1.00817204, "balance_loss_mlp": 0.99854225, "epoch": 0.9993687058469863, "flos": 71284478780160.0, "grad_norm": 0.7355150485503724, "language_loss": 0.56584859, "learning_rate": 3.79200883515729e-12, "loss": 0.58585119, "num_input_tokens_seen": 358727840, "step": 16622, "time_per_iteration": 3.3878767490386963 }, { "auxiliary_loss_clip": 0.0106847, "auxiliary_loss_mlp": 0.01031465, "balance_loss_clip": 1.0328269, "balance_loss_mlp": 1.01847744, "epoch": 0.9994288290996542, "flos": 12199573566720.0, "grad_norm": 2.3925047005917244, "language_loss": 0.71642292, "learning_rate": 3.071527340914315e-12, "loss": 0.73742235, "num_input_tokens_seen": 358744125, "step": 16623, "time_per_iteration": 2.725473642349243 }, { "auxiliary_loss_clip": 0.01064784, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.0360446, "balance_loss_mlp": 1.01946068, "epoch": 0.9994889523523223, "flos": 17889942153600.0, "grad_norm": 1.8384385141171624, "language_loss": 0.7497015, "learning_rate": 2.4268859304399368e-12, "loss": 0.77067113, "num_input_tokens_seen": 358761420, "step": 16624, "time_per_iteration": 4.1755170822143555 }, { "auxiliary_loss_clip": 0.010734, "auxiliary_loss_mlp": 0.01030652, "balance_loss_clip": 1.03599191, "balance_loss_mlp": 1.01818919, "epoch": 0.9995490756049902, "flos": 26578888064640.0, "grad_norm": 1.666585360491219, "language_loss": 0.73861277, "learning_rate": 1.8580846286031514e-12, "loss": 0.75965327, "num_input_tokens_seen": 358782600, "step": 16625, "time_per_iteration": 2.77114200592041 }, { "auxiliary_loss_clip": 0.01094699, "auxiliary_loss_mlp": 0.01032931, "balance_loss_clip": 1.03575385, "balance_loss_mlp": 1.02107549, "epoch": 0.9996091988576582, "flos": 22200048771840.0, "grad_norm": 2.255734866069882, "language_loss": 0.76902807, "learning_rate": 1.3651234567202408e-12, "loss": 0.7903043, "num_input_tokens_seen": 358801220, "step": 16626, "time_per_iteration": 2.687950611114502 }, { "auxiliary_loss_clip": 0.01107588, "auxiliary_loss_mlp": 0.010339, "balance_loss_clip": 1.03792691, "balance_loss_mlp": 1.02201486, "epoch": 0.9996693221103262, "flos": 27373195468800.0, "grad_norm": 1.6905180098527337, "language_loss": 0.82313097, "learning_rate": 9.480024334429515e-13, "loss": 0.84454584, "num_input_tokens_seen": 358819190, "step": 16627, "time_per_iteration": 2.609881639480591 }, { "auxiliary_loss_clip": 0.01095764, "auxiliary_loss_mlp": 0.01035323, "balance_loss_clip": 1.0381639, "balance_loss_mlp": 1.02206075, "epoch": 0.9997294453629941, "flos": 26870410846080.0, "grad_norm": 1.8853432771890695, "language_loss": 0.70178521, "learning_rate": 6.067215747584952e-13, "loss": 0.72309601, "num_input_tokens_seen": 358839850, "step": 16628, "time_per_iteration": 2.7530713081359863 }, { "auxiliary_loss_clip": 0.01097289, "auxiliary_loss_mlp": 0.01026328, "balance_loss_clip": 1.0342133, "balance_loss_mlp": 1.01419258, "epoch": 0.9997895686156621, "flos": 23476996247040.0, "grad_norm": 1.3278590412352376, "language_loss": 0.75475144, "learning_rate": 3.4128089332341456e-13, "loss": 0.77598757, "num_input_tokens_seen": 358859805, "step": 16629, "time_per_iteration": 2.7801589965820312 }, { "auxiliary_loss_clip": 0.01089302, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 1.03652728, "balance_loss_mlp": 1.02316952, "epoch": 0.9998496918683301, "flos": 20224961579520.0, "grad_norm": 1.6419420095870436, "language_loss": 0.60239536, "learning_rate": 1.5168039935176126e-13, "loss": 0.62364829, "num_input_tokens_seen": 358877900, "step": 16630, "time_per_iteration": 2.6396772861480713 }, { "auxiliary_loss_clip": 0.0106418, "auxiliary_loss_mlp": 0.01027947, "balance_loss_clip": 1.03312218, "balance_loss_mlp": 1.01544785, "epoch": 0.9999098151209981, "flos": 21652913831040.0, "grad_norm": 3.1563978222436964, "language_loss": 0.6076231, "learning_rate": 3.792010017100722e-14, "loss": 0.62854433, "num_input_tokens_seen": 358897285, "step": 16631, "time_per_iteration": 2.699958086013794 }, { "auxiliary_loss_clip": 0.01046835, "auxiliary_loss_mlp": 0.00770368, "balance_loss_clip": 1.03351796, "balance_loss_mlp": 1.00018144, "epoch": 0.999969938373666, "flos": 11544599018880.0, "grad_norm": 13.911116352216522, "language_loss": 0.7268914, "learning_rate": 0.0, "loss": 0.74506336, "num_input_tokens_seen": 358911570, "step": 16632, "time_per_iteration": 2.6853044033050537 } ], "logging_steps": 1.0, "max_steps": 16632, "num_input_tokens_seen": 358911570, "num_train_epochs": 1, "save_steps": 3328, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3992169073237033e+18, "train_batch_size": 5, "trial_name": null, "trial_params": null }